├── .circleci └── config.yml ├── .github └── workflows │ ├── codeql.yml │ └── python-package ├── LICENSE ├── PyPi Package ├── .vscode │ └── settings.json ├── LICENSE ├── MANIFEST.in ├── README.md ├── pyproject.toml ├── setup.cfg ├── setup.py └── src │ ├── denmune.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── requires.txt │ └── top_level.txt │ └── denmune │ ├── .idea │ ├── .gitignore │ ├── .name │ ├── denmune.iml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ └── modules.xml │ ├── __init__.py │ └── denmune.py ├── README.md ├── codecov.yml ├── colab ├── 2D_shapes_datasets.ipynb ├── Get_97_by_training_MNIST_dataset.ipynb ├── MNIST_dataset.ipynb ├── chameleon_datasets.ipynb ├── clustering_propagation.ipynb ├── clustering_propagation_snapshots.ipynb ├── how_to_use_it.ipynb ├── iris_dataset.ipynb ├── k_nearest_evolution.ipynb ├── noise_detection.ipynb ├── scalability_and_speed.ipynb ├── stability_vs_knn.ipynb ├── training_MNIST.ipynb └── validation.ipynb ├── images ├── denmune-illustration.png └── denmune_propagation.png ├── kaggle ├── beauty-of-propagation-part3.ipynb ├── detecting-non-groundtruth-datasets.ipynb ├── detection-of-2d-shape-datasets.ipynb ├── get-97-using-simple-yet-one-parameter-algorithm.ipynb ├── iris-dataset.ipynb ├── k-nearest-evolution.ipynb ├── noise-detection.ipynb ├── scalability-vs-speed.ipynb ├── stability-vs-number-of-nearest-neighbor.ipynb ├── the-beauty-of-clusters-propagation.ipynb ├── the-beauty-of-propagation-part2.ipynb ├── training-MNIST-dataset-to-get-97.ipynb ├── training-pendigits-dataset-to-get-97.ipynb ├── validation.ipynb └── when-simple-means-powerful.ipynb ├── requirements.txt └── src ├── __init__.py ├── denmune.py └── tests └── test_denmune.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Use the latest 2.1 version of CircleCI pipeline process engine. 
2 | # See: https://circleci.com/docs/2.0/configuration-reference 3 | version: 2.1 4 | 5 | # Orbs are reusable packages of CircleCI configuration that you may share across projects, enabling you to create encapsulated, parameterized commands, jobs, and executors that can be used across multiple projects. 6 | # See: https://circleci.com/docs/2.0/orb-intro/ 7 | orbs: 8 | # The python orb contains a set of prepackaged CircleCI configuration you can use repeatedly in your configuration files 9 | # Orb commands and jobs help you with common scripting around a language/tool 10 | # so you dont have to copy and paste it everywhere. 11 | # See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python 12 | codecov: codecov/codecov@3.0.0 13 | slack: circleci/slack@4.4.4 14 | python: circleci/python@2.1.1 15 | 16 | 17 | # Define a job to be invoked later in a workflow. 18 | # See: https://circleci.com/docs/2.0/configuration-reference/#jobs 19 | jobs: 20 | build-and-test: # This is the name of the job, feel free to change it to better match what you're trying to do! 21 | # These next lines defines a Docker executors: https://circleci.com/docs/2.0/executor-types/ 22 | # You can specify an image from Dockerhub or use one of the convenience images from CircleCI's Developer Hub 23 | # A list of available CircleCI Docker convenience images are available here: https://circleci.com/developer/images/image/cimg/python 24 | # The executor is the environment in which the steps below will be executed - below will use a python 3.8 container 25 | # Change the version below to your required version of python 26 | docker: 27 | - image: cimg/python:3.10 28 | 29 | 30 | # Checkout the code as the first step. This is a dedicated CircleCI step. 31 | # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default. 32 | # Here we're making sure we use just use the system-wide pip. 
By default it uses the project root's requirements.txt. 33 | # Then run your tests! 34 | # CircleCI will report the results back to your VCS provider. 35 | steps: 36 | - checkout 37 | - python/install-packages: 38 | pkg-manager: pip 39 | # app-dir: ~/project/package-directory/ # If you're requirements.txt isn't in the root directory. 40 | # pip-dependency-file: test-requirements.txt # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements. 41 | 42 | - run: 43 | name: Treon Test 44 | command: | 45 | cd colab 46 | # git clone https://github.com/egy1st/datasets 47 | # treon --threads=2 48 | 49 | - run: 50 | name: CodeCov pyTest 51 | command: | 52 | coverage run -m pytest 53 | coverage report 54 | coverage html 55 | coverage xml 56 | cp coverage.xml htmlcov/coverage.xml 57 | 58 | - codecov/upload 59 | 60 | - store_artifacts: 61 | path: htmlcov 62 | 63 | - slack/notify: 64 | template: basic_success_1 65 | channel: C0326UK1VFY 66 | # Invoke jobs via workflows 67 | # See: https://circleci.com/docs/2.0/configuration-reference/#workflows 68 | workflows: 69 | Python-3.10: # This is the name of the workflow, feel free to change it to better match your workflow. 70 | # Inside the workflow, you define the jobs you want to run. 71 | jobs: 72 | - build-and-test: 73 | context: Slack 74 | 75 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. 
Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '45 0 * * 6' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v2 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | 52 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 53 | # queries: security-extended,security-and-quality 54 | 55 | 56 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 57 | # If this step fails, then you should remove it and run the build manually (see below) 58 | - name: Autobuild 59 | uses: github/codeql-action/autobuild@v2 60 | 61 | # ℹ️ Command-line programs to run using the OS shell. 
62 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 63 | 64 | # If the Autobuild fails above, remove it and uncomment the following three lines. 65 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 66 | 67 | # - run: | 68 | # echo "Run, Build Application using script" 69 | # ./location_of_script_within_repo/buildscript.sh 70 | 71 | - name: Perform CodeQL Analysis 72 | uses: github/codeql-action/analyze@v2 73 | with: 74 | category: "/language:${{matrix.language}}" 75 | -------------------------------------------------------------------------------- /.github/workflows/python-package: -------------------------------------------------------------------------------- 1 | name: workflow for codecov 2 | on: [push] 3 | jobs: 4 | run: 5 | runs-on: ${{ matrix.os }} 6 | strategy: 7 | matrix: 8 | os: [ubuntu-latest] 9 | python: ['3.6', '3.7', '3.8', '3.9'] 10 | env: 11 | OS: ${{ matrix.os }} 12 | PYTHON: ${{ matrix.python }} 13 | steps: 14 | - uses: actions/checkout@master 15 | - name: Setup Python 16 | uses: actions/setup-python@master 17 | with: 18 | python-version: 3.7 19 | - name: Generate coverage report 20 | run: | 21 | pip install pytest 22 | pip install pytest-cov 23 | pip install numpy 24 | pip install -U scikit-learn 25 | pip install denmune 26 | pytest --cov=./ --cov-report=xml 27 | - name: Upload coverage to Codecov 28 | uses: codecov/codecov-action@v2 29 | with: 30 | token: 'fce1be95-36c5-4c80-83c1-fe9fa8539dae' 31 | files: ./coverage.xml 32 | env_vars: OS,PYTHON 33 | fail_ci_if_error: true 34 | flags: unittests 35 | name: codecov-umbrella 36 | verbose: true 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021, Mohamed Ali Abbas 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /PyPi Package/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "restructuredtext.confPath": "" 3 | } -------------------------------------------------------------------------------- /PyPi Package/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021, Mohamed Ali Abbas 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | 8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 10 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /PyPi Package/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include data *.ipynb *.py *.txt *.csv -------------------------------------------------------------------------------- /PyPi Package/README.md: -------------------------------------------------------------------------------- 1 | DenMune: A density-peak clustering algorithm 2 | ============================================= 3 | 4 | DenMune is a clustering algorithm that can find clusters of arbitrary sizes, shapes, and densities in two dimensions. Higher dimensions are first reduced to 2-D using t-SNE. The algorithm relies on a single parameter K (the number of nearest neighbors). The results show the superiority of the algorithm. Enjoy the simplicity and the power of DenMune. 5 | 6 | 7 | [![PyPI Version](https://img.shields.io/pypi/v/denmune.svg)]( https://pypi.org/project/denmune/) 8 | [![Launch notebook examples in Binder](https://static.mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/egy1st/denmune-clustering-algorithm/HEAD) 9 | [![Documentation Status](https://readthedocs.org/projects/denmune/badge/?version=latest)](https://denmune.readthedocs.io/en/latest/?badge=latest) 10 | [![Launch notebook examples in Colaboratory, Google Research]( https://colab.research.google.com/assets/colab-badge.svg)](#colab) 11 | [![Launch notebook examples in Kaggle, the workspace where data scientist meet](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/denmune-clustering-iris-dataset?scriptVersionId=84775816) 12 | [![Elsevier, journal's article publisher ](https://img.shields.io/badge/elsevier-published-orange)](https://www.sciencedirect.com/science/article/abs/pii/S0031320320303927) 13 | [![Research datasets at Mendeley ](https://img.shields.io/badge/mendeley-data-bluegreen)](https://data.mendeley.com/datasets/b73cw5n43r/4) 14 | [![BSD
3-Clause “New” or “Revised” License" ](https://img.shields.io/badge/license-BSD-green)](https://choosealicense.com/licenses/bsd-3-clause/) 15 | [![CircleCI, continuous integration](https://circleci.com/gh/egy1st/denmune-clustering-algorithm/tree/main.svg?style=shield)](https://circleci.com/gh/egy1st/denmune-clustering-algorithm/tree/main) 16 | [![codecov](https://codecov.io/gh/egy1st/denmune-clustering-algorithm/branch/main/graph/badge.svg?token=E2ZY0DSUM2)](https://codecov.io/gh/egy1st/denmune-clustering-algorithm) 17 | 18 | Based on the paper 19 | ------------------- 20 | 21 | |Paper|Journal| 22 | |-------------------------------------------------------------------------------------------|-----------------------------| 23 | |Mohamed Abbas, Adel El-Zoghabi, Amin Shoukry, [![scimagojr](https://www.scimagojr.com/journal_img.php?id=24823)](https://www.scimagojr.com/journalsearch.php?q=24823&tip=sid&clean=0) 24 | |*DenMune: Density peak based clustering using mutual nearest neighbors* 25 | |In: Journal of Pattern Recognition, Elsevier, 26 | |volume 109, number 107589, January 2021 27 | |DOI: https://doi.org/10.1016/j.patcog.2020.107589 28 | 29 | Documentation: 30 | --------------- 31 | Documentation, including tutorials, is available at https://denmune.readthedocs.io 32 | 33 | [![read the documentation](https://img.shields.io/badge/read_the-docs-orange)](https://denmune.readthedocs.io/en/latest/?badge=latest) 34 | 35 | 36 | Watch it in action 37 | ------------------- 38 | These 30 seconds will show you how a density-based algorithm, DenMune, propagates: 39 | 40 | [![interact with the propagation](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1o-tP3uvDGjxBOGYkir1lnbr74sZ06e0U?usp=sharing) 41 | 42 | [![Propagation in DenMune](https://raw.githubusercontent.com/egy1st/denmune-clustering-algorithm/main/images/propagation.gif)]() 43 | 44 | 45 | 46 | When less means more 47 | -------------------- 48 | Most classic
clustering algorithms fail to detect complex clusters where clusters are of different sizes, shapes, and densities, and exist in noisy data. 49 | Recently, a density-based algorithm named DenMune showed great ability in detecting complex shapes even in noisy data. It can detect the number of clusters automatically, and it detects both pre-identified noise and post-identified noise automatically and removes them. 50 | 51 | It can reach 100% accuracy on most classic pattern problems and achieves 97% on the MNIST dataset. A great advantage of this algorithm is that it is a single-parameter algorithm. All you need is to set the number of k-nearest neighbors and the algorithm will take care of the rest. Being non-sensitive to changes in k makes it robust and stable. 52 | 53 | Keep in mind that the algorithm initially reduces any N-D dataset to a 2-D dataset, so a good benefit of this algorithm is always being able to plot your data and explore it, which makes this algorithm a good candidate for data exploration. Finally, the algorithm comes with a neat package for visualizing data, validating it, and analyzing the whole clustering process. 54 | 55 | How to install DenMune 56 | ------------------------ 57 | Simply install the DenMune clustering algorithm using the pip command from the official Python repository 58 | 59 | [![PyPI Version](https://img.shields.io/pypi/v/denmune.svg)]( https://pypi.org/project/denmune/) 60 | 61 | From the shell run the command 62 | 63 | ```shell 64 | pip install denmune 65 | ``` 66 | 67 | From a Jupyter notebook cell, run the command 68 | 69 | ```ipython3 70 | !pip install denmune 71 | ``` 72 | 73 | How to use DenMune 74 | -------------------- 75 | Once DenMune is installed, you just need to import it 76 | 77 | ```python 78 | from denmune import DenMune 79 | ``` 80 | ###### Please note that the first denmune (the package) is in small letters, while the other one (the class itself) has D and M in capital case. 
81 | 82 | 83 | Read data 84 | ----------- 85 | 86 | There are four possible cases of data: 87 | - only train data without labels 88 | - only labeled train data 89 | - labeled train data in addition to test data without labels 90 | - labeled train data in addition to labeled test data 91 | 92 | 93 | ```python 94 | #============================================= 95 | # First scenario: train data without labels 96 | # ============================================ 97 | 98 | data_path = 'datasets/denmune/chameleon/' 99 | dataset = "t7.10k.csv" 100 | data_file = data_path + dataset 101 | 102 | # train data without labels 103 | X_train = pd.read_csv(data_file, sep=',', header=None) 104 | 105 | knn = 39 # k-nearest neighbor, the only parameter required by the algorithm 106 | 107 | dm = DenMune(train_data=X_train, k_nearest=knn) 108 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=True) 109 | 110 | ``` 111 | This is an intuitive dataset which has no groundtruth provided 112 | 113 | ![t710](https://raw.githubusercontent.com/egy1st/images/main/clustering/t710.png) 114 | 115 | ```python 116 | #============================================= 117 | # Second scenario: train data with labels 118 | # ============================================ 119 | 120 | data_path = 'datasets/denmune/shapes/' 121 | dataset = "aggregation.csv" 122 | data_file = data_path + dataset 123 | 124 | # train data with labels 125 | X_train = pd.read_csv(data_file, sep=',', header=None) 126 | y_train = X_train.iloc[:, -1] 127 | X_train = X_train.drop(X_train.columns[-1], axis=1) 128 | 129 | knn = 6 # k-nearest neighbor, the only parameter required by the algorithm 130 | 131 | dm = DenMune(train_data=X_train, train_truth= y_train, k_nearest=knn) 132 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=True) 133 | ``` 134 | Dataset groundtruth 135 | 136 | ![aggregation groundtruth](https://raw.githubusercontent.com/egy1st/images/main/clustering/aggregation_ground.png) 137 | 138 | 
Dataset as detected by DenMune at k=6 139 | 140 | ![aggregation train](https://raw.githubusercontent.com/egy1st/images/main/clustering/aggregation_6.png) 141 | 142 | 143 | ```python 144 | #================================================================= 145 | # Third scenario: train data with labels in addition to test data 146 | # ================================================================ 147 | 148 | data_path = 'datasets/denmune/pendigits/' 149 | file_2d = data_path + 'pendigits-2d.csv' 150 | 151 | # train data with labels 152 | X_train = pd.read_csv(data_path + 'train.csv', sep=',', header=None) 153 | y_train = X_train.iloc[:, -1] 154 | X_train = X_train.drop(X_train.columns[-1], axis=1) 155 | 156 | # test data without labels 157 | X_test = pd.read_csv(data_path + 'test.csv', sep=',', header=None) 158 | X_test = X_test.drop(X_test.columns[-1], axis=1) 159 | 160 | knn = 50 # k-nearest neighbor, the only parameter required by the algorithm 161 | 162 | dm = DenMune(train_data=X_train, train_truth= y_train, 163 | test_data= X_test, 164 | k_nearest=knn) 165 | labels, validity = dm.fit_predict(show_analyzer=True, show_noise=True) 166 | ``` 167 | Dataset groundtruth 168 | 169 | ![pendigits groundtruth](https://raw.githubusercontent.com/egy1st/images/main/clustering/pendigits_ground.png) 170 | 171 | 172 | Dataset as detected by DenMune at k=50 173 | 174 | ![pendigits train](https://raw.githubusercontent.com/egy1st/images/main/clustering/pendigits_50.png) 175 | 176 | Test data as predicted by DenMune on training the dataset at k=50 177 | 178 | ![pendigits test](https://raw.githubusercontent.com/egy1st/images/main/clustering/pendigits_test_50.png) 179 | 180 | 181 | Algorithm's Parameters 182 | ----------------------- 183 | 1. 
Parameters used within the initialization of the DenMune class 184 | 185 | ```python 186 | def __init__ (self, 187 | train_data=None, test_data=None, 188 | train_truth=None, test_truth=None, 189 | file_2d =None, k_nearest=None, 190 | rgn_tsne=False, prop_step=0, 191 | ): 192 | ``` 193 | 194 | - train_data: 195 | - data used for training the algorithm 196 | - default: None. It should be provided by the user; otherwise an error will be raised. 197 | 198 | - train_truth: 199 | - labels of training data 200 | - default: None 201 | 202 | - test_data: 203 | - data used for testing the algorithm 204 | 205 | - test_truth: 206 | - labels of testing data 207 | - default: None 208 | 209 | - k_nearest: 210 | - number of nearest neighbors 211 | - default: None. The default is invalid; the number of nearest neighbors should be at least 1. 212 | 213 | - rgn_tsne: 214 | - when set to True: It will regenerate the reduced 2-D version of the N-D dataset each time the algorithm runs. 215 | - when set to False: It will generate the reduced 2-D version of the N-D dataset the first time only, then will reuse the existing saved file 216 | - default: False 217 | 218 | - file_2d: name (including location) of the file used to save/load the reduced 2-d version 219 | - if empty: the algorithm will create a temporary file named '_temp_2d' 220 | - default: None 221 | 222 | - prop_step: 223 | - size of increment used in showing the clustering propagation. 224 | - leave this parameter set to 0, the default value, unless you intentionally want to enter the propagation mode. 225 | - default: 0 226 | 227 | 228 | 2. Parameters used within the fit_predict function: 229 | 230 | ```python 231 | def fit_predict(self, 232 | validate=True, 233 | show_plots=True, 234 | show_noise=True, 235 | show_analyzer=True 236 | ): 237 | ``` 238 | 239 | - validate: 240 | - validate data on/off according to five measures integrated with DenMune (Accuracy, 
F1-score, NMI index, AMI index, ARI index) 241 | - default: True 242 | 243 | - show_plots: 244 | - show/hide plotting of data 245 | - default: True 246 | 247 | - show_noise: 248 | - show/hide noise and outliers 249 | - default: True 250 | 251 | - show_analyzer: 252 | - show/hide the analyzer 253 | - default: True 254 | 255 | The Analyzer 256 | ------------- 257 | 258 | The algorithm provides an intuitive tool called the analyzer; once called, it will provide you with an in-depth analysis of how your clustering results perform. 259 | 260 | ![DenMune Analyzer](https://raw.githubusercontent.com/egy1st/images/main/clustering/analyzer.png) 261 | 262 | Noise Detection 263 | ---------------- 264 | 265 | DenMune detects noise and outliers automatically; no further work is needed from your side. 266 | 267 | - It plots pre-identified noise in black 268 | - It plots post-identified noise in light grey 269 | 270 | You can hide the noise by setting the show_noise parameter to False. 271 | 272 | 273 | ```python 274 | 275 | # let us show noise 276 | 277 | dm = DenMune(train_data=X_train, k_nearest=knn) 278 | labels, validity = dm.fit_predict(show_noise=True) 279 | ``` 280 | 281 | ```python 282 | 283 | # let us show clean data by removing noise 284 | 285 | dm = DenMune(train_data=X_train, k_nearest=knn) 286 | labels, validity = dm.fit_predict(show_noise=False) 287 | ``` 288 | 289 | | noisy data | clean data | 290 | ----------| ---------------------------------------------------------------------------------------------------| 291 | | ![noisy data](https://raw.githubusercontent.com/egy1st/images/main/clustering/noisy_data.png) | ![clean data](https://raw.githubusercontent.com/egy1st/images/main/clustering/clean_data.png) | 292 | 293 | 294 | Validation 295 | -------------- 296 | You can get your validation results using 3 methods 297 | 298 | - by showing the Analyzer 299 | - extract values from the validity list returned by the fit_predict function 300 | - extract values from the Analyzer dictionary 301 | - 302 | There 
are five validity measures built-in the algorithm, which are: 303 | 304 | - ACC, Accuracy 305 | - F1 score 306 | - NMI index (Normalized Mutual Information) 307 | - AMI index (Adjusted Mutual Information) 308 | - ARI index (Adjusted Rand Index) 309 | 310 | ![Validation snapshot](https://raw.githubusercontent.com/egy1st/images/main/clustering/validation.png) 311 | 312 | K-nearest Evolution 313 | ------------------- 314 | The following chart shows the evolution of pre and post identified noise in correspondence to increase of number of knn. Also, detected number of clusters is analyzed in the same chart in relation with both types of identified noise. 315 | 316 | ![knn evolution chart](https://raw.githubusercontent.com/egy1st/images/main/clustering/knn_vs_noise.png) 317 | 318 | 319 | The Scalability 320 | ---------------- 321 | | data size | time | 322 | |------------------| ------------------- | 323 | | data size: 5000 | time: 2.3139 seconds | 324 | | data size: 10000 | time: 5.8752 seconds | 325 | | data size: 15000 | time: 12.4535 seconds | 326 | | data size: 20000 | time: 18.8466 seconds | 327 | | data size: 25000 | time: 28.992 seconds | 328 | | data size: 30000 | time: 39.3166 seconds | 329 | | data size: 35000 | time: 39.4842 seconds | 330 | | data size: 40000 | time: 63.7649 seconds | 331 | | data size: 45000 | time: 73.6828 seconds | 332 | | data size: 50000 | time: 86.9194 seconds | 333 | | data size: 55000 | time: 90.1077 seconds | 334 | | data size: 60000 | time: 125.0228 seconds | 335 | | data size: 65000 | time: 149.1858 seconds | 336 | | data size: 70000 | time: 177.4184 seconds | 337 | | data size: 75000 | time: 204.0712 seconds | 338 | | data size: 80000 | time: 220.502 seconds | 339 | | data size: 85000 | time: 251.7625 seconds | 340 | | data size: 100000 | time: 257.563 seconds | 341 | 342 | | ![noisy data chart](https://raw.githubusercontent.com/egy1st/images/main/clustering/scalability.png) 343 | 344 | The Stability 345 | -------------- 346 | 347 
| The algorithm has only a single parameter; moreover, it is not sensitive to changes in that parameter, k. You can see that for yourself in the following chart. This is of great benefit to you as a data exploration analyst. You can simply explore the dataset using an arbitrary k. Being non-sensitive to changes in k makes it robust and stable. 348 | 349 | ![DenMune Stability chart](https://raw.githubusercontent.com/egy1st/images/main/clustering/stability.png) 350 | 351 | 352 | Reveal the propagation 353 | ----------------------- 354 | 355 | One of the top features of this algorithm is that it enables you to watch how your clusters propagate to construct the final output clusters. 356 | Just use the parameter 'prop_step' as in the following example: 357 | 358 | ```python 359 | dataset = "t7.10k" # 360 | data_path = 'datasets/denmune/chameleon/' 361 | 362 | # train file 363 | data_file = data_path + dataset +'.csv' 364 | X_train = pd.read_csv(data_file, sep=',', header=None) 365 | 366 | 367 | from itertools import chain 368 | 369 | # DenMune's parameters 370 | knn = 39 # number of k-nearest neighbors, the only parameter required by the algorithm 371 | 372 | # create a list of different snapshots of the propagation 373 | snapshots = chain(range(2,5), range(5,50,10), range(50, 100, 25), range(100,500,100), range(500,2000, 250), range(1000,5500, 500)) 374 | 375 | from IPython.display import clear_output 376 | for snapshot in snapshots: 377 | print ("itration", snapshot ) 378 | clear_output(wait=True) 379 | dm = DenMune(train_data=X_train, k_nearest=knn, rgn_tsne=False, prop_step=snapshot) 380 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=False) 381 | ``` 382 | 383 | [![Propagation in DenMune](https://raw.githubusercontent.com/egy1st/denmune-clustering-algorithm/main/images/propagation.gif)]() 384 | 385 | Interact with the algorithm 386 | --------------------------- 387 | [![chameleon
datasets](https://raw.githubusercontent.com/egy1st/denmune-clustering-algorithm/main/images/chameleon_detection.png)](https://colab.research.google.com/drive/1EUROd6TRwxW3A_XD3KTxL8miL2ias4Ue?usp=sharing) 388 | 389 | This notebook allows you to interact with the algorithm in many aspects: 390 | - you can choose which dataset to cluster (among 4 chameleon datasets) 391 | - you can decide the number of k-nearest neighbors to use 392 | - show noise on/off; thus you can investigate noise detected by the algorithm 393 | - show analyzer on/off 394 | 395 | How to run and test 396 | -------------------- 397 | 398 | 1. Launch Examples in Repo2Docker Binder 399 | 400 | Simply use our repo2docker offered by mybinder.org, which encapsulates the algorithm and all required data in one virtual machine instance. All Jupyter notebook examples found in this repository will also be available for you to practice with in this repo2docker. Thanks mybinder.org, you made it possible! 401 | 402 | [![Launch notebook examples in Binder](https://static.mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/egy1st/denmune-clustering-algorithm/HEAD) 403 | 404 | 2. Launch each Example in Kaggle workspace 405 | 406 | If you are a kaggler like me, then Kaggle, the best workspace where data scientists meet, should suit you for testing the algorithm with a great experience. 
407 | 408 | | Dataset | Kaggle URL | 409 | ----------| ---------------------------------------------------------------------------------------------------| 410 | |When less means more - kaggle |[![When less means more - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)]( https://www.kaggle.com/egyfirst/when-less-means-more) | 411 | |Non-groundtruth datasets - kaggle|[![Non-groundtruth datasets](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/detecting-non-groundtruth-datasets) | 412 | |2D Shape datasets - kaggle|[![2D Shape datasets - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/detection-of-2d-shape-datasets) | 413 | |MNIST dataset kaggle|[![MNIST dataset - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/get-97-using-simple-yet-one-parameter-algorithm) | 414 | |Iris dataset kaggle| [![iris dataset - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/denmune-clustering-iris-dataset) | 415 | |Training MNIST to get 97%| [![Training MNIST to get 97%](https://kaggle.com/static/images/open-in-kaggle.svg)]( https://www.kaggle.com/egyfirst/training-mnist-dataset-to-get-97) | 416 | |Noise detection - kaggle| [![Noise detection - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)]( https://www.kaggle.com/egyfirst/noise-detection) | 417 | |Validation - kaggle| [![Validation - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/validate-in-5-built-in-validity-insexes) | 418 | |The beauty of propagation - kaggle| [![The beauty of propagation - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/the-beauty-of-clusters-propagation) | 419 | |The beauty of propagation part2 - kaggle | [![The beauty of propagation part 2 - 
kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/the-beauty-of-propagation-part2) | 420 | |Snapshots of propagation -kaggle| [![The beauty of propagation - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/beauty-of-propagation-part3) | 421 | |Scalability kaggle| [![Scalability - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/scalability-vs-speed) | 422 | |Stability - kaggle| [![Stability - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/stability-vs-number-of-nearest-neighbor) | 423 | |k-nearest-evolution - kaggle| [![k-nearest-evolution - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/k-nearest-evolution) | 424 | 425 | 3. Launch each Example in Google Research, CoLab 426 | 427 | Need to test examples one by one, then here another option. Use colab offered by google research to test each example individually. 
428 | 429 | 430 | 431 | Here is a list of Google CoLab URL to use the algorithm interactively 432 | ---------------------------------------------------------------------- 433 | 434 | 435 | | Dataset | CoLab URL | 436 | ----------| ---------------------------------------------------------------------------------------------------| 437 | |How to use it - colab|[![How to use it - colab](https://colab.research.google.com/assets/colab-badge.svg)]( https://colab.research.google.com/drive/1J_uKdhZ3z1KeY0-wJ7Ruw2PZSY1orKQm)| 438 | |Chameleon datasets - colab|[![Chameleon datasets - colab]( https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1EUROd6TRwxW3A_XD3KTxL8miL2ias4Ue?usp=sharing) | 439 | |2D Shape datasets - colab|[![2D Shape datasets - colab]( https://colab.research.google.com/assets/colab-badge.svg)]( https://colab.research.google.com/drive/1EaqTPCRHSuTKB-qEbnWHpGKFj6XytMIk?usp=sharing) | 440 | |MNIST dataset - colab|[![MNIST dataset - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1a9FGHRA6IPc5jhLOV46iEbpUeQXptSJp?usp=sharing) | 441 | |iris dataset - colab|[![iris dataset - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKql57Xh7xVVu6NpTbg3vRdRg42R7hjm?usp=sharing) | 442 | |Get 97% by training MNIST dataset - colab|[![Get 97% by training MNIST dataset - colab](https://colab.research.google.com/assets/colab-badge.svg)]( https://colab.research.google.com/drive/1NeOtXEQY94oD98Ufbh3IhTHnnYwIA659) | 443 | |Non-groundtruth datasets - colab|[![Non-groundtruth datasets - colab](https://colab.research.google.com/assets/colab-badge.svg)]( https://colab.research.google.com/drive/1d17ejQ83aUy0CZIeQ7bHTugSC9AjJ2mU?usp=sharing) | 444 | |Noise detection - colab|[![Noise detection - colab](https://colab.research.google.com/assets/colab-badge.svg)]( 
https://colab.research.google.com/drive/1Bp3c-cJfjLWxupmrBJ_6Q4-nqIfZcII4) | 445 | |Validation - colab|[![Validation - colab](https://colab.research.google.com/assets/colab-badge.svg)]( https://colab.research.google.com/drive/13_EVaQOv_QiNmQiMWJAcFFHPJHGCrQLe) | 446 | |How it propagates - colab|[![How it propagates - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1o-tP3uvDGjxBOGYkir1lnbr74sZ06e0U?usp=sharing)| 447 | |Snapshots of propagation - colab|[![snapshots of the propagation - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1vPXNKa8Rf3TnqDHSD3YSWl3g1iNSqjl2?usp=sharing)| 448 | |Scalability - colab|[![Scalability - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1d55wkBndLLapO7Yx1ePHhE8mL61j9-TH?usp=sharing)| 449 | |Stability vs number of nearest neighbors - colab|[![Stability vs number of nearest neighbors - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/17VgVRMFBWvkSIH1yA3tMl6UQ7Eu68K2l?usp=sharing)| 450 | |k-nearest-evolution - colab|[![k-nearest-evolution - colab](https://colab.research.google.com/assets/colab-badge.svg)]( https://colab.research.google.com/drive/1DZ-CQPV3WwJSiaV3-rjwPwmXw4RUh8Qj)| 451 | 452 | 453 | 454 | How to cite 455 | ===== 456 | If you have used this codebase in a scientific publication and wish to cite it, please use the [Journal of Pattern Recognition article](https://www.sciencedirect.com/science/article/abs/pii/S0031320320303927) 457 | 458 | Mohamed Abbas, Adel El-Zoghabi, Amin Shoukry, *DenMune: Density peak based clustering using mutual nearest neighbors* 459 | In: Journal of Pattern Recognition, Elsevier, volume 109, number 107589.
460 | January 2021 461 | 462 | 463 | ```bib 464 | @article{ABBAS2021107589, 465 | title = {DenMune: Density peak based clustering using mutual nearest neighbors}, 466 | journal = {Pattern Recognition}, 467 | volume = {109}, 468 | pages = {107589}, 469 | year = {2021}, 470 | issn = {0031-3203}, 471 | doi = {https://doi.org/10.1016/j.patcog.2020.107589}, 472 | url = {https://www.sciencedirect.com/science/article/pii/S0031320320303927}, 473 | author = {Mohamed Abbas and Adel El-Zoghabi and Amin Shoukry}, 474 | keywords = {Clustering, Mutual neighbors, Dimensionality reduction, Arbitrary shapes, Pattern recognition, Nearest neighbors, Density peak}, 475 | abstract = {Many clustering algorithms fail when clusters are of arbitrary shapes, of varying densities, or the data classes are unbalanced and close to each other, even in two dimensions. A novel clustering algorithm “DenMune” is presented to meet this challenge. It is based on identifying dense regions using mutual nearest neighborhoods of size K, where K is the only parameter required from the user, besides obeying the mutual nearest neighbor consistency principle. The algorithm is stable for a wide range of values of K. Moreover, it is able to automatically detect and remove noise from the clustering process as well as detecting the target clusters. It produces robust results on various low and high dimensional datasets relative to several known state of the art clustering algorithms.} 476 | } 477 | ``` 478 | 479 | Licensing 480 | ------------ 481 | 482 | The DenMune algorithm is 3-clause BSD licensed. Enjoy. 
483 | 484 | [![BSD 3-Clause “New” or “Revised” License](https://img.shields.io/badge/license-BSD-green)](https://choosealicense.com/licenses/bsd-3-clause/) 485 | 486 | 487 | Task List 488 | ------------ 489 | 490 | - [x] Update GitHub with the DenMune source code 491 | - [x] create repo2docker repository 492 | - [x] Create pip Package 493 | - [x] create CoLab shared examples 494 | - [x] create documentation 495 | - [x] create Kaggle shared examples 496 | - [x] PEP8 compliant 497 | - [x] Continuous integration 498 | - [x] scikit-learn compatible 499 | - [x] Unit tests (coverage: 100%) 500 | - [ ] create conda package 501 | 502 | -------------------------------------------------------------------------------- /PyPi Package/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /PyPi Package/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = denmune 3 | version = 0.0.96 4 | author = Mohamed Ali Abbas 5 | author_email = mohamed.alyabbas@outlook.com 6 | description = This is the package for DenMune Clustering Algorithm published in paper https://doi.org/10.1016/j.patcog.2020.107589 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/egy1st/denmune-clustering-algorithm 10 | project_urls = 11 | Bug Tracker = https://github.com/egy1st/denmune-clustering-algorithm/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: BSD License 15 | Operating System :: OS Independent 16 | 17 | [options] 18 | package_dir = 19 | = src 20 | packages = find: 21 | python_requires = >=3.6 22 | 23 | [options.packages.find] 24 | where = src 25 |
-------------------------------------------------------------------------------- /PyPi Package/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | install_requires=[ 5 | 6 | 'numpy==1.23.5', 7 | 'pandas==1.5.3', 8 | 'matplotlib==3.7.2', 9 | 'scikit-learn==1.2.2', 10 | 'seaborn==0.12.2', 11 | 'ngt==2.0.4', 12 | 'anytree==2.8', 13 | 'treelib==1.6.1', 14 | ] 15 | 16 | ) 17 | -------------------------------------------------------------------------------- /PyPi Package/src/denmune.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: denmune 3 | Version: 0.0.96 4 | Summary: This is the package for DenMune Clustering Algorithm published in paper https://doi.org/10.1016/j.patcog.2020.107589 5 | Home-page: https://github.com/egy1st/denmune-clustering-algorithm 6 | Author: Mohamed Ali Abbas 7 | Author-email: mohamed.alyabbas@outlook.com 8 | License: UNKNOWN 9 | Project-URL: Bug Tracker, https://github.com/pypa/sampleproject/issues 10 | Platform: UNKNOWN 11 | Classifier: Programming Language :: Python :: 3 12 | Classifier: License :: OSI Approved :: BSD License 13 | Classifier: Operating System :: OS Independent 14 | Requires-Python: >=3.6 15 | Description-Content-Type: text/markdown 16 | License-File: LICENSE 17 | 18 | DenMune: A density-peak clustering algorithm 19 | ============================================= 20 | 21 | DenMune a clustering algorithm that can find clusters of arbitrary size, shapes and densities in two-dimensions. Higher dimensions are first reduced to 2-D using the t-sne. The algorithm relies on a single parameter K (the number of nearest neighbors). The results show the superiority of the algorithm. Enjoy the simplicity but the power of DenMune. 
22 | 23 | 24 | [![PyPI Version](https://img.shields.io/pypi/v/denmune.svg)]( https://pypi.org/project/denmune/) 25 | [![Launch notebook examples in Binder](https://static.mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/egy1st/denmune-clustering-algorithm/HEAD) 26 | [![Documentation Status](https://readthedocs.org/projects/denmune/badge/?version=latest)](https://denmune.readthedocs.io/en/latest/?badge=latest) 27 | [![Launch notebook examples in Colaboratory, Google Research]( https://colab.research.google.com/assets/colab-badge.svg)](#colab) 28 | [![Launch notebook examples in Kaggle, the workspace where data scientist meet](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/denmune-clustering-iris-dataset?scriptVersionId=84775816) 29 | [![Elsevier, journal's article publisher ](https://img.shields.io/badge/elsevier-published-orange)](https://www.sciencedirect.com/science/article/abs/pii/S0031320320303927) 30 | [![Research datasets at Mendeley ](https://img.shields.io/badge/mendeley-data-bluegreen)](https://data.mendeley.com/datasets/b73cw5n43r/4) 31 | [![BSD 3-Clause “New” or “Revised” License" ](https://img.shields.io/badge/license-BSD-green)](https://choosealicense.com/licenses/bsd-3-clause/) 32 | [![CircleCI, continuous integration](https://circleci.com/gh/egy1st/denmune-clustering-algorithm/tree/main.svg?style=shield)](https://circleci.com/gh/egy1st/denmune-clustering-algorithm/tree/main) 33 | [![codecov](https://codecov.io/gh/egy1st/denmune-clustering-algorithm/branch/main/graph/badge.svg?token=E2ZY0DSUM2)](https://codecov.io/gh/egy1st/denmune-clustering-algorithm) 34 | [![workflow for codecov](https://github.com/egy1st/denmune-clustering-algorithm/actions/workflows/python-package.yml/badge.svg)](https://github.com/egy1st/denmune-clustering-algorithm/actions/workflows/python-package.yml) 35 | 36 | Based on the paper 37 | ------------------- 38 | 39 | |Paper|Journal| 40 | 
|-------------------------------------------------------------------------------------------|-----------------------------| 41 | |Mohamed Abbas, Adel El-Zoghabi, Amin Shoukry, [![scimagojr](https://www.scimagojr.com/journal_img.php?id=24823)](https://www.scimagojr.com/journalsearch.php?q=24823&tip=sid&clean=0) 42 | |*DenMune: Density peak based clustering using mutual nearest neighbors* 43 | |In: Journal of Pattern Recognition, Elsevier, 44 | |volume 109, number 107589, January 2021 45 | |DOI: https://doi.org/10.1016/j.patcog.2020.107589 46 | 47 | Documentation: 48 | --------------- 49 | Documentation, including tutorials, is available at https://denmune.readthedocs.io 50 | 51 | [![read the documentation](https://img.shields.io/badge/read_the-docs-orange)](https://denmune.readthedocs.io/en/latest/?badge=latest) 52 | 53 | 54 | Watch it in action 55 | ------------------- 56 | These 30 seconds will show you how a density-based algorithm, DenMune, propagates: 57 | 58 | [![interact with the propagation](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1o-tP3uvDGjxBOGYkir1lnbr74sZ06e0U?usp=sharing) 59 | 60 | [![Propagation in DenMune](https://raw.githubusercontent.com/egy1st/denmune-clustering-algorithm/main/images/propagation.gif)]() 61 | 62 | 63 | 64 | When less means more 65 | -------------------- 66 | Most classic clustering algorithms fail to detect complex clusters, where clusters are of different sizes, shapes, and densities and exist in noisy data. 67 | Recently, a density-based algorithm named DenMune showed great ability in detecting complex shapes even in noisy data. It can detect the number of clusters automatically, and it automatically detects and removes both pre-identified and post-identified noise. 68 | 69 | It can reach an accuracy of 100% on most classic pattern problems and achieves 97% on the MNIST dataset. A great advantage of this algorithm is that it is a single-parameter algorithm.
All you need is to set number of k-nearest neighbor and the algorithm will care about the rest. Being Non-senstive to changes in k, make it robust and stable. 70 | 71 | Keep in mind, the algorithm reduce any N-D dataset to only 2-D dataset initially, so it is a good benefit of this algorithm is being always to plot your data and explore it which make this algorithm a good candidate for data exploration. Finally, the algorithm comes with neat package for visualizing data, validating it and analyze the whole clustering process. 72 | 73 | How to install DenMune 74 | ------------------------ 75 | Simply install DenMune clustering algorithm using pip command from the official Python repository 76 | 77 | [![PyPI Version](https://img.shields.io/pypi/v/denmune.svg)]( https://pypi.org/project/denmune/) 78 | 79 | From the shell run the command 80 | 81 | ```shell 82 | pip install denmune 83 | ``` 84 | 85 | From jupyter notebook cell run the command 86 | 87 | ```ipython3 88 | !pip install denmune 89 | ``` 90 | 91 | How to use DenMune 92 | -------------------- 93 | Once DenMune is installed, you just need to import it 94 | 95 | ```python 96 | from denmune import DenMune 97 | ``` 98 | ###### Please note that first denmune (the package) in small letters, while the other one(the class itself) has D and M in capital case. 
99 | 100 | 101 | Read data 102 | ----------- 103 | 104 | There are four possible cases of data: 105 | - only train data without labels 106 | - only labeld train data 107 | - labeled train data in addition to test data without labels 108 | - labeled train data in addition to labeled test data 109 | 110 | 111 | ```python 112 | #============================================= 113 | # First scenario: train data without labels 114 | # ============================================ 115 | 116 | data_path = 'datasets/denmune/chameleon/' 117 | dataset = "t7.10k.csv" 118 | data_file = data_path + dataset 119 | 120 | # train data without labels 121 | X_train = pd.read_csv(data_file, sep=',', header=None) 122 | 123 | knn = 39 # k-nearest neighbor, the only parameter required by the algorithm 124 | 125 | dm = DenMune(train_data=X_train, k_nearest=knn) 126 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=True) 127 | 128 | ``` 129 | This is an intutive dataset which has no groundtruth provided 130 | 131 | ![t710](https://raw.githubusercontent.com/egy1st/images/main/clustering/t710.png) 132 | 133 | ```python 134 | #============================================= 135 | # Second scenario: train data with labels 136 | # ============================================ 137 | 138 | data_path = 'datasets/denmune/shapes/' 139 | dataset = "aggregation.csv" 140 | data_file = data_path + dataset 141 | 142 | # train data with labels 143 | X_train = pd.read_csv(data_file, sep=',', header=None) 144 | y_train = X_train.iloc[:, -1] 145 | X_train = X_train.drop(X_train.columns[-1], axis=1) 146 | 147 | knn = 6 # k-nearest neighbor, the only parameter required by the algorithm 148 | 149 | dm = DenMune(train_data=X_train, train_truth= y_train, k_nearest=knn) 150 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=True) 151 | ``` 152 | Datset groundtruth 153 | 154 | ![aggregation 
groundtruth](https://raw.githubusercontent.com/egy1st/images/main/clustering/aggregation_ground.png) 155 | 156 | Datset as detected by DenMune at k=6 157 | 158 | ![aggregation train](https://raw.githubusercontent.com/egy1st/images/main/clustering/aggregation_6.png) 159 | 160 | 161 | ```python 162 | #================================================================= 163 | # Third scenario: train data with labels in addition to test data 164 | # ================================================================ 165 | 166 | data_path = 'datasets/denmune/pendigits/' 167 | file_2d = data_path + 'pendigits-2d.csv' 168 | 169 | # train data with labels 170 | X_train = pd.read_csv(data_path + 'train.csv', sep=',', header=None) 171 | y_train = X_train.iloc[:, -1] 172 | X_train = X_train.drop(X_train.columns[-1], axis=1) 173 | 174 | # test data without labels 175 | X_test = pd.read_csv(data_path + 'test.csv', sep=',', header=None) 176 | X_test = X_test.drop(X_test.columns[-1], axis=1) 177 | 178 | knn = 50 # k-nearest neighbor, the only parameter required by the algorithm 179 | 180 | dm = DenMune(train_data=X_train, train_truth= y_train, 181 | test_data= X_test, 182 | k_nearest=knn) 183 | labels, validity = dm.fit_predict(show_analyzer=True, show_noise=True) 184 | ``` 185 | dataset groundtruth 186 | 187 | ![pendigits groundtruth](https://raw.githubusercontent.com/egy1st/images/main/clustering/pendigits_ground.png) 188 | 189 | 190 | dataset as detected by DenMune at k=50 191 | 192 | ![pendigits train](https://raw.githubusercontent.com/egy1st/images/main/clustering/pendigits_50.png) 193 | 194 | test data as predicted by DenMune on training the dataset at k=50 195 | 196 | ![pendigits test](https://raw.githubusercontent.com/egy1st/images/main/clustering/pendigits_test_50.png) 197 | 198 | 199 | Algorithm's Parameters 200 | ----------------------- 201 | 1. 
Parameters used within the initialization of the DenMune class 202 | 203 | ```python 204 | def __init__ (self, 205 | train_data=None, test_data=None, 206 | train_truth=None, test_truth=None, 207 | file_2d ='_temp_2d', k_nearest=10, 208 | rgn_tsne=False, prop_step=0, 209 | ): 210 | ``` 211 | 212 | - train_data: 213 | - data used for training the algorithm 214 | - default: None. It should be provided by the user, otherwise an error will be raised. 215 | 216 | - train_truth: 217 | - labels of training data 218 | - default: None 219 | 220 | - test_data: 221 | - data used for testing the algorithm 222 | - default: None 223 | - test_truth: 224 | - labels of testing data 225 | - default: None 226 | 227 | - k_nearest: 228 | - number of nearest neighbors 229 | - default: 10. It should be provided by the user. 230 | 231 | - rgn_tsne: 232 | - when set to True: It will regenerate the reduced 2-D version of the N-D dataset each time the algorithm runs. 233 | - when set to False: It will generate the reduced 2-D version of the N-D dataset the first time only, then reuse the existing saved file 234 | - default: False 235 | 236 | - file_2d: name (including location) of the file used to save/load the reduced 2-D version 237 | - if empty: the algorithm will create a temporary file named '_temp_2d' 238 | - default: _temp_2d 239 | 240 | - prop_step: 241 | - size of increment used in showing the clustering propagation. 242 | - leave this parameter set to 0, the default value, unless you intentionally want to enter the propagation mode. 243 | - default: 0 244 | 245 | 246 | 2. Parameters used within the fit_predict function: 247 | 248 | ```python 249 | def fit_predict(self, 250 | validate=True, 251 | show_plots=True, 252 | show_noise=True, 253 | show_analyzer=True 254 | ): 255 | ``` 256 | 257 | - validate: 258 | - validate data on/off according to five measures integrated with DenMune (Accuracy,
F1-score, NMI index, AMI index, ARI index) 259 | - default: True 260 | 261 | - show_plots: 262 | - show/hide plotting of data 263 | - default: True 264 | 265 | - show_noise: 266 | - show/hide noise and outliers 267 | - default: True 268 | 269 | - show_analyzer: 270 | - show/hide the analyzer 271 | - default: True 272 | 273 | The Analyzer 274 | ------------- 275 | 276 | The algorithm provides an intuitive tool called the analyzer; once called, it will provide you with an in-depth analysis of how your clustering results perform. 277 | 278 | ![DenMune Analyzer](https://raw.githubusercontent.com/egy1st/images/main/clustering/analyzer.png) 279 | 280 | Noise Detection 281 | ---------------- 282 | 283 | DenMune detects noise and outliers automatically; no further work is needed on your side. 284 | 285 | - It plots pre-identified noise in black 286 | - It plots post-identified noise in light grey 287 | 288 | You can hide the noise by setting the show_noise parameter to False. 289 | 290 | 291 | ```python 292 | 293 | # let us show noise 294 | 295 | dm = DenMune(train_data=X_train, k_nearest=knn) 296 | labels, validity = dm.fit_predict(show_noise=True) 297 | ``` 298 | 299 | ```python 300 | 301 | # let us show clean data by removing noise 302 | 303 | dm = DenMune(train_data=X_train, k_nearest=knn) 304 | labels, validity = dm.fit_predict(show_noise=False) 305 | ``` 306 | 307 | | noisy data | clean data | 308 | ----------| ---------------------------------------------------------------------------------------------------| 309 | | ![noisy data](https://raw.githubusercontent.com/egy1st/images/main/clustering/noisy_data.png) | ![clean data](https://raw.githubusercontent.com/egy1st/images/main/clustering/clean_data.png) | 310 | 311 | 312 | Validation 313 | -------------- 314 | You can get your validation results using 3 methods 315 | 316 | - by showing the Analyzer 317 | - extract values from the validity list returned by the fit_predict function 318 | - extract values from the Analyzer dictionary 319 | 320 | There
are five validity measures built-in the algorithm, which are: 321 | 322 | - ACC, Accuracy 323 | - F1 score 324 | - NMI index (Normalized Mutual Information) 325 | - AMI index (Adjusted Mutual Information) 326 | - ARI index (Adjusted Rand Index) 327 | 328 | ![Validation snapshot](https://raw.githubusercontent.com/egy1st/images/main/clustering/validation.png) 329 | 330 | K-nearest Evolution 331 | ------------------- 332 | The following chart shows the evolution of pre and post identified noise in correspondence to increase of number of knn. Also, detected number of clusters is analyzed in the same chart in relation with both types of identified noise. 333 | 334 | ![knn evolution chart](https://raw.githubusercontent.com/egy1st/images/main/clustering/knn_vs_noise.png) 335 | 336 | 337 | The Scalability 338 | ---------------- 339 | | data size | time | 340 | |------------------| ------------------- | 341 | | data size: 5000 | time: 2.3139 seconds | 342 | | data size: 10000 | time: 5.8752 seconds | 343 | | data size: 15000 | time: 12.4535 seconds | 344 | | data size: 20000 | time: 18.8466 seconds | 345 | | data size: 25000 | time: 28.992 seconds | 346 | | data size: 30000 | time: 39.3166 seconds | 347 | | data size: 35000 | time: 39.4842 seconds | 348 | | data size: 40000 | time: 63.7649 seconds | 349 | | data size: 45000 | time: 73.6828 seconds | 350 | | data size: 50000 | time: 86.9194 seconds | 351 | | data size: 55000 | time: 90.1077 seconds | 352 | | data size: 60000 | time: 125.0228 seconds | 353 | | data size: 65000 | time: 149.1858 seconds | 354 | | data size: 70000 | time: 177.4184 seconds | 355 | | data size: 75000 | time: 204.0712 seconds | 356 | | data size: 80000 | time: 220.502 seconds | 357 | | data size: 85000 | time: 251.7625 seconds | 358 | | data size: 100000 | time: 257.563 seconds | 359 | 360 | | ![noisy data chart](https://raw.githubusercontent.com/egy1st/images/main/clustering/scalability.png) 361 | 362 | The Stability 363 | -------------- 364 | 365 
| The algorithm has only a single parameter and, even better, it is not sensitive to changes in that parameter, k. You can see that for yourself in the following chart. This is of great benefit for you as a data exploration analyst. You can simply explore the dataset using an arbitrary k. Being non-sensitive to changes in k makes it robust and stable. 366 | 367 | ![DenMune Stability chart](https://raw.githubusercontent.com/egy1st/images/main/clustering/stability.png) 368 | 369 | 370 | Reveal the propagation 371 | ----------------------- 372 | 373 | One of the top features of this algorithm is that it lets you watch how your clusters propagate to construct the final output clusters. 374 | Just use the parameter 'prop_step' as in the following example: 375 | 376 | ```python 377 | dataset = "t7.10k" # 378 | data_path = 'datasets/denmune/chameleon/' 379 | 380 | # train file 381 | data_file = data_path + dataset +'.csv' 382 | X_train = pd.read_csv(data_file, sep=',', header=None) 383 | 384 | 385 | from itertools import chain 386 | 387 | # DenMune's parameters 388 | knn = 39 # number of k-nearest neighbors, the only parameter required by the algorithm 389 | 390 | # create list of different snapshots of the propagation 391 | snapshots = chain(range(2,5), range(5,50,10), range(50, 100, 25), range(100,500,100), range(500,2000, 250), range(1000,5500, 500)) 392 | 393 | from IPython.display import clear_output 394 | for snapshot in snapshots: 395 | print ("iteration", snapshot ) 396 | clear_output(wait=True) 397 | dm = DenMune(train_data=X_train, k_nearest=knn, rgn_tsne=False, prop_step=snapshot) 398 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=False) 399 | ``` 400 | 401 | [![Propagation in DenMune](https://raw.githubusercontent.com/egy1st/denmune-clustering-algorithm/main/images/propagation.gif)]() 402 | 403 | Interact with the algorithm 404 | --------------------------- 405 | [![chameleon
datasets](https://raw.githubusercontent.com/egy1st/denmune-clustering-algorithm/main/images/chameleon_detection.png)](https://colab.research.google.com/drive/1EUROd6TRwxW3A_XD3KTxL8miL2ias4Ue?usp=sharing) 406 | 407 | This notebook allows you to interact with the algorithm in many aspects: 408 | - you can choose which dataset to cluster (among 4 chameleon datasets) 409 | - you can decide how many k-nearest neighbors to use 410 | - show noise on/off; thus you can investigate noise detected by the algorithm 411 | - show analyzer on/off 412 | 413 | How to run and test 414 | -------------------- 415 | 416 | 1. Launch Examples in Repo2Docker Binder 417 | 418 | Simply use our repo2docker offered by mybinder.org, which encapsulates the algorithm and all required data in one virtual machine instance. All Jupyter notebook examples found in this repository will also be available for you to practice with in this repo2docker. Thanks mybinder.org, you made it possible! 419 | 420 | [![Launch notebook examples in Binder](https://static.mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/egy1st/denmune-clustering-algorithm/HEAD) 421 | 422 | 2. Launch each Example in Kaggle workspace 423 | 424 | If you are a kaggler like me, then Kaggle, the best workspace where data scientists meet, should be a great place for you to test the algorithm.
425 | 426 | | Dataset | Kaggle URL | 427 | ----------| ---------------------------------------------------------------------------------------------------| 428 | |When less means more - kaggle |[![When less means more - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)]( https://www.kaggle.com/egyfirst/when-less-means-more) | 429 | |Non-groundtruth datasets - kaggle|[![Non-groundtruth datasets](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/detecting-non-groundtruth-datasets) | 430 | |2D Shape datasets - kaggle|[![2D Shape datasets - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/detection-of-2d-shape-datasets) | 431 | |MNIST dataset kaggle|[![MNIST dataset - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/get-97-using-simple-yet-one-parameter-algorithm) | 432 | |Iris dataset kaggle| [![iris dataset - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/denmune-clustering-iris-dataset) | 433 | |Training MNIST to get 97%| [![Training MNIST to get 97%](https://kaggle.com/static/images/open-in-kaggle.svg)]( https://www.kaggle.com/egyfirst/training-mnist-dataset-to-get-97) | 434 | |Noise detection - kaggle| [![Noise detection - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)]( https://www.kaggle.com/egyfirst/noise-detection) | 435 | |Validation - kaggle| [![Validation - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/validate-in-5-built-in-validity-insexes) | 436 | |The beauty of propagation - kaggle| [![The beauty of propagation - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/the-beauty-of-clusters-propagation) | 437 | |The beauty of propagation part2 - kaggle | [![The beauty of propagation part 2 - 
kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/the-beauty-of-propagation-part2) | 438 | |Snapshots of propagation -kaggle| [![The beauty of propagation - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/beauty-of-propagation-part3) | 439 | |Scalability kaggle| [![Scalability - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/scalability-vs-speed) | 440 | |Stability - kaggle| [![Stability - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/stability-vs-number-of-nearest-neighbor) | 441 | |k-nearest-evolution - kaggle| [![k-nearest-evolution - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/egyfirst/k-nearest-evolution) | 442 | 443 | 3. Launch each Example in Google Research, CoLab 444 | 445 | Need to test the examples one by one? Then here is another option: use Colab, offered by Google Research, to test each example individually.
446 | 447 | 448 | 449 | Here is a list of Google CoLab URL to use the algorithm interactively 450 | ---------------------------------------------------------------------- 451 | 452 | 453 | | Dataset | CoLab URL | 454 | ----------| ---------------------------------------------------------------------------------------------------| 455 | |How to use it - colab|[![How to use it - colab](https://colab.research.google.com/assets/colab-badge.svg)]( https://colab.research.google.com/drive/1J_uKdhZ3z1KeY0-wJ7Ruw2PZSY1orKQm)| 456 | |Chameleon datasets - colab|[![Chameleon datasets - colab]( https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1EUROd6TRwxW3A_XD3KTxL8miL2ias4Ue?usp=sharing) | 457 | |2D Shape datasets - colab|[![2D Shape datasets - colab]( https://colab.research.google.com/assets/colab-badge.svg)]( https://colab.research.google.com/drive/1EaqTPCRHSuTKB-qEbnWHpGKFj6XytMIk?usp=sharing) | 458 | |MNIST dataset - colab|[![MNIST dataset - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1a9FGHRA6IPc5jhLOV46iEbpUeQXptSJp?usp=sharing) | 459 | |iris dataset - colab|[![iris dataset - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKql57Xh7xVVu6NpTbg3vRdRg42R7hjm?usp=sharing) | 460 | |Get 97% by training MNIST dataset - colab|[![Get 97% by training MNIST dataset - colab](https://colab.research.google.com/assets/colab-badge.svg)]( https://colab.research.google.com/drive/1NeOtXEQY94oD98Ufbh3IhTHnnYwIA659) | 461 | |Non-groundtruth datasets - colab|[![Non-groundtruth datasets - colab](https://colab.research.google.com/assets/colab-badge.svg)]( https://colab.research.google.com/drive/1d17ejQ83aUy0CZIeQ7bHTugSC9AjJ2mU?usp=sharing) | 462 | |Noise detection - colab|[![Noise detection - colab](https://colab.research.google.com/assets/colab-badge.svg)]( 
https://colab.research.google.com/drive/1Bp3c-cJfjLWxupmrBJ_6Q4-nqIfZcII4) | 463 | |Validation - colab|[![Validation - colab](https://colab.research.google.com/assets/colab-badge.svg)]( https://colab.research.google.com/drive/13_EVaQOv_QiNmQiMWJAcFFHPJHGCrQLe) | 464 | |How it propagates - colab|[![How it propagates - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1o-tP3uvDGjxBOGYkir1lnbr74sZ06e0U?usp=sharing)| 465 | |Snapshots of propagation - colab|[![snapshots of the propagation - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1vPXNKa8Rf3TnqDHSD3YSWl3g1iNSqjl2?usp=sharing)| 466 | |Scalability - colab|[![Scalability - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1d55wkBndLLapO7Yx1ePHhE8mL61j9-TH?usp=sharing)| 467 | |Stability vs number of nearest neighbors - colab|[![Stability vs number of nearest neighbors - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/17VgVRMFBWvkSIH1yA3tMl6UQ7Eu68K2l?usp=sharing)| 468 | |k-nearest-evolution - colab|[![k-nearest-evolution - colab](https://colab.research.google.com/assets/colab-badge.svg)]( https://colab.research.google.com/drive/1DZ-CQPV3WwJSiaV3-rjwPwmXw4RUh8Qj)| 469 | 470 | 471 | 472 | How to cite 473 | ===== 474 | If you have used this codebase in a scientific publication and wish to cite it, please use the [Journal of Pattern Recognition article](https://www.sciencedirect.com/science/article/abs/pii/S0031320320303927) 475 | 476 | Mohamed Abbas, Adel El-Zoghabi, Amin Shoukry, *DenMune: Density peak based clustering using mutual nearest neighbors* 477 | In: Journal of Pattern Recognition, Elsevier, volume 109, number 107589.
478 | January 2021 479 | 480 | 481 | ```bib 482 | @article{ABBAS2021107589, 483 | title = {DenMune: Density peak based clustering using mutual nearest neighbors}, 484 | journal = {Pattern Recognition}, 485 | volume = {109}, 486 | pages = {107589}, 487 | year = {2021}, 488 | issn = {0031-3203}, 489 | doi = {https://doi.org/10.1016/j.patcog.2020.107589}, 490 | url = {https://www.sciencedirect.com/science/article/pii/S0031320320303927}, 491 | author = {Mohamed Abbas and Adel El-Zoghabi and Amin Shoukry}, 492 | keywords = {Clustering, Mutual neighbors, Dimensionality reduction, Arbitrary shapes, Pattern recognition, Nearest neighbors, Density peak}, 493 | abstract = {Many clustering algorithms fail when clusters are of arbitrary shapes, of varying densities, or the data classes are unbalanced and close to each other, even in two dimensions. A novel clustering algorithm “DenMune” is presented to meet this challenge. It is based on identifying dense regions using mutual nearest neighborhoods of size K, where K is the only parameter required from the user, besides obeying the mutual nearest neighbor consistency principle. The algorithm is stable for a wide range of values of K. Moreover, it is able to automatically detect and remove noise from the clustering process as well as detecting the target clusters. It produces robust results on various low and high dimensional datasets relative to several known state of the art clustering algorithms.} 494 | } 495 | ``` 496 | 497 | Licensing 498 | ------------ 499 | 500 | The DenMune algorithm is 3-clause BSD licensed. Enjoy. 
501 | 502 | [![BSD 3-Clause “New” or “Revised” License](https://img.shields.io/badge/license-BSD-green)](https://choosealicense.com/licenses/bsd-3-clause/) 503 | 504 | 505 | Task List 506 | ------------ 507 | 508 | - [x] Update Github with the DenMune source code 509 | - [x] create repo2docker repository 510 | - [x] Create pip Package 511 | - [x] create CoLab shared examples 512 | - [x] create documentation 513 | - [x] create Kaggle shared examples 514 | - [x] PEP8 compliant 515 | - [x] Continuous integration 516 | - [x] scikit-learn compatible 517 | - [X] Unit tests (coverage: 97%) 518 | - [ ] create conda package 519 | 520 | 521 | 522 | -------------------------------------------------------------------------------- /PyPi Package/src/denmune.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | MANIFEST.in 3 | README.md 4 | pyproject.toml 5 | setup.cfg 6 | setup.py 7 | src/denmune/__init__.py 8 | src/denmune/denmune.py 9 | src/denmune.egg-info/PKG-INFO 10 | src/denmune.egg-info/SOURCES.txt 11 | src/denmune.egg-info/dependency_links.txt 12 | src/denmune.egg-info/requires.txt 13 | src/denmune.egg-info/top_level.txt -------------------------------------------------------------------------------- /PyPi Package/src/denmune.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /PyPi Package/src/denmune.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.18.5 2 | pandas>=1.0.3 3 | matplotlib>=3.2.1 4 | scikit-learn>=0.22.1 5 | seaborn>=0.10.1 6 | ngt>=1.11.6 7 | anytree>=2.8.0 8 | treelib>=1.6.1 9 | -------------------------------------------------------------------------------- /PyPi Package/src/denmune.egg-info/top_level.txt:
-------------------------------------------------------------------------------- 1 | denmune 2 | -------------------------------------------------------------------------------- /PyPi Package/src/denmune/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /PyPi Package/src/denmune/.idea/.name: -------------------------------------------------------------------------------- 1 | denmune.py -------------------------------------------------------------------------------- /PyPi Package/src/denmune/.idea/denmune.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /PyPi Package/src/denmune/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /PyPi Package/src/denmune/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /PyPi Package/src/denmune/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /PyPi Package/src/denmune/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .denmune import DenMune -------------------------------------------------------------------------------- /PyPi Package/src/denmune/denmune.py: -------------------------------------------------------------------------------- 1 | # 
==================================================================================================================== 2 | # About the source code and the associated published paper 3 | # ==================================================================================================================== 4 | # This is the source code of DenMune Clustering Algorithm accompanied with the experimental work 5 | # which is published in Elsevier Pattern Recognition, Volume 109, January 2021 6 | # The paper (article no. 107589) can be accessed from https://doi.org/10.1016/j.patcog.2020.107589 7 | # source code and several examples on using it, can be accessed from 8 | # GitHub's repository at https://github.com/egy1st/denmune-clustering-algorithm 9 | # Authors: Mohamed Abbas, Adel El-Zoghabi, and Amin Shoukry 10 | # Edition 0.0.2.3 Released 29-12-2021 11 | # PyPi package installation from https://pypi.org/project/denmune/ 12 | # ==================================================================================================================== 13 | 14 | 15 | # ==================================================================================================================== 16 | # About the DenMune Algorithm 17 | # ==================================================================================================================== 18 | # DenMune Clustering Algorithm's Highlights 19 | # DenMune is a clustering algorithm that can find clusters of arbitrary size, shapes and densities in two-dimensions. 20 | # Higher dimensions are first reduced to 2-D using the t-sne. 21 | # The algorithm relies on a single parameter K (the number of nearest neighbors). 22 | # The results show the superiority of DenMune.
23 | # ===================================================================================================================== 24 | 25 | 26 | # ===================================================================================================================== 27 | # About me 28 | # ===================================================================================================================== 29 | # Name: Mohamed Ali Abbas 30 | # Egypt - Alexandria - Smouha 31 | # Cell-phone: +20-01007500290 32 | # Personal E-mail: mohamed.alyabbas@outlook.com 33 | # Business E-meal: 01@zerobytes.one 34 | # website: https://zerobytes.one 35 | # LinkedIn: https://www.linkedin.com/in/mohabbas/ 36 | # Github: https://github.com/egy1st 37 | # Kaggle: https://www.kaggle.com/egyfirst 38 | # Udemy: https://www.udemy.com/user/mohammad-ali-abbas/ 39 | # Facebook: https://www.facebook.com/ZeroBytes.One 40 | # ===================================================================================================================== 41 | 42 | import operator 43 | import os.path 44 | import time 45 | 46 | import matplotlib.pyplot as plt 47 | import ngtpy 48 | import numpy as np 49 | import pandas as pd 50 | import seaborn as sns 51 | from anytree import Node 52 | from numpy import genfromtxt 53 | from sklearn.manifold import TSNE 54 | from treelib import Tree as tr 55 | 56 | sns.set_context('poster') 57 | sns.set_color_codes() 58 | plot_kwds = {'alpha': 0.99, 's': 80, 'linewidths': 0} 59 | 60 | 61 | # import for possible needs 62 | # from sklearn.metrics import confusion_matrix 63 | # from sklearn import metrics 64 | # import sklearn.cluster as cluster 65 | 66 | 67 | class DataPoint(): 68 | 69 | def __init__(self, id): 70 | self.point_id = id 71 | self.class_id = 0 # 0 not clustered but -1 means a noise 72 | self.refer_to = [] 73 | self.referred_by = [] 74 | self.reference = [] 75 | self.visited = False 76 | self.homogeneity = 0 77 | 78 | 79 | class DenMune(): 80 | 81 | def __init__(self, 82 | 
train_data=None, test_data=None, 83 | train_truth=None, test_truth=None, 84 | file_2d=None, k_nearest=0, 85 | rgn_tsne=False, prop_step=0, 86 | ): 87 | 88 | if train_data is None: 89 | raise Exception("No data is provided. At least train data should be provided. Set train_data argmunt properly.") 90 | else: 91 | self.data_indicator = 1 92 | 93 | if train_truth is not None: 94 | self.data_indicator += 2 95 | 96 | if test_data is not None: 97 | self.data_indicator += 4 98 | 99 | if test_truth is not None: 100 | self.data_indicator += 8 101 | 102 | if train_data is not None and train_truth is None and test_truth is not None: 103 | raise Exception("you should provide labels for your traing data to be allowed to work with test data. Set train_truth argmunt properly.") 104 | if train_data is not None and train_truth is None and test_data is not None : 105 | raise Exception("you should provide labels for your traing data to be allowed to work with test data. Set train_truth argmunt properly.") 106 | if train_data is not None and train_truth is not None and test_truth is not None and test_data is None: 107 | raise Exception("Although labels of testing data is provided, the test data itself isnot. 
Set test_data argument properly.") 108 | 109 | self.analyzer = {} 110 | self.analyzer['n_points'] = {} 111 | if isinstance(train_data, pd.DataFrame): 112 | train_data = train_data.to_numpy() 113 | train_data = train_data.copy(order='C') 114 | if isinstance(test_data, pd.DataFrame): 115 | test_data = test_data.to_numpy() 116 | test_data = test_data.copy(order='C') 117 | if isinstance(train_truth, pd.Series): 118 | train_truth = train_truth.to_numpy() 119 | train_truth = train_truth.copy(order='C') 120 | if isinstance(test_truth, pd.Series): 121 | test_truth = test_truth.to_numpy() 122 | test_truth = test_truth.copy(order='C') 123 | 124 | self.train_sz = len(train_data) 125 | 126 | if test_data is not None: 127 | data = np.append(train_data, test_data, axis=0) 128 | self.test_sz = len(test_data) 129 | else: 130 | self.test_sz = 0 131 | data = train_data 132 | 133 | if test_truth is not None: 134 | self.labels_truth = np.append(train_truth, test_truth, axis=0) 135 | else: 136 | self.labels_truth = train_truth 137 | 138 | self.analyzer["n_points"]["size"] = len( 139 | data) # data.shape[0] # this will changed in preplot when we plot only train or test data 140 | 141 | self.analyzer['exec_time'] = {} 142 | self.analyzer["exec_time"]["t_SNE"] = 0 143 | self.analyzer['n_points']["noise"] = {} 144 | self.analyzer["n_points"]["noise"]["type-1"] = 0 145 | self.analyzer["n_points"]["noise"]["type-2"] = 0 146 | self.analyzer['n_points']["weak"] = {} 147 | self.analyzer["n_points"]["weak"]["all"] = 0 148 | self.analyzer["n_points"]["weak"]["succeeded to merge"] = 0 149 | self.analyzer["n_points"]["weak"]["failed to merge"] = 0 150 | self.analyzer["n_points"]["dim"] = data.shape[1] 151 | self.analyzer["n_clusters"] = {} 152 | self.analyzer["n_clusters"]["actual"] = 0 153 | self.analyzer["n_clusters"]["detected"] = 0 154 | # self.delimiter = delimiter 155 | self.debuger = {} 156 | 157 | if k_nearest == 0: 158 | raise Exception("k-nearest neighbor should be at least 1") 159 | 160 
| if file_2d is None: 161 | file_2d = '_temp_2d' 162 | 163 | if data.shape[1] != 2 and file_2d == '_temp_2d': 164 | # raise Exception("Sorry, this is N-D dataset, file-2d parameter should not be empty") 165 | start = time.time() 166 | self.generate_tsne(data, 2, file_2d='_temp_2d') 167 | end = time.time() 168 | self.analyzer["exec_time"]["t_SNE"] = end - start 169 | data = genfromtxt(file_2d, delimiter=',') 170 | elif data.shape[1] != 2 and file_2d != '_temp_2d': 171 | if not os.path.isfile(file_2d) or rgn_tsne == True: 172 | start = time.time() 173 | self.generate_tsne(data, 2, file_2d) 174 | end = time.time() 175 | self.analyzer["exec_time"]["t_SNE"] = end - start 176 | data = genfromtxt(file_2d, delimiter=',') 177 | 178 | start_time = time.time() 179 | 180 | self.alg_name = 'denmune' 181 | self.prop_step = prop_step 182 | self.data = data 183 | self.train_data = train_data 184 | self.test_data = test_data 185 | self.dp_count = self.data.shape[0] 186 | self.dp_dim = self.data.shape[1] 187 | self.k_nearest = k_nearest 188 | self.dp_dis = [] 189 | self.train_truth = train_truth 190 | self.test_truth = test_truth 191 | 192 | self.DataPoints = [] 193 | self.ClassPoints = {} 194 | self.KernelPoints = [] 195 | 196 | self.init_DataPoints() 197 | self.kd_NGT() 198 | self.load_DataPoints() # load_DataPoints must come after kd_NGT() 199 | self.compute_Links() 200 | # self.semi_init_DataPoints #it is useful with csharp and CNune only 201 | self.find_Noise() 202 | self.sort_DataPoints() 203 | self.prepare_Clusters() 204 | self.attach_Points() 205 | 206 | end_time = time.time() 207 | self.analyzer["exec_time"]["DenMune"] = end_time - start_time 208 | 209 | return None # __init__ should return None 210 | 211 | def kd_NGT(self): 212 | 213 | if len(self.dp_dis) == 0: 214 | 215 | ngtpy.create(b"tmp", self.dp_dim) 216 | index = ngtpy.Index(b"tmp") 217 | index.batch_insert(self.data) 218 | index.save() 219 | 220 | k = self.k_nearest 221 | start = time.time() 222 | self.dp_dis = [] 
223 | for i in range(self.dp_count): 224 | query = self.data[i] 225 | result = index.search(query, k + 1)[1:] # we skip first distance from a point to itself 226 | self.dp_dis.append(result) 227 | 228 | end = time.time() 229 | self.analyzer["exec_time"]["NGT"] = end - start 230 | 231 | def getValue(self, dic, what, who, other=False): 232 | 233 | if what == 'max' and who == 'value' and other == True: 234 | val = max(dic.items(), key=operator.itemgetter(1))[0] # max value==>key 235 | # these cases will never be used here but keep them for future use. 236 | """" 237 | elif what == 'max' and who == 'key' and other == False: 238 | val = max(dic.items(), key=operator.itemgetter(0))[0] # max key 239 | elif what == 'max' and who == 'key' and other == True: 240 | val = max(dic.items(), key=operator.itemgetter(0))[1] # max key==>Value 241 | elif what == 'max' and who == 'value' and other == False: 242 | val = max(dic.items(), key=operator.itemgetter(1))[1] # max value 243 | """ 244 | return val 245 | 246 | def init_DataPoints(self): 247 | 248 | self.DataPoints = [] 249 | self.KernelPoints = [] 250 | 251 | for i in range(self.dp_count): 252 | dp = DataPoint(i) 253 | # no need since datapoint is initialised with these values 254 | """ 255 | dp.refer_to = [] 256 | dp.referred_by = [] 257 | dp.reference = [] 258 | dp.class_id = 0 259 | dp.visited = False 260 | dp.homogeneity = 0.0 261 | """ 262 | self.DataPoints.append(dp) 263 | return 0 264 | 265 | """ 266 | this function is useful with csharp and CNune only 267 | 268 | def semi_init_DataPoints(self): 269 | 270 | for dp in self.DataPoints: 271 | dp.visited = False 272 | dp.class_id = 0 273 | dp.homogeneity = 0 274 | return 0 275 | """ 276 | 277 | def find_Noise(self): 278 | 279 | self.ClassPoints[-1] = Node(-1, parent=None) 280 | self.ClassPoints[0] = Node(0, parent=None) 281 | 282 | for i in range(self.dp_count): 283 | dp = self.DataPoints[i] 284 | if len(dp.reference) == 0: 285 | dp.class_id = -1 286 | self.ClassPoints[i] = 
self.ClassPoints[-1] # Node(-1, parent=None) # this it is a noise 287 | else: # at least one point 288 | dp.class_id = 0 # this is allready set initally 289 | self.ClassPoints[i] = self.ClassPoints[0] # Node(0, parent=None) # this it is a non-clustered point 290 | # where -1 is noise and 0 is non-clustered 291 | return 0 292 | 293 | def sort_DataPoints(self): 294 | 295 | for dp in self.DataPoints: 296 | if len(dp.reference) != 0: 297 | self.KernelPoints.append([dp.point_id, dp.homogeneity]) 298 | 299 | self.KernelPoints = self.sort_Tuple(self.KernelPoints, reverse=True) 300 | 301 | return 0 302 | 303 | def compute_Links(self): 304 | start = time.time() 305 | 306 | for i in range(self.dp_count): 307 | for pos in self.DataPoints[i].refer_to: 308 | 309 | for pos2 in self.DataPoints[i].referred_by: 310 | if pos[0] == pos2[0]: 311 | self.DataPoints[i].reference.append(pos) 312 | break 313 | 314 | self.analyzer["n_points"]["strong"] = 0 315 | for i in range(self.dp_count): 316 | self.DataPoints[i].referred_by = self.sort_Tuple(self.DataPoints[i].referred_by, reverse=False) 317 | if len(self.DataPoints[i].referred_by) >= self.k_nearest: 318 | self.analyzer["n_points"]["strong"] += 1 319 | else: 320 | self.analyzer["n_points"]["weak"]["all"] += 1 321 | 322 | self.DataPoints[i].reference = self.sort_Tuple(self.DataPoints[i].reference, reverse=False) 323 | homogeneity = (100 * len(self.DataPoints[i].referred_by)) + len(self.DataPoints[i].reference) 324 | self.DataPoints[i].homogeneity = homogeneity 325 | 326 | end = time.time() 327 | 328 | return 0 329 | 330 | def sort_Tuple(self, li, reverse=False): 331 | 332 | # reverse = None (Sorts in Ascending order) 333 | # key is set to sort using second element of 334 | # sublist lambda has been used 335 | li.sort(key=lambda x: x[1], reverse=reverse) 336 | return li 337 | 338 | def load_DataPoints(self): 339 | 340 | # initialize datapoints to its default values 341 | self.init_DataPoints() 342 | 343 | for i in range(self.dp_count): 
344 | result = self.dp_dis[i] 345 | for k, o in enumerate(result): 346 | # no need to this condition, it wont happen 347 | #if k >= self.k_nearest: 348 | # break 349 | 350 | # if k != 0: 351 | _dis = round(o[1], 6) 352 | _point = o[0] 353 | 354 | self.DataPoints[i].refer_to.append([_point, _dis]) 355 | self.DataPoints[_point].referred_by.append([i, _dis]) 356 | 357 | return 0 358 | 359 | def prepare_Clusters(self): 360 | start = time.time() 361 | class_id = 0 362 | 363 | itr = 0 364 | for dp_kern in self.KernelPoints: 365 | itr += 1 366 | if self.prop_step and self.prop_step <= itr: 367 | continue 368 | 369 | dp_core = self.DataPoints[dp_kern[0]] 370 | 371 | # remember no strong points & weak points in Tirann 372 | # all points with at least one refernce are considered (ignore noises) 373 | if len(dp_core.reference) > 0 and len(dp_core.referred_by) >= len(dp_core.refer_to): 374 | 375 | class_id += 1 376 | dp_core.visited = True 377 | dp_core.class_id = class_id 378 | self.ClassPoints[class_id] = Node(class_id, parent=None) 379 | max_class = -1 380 | weight_map = {} 381 | # Class_Points[class_id] = new TreeCls::Node(class_id) 382 | 383 | for pos2 in dp_core.reference: 384 | # if DataPoints[*pos2].visited && visited was tested not to affect on results, so you can ommit it 385 | if self.DataPoints[pos2[0]].class_id > 0 and len(self.DataPoints[pos2[0]].referred_by) >= len( 386 | self.DataPoints[pos2[0]].refer_to): 387 | 388 | # this condition is a must, as some points may be visited but not classified yet 389 | # maa we may neglect is noise as long as it is in our refernce points 390 | 391 | _cls = self.DataPoints[pos2[0]].class_id 392 | _class_id = self.ClassPoints[_cls].root.name 393 | # _class_id = _cls 394 | 395 | if _class_id not in weight_map.keys(): 396 | weight_map[_class_id] = 1 397 | else: 398 | weight_map[_class_id] += 1 399 | 400 | 401 | elif self.DataPoints[pos2[0]].visited == False: 402 | self.DataPoints[pos2[0]].visited = True # this point is visited but 
not classified yet 403 | 404 | while len(weight_map) > 0: 405 | # weight_no = self.getValue(dic=weight_map, what='max', who='value') # no need to it in DenMune 406 | max_class = self.getValue(dic=weight_map, what='max', who='value', other=True) 407 | 408 | if max_class != -1 and max_class != class_id: 409 | self.ClassPoints[max_class].parent = self.ClassPoints[class_id] 410 | 411 | del weight_map[max_class] 412 | 413 | for i in range(self.dp_count): 414 | clsid = self.DataPoints[i].class_id 415 | clsroot = self.ClassPoints[clsid].root.name 416 | self.DataPoints[i].class_id = clsroot 417 | 418 | if self.prop_step: 419 | # let us update class 0 to be -2 420 | for dp in self.DataPoints: 421 | if dp.class_id == 0: 422 | dp.class_id = -2 423 | 424 | end = time.time() 425 | 426 | return 0 427 | 428 | def attach_Points(self): 429 | 430 | start = time.time() 431 | olditr = 0 432 | newitr = -1 433 | while olditr != newitr: 434 | newitr = olditr 435 | olditr = 0 436 | 437 | for pos in self.KernelPoints: 438 | if self.DataPoints[pos[0]].class_id == 0: 439 | self.DataPoints[pos[0]].class_id = self.attach_StrongPoint(pos[0]) 440 | olditr += 1 441 | 442 | olditr = 0 443 | newitr = -1 444 | while olditr != newitr: 445 | newitr = olditr 446 | olditr = 0 447 | 448 | for pos in self.KernelPoints: 449 | if self.DataPoints[pos[0]].class_id == 0: 450 | self.DataPoints[pos[0]].class_id = self.attach_WeakPoint(pos[0]) 451 | olditr += 1 452 | 453 | end = time.time() 454 | 455 | # let us update class 0 to be -2 456 | for dp in self.DataPoints: 457 | if dp.class_id == 0: 458 | dp.class_id = -2 459 | 460 | def attach_StrongPoint(self, point_id): 461 | weight_map = {} 462 | max_class = 0 # max_class in attach point = 0 , thus if a point faild to merge with any cluster, it has one more time 463 | # to merge in attach weak point 464 | dp_core = self.DataPoints[point_id] 465 | if len(dp_core.reference) != 0: 466 | dp_core.visited = True 467 | 468 | for pos2 in dp_core.reference: 469 | 470 | if 
self.DataPoints[pos2[0]].visited == True and len(self.DataPoints[pos2[0]].referred_by) >= len( 471 | self.DataPoints[pos2[0]].refer_to): 472 | 473 | clsid = self.DataPoints[pos2[0]].class_id 474 | clsroot = self.ClassPoints[clsid].root.name 475 | self.DataPoints[pos2[0]].class_id = clsroot 476 | 477 | if clsroot not in weight_map.keys(): 478 | weight_map[clsroot] = 1 479 | else: 480 | weight_map[clsroot] += 1 481 | 482 | if len(weight_map) != 0: 483 | weight_map = dict(sorted(weight_map.items())) 484 | max_class = self.getValue(dic=weight_map, what='max', who='value', other=True) 485 | 486 | return max_class # this will return get_Root(max_class) as we computed earlier _class_id = get_Root(_cls) 487 | 488 | def attach_WeakPoint(self, point_id): 489 | 490 | weight_map = {} 491 | max_class = -1 # max_class in attach weak point = -1 , thus if a point faild to merge with any cluster it is a noise 492 | 493 | dp_core = self.DataPoints[point_id] 494 | if len(dp_core.reference) != 0: 495 | dp_core.visited = True 496 | 497 | for pos2 in dp_core.reference: 498 | 499 | if self.DataPoints[pos2[0]].visited == True: 500 | 501 | clsid = self.DataPoints[pos2[0]].class_id 502 | clsroot = self.ClassPoints[clsid].root.name 503 | self.DataPoints[pos2[0]].class_id = clsroot 504 | 505 | if clsroot not in weight_map.keys(): 506 | weight_map[clsroot] = 1 507 | else: 508 | weight_map[clsroot] += 1 509 | 510 | if len(weight_map) != 0: 511 | weight_map = dict(sorted(weight_map.items())) 512 | max_class = self.getValue(dic=weight_map, what='max', who='value', other=True) 513 | 514 | return max_class # this will return get_Root(max_class) as we computed earlier _class_id = get_Root(_cls) 515 | 516 | def fit_predict(self, 517 | validate=True, 518 | show_plots=True, 519 | show_noise=True, 520 | show_analyzer=True 521 | ): 522 | data_type = None 523 | validity_scores = [] 524 | solution_file = 'solution.txt' 525 | 526 | if os.path.isfile(solution_file): 527 | os.remove(solution_file) 528 | 529 | 
pred_list = [] 530 | for dp in self.DataPoints: 531 | pred_list.append(dp.class_id) 532 | 533 | with open(solution_file, 'w') as f: 534 | f.writelines("%s\n" % pred for pred in pred_list) 535 | 536 | labels_dic = {} 537 | self.train_pred = pred_list[:self.train_sz] 538 | self.test_pred = pred_list[self.train_sz:] 539 | 540 | if self.test_data is not None: 541 | self.labels_pred = np.append(self.train_pred, self.test_pred, axis=0) 542 | else: 543 | self.labels_pred = self.train_pred 544 | 545 | if self.prop_step > 0: 546 | print("Propagation at iteration:", self.prop_step) 547 | self.plot_clusters(show_plots=show_plots, show_noise=show_noise, data_type='train') 548 | 549 | if show_analyzer: 550 | self.show_Analyzer() 551 | 552 | return None, None 553 | 554 | else: 555 | if self.data_indicator >= 3: 556 | 557 | if show_analyzer: 558 | print("Plotting dataset Groundtruth") 559 | self.plot_clusters(show_plots=show_plots, show_noise=show_noise, data_type='ground') 560 | 561 | if validate and self.data_indicator >= 1: 562 | 563 | if self.data_indicator >= 3: 564 | self.analyzer["validity"] = {} 565 | self.analyzer["validity"]['train'] = {} 566 | validity_scores = self.validate_Clusters(data_type='train') 567 | 568 | if show_analyzer: 569 | print('Plotting train data') 570 | self.plot_clusters(show_plots=show_plots, show_noise=show_noise, data_type='train') 571 | if show_analyzer: 572 | self.show_Analyzer(root='Validating train data') 573 | 574 | if self.data_indicator == 15: 575 | validity_scores = self.validate_Clusters(data_type='test') 576 | if show_analyzer: 577 | # self.analyzer["validity"]['test'] = {} 578 | self.show_Analyzer(self.analyzer['validity']['test'], root='Validating test data') 579 | 580 | if self.data_indicator > 3: 581 | if show_analyzer: 582 | print('Plotting test data') 583 | self.plot_clusters(show_plots=show_plots, show_noise=show_noise, data_type='test') 584 | 585 | """" 586 | if self.data_indicator == 15: 587 | validity_scores = 
def match_Labels(self):
    """Rename predicted cluster ids so they line up with the ground-truth labels.

    The ground truth is split into runs of consecutive equal labels. Runs are
    visited from longest to shortest (ties keep their original order); for
    each run, the predicted id that occurs most often inside it is mapped to
    the run's true label, unless that predicted id was already claimed by an
    earlier (longer) run.

    All renames are collected first and applied in a single pass, so a newly
    assigned true label can never be confused with an untouched predicted id
    that happens to use the same number. Unclaimed, non-negative predicted
    ids that would clash with an assigned true label are moved to fresh ids
    so distinct clusters stay distinct. Negative ids are never moved by that
    clash-avoidance step.

    NOTE(review): as in the previous implementation, a negative (noise) id
    can still win a run's vote and be renamed to that run's true label --
    confirm whether that is intended.

    Side effects:
        Stores the result on ``self.labels_pred`` and re-slices
        ``self.train_pred`` / ``self.test_pred`` at ``self.train_sz``.

    Returns:
        np.ndarray: relabelled predictions with dtype ``int64``.
    """
    from collections import Counter
    from itertools import groupby

    # Plain Python lists make list and ndarray inputs behave the same.
    labels_true = np.asarray(self.labels_truth).tolist()
    labels_pred = np.asarray(self.labels_pred).tolist()

    # Run-length encode the ground truth as (start, label, length).
    runs = []
    start = 0
    for label, members in groupby(labels_true):
        length = sum(1 for _ in members)
        runs.append((start, label, length))
        start += length

    # Longest runs choose first; list.sort is stable, so ties keep file order.
    runs.sort(key=lambda run: run[2], reverse=True)

    mapping = {}
    for start, true_label, length in runs:
        votes = Counter(labels_pred[start:start + length])
        if not votes:
            # Ground truth is longer than the predictions: nothing to vote on.
            continue
        majority = votes.most_common(1)[0][0]
        if majority not in mapping:
            mapping[majority] = true_label

    # Keep unclaimed clusters apart from the labels that were just assigned.
    claimed = set(mapping.values())
    clashing = sorted(
        cls for cls in set(labels_pred)
        if cls >= 0 and cls not in mapping and cls in claimed
    )
    if clashing:
        next_id = int(max(max(labels_pred), max(claimed))) + 1
        for cls in clashing:
            mapping[cls] = next_id
            next_id += 1

    labels_pred = np.array([mapping.get(p, p) for p in labels_pred], dtype=np.int64)
    self.labels_pred = labels_pred
    self.train_pred = labels_pred[:self.train_sz]
    self.test_pred = labels_pred[self.train_sz:]

    return labels_pred
698 | # v-measure the harmonic mean of homogeneity and completeness called V-measure 2007 699 | 700 | acc = metrics.accuracy_score(labels_true, labels_pred, normalize=False) 701 | 702 | # mi = metrics.mutual_info_score(labels_true, labels_pred) 703 | # print("mutual_info_score: %f." % mi) 704 | 705 | nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred, average_method='arithmetic') 706 | # print("normalized_mutual_info_score: %f." % nmi) 707 | 708 | ami = adjusted_mutual_info_score(labels_true, labels_pred, average_method='arithmetic') 709 | # print("Adjusted_mutual_info_score: %f." % adj_nmi) 710 | 711 | homogeneity = metrics.homogeneity_score(labels_true, labels_pred) 712 | # print("homogeneity_score: %f." % homogeneity_score) 713 | 714 | completeness = metrics.completeness_score(labels_true, labels_pred) 715 | # print("completeness_score: %f." % completeness_score) 716 | 717 | f1_weight = metrics.f1_score(labels_true, labels_pred, average='weighted') 718 | # f1_micro = metrics.f1_score(labels_true, labels_pred, average='micro') 719 | # f1_macro = metrics.f1_score(labels_true, labels_pred, average='macro') 720 | # print("f1_score: %f." % f1_score) 721 | 722 | ari = adjusted_rand_score(labels_true, labels_pred) 723 | # print("adjusted_rand_score: %f." 
def preplot_Clusters(self, data_type=None):
    """Select the labels to plot, update the analyzer counters, compact cluster ids.

    Args:
        data_type: ``'ground'`` uses ``self.labels_truth``; anything else uses
            ``self.labels_pred``, sliced to the first ``self.train_sz``
            entries for ``'train'`` or to the remainder for ``'test'``.

    Side effects on ``self.analyzer``:
        * ``n_points``: ``size``, ``plot_size`` (train/test only), the two
          noise counters (labels ``-1`` and ``-2``) and the weak-point
          merge counters derived from the ``-2`` count.
        * ``n_clusters['detected']``: number of distinct non-negative labels.

    Returns:
        np.ndarray: the selected labels with every non-negative id replaced
        by its rank (0, 1, 2, ...) among the non-negative ids; negative ids
        are left unchanged. Always an ndarray, even when no cluster exists.
    """
    n_points = self.analyzer["n_points"]
    n_points["size"] = self.dp_count
    if data_type == 'test':
        n_points["plot_size"] = self.test_sz
    elif data_type == 'train':
        n_points["plot_size"] = self.train_sz

    if data_type == 'ground':
        labels = np.array(self.labels_truth, dtype=np.int64)
    else:
        labels = np.asarray(self.labels_pred)
        if data_type == 'train':
            labels = labels[:self.train_sz]
        elif data_type == 'test':
            labels = labels[self.train_sz:]

    noise_1 = int(np.count_nonzero(labels == -1))
    noise_2 = int(np.count_nonzero(labels == -2))
    n_points["noise"]["type-1"] = noise_1
    n_points["noise"]["type-2"] = noise_2
    n_points["weak"]["succeeded to merge"] = n_points["weak"]["all"] - noise_2
    n_points["weak"]["failed to merge"] = noise_2

    # np.unique returns sorted ids, so the running count of non-negative ids
    # is exactly the compact id each cluster should receive.
    unique_labels, inverse = np.unique(labels, return_inverse=True)
    is_cluster = unique_labels >= 0
    compact_ids = np.where(is_cluster, np.cumsum(is_cluster) - 1, unique_labels)
    labels = compact_ids[inverse.reshape(labels.shape)]

    self.analyzer["n_clusters"]["detected"] = int(np.count_nonzero(is_cluster))

    return labels
#plt.scatter(data2[self.test_sz:self.train_sz:].T[0], data2[self.train_sz:].T[1], c=colors2, **plot_kwds, marker='o' ) 834 | plt.scatter(data2.T[0], data2.T[1], c=colors2, **plot_kwds, marker='o') 835 | elif data_type == 'augmented': 836 | print ('3') 837 | plt.scatter(data2.T[0], data2.T[1], c=colors2, **plot_kwds, marker='o') 838 | elif data_type == 'ground': 839 | #plt.scatter(data2[:self.train_sz].T[0], data2[:self.train_sz].T[1], c=colors2, **plot_kwds, marker='o') 840 | plt.scatter(data2.T[0], data2.T[1], c=colors2, **plot_kwds, marker='o') 841 | """ 842 | 843 | else: 844 | if data_type == 'train': 845 | plt.scatter(self.data[:self.train_sz].T[0], self.data[:self.train_sz].T[1], c=colors, **plot_kwds, 846 | marker='o') 847 | elif data_type == 'test': 848 | plt.scatter(self.data[self.train_sz:].T[0], self.data[self.train_sz:].T[1], c=colors, **plot_kwds, 849 | marker='o') 850 | #elif data_type == 'augmented': 851 | # plt.scatter(self.data.T[0], self.data.T[1], c=colors, **plot_kwds, marker='o') 852 | elif data_type == 'ground': 853 | if self.data_indicator == 15: 854 | plt.scatter(self.data.T[0], self.data.T[1], c=colors, **plot_kwds, marker='o') 855 | else: 856 | plt.scatter(self.data[:self.train_sz].T[0], self.data[:self.train_sz].T[1], c=colors, 857 | **plot_kwds, marker='o') 858 | 859 | self.colors = colors 860 | frame = plt.gca() 861 | frame.axes.get_xaxis().set_visible(False) 862 | frame.axes.get_yaxis().set_visible(False) 863 | if show_plots: 864 | if self.prop_step: 865 | prop_folder = 'propagation' 866 | if not os.path.exists(prop_folder): 867 | os.mkdir(prop_folder) 868 | plt.savefig(prop_folder + '/' + str(self.prop_step) + '.png') 869 | plt.show() 870 | # plt.clf() # this is a must to clear figures if you plot continously 871 | 872 | return 0 873 | 874 | def generate_tsne(self, data, d, file_2d): 875 | 876 | dim_two = TSNE(n_components=d, random_state=1971, init='random').fit_transform(data) 877 | 878 | mystr = "" 879 | data_len = len(dim_two) 880 
def show_Analyzer(self, mydic=None, root="DenMune"):
    """Print a (possibly nested) dictionary of results as a tree.

    Args:
        mydic: dictionary to display; defaults to ``self.analyzer``.
        root: caption of the tree's root node.

    Dictionaries become branches labelled with their key; every other value
    becomes a ``"key: value"`` leaf, rounded to 3 decimals when the value
    supports ``round``. Nesting of any depth is rendered.

    Node identifiers are generated from a counter instead of reusing the
    dictionary keys, so the same key may appear in several branches (for
    example the same metric name under both 'train' and 'test') without an
    identifier clash. Identifiers are not part of the captions created here.

    Relies on ``tr``, the tree class available at module level, providing
    ``create_node(tag, identifier, parent=...)`` and ``show()``.
    """
    from itertools import count

    if mydic is None:
        mydic = self.analyzer

    tree = tr()
    tree.create_node(root, "root")
    node_ids = count()

    def add_branch(mapping, parent):
        # Recursively mirror ``mapping`` under the node ``parent``.
        for key, value in mapping.items():
            node_id = "node-" + str(next(node_ids))
            if isinstance(value, dict):
                tree.create_node(str(key), node_id, parent=parent)
                add_branch(value, node_id)
            else:
                try:
                    shown = round(value, 3)
                except TypeError:
                    # Non-numeric leaf (e.g. text): display it unchanged.
                    shown = value
                tree.create_node(str(key) + ': ' + str(shown), node_id, parent=parent)

    add_branch(mydic, "root")
    tree.show()
The algorithm relies on a single parameter K (the number of nearest neighbors). The results show the superiority of the algorithm. Enjoy the simplicity and the power of DenMune. 4 | 5 | ## Listen to this amazing interview podcast 6 | 7 | [![DenMune Podcast](https://raw.githubusercontent.com/egy1st/images/refs/heads/main/clustering/denmune-podcast.png)](https://on.soundcloud.com/z7WeqJnHjDd26hD76) 8 | 9 | *click image to listen (24 min)* 10 | 11 | ## Reproducibility & Test Drives 12 | 13 | Now you can reproduce all the research experiments, and even share the results and collaborate with the algorithm using our capsule on CodeOcean. Each Capsule is a self-contained computational experiment with computing environment, code, data, version history, and results. 14 | 15 | Also, you may use our repo2docker offered by mybinder.org, which encapsulates the algorithm and all required data in one virtual machine instance. All Jupyter notebook examples found in this repository will also be available for you to practice with in this repo2docker instance. Thanks mybinder.org, you made it possible! 
16 | 17 | | Test-drive | URL | 18 | | ---------------------------------------- | ------------------------------------------------------------ | 19 | | Reproduce our code capsule on Code Ocean | [![Open in Code Ocean](https://codeocean.com/codeocean-assets/badge/open-in-code-ocean.svg)](https://bit.ly/codeocean-capsule) | 20 | | Use our test-drive on MyBinder | [![Launch notebook examples in Binder](https://static.mybinder.org/badge_logo.svg)](https://bit.ly/mybinder-repo2docker) | 21 | 22 | ## Scientific Work 23 | 24 | | Paper & data | Journals | ResearchGate Stats | 25 | | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | 26 | | [![Elsevier, journal's article publisher](https://img.shields.io/badge/elsevier-published-orange)](https://bit.ly/denmune-research-paper)
[![Research datasets at Mendeley](https://img.shields.io/badge/mendeley-data-bluegreen)](https://bit.ly/mendeley-data) | [![scimagojr](https://www.scimagojr.com/journal_img.php?id=24823)](https://www.scimagojr.com/journalsearch.php?q=24823&tip=sid&clean=0) [![scimagojr](https://www.scimagojr.com/journal_img.php?id=21101060167)](https://www.scimagojr.com/journalsearch.php?q=21101060167&tip=sid&clean=0) | ![ResearchGate Stats](https://raw.githubusercontent.com/egy1st/images/main/clustering/researshgate.jpg) | 27 | 28 | ## Coding, Security & Maintenance 29 | 30 | | Code Style | Installation | CI Workflow | Code Coverage | Code Scanning | 31 | | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | 32 | | ![Code Style: Black](https://img.shields.io/badge/code%20style-black-black) | [![PyPI Version](https://img.shields.io/pypi/v/denmune.svg)](https://pypi.org/project/denmune/) | [![CircleCI, continuous integration](https://circleci.com/gh/egy1st/denmune-clustering-algorithm/tree/main.svg?style=shield)](https://circleci.com/gh/egy1st/denmune-clustering-algorithm/tree/main) | [![codecov](https://codecov.io/gh/egy1st/denmune-clustering-algorithm/branch/main/graph/badge.svg?token=QCbRdRtzYE)](https://codecov.io/gh/egy1st/denmune-clustering-algorithm) | [![CodeQL](https://github.com/adrinjalali/denmune-clustering-algorithm/actions/workflows/codeql.yml/badge.svg)](https://github.com/adrinjalali/denmune-clustering-algorithm/actions/workflows/codeql.yml) | 33 | 34 | ## Tutorials 35 | 36 | | Reproducible Capsule | Repo2Docker | Colab | Kaggle | 37 | | ------------------------------------------------------------ | ------------------------------------------------------------ | 
------------------------------------------------------------ | ------------------------------------------------------------ | 38 | | [![Open in Code Ocean](https://codeocean.com/codeocean-assets/badge/open-in-code-ocean.svg)](https://bit.ly/codeocean-capsule) | [![Launch notebook examples in Binder](https://static.mybinder.org/badge_logo.svg)](https://bit.ly/mybinder-repo2docker) | [![Launch notebook examples in Colaboratory, Google Research](https://colab.research.google.com/assets/colab-badge.svg)](#colab) | [![Launch notebook examples in Kaggle, the workspace where data scientist meet](https://kaggle.com/static/images/open-in-kaggle.svg)](#kaggle) | 39 | 40 | ## Downloads Stats 41 | 42 | | Download/Week | Download/Month | Total Downloads | 43 | | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | 44 | | [![Downloads](https://static.pepy.tech/personalized-badge/denmune?period=week&units=international_system&left_color=black&right_color=orange&left_text=Downloads)](https://pepy.tech/project/denmune) | [![Downloads](https://static.pepy.tech/personalized-badge/denmune?period=month&units=international_system&left_color=black&right_color=orange&left_text=Downloads)](https://pepy.tech/project/denmune) | [![Downloads](https://static.pepy.tech/personalized-badge/denmune?period=total&units=international_system&left_color=black&right_color=orange&left_text=Downloads)](https://pepy.tech/project/denmune) | 45 | 46 | ## Based on the paper 47 | 48 | | Paper | 49 | |- 50 | | Mohamed Abbas, Adel El-Zoghabi, Amin Shoukry, 51 | | *DenMune: Density peak based clustering using mutual nearest neighbors* 52 | | In: Journal of Pattern Recognition, Elsevier, 53 | | volume 109, number 107589, January 2021 54 | | DOI: https://doi.org/10.1016/j.patcog.2020.107589 55 | 56 | ## Documentation: 57 | 58 | - [![read the 
docs](https://img.shields.io/badge/read_the-docs-orange)](https://denmune.readthedocs.io/en/latest/?badge=latest) 59 | - [![Read my docs](https://img.shields.io/badge/read_my-docs-green)](https://denmune-docs.vercel.app) 60 | 61 | ## Watch it in action 62 | 63 | This 30 seconds will tell you how a density-based algorithm, DenMune propagates: 64 | 65 | [![interact with the propagation](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1o-tP3uvDGjxBOGYkir1lnbr74sZ06e0U?usp=sharing) 66 | 67 | [![Propagation in DenMune](https://raw.githubusercontent.com/egy1st/images/main/clustering/propagation.gif)]() 68 | 69 | ## Still interested? 70 | 71 | Watch this ***10-min*** illustrative video on: 72 | 73 | - [![watch on Vimeo](https://img.shields.io/badge/watch_on-Vimeo-green)](https://player.vimeo.com/video/827209757) 74 | - [![YouTube Video Views](https://img.shields.io/badge/watch_on-Youtube-red)](https://www.youtube.com/watch?v=o77raaasuOM) 75 | 76 | ## When less means more 77 | 78 | Most classic clustering algorithms fail to detect complex clusters where clusters are of different sizes, shapes, density, and exist in noisy data. Recently, a density-based algorithm named DenMune showed great ability in detecting complex shapes even in noisy data. it can detect a number of clusters automatically, detect both pre-identified-noise and post-identified-noise automatically, and remove them. 79 | 80 | It can achieve an accuracy reach 100% in some classic pattern problems, achieve 97% in the MNIST dataset. A great advantage of this algorithm is being a single-parameter algorithm. All you need is to set a number of k-nearest neighbors and the algorithm will care about the rest. Being Non-sensitive to changes in k, make it robust and stable. 
81 | 82 | Keep in mind, that the algorithm reduces any N-D dataset to only a 2-D dataset initially, so it is a good benefit of this algorithm is always to plot your data and explore it which makes this algorithm a good candidate for data exploration. Finally, the algorithm comes with a neat package for visualizing data, validating it, and analyzing the whole clustering process. 83 | 84 | ## How to install DenMune 85 | 86 | Simply install DenMune clustering algorithm using pip command from the official Python repository 87 | 88 | [![PyPI Version](https://img.shields.io/pypi/v/denmune.svg)](https://pypi.org/project/denmune/) 89 | 90 | From the shell run the command 91 | 92 | ```shell 93 | pip install denmune 94 | ``` 95 | 96 | From Jupyter notebook cell run the command 97 | 98 | ```ipython3 99 | !pip install denmune 100 | ``` 101 | 102 | ## How to use DenMune 103 | 104 | Once DenMune is installed, you just need to import it 105 | 106 | ```python 107 | from denmune import DenMune 108 | ``` 109 | 110 | *Please note that first denmune (the package) in small letters, while the other one(the class itself) has D and M in capital case.* 111 | 112 | ## Read data 113 | 114 | There are four possible cases of data: 115 | 116 | - only train data without labels 117 | - only labeled train data 118 | - labeled train data in addition to test data without labels 119 | - labeled train data in addition to labeled test data 120 | 121 | ```python 122 | #============================================= 123 | # First scenario: train data without labels 124 | # ============================================ 125 | 126 | data_path = 'datasets/denmune/chameleon/' 127 | dataset = "t7.10k.csv" 128 | data_file = data_path + dataset 129 | 130 | # train data without labels 131 | X_train = pd.read_csv(data_file, sep=',', header=None) 132 | 133 | knn = 39 # k-nearest neighbor, the only parameter required by the algorithm 134 | 135 | dm = DenMune(train_data=X_train, k_nearest=knn) 136 | labels, validity = 
dm.fit_predict(show_analyzer=False, show_noise=True) 137 | 138 | ``` 139 | 140 | This is an intuitive dataset which has no groundtruth provided 141 | 142 | ![t710](https://raw.githubusercontent.com/egy1st/images/main/clustering/t710.png) 143 | 144 | ```python 145 | #============================================= 146 | # Second scenario: train data with labels 147 | # ============================================ 148 | 149 | data_path = 'datasets/denmune/shapes/' 150 | dataset = "aggregation.csv" 151 | data_file = data_path + dataset 152 | 153 | # train data with labels 154 | X_train = pd.read_csv(data_file, sep=',', header=None) 155 | y_train = X_train.iloc[:, -1] 156 | X_train = X_train.drop(X_train.columns[-1], axis=1) 157 | 158 | knn = 6 # k-nearest neighbor, the only parameter required by the algorithm 159 | 160 | dm = DenMune(train_data=X_train, train_truth= y_train, k_nearest=knn) 161 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=True) 162 | ``` 163 | 164 | Datset groundtruth 165 | 166 | ![aggregation groundtruth](https://raw.githubusercontent.com/egy1st/images/main/clustering/aggregation_ground.png) 167 | 168 | Dataset as detected by DenMune at k=6 169 | 170 | ![aggregation train](https://raw.githubusercontent.com/egy1st/images/main/clustering/aggregation_6.png) 171 | 172 | 173 | ```python 174 | #================================================================= 175 | # Third scenario: train data with labels in addition to test data 176 | # =============================================================== 177 | 178 | data_path = 'datasets/denmune/pendigits/' 179 | file_2d = data_path + 'pendigits-2d.csv' 180 | 181 | # train data with labels 182 | X_train = pd.read_csv(data_path + 'train.csv', sep=',', header=None) 183 | y_train = X_train.iloc[:, -1] 184 | X_train = X_train.drop(X_train.columns[-1], axis=1) 185 | 186 | # test data without labels 187 | X_test = pd.read_csv(data_path + 'test.csv', sep=',', header=None) 188 | X_test = 
X_test.drop(X_test.columns[-1], axis=1) 189 | 190 | knn = 50 # k-nearest neighbor, the only parameter required by the algorithm 191 | 192 | dm = DenMune(train_data=X_train, train_truth= y_train, 193 | test_data= X_test, 194 | k_nearest=knn) 195 | labels, validity = dm.fit_predict(show_analyzer=True, show_noise=True) 196 | ``` 197 | 198 | dataset groundtruth 199 | 200 | ![pendigits groundtruth](https://raw.githubusercontent.com/egy1st/images/main/clustering/pendigits_ground.png) 201 | 202 | 203 | dataset as detected by DenMune at k=50 204 | 205 | ![pendigits train](https://raw.githubusercontent.com/egy1st/images/main/clustering/pendigits_50.png) 206 | 207 | test data as predicted by DenMune on training the dataset at k=50 208 | 209 | ![pendigits test](https://raw.githubusercontent.com/egy1st/images/main/clustering/pendigits_test_50.png) 210 | 211 | 212 | ## Algorithm's Parameters 213 | 214 | 1. **Parameters used within the initialization of the DenMune class** 215 | 216 | ```python 217 | def __init__ (self, 218 | train_data=None, test_data=None, 219 | train_truth=None, test_truth=None, 220 | file_2d =None, k_nearest=1, 221 | rgn_tsne=False, prop_step=0, 222 | ): 223 | ``` 224 | 225 | - train_data: 226 | 227 | - data used for training the algorithm 228 | - default: None. It should be provided by the use, otherwise an error will raise. 229 | 230 | - train_truth: 231 | 232 | - labels of training data 233 | - default: None 234 | 235 | - test_data: 236 | 237 | - data used for testing the algorithm 238 | 239 | - test_truth: 240 | 241 | - labels of testing data 242 | - default: None 243 | 244 | - k_nearest: 245 | 246 | - number of nearest neighbor 247 | - default: 1. k-nearest neighbor should be at least 1. 248 | 249 | - rgn_tsn: 250 | 251 | - when set to True: It will regenerate the reduced 2-D version of the N-D dataset each time the algorithm run. 
252 | - when set to False: It will generate the reduced 2-D version of the N-D dataset the first time only, then reuse the existing saved file 253 | - default: False 254 | 255 | - file_2d: name (including location) of the file used to save/load the reduced 2-d version 256 | 257 | - if empty: the algorithm will create a temporary file named '_temp_2d' 258 | - default: None 259 | 260 | - prop_step: 261 | 262 | - size of increment used in showing the clustering propagation. 263 | - leave this parameter set to 0, the default value, unless you intentionally want to enter the propagation mode. 264 | - default: 0 265 | 266 | 267 | 2. **Parameters used within the fit_predict function:** 268 | 269 | ```python 270 | def fit_predict(self, 271 | validate=True, 272 | show_plots=True, 273 | show_noise=True, 274 | show_analyzer=True 275 | ): 276 | ``` 277 | 278 | - validate: 279 | - validate data on/off according to five measures integrated with DenMune (Accuracy, F1-score, NMI index, AMI index, ARI index) 280 | - default: True 281 | 282 | - show_plots: 283 | - show/hide plotting of data 284 | - default: True 285 | 286 | - show_noise: 287 | - show/hide noise and outliers 288 | - default: True 289 | 290 | - show_analyzer: 291 | - show/hide the analyzer 292 | - default: True 293 | 294 | ## The Analyzer 295 | 296 | The algorithm provides an exploratory tool called the analyzer; once called, it will provide you with an in-depth analysis of how your clustering results perform. 297 | 298 | ![DenMune Analyzer](https://raw.githubusercontent.com/egy1st/images/main/clustering/analyzer.png) 299 | 300 | 301 | ## Noise Detection 302 | 303 | DenMune detects noise and outliers automatically; no further work is needed from your side. 304 | 305 | - It plots pre-identified noise in black 306 | - It plots post-identified noise in light grey 307 | 308 | You can hide noise by setting the show_noise parameter to False. 
309 | 310 | ```python 311 | # let us show noise 312 | 313 | dm = DenMune(train_data=X_train, k_nearest=knn) 314 | labels, validity = dm.fit_predict(show_noise=True) 315 | ``` 316 | 317 | ```python 318 | # let us show clean data by removing noise 319 | 320 | dm = DenMune(train_data=X_train, k_nearest=knn) 321 | labels, validity = dm.fit_predict(show_noise=False) 322 | ``` 323 | 324 | | noisy data | clean data | 325 | | ------------------------------------------------------------ | ------------------------------------------------------------ | 326 | | ![noisy data](https://raw.githubusercontent.com/egy1st/images/main/clustering/noisy_data.png) | ![clean data](https://raw.githubusercontent.com/egy1st/images/main/clustering/clean_data.png) | 327 | 328 | 329 | 330 | ## Validation 331 | 332 | You can get your validation results using 3 methods 333 | 334 | - by showing the Analyzer 335 | - by extracting values from the validity dictionary returned by the fit_predict function 336 | - by extracting values from the Analyzer dictionary 337 | - There are five validity measures built into the algorithm, which are: 338 | - ACC, Accuracy 339 | - F1 score 340 | - NMI index (Normalized Mutual Information) 341 | - AMI index (Adjusted Mutual Information) 342 | - ARI index (Adjusted Rand Index) 343 | 344 | ![Validation snapshot](https://raw.githubusercontent.com/egy1st/images/main/clustering/validation.png) 345 | 346 | ## K-nearest Evolution 347 | 348 | The following chart shows the evolution of pre- and post-identified noise as the number of nearest neighbors (knn) increases. The detected number of clusters is also analyzed in the same chart in relation to both types of identified noise. 
349 | 350 | ![knn evolution chart](https://raw.githubusercontent.com/egy1st/images/main/clustering/knn_vs_noise.png) 351 | 352 | ## The Scalability 353 | 354 | | Data Size | Time | 355 | | ----------------- | ---------------------- | 356 | | data size: 5000 | time: 2.3139 seconds | 357 | | data size: 10000 | time: 5.8752 seconds | 358 | | data size: 15000 | time: 12.4535 seconds | 359 | | data size: 20000 | time: 18.8466 seconds | 360 | | data size: 25000 | time: 28.992 seconds | 361 | | data size: 30000 | time: 39.3166 seconds | 362 | | data size: 35000 | time: 39.4842 seconds | 363 | | data size: 40000 | time: 63.7649 seconds | 364 | | data size: 45000 | time: 73.6828 seconds | 365 | | data size: 50000 | time: 86.9194 seconds | 366 | | data size: 55000 | time: 90.1077 seconds | 367 | | data size: 60000 | time: 125.0228 seconds | 368 | | data size: 65000 | time: 149.1858 seconds | 369 | | data size: 70000 | time: 177.4184 seconds | 370 | | data size: 75000 | time: 204.0712 seconds | 371 | | data size: 80000 | time: 220.502 seconds | 372 | | data size: 85000 | time: 251.7625 seconds | 373 | | data size: 100000 | time: 257.563 seconds | 374 | 375 | |![noisy data chart](https://raw.githubusercontent.com/egy1st/images/main/clustering/scalability.png) 376 | 377 | 378 | ## The Stability 379 | 380 | The algorithm is only single-parameter, even more it not sensitive to changes in that parameter, k. You may guess that from the following chart yourself. This is of great benefit for you as a data exploration analyst. You can simply explore the dataset using an arbitrary k. Being Non-sensitive to changes in k, make it robust and stable. 381 | 382 | ![DenMune Stability chart](https://raw.githubusercontent.com/egy1st/images/main/clustering/stability.png) 383 | 384 | ## Reveal the propagation 385 | 386 | One of the top performing features in this algorithm is enabling you to watch how your clusters propagate to construct the final output clusters. 
Just use the parameter 'prop_step' as in the following example: 387 | 388 | ```python 389 | dataset = "t7.10k" # 390 | data_path = 'datasets/denmune/chameleon/' 391 | 392 | # train file 393 | data_file = data_path + dataset +'.csv' 394 | X_train = pd.read_csv(data_file, sep=',', header=None) 395 | 396 | 397 | from itertools import chain 398 | 399 | # DenMune's parameters 400 | knn = 39 # number of k-nearest neighbors, the only parameter required by the algorithm 401 | 402 | # create a list of different snapshots of the propagation 403 | snapshots = chain(range(2,5), range(5,50,10), range(50, 100, 25), range(100,500,100), range(500,2000, 250), range(1000,5500, 500)) 404 | 405 | from IPython.display import clear_output 406 | for snapshot in snapshots: 407 | print ("iteration", snapshot ) 408 | clear_output(wait=True) 409 | dm = DenMune(train_data=X_train, k_nearest=knn, rgn_tsne=False, prop_step=snapshot) 410 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=False) 411 | ``` 412 | 413 | ## Interact with the algorithm 414 | 415 | [![chameleon datasets](https://raw.githubusercontent.com/scikit-learn-contrib/denmune-clustering-algorithm/main/images/denmune_propagation.png)](https://colab.research.google.com/drive/1EUROd6TRwxW3A_XD3KTxL8miL2ias4Ue?usp=sharing) 416 | 417 | *click image to interact* 418 | 419 | 420 | This notebook allows you to interact with the algorithm in many aspects: 421 | 422 | - you can choose which dataset to cluster (among 4 chameleon datasets) 423 | - you can decide how many k-nearest neighbors to use 424 | - show noise on/off; thus you can investigate noise detected by the algorithm 425 | - show analyzer on/off 426 | 427 | 428 | ## We love Jupyter Notebooks 429 | 430 | Need to test the examples one by one? Here are two other options: 431 | 432 | - Use colab offered by google research to test each example individually. 
433 | - If you are a kaggler like me, then Kaggle, the best workspace where data scientist meet, should fit you to test the algorithm with great experience. 434 | 435 | 436 | Here is a list of Google CoLab & Kaggle notebooks to practice the use of the algorithm interactively. 437 | 438 | 439 | | Dataset | CoLab Notebook | Kaggle Notebook | 440 | | ---------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | 441 | | How to use it? | [![How to use it - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-how-to-use) | [![When less means more - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-how-to-use) | 442 | | Chameleon datasets | [![Chameleon datasets - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-chameleon) | [![Non-groundtruth datasets](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-chameleon) | 443 | | 2D Shape datasets | [![2D Shape datasets - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-2d-shapes) | [![2D Shape datasets - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-2d-shapes) | 444 | | Clustering unlabeled data | [![Non-groundtruth datasets - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-unlabeled-data) | [![Non-groundtruth datasets](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-chameleon) | 445 | | iris dataset | [![iris dataset - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-iris-dataset) | [![iris dataset - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-iris-dataset) | 446 | | MNIST dataset | [![MNIST dataset - 
colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-mnist-dataset) | [![MNIST dataset - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-score-97-mnist) | 447 | | Scoring 97% on MNIST dataset | [![Get 97% by training MNIST dataset - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-score-97-mnist) | [![Training MNIST to get 97%](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-score-97-mnist) | 448 | | Noise detection | [![Noise detection - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-noise-detection) | [![Noise detection - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-noise-detection) | 449 | | Validation | [![Validation - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-how-to-validate) | [![Validation - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-how-to-validate) | 450 | | How does it propagate? | [![How it propagates - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-how-propagate) | [![The beauty of propagation - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-how-propagate)
[![The beauty of propagation part 2 - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-how-propagate-2) | 451 | | Snapshots of propagation | [![snapshots of the propagation - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-propagation-shots) | [![The beauty of propagation - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-propagation-shots) | 452 | | Scalability | [![Scalability - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-scalability) | [![Scalability - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-scalability) | 453 | | Stability | [![Stability vs number of nearest neighbors - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-stability) | [![Stability - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-stability) | 454 | | k-nearest-evolution | [![k-nearest-evolution - colab](https://colab.research.google.com/assets/colab-badge.svg)](https://bit.ly/colab-knn-evolution) | [![k-nearest-evolution - kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://bit.ly/kaggle-knn-evolution) | 455 | 456 | 457 | 458 | ## Software Impact 459 | 460 | Discover robust clustering without density cutoffs using this open-source Python library pyMune, implementing the parameter-free DenMune algorithm. PyMune identifies and expands cluster cores while removing noise. Fully scikit-learn compatible. pyMune (DenMune implementation) is a cutting-edge tool incorporating advanced techniques, robust performance, and effective propagation strategies. This positions it as the current state-of-the-art in its field, contributing to its high adoption and impact. 
461 | 462 | - After extensive research and rigorous validation, we are proud to release pyMune as an open-source tool on GitHub and PyPi for the benefit of the scientific community. 463 | - With over 230,000 downloads already, pyMune has demonstrated its real-world impact and usefulness. We integrated it with [![Open in Code Ocean](https://codeocean.com/codeocean-assets/badge/open-in-code-ocean.svg)](https://bit.ly/codeocean-capsule) and [![Launch notebook examples in Binder](https://static.mybinder.org/badge_logo.svg)](https://bit.ly/mybinder-repo2docker) to further enhance reproducibility and reuse - encapsulating code, data, and outputs for turnkey sharing. 464 | - It is part of a special issue of R-badged articles, https://www.sciencedirect.com/journal/software-impacts/special-issue/10XXN6LQ0J1 465 | - it is part of Scikit-learn-contrib , https://github.com/scikit-learn-contrib 466 | 467 | ![Software Impact](https://github.com/egy1st/images/blob/main/clustering/software-impacts.jpg?raw=true) 468 | 469 | ### Warning: Plagiarized Works 470 | 471 | It has come to our attention that the following papers have plagiarized significant portions of the DenMune algorithm and research work: 472 | 473 | 1. **Paper 1:** "DEDIC: Density Estimation Clustering Method Using Directly Interconnected Cores" published in IEEE Access, doi: 10.1109/ACCESS.2022.3229582 Authors: Yisen Lin, Xinlun Zhang, Lei Liu, and Huichen Qu, reported at https://pubpeer.com/publications/AFC4E173A4FC0A2AD7E70DE688DDA5 474 | 2. 
**Paper 2:** "Research on stress curve clustering algorithm of Fiber Bragg grating sensor" published in Nature Scientific Reports, doi: 10.1038/s41598-023-39058-w Authors: Yisen Lin, Ye Wang, Huichen Qu  & Yiwen Xiong, reported at https://pubpeer.com/publications/7AEF7D0F7505A8B8C130D142522741 475 | 476 | We have conducted a thorough analysis and found extensive evidence of plagiarism in these papers, including: 477 | 478 | - Verbatim copying of the core algorithm logic and steps from DenMune, with only superficial naming and implementation differences intended to obfuscate the similarity. 479 | - Plagiarized background, related work, and technical details from the original DenMune paper, with minor paraphrasing and without proper attribution. 480 | - Copying of mathematical formulations, concepts, and point classifications from DenMune. 481 | - Reuse of experimental setup, datasets, and compared algorithms from DenMune without justification or acknowledgment. 482 | - Fabricated experimental results, with values directly copied from DenMune's results and falsely claimed as their own. 483 | - Lack of substantive analysis or discussion, further indicating that the experiments were likely not conducted. 484 | 485 | Despite our efforts to address these concerns through proper channels, the publishers have decided to allow these plagiarized papers to remain published with only a correction acknowledging the issues, rather than retracting them or mandating a comprehensive correction. 486 | 487 | We strongly condemn such academic misconduct and the potential enabling of plagiarism by reputable publishers. Researchers and practitioners should exercise caution when referring to or using the methods described in these plagiarized works. 488 | 489 | For the original, properly cited implementation of the DenMune clustering algorithm, please refer to the official repository and resources provided here. 
490 | 491 | We remain committed to upholding academic integrity and ethical research practices, and we urge the scientific community to take a firm stance against plagiarism and misconduct in scholarly publications. 492 | 493 | 494 | 495 | ## How to cite 496 | 497 | - How to cite ***The paper*** 498 | 499 | If you have used this codebase in a scientific publication and wish to cite it, please use the [Journal of Pattern Recognition article](https://www.sciencedirect.com/science/article/abs/pii/S0031320320303927): 500 | 501 | ``` 502 | Mohamed Abbas, Adel El-Zoghabi, Amin Shoukry, *DenMune: Density peak-based clustering using mutual nearest neighbors* 503 | In: Journal of Pattern Recognition, Elsevier, volume 109, number 107589. 504 | January 2021 505 | ``` 506 | 507 | ```bib 508 | @article{ABBAS2021107589, 509 | title = {DenMune: Density peak-based clustering using mutual nearest neighbors}, 510 | journal = {Pattern Recognition}, 511 | volume = {109}, 512 | pages = {107589}, 513 | year = {2021}, 514 | issn = {0031-3203}, 515 | doi = {https://doi.org/10.1016/j.patcog.2020.107589}, 516 | url = {https://www.sciencedirect.com/science/article/pii/S0031320320303927}, 517 | author = {Mohamed Abbas and Adel El-Zoghabi and Amin Shoukry}, 518 | keywords = {Clustering, Mutual neighbors, Dimensionality reduction, Arbitrary shapes, Pattern recognition, Nearest neighbors, Density peak}, 519 | abstract = {Many clustering algorithms fail when clusters are of arbitrary shapes, of varying densities, or the data classes are unbalanced and close to each other, even in two dimensions. A novel clustering algorithm, “DenMune” is presented to meet this challenge. It is based on identifying dense regions using mutual nearest neighborhoods of size K, where K is the only parameter required from the user, besides obeying the mutual nearest neighbor consistency principle. The algorithm is stable for a wide range of values of K.
Moreover, it is able to automatically detect and remove noise from the clustering process as well as detect the target clusters. It produces robust results on various low and high-dimensional datasets relative to several known state-of-the-art clustering algorithms.} 520 | } 521 | ``` 522 | 523 | 524 | 525 | 526 | 527 | - How to cite ***The Software*** 528 | If you have used this codebase in a scientific publication and wish to cite it, please use the [Journal of Software Impacts article](https://www.sciencedirect.com/science/article/pii/S266596382300101X): 529 | 530 | ``` 531 | Abbas, M. A., El-Zoghabi, A., & Shoukry, A. (2023). PyMune: A Python package for complex clusters detection. Software Impacts, 17, 100564. https://doi.org/10.1016/j.simpa.2023.100564 532 | ``` 533 | 534 | ```bib 535 | @article{ABBAS2023100564, 536 | title = {pyMune: A Python package for complex clusters detection}, 537 | journal = {Software Impacts}, 538 | volume = {17}, 539 | pages = {100564}, 540 | year = {2023}, 541 | issn = {2665-9638}, 542 | doi = {https://doi.org/10.1016/j.simpa.2023.100564}, 543 | url = {https://www.sciencedirect.com/science/article/pii/S266596382300101X}, 544 | author = {Mohamed Ali Abbas and Adel El-Zoghabi and Amin Shoukry}, 545 | keywords = {Machine learning, Pattern recognition, Dimensionality reduction, Mutual nearest neighbors, Nearest neighbors approximation, DenMune}, 546 | abstract = {We introduce pyMune, an open-source Python library for robust clustering of complex real-world datasets without density cutoff parameters. It implements DenMune (Abbas et al., 2021), a mutual nearest neighbor algorithm that uses dimensionality reduction and approximate nearest neighbor search to identify and expand cluster cores. Noise is removed with a mutual nearest-neighbor voting system. In addition to clustering, pyMune provides classification, visualization, and validation functionalities. 
It is fully compatible with scikit-learn and has been accepted into the scikit-learn-contrib repository. The code, documentation, and demos are available on GitHub, PyPi, and CodeOcean for easy use and reproducibility.} 547 | } 548 | ``` 549 | 550 | ## Licensing 551 | 552 | The DenMune algorithm is 3-clause BSD licensed. Enjoy. 553 | 554 | [![BSD 3-Clause “New” or “Revised” License](https://img.shields.io/badge/license-BSD-green)](https://choosealicense.com/licenses/bsd-3-clause/) 555 | 556 | ## Task List 557 | 558 | - [x] Update Github with the DenMune source code 559 | - [x] create repo2docker repository 560 | - [x] Create pip Package 561 | - [x] create CoLab shared examples 562 | - [x] create documentation 563 | - [x] create Kaggle shared examples 564 | - [x] PEP8 compliant 565 | - [x] Continuous integration 566 | - [x] scikit-learn compatible 567 | - [x] creating unit tests (coverage: 100%) 568 | - [x] generating API documentation 569 | - [x] Create a reproducible capsule on code ocean 570 | - [x] Submitting pyMune to Software Impacts (Published August 5, 2023) 571 | - [ ] create conda package (*postponed until NGT has conda installation*) 572 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: yes 3 | 4 | coverage: 5 | precision: 2 6 | round: down 7 | range: "70...100" 8 | 9 | status: 10 | project: 11 | default: false # disable the default status that measures entire project 12 | tests: # declare a new status context "tests" 13 | target: 100% # we always want 100% coverage here 14 | paths: "tests/" # only include coverage in "tests/" folder 15 | jupyter: # declare a new status context "jupyter" 16 | paths: "!tests/" # exclude all files in "tests/" 17 | 18 | if_ci_failed: error #success, failure, error, ignore 19 | informational: true 20 | 21 | parsers: 22 | gcov: 23 | branch_detection: 24 | conditional:
yes 25 | loop: yes 26 | method: no 27 | macro: no 28 | 29 | comment: 30 | layout: "reach,diff,flags,files,footer" 31 | behavior: default 32 | require_changes: no 33 | 34 | -------------------------------------------------------------------------------- /colab/iris_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": { 17 | "id": "zaaLaJHT35Fd" 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "import time\n", 24 | "import os.path\n", 25 | "\n", 26 | "import warnings\n", 27 | "warnings.filterwarnings('ignore')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "scrolled": true, 35 | "id": "69XXeoif35Fn" 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "# install DenMune clustering algorithm using pip command from the offecial Python repository, PyPi\n", 40 | "# from https://pypi.org/project/denmune/\n", 41 | "!pip install denmune\n", 42 | "\n", 43 | "# then import it\n", 44 | "from denmune import DenMune" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "id": "H3H8DYwU35Fo", 52 | "colab": { 53 | "base_uri": "https://localhost:8080/" 54 | }, 55 | "outputId": "fee68095-9fd7-456d-f288-9140ceef8ea0" 56 | }, 57 | "outputs": [ 58 | { 59 | "output_type": "stream", 60 | "name": "stdout", 61 | "text": [ 62 | "Cloning into 'datasets'...\n", 63 | "remote: Enumerating objects: 57, done.\u001b[K\n", 64 | "remote: Counting objects: 100% (57/57), done.\u001b[K\n", 65 | "remote: Compressing objects: 100% (46/46), done.\u001b[K\n", 66 | "remote: Total 57 (delta 9), reused 54 (delta 9), pack-reused 0\u001b[K\n", 67 | 
"Unpacking objects: 100% (57/57), done.\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "# clone datasets from our repository datasets\n", 73 | "if not os.path.exists('datasets'):\n", 74 | " !git clone https://github.com/egy1st/datasets" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "colab": { 82 | "base_uri": "https://localhost:8080/", 83 | "height": 1000 84 | }, 85 | "id": "xm54UWO835Fq", 86 | "outputId": "195b4bb6-b755-467c-82c2-c099f2a9445e" 87 | }, 88 | "outputs": [ 89 | { 90 | "output_type": "stream", 91 | "name": "stdout", 92 | "text": [ 93 | "Plotting dataset Groundtruth\n" 94 | ] 95 | }, 96 | { 97 | "output_type": "display_data", 98 | "data": { 99 | "image/png": "\n", 100 | "text/plain": [ 101 | "
" 102 | ] 103 | }, 104 | "metadata": { 105 | "needs_background": "light" 106 | } 107 | }, 108 | { 109 | "output_type": "stream", 110 | "name": "stdout", 111 | "text": [ 112 | "Plotting train data\n" 113 | ] 114 | }, 115 | { 116 | "output_type": "display_data", 117 | "data": { 118 | "image/png": "\n", 119 | "text/plain": [ 120 | "
" 121 | ] 122 | }, 123 | "metadata": { 124 | "needs_background": "light" 125 | } 126 | }, 127 | { 128 | "output_type": "stream", 129 | "name": "stdout", 130 | "text": [ 131 | "Validating train data\n", 132 | "├── exec_time\n", 133 | "│ ├── DenMune: 0.019\n", 134 | "│ ├── NGT: 0.002\n", 135 | "│ └── t_SNE: 0.85\n", 136 | "├── n_clusters\n", 137 | "│ ├── actual: 3\n", 138 | "│ └── detected: 3\n", 139 | "├── n_points\n", 140 | "│ ├── dim: 4\n", 141 | "│ ├── noise\n", 142 | "│ │ ├── type-1: 0\n", 143 | "│ │ └── type-2: 0\n", 144 | "│ ├── plot_size: 150\n", 145 | "│ ├── size: 150\n", 146 | "│ ├── strong: 84\n", 147 | "│ └── weak\n", 148 | "│ ├── all: 66\n", 149 | "│ ├── failed to merge: 0\n", 150 | "│ └── succeeded to merge: 66\n", 151 | "└── validity\n", 152 | " └── train\n", 153 | " ├── ACC: 135\n", 154 | " ├── AMI: 0.795\n", 155 | " ├── ARI: 0.746\n", 156 | " ├── F1: 0.898\n", 157 | " ├── NMI: 0.798\n", 158 | " ├── completeness: 0.809\n", 159 | " └── homogeneity: 0.787\n", 160 | "\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "data_path = 'datasets/denmune/uci/' \n", 166 | "dataset='iris' \n", 167 | "data_file = data_path + dataset + '.csv'\n", 168 | "\n", 169 | "X_train = pd.read_csv(data_file, sep=',', header=None)\n", 170 | "y_train = X_train.iloc[:, -1]\n", 171 | "X_train = X_train.drop(X_train.columns[-1], axis=1) \n", 172 | "\n", 173 | "knn = 11 # k-nearest neighbor, the only parameter required by the algorithm\n", 174 | "dm = DenMune(train_data=X_train,\n", 175 | " train_truth=y_train,\n", 176 | " k_nearest=knn,\n", 177 | " rgn_tsne=False)\n", 178 | "\n", 179 | "labels, validity = dm.fit_predict(show_noise=True, show_analyzer=True)\n" 180 | ] 181 | } 182 | ], 183 | "metadata": { 184 | "kernelspec": { 185 | "display_name": "Python 3", 186 | "language": "python", 187 | "name": "python3" 188 | }, 189 | "language_info": { 190 | "codemirror_mode": { 191 | "name": "ipython", 192 | "version": 3 193 | }, 194 | "file_extension": ".py", 195 | "mimetype": 
"text/x-python", 196 | "name": "python", 197 | "nbconvert_exporter": "python", 198 | "pygments_lexer": "ipython3", 199 | "version": "3.7.3" 200 | }, 201 | "colab": { 202 | "name": "iris_dataset.ipynb", 203 | "provenance": [], 204 | "collapsed_sections": [], 205 | "include_colab_link": true 206 | } 207 | }, 208 | "nbformat": 4, 209 | "nbformat_minor": 0 210 | } -------------------------------------------------------------------------------- /images/denmune-illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/denmune-clustering-algorithm/a023e9283d7ea11af2d3e6dadae1c54e3b90528c/images/denmune-illustration.png -------------------------------------------------------------------------------- /images/denmune_propagation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/denmune-clustering-algorithm/a023e9283d7ea11af2d3e6dadae1c54e3b90528c/images/denmune_propagation.png -------------------------------------------------------------------------------- /kaggle/the-beauty-of-clusters-propagation.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"source":"\"Kaggle\"","metadata":{},"cell_type":"markdown","outputs":[],"execution_count":0},{"cell_type":"markdown","id":"e7d9b26d","metadata":{"papermill":{"duration":0.004085,"end_time":"2022-01-28T16:59:53.754281","exception":false,"start_time":"2022-01-28T16:59:53.750196","status":"completed"},"tags":[]},"source":["##### Have you ever wondered how a cluster propgate. It is time to reveal the beuty of clusters propgation. 
It as simple as\n","- running the next cell,\n","- wait,\n","- watch and\n","- ENJOY."]},{"cell_type":"code","execution_count":1,"id":"7d1fe6fe","metadata":{"execution":{"iopub.execute_input":"2022-01-28T16:59:53.765377Z","iopub.status.busy":"2022-01-28T16:59:53.764201Z","iopub.status.idle":"2022-01-28T17:03:36.326661Z","shell.execute_reply":"2022-01-28T17:03:36.325883Z","shell.execute_reply.started":"2022-01-24T22:08:58.752738Z"},"id":"FZgP6jwmzFtZ","papermill":{"duration":222.569057,"end_time":"2022-01-28T17:03:36.326871","exception":false,"start_time":"2022-01-28T16:59:53.757814","status":"completed"},"tags":[]},"outputs":[{"data":{"image/png":"\n","text/plain":["
"]},"metadata":{"needs_background":"light"},"output_type":"display_data"}],"source":["import pandas as pd\n","import time\n","import os.path\n","import warnings\n","warnings.filterwarnings('ignore')\n","\n","# install DenMune clustering algorithm using pip from https://pypi.org/project/denmune/\n","!pip install denmune\n","# now import it\n","from denmune import DenMune\n","\n","#let us create data folder to hold our data\n","if not os.path.exists('data'):\n"," os.makedirs('data')\n","data_path = 'data/' \n","\n","# download datasets and extract them to our data folder\n","if not os.path.exists(\"chameleon-data.zip\"):\n"," !wget https://data.zerobytes.one/clustering/chameleon-data.zip\n"," !unzip -o chameleon-data.zip -d data \n","\n","#@title { run: \"auto\", vertical-output: true, form-width: \"50%\" }\n","chameleon_dataset = \"t7.10k.dat\" #@param [\"t4.8k.dat\", \"t5.8k.dat\", \"t7.10k.dat\", \"t8.8k.dat\"]\n","show_noize_checkbox = True #@param {type:\"boolean\"}\n","data_path = 'data/' \n","\n","# train file\n","data_file = data_path + chameleon_dataset\n","X_train = pd.read_csv(data_file, sep=',', header=None)\n","\n","\n","# Denmune's Paramaters\n","verpose_mode = True # view in-depth analysis of time complexity and outlier detection, num of clusters\n","show_plot = True # show plots on/off\n","show_noise = True # show noise and outlier on/off\n","\n","knn = 39\n","from IPython.display import clear_output\n","for x in range (250, 5500, 250 ):\n"," print (\"itration\", x )\n"," clear_output(wait=True)\n"," dm = DenMune(train_data=X_train, k_nearest=knn, rgn_tsne=False, prop_step=x)\n"," labels, validity = dm.fit_predict(show_analyzer=False, show_noise=show_noize_checkbox)\n"," #time.sleep(0.2)\n"," \n"]}],"metadata":{"kernelspec":{"display_name":"Python 
3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.12"},"papermill":{"default_parameters":{},"duration":234.174038,"end_time":"2022-01-28T17:03:37.445211","environment_variables":{},"exception":null,"input_path":"__notebook__.ipynb","output_path":"__notebook__.ipynb","parameters":{},"start_time":"2022-01-28T16:59:43.271173","version":"2.3.3"}},"nbformat":4,"nbformat_minor":5} -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ngt>=2.0.4 2 | numpy>=1.23.5 3 | pandas>=1.5.3 4 | matplotlib>=3.7.2 5 | scikit-learn>=1.2.2 6 | seaborn>=0.12.2 7 | anytree>=2.8 8 | treelib>=1.6.1 9 | pytest>=6.2.5 10 | coverage>=6.3.1 11 | treon 12 | testbook 13 | notebook 14 | 15 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | from .denmune import DenMune 2 | 3 | __all__ = ["DenMune"] 4 | -------------------------------------------------------------------------------- /src/tests/test_denmune.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from itertools import chain 3 | import pandas as pd 4 | import pytest 5 | from sklearn.datasets import make_blobs 6 | from sklearn.datasets import load_iris 7 | from src.denmune import DenMune 8 | 9 | 10 | # test DenMune's results 11 | X_cc, y_cc = make_blobs( 12 | n_samples=1000, 13 | centers=np.array([[-1, -1], [1, 1]]), 14 | random_state=0, 15 | shuffle=False, 16 | cluster_std=0.5, 17 | ) 18 | 19 | knn = 10 20 | 21 | def test_DenMune_results(): 22 | dm = DenMune(train_data=X_cc, train_truth=y_cc, k_nearest=knn) 23 | labels, 
validity = dm.fit_predict(show_analyzer=False) 24 | # This test use data that are not perfectly separable so the 25 | # accuracy is not 1. Accuracy around 0.90 26 | assert (np.mean(dm.labels_pred == y_cc) > 0.90) or (1 - np.mean(dm.labels_pred == y_cc) > 0.90) 27 | 28 | 29 | @pytest.mark.parametrize("train_data", [None, X_cc[:800] ]) 30 | @pytest.mark.parametrize("train_truth", [None, y_cc[:800] ]) 31 | @pytest.mark.parametrize("test_data", [None, X_cc[800:] ]) 32 | @pytest.mark.parametrize("test_truth", [None, y_cc[800:] ]) 33 | @pytest.mark.parametrize("validate", [True, False]) 34 | @pytest.mark.parametrize("show_plots", [True, False]) 35 | @pytest.mark.parametrize("show_noise", [True, False]) 36 | @pytest.mark.parametrize("show_analyzer", [True, False]) 37 | @pytest.mark.parametrize("prop_step", [0, 600]) 38 | 39 | # all possible combinations will be tested over all parameters. Actually, 257 tests will be covered 40 | def test_parameters(train_data, train_truth, test_data, test_truth, validate, show_plots, show_noise, show_analyzer, prop_step): 41 | if not (train_data is None): 42 | if not (train_data is not None and train_truth is None and test_truth is not None): 43 | if not (train_data is not None and test_data is not None and train_truth is None): 44 | if not (train_data is not None and train_truth is not None and test_truth is not None and test_data is None): 45 | dm = DenMune(train_data=train_data, train_truth=train_truth, test_data=test_data, test_truth=test_truth, k_nearest=10,prop_step=prop_step) 46 | labels, validity = dm.fit_predict(validate=validate, show_plots=show_plots, show_noise=show_noise, show_analyzer=show_analyzer) 47 | # This test use data that are not perfectly separable so the 48 | # accuracy is not 1. 
Accuracy around 0.70 49 | assert ( np.mean(labels == y_cc) > 0.70 or (1 - np.mean( labels == y_cc) > 0.70) ) 50 | 51 | 52 | def test_DenMune_propagation(): 53 | snapshots = chain([0], range(2,5), range(5,50,5), range(50, 100, 10), range(100,500,50), range(500,1100, 100)) 54 | for snapshot in snapshots: 55 | dm = DenMune(train_data=X_cc, k_nearest=knn, prop_step=snapshot) 56 | labels, validity = dm.fit_predict(show_analyzer=False, show_plots=False) 57 | # if snapshot iteration = 1000, this means we could propagate to the end properly 58 | assert (snapshot == 1000) 59 | 60 | # we are going to do some tests using iris data 61 | X_iris = load_iris()["data"] 62 | y_iris = load_iris()["target"] 63 | 64 | # we test t_SNE reduction by applying it on Iris dataset which has 4 dimentions. 65 | @pytest.mark.parametrize("file_2d", [None, 'iris_2d.csv']) 66 | @pytest.mark.parametrize("rgn_tsne", [True, False]) 67 | 68 | 69 | def test_t_SNE(rgn_tsne, file_2d): 70 | dm = DenMune(train_data=X_iris, train_truth=y_iris, k_nearest=knn, rgn_tsne=rgn_tsne, file_2d=file_2d) 71 | labels, validity = dm.fit_predict(show_analyzer=False, show_plots=False) 72 | assert (dm.data.shape[1] == 2) # this means it was reduced properly to 2-d using t-SNE 73 | 74 | def test_knn(): 75 | for k in range (5, 55, 5): 76 | dm = DenMune(train_data=X_iris, train_truth=y_iris, k_nearest=k, rgn_tsne=False) 77 | labels, validity = dm.fit_predict(show_analyzer=False, show_plots=False) 78 | #assert (k == 50) # this means we tested the algorithm works fine with several knn inputs 79 | 80 | 81 | data_file = 'https://raw.githubusercontent.com/egy1st/datasets/dd90854f92cb5ef73b4146606c1c158c32e69b94/denmune/shapes/aggr_rand.csv' 82 | data = pd.read_csv(data_file, sep=',', header=None) 83 | labels = data.iloc[:, -1] 84 | data = data.drop(data.columns[-1], axis=1) 85 | train_data = data [:555] 86 | test_data = data [555:] 87 | train_labels = labels [:555] 88 | test_labels = labels [555:] 89 | 90 | # check if data will be 
treated correctly when comes as dataframe 91 | def test_dataframe(): 92 | knn = 11 # k-nearest neighbor, the only parameter required by the algorithm 93 | dm = DenMune(train_data=train_data, train_truth=train_labels, test_data=test_data, test_truth=test_labels, k_nearest=knn, rgn_tsne=True) 94 | labels, validity = dm.fit_predict(validate=True, show_noise=True, show_analyzer=True) 95 | assert ( np.mean(dm.labels_pred == labels) > 0.97 or (1 - np.mean( dm.labels_pred == labels) > 0.97) ) 96 | 97 | 98 | def test_exceptions(): 99 | 100 | with pytest.raises(Exception) as execinfo: 101 | dm = DenMune(train_data=None, k_nearest=10) 102 | #labels, validity = dm.fit_predict() 103 | #raise Exception('train data is None') 104 | 105 | with pytest.raises(Exception) as execinfo: 106 | dm = DenMune(train_data=train_data, test_truth=test_labels, k_nearest=10) 107 | #labels, validity = dm.fit_predict() 108 | #raise Exception('train_data is not None and train_truth is None and test_truth is not None') 109 | 110 | with pytest.raises(Exception) as execinfo: 111 | dm = DenMune(train_data=train_data, test_data=test_data, k_nearest=10) 112 | #labels, validity = dm.fit_predict() 113 | #raise Exception('train_data is not None and test_data is not None and train_truth is None') 114 | 115 | with pytest.raises(Exception) as execinfo: 116 | dm = DenMune(train_data=train_data, train_truth=train_labels, test_truth=test_labels, test_data=None, k_nearest=10) 117 | #labels, validity = dm.fit_predict() 118 | #raise Exception('train_data is not None and train_truth is not None and test_truth is not None and test_data is None') 119 | with pytest.raises(Exception) as execinfo: 120 | 121 | dm = DenMune(train_data=train_data, train_truth=train_labels, k_nearest=0) # default value for k_nearest is 1 which is valid 122 | #labels, validity = dm.fit_predict() 123 | #raise Exception('k-nearest neighbor should be at least 1') 124 | --------------------------------------------------------------------------------