├── .circleci
└── config.yml
├── .github
└── workflows
│ ├── codeql.yml
│ └── python-package
├── LICENSE
├── PyPi Package
├── .vscode
│ └── settings.json
├── LICENSE
├── MANIFEST.in
├── README.md
├── pyproject.toml
├── setup.cfg
├── setup.py
└── src
│ ├── denmune.egg-info
│ ├── PKG-INFO
│ ├── SOURCES.txt
│ ├── dependency_links.txt
│ ├── requires.txt
│ └── top_level.txt
│ └── denmune
│ ├── .idea
│ ├── .gitignore
│ ├── .name
│ ├── denmune.iml
│ ├── inspectionProfiles
│ │ └── profiles_settings.xml
│ ├── misc.xml
│ └── modules.xml
│ ├── __init__.py
│ └── denmune.py
├── README.md
├── codecov.yml
├── colab
├── 2D_shapes_datasets.ipynb
├── Get_97_by_training_MNIST_dataset.ipynb
├── MNIST_dataset.ipynb
├── chameleon_datasets.ipynb
├── clustering_propagation.ipynb
├── clustering_propagation_snapshots.ipynb
├── how_to_use_it.ipynb
├── iris_dataset.ipynb
├── k_nearest_evolution.ipynb
├── noise_detection.ipynb
├── scalability_and_speed.ipynb
├── stability_vs_knn.ipynb
├── training_MNIST.ipynb
└── validation.ipynb
├── images
├── denmune-illustration.png
└── denmune_propagation.png
├── kaggle
├── beauty-of-propagation-part3.ipynb
├── detecting-non-groundtruth-datasets.ipynb
├── detection-of-2d-shape-datasets.ipynb
├── get-97-using-simple-yet-one-parameter-algorithm.ipynb
├── iris-dataset.ipynb
├── k-nearest-evolution.ipynb
├── noise-detection.ipynb
├── scalability-vs-speed.ipynb
├── stability-vs-number-of-nearest-neighbor.ipynb
├── the-beauty-of-clusters-propagation.ipynb
├── the-beauty-of-propagation-part2.ipynb
├── training-MNIST-dataset-to-get-97.ipynb
├── training-pendigits-dataset-to-get-97.ipynb
├── validation.ipynb
└── when-simple-means-powerful.ipynb
├── requirements.txt
└── src
├── __init__.py
├── denmune.py
└── tests
└── test_denmune.py
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | # Use the latest 2.1 version of CircleCI pipeline process engine.
2 | # See: https://circleci.com/docs/2.0/configuration-reference
3 | version: 2.1
4 |
5 | # Orbs are reusable packages of CircleCI configuration that you may share across projects, enabling you to create encapsulated, parameterized commands, jobs, and executors that can be used across multiple projects.
6 | # See: https://circleci.com/docs/2.0/orb-intro/
7 | orbs:
8 | # The python orb contains a set of prepackaged CircleCI configuration you can use repeatedly in your configuration files
9 | # Orb commands and jobs help you with common scripting around a language/tool
10 | # so you don't have to copy and paste it everywhere.
11 | # See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python
12 | codecov: codecov/codecov@3.0.0
13 | slack: circleci/slack@4.4.4
14 | python: circleci/python@2.1.1
15 |
16 |
17 | # Define a job to be invoked later in a workflow.
18 | # See: https://circleci.com/docs/2.0/configuration-reference/#jobs
19 | jobs:
20 |   build-and-test: # This is the name of the job, feel free to change it to better match what you're trying to do!
21 |     # The next lines define a Docker executor: https://circleci.com/docs/2.0/executor-types/
22 |     # You can specify an image from Dockerhub or use one of the convenience images from CircleCI's Developer Hub
23 |     # A list of available CircleCI Docker convenience images are available here: https://circleci.com/developer/images/image/cimg/python
24 |     # The executor is the environment in which the steps below will be executed - below will use a Python 3.10 container
25 |     # Change the version below to your required version of python
26 |     docker:
27 |       - image: cimg/python:3.10
28 |
29 |
30 |     # Checkout the code as the first step. This is a dedicated CircleCI step.
31 |     # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default.
32 |     # Here we're making sure we just use the system-wide pip. By default it uses the project root's requirements.txt.
33 |     # Then run your tests!
34 |     # CircleCI will report the results back to your VCS provider.
35 |     steps:
36 |       - checkout
37 |       - python/install-packages:
38 |           pkg-manager: pip
39 |           # app-dir: ~/project/package-directory/ # If your requirements.txt isn't in the root directory.
40 |           # pip-dependency-file: test-requirements.txt # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements.
41 |
42 |       - run:
43 |           name: Treon Test
44 |           command: |
45 |             cd colab
46 |             # git clone https://github.com/egy1st/datasets
47 |             # treon --threads=2
48 |
49 |       - run:
50 |           name: CodeCov pyTest
51 |           command: |
52 |             coverage run -m pytest
53 |             coverage report
54 |             coverage html
55 |             coverage xml
56 |             cp coverage.xml htmlcov/coverage.xml
57 |
58 |       - codecov/upload
59 |
60 |       - store_artifacts:
61 |           path: htmlcov
62 |       - slack/notify:
63 |           event: pass # send the success template only when the job passed
64 |           template: basic_success_1
65 |           channel: C0326UK1VFY
66 | # Invoke jobs via workflows
67 | # See: https://circleci.com/docs/2.0/configuration-reference/#workflows
68 | workflows:
69 | Python-3.10: # This is the name of the workflow, feel free to change it to better match your workflow.
70 | # Inside the workflow, you define the jobs you want to run.
71 | jobs:
72 | - build-and-test:
73 | context: Slack
74 |
75 |
--------------------------------------------------------------------------------
/.github/workflows/codeql.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ "main" ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ "main" ]
20 | schedule:
21 | - cron: '45 0 * * 6'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 | permissions:
28 | actions: read
29 | contents: read
30 | security-events: write
31 |
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | language: [ 'python' ]
36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
38 |
39 | steps:
40 | - name: Checkout repository
41 | uses: actions/checkout@v3
42 |
43 | # Initializes the CodeQL tools for scanning.
44 | - name: Initialize CodeQL
45 | uses: github/codeql-action/init@v2
46 | with:
47 | languages: ${{ matrix.language }}
48 | # If you wish to specify custom queries, you can do so here or in a config file.
49 | # By default, queries listed here will override any specified in a config file.
50 | # Prefix the list here with "+" to use these queries and those in the config file.
51 |
52 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
53 | # queries: security-extended,security-and-quality
54 |
55 |
56 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
57 | # If this step fails, then you should remove it and run the build manually (see below)
58 | - name: Autobuild
59 | uses: github/codeql-action/autobuild@v2
60 |
61 | # ℹ️ Command-line programs to run using the OS shell.
62 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
63 |
64 | # If the Autobuild fails above, remove it and uncomment the following three lines.
65 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
66 |
67 | # - run: |
68 | # echo "Run, Build Application using script"
69 | # ./location_of_script_within_repo/buildscript.sh
70 |
71 | - name: Perform CodeQL Analysis
72 | uses: github/codeql-action/analyze@v2
73 | with:
74 | category: "/language:${{matrix.language}}"
75 |
--------------------------------------------------------------------------------
/.github/workflows/python-package:
--------------------------------------------------------------------------------
1 | # Runs the test suite with coverage on every push and uploads the XML report to Codecov.
2 | name: workflow for codecov
3 | on: [push]
4 | jobs:
5 |   run:
6 |     runs-on: ${{ matrix.os }}
7 |     strategy:
8 |       matrix:
9 |         os: [ubuntu-latest]
10 |         python: ['3.6', '3.7', '3.8', '3.9']  # NOTE(review): confirm every version is still installable on the current ubuntu-latest image
11 |     env:
12 |       OS: ${{ matrix.os }}
13 |       PYTHON: ${{ matrix.python }}
14 |     steps:
15 |       - uses: actions/checkout@v3  # pinned to a release tag instead of the moving master ref
16 |       - name: Setup Python
17 |         uses: actions/setup-python@v4
18 |         with:
19 |           python-version: ${{ matrix.python }}  # use the matrix entry so each job tests the interpreter it reports
20 |       - name: Generate coverage report
21 |         run: |
22 |           pip install pytest
23 |           pip install pytest-cov
24 |           pip install numpy
25 |           pip install -U scikit-learn
26 |           pip install denmune
27 |           pytest --cov=./ --cov-report=xml
28 |       - name: Upload coverage to Codecov
29 |         uses: codecov/codecov-action@v2
30 |         with:
31 |           token: ${{ secrets.CODECOV_TOKEN }}  # never commit the token; store it as a repository secret and rotate the previously committed one
32 |           files: ./coverage.xml
33 |           env_vars: OS,PYTHON
34 |           fail_ci_if_error: true
35 |           flags: unittests
36 |           name: codecov-umbrella
37 |           verbose: true
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2021, Mohamed Ali Abbas
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/PyPi Package/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "restructuredtext.confPath": ""
3 | }
--------------------------------------------------------------------------------
/PyPi Package/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2021, Mohamed Ali Abbas
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 |
6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 |
8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9 |
10 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11 |
12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/PyPi Package/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include data *.ipynb *.py *.txt *.csv
--------------------------------------------------------------------------------
/PyPi Package/README.md:
--------------------------------------------------------------------------------
1 | DenMune: A density-peak clustering algorithm
2 | =============================================
3 |
4 | DenMune is a clustering algorithm that can find clusters of arbitrary size, shape and density in two dimensions. Higher-dimensional data are first reduced to 2-D using t-SNE. The algorithm relies on a single parameter K (the number of nearest neighbors). The results show the superiority of the algorithm. Enjoy the simplicity and the power of DenMune.
5 |
6 |
7 | []( https://pypi.org/project/denmune/)
8 | [](https://mybinder.org/v2/gh/egy1st/denmune-clustering-algorithm/HEAD)
9 | [](https://denmune.readthedocs.io/en/latest/?badge=latest)
10 | [](#colab)
11 | [](https://www.kaggle.com/egyfirst/denmune-clustering-iris-dataset?scriptVersionId=84775816)
12 | [](https://www.sciencedirect.com/science/article/abs/pii/S0031320320303927)
13 | [](https://data.mendeley.com/datasets/b73cw5n43r/4)
14 | [](https://choosealicense.com/licenses/bsd-3-clause/)
15 | [](https://circleci.com/gh/egy1st/denmune-clustering-algorithm/tree/main)
16 | [](https://codecov.io/gh/egy1st/denmune-clustering-algorithm)
17 |
18 | Based on the paper
19 | -------------------
20 |
21 | |Paper|Journal|
22 | |-------------------------------------------------------------------------------------------|-----------------------------|
23 | |Mohamed Abbas, Adel El-Zoghabi, Amin Shoukry,
[](https://www.scimagojr.com/journalsearch.php?q=24823&tip=sid&clean=0)
24 | |*DenMune: Density peak based clustering using mutual nearest neighbors*
25 | |In: Journal of Pattern Recognition, Elsevier,
26 | |volume 109, number 107589, January 2021
27 | |DOI: https://doi.org/10.1016/j.patcog.2020.107589
28 |
29 | Documentation:
30 | ---------------
31 | Documentation, including tutorials, are available on https://denmune.readthedocs.io
32 |
33 | [](https://denmune.readthedocs.io/en/latest/?badge=latest)
34 |
35 |
36 | Watch it in action
37 | -------------------
38 | These 30 seconds will show you how a density-based algorithm, DenMune, propagates:
39 |
40 | [](https://colab.research.google.com/drive/1o-tP3uvDGjxBOGYkir1lnbr74sZ06e0U?usp=sharing)
41 |
42 | []()
43 |
44 |
45 |
46 | When less means more
47 | --------------------
48 | Most classic clustering algorithms fail to detect complex clusters, where clusters differ in size, shape and density and exist in noisy data.
49 | Recently, a density-based algorithm named DenMune showed great ability in detecting complex shapes even in noisy data. It can detect the number of clusters automatically, and it detects both pre-identified and post-identified noise automatically and removes them.
50 |
51 | It can reach 100% accuracy in most classic pattern problems and achieves 97% on the MNIST dataset. A great advantage of this algorithm is that it is a single-parameter algorithm: all you need is to set the number of k-nearest neighbors and the algorithm will take care of the rest. Being non-sensitive to changes in k makes it robust and stable.
52 |
53 | Keep in mind that the algorithm initially reduces any N-D dataset to a 2-D dataset, so a good benefit of this algorithm is that you can always plot your data and explore it, which makes this algorithm a good candidate for data exploration. Finally, the algorithm comes with a neat package for visualizing data, validating it and analyzing the whole clustering process.
54 |
55 | How to install DenMune
56 | ------------------------
57 | Simply install DenMune clustering algorithm using pip command from the official Python repository
58 |
59 | []( https://pypi.org/project/denmune/)
60 |
61 | From the shell run the command
62 |
63 | ```shell
64 | pip install denmune
65 | ```
66 |
67 | From jupyter notebook cell run the command
68 |
69 | ```ipython3
70 | !pip install denmune
71 | ```
72 |
73 | How to use DenMune
74 | --------------------
75 | Once DenMune is installed, you just need to import it
76 |
77 | ```python
78 | from denmune import DenMune
79 | ```
80 | ###### Please note that the first denmune (the package) is in lowercase letters, while the other one (the class itself) has D and M in uppercase.
81 |
82 |
83 | Read data
84 | -----------
85 |
86 | There are four possible cases of data:
87 | - only train data without labels
88 | - only labeled train data
89 | - labeled train data in addition to test data without labels
90 | - labeled train data in addition to labeled test data
91 |
92 |
93 | ```python
94 | #=============================================
95 | # First scenario: train data without labels
96 | # ============================================
97 |
98 | data_path = 'datasets/denmune/chameleon/'
99 | dataset = "t7.10k.csv"
100 | data_file = data_path + dataset
101 |
102 | # train data without labels
103 | X_train = pd.read_csv(data_file, sep=',', header=None)
104 |
105 | knn = 39 # k-nearest neighbor, the only parameter required by the algorithm
106 |
107 | dm = DenMune(train_data=X_train, k_nearest=knn)
108 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=True)
109 |
110 | ```
111 | This is an intuitive dataset which has no groundtruth provided
112 |
113 | 
114 |
115 | ```python
116 | #=============================================
117 | # Second scenario: train data with labels
118 | # ============================================
119 |
120 | data_path = 'datasets/denmune/shapes/'
121 | dataset = "aggregation.csv"
122 | data_file = data_path + dataset
123 |
124 | # train data with labels
125 | X_train = pd.read_csv(data_file, sep=',', header=None)
126 | y_train = X_train.iloc[:, -1]
127 | X_train = X_train.drop(X_train.columns[-1], axis=1)
128 |
129 | knn = 6 # k-nearest neighbor, the only parameter required by the algorithm
130 |
131 | dm = DenMune(train_data=X_train, train_truth= y_train, k_nearest=knn)
132 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=True)
133 | ```
134 | Dataset groundtruth
135 |
136 | 
137 |
138 | Dataset as detected by DenMune at k=6
139 |
140 | 
141 |
142 |
143 | ```python
144 | #=================================================================
145 | # Third scenario: train data with labels in addition to test data
146 | # ================================================================
147 |
148 | data_path = 'datasets/denmune/pendigits/'
149 | file_2d = data_path + 'pendigits-2d.csv'
150 |
151 | # train data with labels
152 | X_train = pd.read_csv(data_path + 'train.csv', sep=',', header=None)
153 | y_train = X_train.iloc[:, -1]
154 | X_train = X_train.drop(X_train.columns[-1], axis=1)
155 |
156 | # test data without labels
157 | X_test = pd.read_csv(data_path + 'test.csv', sep=',', header=None)
158 | X_test = X_test.drop(X_test.columns[-1], axis=1)
159 |
160 | knn = 50 # k-nearest neighbor, the only parameter required by the algorithm
161 |
162 | dm = DenMune(train_data=X_train, train_truth= y_train,
163 | test_data= X_test,
164 | k_nearest=knn)
165 | labels, validity = dm.fit_predict(show_analyzer=True, show_noise=True)
166 | ```
167 | dataset groundtruth
168 |
169 | 
170 |
171 |
172 | dataset as detected by DenMune at k=50
173 |
174 | 
175 |
176 | test data as predicted by DenMune on training the dataset at k=50
177 |
178 | 
179 |
180 |
181 | Algorithm's Parameters
182 | -----------------------
183 | 1. Parameters used within the initialization of the DenMune class
184 |
185 | ```python
186 | def __init__ (self,
187 | train_data=None, test_data=None,
188 | train_truth=None, test_truth=None,
189 | file_2d =None, k_nearest=None,
190 | rgn_tsne=False, prop_step=0,
191 | ):
192 | ```
193 |
194 | - train_data:
195 | - data used for training the algorithm
196 |   - default: None. It should be provided by the user, otherwise an error will be raised.
197 |
198 | - train_truth:
199 | - labels of training data
200 | - default: None
201 |
202 | - test_data:
203 | - data used for testing the algorithm
204 |
205 | - test_truth:
206 | - labels of testing data
207 | - default: None
208 |
209 | - k_nearest:
210 | - number of nearest neighbor
211 |   - default: 0. The default is invalid; k-nearest neighbor should be at least 1.
212 |
213 | - rgn_tsne:
214 |   - when set to True: it will regenerate the reduced 2-D version of the N-D dataset each time the algorithm runs.
215 |   - when set to False: it will generate the reduced 2-D version of the N-D dataset the first time only, then reuse the existing saved file
216 |   - default: False
217 |
218 | - file_2d: name (including location) of the file used to save/load the reduced 2-D version
219 | - if empty: the algorithm will create temporary file named '_temp_2d'
220 | - default: None
221 |
222 | - prop_step:
223 | - size of increment used in showing the clustering propagation.
224 | - leave this parameter set to 0, the default value, unless you are willing intentionally to enter the propagation mode.
225 | - default: 0
226 |
227 |
228 | 2. Parameters used within the fit_predict function:
229 |
230 | ```python
231 | def fit_predict(self,
232 | validate=True,
233 | show_plots=True,
234 | show_noise=True,
235 | show_analyzer=True
236 | ):
237 | ```
238 |
239 | - validate:
240 |   - validate data on/off according to five measures integrated with DenMune (Accuracy, F1-score, NMI index, AMI index, ARI index)
241 | - default: True
242 |
243 | - show_plots:
244 | - show/hide plotting of data
245 | - default: True
246 |
247 | - show_noise:
248 | - show/hide noise and outlier
249 | - default: True
250 |
251 | - show_analyzer:
252 | - show/hide the analyzer
253 | - default: True
254 |
255 | The Analyzer
256 | -------------
257 |
258 | The algorithm provides an intuitive tool called the analyzer; once called, it will provide you with an in-depth analysis of how your clustering results perform.
259 |
260 | 
261 |
262 | Noise Detection
263 | ----------------
264 |
265 | DenMune detects noise and outliers automatically; no further work is needed on your side.
266 |
267 | - It plots pre-identified noise in black
268 | - It plots post-identified noise in light grey
269 |
270 | You can set show_noise parameter to False.
271 |
272 |
273 | ```python
274 |
275 | # let us show noise
276 |
277 | dm = DenMune(train_data=X_train, k_nearest=knn)
278 | labels, validity = dm.fit_predict(show_noise=True)
279 | ```
280 |
281 | ```python
282 |
283 | # let us show clean data by removing noise
284 |
285 | dm = DenMune(train_data=X_train, k_nearest=knn)
286 | labels, validity = dm.fit_predict(show_noise=False)
287 | ```
288 |
289 | | noisy data | clean data |
290 | ----------| ---------------------------------------------------------------------------------------------------|
291 | |  |  |
292 |
293 |
294 | Validation
295 | --------------
296 | You can get your validation results using 3 methods
297 |
298 | - by showing the Analyzer
299 | - extract values from the validity returned list from fit_predict function
300 | - extract values from the Analyzer dictionary
301 | -
302 | There are five validity measures built-in the algorithm, which are:
303 |
304 | - ACC, Accuracy
305 | - F1 score
306 | - NMI index (Normalized Mutual Information)
307 | - AMI index (Adjusted Mutual Information)
308 | - ARI index (Adjusted Rand Index)
309 |
310 | 
311 |
312 | K-nearest Evolution
313 | -------------------
314 | The following chart shows the evolution of pre and post identified noise in correspondence to increase of number of knn. Also, detected number of clusters is analyzed in the same chart in relation with both types of identified noise.
315 |
316 | 
317 |
318 |
319 | The Scalability
320 | ----------------
321 | | data size | time |
322 | |------------------| ------------------- |
323 | | data size: 5000 | time: 2.3139 seconds |
324 | | data size: 10000 | time: 5.8752 seconds |
325 | | data size: 15000 | time: 12.4535 seconds |
326 | | data size: 20000 | time: 18.8466 seconds |
327 | | data size: 25000 | time: 28.992 seconds |
328 | | data size: 30000 | time: 39.3166 seconds |
329 | | data size: 35000 | time: 39.4842 seconds |
330 | | data size: 40000 | time: 63.7649 seconds |
331 | | data size: 45000 | time: 73.6828 seconds |
332 | | data size: 50000 | time: 86.9194 seconds |
333 | | data size: 55000 | time: 90.1077 seconds |
334 | | data size: 60000 | time: 125.0228 seconds |
335 | | data size: 65000 | time: 149.1858 seconds |
336 | | data size: 70000 | time: 177.4184 seconds |
337 | | data size: 75000 | time: 204.0712 seconds |
338 | | data size: 80000 | time: 220.502 seconds |
339 | | data size: 85000 | time: 251.7625 seconds |
340 | | data size: 100000 | time: 257.563 seconds |
341 |
342 | | 
343 |
344 | The Stability
345 | --------------
346 |
347 | The algorithm has only a single parameter; moreover, it is not sensitive to changes in that parameter, k. You can see that for yourself in the following chart. This is of great benefit to you as a data exploration analyst: you can simply explore the dataset using an arbitrary k. Being non-sensitive to changes in k makes it robust and stable.
348 |
349 | 
350 |
351 |
352 | Reveal the propagation
353 | -----------------------
354 |
355 | One of the top features of this algorithm is that it enables you to watch how your clusters propagate to construct the final output clusters.
356 | Just use the parameter 'prop_step' as in the following example:
357 |
358 | ```python
359 | dataset = "t7.10k" #
360 | data_path = 'datasets/denmune/chameleon/'
361 |
362 | # train file
363 | data_file = data_path + dataset +'.csv'
364 | X_train = pd.read_csv(data_file, sep=',', header=None)
365 |
366 |
367 | from itertools import chain
368 |
369 | # DenMune's parameters
370 | knn = 39 # number of k-nearest neighbor, the only parameter required by the algorithm
371 |
372 | # create list of different snapshots of the propagation
373 | snapshots = chain(range(2,5), range(5,50,10), range(50, 100, 25), range(100,500,100), range(500,2000, 250), range(1000,5500, 500))
374 |
375 | from IPython.display import clear_output
376 | for snapshot in snapshots:
377 | print ("itration", snapshot )
378 | clear_output(wait=True)
379 | dm = DenMune(train_data=X_train, k_nearest=knn, rgn_tsne=False, prop_step=snapshot)
380 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=False)
381 | ```
382 |
383 | []()
384 |
385 | Interact with the algorithm
386 | ---------------------------
387 | [](https://colab.research.google.com/drive/1EUROd6TRwxW3A_XD3KTxL8miL2ias4Ue?usp=sharing)
388 |
389 | This notebook allows you to interact with the algorithm in many aspects:
390 | - you can choose which dataset to cluster (among 4 chameleon datasets)
391 | - you can decide which number of k-nearest neighbor to use
392 | - show noise on/off; thus you can investigate noise detected by the algorithm
393 | - show analyzer on/off
394 |
395 | How to run and test
396 | --------------------
397 |
398 | 1. Launch Examples in Repo2Docker Binder
399 |
400 | Simply use our repo2docker offered by mybinder.org, which encapsulates the algorithm and all required data in one virtual machine instance. All Jupyter notebook examples found in this repository will also be available for you to practice with in this repo2docker. Thanks mybinder.org, you made it possible!
401 |
402 | [](https://mybinder.org/v2/gh/egy1st/denmune-clustering-algorithm/HEAD)
403 |
404 | 2. Launch each Example in Kaggle workspace
405 |
406 | If you are a Kaggler like me, then Kaggle, the best workspace where data scientists meet, should suit you for testing the algorithm with a great experience.
407 |
408 | | Dataset | Kaggle URL |
409 | ----------| ---------------------------------------------------------------------------------------------------|
410 | |When less means more - kaggle |[]( https://www.kaggle.com/egyfirst/when-less-means-more) |
411 | |Non-groundtruth datasets - kaggle|[](https://www.kaggle.com/egyfirst/detecting-non-groundtruth-datasets) |
412 | |2D Shape datasets - kaggle|[](https://www.kaggle.com/egyfirst/detection-of-2d-shape-datasets) |
413 | |MNIST dataset kaggle|[](https://www.kaggle.com/egyfirst/get-97-using-simple-yet-one-parameter-algorithm) |
414 | |Iris dataset kaggle| [](https://www.kaggle.com/egyfirst/denmune-clustering-iris-dataset) |
415 | |Training MNIST to get 97%| []( https://www.kaggle.com/egyfirst/training-mnist-dataset-to-get-97) |
416 | |Noise detection - kaggle| []( https://www.kaggle.com/egyfirst/noise-detection) |
417 | |Validation - kaggle| [](https://www.kaggle.com/egyfirst/validate-in-5-built-in-validity-insexes) |
418 | |The beauty of propagation - kaggle| [](https://www.kaggle.com/egyfirst/the-beauty-of-clusters-propagation) |
419 | |The beauty of propagation part2 - kaggle | [](https://www.kaggle.com/egyfirst/the-beauty-of-propagation-part2) |
420 | |Snapshots of propagation -kaggle| [](https://www.kaggle.com/egyfirst/beauty-of-propagation-part3) |
421 | |Scalability kaggle| [](https://www.kaggle.com/egyfirst/scalability-vs-speed) |
422 | |Stability - kaggle| [](https://www.kaggle.com/egyfirst/stability-vs-number-of-nearest-neighbor) |
423 | |k-nearest-evolution - kaggle| [](https://www.kaggle.com/egyfirst/k-nearest-evolution) |
424 |
425 | 3. Launch each Example in Google Research, CoLab
426 |
427 | If you need to test the examples one by one, here is another option: use Colab, offered by Google Research, to test each example individually.
428 |
429 |
430 |
431 | Here is a list of Google CoLab URL to use the algorithm interactively
432 | ----------------------------------------------------------------------
433 |
434 |
435 | | Dataset | CoLab URL |
436 | ----------| ---------------------------------------------------------------------------------------------------|
437 | |How to use it - colab|[]( https://colab.research.google.com/drive/1J_uKdhZ3z1KeY0-wJ7Ruw2PZSY1orKQm)|
438 | |Chameleon datasets - colab|[](https://colab.research.google.com/drive/1EUROd6TRwxW3A_XD3KTxL8miL2ias4Ue?usp=sharing) |
439 | |2D Shape datasets - colab|[]( https://colab.research.google.com/drive/1EaqTPCRHSuTKB-qEbnWHpGKFj6XytMIk?usp=sharing) |
440 | |MNIST dataset - colab|[](https://colab.research.google.com/drive/1a9FGHRA6IPc5jhLOV46iEbpUeQXptSJp?usp=sharing) |
441 | |iris dataset - colab|[](https://colab.research.google.com/drive/1nKql57Xh7xVVu6NpTbg3vRdRg42R7hjm?usp=sharing) |
442 | |Get 97% by training MNIST dataset - colab|[]( https://colab.research.google.com/drive/1NeOtXEQY94oD98Ufbh3IhTHnnYwIA659) |
443 | |Non-groundtruth datasets - colab|[]( https://colab.research.google.com/drive/1d17ejQ83aUy0CZIeQ7bHTugSC9AjJ2mU?usp=sharing) |
444 | |Noise detection - colab|[]( https://colab.research.google.com/drive/1Bp3c-cJfjLWxupmrBJ_6Q4-nqIfZcII4) |
445 | |Validation - colab|[]( https://colab.research.google.com/drive/13_EVaQOv_QiNmQiMWJAcFFHPJHGCrQLe) |
446 | |How it propagates - colab|[](https://colab.research.google.com/drive/1o-tP3uvDGjxBOGYkir1lnbr74sZ06e0U?usp=sharing)|
447 | |Snapshots of propagation - colab|[](https://colab.research.google.com/drive/1vPXNKa8Rf3TnqDHSD3YSWl3g1iNSqjl2?usp=sharing)|
448 | |Scalability - colab|[](https://colab.research.google.com/drive/1d55wkBndLLapO7Yx1ePHhE8mL61j9-TH?usp=sharing)|
449 | |Stability vs number of nearest neighbors - colab|[](https://colab.research.google.com/drive/17VgVRMFBWvkSIH1yA3tMl6UQ7Eu68K2l?usp=sharing)|
450 | |k-nearest-evolution - colab|[]( https://colab.research.google.com/drive/1DZ-CQPV3WwJSiaV3-rjwPwmXw4RUh8Qj)|
451 |
452 |
453 |
454 | How to cite
455 | =====
456 | If you have used this codebase in a scientific publication and wish to cite it, please use the [Journal of Pattern Recognition article](https://www.sciencedirect.com/science/article/abs/pii/S0031320320303927)
457 |
458 | Mohamed Abbas, Adel El-Zoghabi, Amin Shoukry, *DenMune: Density peak based clustering using mutual nearest neighbors*
459 | In: Journal of Pattern Recognition, Elsevier, volume 109, number 107589.
460 | January 2021
461 |
462 |
463 | ```bib
464 | @article{ABBAS2021107589,
465 | title = {DenMune: Density peak based clustering using mutual nearest neighbors},
466 | journal = {Pattern Recognition},
467 | volume = {109},
468 | pages = {107589},
469 | year = {2021},
470 | issn = {0031-3203},
471 | doi = {https://doi.org/10.1016/j.patcog.2020.107589},
472 | url = {https://www.sciencedirect.com/science/article/pii/S0031320320303927},
473 | author = {Mohamed Abbas and Adel El-Zoghabi and Amin Shoukry},
474 | keywords = {Clustering, Mutual neighbors, Dimensionality reduction, Arbitrary shapes, Pattern recognition, Nearest neighbors, Density peak},
475 | abstract = {Many clustering algorithms fail when clusters are of arbitrary shapes, of varying densities, or the data classes are unbalanced and close to each other, even in two dimensions. A novel clustering algorithm “DenMune” is presented to meet this challenge. It is based on identifying dense regions using mutual nearest neighborhoods of size K, where K is the only parameter required from the user, besides obeying the mutual nearest neighbor consistency principle. The algorithm is stable for a wide range of values of K. Moreover, it is able to automatically detect and remove noise from the clustering process as well as detecting the target clusters. It produces robust results on various low and high dimensional datasets relative to several known state of the art clustering algorithms.}
476 | }
477 | ```
478 |
479 | Licensing
480 | ------------
481 |
482 | The DenMune algorithm is 3-clause BSD licensed. Enjoy.
483 |
484 | [](https://choosealicense.com/licenses/bsd-3-clause/)
485 |
486 |
487 | Task List
488 | ------------
489 |
490 | - [x] Update GitHub with the DenMune source code
491 | - [x] create repo2docker repository
492 | - [x] Create pip Package
493 | - [x] create CoLab shared examples
494 | - [x] create documentation
495 | - [x] create Kaggle shared examples
496 | - [x] PEP8 compliant
497 | - [x] Continuous integration
498 | - [x] scikit-learn compatible
499 | - [X] Unit tests (coverage: 100%)
500 | - [ ] create conda package
501 |
502 |
--------------------------------------------------------------------------------
/PyPi Package/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=42",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/PyPi Package/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = denmune
3 | version = 0.0.96
4 | author = Mohamed Ali Abbas
5 | author_email = mohamed.alyabbas@outlook.com
6 | description = This is the package for DenMune Clustering Algorithm published in paper https://doi.org/10.1016/j.patcog.2020.107589
7 | long_description = file: README.md
8 | long_description_content_type = text/markdown
9 | url = https://github.com/egy1st/denmune-clustering-algorithm
10 | project_urls =
11 | Bug Tracker = https://github.com/egy1st/denmune-clustering-algorithm/issues
12 | classifiers =
13 | Programming Language :: Python :: 3
14 | License :: OSI Approved :: BSD License
15 | Operating System :: OS Independent
16 |
17 | [options]
18 | package_dir =
19 | = src
20 | packages = find:
21 | python_requires = >=3.6
22 |
23 | [options.packages.find]
24 | where = src
25 |
--------------------------------------------------------------------------------
/PyPi Package/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 | install_requires=[
5 |
6 | 'numpy==1.23.5',
7 | 'pandas==1.5.3',
8 | 'matplotlib==3.7.2',
9 | 'scikit-learn==1.2.2',
10 | 'seaborn==0.12.2',
11 | 'ngt==2.0.4',
12 | 'anytree==2.8',
13 | 'treelib==1.6.1',
14 | ]
15 |
16 | )
17 |
--------------------------------------------------------------------------------
/PyPi Package/src/denmune.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: denmune
3 | Version: 0.0.96
4 | Summary: This is the package for DenMune Clustering Algorithm published in paper https://doi.org/10.1016/j.patcog.2020.107589
5 | Home-page: https://github.com/egy1st/denmune-clustering-algorithm
6 | Author: Mohamed Ali Abbas
7 | Author-email: mohamed.alyabbas@outlook.com
8 | License: UNKNOWN
9 | Project-URL: Bug Tracker, https://github.com/egy1st/denmune-clustering-algorithm/issues
10 | Platform: UNKNOWN
11 | Classifier: Programming Language :: Python :: 3
12 | Classifier: License :: OSI Approved :: BSD License
13 | Classifier: Operating System :: OS Independent
14 | Requires-Python: >=3.6
15 | Description-Content-Type: text/markdown
16 | License-File: LICENSE
17 |
18 | DenMune: A density-peak clustering algorithm
19 | =============================================
20 |
21 | DenMune is a clustering algorithm that can find clusters of arbitrary size, shapes and densities in two-dimensions. Higher dimensions are first reduced to 2-D using t-SNE. The algorithm relies on a single parameter K (the number of nearest neighbors). The results show the superiority of the algorithm. Enjoy the simplicity and the power of DenMune.
22 |
23 |
24 | []( https://pypi.org/project/denmune/)
25 | [](https://mybinder.org/v2/gh/egy1st/denmune-clustering-algorithm/HEAD)
26 | [](https://denmune.readthedocs.io/en/latest/?badge=latest)
27 | [](#colab)
28 | [](https://www.kaggle.com/egyfirst/denmune-clustering-iris-dataset?scriptVersionId=84775816)
29 | [](https://www.sciencedirect.com/science/article/abs/pii/S0031320320303927)
30 | [](https://data.mendeley.com/datasets/b73cw5n43r/4)
31 | [](https://choosealicense.com/licenses/bsd-3-clause/)
32 | [](https://circleci.com/gh/egy1st/denmune-clustering-algorithm/tree/main)
33 | [](https://codecov.io/gh/egy1st/denmune-clustering-algorithm)
34 | [](https://github.com/egy1st/denmune-clustering-algorithm/actions/workflows/python-package.yml)
35 |
36 | Based on the paper
37 | -------------------
38 |
39 | |Paper|Journal|
40 | |-------------------------------------------------------------------------------------------|-----------------------------|
41 | |Mohamed Abbas, Adel El-Zoghabi, Amin Shoukry, [](https://www.scimagojr.com/journalsearch.php?q=24823&tip=sid&clean=0)
42 | |*DenMune: Density peak based clustering using mutual nearest neighbors*
43 | |In: Journal of Pattern Recognition, Elsevier,
44 | |volume 109, number 107589, January 2021
45 | |DOI: https://doi.org/10.1016/j.patcog.2020.107589
46 |
47 | Documentation:
48 | ---------------
49 | Documentation, including tutorials, are available on https://denmune.readthedocs.io
50 |
51 | [](https://denmune.readthedocs.io/en/latest/?badge=latest)
52 |
53 |
54 | Watch it in action
55 | -------------------
56 | These 30 seconds will show you how a density-based algorithm, DenMune, propagates:
57 |
58 | [](https://colab.research.google.com/drive/1o-tP3uvDGjxBOGYkir1lnbr74sZ06e0U?usp=sharing)
59 |
60 | []()
61 |
62 |
63 |
64 | When less means more
65 | --------------------
66 | Most classic clustering algorithms fail to detect complex clusters, where clusters are of different sizes, shapes, and densities and exist in noisy data.
67 | Recently, a density-based algorithm named DenMune showed great ability in detecting complex shapes even in noisy data. It can detect the number of clusters automatically, and it detects both pre-identified noise and post-identified noise automatically and removes them.
68 |
69 | It can reach 100% accuracy in most classic pattern problems and achieves 97% on the MNIST dataset. A great advantage of this algorithm is that it is a single-parameter algorithm: all you need is to set the number of k-nearest neighbors and the algorithm will take care of the rest. Being non-sensitive to changes in k makes it robust and stable.
70 |
71 | Keep in mind that the algorithm initially reduces any N-D dataset to a 2-D dataset, so a good benefit of this algorithm is that you can always plot your data and explore it, which makes this algorithm a good candidate for data exploration. Finally, the algorithm comes with a neat package for visualizing data, validating it, and analyzing the whole clustering process.
72 |
73 | How to install DenMune
74 | ------------------------
75 | Simply install DenMune clustering algorithm using pip command from the official Python repository
76 |
77 | []( https://pypi.org/project/denmune/)
78 |
79 | From the shell run the command
80 |
81 | ```shell
82 | pip install denmune
83 | ```
84 |
85 | From jupyter notebook cell run the command
86 |
87 | ```ipython3
88 | !pip install denmune
89 | ```
90 |
91 | How to use DenMune
92 | --------------------
93 | Once DenMune is installed, you just need to import it
94 |
95 | ```python
96 | from denmune import DenMune
97 | ```
98 | ###### Please note that the first denmune (the package) is in lowercase letters, while the other one (the class itself) has a capital D and M.
99 |
100 |
101 | Read data
102 | -----------
103 |
104 | There are four possible cases of data:
105 | - only train data without labels
106 | - only labeled train data
107 | - labeled train data in addition to test data without labels
108 | - labeled train data in addition to labeled test data
109 |
110 |
111 | ```python
112 | #=============================================
113 | # First scenario: train data without labels
114 | # ============================================
115 |
116 | data_path = 'datasets/denmune/chameleon/'
117 | dataset = "t7.10k.csv"
118 | data_file = data_path + dataset
119 |
120 | # train data without labels
121 | X_train = pd.read_csv(data_file, sep=',', header=None)
122 |
123 | knn = 39 # k-nearest neighbor, the only parameter required by the algorithm
124 |
125 | dm = DenMune(train_data=X_train, k_nearest=knn)
126 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=True)
127 |
128 | ```
129 | This is an intuitive dataset which has no groundtruth provided
130 |
131 | 
132 |
133 | ```python
134 | #=============================================
135 | # Second scenario: train data with labels
136 | # ============================================
137 |
138 | data_path = 'datasets/denmune/shapes/'
139 | dataset = "aggregation.csv"
140 | data_file = data_path + dataset
141 |
142 | # train data with labels
143 | X_train = pd.read_csv(data_file, sep=',', header=None)
144 | y_train = X_train.iloc[:, -1]
145 | X_train = X_train.drop(X_train.columns[-1], axis=1)
146 |
147 | knn = 6 # k-nearest neighbor, the only parameter required by the algorithm
148 |
149 | dm = DenMune(train_data=X_train, train_truth= y_train, k_nearest=knn)
150 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=True)
151 | ```
152 | Dataset groundtruth
153 |
154 | 
155 |
156 | Dataset as detected by DenMune at k=6
157 |
158 | 
159 |
160 |
161 | ```python
162 | #=================================================================
163 | # Third scenario: train data with labels in addition to test data
164 | # ================================================================
165 |
166 | data_path = 'datasets/denmune/pendigits/'
167 | file_2d = data_path + 'pendigits-2d.csv'
168 |
169 | # train data with labels
170 | X_train = pd.read_csv(data_path + 'train.csv', sep=',', header=None)
171 | y_train = X_train.iloc[:, -1]
172 | X_train = X_train.drop(X_train.columns[-1], axis=1)
173 |
174 | # test data without labels
175 | X_test = pd.read_csv(data_path + 'test.csv', sep=',', header=None)
176 | X_test = X_test.drop(X_test.columns[-1], axis=1)
177 |
178 | knn = 50 # k-nearest neighbor, the only parameter required by the algorithm
179 |
180 | dm = DenMune(train_data=X_train, train_truth= y_train,
181 | test_data= X_test,
182 | k_nearest=knn)
183 | labels, validity = dm.fit_predict(show_analyzer=True, show_noise=True)
184 | ```
185 | dataset groundtruth
186 |
187 | 
188 |
189 |
190 | dataset as detected by DenMune at k=50
191 |
192 | 
193 |
194 | test data as predicted by DenMune on training the dataset at k=50
195 |
196 | 
197 |
198 |
199 | Algorithm's Parameters
200 | -----------------------
201 | 1. Parameters used within the initialization of the DenMune class
202 |
203 | ```python
204 | def __init__ (self,
205 | train_data=None, test_data=None,
206 | train_truth=None, test_truth=None,
207 | file_2d ='_temp_2d', k_nearest=10,
208 | rgn_tsne=False, prop_step=0,
209 | ):
210 | ```
211 |
212 | - train_data:
213 | - data used for training the algorithm
214 | - default: None. It should be provided by the user, otherwise an error will be raised.
215 |
216 | - train_truth:
217 | - labels of training data
218 | - default: None
219 |
220 | - test_data:
221 | - data used for testing the algorithm
222 |
223 | - test_truth:
224 | - labels of testing data
225 | - default: None
226 |
227 | - k_nearest:
228 | - number of nearest neighbor
229 | - default: 10. It should be provided by the user.
230 |
231 | - rgn_tsne:
232 | - when set to True: It will regenerate the reduced 2-D version of the N-D dataset each time the algorithm runs.
233 | - when set to False: It will generate the reduced 2-D version of the N-D dataset the first time only, then will reuse the saved existing file
234 | - default: False
235 |
236 | - file_2d: name (include location) of file used save/load the reduced 2-d version
237 | - if empty: the algorithm will create temporary file named '_temp_2d'
238 | - default: _temp_2d
239 |
240 | - prop_step:
241 | - size of increment used in showing the clustering propagation.
242 | - leave this parameter set to 0, the default value, unless you are willing intentionally to enter the propagation mode.
243 | - default: 0
244 |
245 |
246 | 2. Parameters used within the fit_predict function:
247 |
248 | ```python
249 | def fit_predict(self,
250 | validate=True,
251 | show_plots=True,
252 | show_noise=True,
253 | show_analyzer=True
254 | ):
255 | ```
256 |
257 | - validate:
258 | - validate data on/off according to five measures integrated with DenMune (Accuracy, F1-score, NMI index, AMI index, ARI index)
259 | - default: True
260 |
261 | - show_plots:
262 | - show/hide plotting of data
263 | - default: True
264 |
265 | - show_noise:
266 | - show/hide noise and outlier
267 | - default: True
268 |
269 | - show_analyzer:
270 | - show/hide the analyzer
271 | - default: True
272 |
273 | The Analyzer
274 | -------------
275 |
276 | The algorithm provides an intuitive tool called the analyzer; once called, it will provide you with an in-depth analysis of how your clustering results perform.
277 |
278 | 
279 |
280 | Noise Detection
281 | ----------------
282 |
283 | DenMune detects noise and outliers automatically; no further work is needed on your side.
284 |
285 | - It plots pre-identified noise in black
286 | - It plots post-identified noise in light grey
287 |
288 | You can set show_noise parameter to False.
289 |
290 |
291 | ```python
292 |
293 | # let us show noise
294 |
295 | dm = DenMune(train_data=X_train, k_nearest=knn)
296 | labels, validity = dm.fit_predict(show_noise=True)
297 | ```
298 |
299 | ```python
300 |
301 | # let us show clean data by removing noise
302 |
303 | dm = DenMune(train_data=X_train, k_nearest=knn)
304 | labels, validity = dm.fit_predict(show_noise=False)
305 | ```
306 |
307 | | noisy data | clean data |
308 | ----------| ---------------------------------------------------------------------------------------------------|
309 | |  |  |
310 |
311 |
312 | Validation
313 | --------------
314 | You can get your validation results using 3 methods
315 |
316 | - by showing the Analyzer
317 | - extract values from the validity returned list from fit_predict function
318 | - extract values from the Analyzer dictionary
319 | -
320 | There are five validity measures built-in the algorithm, which are:
321 |
322 | - ACC, Accuracy
323 | - F1 score
324 | - NMI index (Normalized Mutual Information)
325 | - AMI index (Adjusted Mutual Information)
326 | - ARI index (Adjusted Rand Index)
327 |
328 | 
329 |
330 | K-nearest Evolution
331 | -------------------
332 | The following chart shows the evolution of pre and post identified noise in correspondence to increase of number of knn. Also, detected number of clusters is analyzed in the same chart in relation with both types of identified noise.
333 |
334 | 
335 |
336 |
337 | The Scalability
338 | ----------------
339 | | data size | time |
340 | |------------------| ------------------- |
341 | | data size: 5000 | time: 2.3139 seconds |
342 | | data size: 10000 | time: 5.8752 seconds |
343 | | data size: 15000 | time: 12.4535 seconds |
344 | | data size: 20000 | time: 18.8466 seconds |
345 | | data size: 25000 | time: 28.992 seconds |
346 | | data size: 30000 | time: 39.3166 seconds |
347 | | data size: 35000 | time: 39.4842 seconds |
348 | | data size: 40000 | time: 63.7649 seconds |
349 | | data size: 45000 | time: 73.6828 seconds |
350 | | data size: 50000 | time: 86.9194 seconds |
351 | | data size: 55000 | time: 90.1077 seconds |
352 | | data size: 60000 | time: 125.0228 seconds |
353 | | data size: 65000 | time: 149.1858 seconds |
354 | | data size: 70000 | time: 177.4184 seconds |
355 | | data size: 75000 | time: 204.0712 seconds |
356 | | data size: 80000 | time: 220.502 seconds |
357 | | data size: 85000 | time: 251.7625 seconds |
358 | | data size: 100000 | time: 257.563 seconds |
359 |
360 | 
361 |
362 | The Stability
363 | --------------
364 |
365 | The algorithm has only a single parameter; moreover, it is not sensitive to changes in that parameter, k. You may see that for yourself from the following chart. This is of great benefit to you as a data exploration analyst: you can simply explore the dataset using an arbitrary k. Being non-sensitive to changes in k makes it robust and stable.
366 |
367 | 
368 |
369 |
370 | Reveal the propagation
371 | -----------------------
372 |
373 | One of the top-performing features of this algorithm is that it enables you to watch how your clusters propagate to construct the final output clusters.
374 | Just use the parameter 'prop_step' as in the following example:
375 |
376 | ```python
377 | dataset = "t7.10k" #
378 | data_path = 'datasets/denmune/chameleon/'
379 |
380 | # train file
381 | data_file = data_path + dataset +'.csv'
382 | X_train = pd.read_csv(data_file, sep=',', header=None)
383 |
384 |
385 | from itertools import chain
386 |
387 | # DenMune's Parameters
388 | knn = 39 # number of k-nearest neighbor, the only parameter required by the algorithm
389 |
390 | # create list of different snapshots of the propagation
391 | snapshots = chain(range(2,5), range(5,50,10), range(50, 100, 25), range(100,500,100), range(500,2000, 250), range(1000,5500, 500))
392 |
393 | from IPython.display import clear_output
394 | for snapshot in snapshots:
395 | print ("itration", snapshot )
396 | clear_output(wait=True)
397 | dm = DenMune(train_data=X_train, k_nearest=knn, rgn_tsne=False, prop_step=snapshot)
398 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=False)
399 | ```
400 |
401 | []()
402 |
403 | Interact with the algorithm
404 | ---------------------------
405 | [](https://colab.research.google.com/drive/1EUROd6TRwxW3A_XD3KTxL8miL2ias4Ue?usp=sharing)
406 |
407 | This notebook allows you to interact with the algorithm in many aspects:
408 | - you can choose which dataset to cluster (among 4 chameleon datasets)
409 | - you can decide which number of k-nearest neighbor to use
410 | - show noise on/off; thus you can investigate noise detected by the algorithm
411 | - show analyzer on/off
412 |
413 | How to run and test
414 | --------------------
415 |
416 | 1. Launch Examples in Repo2Docker Binder
417 |
418 | Simply use our repo2docker offered by mybinder.org, which encapsulates the algorithm and all required data in one virtual machine instance. All Jupyter notebook examples found in this repository will also be available for you to practice with in this repo2docker. Thanks mybinder.org, you made it possible!
419 |
420 | [](https://mybinder.org/v2/gh/egy1st/denmune-clustering-algorithm/HEAD)
421 |
422 | 2. Launch each Example in Kaggle workspace
423 |
424 | If you are a Kaggler like me, then Kaggle, the best workspace where data scientists meet, should suit you for testing the algorithm with a great experience.
425 |
426 | | Dataset | Kaggle URL |
427 | ----------| ---------------------------------------------------------------------------------------------------|
428 | |When less means more - kaggle |[]( https://www.kaggle.com/egyfirst/when-less-means-more) |
429 | |Non-groundtruth datasets - kaggle|[](https://www.kaggle.com/egyfirst/detecting-non-groundtruth-datasets) |
430 | |2D Shape datasets - kaggle|[](https://www.kaggle.com/egyfirst/detection-of-2d-shape-datasets) |
431 | |MNIST dataset kaggle|[](https://www.kaggle.com/egyfirst/get-97-using-simple-yet-one-parameter-algorithm) |
432 | |Iris dataset kaggle| [](https://www.kaggle.com/egyfirst/denmune-clustering-iris-dataset) |
433 | |Training MNIST to get 97%| []( https://www.kaggle.com/egyfirst/training-mnist-dataset-to-get-97) |
434 | |Noise detection - kaggle| []( https://www.kaggle.com/egyfirst/noise-detection) |
435 | |Validation - kaggle| [](https://www.kaggle.com/egyfirst/validate-in-5-built-in-validity-insexes) |
436 | |The beauty of propagation - kaggle| [](https://www.kaggle.com/egyfirst/the-beauty-of-clusters-propagation) |
437 | |The beauty of propagation part2 - kaggle | [](https://www.kaggle.com/egyfirst/the-beauty-of-propagation-part2) |
438 | |Snapshots of propagation -kaggle| [](https://www.kaggle.com/egyfirst/beauty-of-propagation-part3) |
439 | |Scalability kaggle| [](https://www.kaggle.com/egyfirst/scalability-vs-speed) |
440 | |Stability - kaggle| [](https://www.kaggle.com/egyfirst/stability-vs-number-of-nearest-neighbor) |
441 | |k-nearest-evolution - kaggle| [](https://www.kaggle.com/egyfirst/k-nearest-evolution) |
442 |
443 | 3. Launch each Example in Google Research, CoLab
444 |
445 | If you need to test the examples one by one, here is another option: use Colab, offered by Google Research, to test each example individually.
446 |
447 |
448 |
449 | Here is a list of Google CoLab URLs to use the algorithm interactively
450 | ----------------------------------------------------------------------
451 |
452 |
453 | | Dataset | CoLab URL |
454 | ----------| ---------------------------------------------------------------------------------------------------|
455 | |How to use it - colab|[]( https://colab.research.google.com/drive/1J_uKdhZ3z1KeY0-wJ7Ruw2PZSY1orKQm)|
456 | |Chameleon datasets - colab|[](https://colab.research.google.com/drive/1EUROd6TRwxW3A_XD3KTxL8miL2ias4Ue?usp=sharing) |
457 | |2D Shape datasets - colab|[]( https://colab.research.google.com/drive/1EaqTPCRHSuTKB-qEbnWHpGKFj6XytMIk?usp=sharing) |
458 | |MNIST dataset - colab|[](https://colab.research.google.com/drive/1a9FGHRA6IPc5jhLOV46iEbpUeQXptSJp?usp=sharing) |
459 | |iris dataset - colab|[](https://colab.research.google.com/drive/1nKql57Xh7xVVu6NpTbg3vRdRg42R7hjm?usp=sharing) |
460 | |Get 97% by training MNIST dataset - colab|[]( https://colab.research.google.com/drive/1NeOtXEQY94oD98Ufbh3IhTHnnYwIA659) |
461 | |Non-groundtruth datasets - colab|[]( https://colab.research.google.com/drive/1d17ejQ83aUy0CZIeQ7bHTugSC9AjJ2mU?usp=sharing) |
462 | |Noise detection - colab|[]( https://colab.research.google.com/drive/1Bp3c-cJfjLWxupmrBJ_6Q4-nqIfZcII4) |
463 | |Validation - colab|[]( https://colab.research.google.com/drive/13_EVaQOv_QiNmQiMWJAcFFHPJHGCrQLe) |
464 | |How it propagates - colab|[](https://colab.research.google.com/drive/1o-tP3uvDGjxBOGYkir1lnbr74sZ06e0U?usp=sharing)|
465 | |Snapshots of propagation - colab|[](https://colab.research.google.com/drive/1vPXNKa8Rf3TnqDHSD3YSWl3g1iNSqjl2?usp=sharing)|
466 | |Scalability - colab|[](https://colab.research.google.com/drive/1d55wkBndLLapO7Yx1ePHhE8mL61j9-TH?usp=sharing)|
467 | |Stability vs number of nearest neighbors - colab|[](https://colab.research.google.com/drive/17VgVRMFBWvkSIH1yA3tMl6UQ7Eu68K2l?usp=sharing)|
468 | |k-nearest-evolution - colab|[]( https://colab.research.google.com/drive/1DZ-CQPV3WwJSiaV3-rjwPwmXw4RUh8Qj)|
469 |
470 |
471 |
472 | How to cite
473 | =====
474 | If you have used this codebase in a scientific publication and wish to cite it, please use the [Journal of Pattern Recognition article](https://www.sciencedirect.com/science/article/abs/pii/S0031320320303927)
475 |
476 | Mohamed Abbas, Adel El-Zoghabi, Amin Shoukry, *DenMune: Density peak based clustering using mutual nearest neighbors*
477 | In: Journal of Pattern Recognition, Elsevier, volume 109, number 107589.
478 | January 2021
479 |
480 |
481 | ```bib
482 | @article{ABBAS2021107589,
483 | title = {DenMune: Density peak based clustering using mutual nearest neighbors},
484 | journal = {Pattern Recognition},
485 | volume = {109},
486 | pages = {107589},
487 | year = {2021},
488 | issn = {0031-3203},
489 | doi = {https://doi.org/10.1016/j.patcog.2020.107589},
490 | url = {https://www.sciencedirect.com/science/article/pii/S0031320320303927},
491 | author = {Mohamed Abbas and Adel El-Zoghabi and Amin Shoukry},
492 | keywords = {Clustering, Mutual neighbors, Dimensionality reduction, Arbitrary shapes, Pattern recognition, Nearest neighbors, Density peak},
493 | abstract = {Many clustering algorithms fail when clusters are of arbitrary shapes, of varying densities, or the data classes are unbalanced and close to each other, even in two dimensions. A novel clustering algorithm “DenMune” is presented to meet this challenge. It is based on identifying dense regions using mutual nearest neighborhoods of size K, where K is the only parameter required from the user, besides obeying the mutual nearest neighbor consistency principle. The algorithm is stable for a wide range of values of K. Moreover, it is able to automatically detect and remove noise from the clustering process as well as detecting the target clusters. It produces robust results on various low and high dimensional datasets relative to several known state of the art clustering algorithms.}
494 | }
495 | ```
496 |
497 | Licensing
498 | ------------
499 |
500 | The DenMune algorithm is 3-clause BSD licensed. Enjoy.
501 |
502 | [](https://choosealicense.com/licenses/bsd-3-clause/)
503 |
504 |
505 | Task List
506 | ------------
507 |
508 | - [x] Update GitHub with the DenMune source code
509 | - [x] create repo2docker repository
510 | - [x] Create pip Package
511 | - [x] create CoLab shared examples
512 | - [x] create documentation
513 | - [x] create Kaggle shared examples
514 | - [x] PEP8 compliant
515 | - [x] Continuous integration
516 | - [x] scikit-learn compatible
517 | - [X] Unit tests (coverage: 97%)
518 | - [ ] create conda package
519 |
520 |
521 |
522 |
--------------------------------------------------------------------------------
/PyPi Package/src/denmune.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | LICENSE
2 | MANIFEST.in
3 | README.md
4 | pyproject.toml
5 | setup.cfg
6 | setup.py
7 | src/denmune/__init__.py
8 | src/denmune/denmune.py
9 | src/denmune.egg-info/PKG-INFO
10 | src/denmune.egg-info/SOURCES.txt
11 | src/denmune.egg-info/dependency_links.txt
12 | src/denmune.egg-info/requires.txt
13 | src/denmune.egg-info/top_level.txt
--------------------------------------------------------------------------------
/PyPi Package/src/denmune.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/PyPi Package/src/denmune.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.18.5
2 | pandas>=1.0.3
3 | matplotlib>=3.2.1
4 | scikit-learn>=0.22.1
5 | seaborn>=0.10.1
6 | ngt>=1.11.6
7 | anytree>=2.8.0
8 | treelib>=1.6.1
9 |
--------------------------------------------------------------------------------
/PyPi Package/src/denmune.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | denmune
2 |
--------------------------------------------------------------------------------
/PyPi Package/src/denmune/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/PyPi Package/src/denmune/.idea/.name:
--------------------------------------------------------------------------------
1 | denmune.py
--------------------------------------------------------------------------------
/PyPi Package/src/denmune/.idea/denmune.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/PyPi Package/src/denmune/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/PyPi Package/src/denmune/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/PyPi Package/src/denmune/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/PyPi Package/src/denmune/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .denmune import DenMune
--------------------------------------------------------------------------------
/PyPi Package/src/denmune/denmune.py:
--------------------------------------------------------------------------------
1 | # ====================================================================================================================
2 | # About the source code and the associated published paper
3 | # ====================================================================================================================
4 | # This is the source code of DenMune Clustering Algorithm accompanied with the experimental work
5 | # which is published in Elsevier Pattern Recognition, Volume 109, January 2021
6 | # the paper (article number 107589) can be accessed from https://doi.org/10.1016/j.patcog.2020.107589
7 | # source code and several examples on using it, can be accessed from
8 | # GitHub's repository at https://github.com/egy1st/denmune-clustering-algorithm
9 | # Authors: Mohamed Abbas, Adel El-Zoghabi, and Amin Shoukry
10 | # Edition 0.0.2.3 Released 29-12-2021
11 | # PyPi package installation from https://pypi.org/project/denmune/
12 | # ====================================================================================================================
13 |
14 |
15 | # ====================================================================================================================
16 | # About the DenMune Algorithm
17 | # ====================================================================================================================
18 | # DenMune Clustering Algorithm's Highlights
19 | # DenMune is a clustering algorithm that can find clusters of arbitrary size, shapes and densities in two-dimensions.
20 | # Higher dimensions are first reduced to 2-D using the t-sne.
21 | # The algorithm relies on a single parameter K (the number of nearest neighbors).
22 | # The results show the superiority of DenMune.
23 | # =====================================================================================================================
24 |
25 |
26 | # =====================================================================================================================
27 | # About me
28 | # =====================================================================================================================
29 | # Name: Mohamed Ali Abbas
30 | # Egypt - Alexandria - Smouha
31 | # Cell-phone: +20-01007500290
32 | # Personal E-mail: mohamed.alyabbas@outlook.com
33 | # Business E-mail: 01@zerobytes.one
34 | # website: https://zerobytes.one
35 | # LinkedIn: https://www.linkedin.com/in/mohabbas/
36 | # Github: https://github.com/egy1st
37 | # Kaggle: https://www.kaggle.com/egyfirst
38 | # Udemy: https://www.udemy.com/user/mohammad-ali-abbas/
39 | # Facebook: https://www.facebook.com/ZeroBytes.One
40 | # =====================================================================================================================
41 |
42 | import operator
43 | import os.path
44 | import time
45 |
46 | import matplotlib.pyplot as plt
47 | import ngtpy
48 | import numpy as np
49 | import pandas as pd
50 | import seaborn as sns
51 | from anytree import Node
52 | from numpy import genfromtxt
53 | from sklearn.manifold import TSNE
54 | from treelib import Tree as tr
55 |
56 | sns.set_context('poster')
57 | sns.set_color_codes()
58 | plot_kwds = {'alpha': 0.99, 's': 80, 'linewidths': 0}
59 |
60 |
61 | # import for possible needs
62 | # from sklearn.metrics import confusion_matrix
63 | # from sklearn import metrics
64 | # import sklearn.cluster as cluster
65 |
66 |
class DataPoint():
    """Mutable per-sample record filled in by the DenMune passes.

    Every instance starts unclustered (``class_id == 0``), unvisited and
    with empty neighbour lists; the DenMune methods populate the fields.
    """

    def __init__(self, id):
        # Identity and labelling state.
        self.point_id = id
        self.class_id = 0  # 0 not clustered but -1 means a noise
        self.visited = False
        self.homogeneity = 0
        # Three independent neighbour lists, each starting empty.
        self.refer_to, self.referred_by, self.reference = [], [], []
77 |
78 |
79 | class DenMune():
80 |
81 | def __init__(self,
82 | train_data=None, test_data=None,
83 | train_truth=None, test_truth=None,
84 | file_2d=None, k_nearest=0,
85 | rgn_tsne=False, prop_step=0,
86 | ):
87 |
88 | if train_data is None:
89 | raise Exception("No data is provided. At least train data should be provided. Set train_data argmunt properly.")
90 | else:
91 | self.data_indicator = 1
92 |
93 | if train_truth is not None:
94 | self.data_indicator += 2
95 |
96 | if test_data is not None:
97 | self.data_indicator += 4
98 |
99 | if test_truth is not None:
100 | self.data_indicator += 8
101 |
102 | if train_data is not None and train_truth is None and test_truth is not None:
103 | raise Exception("you should provide labels for your traing data to be allowed to work with test data. Set train_truth argmunt properly.")
104 | if train_data is not None and train_truth is None and test_data is not None :
105 | raise Exception("you should provide labels for your traing data to be allowed to work with test data. Set train_truth argmunt properly.")
106 | if train_data is not None and train_truth is not None and test_truth is not None and test_data is None:
107 | raise Exception("Although labels of testing data is provided, the test data itself isnot. Set test_data argument properly.")
108 |
109 | self.analyzer = {}
110 | self.analyzer['n_points'] = {}
111 | if isinstance(train_data, pd.DataFrame):
112 | train_data = train_data.to_numpy()
113 | train_data = train_data.copy(order='C')
114 | if isinstance(test_data, pd.DataFrame):
115 | test_data = test_data.to_numpy()
116 | test_data = test_data.copy(order='C')
117 | if isinstance(train_truth, pd.Series):
118 | train_truth = train_truth.to_numpy()
119 | train_truth = train_truth.copy(order='C')
120 | if isinstance(test_truth, pd.Series):
121 | test_truth = test_truth.to_numpy()
122 | test_truth = test_truth.copy(order='C')
123 |
124 | self.train_sz = len(train_data)
125 |
126 | if test_data is not None:
127 | data = np.append(train_data, test_data, axis=0)
128 | self.test_sz = len(test_data)
129 | else:
130 | self.test_sz = 0
131 | data = train_data
132 |
133 | if test_truth is not None:
134 | self.labels_truth = np.append(train_truth, test_truth, axis=0)
135 | else:
136 | self.labels_truth = train_truth
137 |
138 | self.analyzer["n_points"]["size"] = len(
139 | data) # data.shape[0] # this will changed in preplot when we plot only train or test data
140 |
141 | self.analyzer['exec_time'] = {}
142 | self.analyzer["exec_time"]["t_SNE"] = 0
143 | self.analyzer['n_points']["noise"] = {}
144 | self.analyzer["n_points"]["noise"]["type-1"] = 0
145 | self.analyzer["n_points"]["noise"]["type-2"] = 0
146 | self.analyzer['n_points']["weak"] = {}
147 | self.analyzer["n_points"]["weak"]["all"] = 0
148 | self.analyzer["n_points"]["weak"]["succeeded to merge"] = 0
149 | self.analyzer["n_points"]["weak"]["failed to merge"] = 0
150 | self.analyzer["n_points"]["dim"] = data.shape[1]
151 | self.analyzer["n_clusters"] = {}
152 | self.analyzer["n_clusters"]["actual"] = 0
153 | self.analyzer["n_clusters"]["detected"] = 0
154 | # self.delimiter = delimiter
155 | self.debuger = {}
156 |
157 | if k_nearest == 0:
158 | raise Exception("k-nearest neighbor should be at least 1")
159 |
160 | if file_2d is None:
161 | file_2d = '_temp_2d'
162 |
163 | if data.shape[1] != 2 and file_2d == '_temp_2d':
164 | # raise Exception("Sorry, this is N-D dataset, file-2d parameter should not be empty")
165 | start = time.time()
166 | self.generate_tsne(data, 2, file_2d='_temp_2d')
167 | end = time.time()
168 | self.analyzer["exec_time"]["t_SNE"] = end - start
169 | data = genfromtxt(file_2d, delimiter=',')
170 | elif data.shape[1] != 2 and file_2d != '_temp_2d':
171 | if not os.path.isfile(file_2d) or rgn_tsne == True:
172 | start = time.time()
173 | self.generate_tsne(data, 2, file_2d)
174 | end = time.time()
175 | self.analyzer["exec_time"]["t_SNE"] = end - start
176 | data = genfromtxt(file_2d, delimiter=',')
177 |
178 | start_time = time.time()
179 |
180 | self.alg_name = 'denmune'
181 | self.prop_step = prop_step
182 | self.data = data
183 | self.train_data = train_data
184 | self.test_data = test_data
185 | self.dp_count = self.data.shape[0]
186 | self.dp_dim = self.data.shape[1]
187 | self.k_nearest = k_nearest
188 | self.dp_dis = []
189 | self.train_truth = train_truth
190 | self.test_truth = test_truth
191 |
192 | self.DataPoints = []
193 | self.ClassPoints = {}
194 | self.KernelPoints = []
195 |
196 | self.init_DataPoints()
197 | self.kd_NGT()
198 | self.load_DataPoints() # load_DataPoints must come after kd_NGT()
199 | self.compute_Links()
200 | # self.semi_init_DataPoints #it is useful with csharp and CNune only
201 | self.find_Noise()
202 | self.sort_DataPoints()
203 | self.prepare_Clusters()
204 | self.attach_Points()
205 |
206 | end_time = time.time()
207 | self.analyzer["exec_time"]["DenMune"] = end_time - start_time
208 |
209 | return None # __init__ should return None
210 |
def kd_NGT(self):
    """Fill ``self.dp_dis`` with the k nearest neighbours of every point.

    Nothing happens when ``self.dp_dis`` is already populated. Otherwise an
    ngtpy index named ``tmp`` is created, saved, and queried once per point;
    the query time is stored under ``analyzer["exec_time"]["NGT"]``.
    """
    if len(self.dp_dis) != 0:
        return

    ngtpy.create(b"tmp", self.dp_dim)
    index = ngtpy.Index(b"tmp")
    index.batch_insert(self.data)
    index.save()

    k = self.k_nearest
    started = time.time()
    self.dp_dis = []
    # Ask for k + 1 hits and drop the first one: it is the query point itself.
    self.dp_dis.extend(
        index.search(self.data[row], k + 1)[1:] for row in range(self.dp_count)
    )
    self.analyzer["exec_time"]["NGT"] = time.time() - started
230 |
def getValue(self, dic, what, who, other=False):
    """Look up the maximum entry of ``dic`` by key or by value.

    Parameters
    ----------
    dic : dict
        Non-empty mapping to inspect.
    what : str
        Only ``'max'`` is supported.
    who : str
        ``'key'`` ranks entries by key, ``'value'`` ranks them by value.
    other : bool
        When false, return the ranked quantity itself (the max key or the
        max value). When true, return its counterpart (the value stored
        under the max key, or the key holding the max value).

    Returns
    -------
    The selected key or value. Ties go to the first entry in dict order.

    Raises
    ------
    ValueError
        For an unsupported ``what``/``who`` combination, or (from ``max``)
        when ``dic`` is empty.
    """
    if what != 'max' or who not in ('key', 'value'):
        raise ValueError("unsupported getValue request: what=%r, who=%r" % (what, who))

    rank_by = 0 if who == 'key' else 1
    best = max(dic.items(), key=operator.itemgetter(rank_by))
    # `other` flips from the ranked component to its counterpart.
    return best[1 - rank_by] if other else best[rank_by]
245 |
def init_DataPoints(self):
    """Reset the kernel list and create one fresh DataPoint per sample.

    Each DataPoint already starts with its default field values, so no
    further per-point initialisation is done here. Always returns 0.
    """
    self.KernelPoints = []
    self.DataPoints = [DataPoint(idx) for idx in range(self.dp_count)]
    return 0
264 |
265 | """
266 | this function is useful with csharp and CNune only
267 |
268 | def semi_init_DataPoints(self):
269 |
270 | for dp in self.DataPoints:
271 | dp.visited = False
272 | dp.class_id = 0
273 | dp.homogeneity = 0
274 | return 0
275 | """
276 |
def find_Noise(self):
    """Label points with an empty ``reference`` list as noise (-1).

    Two root nodes are registered in ``self.ClassPoints``: key ``-1`` for
    noise and key ``0`` for not-yet-clustered points. Each point's
    ``class_id`` is set to ``-1`` or ``0`` accordingly, and a placeholder
    entry keyed by the point index is stored as well.

    The placeholder for index 0 is deliberately skipped: key ``0`` is the
    "unclustered" class node, and overwriting it when point 0 is noise
    would make every unclustered point resolve to the noise root.

    Always returns 0.
    """
    noise_node = Node(-1, parent=None)
    unclustered_node = Node(0, parent=None)
    self.ClassPoints[-1] = noise_node
    self.ClassPoints[0] = unclustered_node

    for i in range(self.dp_count):
        dp = self.DataPoints[i]
        if len(dp.reference) == 0:
            dp.class_id = -1  # noise
            placeholder = noise_node
        else:
            dp.class_id = 0  # non-clustered; this is already the initial value
            placeholder = unclustered_node

        if i != 0:  # never clobber the class-0 node itself
            self.ClassPoints[i] = placeholder

    return 0
292 |
def sort_DataPoints(self):
    """Collect the points that have references and rank them by homogeneity.

    Appends ``[point_id, homogeneity]`` to ``self.KernelPoints`` for every
    point whose ``reference`` list is non-empty, then sorts the list by
    homogeneity in descending order. Always returns 0.
    """
    self.KernelPoints.extend(
        [dp.point_id, dp.homogeneity]
        for dp in self.DataPoints
        if len(dp.reference) != 0
    )
    self.KernelPoints = self.sort_Tuple(self.KernelPoints, reverse=True)
    return 0
302 |
def compute_Links(self):
    """Derive mutual-neighbour links, strong/weak counts and homogeneity.

    For every point:

    * ``reference`` receives each ``refer_to`` entry whose point id also
      appears in ``referred_by``;
    * ``referred_by`` and ``reference`` are sorted by distance, ascending;
    * the point is counted as strong when at least ``k_nearest`` points
      refer to it, otherwise as weak;
    * ``homogeneity`` becomes ``100 * len(referred_by) + len(reference)``.

    Counters are written to ``analyzer["n_points"]``. Always returns 0.
    """
    self.analyzer["n_points"]["strong"] = 0

    for i in range(self.dp_count):
        dp = self.DataPoints[i]

        # Set membership replaces the nested scan over referred_by.
        referrer_ids = {entry[0] for entry in dp.referred_by}
        dp.reference.extend(entry for entry in dp.refer_to if entry[0] in referrer_ids)

        dp.referred_by = self.sort_Tuple(dp.referred_by, reverse=False)
        if len(dp.referred_by) >= self.k_nearest:
            self.analyzer["n_points"]["strong"] += 1
        else:
            self.analyzer["n_points"]["weak"]["all"] += 1

        dp.reference = self.sort_Tuple(dp.reference, reverse=False)
        dp.homogeneity = (100 * len(dp.referred_by)) + len(dp.reference)

    return 0
329 |
def sort_Tuple(self, li, reverse=False):
    """Sort ``li`` in place by the second element of each item and return it.

    ``reverse=False`` gives ascending order, ``reverse=True`` descending.
    """
    li.sort(key=operator.itemgetter(1), reverse=reverse)
    return li
337 |
def load_DataPoints(self):
    """Turn the neighbour search results into per-point link lists.

    The data points are first reset through ``init_DataPoints``. Then, for
    each neighbour hit of point ``i`` in ``self.dp_dis``, the pair
    ``[neighbour, distance]`` is appended to ``i``'s ``refer_to`` and
    ``[i, distance]`` to the neighbour's ``referred_by``. Distances are
    rounded to six decimals. Always returns 0.
    """
    # initialize datapoints to their default values
    self.init_DataPoints()

    points = self.DataPoints
    for source in range(self.dp_count):
        for hit in self.dp_dis[source]:
            distance = round(hit[1], 6)
            target = hit[0]
            points[source].refer_to.append([target, distance])
            points[target].referred_by.append([source, distance])

    return 0
358 |
def prepare_Clusters(self):
    """Seed clusters from the ranked kernel points and merge touching ones.

    Kernel points are visited in ``self.KernelPoints`` order. A kernel that
    has references and is referred to at least as often as it refers opens
    a new class. Every already-classified reference that satisfies the same
    referred-by/refer-to test votes for the root of its class, and each
    voted root is re-parented under the new class node. Afterwards every
    point's ``class_id`` is replaced by the root name of its class.

    When ``self.prop_step`` is non-zero, kernels from iteration
    ``prop_step`` onwards are skipped and remaining class-0 points are
    relabelled ``-2``. Always returns 0.
    """
    points = self.DataPoints
    new_class = 0

    for step, kernel in enumerate(self.KernelPoints, start=1):
        if self.prop_step and self.prop_step <= step:
            continue

        core = points[kernel[0]]

        # all points with at least one reference are considered (ignore noises)
        if not (len(core.reference) > 0 and len(core.referred_by) >= len(core.refer_to)):
            continue

        new_class += 1
        core.visited = True
        core.class_id = new_class
        self.ClassPoints[new_class] = Node(new_class, parent=None)
        votes = {}

        for ref in core.reference:
            neighbour = points[ref[0]]
            if neighbour.class_id > 0 and len(neighbour.referred_by) >= len(neighbour.refer_to):
                # some points may be visited but not classified yet, hence the class_id test
                root_name = self.ClassPoints[neighbour.class_id].root.name
                votes[root_name] = votes.get(root_name, 0) + 1
            elif neighbour.visited == False:
                neighbour.visited = True  # this point is visited but not classified yet

        # Hang every voted root under the freshly created class node.
        while len(votes) > 0:
            winner = self.getValue(dic=votes, what='max', who='value', other=True)
            if winner != -1 and winner != new_class:
                self.ClassPoints[winner].parent = self.ClassPoints[new_class]
            del votes[winner]

    # Collapse every class id onto the root of its class tree.
    for i in range(self.dp_count):
        dp = points[i]
        dp.class_id = self.ClassPoints[dp.class_id].root.name

    if self.prop_step:
        # let us update class 0 to be -2
        for dp in points:
            if dp.class_id == 0:
                dp.class_id = -2

    return 0
427 |
def attach_Points(self):
    """Attach still-unclustered kernel points, first strongly and then weakly.

    Each phase sweeps ``self.KernelPoints`` repeatedly, handing every point
    whose ``class_id`` is 0 to the phase's attach method. A phase ends when
    two consecutive sweeps handle the same number of points. Points that
    are still class 0 after both phases are relabelled ``-2``.
    """

    def sweep_until_stable(attach):
        previous, handled = -1, 0
        while handled != previous:
            previous, handled = handled, 0
            for kernel in self.KernelPoints:
                point = self.DataPoints[kernel[0]]
                if point.class_id == 0:
                    point.class_id = attach(kernel[0])
                    handled += 1

    sweep_until_stable(self.attach_StrongPoint)
    sweep_until_stable(self.attach_WeakPoint)

    # let us update class 0 to be -2
    for dp in self.DataPoints:
        if dp.class_id == 0:
            dp.class_id = -2
459 |
def attach_StrongPoint(self, point_id):
    """Pick a class for ``point_id`` from its visited, well-referred references.

    A reference votes only if it was visited and is referred to at least as
    often as it refers. Its ``class_id`` is first replaced by the root name
    of its class, and that root gets one vote. The root with the most votes
    is returned; ties go to the smallest root name.

    Returns 0 when nobody votes, so the point gets another chance in the
    weak-attachment phase.
    """
    core = self.DataPoints[point_id]
    if len(core.reference) != 0:
        core.visited = True

    votes = {}
    for ref in core.reference:
        neighbour = self.DataPoints[ref[0]]
        if neighbour.visited == True and len(neighbour.referred_by) >= len(neighbour.refer_to):
            root_name = self.ClassPoints[neighbour.class_id].root.name
            neighbour.class_id = root_name
            votes[root_name] = votes.get(root_name, 0) + 1

    if len(votes) == 0:
        return 0

    # Sorting by root name makes the tie-break deterministic.
    ranked = dict(sorted(votes.items()))
    return self.getValue(dic=ranked, what='max', who='value', other=True)
487 |
def attach_WeakPoint(self, point_id):
    """Pick a class for ``point_id`` from any of its visited references.

    Every visited reference has its ``class_id`` replaced by the root name
    of its class and gives that root one vote. The root with the most votes
    is returned; ties go to the smallest root name.

    Returns -1 when nobody votes, which marks the point as noise.
    """
    core = self.DataPoints[point_id]
    if len(core.reference) != 0:
        core.visited = True

    votes = {}
    for ref in core.reference:
        neighbour = self.DataPoints[ref[0]]
        if neighbour.visited == True:
            root_name = self.ClassPoints[neighbour.class_id].root.name
            neighbour.class_id = root_name
            votes[root_name] = votes.get(root_name, 0) + 1

    if len(votes) == 0:
        return -1

    # Sorting by root name makes the tie-break deterministic.
    ranked = dict(sorted(votes.items()))
    return self.getValue(dic=ranked, what='max', who='value', other=True)
515 |
def fit_predict(self,
                validate=True,
                show_plots=True,
                show_noise=True,
                show_analyzer=True
                ):
    """Export the predicted labels, then optionally plot and validate them.

    The predictions (one ``class_id`` per line) are written to
    ``solution.txt`` in the current directory, replacing any existing file.
    ``self.train_pred``, ``self.test_pred`` and ``self.labels_pred`` are set.

    Parameters
    ----------
    validate : bool
        Score the clustering against the ground truth when labels exist.
    show_plots, show_noise : bool
        Forwarded to ``plot_clusters``.
    show_analyzer : bool
        Print progress messages and the analyzer tree.

    Returns
    -------
    tuple
        ``(None, None)`` in propagation mode (``prop_step > 0``); otherwise
        ``(labels, scores)`` where ``labels`` has the keys ``'train'`` and
        ``'test'`` and ``scores`` is ``analyzer['validity']`` when
        validation ran with ground truth, else ``None``.
    """
    solution_file = 'solution.txt'
    if os.path.isfile(solution_file):
        os.remove(solution_file)

    pred_list = [dp.class_id for dp in self.DataPoints]
    with open(solution_file, 'w') as f:
        f.writelines("%s\n" % pred for pred in pred_list)

    self.train_pred = pred_list[:self.train_sz]
    self.test_pred = pred_list[self.train_sz:]
    if self.test_data is not None:
        self.labels_pred = np.append(self.train_pred, self.test_pred, axis=0)
    else:
        self.labels_pred = self.train_pred

    def plot(kind):
        self.plot_clusters(show_plots=show_plots, show_noise=show_noise, data_type=kind)

    if self.prop_step > 0:
        print("Propagation at iteration:", self.prop_step)
        plot('train')
        if show_analyzer:
            self.show_Analyzer()
        return None, None

    has_truth = self.data_indicator >= 3

    if has_truth:
        if show_analyzer:
            print("Plotting dataset Groundtruth")
        plot('ground')

    if validate and self.data_indicator >= 1:
        if has_truth:
            self.analyzer["validity"] = {}
            self.analyzer["validity"]['train'] = {}
            self.validate_Clusters(data_type='train')

        if show_analyzer:
            print('Plotting train data')
        plot('train')
        if show_analyzer:
            self.show_Analyzer(root='Validating train data')

        if self.data_indicator == 15:
            self.validate_Clusters(data_type='test')
            if show_analyzer:
                self.show_Analyzer(self.analyzer['validity']['test'], root='Validating test data')

        if self.data_indicator > 3:
            if show_analyzer:
                print('Plotting test data')
            plot('test')
        # Validation/plotting of the augmented (train & test) data is disabled.

    # Built last: validation may have replaced train_pred / test_pred.
    labels_dic = {}
    labels_dic['train'] = self.train_pred
    labels_dic['test'] = self.test_pred

    if self.data_indicator == 1 or validate == False:
        return labels_dic, None
    if self.data_indicator >= 3 and validate == True:
        return labels_dic, self.analyzer['validity']
606 |
def match_Labels(self):
    """Rename predicted cluster ids to the ground-truth labels they overlap.

    ``self.labels_truth`` is scanned for runs of equal consecutive labels.
    Runs are processed from longest to shortest: the predicted id that is
    most frequent inside the run's window is renamed to the run's true
    label, unless that id is itself a label that was already assigned.

    ``self.labels_pred`` must support ``.count`` on slices (i.e. be a list).

    Returns the relabelled predictions as an int64 array, and also stores
    them in ``self.labels_pred`` / ``self.train_pred`` / ``self.test_pred``.
    """
    labels_true = self.labels_truth
    labels_pred = self.labels_pred

    # Each run is [start, label, count]. `position` is 1-based, which is
    # what the original bookkeeping (and the -1 correction below) relies on.
    runs = []
    run_start, run_label = 0, labels_true[0]
    for position, label in enumerate(labels_true, start=1):
        if label != run_label:
            runs.append([run_start, run_label, position - run_start])
            run_start, run_label = position, label
    runs.append([run_start, run_label, len(labels_true) + 1 - run_start])
    runs[0][2] = runs[0][2] - 1

    runs.sort(key=lambda run: run[2], reverse=True)

    assigned = []
    for start, true_label, count in runs:
        window = labels_pred[start:start + count - 1]
        max_class = max(set(labels_pred), key=window.count)
        if max_class not in assigned:
            labels_pred = [true_label if pred == max_class else pred for pred in labels_pred]
            assigned.append(true_label)

    labels_pred = np.array(labels_pred, dtype=np.int64)
    self.labels_pred = labels_pred
    self.train_pred = labels_pred[:self.train_sz]
    self.test_pred = labels_pred[self.train_sz:]

    return labels_pred
656 |
def validate_Clusters(self, data_type=None):
    """Score the predicted labels against the ground truth.

    Parameters
    ----------
    data_type : {'train', 'test'} or None
        Which slice to score: the first ``train_sz`` labels, the remaining
        labels, or (for any other value) all of them.

    Side effects: ``self.labels_pred`` is converted to a list if it was an
    array; outside propagation mode (``prop_step == 0``) the predictions are
    first aligned with the truth via ``match_Labels``; the number of
    distinct true labels is stored in ``analyzer["n_clusters"]["actual"]``.

    Returns
    -------
    dict
        The scores ``ACC``, ``F1``, ``NMI``, ``AMI``, ``ARI``,
        ``homogeneity`` and ``completeness``; also stored under
        ``analyzer["validity"][data_type]``. ``ACC`` is computed with
        ``normalize=False``.
    """
    truth = self.labels_truth
    if data_type == 'train':
        truth = truth[:self.train_sz]
    elif data_type == 'test':
        truth = truth[self.train_sz:]

    if isinstance(self.labels_pred, np.ndarray):
        self.labels_pred = self.labels_pred.tolist()

    # do not match labels if you are in propagation mode
    predicted = self.match_Labels() if self.prop_step == 0 else self.labels_pred

    if data_type == 'train':
        predicted = predicted[:self.train_sz]
    elif data_type == 'test':
        predicted = predicted[self.train_sz:]

    self.analyzer["n_clusters"]["actual"] = len(np.unique(truth))

    # Score the clustering
    from sklearn.metrics.cluster import adjusted_mutual_info_score
    from sklearn.metrics.cluster import adjusted_rand_score
    from sklearn import metrics  # for homogeneity, completeness, f1, accuracy

    acc = metrics.accuracy_score(truth, predicted, normalize=False)
    nmi = metrics.normalized_mutual_info_score(truth, predicted, average_method='arithmetic')
    ami = adjusted_mutual_info_score(truth, predicted, average_method='arithmetic')
    # homogeneity: each cluster contains only members of a single class.
    homogeneity = metrics.homogeneity_score(truth, predicted)
    # completeness: all members of a given class are assigned to the same cluster.
    completeness = metrics.completeness_score(truth, predicted)
    f1 = metrics.f1_score(truth, predicted, average='weighted')
    ari = adjusted_rand_score(truth, predicted)

    scores = {"ACC": acc,
              "F1": f1,
              "NMI": nmi,
              "AMI": ami,
              "ARI": ari,
              "homogeneity": homogeneity,
              "completeness": completeness
              }
    self.analyzer["validity"][data_type] = scores

    return self.analyzer["validity"][data_type]
741 |
def preplot_Clusters(self, data_type=None):
    """Select the labels to plot, update the analyzer and compact cluster ids.

    Parameters
    ----------
    data_type : {'ground', 'train', 'test'} or None
        ``'ground'`` uses ``self.labels_truth`` (cast to int64); anything
        else uses ``self.labels_pred``, sliced to the train or test part
        when requested.

    The analyzer receives the point counts, the number of ``-1`` (type-1)
    and ``-2`` (type-2) labels, the derived weak-point merge counters and
    the number of detected clusters (distinct non-negative labels).

    Returns
    -------
    The selected labels with non-negative ids renumbered ``0, 1, 2, ...`` in
    ascending order of the original ids; negative ids are left untouched.
    """
    stats = self.analyzer["n_points"]
    stats["size"] = self.dp_count
    if data_type == 'test':
        stats["plot_size"] = self.test_sz
    elif data_type == 'train':
        stats["plot_size"] = self.train_sz

    if data_type == 'ground':
        labels = np.array(self.labels_truth, dtype=np.int64)
    elif data_type == 'train':
        labels = self.labels_pred[:self.train_sz]
    elif data_type == 'test':
        labels = self.labels_pred[self.train_sz:]
    else:
        labels = self.labels_pred

    stats["noise"]["type-1"] = list(labels).count(-1)
    stats["noise"]["type-2"] = list(labels).count(-2)

    stats["weak"]["succeeded to merge"] = stats["weak"]["all"] - stats["noise"]["type-2"]
    stats["weak"]["failed to merge"] = stats["noise"]["type-2"]

    unique_labels = np.unique(labels)
    fake_clusters = 0  # negative labels (noise) are not real clusters
    next_id = 0
    for n in unique_labels:
        if n >= 0:
            labels = np.where(labels == n, next_id, labels)
            next_id += 1
        else:
            fake_clusters += 1

    self.analyzer["n_clusters"]["detected"] = len(unique_labels) - fake_clusters

    return labels
791 |
def plot_clusters(self, data_type=None, show_noise=False, show_plots=True):
    """Refresh the analyzer for ``data_type`` and optionally draw the scatter plot.

    ``preplot_Clusters`` always runs. Drawing happens only when
    ``show_plots`` is true:

    * Propagation mode (``prop_step`` non-zero): points with negative labels
      get black and are dropped; only ``data_type == 'train'`` is drawn, and
      the figure is saved as ``propagation/<prop_step>.png``.
    * Otherwise negative labels are drawn white when ``show_noise`` is
      false, or black (``-1``) / light grey (other negatives) when true,
      over the train, test or ground-truth slice of ``self.data``.

    The colours used are kept in ``self.colors``. Always returns 0.
    """
    labels = self.preplot_Clusters(data_type=data_type)

    if not show_plots:
        return 0

    black = (0.0, 0.0, 0.0)
    # deep, dark, bright, muted, pastel, colorblind
    palette = sns.color_palette('deep', np.unique(labels).max() + 2)

    if self.prop_step:
        colors = [palette[x] if x >= 0 else black for x in labels]

        # Keep only coloured points: outliers (-1, -2) were mapped to black above.
        colors2 = []
        data2 = []
        for v, c in enumerate(colors):
            if (c[0] + c[1] + c[2]) > 0.0:
                colors2.append((c[0], c[1], c[2], 1.0))
                data2.append((self.data[v][0], self.data[v][1]))
        data2 = np.array(data2)

        if data_type == 'train':
            plt.scatter(data2.T[0], data2.T[1], c=colors2, **plot_kwds, marker='o')
    else:
        if show_noise == False:
            # white noise points are effectively invisible
            colors = [palette[x] if x >= 0 else (1.0, 1.0, 1.0) for x in labels]
        else:
            colors = [palette[x] if x >= 0 else (black if x == -1 else (0.9, 0.9, 0.9)) for x in labels]

        if data_type == 'test':
            shown = self.data[self.train_sz:]
        elif data_type == 'train':
            shown = self.data[:self.train_sz]
        elif data_type == 'ground':
            # ground truth spans train + test only when both label sets exist
            shown = self.data if self.data_indicator == 15 else self.data[:self.train_sz]
        else:
            shown = None

        if shown is not None:
            plt.scatter(shown.T[0], shown.T[1], c=colors, **plot_kwds, marker='o')

    self.colors = colors
    frame = plt.gca()
    frame.axes.get_xaxis().set_visible(False)
    frame.axes.get_yaxis().set_visible(False)

    if self.prop_step:
        prop_folder = 'propagation'
        if not os.path.exists(prop_folder):
            os.mkdir(prop_folder)
        plt.savefig(prop_folder + '/' + str(self.prop_step) + '.png')
    plt.show()

    return 0
873 |
def generate_tsne(self, data, d, file_2d):
    """Embed ``data`` into ``d`` dimensions with t-SNE and save it as text.

    Parameters
    ----------
    data : array-like
        Input passed to ``TSNE.fit_transform``.
    d : int
        Number of output components.
    file_2d : str
        Destination path. One line is written per sample: the ``d``
        coordinates rounded to six decimals and separated by commas.

    The embedding uses ``random_state=1971`` and ``init='random'``.
    Always returns 0.
    """
    dim_two = TSNE(n_components=d, random_state=1971, init='random').fit_transform(data)

    # Stream the rows instead of growing one big string, and let the
    # context manager close the file even if a write fails.
    rows = (
        ",".join(str(round(dim_two[i][n], 6)) for n in range(d)) + "\n"
        for i in range(len(dim_two))
    )
    with open(file_2d, "w") as text_file:
        text_file.writelines(rows)

    return 0
891 |
def show_Analyzer(self, mydic=None, root="DenMune"):
    """Print a (possibly nested) statistics dictionary as a tree.

    Parameters
    ----------
    mydic : dict, optional
        Mapping to display; defaults to ``self.analyzer``. Dict values
        become branches, and any other value becomes a leaf rendered as
        ``"<key>: <value rounded to 3 decimals>"``.
    root : str
        Caption of the tree's root node.

    Node identifiers are built from the path to each entry, so the same key
    may appear under several branches (e.g. the same score names under both
    the train and the test validity dicts) without an id collision.
    """
    if mydic is None:
        mydic = self.analyzer

    tree = tr()
    tree.create_node(root, "root")

    def add_entries(entries, parent_id):
        for key, value in entries.items():
            node_id = parent_id + "/" + str(key)  # path-based, hence unique
            if type(value) is dict:
                tree.create_node(key, node_id, parent=parent_id)
                add_entries(value, node_id)
            else:
                tree.create_node(key + ': ' + str(round(value, 3)), node_id, parent=parent_id)

    add_entries(mydic, "root")
    tree.show()
926 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DenMune: A Density-Peak Clustering Algorithm
2 |
3 | DenMune is a clustering algorithm that can find clusters of arbitrary size, shapes, and densities in two-dimensions. Higher dimensions are first reduced to 2D using t-SNE. The algorithm relies on a single parameter K (the number of nearest neighbors). The results show the superiority of the algorithm. Enjoy the simplicity but the power of DenMune.
4 |
5 | ## Listen to this amazing interview podcast
6 |
7 | [](https://on.soundcloud.com/z7WeqJnHjDd26hD76)
8 |
9 | *click image to listen (24 min)*
10 |
11 | ## Reproducibility & Test Drives
12 |
13 | Now you can reproduce all the research experiments, and even share the results and collaborate with the algorithm using our capsule on CodeOcean. Each Capsule is a self-contained computational experiment with computing environment, code, data, version history, and results.
14 |
15 | Also, you may use our repo2docker offered by mybinder.org, which encapsulates the algorithm and all required data in one virtual machine instance. All Jupyter notebook examples found in this repository will also be available to you to practice with in this repo2docker. Thanks mybinder.org, you made it possible!
16 |
17 | | Test-drive | URL |
18 | | ---------------------------------------- | ------------------------------------------------------------ |
19 | | Reproduce our code capsule on Code Ocean | [](https://bit.ly/codeocean-capsule) |
20 | | Use our test-drive on MyBinder | [](https://bit.ly/mybinder-repo2docker) |
21 |
22 | ## Scientific Work
23 |
24 | | Paper & data | Journals | ResearchGate Stats |
25 | | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
26 | | [](https://bit.ly/denmune-research-paper) [](https://bit.ly/mendeley-data) | [](https://www.scimagojr.com/journalsearch.php?q=24823&tip=sid&clean=0) [](https://www.scimagojr.com/journalsearch.php?q=21101060167&tip=sid&clean=0) |  |
27 |
28 | ## Coding, Security & Maintenance
29 |
30 | | Code Style | Installation | CI Workflow | Code Coverage | Code Scanning |
31 | | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
32 | |  | [](https://pypi.org/project/denmune/) | [](https://circleci.com/gh/egy1st/denmune-clustering-algorithm/tree/main) | [](https://codecov.io/gh/egy1st/denmune-clustering-algorithm) | [](https://github.com/adrinjalali/denmune-clustering-algorithm/actions/workflows/codeql.yml) |
33 |
34 | ## Tutorials
35 |
36 | | Reproducible Capsule | Repo2Docker | Colab | Kaggle |
37 | | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
38 | | [](https://bit.ly/codeocean-capsule) | [](https://bit.ly/mybinder-repo2docker) | [](#colab) | [](#kaggle) |
39 |
40 | ## Downloads Stats
41 |
42 | | Download/Week | Download/Month | Total Downloads |
43 | | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
44 | | [](https://pepy.tech/project/denmune) | [](https://pepy.tech/project/denmune) | [](https://pepy.tech/project/denmune) |
45 |
46 | ## Based on the paper
47 |
48 | | Paper |
49 | |-
50 | | Mohamed Abbas, Adel El-Zoghabi, Amin Shoukry,
51 | | *DenMune: Density peak based clustering using mutual nearest neighbors*
52 | | In: Journal of Pattern Recognition, Elsevier,
53 | | volume 109, number 107589, January 2021
54 | | DOI: https://doi.org/10.1016/j.patcog.2020.107589
55 |
56 | ## Documentation:
57 |
58 | - [](https://denmune.readthedocs.io/en/latest/?badge=latest)
59 | - [](https://denmune-docs.vercel.app)
60 |
61 | ## Watch it in action
62 |
63 | These 30 seconds will show you how DenMune, a density-based algorithm, propagates:
64 |
65 | [](https://colab.research.google.com/drive/1o-tP3uvDGjxBOGYkir1lnbr74sZ06e0U?usp=sharing)
66 |
67 | []()
68 |
69 | ## Still interested?
70 |
71 | Watch this ***10-min*** illustrative video on:
72 |
73 | - [](https://player.vimeo.com/video/827209757)
74 | - [](https://www.youtube.com/watch?v=o77raaasuOM)
75 |
76 | ## When less means more
77 |
78 | Most classic clustering algorithms fail to detect complex clusters where clusters are of different sizes, shapes, and densities, and exist in noisy data. Recently, a density-based algorithm named DenMune showed great ability in detecting complex shapes even in noisy data. It can detect the number of clusters automatically, and it automatically detects and removes both pre-identified and post-identified noise.
79 |
80 | It can reach 100% accuracy on some classic pattern problems and achieves 97% on the MNIST dataset. A great advantage of this algorithm is that it is a single-parameter algorithm: all you need is to set the number of k-nearest neighbors, and the algorithm will take care of the rest. Being non-sensitive to changes in k makes it robust and stable.
81 |
82 | Keep in mind that the algorithm initially reduces any N-D dataset to a 2-D dataset, so a nice benefit is that you can always plot your data and explore it, which makes this algorithm a good candidate for data exploration. Finally, the algorithm comes with a neat package for visualizing data, validating it, and analyzing the whole clustering process.
83 |
84 | ## How to install DenMune
85 |
86 | Simply install DenMune clustering algorithm using pip command from the official Python repository
87 |
88 | [](https://pypi.org/project/denmune/)
89 |
90 | From the shell run the command
91 |
92 | ```shell
93 | pip install denmune
94 | ```
95 |
96 | From Jupyter notebook cell run the command
97 |
98 | ```ipython3
99 | !pip install denmune
100 | ```
101 |
102 | ## How to use DenMune
103 |
104 | Once DenMune is installed, you just need to import it
105 |
106 | ```python
107 | from denmune import DenMune
108 | ```
109 |
110 | *Please note that the first denmune (the package) is written in lowercase letters, while the second one (the class itself) has a capital D and M.*
111 |
112 | ## Read data
113 |
114 | There are four possible cases of data:
115 |
116 | - only train data without labels
117 | - only labeled train data
118 | - labeled train data in addition to test data without labels
119 | - labeled train data in addition to labeled test data
120 |
121 | ```python
122 | #=============================================
123 | # First scenario: train data without labels
124 | # ============================================
125 |
126 | data_path = 'datasets/denmune/chameleon/'
127 | dataset = "t7.10k.csv"
128 | data_file = data_path + dataset
129 |
130 | # train data without labels
131 | X_train = pd.read_csv(data_file, sep=',', header=None)
132 |
133 | knn = 39 # k-nearest neighbor, the only parameter required by the algorithm
134 |
135 | dm = DenMune(train_data=X_train, k_nearest=knn)
136 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=True)
137 |
138 | ```
139 |
140 | This is an intuitive dataset which has no groundtruth provided
141 |
142 | 
143 |
144 | ```python
145 | #=============================================
146 | # Second scenario: train data with labels
147 | # ============================================
148 |
149 | data_path = 'datasets/denmune/shapes/'
150 | dataset = "aggregation.csv"
151 | data_file = data_path + dataset
152 |
153 | # train data with labels
154 | X_train = pd.read_csv(data_file, sep=',', header=None)
155 | y_train = X_train.iloc[:, -1]
156 | X_train = X_train.drop(X_train.columns[-1], axis=1)
157 |
158 | knn = 6 # k-nearest neighbor, the only parameter required by the algorithm
159 |
160 | dm = DenMune(train_data=X_train, train_truth= y_train, k_nearest=knn)
161 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=True)
162 | ```
163 |
164 | Dataset groundtruth
165 |
166 | 
167 |
168 | Dataset as detected by DenMune at k=6
169 |
170 | 
171 |
172 |
173 | ```python
174 | #=================================================================
175 | # Third scenario: train data with labels in addition to test data
176 | # ===============================================================
177 |
178 | data_path = 'datasets/denmune/pendigits/'
179 | file_2d = data_path + 'pendigits-2d.csv'
180 |
181 | # train data with labels
182 | X_train = pd.read_csv(data_path + 'train.csv', sep=',', header=None)
183 | y_train = X_train.iloc[:, -1]
184 | X_train = X_train.drop(X_train.columns[-1], axis=1)
185 |
186 | # test data without labels
187 | X_test = pd.read_csv(data_path + 'test.csv', sep=',', header=None)
188 | X_test = X_test.drop(X_test.columns[-1], axis=1)
189 |
190 | knn = 50 # k-nearest neighbor, the only parameter required by the algorithm
191 |
192 | dm = DenMune(train_data=X_train, train_truth= y_train,
193 | test_data= X_test,
194 | k_nearest=knn)
195 | labels, validity = dm.fit_predict(show_analyzer=True, show_noise=True)
196 | ```
197 |
198 | dataset groundtruth
199 |
200 | 
201 |
202 |
203 | dataset as detected by DenMune at k=50
204 |
205 | 
206 |
207 | test data as predicted by DenMune on training the dataset at k=50
208 |
209 | 
210 |
211 |
212 | ## Algorithm's Parameters
213 |
214 | 1. **Parameters used within the initialization of the DenMune class**
215 |
216 | ```python
217 | def __init__ (self,
218 | train_data=None, test_data=None,
219 | train_truth=None, test_truth=None,
220 | file_2d =None, k_nearest=1,
221 | rgn_tsne=False, prop_step=0,
222 | ):
223 | ```
224 |
225 | - train_data:
226 |
227 | - data used for training the algorithm
228 | - default: None. It should be provided by the user; otherwise an error will be raised.
229 |
230 | - train_truth:
231 |
232 | - labels of training data
233 | - default: None
234 |
235 | - test_data:
236 |
237 | - data used for testing the algorithm
238 |
239 | - test_truth:
240 |
241 | - labels of testing data
242 | - default: None
243 |
244 | - k_nearest:
245 |
246 | - number of nearest neighbor
247 | - default: 1. k-nearest neighbor should be at least 1.
248 |
249 | - rgn_tsne:
250 |
251 | - when set to True: It will regenerate the reduced 2-D version of the N-D dataset each time the algorithm run.
252 | - when set to False: It will generate the reduced 2-D version of the N-D dataset the first time only, then reuse the existing saved file
253 | - default: False
254 |
255 | - file_2d: name (including location) of the file used to save/load the reduced 2-D version
256 |
257 | - if empty: the algorithm will create temporary file named '_temp_2d'
258 | - default: None
259 |
260 | - prop_step:
261 |
262 | - size of increment used in showing the clustering propagation.
263 | - leave this parameter set to 0, the default value, unless you are willing intentionally to enter the propagation mode.
264 | - default: 0
265 |
266 |
267 | 2. **Parameters used within the fit_predict function:**
268 |
269 | ```python
270 | def fit_predict(self,
271 | validate=True,
272 | show_plots=True,
273 | show_noise=True,
274 | show_analyzer=True
275 | ):
276 | ```
277 |
278 | - validate:
279 | - validate data on/off according to five measures integrated with DenMune (Accuracy, F1-score, NMI index, AMI index, ARI index)
280 | - default: True
281 |
282 | - show_plots:
283 | - show/hide plotting of data
284 | - default: True
285 |
286 | - show_noise:
287 | - show/hide noise and outlier
288 | - default: True
289 |
290 | - show_analyzer:
291 | - show/hide the analyzer
292 | - default: True
293 |
294 | ## The Analyzer
295 |
296 | The algorithm provides an exploratory tool called the analyzer; once called, it gives you an in-depth analysis of how your clustering results perform.
297 |
298 | 
299 |
300 |
301 | ## Noise Detection
302 |
303 | DenMune detects noise and outliers automatically; no further work is needed on your side.
304 |
305 | - It plots pre-identified noise in black
306 | - It plots post-identified noise in light grey
307 |
308 | You can set show_noise parameter to False.
309 |
310 | ```python
311 | # let us show noise
312 |
313 | dm = DenMune(train_data=X_train, k_nearest=knn)
314 | labels, validity = dm.fit_predict(show_noise=True)
315 | ```
316 |
317 | ```python
318 | # let us show clean data by removing noise
319 |
320 | dm = DenMune(train_data=X_train, k_nearest=knn)
321 | labels, validity = dm.fit_predict(show_noise=False)
322 | ```
323 |
324 | | noisy data | clean data |
325 | | ------------------------------------------------------------ | ------------------------------------------------------------ |
326 | |  |  |
327 |
328 |
329 |
330 | ## Validation
331 |
332 | You can get your validation results using 3 methods
333 |
334 | - by showing the Analyzer
335 | - extract values from the validity returned list from fit_predict function
336 | - extract values from the Analyzer dictionary
337 | - There are five validity measures built-in the algorithm, which are:
338 | - ACC, Accuracy
339 | - F1 score
340 | - NMI index (Normalized Mutual Information)
341 | - AMI index (Adjusted Mutual Information)
342 | - ARI index (Adjusted Rand Index)
343 |
344 | 
345 |
346 | ## K-nearest Evolution
347 |
348 | The following chart shows the evolution of pre and post identified noise in correspondence to increase of number of knn. Also, detected number of clusters is analyzed in the same chart in relation with both types of identified noise.
349 |
350 | 
351 |
352 | ## The Scalability
353 |
354 | | Data Size | Time |
355 | | ----------------- | ---------------------- |
356 | | data size: 5000 | time: 2.3139 seconds |
357 | | data size: 10000 | time: 5.8752 seconds |
358 | | data size: 15000 | time: 12.4535 seconds |
359 | | data size: 20000 | time: 18.8466 seconds |
360 | | data size: 25000 | time: 28.992 seconds |
361 | | data size: 30000 | time: 39.3166 seconds |
362 | | data size: 35000 | time: 39.4842 seconds |
363 | | data size: 40000 | time: 63.7649 seconds |
364 | | data size: 45000 | time: 73.6828 seconds |
365 | | data size: 50000 | time: 86.9194 seconds |
366 | | data size: 55000 | time: 90.1077 seconds |
367 | | data size: 60000 | time: 125.0228 seconds |
368 | | data size: 65000 | time: 149.1858 seconds |
369 | | data size: 70000 | time: 177.4184 seconds |
370 | | data size: 75000 | time: 204.0712 seconds |
371 | | data size: 80000 | time: 220.502 seconds |
372 | | data size: 85000 | time: 251.7625 seconds |
373 | | data size: 100000 | time: 257.563 seconds |
374 |
375 | |
376 |
377 |
378 | ## The Stability
379 |
380 | The algorithm has only a single parameter, k, and moreover it is not sensitive to changes in that parameter. You can see that for yourself in the following chart. This is of great benefit for you as a data exploration analyst: you can simply explore the dataset using an arbitrary k. Being non-sensitive to changes in k makes the algorithm robust and stable.
381 |
382 | 
383 |
384 | ## Reveal the propagation
385 |
386 | One of the top performing features in this algorithm is enabling you to watch how your clusters propagate to construct the final output clusters. Just use the parameter 'prop_step' as in the following example:
387 |
388 | ```python
389 | dataset = "t7.10k" #
390 | data_path = 'datasets/denmune/chameleon/'
391 |
392 | # train file
393 | data_file = data_path + dataset +'.csv'
394 | X_train = pd.read_csv(data_file, sep=',', header=None)
395 |
396 |
397 | from itertools import chain
398 |
399 | # DenMune's Parameters
400 | knn = 39 # number of k-nearest neighbor, the only parameter required by the algorithm
401 |
402 | # create a list of different snapshots of the propagation
403 | snapshots = chain(range(2,5), range(5,50,10), range(50, 100, 25), range(100,500,100), range(500,2000, 250), range(1000,5500, 500))
404 |
405 | from IPython.display import clear_output
406 | for snapshot in snapshots:
407 | print ("iteration", snapshot )
408 | clear_output(wait=True)
409 | dm = DenMune(train_data=X_train, k_nearest=knn, rgn_tsne=False, prop_step=snapshot)
410 | labels, validity = dm.fit_predict(show_analyzer=False, show_noise=False)
411 | ```
412 |
413 | ## Interact with the algorithm
414 |
415 | [](https://colab.research.google.com/drive/1EUROd6TRwxW3A_XD3KTxL8miL2ias4Ue?usp=sharing)
416 |
417 | *click image to interact*
418 |
419 |
420 | This notebook allows you to interact with the algorithm in many aspects:
421 |
422 | - you can choose which dataset to cluster (among 4 chameleon datasets)
423 | - you can decide which number of k-nearest neighbor to use
424 | - show noise on/off; thus you can investigate noise detected by the algorithm
425 | - show analyzer on/off
426 |
427 |
428 | ## We love Jupyter Notebooks
429 |
430 | If you need to test the examples one by one, here are two other options:
431 |
432 | - Use colab offered by google research to test each example individually.
433 | - If you are a kaggler like me, then Kaggle, the best workspace where data scientists meet, should fit you to test the algorithm with great experience.
434 |
435 |
436 | Here is a list of Google CoLab & Kaggle notebooks to practice the use of the algorithm interactively.
437 |
438 |
439 | | Dataset | CoLab Notebook | Kaggle Notebook |
440 | | ---------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
441 | | How to use it? | [](https://bit.ly/colab-how-to-use) | [](https://bit.ly/kaggle-how-to-use) |
442 | | Chameleon datasets | [](https://bit.ly/colab-chameleon) | [](https://bit.ly/kaggle-chameleon) |
443 | | 2D Shape datasets | [](https://bit.ly/colab-2d-shapes) | [](https://bit.ly/kaggle-2d-shapes) |
444 | | Clustering unlabeled data | [](https://bit.ly/colab-unlabeled-data) | [](https://bit.ly/kaggle-chameleon) |
445 | | iris dataset | [](https://bit.ly/colab-iris-dataset) | [](https://bit.ly/kaggle-iris-dataset) |
446 | | MNIST dataset | [](https://bit.ly/colab-mnist-dataset) | [](https://bit.ly/kaggle-score-97-mnist) |
447 | | Scoring 97% on MNIST dataset | [](https://bit.ly/colab-score-97-mnist) | [](https://bit.ly/kaggle-score-97-mnist) |
448 | | Noise detection | [](https://bit.ly/colab-noise-detection) | [](https://bit.ly/kaggle-noise-detection) |
449 | | Validation | [](https://bit.ly/colab-how-to-validate) | [](https://bit.ly/kaggle-how-to-validate) |
450 | | How does it propagate? | [](https://bit.ly/colab-how-propagate) | [](https://bit.ly/kaggle-how-propagate) [](https://bit.ly/kaggle-how-propagate-2) |
451 | | Snapshots of propagation | [](https://bit.ly/colab-propagation-shots) | [](https://bit.ly/kaggle-propagation-shots) |
452 | | Scalability | [](https://bit.ly/colab-scalability) | [](https://bit.ly/kaggle-scalability) |
453 | | Stability | [](https://bit.ly/colab-stability) | [](https://bit.ly/kaggle-stability) |
454 | | k-nearest-evolution | [](https://bit.ly/colab-knn-evolution) | [](https://bit.ly/kaggle-knn-evolution) |
455 |
456 |
457 |
458 | ## Software Impact
459 |
460 | Discover robust clustering without density cutoffs using this open-source Python library pyMune, implementing the parameter-free DenMune algorithm. PyMune identifies and expands cluster cores while removing noise. Fully scikit-learn compatible. pyMune (DenMune implementation) is a cutting-edge tool incorporating advanced techniques, robust performance, and effective propagation strategies. This positions it as the current state-of-the-art in its field, contributing to its high adoption and impact.
461 |
462 | - After extensive research and rigorous validation, we are proud to release pyMune as an open-source tool on GitHub and PyPi for the benefit of the scientific community.
463 | - With over 230,000 downloads already, pyMune has demonstrated its real-world impact and usefulness. We integrated it with [](https://bit.ly/codeocean-capsule) and [](https://bit.ly/mybinder-repo2docker) to further enhance reproducibility and reuse - encapsulating code, data, and outputs for turnkey sharing.
464 | - It is part of a special issue of R-badged articles, https://www.sciencedirect.com/journal/software-impacts/special-issue/10XXN6LQ0J1
465 | - It is part of scikit-learn-contrib, https://github.com/scikit-learn-contrib
466 |
467 | 
468 |
469 | ### Warning: Plagiarized Works
470 |
471 | It has come to our attention that the following papers have plagiarized significant portions of the DenMune algorithm and research work:
472 |
473 | 1. **Paper 1:** "DEDIC: Density Estimation Clustering Method Using Directly Interconnected Cores" published in IEEE Access, doi: 10.1109/ACCESS.2022.3229582 Authors: Yisen Lin, Xinlun Zhang, Lei Liu, and Huichen Qu, reported at https://pubpeer.com/publications/AFC4E173A4FC0A2AD7E70DE688DDA5
474 | 2. **Paper 2:** "Research on stress curve clustering algorithm of Fiber Bragg grating sensor" published in Nature Scientific Reports, doi: 10.1038/s41598-023-39058-w Authors: Yisen Lin, Ye Wang, Huichen Qu & Yiwen Xiong, reported at https://pubpeer.com/publications/7AEF7D0F7505A8B8C130D142522741
475 |
476 | We have conducted a thorough analysis and found extensive evidence of plagiarism in these papers, including:
477 |
478 | - Verbatim copying of the core algorithm logic and steps from DenMune, with only superficial naming and implementation differences intended to obfuscate the similarity.
479 | - Plagiarized background, related work, and technical details from the original DenMune paper, with minor paraphrasing and without proper attribution.
480 | - Copying of mathematical formulations, concepts, and point classifications from DenMune.
481 | - Reuse of experimental setup, datasets, and compared algorithms from DenMune without justification or acknowledgment.
482 | - Fabricated experimental results, with values directly copied from DenMune's results and falsely claimed as their own.
483 | - Lack of substantive analysis or discussion, further indicating that the experiments were likely not conducted.
484 |
485 | Despite our efforts to address these concerns through proper channels, the publishers have decided to allow these plagiarized papers to remain published with only a correction acknowledging the issues, rather than retracting them or mandating a comprehensive correction.
486 |
487 | We strongly condemn such academic misconduct and the potential enabling of plagiarism by reputable publishers. Researchers and practitioners should exercise caution when referring to or using the methods described in these plagiarized works.
488 |
489 | For the original, properly cited implementation of the DenMune clustering algorithm, please refer to the official repository and resources provided here.
490 |
491 | We remain committed to upholding academic integrity and ethical research practices, and we urge the scientific community to take a firm stance against plagiarism and misconduct in scholarly publications.
492 |
493 |
494 |
495 | ## How to cite
496 |
497 | - How to cite ***The paper***
498 |
499 | If you have used this codebase in a scientific publication and wish to cite it, please use the [Journal of Pattern Recognition article](https://www.sciencedirect.com/science/article/abs/pii/S0031320320303927):
500 |
501 | ```
502 | Mohamed Abbas, Adel El-Zoghabi, Amin Shoukry, *DenMune: Density peak-based clustering using mutual nearest neighbors*
503 | In: Journal of Pattern Recognition, Elsevier, volume 109, number 107589.
504 | January 2021
505 | ```
506 |
507 | ```bib
508 | @article{ABBAS2021107589,
509 | title = {DenMune: Density peak-based clustering using mutual nearest neighbors},
510 | journal = {Pattern Recognition},
511 | volume = {109},
512 | pages = {107589},
513 | year = {2021},
514 | issn = {0031-3203},
515 | doi = {https://doi.org/10.1016/j.patcog.2020.107589},
516 | url = {https://www.sciencedirect.com/science/article/pii/S0031320320303927},
517 | author = {Mohamed Abbas and Adel El-Zoghabi and Amin Shoukry},
518 | keywords = {Clustering, Mutual neighbors, Dimensionality reduction, Arbitrary shapes, Pattern recognition, Nearest neighbors, Density peak},
519 | abstract = {Many clustering algorithms fail when clusters are of arbitrary shapes, of varying densities, or the data classes are unbalanced and close to each other, even in two dimensions. A novel clustering algorithm, “DenMune” is presented to meet this challenge. It is based on identifying dense regions using mutual nearest neighborhoods of size K, where K is the only parameter required from the user, besides obeying the mutual nearest neighbor consistency principle. The algorithm is stable for a wide range of values of K. Moreover, it is able to automatically detect and remove noise from the clustering process as well as detect the target clusters. It produces robust results on various low and high-dimensional datasets relative to several known state-of-the-art clustering algorithms.}
520 | }
521 | ```
522 |
523 |
524 |
525 |
526 |
527 | - How to cite ***The Software***
528 | If you have used this codebase in a scientific publication and wish to cite it, please use the [Journal of Software Impacts article](https://www.sciencedirect.com/science/article/pii/S266596382300101X):
529 |
530 | ```
531 | Abbas, M. A., El-Zoghabi, A., & Shoukry, A. (2023). PyMune: A Python package for complex clusters detection. Software Impacts, 17, 100564. https://doi.org/10.1016/j.simpa.2023.100564
532 | ```
533 |
534 | ```bib
535 | @article{ABBAS2023100564,
536 | title = {pyMune: A Python package for complex clusters detection},
537 | journal = {Software Impacts},
538 | volume = {17},
539 | pages = {100564},
540 | year = {2023},
541 | issn = {2665-9638},
542 | doi = {https://doi.org/10.1016/j.simpa.2023.100564},
543 | url = {https://www.sciencedirect.com/science/article/pii/S266596382300101X},
544 | author = {Mohamed Ali Abbas and Adel El-Zoghabi and Amin Shoukry},
545 | keywords = {Machine learning, Pattern recognition, Dimensionality reduction, Mutual nearest neighbors, Nearest neighbors approximation, DenMune},
546 | abstract = {We introduce pyMune, an open-source Python library for robust clustering of complex real-world datasets without density cutoff parameters. It implements DenMune (Abbas et al., 2021), a mutual nearest neighbor algorithm that uses dimensionality reduction and approximate nearest neighbor search to identify and expand cluster cores. Noise is removed with a mutual nearest-neighbor voting system. In addition to clustering, pyMune provides classification, visualization, and validation functionalities. It is fully compatible with scikit-learn and has been accepted into the scikit-learn-contrib repository. The code, documentation, and demos are available on GitHub, PyPi, and CodeOcean for easy use and reproducibility.}
547 | }
548 | ```
549 |
550 | ## Licensing
551 |
552 | The DenMune algorithm is 3-clause BSD licensed. Enjoy.
553 |
554 | [](https://choosealicense.com/licenses/bsd-3-clause/)
555 |
556 | ## Task List
557 |
558 | - [x] Update Github with the DenMune source code
559 | - [x] create repo2docker repository
560 | - [x] Create pip Package
561 | - [x] create CoLab shared examples
562 | - [x] create documentation
563 | - [x] create Kaggle shared examples
564 | - [x] PEP8 compliant
565 | - [x] Continuous integration
566 | - [x] scikit-learn compatible
567 | - [x] creating unit tests (coverage: 100%)
568 | - [x] generating API documentation
569 | - [x] Create a reproducible capsule on code ocean
570 | - [x] Submitting pyMune to Software Impacts (Published August 5, 2023)
571 | - [ ] create conda package (*postponed until NGT has conda installation*)
572 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | codecov:
2 | require_ci_to_pass: yes
3 |
4 | coverage:
5 | precision: 2
6 | round: down
7 | range: "70...100"
8 |
9 | status:
10 | project:
11 | default: false # disable the default status that measures entire project
12 | tests: # declare a new status context "tests"
13 | target: 100% # we always want 100% coverage here
14 | paths: "tests/" # only include coverage in "tests/" folder
15 | jupyter: # declare a new status context "jupyter"
16 | paths: "!tests/" # remove all files in "tests/"
17 |
18 | if_ci_failed: error #success, failure, error, ignore
19 | informational: true
20 |
21 | parsers:
22 | gcov:
23 | branch_detection:
24 | conditional: yes
25 | loop: yes
26 | method: no
27 | macro: no
28 |
29 | comment:
30 | layout: "reach,diff,flags,files,footer"
31 | behavior: default
32 | require_changes: no
33 |
34 |
--------------------------------------------------------------------------------
/colab/iris_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "view-in-github",
7 | "colab_type": "text"
8 | },
9 | "source": [
10 | " "
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {
17 | "id": "zaaLaJHT35Fd"
18 | },
19 | "outputs": [],
20 | "source": [
21 | "import pandas as pd\n",
22 | "import numpy as np\n",
23 | "import time\n",
24 | "import os.path\n",
25 | "\n",
26 | "import warnings\n",
27 | "warnings.filterwarnings('ignore')"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {
34 | "scrolled": true,
35 | "id": "69XXeoif35Fn"
36 | },
37 | "outputs": [],
38 | "source": [
39 | "# install DenMune clustering algorithm using pip command from the official Python repository, PyPi\n",
40 | "# from https://pypi.org/project/denmune/\n",
41 | "!pip install denmune\n",
42 | "\n",
43 | "# then import it\n",
44 | "from denmune import DenMune"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {
51 | "id": "H3H8DYwU35Fo",
52 | "colab": {
53 | "base_uri": "https://localhost:8080/"
54 | },
55 | "outputId": "fee68095-9fd7-456d-f288-9140ceef8ea0"
56 | },
57 | "outputs": [
58 | {
59 | "output_type": "stream",
60 | "name": "stdout",
61 | "text": [
62 | "Cloning into 'datasets'...\n",
63 | "remote: Enumerating objects: 57, done.\u001b[K\n",
64 | "remote: Counting objects: 100% (57/57), done.\u001b[K\n",
65 | "remote: Compressing objects: 100% (46/46), done.\u001b[K\n",
66 | "remote: Total 57 (delta 9), reused 54 (delta 9), pack-reused 0\u001b[K\n",
67 | "Unpacking objects: 100% (57/57), done.\n"
68 | ]
69 | }
70 | ],
71 | "source": [
72 | "# clone datasets from our repository datasets\n",
73 | "if not os.path.exists('datasets'):\n",
74 | " !git clone https://github.com/egy1st/datasets"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {
81 | "colab": {
82 | "base_uri": "https://localhost:8080/",
83 | "height": 1000
84 | },
85 | "id": "xm54UWO835Fq",
86 | "outputId": "195b4bb6-b755-467c-82c2-c099f2a9445e"
87 | },
88 | "outputs": [
89 | {
90 | "output_type": "stream",
91 | "name": "stdout",
92 | "text": [
93 | "Plotting dataset Groundtruth\n"
94 | ]
95 | },
96 | {
97 | "output_type": "display_data",
98 | "data": {
99 | "image/png": "\n",
100 | "text/plain": [
101 | ""
102 | ]
103 | },
104 | "metadata": {
105 | "needs_background": "light"
106 | }
107 | },
108 | {
109 | "output_type": "stream",
110 | "name": "stdout",
111 | "text": [
112 | "Plotting train data\n"
113 | ]
114 | },
115 | {
116 | "output_type": "display_data",
117 | "data": {
118 | "image/png": "\n",
119 | "text/plain": [
120 | ""
121 | ]
122 | },
123 | "metadata": {
124 | "needs_background": "light"
125 | }
126 | },
127 | {
128 | "output_type": "stream",
129 | "name": "stdout",
130 | "text": [
131 | "Validating train data\n",
132 | "├── exec_time\n",
133 | "│ ├── DenMune: 0.019\n",
134 | "│ ├── NGT: 0.002\n",
135 | "│ └── t_SNE: 0.85\n",
136 | "├── n_clusters\n",
137 | "│ ├── actual: 3\n",
138 | "│ └── detected: 3\n",
139 | "├── n_points\n",
140 | "│ ├── dim: 4\n",
141 | "│ ├── noise\n",
142 | "│ │ ├── type-1: 0\n",
143 | "│ │ └── type-2: 0\n",
144 | "│ ├── plot_size: 150\n",
145 | "│ ├── size: 150\n",
146 | "│ ├── strong: 84\n",
147 | "│ └── weak\n",
148 | "│ ├── all: 66\n",
149 | "│ ├── failed to merge: 0\n",
150 | "│ └── succeeded to merge: 66\n",
151 | "└── validity\n",
152 | " └── train\n",
153 | " ├── ACC: 135\n",
154 | " ├── AMI: 0.795\n",
155 | " ├── ARI: 0.746\n",
156 | " ├── F1: 0.898\n",
157 | " ├── NMI: 0.798\n",
158 | " ├── completeness: 0.809\n",
159 | " └── homogeneity: 0.787\n",
160 | "\n"
161 | ]
162 | }
163 | ],
164 | "source": [
165 | "data_path = 'datasets/denmune/uci/' \n",
166 | "dataset='iris' \n",
167 | "data_file = data_path + dataset + '.csv'\n",
168 | "\n",
169 | "X_train = pd.read_csv(data_file, sep=',', header=None)\n",
170 | "y_train = X_train.iloc[:, -1]\n",
171 | "X_train = X_train.drop(X_train.columns[-1], axis=1) \n",
172 | "\n",
173 | "knn = 11 # k-nearest neighbor, the only parameter required by the algorithm\n",
174 | "dm = DenMune(train_data=X_train,\n",
175 | " train_truth=y_train,\n",
176 | " k_nearest=knn,\n",
177 | " rgn_tsne=False)\n",
178 | "\n",
179 | "labels, validity = dm.fit_predict(show_noise=True, show_analyzer=True)\n"
180 | ]
181 | }
182 | ],
183 | "metadata": {
184 | "kernelspec": {
185 | "display_name": "Python 3",
186 | "language": "python",
187 | "name": "python3"
188 | },
189 | "language_info": {
190 | "codemirror_mode": {
191 | "name": "ipython",
192 | "version": 3
193 | },
194 | "file_extension": ".py",
195 | "mimetype": "text/x-python",
196 | "name": "python",
197 | "nbconvert_exporter": "python",
198 | "pygments_lexer": "ipython3",
199 | "version": "3.7.3"
200 | },
201 | "colab": {
202 | "name": "iris_dataset.ipynb",
203 | "provenance": [],
204 | "collapsed_sections": [],
205 | "include_colab_link": true
206 | }
207 | },
208 | "nbformat": 4,
209 | "nbformat_minor": 0
210 | }
--------------------------------------------------------------------------------
/images/denmune-illustration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/denmune-clustering-algorithm/a023e9283d7ea11af2d3e6dadae1c54e3b90528c/images/denmune-illustration.png
--------------------------------------------------------------------------------
/images/denmune_propagation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scikit-learn-contrib/denmune-clustering-algorithm/a023e9283d7ea11af2d3e6dadae1c54e3b90528c/images/denmune_propagation.png
--------------------------------------------------------------------------------
/kaggle/the-beauty-of-clusters-propagation.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"source":" ","metadata":{},"cell_type":"markdown","outputs":[],"execution_count":0},{"cell_type":"markdown","id":"e7d9b26d","metadata":{"papermill":{"duration":0.004085,"end_time":"2022-01-28T16:59:53.754281","exception":false,"start_time":"2022-01-28T16:59:53.750196","status":"completed"},"tags":[]},"source":["##### Have you ever wondered how a cluster propagates? It is time to reveal the beauty of cluster propagation. It is as simple as\n","- running the next cell,\n","- wait,\n","- watch and\n","- ENJOY."]},{"cell_type":"code","execution_count":1,"id":"7d1fe6fe","metadata":{"execution":{"iopub.execute_input":"2022-01-28T16:59:53.765377Z","iopub.status.busy":"2022-01-28T16:59:53.764201Z","iopub.status.idle":"2022-01-28T17:03:36.326661Z","shell.execute_reply":"2022-01-28T17:03:36.325883Z","shell.execute_reply.started":"2022-01-24T22:08:58.752738Z"},"id":"FZgP6jwmzFtZ","papermill":{"duration":222.569057,"end_time":"2022-01-28T17:03:36.326871","exception":false,"start_time":"2022-01-28T16:59:53.757814","status":"completed"},"tags":[]},"outputs":[{"data":{"image/png":"\n","text/plain":[""]},"metadata":{"needs_background":"light"},"output_type":"display_data"}],"source":["import pandas as pd\n","import time\n","import os.path\n","import warnings\n","warnings.filterwarnings('ignore')\n","\n","# install DenMune clustering algorithm using pip from https://pypi.org/project/denmune/\n","!pip install denmune\n","# now import it\n","from denmune import DenMune\n","\n","#let us create data folder to hold our data\n","if not os.path.exists('data'):\n"," os.makedirs('data')\n","data_path = 'data/' \n","\n","# download datasets and extract them to our data folder\n","if not os.path.exists(\"chameleon-data.zip\"):\n"," !wget https://data.zerobytes.one/clustering/chameleon-data.zip\n"," !unzip -o chameleon-data.zip -d data \n","\n","#@title { run: \"auto\", vertical-output: true, form-width: \"50%\" }\n","chameleon_dataset = \"t7.10k.dat\" #@param [\"t4.8k.dat\", 
\"t5.8k.dat\", \"t7.10k.dat\", \"t8.8k.dat\"]\n","show_noize_checkbox = True #@param {type:\"boolean\"}\n","data_path = 'data/' \n","\n","# train file\n","data_file = data_path + chameleon_dataset\n","X_train = pd.read_csv(data_file, sep=',', header=None)\n","\n","\n","# Denmune's Paramaters\n","verpose_mode = True # view in-depth analysis of time complexity and outlier detection, num of clusters\n","show_plot = True # show plots on/off\n","show_noise = True # show noise and outlier on/off\n","\n","knn = 39\n","from IPython.display import clear_output\n","for x in range (250, 5500, 250 ):\n"," print (\"itration\", x )\n"," clear_output(wait=True)\n"," dm = DenMune(train_data=X_train, k_nearest=knn, rgn_tsne=False, prop_step=x)\n"," labels, validity = dm.fit_predict(show_analyzer=False, show_noise=show_noize_checkbox)\n"," #time.sleep(0.2)\n"," \n"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.12"},"papermill":{"default_parameters":{},"duration":234.174038,"end_time":"2022-01-28T17:03:37.445211","environment_variables":{},"exception":null,"input_path":"__notebook__.ipynb","output_path":"__notebook__.ipynb","parameters":{},"start_time":"2022-01-28T16:59:43.271173","version":"2.3.3"}},"nbformat":4,"nbformat_minor":5}
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | ngt>=2.0.4
2 | numpy>=1.23.5
3 | pandas>=1.5.3
4 | matplotlib>=3.7.2
5 | scikit-learn>=1.2.2
6 | seaborn>=0.12.2
7 | anytree>=2.8
8 | treelib>=1.6.1
9 | pytest>=6.2.5
10 | coverage>=6.3.1
11 | treon
12 | testbook
13 | notebook
14 |
15 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | from .denmune import DenMune
2 |
3 | __all__ = ["DenMune"]
4 |
--------------------------------------------------------------------------------
/src/tests/test_denmune.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from itertools import chain
3 | import pandas as pd
4 | import pytest
5 | from sklearn.datasets import make_blobs
6 | from sklearn.datasets import load_iris
7 | from src.denmune import DenMune
8 |
9 |
# Shared synthetic fixture: 1000 points drawn around two centres, (-1, -1)
# and (1, 1). The seed is fixed (random_state=0) and shuffling is disabled
# so every test in this module sees the same samples.
X_cc, y_cc = make_blobs(
    centers=np.array([[-1, -1], [1, 1]]),
    n_samples=1000,
    cluster_std=0.5,
    shuffle=False,
    random_state=0,
)

# Value passed as k_nearest by the tests that do not pick their own.
knn = 10
20 |
def test_DenMune_results():
    """Cluster the two-blob fixture and compare predictions with the truth.

    The blobs are not perfectly separable, so the bar is 0.90 rather than 1.
    Either a match rate or a mismatch rate above 0.90 is accepted; presumably
    this tolerates the two cluster ids coming out swapped.
    """
    model = DenMune(train_data=X_cc, train_truth=y_cc, k_nearest=knn)
    _labels, _validity = model.fit_predict(show_analyzer=False)

    match_rate = np.mean(model.labels_pred == y_cc)
    assert match_rate > 0.90 or (1 - match_rate) > 0.90
27 |
28 |
@pytest.mark.parametrize("train_data", [None, X_cc[:800]])
@pytest.mark.parametrize("train_truth", [None, y_cc[:800]])
@pytest.mark.parametrize("test_data", [None, X_cc[800:]])
@pytest.mark.parametrize("test_truth", [None, y_cc[800:]])
@pytest.mark.parametrize("validate", [True, False])
@pytest.mark.parametrize("show_plots", [True, False])
@pytest.mark.parametrize("show_noise", [True, False])
@pytest.mark.parametrize("show_analyzer", [True, False])
@pytest.mark.parametrize("prop_step", [0, 600])
def test_parameters(train_data, train_truth, test_data, test_truth, validate, show_plots, show_noise, show_analyzer, prop_step):
    """Run DenMune over the cross product of its data and display options.

    The nine two-valued parametrizations above generate 2**9 = 512 cases.
    Cases whose data arguments form one of the combinations that
    test_exceptions expects DenMune to reject return early (and therefore
    pass trivially); 128 cases actually fit the model and reach the
    assertion.
    """
    if train_data is None:
        return

    # train_data is present from here on; filter the remaining invalid mixes.
    test_truth_without_train_truth = train_truth is None and test_truth is not None
    test_data_without_train_truth = test_data is not None and train_truth is None
    test_truth_without_test_data = (
        train_truth is not None and test_truth is not None and test_data is None
    )
    if (
        test_truth_without_train_truth
        or test_data_without_train_truth
        or test_truth_without_test_data
    ):
        return

    model = DenMune(
        train_data=train_data,
        train_truth=train_truth,
        test_data=test_data,
        test_truth=test_truth,
        k_nearest=10,
        prop_step=prop_step,
    )
    labels, _validity = model.fit_predict(
        validate=validate,
        show_plots=show_plots,
        show_noise=show_noise,
        show_analyzer=show_analyzer,
    )

    # The data is not perfectly separable, so the bar is 0.70 rather than 1.
    # NOTE(review): `labels` is compared with the full 1000-element y_cc even
    # though only the first 800 rows are used for training -- confirm that
    # this comparison is element-wise and not a scalar False.
    match_rate = np.mean(labels == y_cc)
    assert match_rate > 0.70 or (1 - match_rate) > 0.70
50 |
51 |
def test_DenMune_propagation():
    """Fit DenMune once per propagation snapshot, from step 0 up to 1000.

    The step values get coarser as they grow: 0, 2-4, 5-45 by 5, 50-90 by 10,
    100-450 by 50, then 500-1000 by 100.
    """
    steps = [
        0,
        *range(2, 5),
        *range(5, 50, 5),
        *range(50, 100, 10),
        *range(100, 500, 50),
        *range(500, 1100, 100),
    ]
    for step in steps:
        model = DenMune(train_data=X_cc, k_nearest=knn, prop_step=step)
        _labels, _validity = model.fit_predict(show_analyzer=False, show_plots=False)

    # Reaching the final step (1000) means no earlier snapshot raised.
    assert step == 1000
59 |
# Iris feature matrix and class targets, shared by the tests below.
# return_X_y=True hands back both arrays from a single load_iris() call
# instead of loading the dataset once per field.
X_iris, y_iris = load_iris(return_X_y=True)
63 |
# t-SNE reduction is exercised on the Iris data, which has 4 dimensions.
@pytest.mark.parametrize("file_2d", [None, 'iris_2d.csv'])
@pytest.mark.parametrize("rgn_tsne", [True, False])
def test_t_SNE(rgn_tsne, file_2d):
    """Check that DenMune ends up holding 2-column data for Iris.

    Runs for every combination of `rgn_tsne` and `file_2d`.
    NOTE(review): presumably `rgn_tsne` forces the projection to be
    regenerated and `file_2d` names a file for the 2-D projection -- confirm
    against DenMune itself.
    """
    model = DenMune(
        train_data=X_iris,
        train_truth=y_iris,
        k_nearest=knn,
        rgn_tsne=rgn_tsne,
        file_2d=file_2d,
    )
    _labels, _validity = model.fit_predict(show_analyzer=False, show_plots=False)

    n_columns = model.data.shape[1]
    assert n_columns == 2  # reduced to 2-D
73 |
def test_knn():
    """Smoke-test DenMune on Iris for k_nearest = 5, 10, ..., 50.

    Nothing is asserted about the clustering itself; the test passes as long
    as every fit completes without raising.
    """
    neighbor_counts = range(5, 55, 5)
    for k in neighbor_counts:
        model = DenMune(train_data=X_iris, train_truth=y_iris, k_nearest=k, rgn_tsne=False)
        _labels, _validity = model.fit_predict(show_analyzer=False, show_plots=False)
    # A final `assert (k == 50)` was left disabled by the original author.
79 |
80 |
# Labelled shapes dataset (aggr_rand.csv), fetched from a pinned commit of
# the datasets repository. This runs at import time, so merely collecting
# this module requires network access.
data_file = 'https://raw.githubusercontent.com/egy1st/datasets/dd90854f92cb5ef73b4146606c1c158c32e69b94/denmune/shapes/aggr_rand.csv'
data = pd.read_csv(data_file, header=None, sep=',')

# The last column holds the ground-truth label; the rest are features.
labels = data[data.columns[-1]]
data = data.iloc[:, :-1]

# First 555 rows are used for training, the remainder for testing.
train_data, test_data = data[:555], data[555:]
train_labels, test_labels = labels[:555], labels[555:]
89 |
# Inputs may arrive as pandas objects rather than NumPy arrays.
def test_dataframe():
    """Fit DenMune on DataFrame/Series train and test splits with validation on."""
    n_neighbors = 11  # k-nearest neighbor, the only parameter required by the algorithm
    model = DenMune(
        train_data=train_data,
        train_truth=train_labels,
        test_data=test_data,
        test_truth=test_labels,
        k_nearest=n_neighbors,
        rgn_tsne=True,
    )
    returned_labels, _validity = model.fit_predict(validate=True, show_noise=True, show_analyzer=True)

    # NOTE(review): the comparison below is against the value returned by
    # fit_predict, not against the module-level ground-truth `labels` (which
    # the original local variable shadowed). Confirm that this is intended;
    # as written it may not measure accuracy at all.
    match_rate = np.mean(model.labels_pred == returned_labels)
    assert match_rate > 0.97 or (1 - match_rate) > 0.97
96 |
97 |
def test_exceptions():
    """Check that DenMune's constructor raises for invalid argument sets.

    Each entry below is one set of keyword arguments that must make
    `DenMune(...)` raise; fit_predict is never called.
    """
    rejected_calls = (
        # train data is None
        dict(train_data=None, k_nearest=10),
        # test truth given, but no train truth
        dict(train_data=train_data, test_truth=test_labels, k_nearest=10),
        # test data given, but no train truth
        dict(train_data=train_data, test_data=test_data, k_nearest=10),
        # train truth and test truth given, but test data is None
        dict(
            train_data=train_data,
            train_truth=train_labels,
            test_truth=test_labels,
            test_data=None,
            k_nearest=10,
        ),
        # k-nearest neighbor should be at least 1 (per the original author's
        # note, the default k_nearest is 1, which is valid)
        dict(train_data=train_data, train_truth=train_labels, k_nearest=0),
    )

    for kwargs in rejected_calls:
        with pytest.raises(Exception):
            DenMune(**kwargs)
124 |
--------------------------------------------------------------------------------