├── .idea
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── asdas.iml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── dataset
│   ├── utils
│   │   ├── __init__.py
│   │   └── functions.py
│   └── main.py
├── examples
│   ├── comparisons.py
│   └── runtime.py
├── gradio-demos
│   └── demo-synthetic-datasets.py
├── images
│   ├── algorithm.pdf
│   ├── algorithm.png
│   ├── blobs.png
│   ├── clustered_blobs.png
│   ├── tarek-clustered.png
│   └── tarek-unclustered.png
├── segmentation
│   ├── unet.py
│   └── utils.py
├── src
│   └── visual_clustering
│       ├── VisualClustering.py
│       └── __init__.py
├── LICENSE
├── README.md
├── dataset.py
├── pyproject.toml
└── setup.cfg
/dataset/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/images/blobs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tareknaous/visual-clustering/HEAD/images/blobs.png
--------------------------------------------------------------------------------
/images/algorithm.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tareknaous/visual-clustering/HEAD/images/algorithm.pdf
--------------------------------------------------------------------------------
/images/algorithm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tareknaous/visual-clustering/HEAD/images/algorithm.png
--------------------------------------------------------------------------------
/images/clustered_blobs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tareknaous/visual-clustering/HEAD/images/clustered_blobs.png
--------------------------------------------------------------------------------
/images/tarek-clustered.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tareknaous/visual-clustering/HEAD/images/tarek-clustered.png
--------------------------------------------------------------------------------
/images/tarek-unclustered.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tareknaous/visual-clustering/HEAD/images/tarek-unclustered.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=42",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/asdas.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = visual-clustering
3 | version = 1.0.0
4 | author = Tarek Naous
5 | author_email = tareknaous@gmail.com
6 | description = An implementation of the Visual Clustering algorithm
7 | long_description = file: README.md
8 | long_description_content_type = text/markdown
9 | url = https://github.com/tareknaous/visual-clustering
10 | project_urls =
11 | Bug Tracker = https://github.com/tareknaous/visual-clustering
12 | classifiers =
13 | Programming Language :: Python :: 3
14 | License :: OSI Approved :: MIT License
15 | Operating System :: OS Independent
16 |
17 | [options]
18 | package_dir =
19 | = src
20 | packages = find:
21 | python_requires = >=3.6
22 |
23 | [options.packages.find]
24 | where = src
--------------------------------------------------------------------------------
/dataset/main.py:
--------------------------------------------------------------------------------
1 | from utils.functions import create_polygons
2 | from utils.functions import find_intersections
3 | from utils.functions import return_unique_polygons
4 | from utils.functions import plot_new_polygons
5 | from utils.functions import create_mask
6 |
7 |
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | from sklearn import datasets
11 | import alphashape
12 | from scipy.spatial import ConvexHull, convex_hull_plot_2d
13 | from shapely.ops import unary_union
14 | from shapely.geometry import Polygon
15 |
16 |
17 | # Test function
18 | NUM_SAMPLES = 1500
19 | NUM_CLUSTERS = 11
20 | test = create_polygons(type='blobs',
21 | num_samples=NUM_SAMPLES,
22 | num_clusters=NUM_CLUSTERS,
23 | random_state=19,
24 | keep_points=True)
25 |
26 | intersections = find_intersections(test)
27 | dictionary = return_unique_polygons(intersections)
28 | plt.savefig('cluster.png')
29 | polygons = plot_new_polygons(dictionary, test)
30 | plt.figure()
31 | create_mask(polygons)
32 | plt.savefig('annotation.png')
--------------------------------------------------------------------------------
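Note: main.py produces a single (plot, annotation) pair per run. A minimal sketch of looping it to build a small training set for the 256x256 U-Net; the `pairs/` output directory and the range of random seeds are assumptions for illustration, not part of the repo:

```python
# Sketch: batch-generate (plot, mask) training pairs with the helpers above.
# Assumes the dataset/utils package is importable and pairs/ already exists.
import matplotlib.pyplot as plt
from utils.functions import (create_polygons, find_intersections,
                             return_unique_polygons, plot_new_polygons,
                             create_mask)

for seed in range(5):  # hypothetical number of training pairs
    polygons = create_polygons(type='blobs', num_samples=1500,
                               num_clusters=11, random_state=seed,
                               keep_points=True)  # plots the raw points
    plt.savefig(f'pairs/input_{seed}.png')        # network input
    unique = return_unique_polygons(find_intersections(polygons))
    masks = plot_new_polygons(unique, polygons)
    plt.figure()
    create_mask(masks)                            # filled black polygons
    plt.savefig(f'pairs/annotation_{seed}.png')   # segmentation target
    plt.close('all')
```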
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2018 The Python Packaging Authority
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn import datasets
4 | from scipy.spatial import ConvexHull
5 |
6 | # Generate a blobs dataset with 4 clusters
7 | n_samples = 1500
8 | blobs = datasets.make_blobs(n_samples=n_samples, centers=4, random_state=3)
9 | # plt.scatter(blobs[0][:, 0], blobs[0][:, 1])
10 | # plt.show()
11 |
12 | # Group the points by their cluster label
13 | clusters = [[] for _ in range(4)]
14 | for point, label in zip(blobs[0], blobs[1]):
15 |     clusters[label].append(point)
16 |
17 | # Draw the convex hull of each cluster
18 | for cluster_points in clusters:
19 |     points = np.array(cluster_points)
20 |     hull = ConvexHull(points)
21 |     for simplex in hull.simplices:
22 |         plt.plot(points[simplex, 0], points[simplex, 1], 'k-')
23 |
24 | plt.show()
--------------------------------------------------------------------------------
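The hulls drawn above map directly onto the shapely `Polygon` objects that dataset/utils/functions.py builds in `create_polygons`; a small sketch of the conversion (`hull_polygon` is a hypothetical helper, not defined in the repo):

```python
# Sketch: convert a cluster's convex hull into a shapely Polygon,
# the representation used by dataset/utils/functions.py.
import numpy as np
from scipy.spatial import ConvexHull
from shapely.geometry import Polygon

def hull_polygon(cluster_points):
    points = np.array(cluster_points)
    hull = ConvexHull(points)
    # hull.vertices lists the indices of the hull's corner points, in order
    return Polygon(points[hull.vertices])

# e.g.: polygons = [hull_polygon(c) for c in clusters]
```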
/gradio-demos/demo-synthetic-datasets.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from itertools import cycle, islice
5 | from sklearn import datasets
6 | from scipy import ndimage
7 |
8 | # create_input_image, predict_sample, and get_instances must be in scope before
9 | # launching the demo; the repo defines variants of them in segmentation/utils.py
10 | # and src/visual_clustering/__init__.py.
11 |
12 | def visual_clustering(cluster_type, num_clusters, num_samples, random_state, median_kernel_size, max_kernel_size):
13 |
14 |     NUM_CLUSTERS = num_clusters
15 |     CLUSTER_STD = 4 * np.ones(NUM_CLUSTERS)
16 |
17 |     if cluster_type == "blobs":
18 |         data = datasets.make_blobs(n_samples=num_samples, centers=NUM_CLUSTERS, random_state=random_state, center_box=(0, 256), cluster_std=CLUSTER_STD)
19 |
20 |     elif cluster_type == "varied blobs":
21 |         cluster_std = 1.5 * np.ones(NUM_CLUSTERS)
22 |         data = datasets.make_blobs(n_samples=num_samples, centers=NUM_CLUSTERS, cluster_std=cluster_std, random_state=random_state)
23 |
24 |     elif cluster_type == "aniso":
25 |         X, y = datasets.make_blobs(n_samples=num_samples, centers=NUM_CLUSTERS, random_state=random_state, center_box=(-30, 30))
26 |         transformation = [[0.8, -0.6], [-0.4, 0.8]]
27 |         X_aniso = np.dot(X, transformation)
28 |         data = (X_aniso, y)
29 |
30 |     elif cluster_type == "noisy moons":
31 |         data = datasets.make_moons(n_samples=num_samples, noise=.05)
32 |
33 |     elif cluster_type == "noisy circles":
34 |         data = datasets.make_circles(n_samples=num_samples, factor=.01, noise=.05)
35 |
36 |     # Rescale both axes to [0, 256], the coordinate range of the 256x256 input image
37 |     max_x = max(data[0][:, 0])
38 |     min_x = min(data[0][:, 0])
39 |     new_max = 256
40 |     new_min = 0
41 |
42 |     data[0][:, 0] = (((data[0][:, 0] - min_x) * (new_max - new_min)) / (max_x - min_x)) + new_min
43 |
44 |     max_y = max(data[0][:, 1])
45 |     min_y = min(data[0][:, 1])
46 |     new_max_y = 256
47 |     new_min_y = 0
48 |
49 |     data[0][:, 1] = (((data[0][:, 1] - min_y) * (new_max_y - new_min_y)) / (max_y - min_y)) + new_min_y
50 |
51 |     fig1 = plt.figure()
52 |     plt.scatter(data[0][:, 0], data[0][:, 1], s=1, c='black')
53 |     plt.close()
54 |
55 |     input = create_input_image(data[0])
56 |     filtered = ndimage.median_filter(input, size=median_kernel_size)
57 |     result = predict_sample(filtered)
58 |     y_km = get_instances(result, data[0], max_filter_size=max_kernel_size)
59 |
60 |     colors = np.array(list(islice(cycle(["#000000", '#377eb8', '#ff7f00', '#4daf4a',
61 |                                          '#f781bf', '#a65628', '#984ea3',
62 |                                          '#999999', '#e41a1c', '#dede00', '#491010']),
63 |                                   int(max(y_km) + 1))))
64 |     # add black color for outliers (if any)
65 |     colors = np.append(colors, ["#000000"])
66 |
67 |     fig2 = plt.figure()
68 |     plt.scatter(data[0][:, 0], data[0][:, 1], s=10, color=colors[y_km.astype('int8')])
69 |     plt.close()
70 |
71 |     return fig1, fig2
72 |
73 | iface = gr.Interface(
74 |     fn=visual_clustering,
75 |     inputs=[
76 |         gr.inputs.Dropdown(["blobs", "varied blobs", "aniso", "noisy moons", "noisy circles"]),
77 |         gr.inputs.Slider(1, 10, step=1, label='Number of Clusters'),
78 |         gr.inputs.Slider(10000, 1000000, step=10000, label='Number of Samples'),
79 |         gr.inputs.Slider(1, 100, step=1, label='Random State'),
80 |         gr.inputs.Slider(1, 100, step=1, label='Denoising Filter Kernel Size'),
81 |         gr.inputs.Slider(1, 100, step=1, label='Max Filter Kernel Size')
82 |     ],
83 |     outputs=[
84 |         gr.outputs.Image(type='plot', label='Dataset'),
85 |         gr.outputs.Image(type='plot', label='Clustering Result')
86 |     ]
87 | )
88 | iface.launch(share=True)
--------------------------------------------------------------------------------
/segmentation/unet.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import skimage.io as io
4 | import skimage.transform as trans
5 |
6 | from keras.models import *
7 | from keras.layers import *
8 | from keras.optimizers import *
9 | from keras.callbacks import ModelCheckpoint, LearningRateScheduler
10 | from keras import backend as keras
11 |
12 |
13 | inputs = Input((256,256,1))
14 | conv1 = Conv2D(64, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(inputs)
15 | conv1 = Conv2D(64, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv1)
16 | pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
17 | conv2 = Conv2D(128, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(pool1)
18 | conv2 = Conv2D(128, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv2)
19 | pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
20 | conv3 = Conv2D(256, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(pool2)
21 | conv3 = Conv2D(256, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv3)
22 | pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)
23 | conv4 = Conv2D(512, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(pool3)
24 | conv4 = Conv2D(512, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv4)
25 | drop4 = Dropout(0.5)(conv4)
26 | pool4 = MaxPooling2D(pool_size=(2, 2))(drop4)
27 |
28 | conv5 = Conv2D(1024, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(pool4)
29 | conv5 = Conv2D(1024, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv5)
30 | drop5 = Dropout(0.5)(conv5)
31 |
32 | up6 = Conv2D(512, 2, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(drop5))
33 | merge6 = concatenate([drop4,up6], axis = 3)
34 | conv6 = Conv2D(512, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(merge6)
35 | conv6 = Conv2D(512, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv6)
36 |
37 | up7 = Conv2D(256, 2, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(conv6))
38 | merge7 = concatenate([conv3,up7], axis = 3)
39 | conv7 = Conv2D(256, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(merge7)
40 | conv7 = Conv2D(256, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv7)
41 |
42 | up8 = Conv2D(128, 2, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(conv7))
43 | merge8 = concatenate([conv2,up8], axis = 3)
44 | conv8 = Conv2D(128, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(merge8)
45 | conv8 = Conv2D(128, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv8)
46 |
47 | up9 = Conv2D(64, 2, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(conv8))
48 | merge9 = concatenate([conv1,up9], axis = 3)
49 | conv9 = Conv2D(64, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(merge9)
50 | conv9 = Conv2D(64, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv9)
51 | conv9 = Conv2D(2, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv9)
52 | conv10 = Conv2D(1, 1, activation = 'sigmoid')(conv9)
53 |
54 | model = Model(inputs, conv10)
55 |
56 | model.compile(optimizer = Adam(lr = 1e-4), loss = 'binary_crossentropy', metrics = ['accuracy'])
57 |
58 |
59 |
--------------------------------------------------------------------------------
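unet.py defines and compiles the network but contains no training loop. A minimal sketch of fitting it on plot/mask pairs; the placeholder arrays below stand in for real data (such as the pairs produced by dataset/main.py) and are an assumption, not repo code:

```python
# Sketch: fit the compiled U-Net on (plot, mask) pairs.
# x_train holds 256x256x1 plot images, y_train the matching binary masks.
import numpy as np

x_train = np.random.rand(8, 256, 256, 1)                             # placeholder inputs
y_train = (np.random.rand(8, 256, 256, 1) > 0.5).astype('float32')  # placeholder masks

model.fit(x_train, y_train, batch_size=2, epochs=1, validation_split=0.25)
```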
/README.md:
--------------------------------------------------------------------------------
1 | # Visual Clustering
2 |
3 | Clustering is a popular approach to detect patterns in unlabeled data. Existing clustering methods typically treat samples in a dataset as points in a metric space and compute distances to group together similar points. **Visual Clustering** is a different way of clustering points in 2-dimensional space, inspired by how humans *"visually"* cluster data. The algorithm is based on trained neural networks that perform instance segmentation on plotted data.
4 |
5 |
6 | For more details, see the accompanying paper: ["Clustering Plotted Data by Image Segmentation"](https://openaccess.thecvf.com/content/CVPR2022/html/Naous_Clustering_Plotted_Data_by_Image_Segmentation_CVPR_2022_paper.html), **2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)**. If you use this work, please cite it with the entry below.
7 |
8 | ```
9 | @inproceedings{naous2021clustering,
10 |   title={Clustering Plotted Data by Image Segmentation},
11 |   author={Naous, Tarek and Sarkar, Srinjay and Abid, Abubakar and Zou, James},
12 |   booktitle={2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
13 |   year={2022}
14 | }
15 | ```
16 |
17 |
18 |
19 | ## Installation
20 |
21 | ```bash
22 | pip install visual-clustering
23 | ```
24 |
25 | ## Usage
26 |
27 | The algorithm can be used the same way as the classical clustering algorithms in scikit-learn: \
28 | You first import the class ```VisualClustering``` and create an instance of it.
29 |
30 | ```python
31 | from visual_clustering import VisualClustering
32 |
33 | model = VisualClustering(median_filter_size = 1, max_filter_size= 1)
34 | ```
35 | The parameters ```median_filter_size``` and ```max_filter_size``` are set to 1 by default. \
36 | You can experiment with different values to see *what works best for your dataset*!
37 |
38 |
39 | Let's create a simple synthetic dataset of blobs.
40 | ```python
41 | from sklearn import datasets
42 | import matplotlib.pyplot as plt
43 | data = datasets.make_blobs(n_samples=50000, centers=6, random_state=23, center_box=(-30, 30))
44 | plt.scatter(data[0][:, 0], data[0][:, 1], s=1, c='black')
45 | ```
46 |
47 | ![Blobs dataset](images/blobs.png)
48 |
49 | To cluster the dataset, use the ```fit``` function of the model:
50 | ```python
51 | predictions = model.fit(data[0])
52 | ```
53 |
54 | ## Visualizing the results
55 |
56 | You can visualize the results using matplotlib as you would normally do with classical clustering algorithms:
57 |
58 | ```python
59 | import matplotlib.pyplot as plt
60 | from itertools import cycle, islice
61 | import numpy as np
62 |
63 | colors = np.array(list(islice(cycle(["#000000", '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3']), int(max(predictions) + 1))))
64 | #Black color for outliers (if any)
65 | colors = np.append(colors, ["#000000"])
66 | plt.scatter(data[0][:, 0], data[0][:, 1], s=10, color=colors[predictions.astype('int8')])
67 | ```
68 |
69 | ![Clustering result](images/clustered_blobs.png)
70 |
71 | ## Colab Demo: Cluster Your Name!
72 |
73 | Run Visual Clustering inside a Colab notebook to cluster your own name! \
74 | https://colab.research.google.com/drive/1DcZXhKnUpz1GDoGaJmpS6VVNXVuaRmE5?usp=sharing
75 |
76 | Name Plot:
77 |
78 | ![Name plot](images/tarek-unclustered.png)
79 |
80 | Name clustered via Visual Clustering:
81 |
82 | ![Name clustered via Visual Clustering](images/tarek-clustered.png)
83 |
84 |
85 | ## Dependencies
86 | Make sure that you have the following libraries installed:
87 | ```
88 | transformers 4.15.0
89 | scipy 1.4.1
90 | tensorflow 2.7.0
91 | keras 2.7.0
92 | numpy 1.19.5
93 | opencv-python (cv2) 4.1.2
94 | scikit-image (skimage) 0.18.3
95 | ```
96 | ## Contact
97 | **Tarek Naous**: [Scholar](https://scholar.google.com/citations?user=ImyLv44AAAAJ&hl=en) | [Github](https://github.com/tareknaous?tab=repositories) |
98 | [Linkedin](https://www.linkedin.com/in/tareknaous/) | [ResearchGate](https://www.researchgate.net/profile/Tarek_Naous?ev=hdr_xprf) | [Personal Website](https://tareknaous.github.io/)
99 | | tareknaous@gmail.com
100 |
--------------------------------------------------------------------------------
/src/visual_clustering/VisualClustering.py:
--------------------------------------------------------------------------------
1 | # Requires the imports listed at the top of src/visual_clustering/__init__.py.
2 | class VisualClustering:
3 |     def __init__(self, max_filter_size = 1, median_filter_size = 1):
4 |         """Load the pre-trained U-Net from the Hugging Face Hub and store the filter sizes."""
5 | self.unet = from_pretrained_keras("tareknaous/unet-visual-clustering")
6 | self.max_filter_size = max_filter_size
7 | self.median_filter_size = median_filter_size
8 |
9 | def predict_sample(self, image):
10 | """Run inference using the U-Net model and return result
11 |
12 | Args:
13 | image (numpy.ndarray (256, 256, 1)): input image representing plotted 2D dataset
14 |
15 | Returns:
16 | result (numpy.ndarray (256, 256, 1)): predicted binary segmentation mask
17 |
18 | """
19 | prediction = self.unet.predict(image[tf.newaxis, ...])
20 | prediction[prediction > 0.5 ] = 1
21 | prediction[prediction !=1] = 0
22 | result = prediction[0]*255
23 | return result
24 |
25 | def create_input_image(self, data):
26 | #Initialize input matrix
27 | input = np.ones((256,256))
28 | #Fill matrix with data point values
29 | for i in range(0,len(data)):
30 | if math.floor(data[i][0]) < 256 and math.floor(data[i][1]) < 256:
31 | input[math.floor(data[i][0])][math.floor(data[i][1])] = 0
32 | elif math.floor(data[i][0]) >= 256:
33 | input[255][math.floor(data[i][1])] = 0
34 | elif math.floor(data[i][1]) >= 256:
35 | input[math.floor(data[i][0])][255] = 0
36 |
37 | return input
38 |
39 | def denoise_input(self, image):
40 | denoised = ndimage.median_filter(image, size=self.median_filter_size)
41 | return denoised
42 |
43 | def linear_shifting(self, data):
44 | max_x = max(data[:, 0])
45 | min_x = min(data[:, 0])
46 | new_max = 256
47 | new_min = 0
48 |
49 | data[:, 0] = (((data[:, 0] - min_x)*(new_max-new_min))/(max_x-min_x))+ new_min
50 |
51 | max_y = max(data[:, 1])
52 | min_y = min(data[:, 1])
53 | new_max_y = 256
54 | new_min_y = 0
55 |
56 | data[:, 1] = (((data[:, 1] - min_y)*(new_max_y-new_min_y))/(max_y-min_y))+ new_min_y
57 |
58 | return data
59 |
60 | def get_instances(self, prediction, data):
61 | #Adjust format (clusters to be 255 and rest is 0)
62 | prediction[prediction == 255] = 3
63 | prediction[prediction == 0] = 4
64 | prediction[prediction == 3] = 0
65 | prediction[prediction == 4] = 255
66 |
67 | #Convert to 8-bit image
68 | prediction = image.img_to_array(prediction, dtype='uint8')
69 |
70 | #Get 1 color channel
71 | cells=prediction[:,:,0]
72 | #Threshold
73 | ret1, thresh = cv2.threshold(cells, 0, 255, cv2.THRESH_BINARY)
74 | #Filter to remove noise
75 | kernel = np.ones((3,3),np.uint8)
76 | opening = cv2.morphologyEx(thresh,cv2.MORPH_OPEN,kernel, iterations = 2)
77 |
78 | #Obtain background
79 | background = cv2.dilate(opening,kernel,iterations=5)
80 | dist_transform = cv2.distanceTransform(opening,cv2.DIST_L2,5)
81 | ret2, foreground = cv2.threshold(dist_transform,0.04*dist_transform.max(),255,0)
82 | foreground = np.uint8(foreground)
83 | unknown = cv2.subtract(background,foreground)
84 |
85 | #Connected Component Analysis
86 | ret3, markers = cv2.connectedComponents(foreground)
87 | markers = markers+10
88 | markers[unknown==255] = 0
89 |
90 | #Watershed
91 | img = cv2.merge((prediction,prediction,prediction))
92 | markers = cv2.watershed(img,markers)
93 | img[markers == -1] = [0,255,255]
94 |
95 | #Maximum filtering
96 | markers = ndimage.maximum_filter(markers, size=self.max_filter_size)
97 |
98 | #Get regions
99 | regions = measure.regionprops(markers, intensity_image=cells)
100 |
101 | #Get Cluster IDs (Cluster Assignment)
102 | cluster_ids = np.zeros(len(data))
103 |
104 | for i in range(0,len(cluster_ids)):
105 | row = math.floor(data[i][0])
106 | column = math.floor(data[i][1])
107 | if row < 256 and column < 256:
108 | cluster_ids[i] = markers[row][column] - 10
109 | elif row >= 256:
110 | # cluster_ids[i] = markers[255][column]
111 | cluster_ids[i] = 0
112 | elif column >= 256:
113 | # cluster_ids[i] = markers[row][255]
114 | cluster_ids[i] = 0
115 |
116 | cluster_ids = cluster_ids.astype('int8')
117 | cluster_ids[cluster_ids == -11] = 0
118 |
119 | return cluster_ids
120 |
121 | def fit(self, data):
122 | data = self.linear_shifting(data)
123 | input = self.create_input_image(data)
124 | if self.median_filter_size == 1:
125 | result = self.predict_sample(input)
126 | labels = self.get_instances(result, data)
127 | else:
128 | denoised_input = self.denoise_input(input)
129 | result = self.predict_sample(denoised_input)
130 | labels = self.get_instances(result, data)
131 | return labels
132 |
--------------------------------------------------------------------------------
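Usage mirrors the README: instantiate the class and pass an (N, 2) array of points to `fit`, which returns one integer cluster id per point. A short sketch:

```python
# Sketch: cluster a synthetic blobs dataset with VisualClustering
# (same flow as the README usage section).
import matplotlib.pyplot as plt
from sklearn import datasets
from visual_clustering import VisualClustering

model = VisualClustering(median_filter_size=1, max_filter_size=1)
data = datasets.make_blobs(n_samples=50000, centers=6, random_state=23,
                           center_box=(-30, 30))
predictions = model.fit(data[0])  # one cluster id per point

plt.scatter(data[0][:, 0], data[0][:, 1], s=1, c=predictions)
plt.show()
```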
/examples/runtime.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from sklearn import datasets
5 | from sklearn.cluster import KMeans
6 | from sklearn import cluster, datasets, mixture
7 | from sklearn.neighbors import kneighbors_graph
8 |
9 |
10 |
11 |
12 | NUM_CLUSTERS = 10
13 | CLUSTER_STD = 4 * np.ones(NUM_CLUSTERS)
14 |
15 | # samples = [1000, 5000, 10000, 50000, 100000, 500000, 1000000, 1500000, 2000000]
16 | samples = [1000, 5000, 10000, 50000]
17 |
18 |
19 | time_visual = []
20 | time_kmeans = []
21 | time_dbscan = []
22 | time_affinity = []
23 | time_spectral = []
24 | time_optics = []
25 | time_gmm = []
26 | time_ward = []
27 | time_ms = []
28 | time_birch = []
29 | time_agglo = []
30 |
31 |
32 |
33 | for i in samples:
34 | data = datasets.make_blobs(n_samples=i, centers=NUM_CLUSTERS, random_state=151,center_box=(0, 256), cluster_std=CLUSTER_STD)
35 |
36 | #Compute Visual
37 | start = time.time()
38 | input = create_input_image(data)
39 | result = predict_sample(input)
40 | y_km = get_instances(result, data[0])
41 | end = time.time()
42 |
43 | time_visual.append(end-start)
44 |
45 | #Compute Kmeans
46 | km = KMeans(
47 | n_clusters=10, init='random',
48 | n_init=10, max_iter=300,
49 | tol=1e-04, random_state=0
50 | )
51 |
52 | start = time.time()
53 | y_km = km.fit_predict(data[0])
54 | end = time.time()
55 | time_kmeans.append(end - start)
56 |
57 | #Compute dbscan
58 | dbscan = cluster.DBSCAN(eps=0.15)
59 |
60 | start = time.time()
61 | y_km = dbscan.fit_predict(data[0])
62 | end = time.time()
63 | time_dbscan.append(end - start)
64 |
65 | #Compute Affinity Propagation
66 | # affinity_propagation = cluster.AffinityPropagation(damping=0.77, preference=-240)
67 |
68 | # start = time.time()
69 | # y_km = affinity_propagation.fit_predict(data[0])
70 | # end = time.time()
71 | # time_affinity.append(end - start)
72 |
73 | #Compute Spectral
74 | spectral = cluster.SpectralClustering(n_clusters=10, eigen_solver='arpack', affinity="nearest_neighbors")
75 | start = time.time()
76 | y_km = spectral.fit_predict(data[0])
77 | end = time.time()
78 | time_spectral.append(end - start)
79 |
80 | #Compute OPTICS
81 | # optics = cluster.OPTICS(min_samples=20, xi=0.05, min_cluster_size=0.2)
82 | # start = time.time()
83 | # y_km = optics.fit_predict(data[0])
84 | # end = time.time()
85 | # time_optics.append(end - start)
86 |
87 | #Compute GMM
88 | gmm = mixture.GaussianMixture(n_components=10, covariance_type='full')
89 | start = time.time()
90 | y_km = gmm.fit_predict(data[0])
91 | end = time.time()
92 | time_gmm.append(end - start)
93 |
94 | #Ward
95 | # start = time.time()
96 | # # connectivity matrix for structured Ward
97 | # connectivity = kneighbors_graph(data[0], n_neighbors=10, include_self=False)
98 | # # make connectivity symmetric
99 | # connectivity = 0.5 * (connectivity + connectivity.T)
100 | # ward = cluster.AgglomerativeClustering(n_clusters=10, linkage='ward', connectivity=connectivity)
101 | # y_km = ward.fit_predict(data[0])
102 | # end = time.time()
103 | # time_ward.append(end - start)
104 |
105 | #Mean Shift
106 | # estimate bandwidth for mean shift
107 | # start = time.time()
108 | # bandwidth = cluster.estimate_bandwidth(data[0], quantile=0.2)
109 | # ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
110 | # y_km = ms.fit_predict(data[0])
111 | # end = time.time()
112 | # time_ms.append(end - start)
113 |
114 | #Birch
115 | # birch = cluster.Birch(n_clusters=10)
116 | # start = time.time()
117 | # y_km = birch.fit_predict(data[0])
118 | # end = time.time()
119 | # time_birch.append(end - start)
120 |
121 | #Agglomerative
122 | connectivity = kneighbors_graph(data[0], n_neighbors=10, include_self=False)
123 | # make connectivity symmetric
124 | connectivity = 0.5 * (connectivity + connectivity.T)
125 | average_linkage = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock",n_clusters=10, connectivity=connectivity)
126 | start = time.time()
127 | y_km = average_linkage.fit_predict(data[0])
128 | end = time.time()
129 | time_agglo.append(end - start)
130 |
131 | plt.plot(samples, time_visual , 'r', marker='o', linewidth=2)
132 | plt.plot(samples, time_kmeans, 'b', marker='o',linewidth=2)
133 | plt.plot(samples, time_dbscan, 'g', marker='o',linewidth=2)
134 | plt.plot(samples, time_spectral, 'm', marker='o',linewidth=2)
135 | plt.plot(samples, time_gmm, 'y', marker='o',linewidth=2)
136 | plt.plot(samples, time_agglo, 'c', marker='o',linewidth=2)
137 |
138 |
139 |
140 |
141 | plt.legend(['Our Method', 'K-Means', 'DBSCAN', 'Spectral Clustering', 'Gaussian Mixture', 'Agglomerative Clustering'], fontsize=11)
142 | plt.grid( linestyle='-', linewidth=0.5)
143 | plt.ylabel('Time (sec)', fontsize=16)
144 | plt.xlabel('Samples', fontsize=16)
145 | plt.title("Time vs Nb of Samples for 10 blobs")
146 |
147 | plt.show()
148 |
149 |
150 |
--------------------------------------------------------------------------------
/segmentation/utils.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | from matplotlib import pyplot as plt
4 | from scipy import ndimage
5 | from skimage import measure, color, io
6 | from tensorflow.keras.preprocessing import image
7 | import math
8 | from scipy.spatial import ConvexHull
9 | from shapely.geometry import Polygon
10 | import tensorflow as tf
11 |
12 | # `model` is the U-Net built and compiled in segmentation/unet.py
13 | #Function that predicts on only 1 sample
14 | def predict_sample(image):
15 | prediction = model.predict(image[tf.newaxis, ...])
16 | prediction[prediction > 0.5 ] = 1
17 | prediction[prediction !=1] = 0
18 | result = prediction[0]*255
19 | return result
20 |
21 |
22 | #Function that creates the matrix that will be used as input to the binary segmentation model
23 | def create_input_image(data, visualize=False):
24 | #Initialize input matrix
25 | input = np.ones((256,256))
26 |
27 | #Fill matrix with data point values
28 | for i in range(0,len(data[0])):
29 | if math.floor(data[0][i][0]) < 256 and math.floor(data[0][i][1]) < 256:
30 | input[math.floor(data[0][i][0])][math.floor(data[0][i][1])] = 0
31 | elif math.floor(data[0][i][0]) >= 256:
32 | input[255][math.floor(data[0][i][1])] = 0
33 | elif math.floor(data[0][i][1]) >= 256:
34 | input[math.floor(data[0][i][0])][255] = 0
35 |
36 | #Visualize
37 | if visualize == True:
38 | plt.imshow(input.T, cmap='gray')
39 | plt.gca().invert_yaxis()
40 |
41 | return input
42 |
43 |
51 |
52 |
53 | #Function that performs instance segmentation and clusters the dataset
54 | def get_instances(prediction, data, max_filter_size=1):
55 | #Adjust format (clusters to be 255 and rest is 0)
56 | prediction[prediction == 255] = 3
57 | prediction[prediction == 0] = 4
58 | prediction[prediction == 3] = 0
59 | prediction[prediction == 4] = 255
60 |
61 | #Convert to 8-bit image
62 | prediction = image.img_to_array(prediction, dtype='uint8')
63 |
64 | #Get 1 color channel
65 | cells=prediction[:,:,0]
66 | #Threshold
67 | ret1, thresh = cv2.threshold(cells, 0, 255, cv2.THRESH_BINARY)
68 | #Filter to remove noise
69 | kernel = np.ones((3,3),np.uint8)
70 | opening = cv2.morphologyEx(thresh,cv2.MORPH_OPEN,kernel, iterations = 2)
71 |
72 | #Get the background
73 | background = cv2.dilate(opening,kernel,iterations=5)
74 | dist_transform = cv2.distanceTransform(opening,cv2.DIST_L2,5)
75 | ret2, foreground = cv2.threshold(dist_transform,0.04*dist_transform.max(),255,0)
76 | foreground = np.uint8(foreground)
77 | unknown = cv2.subtract(background,foreground)
78 |
79 | #Connected Component Analysis
80 | ret3, markers = cv2.connectedComponents(foreground)
81 | markers = markers+10
82 | markers[unknown==255] = 0
83 |
84 | #Watershed
85 | img = cv2.merge((prediction,prediction,prediction))
86 | markers = cv2.watershed(img,markers)
87 | img[markers == -1] = [0,255,255]
88 |
89 | #Maximum filtering
90 | markers = ndimage.maximum_filter(markers, size=max_filter_size)
91 | # plt.imshow(markers.T, cmap='gray')
92 | # plt.gca().invert_yaxis()
93 |
94 | #Get an RGB colored image
95 | img2 = color.label2rgb(markers, bg_label=1)
96 | # plt.imshow(img2)
97 | # plt.gca().invert_yaxis()
98 |
99 | #Get regions
100 | regions = measure.regionprops(markers, intensity_image=cells)
101 |
102 | #Get Cluster IDs
103 | cluster_ids = np.zeros(len(data))
104 |
105 | for i in range(0,len(cluster_ids)):
106 | row = math.floor(data[i][0])
107 | column = math.floor(data[i][1])
108 | if row < 256 and column < 256:
109 | cluster_ids[i] = markers[row][column] - 10
110 | elif row >= 256:
111 | # cluster_ids[i] = markers[255][column]
112 | cluster_ids[i] = 0
113 | elif column >= 256:
114 | # cluster_ids[i] = markers[row][255]
115 | cluster_ids[i] = 0
116 |
117 | cluster_ids = cluster_ids.astype('int8')
118 | cluster_ids[cluster_ids == -11] = 0
119 |
120 | return cluster_ids
121 |
122 |
123 |
124 | def draw_clusters(regions,data):
125 | for i in range(1,len(regions)):
126 | #Get the coordinates of the region
127 | coordinates = regions[i].coords
128 | #Compute the convex hull
129 | hull = ConvexHull(coordinates)
130 | #Get the indexes of the vertices
131 | vertices_ids = hull.vertices
132 | #Append real values of the vertices
133 | hull_vertices = []
134 | for j in range(0,len(vertices_ids)):
135 | hull_vertices.append(coordinates[vertices_ids[j]])
136 | #Create and plot polygon of cluster
137 | polygon = Polygon(hull_vertices)
138 | x,y = polygon.exterior.xy
139 | plt.plot(x,y)
140 |
141 | #Overlay the data points on the image
142 | plt.scatter(data[0][:, 0], data[0][:, 1], s=1, c='black')
143 |
144 |
145 | def visual_clustering(data):
146 | input = create_input_image(data)
147 | result = predict_sample(input)
148 | regions = get_instances(result, data[0])
149 | draw_clusters(regions,data)
--------------------------------------------------------------------------------
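These helpers assume a trained `model` in scope (segmentation/unet.py only builds and compiles one). A sketch of the functional pipeline end to end, mirroring how examples/runtime.py calls it; the dataset parameters are arbitrary:

```python
# Sketch: functional pipeline on a blobs dataset, assuming a trained `model`.
from sklearn import datasets

data = datasets.make_blobs(n_samples=10000, centers=5, random_state=7,
                           center_box=(0, 256))   # points already in [0, 256]
input = create_input_image(data, visualize=True)  # 256x256 binary plot
result = predict_sample(input)                    # predicted segmentation mask
cluster_ids = get_instances(result, data[0], max_filter_size=5)
```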
/src/visual_clustering/__init__.py:
--------------------------------------------------------------------------------
1 | import math
2 | import cv2
3 | import numpy as np
4 | import tensorflow as tf
5 | from tensorflow.keras.preprocessing import image
6 | from scipy import ndimage
7 | from matplotlib import pyplot as plt
8 | from skimage import measure, color, io
9 | from huggingface_hub.keras_mixin import from_pretrained_keras
10 |
11 |
12 | class VisualClustering:
13 | def __init__(self, max_filter_size = 1, median_filter_size = 1):
14 | """self (object): containing the loaded pre-trained U-Net from the Huggingface hub
15 | """
16 | self.unet = from_pretrained_keras("tareknaous/unet-visual-clustering")
17 | self.max_filter_size = max_filter_size
18 | self.median_filter_size = median_filter_size
19 |
20 | def predict_sample(self, image):
21 | """Run inference using the U-Net model and return result
22 |
23 | Args:
24 | image (numpy.ndarray (256, 256, 1)): input image representing plotted 2D dataset
25 |
26 | Returns:
27 | result (numpy.ndarray (256, 256, 1)): predicted binary segmentation mask
28 |
29 | """
30 | prediction = self.unet.predict(image[tf.newaxis, ...])
31 | prediction[prediction > 0.5 ] = 1
32 | prediction[prediction !=1] = 0
33 | result = prediction[0]*255
34 | return result
35 |
36 | def create_input_image(self, data):
37 | #Initialize input matrix
38 | input = np.ones((256,256))
39 | #Fill matrix with data point values
40 | for i in range(0,len(data)):
41 | if math.floor(data[i][0]) < 256 and math.floor(data[i][1]) < 256:
42 | input[math.floor(data[i][0])][math.floor(data[i][1])] = 0
43 | elif math.floor(data[i][0]) >= 256:
44 | input[255][math.floor(data[i][1])] = 0
45 | elif math.floor(data[i][1]) >= 256:
46 | input[math.floor(data[i][0])][255] = 0
47 |
48 | return input
49 |
50 | def denoise_input(self, image):
51 | denoised = ndimage.median_filter(image, size=self.median_filter_size)
52 | return denoised
53 |
54 | def linear_shifting(self, data):
55 | max_x = max(data[:, 0])
56 | min_x = min(data[:, 0])
57 | new_max = 256
58 | new_min = 0
59 |
60 | data[:, 0] = (((data[:, 0] - min_x)*(new_max-new_min))/(max_x-min_x))+ new_min
61 |
62 | max_y = max(data[:, 1])
63 | min_y = min(data[:, 1])
64 | new_max_y = 256
65 | new_min_y = 0
66 |
67 | data[:, 1] = (((data[:, 1] - min_y)*(new_max_y-new_min_y))/(max_y-min_y))+ new_min_y
68 |
69 | return data
70 |
71 | def get_instances(self, prediction, data):
72 | #Adjust format (clusters to be 255 and rest is 0)
73 | prediction[prediction == 255] = 3
74 | prediction[prediction == 0] = 4
75 | prediction[prediction == 3] = 0
76 | prediction[prediction == 4] = 255
77 |
78 | #Convert to 8-bit image
79 | prediction = image.img_to_array(prediction, dtype='uint8')
80 |
81 | #Get 1 color channel
82 | cells=prediction[:,:,0]
83 | #Threshold
84 | ret1, thresh = cv2.threshold(cells, 0, 255, cv2.THRESH_BINARY)
85 | #Filter to remove noise
86 | kernel = np.ones((3,3),np.uint8)
87 | opening = cv2.morphologyEx(thresh,cv2.MORPH_OPEN,kernel, iterations = 2)
88 |
89 | #Obtain background
90 | background = cv2.dilate(opening,kernel,iterations=5)
91 | dist_transform = cv2.distanceTransform(opening,cv2.DIST_L2,5)
92 | ret2, foreground = cv2.threshold(dist_transform,0.04*dist_transform.max(),255,0)
93 | foreground = np.uint8(foreground)
94 | unknown = cv2.subtract(background,foreground)
95 |
96 | #Connected Component Analysis
97 | ret3, markers = cv2.connectedComponents(foreground)
98 | markers = markers+10
99 | markers[unknown==255] = 0
100 |
101 | #Watershed
102 | img = cv2.merge((prediction,prediction,prediction))
103 | markers = cv2.watershed(img,markers)
104 | img[markers == -1] = [0,255,255]
105 |
106 | #Maximum filtering
107 | markers = ndimage.maximum_filter(markers, size=self.max_filter_size)
108 |
109 | #Get regions
110 | regions = measure.regionprops(markers, intensity_image=cells)
111 |
112 | #Get Cluster IDs (Cluster Assignment)
113 | cluster_ids = np.zeros(len(data))
114 |
115 | for i in range(0,len(cluster_ids)):
116 | row = math.floor(data[i][0])
117 | column = math.floor(data[i][1])
118 | if row < 256 and column < 256:
119 | cluster_ids[i] = markers[row][column] - 10
120 | elif row >= 256:
121 | # cluster_ids[i] = markers[255][column]
122 | cluster_ids[i] = 0
123 | elif column >= 256:
124 | # cluster_ids[i] = markers[row][255]
125 | cluster_ids[i] = 0
126 |
127 | cluster_ids = cluster_ids.astype('int8')
128 | cluster_ids[cluster_ids == -11] = 0
129 |
130 | return cluster_ids
131 |
132 | def fit(self, data):
133 | data = self.linear_shifting(data)
134 | input = self.create_input_image(data)
135 | if self.median_filter_size == 1:
136 | result = self.predict_sample(input)
137 | labels = self.get_instances(result, data)
138 | else:
139 | denoised_input = self.denoise_input(input)
140 | result = self.predict_sample(denoised_input)
141 | labels = self.get_instances(result, data)
142 | return labels
143 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/dataset/utils/functions.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn import datasets
4 | import alphashape
5 | from scipy.spatial import ConvexHull, convex_hull_plot_2d
6 | from shapely.ops import unary_union
7 | from shapely.geometry import Polygon
8 |
9 | def find_intersections(polygons):
10 |     """Decide, for each pair of intersecting polygons, whether they should be
11 |     united or subtracted, based on the percentage of intersection."""
12 | #Percentage threshold for uniting polygons
13 | THRESHOLD = 30
14 | #create empty dictionary
15 | intersections = dict()
16 |
17 | #create keys in dictionary
18 | for i in range(0,len(polygons)):
19 | key = i
20 | intersections[key] = []
21 |
22 | #Add intersections in the dictionary based on percentage criterion
23 | for i in range(0,len(polygons)):
24 | for j in range(i+1,len(polygons)):
25 | intersection_percentage = []
26 | intersection_percentage.append((polygons[i].intersection(polygons[j]).area)/polygons[i].area*100)
27 | intersection_percentage.append((polygons[i].intersection(polygons[j]).area)/polygons[j].area*100)
28 |
29 | if polygons[i].intersects(polygons[j]) == True:
30 | if intersection_percentage[0] >= THRESHOLD or intersection_percentage[1] >= THRESHOLD:
31 | key = i
32 | value = [j, 'union']
33 | intersections[key].append(value)
34 | else:
35 | key = i
36 | value = [j, 'subtraction']
37 | intersections[key].append(value)
38 |
39 | return intersections
40 |
41 |
42 |
43 |
44 | def return_unique_polygons(intersections):
45 |     """Drop duplicate keys so each group of intersecting polygons appears once (union and subtraction criteria)."""
46 | remove = [] #used to store index of keys to remove
47 |
48 | #check which keys in the dictionary will need to be removed
49 | for key in intersections:
50 | for value in intersections[key]:
51 | if value[0] in intersections:
52 | remove.append(value[0])
53 |
54 | #remove key from dictionary
55 | for i in range(0,len(remove)):
56 | #Add exception if code was trying to remove key that was already removed
57 | try:
58 | intersections.pop(remove[i])
59 | except KeyError:
60 | continue
61 |
62 | return intersections
63 |
64 |
65 |
66 |
67 | def plot_new_polygons(unique_dictionary, polygons):
68 |     """Subtract polygons whose intersection percentage is below the threshold, and combine those above it."""
69 |
70 | mask_polygons = []
71 |
72 | #Variable to decide whether to perform subtraction in case we have 3 or more intersecting polygons
73 | need_subtract = False
74 |
75 | for key in unique_dictionary:
76 | need_subtract = False
77 | #check if the key is empty (has no values)
78 | if not unique_dictionary[key]:
79 | #plot the polygon with no intersections
80 | x,y = polygons[key].exterior.xy
81 | # plt.plot(x,y)
82 | mask_polygons.append(polygons[key])
83 |
84 | else:
85 | #create an array to add the polygons to be merged
86 | combination_merge = []
87 | #added the polygon in the key itself
88 | combination_merge.append(polygons[key])
89 | #create an array to add the polygons to be subtracted, in case there is any
90 | combination_substract = []
91 |
92 | for value in unique_dictionary[key]:
93 | if value[1] == 'union':
94 | combination_merge.append(polygons[value[0]])
95 |
96 | elif value[1] == 'subtraction':
97 | combination_substract.append(polygons[value[0]])
98 | need_subtract = True
99 |
100 | #merge the polygons to be merged
101 | merged = unary_union(combination_merge)
102 |
103 | #If no need to subtract, then just plot the merged polygons
104 | if need_subtract == False:
105 | x,y = merged.exterior.xy
106 | # plt.plot(x,y)
107 | mask_polygons.append(merged)
108 |
109 | elif need_subtract == True:
110 | #subtract the one to be subtracted from the merged ones
111 | subtracted = []
112 | for i in range(0,len(combination_substract)):
113 | subtracted.append(merged.symmetric_difference(combination_substract[i]))
114 | for j in range(0,len(subtracted[i])):
115 | x,y = subtracted[i][j].exterior.xy
116 | # plt.plot(x,y)
117 | mask_polygons.append(subtracted[i][j])
118 |
119 | return mask_polygons
120 |
121 |
122 |
123 | def create_mask(polygons):
124 | for i in range(0,len(polygons)):
125 | x, y = polygons[i].exterior.xy
126 | plt.fill(x,y, "black")
127 | plt.axis('off')
128 |
129 |
130 |
131 | def create_polygons(type, num_samples, num_clusters, random_state, *cluster_std, keep_points=False):
132 | if type == 'blobs': # works fine
133 | data = datasets.make_blobs(n_samples=num_samples, centers=num_clusters, random_state=random_state,
134 | center_box=(-30, 30))
135 |
136 | elif type == 'aniso': # works fine
137 | X, y = datasets.make_blobs(n_samples=num_samples, centers=num_clusters, random_state=random_state, center_box=(-30, 30))
138 | transformation = [[0.6, -0.6], [-0.4, 0.8]]
139 | X_aniso = np.dot(X, transformation)
140 | data = (X_aniso, y)
141 |
142 | elif type == 'noisy_moons': # works fine
143 | data = datasets.make_moons(n_samples=num_samples, noise=.05)
144 | if num_clusters != 2:
145 | raise Exception("Can only take 2 clusters for noisy_moons")
146 |
147 | elif type == 'noisy_circles': # works fine
148 | data = datasets.make_circles(n_samples=num_samples, factor=.01, noise=.2)
149 | if num_clusters != 2:
150 | raise Exception("Can only take 2 clusters for noisy_circles")
151 |
152 | elif type == 'varied_blobs': # works fine
153 | cluster_std = 1.5 * np.random.random(num_clusters)
154 | data = datasets.make_blobs(n_samples=num_samples,
155 | centers=num_clusters,
156 | cluster_std=cluster_std,
157 | random_state=random_state,
158 | center_box=(-30, 30))
159 | if keep_points==True:
160 | plt.figure()
161 | plt.scatter(data[0][:, 0], data[0][:, 1], s=1, c='black')
162 | plt.axis('off')
163 |
164 | # Create a list of empty arrays for each cluster
165 | clusters = [[] for _ in range(num_clusters)]
166 |
167 | # Check each point to which cluster it belongs and append to the list accordingly
168 | for i in range(0, len(data[0])):
169 | cluster_index = data[1][i]
170 | clusters[cluster_index].append(data[0][i])
171 |
172 | # Create empty arrays for convex hulls and data points
173 | hulls = [[] for _ in range(num_clusters)]
174 | points = [[] for _ in range(num_clusters)]
175 | hulls_vertices = [[] for _ in range(num_clusters)]
176 |
177 | # Use the Concave Hull for the noisy moons shape
178 | if type == "noisy_moons":
179 | ALPHA = 5
180 | for i in range(0, len(clusters)):
181 | hull = alphashape.alphashape(np.array(clusters[i]), ALPHA)
182 | hull_pts = hull.exterior.coords.xy
183 | hulls[i] = hull_pts
184 |
185 | # Append vertices
186 | for i in range(0, len(hulls)):
187 | for j in range(0, len(hulls[i][0])):
188 | vertex = [hulls[i][0][j], hulls[i][1][j]]
189 | hulls_vertices[i].append(vertex)
190 |
191 |
192 | # Use the ConvexHull for all other shapes
193 | else:
194 | # Append the hulls
195 | for i in range(0, len(clusters)):
196 | hulls[i] = ConvexHull(clusters[i])
197 |
198 | # Append vertices of the hulls
199 | for i in range(0, len(hulls)):
200 | for j in range(0, len(hulls[i].vertices)):
201 | hulls_vertices[i].append(clusters[i][hulls[i].vertices[j]])
202 |
203 | # Create empty array to append the polygons
204 | polygons = []
205 |
206 | # Create polygons from hull vertices
207 | for i in range(0, len(hulls_vertices)):
208 | polygon = Polygon(np.array(hulls_vertices[i]))
209 | polygons.append(polygon)
210 |
211 | return polygons
--------------------------------------------------------------------------------
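To make the 30% rule in `find_intersections` concrete, a worked example with two overlapping unit squares (hypothetical coordinates): each square loses half its area to the overlap, and 50% exceeds THRESHOLD = 30, so the pair is marked for union.

```python
# Worked example of the find_intersections threshold rule.
from shapely.geometry import Polygon

a = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
b = Polygon([(0.5, 0), (1.5, 0), (1.5, 1), (0.5, 1)])

overlap = a.intersection(b).area   # 0.5
print(overlap / a.area * 100)      # 50.0 -> above the 30% threshold
print(find_intersections([a, b]))  # {0: [[1, 'union']], 1: []}
```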
/examples/comparisons.py:
--------------------------------------------------------------------------------
1 | # Adapted from scikit-learn's "Comparing different clustering algorithms on toy datasets" example
2 |
3 | import time
4 | import warnings
5 |
6 | import numpy as np
7 | import matplotlib.pyplot as plt
8 |
9 | from sklearn import cluster, datasets, mixture
10 | from sklearn.neighbors import kneighbors_graph
11 | from sklearn.preprocessing import StandardScaler
12 | from itertools import cycle, islice
13 | # create_input_image, predict_sample, and get_instances must be in scope (see segmentation/utils.py)
14 | np.random.seed(0)
15 |
16 | # ============
17 | # Generate datasets. We choose the size big enough to see the scalability
18 | # of the algorithms, but not too big to avoid too long running times
19 | # ============
20 | n_samples = 2000
21 | noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
22 | noise=.05)
23 | noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
24 | blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
25 | no_structure = np.random.rand(n_samples, 2), None
26 |
27 | # Anisotropicly distributed data
28 | random_state = 170
29 | X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
30 | transformation = [[0.6, -0.6], [-0.4, 0.8]]
31 | X_aniso = np.dot(X, transformation)
32 | aniso = (X_aniso, y)
33 |
34 | # blobs with varied variances
35 | varied = datasets.make_blobs(n_samples=n_samples,
36 | cluster_std=[1.0, 2.5, 0.5],
37 | random_state=random_state)
38 |
39 | # ============
40 | # Set up cluster parameters
41 | # ============
42 | plt.figure(figsize=(9 * 2 + 3, 13))
43 | plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.95, wspace=.05,
44 | hspace=.01)
45 |
46 | plot_num = 1
47 |
48 | default_base = {'quantile': .3,
49 | 'eps': .3,
50 | 'damping': .9,
51 | 'preference': -200,
52 | 'n_neighbors': 10,
53 | 'n_clusters': 3,
54 | 'min_samples': 20,
55 | 'xi': 0.05,
56 | 'min_cluster_size': 0.1}
57 |
58 | datasets = [
59 | (noisy_circles, {'damping': .77, 'preference': -240,
60 | 'quantile': .2, 'n_clusters': 2,
61 | 'min_samples': 20, 'xi': 0.25}),
62 | (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
63 | (varied, {'eps': .18, 'n_neighbors': 2,
64 | 'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2}),
65 | (aniso, {'eps': .15, 'n_neighbors': 2,
66 | 'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2}),
67 | (blobs, {}),
68 | (no_structure, {})]
69 |
70 | for i_dataset, (dataset, algo_params) in enumerate(datasets):
71 | # update parameters with dataset-specific values
72 | params = default_base.copy()
73 | params.update(algo_params)
74 |
75 | X, y = dataset
76 |
77 | # normalize dataset for easier parameter selection
78 | X = StandardScaler().fit_transform(X)
79 |
80 | # estimate bandwidth for mean shift
81 | bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])
82 |
83 | # connectivity matrix for structured Ward
84 | connectivity = kneighbors_graph(
85 | X, n_neighbors=params['n_neighbors'], include_self=False)
86 | # make connectivity symmetric
87 | connectivity = 0.5 * (connectivity + connectivity.T)
88 |
89 | # ============
90 | # Create cluster objects
91 | # ============
92 | ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
93 | two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
94 | ward = cluster.AgglomerativeClustering(
95 | n_clusters=params['n_clusters'], linkage='ward',
96 | connectivity=connectivity)
97 | spectral = cluster.SpectralClustering(
98 | n_clusters=params['n_clusters'], eigen_solver='arpack',
99 | affinity="nearest_neighbors")
100 | dbscan = cluster.DBSCAN(eps=params['eps'])
101 | optics = cluster.OPTICS(min_samples=params['min_samples'],
102 | xi=params['xi'],
103 | min_cluster_size=params['min_cluster_size'])
104 | affinity_propagation = cluster.AffinityPropagation(
105 | damping=params['damping'], preference=params['preference'])
106 | average_linkage = cluster.AgglomerativeClustering(
107 | linkage="average", affinity="cityblock",
108 | n_clusters=params['n_clusters'], connectivity=connectivity)
109 | birch = cluster.Birch(n_clusters=params['n_clusters'])
110 | gmm = mixture.GaussianMixture(
111 | n_components=params['n_clusters'], covariance_type='full')
112 | visual = 1  # placeholder; Visual Clustering runs in its own branch below
113 |
114 | clustering_algorithms = (
115 | ('Visual\nClustering', visual),
116 | ('MiniBatch\nKMeans', two_means),
117 | ('Affinity\nPropagation', affinity_propagation),
118 | ('MeanShift', ms),
119 | ('Spectral\nClustering', spectral),
120 | ('Ward', ward),
121 | ('Agglomerative\nClustering', average_linkage),
122 | ('DBSCAN', dbscan),
123 | ('OPTICS', optics),
124 | ('BIRCH', birch),
125 | ('Gaussian\nMixture', gmm)
126 | )
127 |
128 | for name, algorithm in clustering_algorithms:
129 | if name == 'Visual\nClustering':
130 | t0 = time.time()
131 |
132 | max_x = max(dataset[0][:, 0])
133 | min_x = min(dataset[0][:, 0])
134 | new_max = 256
135 | new_min = 0
136 |
137 | dataset[0][:, 0] = (((dataset[0][:, 0] - min_x)*(new_max-new_min))/(max_x-min_x))+ new_min
138 |
139 | max_y = max(dataset[0][:, 1])
140 | min_y = min(dataset[0][:, 1])
141 | new_max_y = 256
142 | new_min_y = 0
143 |
144 | dataset[0][:, 1] = (((dataset[0][:, 1] - min_y)*(new_max_y-new_min_y))/(max_y-min_y))+ new_min_y
145 |
146 | input = create_input_image(dataset[0])
147 | result = predict_sample(input)
148 | y_pred = get_instances(result, dataset[0], max_filter_size=20)
149 | t1 = time.time()
150 |
151 | plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
152 |
153 | if i_dataset == 0:
154 | plt.title(name, size=18)
155 |
156 |
157 | colors = np.array(list(islice(cycle(["#000000", '#377eb8', '#ff7f00', '#4daf4a',
158 | '#f781bf', '#a65628', '#984ea3',
159 | '#999999', '#e41a1c', '#dede00' ,'#491010']),
160 | int(max(y_pred) + 1))))
161 | # add black color for outliers (if any)
162 | colors = np.append(colors, ["#000000"])
163 | plt.scatter(dataset[0][:, 0], dataset[0][:, 1], s=10, color=colors[y_pred.astype('int8')])
164 |
165 | plt.xlim(-20, 280)
166 | plt.ylim(-20, 280)
167 | plt.xticks(())
168 | plt.yticks(())
169 | plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
170 | transform=plt.gca().transAxes, size=15,
171 | horizontalalignment='right')
172 | plot_num += 1
173 |
174 |
175 | else:
176 | t0 = time.time()
177 |
178 | # catch warnings related to kneighbors_graph
179 | with warnings.catch_warnings():
180 | warnings.filterwarnings(
181 | "ignore",
182 | message="the number of connected components of the " +
183 | "connectivity matrix is [0-9]{1,2}" +
184 | " > 1. Completing it to avoid stopping the tree early.",
185 | category=UserWarning)
186 | warnings.filterwarnings(
187 | "ignore",
188 | message="Graph is not fully connected, spectral embedding" +
189 | " may not work as expected.",
190 | category=UserWarning)
191 | algorithm.fit(X)
192 |
193 | t1 = time.time()
194 | if hasattr(algorithm, 'labels_'):
195 | y_pred = algorithm.labels_.astype(int)
196 | else:
197 | y_pred = algorithm.predict(X)
198 |
199 | plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
200 | if i_dataset == 0:
201 | plt.title(name, size=18)
202 |
203 | colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
204 | '#f781bf', '#a65628', '#984ea3',
205 | '#999999', '#e41a1c', '#dede00']),
206 | int(max(y_pred) + 1))))
207 | # add black color for outliers (if any)
208 | colors = np.append(colors, ["#000000"])
209 | plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
210 |
211 | plt.xlim(-2.5, 2.5)
212 | plt.ylim(-2.5, 2.5)
213 | plt.xticks(())
214 | plt.yticks(())
215 | plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
216 | transform=plt.gca().transAxes, size=15,
217 | horizontalalignment='right')
218 | plot_num += 1
219 |
220 | plt.show()
--------------------------------------------------------------------------------