├── dataset
│   ├── utils
│   │   ├── __init__.py
│   │   └── functions.py
│   └── main.py
├── images
│   ├── blobs.png
│   ├── algorithm.pdf
│   ├── algorithm.png
│   ├── clustered_blobs.png
│   ├── tarek-clustered.png
│   └── tarek-unclustered.png
├── pyproject.toml
├── .idea
│   ├── misc.xml
│   ├── vcs.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   ├── asdas.iml
│   └── workspace.xml
├── setup.cfg
├── LICENSE
├── dataset.py
├── gradio-demos
│   └── demo-synthetic-datasets.py
├── segmentation
│   ├── unet.py
│   └── utils.py
├── README.md
├── src
│   └── visual_clustering
│       ├── VisualClustering.py
│       └── __init__.py
└── examples
    ├── runtime.py
    └── comparisons.py

/dataset/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/images/blobs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tareknaous/visual-clustering/HEAD/images/blobs.png
--------------------------------------------------------------------------------
/images/algorithm.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tareknaous/visual-clustering/HEAD/images/algorithm.pdf
--------------------------------------------------------------------------------
/images/algorithm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tareknaous/visual-clustering/HEAD/images/algorithm.png
--------------------------------------------------------------------------------
/images/clustered_blobs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tareknaous/visual-clustering/HEAD/images/clustered_blobs.png
--------------------------------------------------------------------------------
/images/tarek-clustered.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tareknaous/visual-clustering/HEAD/images/tarek-clustered.png
--------------------------------------------------------------------------------
/images/tarek-unclustered.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tareknaous/visual-clustering/HEAD/images/tarek-unclustered.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools>=42",
4 |     "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/asdas.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = visual-clustering
3 | version = 1.0.0
4 | author = Tarek Naous
5 | author_email = tareknaous@gmail.com
6 | description = An implementation of the Visual Clustering algorithm
7 | long_description = file: README.md
8 | long_description_content_type = text/markdown
9 | url = https://github.com/tareknaous/visual-clustering
10 | project_urls =
11 |     Bug Tracker = https://github.com/tareknaous/visual-clustering
12 | classifiers =
13 |     Programming Language :: Python :: 3
14 |     License :: OSI Approved :: MIT License
15 |     Operating System :: OS Independent
16 | 
17 | [options]
18 | package_dir =
19 |     = src
20 | packages = find:
21 | python_requires = >=3.6
22 | 
23 | [options.packages.find]
24 | where = src
--------------------------------------------------------------------------------
/dataset/main.py:
--------------------------------------------------------------------------------
1 | from utils.functions import create_polygons
2 | from utils.functions import find_intersections
3 | from utils.functions import return_unique_polygons
4 | from utils.functions import plot_new_polygons
5 | from utils.functions import create_mask
6 | 
7 | 
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | from sklearn import datasets
11 | import alphashape
12 | from scipy.spatial import ConvexHull, convex_hull_plot_2d
13 | from shapely.ops import cascaded_union
14 | from shapely.geometry import Polygon
15 | 
16 | 
17 | #Test function
18 | NUM_SAMPLES = 1500
19 | NUM_CLUSTERS = 11
20 | test = create_polygons(type='blobs',
21 |                        num_samples=NUM_SAMPLES,
22 |                        num_clusters=NUM_CLUSTERS,
23 |                        random_state=19,
24 |                        keep_points=True)
25 | 
26 | intersections = find_intersections(test)
27 | dictionary = return_unique_polygons(intersections)
28 | plt.savefig('cluster.png')  #saves the scatter plot created by create_polygons(keep_points=True)
29 | polygons = plot_new_polygons(dictionary, test)
30 | plt.figure()
31 | create_mask(polygons)
32 | plt.savefig('annotation.png')
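
#Possible extension (a sketch, not part of the original script): sweep the random
#seed to generate many (scatter plot, mask) pairs for training the segmentation
#model; the file-name pattern below is illustrative.
def generate_pairs(num_pairs):
    for seed in range(num_pairs):
        polys = create_polygons(type='blobs',
                                num_samples=NUM_SAMPLES,
                                num_clusters=NUM_CLUSTERS,
                                random_state=seed,
                                keep_points=True)
        plt.savefig('cluster_%d.png' % seed)
        unique = return_unique_polygons(find_intersections(polys))
        masks = plot_new_polygons(unique, polys)
        plt.figure()
        create_mask(masks)
        plt.savefig('annotation_%d.png' % seed)
        plt.close('all')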
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2018 The Python Packaging Authority
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from scipy.spatial import ConvexHull
4 | from sklearn import datasets
5 | 
6 | #blobs
7 | n_samples = 1500
8 | blobs = datasets.make_blobs(n_samples=n_samples, centers=4, random_state=3)
9 | # plt.scatter(blobs[0][:,0],blobs[0][:,1])
10 | # plt.show()
11 | 
12 | #Group the points by cluster label
13 | clusters = [[] for _ in range(4)]
14 | for point, label in zip(blobs[0], blobs[1]):
15 |     clusters[label].append(point)
16 | 
17 | #Plot the convex hull of each cluster
18 | for cluster_points in clusters:
19 |     hull = ConvexHull(cluster_points)
20 |     points = np.array(cluster_points)
21 |     for simplex in hull.simplices:
22 |         plt.plot(points[simplex, 0], points[simplex, 1], 'k-')
23 | 
24 | plt.show()
--------------------------------------------------------------------------------
/gradio-demos/demo-synthetic-datasets.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | from itertools import cycle, islice
3 | import numpy as np; import matplotlib.pyplot as plt
4 | from sklearn import datasets; from scipy import ndimage  #NOTE: create_input_image, predict_sample, and get_instances (see segmentation/utils.py) must also be in scope
5 | def visual_clustering(cluster_type, num_clusters, num_samples, random_state, median_kernel_size, max_kernel_size):
6 | 
7 |     NUM_CLUSTERS = num_clusters
8 |     CLUSTER_STD = 4 * np.ones(NUM_CLUSTERS)
9 | 
10 |     if cluster_type == "blobs":
11 |         data = datasets.make_blobs(n_samples=num_samples, centers=NUM_CLUSTERS, random_state=random_state,center_box=(0, 256), cluster_std=CLUSTER_STD)
12 | 
13 |     elif cluster_type == "varied blobs":
14 |         cluster_std = 1.5 * np.ones(NUM_CLUSTERS)
15 |         data = datasets.make_blobs(n_samples=num_samples, centers=NUM_CLUSTERS, cluster_std=cluster_std, random_state=random_state)
16 | 
17 |     elif cluster_type == "aniso":
18 |         X, y = datasets.make_blobs(n_samples=num_samples, centers=NUM_CLUSTERS, random_state=random_state, center_box=(-30, 30))
19 |         transformation = [[0.8, -0.6], [-0.4, 0.8]]
20 |         X_aniso = np.dot(X, transformation)
21 |         data = (X_aniso, y)
22 | 
23 |     elif 
cluster_type == "noisy moons": 24 | data = datasets.make_moons(n_samples=num_samples, noise=.05) 25 | 26 | elif cluster_type == "noisy circles": 27 | data = datasets.make_circles(n_samples=num_samples, factor=.01, noise=.05) 28 | 29 | max_x = max(data[0][:, 0]) 30 | min_x = min(data[0][:, 0]) 31 | new_max = 256 32 | new_min = 0 33 | 34 | data[0][:, 0] = (((data[0][:, 0] - min_x)*(new_max-new_min))/(max_x-min_x))+ new_min 35 | 36 | max_y = max(data[0][:, 1]) 37 | min_y = min(data[0][:, 1]) 38 | new_max_y = 256 39 | new_min_y = 0 40 | 41 | data[0][:, 1] = (((data[0][:, 1] - min_y)*(new_max_y-new_min_y))/(max_y-min_y))+ new_min_y 42 | 43 | fig1 = plt.figure() 44 | plt.scatter(data[0][:, 0], data[0][:, 1], s=1, c='black') 45 | plt.close() 46 | 47 | input = create_input_image(data[0]) 48 | filtered = ndimage.median_filter(input, size=median_kernel_size) 49 | result = predict_sample(filtered) 50 | y_km = get_instances(result, data[0], max_filter_size=max_kernel_size) 51 | 52 | colors = np.array(list(islice(cycle(["#000000", '#377eb8', '#ff7f00', '#4daf4a', 53 | '#f781bf', '#a65628', '#984ea3', 54 | '#999999', '#e41a1c', '#dede00' ,'#491010']), 55 | int(max(y_km) + 1)))) 56 | #add black color for outliers (if any) 57 | colors = np.append(colors, ["#000000"]) 58 | 59 | fig2 = plt.figure() 60 | plt.scatter(data[0][:, 0], data[0][:, 1], s=10, color=colors[y_km.astype('int8')]) 61 | plt.close() 62 | 63 | return fig1, fig2 64 | 65 | iface = gr.Interface( 66 | 67 | fn=visual_clustering, 68 | 69 | inputs=[ 70 | gr.inputs.Dropdown(["blobs", "varied blobs", "aniso", "noisy moons", "noisy circles" ]), 71 | gr.inputs.Slider(1, 10, step=1, label='Number of Clusters'), 72 | gr.inputs.Slider(10000, 1000000, step=10000, label='Number of Samples'), 73 | gr.inputs.Slider(1, 100, step=1, label='Random State'), 74 | gr.inputs.Slider(1, 100, step=1, label='Denoising Filter Kernel Size'), 75 | gr.inputs.Slider(1,100, step=1, label='Max Filter Kernel Size') 76 | ], 77 | 78 | outputs=[ 79 | gr.outputs.Image(type='plot', label='Dataset'), 80 | gr.outputs.Image(type='plot', label='Clustering Result') 81 | ] 82 | ) 83 | iface.launch(share=True) -------------------------------------------------------------------------------- /segmentation/unet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import skimage.io as io 4 | import skimage.transform as trans 5 | import numpy as np 6 | from keras.models import * 7 | from keras.layers import * 8 | from keras.optimizers import * 9 | from keras.callbacks import ModelCheckpoint, LearningRateScheduler 10 | from keras import backend as keras 11 | 12 | 13 | inputs = Input((256,256,1)) 14 | conv1 = Conv2D(64, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(inputs) 15 | conv1 = Conv2D(64, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv1) 16 | pool1 = MaxPooling2D(pool_size=(2, 2))(conv1) 17 | conv2 = Conv2D(128, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(pool1) 18 | conv2 = Conv2D(128, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv2) 19 | pool2 = MaxPooling2D(pool_size=(2, 2))(conv2) 20 | conv3 = Conv2D(256, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(pool2) 21 | conv3 = Conv2D(256, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv3) 22 | pool3 = MaxPooling2D(pool_size=(2, 2))(conv3) 23 | conv4 = Conv2D(512, 3, activation = 
'relu', padding = 'same', kernel_initializer = 'he_normal')(pool3) 24 | conv4 = Conv2D(512, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv4) 25 | drop4 = Dropout(0.5)(conv4) 26 | pool4 = MaxPooling2D(pool_size=(2, 2))(drop4) 27 | 28 | conv5 = Conv2D(1024, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(pool4) 29 | conv5 = Conv2D(1024, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv5) 30 | drop5 = Dropout(0.5)(conv5) 31 | 32 | up6 = Conv2D(512, 2, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(drop5)) 33 | merge6 = concatenate([drop4,up6], axis = 3) 34 | conv6 = Conv2D(512, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(merge6) 35 | conv6 = Conv2D(512, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv6) 36 | 37 | up7 = Conv2D(256, 2, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(conv6)) 38 | merge7 = concatenate([conv3,up7], axis = 3) 39 | conv7 = Conv2D(256, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(merge7) 40 | conv7 = Conv2D(256, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv7) 41 | 42 | up8 = Conv2D(128, 2, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(conv7)) 43 | merge8 = concatenate([conv2,up8], axis = 3) 44 | conv8 = Conv2D(128, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(merge8) 45 | conv8 = Conv2D(128, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv8) 46 | 47 | up9 = Conv2D(64, 2, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(conv8)) 48 | merge9 = concatenate([conv1,up9], axis = 3) 49 | conv9 = Conv2D(64, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(merge9) 50 | conv9 = Conv2D(64, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv9) 51 | conv9 = Conv2D(2, 3, activation = 'relu', padding = 'same', kernel_initializer = 'he_normal')(conv9) 52 | conv10 = Conv2D(1, 1, activation = 'sigmoid')(conv9) 53 | 54 | model = Model(inputs, conv10) 55 | 56 | model.compile(optimizer = Adam(lr = 1e-4), loss = 'binary_crossentropy', metrics = ['accuracy']) 57 | 58 | 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Visual Clustering 2 | 3 | Clustering is a popular approach to detect patterns in unlabeled data. Existing clustering methods typically treat samples in a dataset as points in a metric space and compute distances to group together similar points. **Visual Clustering** is a different way of clustering points in 2-dimensional space, inspired by how humans *"visually"* cluster data. The algorithm is based on trained neural networks that perform instance segmentation on plotted data. 4 | 5 | 6 | For more details, see the accompanying paper: ["Clustering Plotted Data by Image Segmentation"](https://openaccess.thecvf.com/content/CVPR2022/html/Naous_Clustering_Plotted_Data_by_Image_Segmentation_CVPR_2022_paper.html), **2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)**, and please use the citation below. 
7 | 
8 | ```
9 | @inproceedings{naous2022clustering,
10 |   title={Clustering Plotted Data by Image Segmentation},
11 |   author={Naous, Tarek and Sarkar, Srinjay and Abid, Abubakar and Zou, James},
12 |   booktitle={2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
13 |   year={2022}
14 | }
15 | ```
16 | 
17 | ![algorithm](images/algorithm.png)
18 | 
19 | ## Installation
20 | 
21 | ```
22 | pip install visual-clustering
23 | ```
24 | 
25 | ## Usage
26 | 
27 | The algorithm can be used the same way as the classical clustering algorithms in scikit-learn: \
28 | you first import the class ```VisualClustering``` and create an instance of it.
29 | 
30 | ```python
31 | from visual_clustering import VisualClustering
32 | 
33 | model = VisualClustering(median_filter_size = 1, max_filter_size= 1)
34 | ```
35 | The parameters ```median_filter_size``` and ```max_filter_size``` are set to 1 by default. \
36 | You can experiment with different values to see *what works best for your dataset*!
37 | 
38 | 
39 | Let's create a simple synthetic dataset of blobs.
40 | ```python
41 | from sklearn import datasets
42 | import matplotlib.pyplot as plt
43 | data = datasets.make_blobs(n_samples=50000, centers=6, random_state=23,center_box=(-30, 30))
44 | plt.scatter(data[0][:, 0], data[0][:, 1], s=1, c='black')
45 | ```
46 | 
47 | ![blobs](images/blobs.png)
48 | 
49 | To cluster the dataset, use the ```fit``` function of the model:
50 | ```python
51 | predictions = model.fit(data[0])
52 | ```
53 | 
54 | ## Visualizing the results
55 | 
56 | You can visualize the results using matplotlib as you would normally do with classical clustering algorithms:
57 | 
58 | ```python
59 | import matplotlib.pyplot as plt
60 | from itertools import cycle, islice
61 | import numpy as np
62 | 
63 | colors = np.array(list(islice(cycle(["#000000", '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3']), int(max(predictions) + 1))))
64 | #Black color for outliers (if any)
65 | colors = np.append(colors, ["#000000"])
66 | plt.scatter(data[0][:, 0], data[0][:, 1], s=10, color=colors[predictions.astype('int8')])
67 | ```
68 | 
69 | ![clustered_blobs](images/clustered_blobs.png)
70 | 
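If your synthetic dataset comes with ground-truth labels (```make_blobs``` returns them in ```data[1]```), you can also quantify the clustering quality, for example with the adjusted Rand index from scikit-learn:

```python
from sklearn.metrics import adjusted_rand_score

#data[1] holds the ground-truth labels returned by make_blobs
score = adjusted_rand_score(data[1], predictions)
print('Adjusted Rand index: %.3f' % score)
```
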
71 | ## Colab Demo: Cluster Your Name!
72 | 
73 | Run Visual Clustering inside a colab notebook to cluster your own name! \
74 | https://colab.research.google.com/drive/1DcZXhKnUpz1GDoGaJmpS6VVNXVuaRmE5?usp=sharing
75 | 
76 | Name Plot:
77 | 
78 | ![tarek-unclustered](images/tarek-unclustered.png)
79 | 
80 | Name clustered via Visual Clustering:
81 | 
82 | ![tarek-clustered](images/tarek-clustered.png)
83 | 
84 | 
85 | ## Dependencies
86 | Make sure that you have the following libraries installed:
87 | ```
88 | transformers 4.15.0
89 | scipy 1.4.1
90 | tensorflow 2.7.0
91 | keras 2.7.0
92 | numpy 1.19.5
93 | opencv-python (cv2) 4.1.2
94 | scikit-image (skimage) 0.18.3
95 | ```
96 | ## Contact
97 | **Tarek Naous**: [Scholar](https://scholar.google.com/citations?user=ImyLv44AAAAJ&hl=en) | [Github](https://github.com/tareknaous?tab=repositories) |
98 | [Linkedin](https://www.linkedin.com/in/tareknaous/) | [Research Gate](https://www.researchgate.net/profile/Tarek_Naous?ev=hdr_xprf) | [Personal Website](https://tareknaous.github.io/)
99 | | tareknaous@gmail.com
100 | 
--------------------------------------------------------------------------------
/src/visual_clustering/VisualClustering.py:
--------------------------------------------------------------------------------
1 | import math
2 | import cv2
3 | import numpy as np
4 | import tensorflow as tf
5 | from tensorflow.keras.preprocessing import image
6 | from scipy import ndimage
7 | from matplotlib import pyplot as plt
8 | from skimage import measure, color, io
9 | from huggingface_hub.keras_mixin import from_pretrained_keras
10 | 
11 | 
12 | class VisualClustering:
13 |     def __init__(self, max_filter_size = 1, median_filter_size = 1):
14 |         """Load the pre-trained U-Net from the Hugging Face Hub and store the filter sizes.
15 |         """
16 |         self.unet = from_pretrained_keras("tareknaous/unet-visual-clustering")
17 |         self.max_filter_size = max_filter_size
18 |         self.median_filter_size = median_filter_size
19 | 
20 |     def predict_sample(self, image):
21 |         """Run inference using the U-Net model and return the result
22 | 
23 |         Args:
24 |             image (numpy.ndarray (256, 256, 1)): input image representing the plotted 2D dataset
25 | 
26 |         Returns:
27 |             result (numpy.ndarray (256, 256, 1)): predicted binary segmentation mask
28 | 
29 |         """
30 |         prediction = self.unet.predict(image[tf.newaxis, ...])
31 |         prediction[prediction > 0.5] = 1
32 |         prediction[prediction != 1] = 0
33 |         result = prediction[0]*255
34 |         return result
35 | 
36 |     def create_input_image(self, data):
37 |         #Initialize input matrix
38 |         input = np.ones((256,256))
39 |         #Fill matrix with data point values (points falling outside the 256x256 canvas are clipped to the border)
40 |         for i in range(0,len(data)):
41 |             if math.floor(data[i][0]) < 256 and math.floor(data[i][1]) < 256:
42 |                 input[math.floor(data[i][0])][math.floor(data[i][1])] = 0
43 |             elif math.floor(data[i][0]) >= 256:
44 |                 input[255][math.floor(data[i][1])] = 0
45 |             elif math.floor(data[i][1]) >= 256:
46 |                 input[math.floor(data[i][0])][255] = 0
47 | 
48 |         return input
49 | 
50 |     def denoise_input(self, image):
51 |         denoised = ndimage.median_filter(image, size=self.median_filter_size)
52 |         return denoised
53 | 
54 |     def linear_shifting(self, data):
55 |         max_x = max(data[:, 0])
56 |         min_x = min(data[:, 0])
57 |         new_max = 256
58 |         new_min = 0
59 | 
60 |         data[:, 0] = (((data[:, 0] - min_x)*(new_max-new_min))/(max_x-min_x))+ new_min
61 | 
62 |         max_y = max(data[:, 1])
63 |         min_y = min(data[:, 1])
64 |         new_max_y = 256
65 |         new_min_y = 0
66 | 
67 |         data[:, 1] = (((data[:, 1] - min_y)*(new_max_y-new_min_y))/(max_y-min_y))+ new_min_y
68 | 
69 |         return data
70 | 
71 |     def get_instances(self, prediction, data):
72 |         #Adjust format (clusters to be 255 and rest is 0)
73 |         prediction[prediction == 255] = 3
74 |         prediction[prediction == 0] = 4
75 |         prediction[prediction == 3] = 0
76 |         prediction[prediction == 4] = 255
77 | 
78 |         #Convert to 8-bit image
79 |         prediction = image.img_to_array(prediction, dtype='uint8')
80 | 
81 |         #Get 1 color channel
82 |         cells = prediction[:,:,0]
83 |         #Threshold
84 |         ret1, thresh = cv2.threshold(cells, 0, 255, cv2.THRESH_BINARY)
85 |         #Filter to remove noise
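        #Morphological opening (an erosion followed by a dilation) removes isolated
        #foreground pixels that survive thresholding, so that stray specks do not
        #seed spurious markers in the watershed step below.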
86 |         kernel = np.ones((3,3),np.uint8)
87 |         opening = cv2.morphologyEx(thresh,cv2.MORPH_OPEN,kernel, iterations = 2)
88 | 
89 |         #Obtain background
90 |         background = cv2.dilate(opening,kernel,iterations=5)
91 |         dist_transform = cv2.distanceTransform(opening,cv2.DIST_L2,5)
92 |         ret2, foreground = cv2.threshold(dist_transform,0.04*dist_transform.max(),255,0)
93 |         foreground = np.uint8(foreground)
94 |         unknown = cv2.subtract(background,foreground)
95 | 
96 |         #Connected Component Analysis
97 |         ret3, markers = cv2.connectedComponents(foreground)
98 |         markers = markers+10
99 |         markers[unknown==255] = 0
100 | 
101 |         #Watershed
102 |         img = cv2.merge((prediction,prediction,prediction))
103 |         markers = cv2.watershed(img,markers)
104 |         img[markers == -1] = [0,255,255]
105 | 
106 |         #Maximum filtering
107 |         markers = ndimage.maximum_filter(markers, size=self.max_filter_size)
108 | 
109 |         #Get regions
110 |         regions = measure.regionprops(markers, intensity_image=cells)
111 | 
112 |         #Get Cluster IDs (Cluster Assignment)
113 |         cluster_ids = np.zeros(len(data))
114 | 
115 |         for i in range(0,len(cluster_ids)):
116 |             row = math.floor(data[i][0])
117 |             column = math.floor(data[i][1])
118 |             if row < 256 and column < 256:
119 |                 cluster_ids[i] = markers[row][column] - 10
120 |             elif row >= 256:
121 |                 # cluster_ids[i] = markers[255][column]
122 |                 cluster_ids[i] = 0
123 |             elif column >= 256:
124 |                 # cluster_ids[i] = markers[row][255]
125 |                 cluster_ids[i] = 0
126 | 
127 |         cluster_ids = cluster_ids.astype('int8')
128 |         cluster_ids[cluster_ids == -11] = 0
129 | 
130 |         return cluster_ids
131 | 
132 |     def fit(self, data):
133 |         data = self.linear_shifting(data)
134 |         input = self.create_input_image(data)
135 |         if self.median_filter_size == 1:
136 |             result = self.predict_sample(input)
137 |             labels = self.get_instances(result, data)
138 |         else:
139 |             denoised_input = self.denoise_input(input)
140 |             result = self.predict_sample(denoised_input)
141 |             labels = self.get_instances(result, data)
142 |         return labels
--------------------------------------------------------------------------------
/examples/runtime.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from sklearn.cluster import KMeans
5 | from sklearn import cluster, datasets, mixture
6 | from sklearn.neighbors import kneighbors_graph
7 | 
8 | #NOTE: create_input_image, predict_sample, and get_instances (see
9 | #segmentation/utils.py) must be in scope before running this script
10 | 
11 | 
12 | NUM_CLUSTERS = 10
13 | CLUSTER_STD = 4 * np.ones(NUM_CLUSTERS)
14 | 
15 | # samples = [1000, 5000, 10000, 50000, 100000, 500000, 1000000, 1500000, 2000000]
16 | samples = [1000, 5000, 10000, 50000]
17 | 
18 | 
19 | time_visual = []
20 | time_kmeans = []
21 | time_dbscan = []
22 | time_affinity = []
23 | time_spectral = []
24 | time_optics = []
25 | time_gmm = []
26 | time_ward = []
27 | time_ms = []
28 | time_birch = []
29 | time_agglo = []
30 | 
31 | 
32 | 
33 | for i in samples:
34 |     data = datasets.make_blobs(n_samples=i, centers=NUM_CLUSTERS, random_state=151,center_box=(0, 256), cluster_std=CLUSTER_STD)
35 | 
36 |     #Compute Visual
37 |     start = time.time()
38 |     input = create_input_image(data)
39 |     result = predict_sample(input)
40 |     y_km = get_instances(result, data[0])  #get_instances expects the points array, not the (points, labels) tuple
41 |     end = time.time()
42 | 
43 |     time_visual.append(end-start)
44 | 
45 |     #Compute Kmeans
46 |     km = KMeans(
47 |         n_clusters=10, init='random',
48 |         n_init=10, max_iter=300,
49 |         tol=1e-04, random_state=0
50 |     )
51 | 
52 |     start = time.time()
53 |     y_km = km.fit_predict(data[0])
54 |     end = time.time()
55 |     time_kmeans.append(end - 
start)
56 | 
57 |     #Compute dbscan
58 |     dbscan = cluster.DBSCAN(eps=0.15)
59 | 
60 |     start = time.time()
61 |     y_km = dbscan.fit_predict(data[0])
62 |     end = time.time()
63 |     time_dbscan.append(end - start)
64 | 
65 |     #Compute Affinity Propagation
66 |     # affinity_propagation = cluster.AffinityPropagation(damping=0.77, preference=-240)
67 | 
68 |     # start = time.time()
69 |     # y_km = affinity_propagation.fit_predict(data[0])
70 |     # end = time.time()
71 |     # time_affinity.append(end - start)
72 | 
73 |     #Compute Spectral
74 |     spectral = cluster.SpectralClustering(n_clusters=10, eigen_solver='arpack', affinity="nearest_neighbors")
75 |     start = time.time()
76 |     y_km = spectral.fit_predict(data[0])
77 |     end = time.time()
78 |     time_spectral.append(end - start)
79 | 
80 |     #Compute OPTICS
81 |     # optics = cluster.OPTICS(min_samples=20, xi=0.05, min_cluster_size=0.2)
82 |     # start = time.time()
83 |     # y_km = optics.fit_predict(data[0])
84 |     # end = time.time()
85 |     # time_optics.append(end - start)
86 | 
87 |     #Compute GMM
88 |     gmm = mixture.GaussianMixture(n_components=10, covariance_type='full')
89 |     start = time.time()
90 |     y_km = gmm.fit_predict(data[0])
91 |     end = time.time()
92 |     time_gmm.append(end - start)
93 | 
94 |     #Ward
95 |     # start = time.time()
96 |     # # connectivity matrix for structured Ward
97 |     # connectivity = kneighbors_graph(data[0], n_neighbors=10, include_self=False)
98 |     # # make connectivity symmetric
99 |     # connectivity = 0.5 * (connectivity + connectivity.T)
100 |     # ward = cluster.AgglomerativeClustering(n_clusters=10, linkage='ward', connectivity=connectivity)
101 |     # y_km = ward.fit_predict(data[0])
102 |     # end = time.time()
103 |     # time_ward.append(end - start)
104 | 
105 |     #Mean Shift
106 |     # estimate bandwidth for mean shift
107 |     # start = time.time()
108 |     # bandwidth = cluster.estimate_bandwidth(data[0], quantile=0.2)
109 |     # ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
110 |     # y_km = ms.fit_predict(data[0])
111 |     # end = time.time()
112 |     # time_ms.append(end - start)
113 | 
114 |     #Birch
115 |     # birch = cluster.Birch(n_clusters=10)
116 |     # start = time.time()
117 |     # y_km = birch.fit_predict(data[0])
118 |     # end = time.time()
119 |     # time_birch.append(end - start)
120 | 
121 |     #Agglomerative
122 |     connectivity = kneighbors_graph(data[0], n_neighbors=10, include_self=False)
123 |     # make connectivity symmetric
124 |     connectivity = 0.5 * (connectivity + connectivity.T)
125 |     average_linkage = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock",n_clusters=10, connectivity=connectivity)
126 |     start = time.time()
127 |     y_km = average_linkage.fit_predict(data[0])
128 |     end = time.time()
129 |     time_agglo.append(end - start)
130 | 
131 | plt.plot(samples, time_visual, 'r', marker='o', linewidth=2)
132 | plt.plot(samples, time_kmeans, 'b', marker='o', linewidth=2)
133 | plt.plot(samples, time_dbscan, 'g', marker='o', linewidth=2)
134 | plt.plot(samples, time_spectral, 'm', marker='o', linewidth=2)
135 | plt.plot(samples, time_gmm, 'y', marker='o', linewidth=2)
136 | plt.plot(samples, time_agglo, 'c', marker='o', linewidth=2)
137 | 
138 | 
139 | 
140 | 
141 | plt.legend(['Our Method', 'K-Means', 'DBSCAN', 'Spectral Clustering', 'Gaussian Mixture', 'Agglomerative Clustering'], fontsize=11)
142 | plt.grid(linestyle='-', linewidth=0.5)
143 | plt.ylabel('Time (sec)', fontsize=16)
144 | plt.xlabel('Samples', fontsize=16)
145 | plt.title("Time vs Number of Samples for 10 blobs")
146 | 
147 | 
148 | 
149 | 
150 | 
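#Optional (not in the original script): display and save the comparison; with
#sample counts spanning orders of magnitude, log-scaled axes make the growth
#rates easier to compare.
plt.xscale('log')
plt.yscale('log')
plt.savefig('runtime_comparison.png', dpi=300)
plt.show()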
-------------------------------------------------------------------------------- /segmentation/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | from scipy import ndimage 5 | from skimage import measure, color, io 6 | from tensorflow.keras.preprocessing import image 7 | import math 8 | from scipy.spatial import ConvexHull 9 | from shapely.geometry import Polygon 10 | 11 | 12 | 13 | #Function that predicts on only 1 sample 14 | def predict_sample(image): 15 | prediction = model.predict(image[tf.newaxis, ...]) 16 | prediction[prediction > 0.5 ] = 1 17 | prediction[prediction !=1] = 0 18 | result = prediction[0]*255 19 | return result 20 | 21 | 22 | #Function that creates the matrix that will be used as input to the binary segmentation model 23 | def create_input_image(data, visualize=False): 24 | #Initialize input matrix 25 | input = np.ones((256,256)) 26 | 27 | #Fill matrix with data point values 28 | for i in range(0,len(data[0])): 29 | if math.floor(data[0][i][0]) < 256 and math.floor(data[0][i][1]) < 256: 30 | input[math.floor(data[0][i][0])][math.floor(data[0][i][1])] = 0 31 | elif math.floor(data[0][i][0]) >= 256: 32 | input[255][math.floor(data[0][i][1])] = 0 33 | elif math.floor(data[0][i][1]) >= 256: 34 | input[math.floor(data[0][i][0])][255] = 0 35 | 36 | #Visualize 37 | if visualize == True: 38 | plt.imshow(input.T, cmap='gray') 39 | plt.gca().invert_yaxis() 40 | 41 | return input 42 | 43 | 44 | import cv2 45 | import numpy as np 46 | from matplotlib import pyplot as plt 47 | from scipy import ndimage 48 | from skimage import measure, color, io 49 | from tensorflow.keras.preprocessing import image 50 | from scipy import ndimage 51 | 52 | 53 | #Function that performs instance segmentation and clusters the dataset 54 | def get_instances(prediction, data, max_filter_size=1): 55 | #Adjust format (clusters to be 255 and rest is 0) 56 | prediction[prediction == 255] = 3 57 | prediction[prediction == 0] = 4 58 | prediction[prediction == 3] = 0 59 | prediction[prediction == 4] = 255 60 | 61 | #Convert to 8-bit image 62 | prediction = image.img_to_array(prediction, dtype='uint8') 63 | 64 | #Get 1 color channel 65 | cells=prediction[:,:,0] 66 | #Threshold 67 | ret1, thresh = cv2.threshold(cells, 0, 255, cv2.THRESH_BINARY) 68 | #Filter to remove noise 69 | kernel = np.ones((3,3),np.uint8) 70 | opening = cv2.morphologyEx(thresh,cv2.MORPH_OPEN,kernel, iterations = 2) 71 | 72 | #Get the background 73 | background = cv2.dilate(opening,kernel,iterations=5) 74 | dist_transform = cv2.distanceTransform(opening,cv2.DIST_L2,5) 75 | ret2, foreground = cv2.threshold(dist_transform,0.04*dist_transform.max(),255,0) 76 | foreground = np.uint8(foreground) 77 | unknown = cv2.subtract(background,foreground) 78 | 79 | #Connected Component Analysis 80 | ret3, markers = cv2.connectedComponents(foreground) 81 | markers = markers+10 82 | markers[unknown==255] = 0 83 | 84 | #Watershed 85 | img = cv2.merge((prediction,prediction,prediction)) 86 | markers = cv2.watershed(img,markers) 87 | img[markers == -1] = [0,255,255] 88 | 89 | #Maximum filtering 90 | markers = ndimage.maximum_filter(markers, size=max_filter_size) 91 | # plt.imshow(markers.T, cmap='gray') 92 | # plt.gca().invert_yaxis() 93 | 94 | #Get an RGB colored image 95 | img2 = color.label2rgb(markers, bg_label=1) 96 | # plt.imshow(img2) 97 | # plt.gca().invert_yaxis() 98 | 99 | #Get regions 100 | regions = measure.regionprops(markers, 
intensity_image=cells)
101 | 
102 |     #Get Cluster IDs
103 |     cluster_ids = np.zeros(len(data))
104 | 
105 |     for i in range(0,len(cluster_ids)):
106 |         row = math.floor(data[i][0])
107 |         column = math.floor(data[i][1])
108 |         if row < 256 and column < 256:
109 |             cluster_ids[i] = markers[row][column] - 10
110 |         elif row >= 256:
111 |             # cluster_ids[i] = markers[255][column]
112 |             cluster_ids[i] = 0
113 |         elif column >= 256:
114 |             # cluster_ids[i] = markers[row][255]
115 |             cluster_ids[i] = 0
116 | 
117 |     cluster_ids = cluster_ids.astype('int8')
118 |     cluster_ids[cluster_ids == -11] = 0
119 | 
120 |     return cluster_ids
121 | 
122 | 
123 | 
124 | def draw_clusters(regions,data):
125 |     for i in range(1,len(regions)):
126 |         #Get the coordinates of the region
127 |         coordinates = regions[i].coords
128 |         #Compute the convex hull
129 |         hull = ConvexHull(coordinates)
130 |         #Get the indexes of the vertices
131 |         vertices_ids = hull.vertices
132 |         #Append real values of the vertices
133 |         hull_vertices = []
134 |         for j in range(0,len(vertices_ids)):
135 |             hull_vertices.append(coordinates[vertices_ids[j]])
136 |         #Create and plot polygon of cluster
137 |         polygon = Polygon(hull_vertices)
138 |         x,y = polygon.exterior.xy
139 |         plt.plot(x,y)
140 | 
141 |     #Overlay the data points on the image
142 |     plt.scatter(data[0][:, 0], data[0][:, 1], s=1, c='black')
143 | 
144 | 
145 | def visual_clustering(data):
146 |     input = create_input_image(data)
147 |     result = predict_sample(input)
148 |     #get_instances expects the points array and returns per-point cluster IDs
149 |     #(not regionprops regions), so plot the assignments directly
150 |     cluster_ids = get_instances(result, data[0])
151 |     plt.scatter(data[0][:, 0], data[0][:, 1], s=1, c=cluster_ids)
152 |     return cluster_ids
--------------------------------------------------------------------------------
/src/visual_clustering/__init__.py:
--------------------------------------------------------------------------------
1 | import math
2 | import cv2
3 | import numpy as np
4 | import tensorflow as tf
5 | from tensorflow.keras.preprocessing import image
6 | from scipy import ndimage
7 | from matplotlib import pyplot as plt
8 | from skimage import measure, color, io
9 | from huggingface_hub.keras_mixin import from_pretrained_keras
10 | 
11 | 
12 | class VisualClustering:
13 |     def __init__(self, max_filter_size = 1, median_filter_size = 1):
14 |         """self (object): containing the loaded pre-trained U-Net from the Huggingface hub
15 |         """
16 |         self.unet = from_pretrained_keras("tareknaous/unet-visual-clustering")
17 |         self.max_filter_size = max_filter_size
18 |         self.median_filter_size = median_filter_size
19 | 
20 |     def predict_sample(self, image):
21 |         """Run inference using the U-Net model and return result
22 | 
23 |         Args:
24 |             image (numpy.ndarray (256, 256, 1)): input image representing plotted 2D dataset
25 | 
26 |         Returns:
27 |             result (numpy.ndarray (256, 256, 1)): predicted binary segmentation mask
28 | 
29 |         """
30 |         prediction = self.unet.predict(image[tf.newaxis, ...])
31 |         prediction[prediction > 0.5] = 1
32 |         prediction[prediction != 1] = 0
33 |         result = prediction[0]*255
34 |         return result
35 | 
36 |     def create_input_image(self, data):
37 |         #Initialize input matrix
38 |         input = np.ones((256,256))
39 |         #Fill matrix with data point values
40 |         for i in range(0,len(data)):
41 |             if math.floor(data[i][0]) < 256 and math.floor(data[i][1]) < 256:
42 |                 input[math.floor(data[i][0])][math.floor(data[i][1])] = 0
43 |             elif math.floor(data[i][0]) >= 256:
44 |                 input[255][math.floor(data[i][1])] = 0
45 |             elif math.floor(data[i][1]) >= 256:
46 |                 input[math.floor(data[i][0])][255] = 0
47 | 
48 |         return input
49 | 
50 |     def denoise_input(self, image):
51 |         denoised = ndimage.median_filter(image, size=self.median_filter_size)
52 |         return denoised
53 | 
54 |     def linear_shifting(self, data):
55 |         max_x = max(data[:, 0])
56 |         min_x = min(data[:, 0])
57 |         new_max = 256
58 |         new_min = 0
59 | 
60 |         data[:, 0] = (((data[:, 0] - min_x)*(new_max-new_min))/(max_x-min_x))+ new_min
61 | 
62 |         max_y = max(data[:, 1])
63 |         min_y = min(data[:, 1])
64 |         new_max_y = 256
65 |         new_min_y = 0
66 | 
67 |         data[:, 1] = (((data[:, 1] - min_y)*(new_max_y-new_min_y))/(max_y-min_y))+ new_min_y
68 | 
69 |         return data
70 | 
71 |     def get_instances(self, prediction, data):
72 |         #Adjust format (clusters to be 255 and rest is 0)
73 |         prediction[prediction == 255] = 3
74 |         prediction[prediction == 0] = 4
75 |         prediction[prediction == 3] = 0
76 |         prediction[prediction == 4] = 255
77 | 
78 |         #Convert to 8-bit image
79 |         prediction = image.img_to_array(prediction, dtype='uint8')
80 | 
81 |         #Get 1 color channel
82 |         cells = prediction[:,:,0]
83 |         #Threshold
84 |         ret1, thresh = cv2.threshold(cells, 0, 255, cv2.THRESH_BINARY)
85 |         #Filter to remove noise
86 |         kernel = np.ones((3,3),np.uint8)
87 |         opening = cv2.morphologyEx(thresh,cv2.MORPH_OPEN,kernel, iterations = 2)
88 | 
89 |         #Obtain background
90 |         background = cv2.dilate(opening,kernel,iterations=5)
91 |         dist_transform = cv2.distanceTransform(opening,cv2.DIST_L2,5)
92 |         ret2, foreground = cv2.threshold(dist_transform,0.04*dist_transform.max(),255,0)
93 |         foreground = np.uint8(foreground)
94 |         unknown = cv2.subtract(background,foreground)
95 | 
96 |         #Connected Component Analysis
97 |         ret3, markers = cv2.connectedComponents(foreground)
98 |         markers = markers+10
99 |         markers[unknown==255] = 0
100 | 
101 |         #Watershed
102 |         img = cv2.merge((prediction,prediction,prediction))
103 |         markers = cv2.watershed(img,markers)
104 |         img[markers == -1] = [0,255,255]
105 | 
106 |         #Maximum filtering
107 |         markers = ndimage.maximum_filter(markers, size=self.max_filter_size)
108 | 
109 |         #Get regions
110 |         regions = measure.regionprops(markers, intensity_image=cells)
111 | 
112 |         #Get Cluster IDs (Cluster Assignment)
113 |         cluster_ids = np.zeros(len(data))
114 | 
115 |         for i in range(0,len(cluster_ids)):
116 |             row = math.floor(data[i][0])
117 |             column = math.floor(data[i][1])
118 |             if row < 256 and column < 256:
119 |                 cluster_ids[i] = markers[row][column] - 10
120 |             elif row >= 256:
121 |                 # cluster_ids[i] = markers[255][column]
122 |                 cluster_ids[i] = 0
123 |             elif column >= 256:
124 |                 # cluster_ids[i] = markers[row][255]
125 |                 cluster_ids[i] = 0
126 | 
127 |         cluster_ids = cluster_ids.astype('int8')
128 |         cluster_ids[cluster_ids == -11] = 0
129 | 
130 |         return cluster_ids
131 | 
132 |     def fit(self, data):
133 |         data = self.linear_shifting(data)
134 |         input = self.create_input_image(data)
135 |         if self.median_filter_size == 1:
136 |             result = self.predict_sample(input)
137 |             labels = self.get_instances(result, data)
138 |         else:
139 |             denoised_input = self.denoise_input(input)
140 |             result = self.predict_sample(denoised_input)
141 |             labels = self.get_instances(result, data)
142 |         return labels
143 | 
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/dataset/utils/functions.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn import datasets
4 | import alphashape
5 | from scipy.spatial import ConvexHull, convex_hull_plot_2d
6 | from shapely.ops import cascaded_union
7 | from shapely.geometry import Polygon
8 | 
9 | def find_intersections(polygons):
10 |     '''Decides which clusters should be united based on their percentage of intersection.
11 |     Records in the dictionary whether each intersection should be united or subtracted.'''
12 |     #Percentage threshold for uniting polygons
13 |     THRESHOLD = 30
14 |     #create empty dictionary
15 |     intersections = dict()
16 | 
17 |     #create keys in dictionary
18 |     for i in range(0,len(polygons)):
19 |         key = i
20 |         intersections[key] = []
21 | 
22 |     #Add intersections in the dictionary based on the percentage criterion
23 |     for i in range(0,len(polygons)):
24 |         for j in range(i+1,len(polygons)):
25 |             intersection_percentage = []
26 |             intersection_percentage.append((polygons[i].intersection(polygons[j]).area)/polygons[i].area*100)
27 |             intersection_percentage.append((polygons[i].intersection(polygons[j]).area)/polygons[j].area*100)
28 | 
29 |             if polygons[i].intersects(polygons[j]):
30 |                 if intersection_percentage[0] >= THRESHOLD or intersection_percentage[1] >= THRESHOLD:
31 |                     key = i
32 |                     value = [j, 'union']
33 |                     intersections[key].append(value)
34 |                 else:
35 |                     key = i
36 |                     value = [j, 'subtraction']
37 |                     intersections[key].append(value)
38 | 
39 |     return intersections
40 | 
41 | 
42 | 
43 | 
44 | def return_unique_polygons(intersections):
45 |     '''Updated with the union and subtraction criteria.'''
46 |     remove = [] #used to store index of keys to remove
47 | 
48 |     #check which keys in the dictionary will need to be removed
49 |     for key in intersections:
50 |         for value in intersections[key]:
51 |             if value[0] in intersections:
52 |                 remove.append(value[0])
53 | 
54 |     #remove key from dictionary
55 |     for i in range(0,len(remove)):
56 |         #Catch the exception raised when trying to remove a key that was already removed
57 |         try:
58 |             intersections.pop(remove[i])
59 |         except KeyError:
60 |             continue
61 | 
62 |     return intersections
63 | 
64 | 
65 | 
66 | 
67 | def plot_new_polygons(unique_dictionary, polygons):
68 |     '''Subtracts polygons with intersection % below the threshold, and combines polygons with intersection % above the threshold.'''
69 | 
70 |     mask_polygons = []
71 | 
72 |     #Variable to decide whether to perform subtraction in case we have 3 or more intersecting polygons
73 |     need_subtract = False
74 | 
75 |     for key in unique_dictionary:
76 |         need_subtract = False
77 |         #check if the key is empty (has no values)
78 |         if not unique_dictionary[key]:
79 |             #plot the polygon with no intersections
80 |             x,y = polygons[key].exterior.xy
81 |             # plt.plot(x,y)
82 |             mask_polygons.append(polygons[key])
83 | 
84 |         else:
85 |             #create an array to add the polygons to be merged
86 |             combination_merge = []
87 |             #add the polygon of the key itself
88 |             combination_merge.append(polygons[key])
89 |             #create an array to add the polygons to be subtracted, in case there is any
90 |             combination_substract = []
91 | 
92 |             for value in unique_dictionary[key]:
93 |                 if value[1] == 'union':
94 |                     combination_merge.append(polygons[value[0]])
95 | 
96 |                 elif value[1] == 'subtraction':
97 |                     combination_substract.append(polygons[value[0]])
98 |                     need_subtract = True
99 | 
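            #At this point combination_merge holds the key polygon plus every
            #neighbor whose overlap met the threshold, while combination_substract
            #holds the weakly-overlapping neighbors to be carved out of the union.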
100 |             #merge the polygons
101 |             merged = cascaded_union(combination_merge)
102 | 
103 |             #If no need to subtract, then just plot the merged polygons
104 |             if not need_subtract:
105 |                 x,y = merged.exterior.xy
106 |                 # plt.plot(x,y)
107 |                 mask_polygons.append(merged)
108 | 
109 |             else:
110 |                 #subtract the ones to be subtracted from the merged ones
111 |                 subtracted = []
112 |                 for i in range(0,len(combination_substract)):
113 |                     subtracted.append(merged.symmetric_difference(combination_substract[i]))
114 |                     for j in range(0,len(subtracted[i])):
115 |                         x,y = subtracted[i][j].exterior.xy
116 |                         # plt.plot(x,y)
117 |                         mask_polygons.append(subtracted[i][j])
118 | 
119 |     return mask_polygons
120 | 
121 | 
122 | 
123 | def create_mask(polygons):
124 |     for i in range(0,len(polygons)):
125 |         x, y = polygons[i].exterior.xy
126 |         plt.fill(x,y, "black")
127 |     plt.axis('off')
128 | 
129 | 
130 | 
131 | def create_polygons(type, num_samples, num_clusters, random_state, *cluster_std, keep_points=False):
132 |     if type == 'blobs': # works fine
133 |         data = datasets.make_blobs(n_samples=num_samples, centers=num_clusters, random_state=random_state,
134 |                                    center_box=(-30, 30))
135 | 
136 |     elif type == 'aniso': # works fine
137 |         X, y = datasets.make_blobs(n_samples=num_samples, centers=num_clusters, random_state=random_state, center_box=(-30, 30))
138 |         transformation = [[0.6, -0.6], [-0.4, 0.8]]
139 |         X_aniso = np.dot(X, transformation)
140 |         data = (X_aniso, y)
141 | 
142 |     elif type == 'noisy_moons': # works fine
143 |         data = datasets.make_moons(n_samples=num_samples, noise=.05)
144 |         if num_clusters != 2:
145 |             raise Exception("Can only take 2 clusters for noisy_moons")
146 | 
147 |     elif type == 'noisy_circles': # works fine
148 |         data = datasets.make_circles(n_samples=num_samples, factor=.01, noise=.2)
149 |         if num_clusters != 2:
150 |             raise Exception("Can only take 2 clusters for noisy_circles")
151 | 
152 |     elif type == 'varied_blobs': # works fine
153 |         cluster_std = 1.5 * np.random.random(num_clusters)
154 |         data = datasets.make_blobs(n_samples=num_samples,
155 |                                    centers=num_clusters,
156 |                                    cluster_std=cluster_std,
157 |                                    random_state=random_state,
158 |                                    center_box=(-30, 30))
159 |     if keep_points:
160 |         plt.figure()
161 |         plt.scatter(data[0][:, 0], data[0][:, 1], s=1, c='black')
162 |         plt.axis('off')
163 | 
164 |     # Create a list of empty arrays for each cluster
165 |     clusters = [[] for _ in range(num_clusters)]
166 | 
167 |     # Check each point to which cluster it belongs and append to the list accordingly
168 |     for i in range(0, len(data[0])):
169 |         cluster_index = data[1][i]
170 |         clusters[cluster_index].append(data[0][i])
171 | 
172 |     # Create empty arrays for convex hulls and data points
173 |     hulls = [[] for _ in range(num_clusters)]
174 |     points = [[] for _ in range(num_clusters)]
175 |     hulls_vertices = [[] for _ in range(num_clusters)]
176 | 
177 |     # Use the Concave Hull for the noisy moons shape
178 |     if type == "noisy_moons":
179 |         ALPHA = 5
180 |         for i in range(0, len(clusters)):
181 |             hull = alphashape.alphashape(np.array(clusters[i]), ALPHA)
182 |             hull_pts = hull.exterior.coords.xy
183 |             hulls[i] = hull_pts
184 | 
185 |         # Append vertices
186 |         for i in range(0, len(hulls)):
187 |             for j in range(0, len(hulls[i][0])):
188 |                 vertex = [hulls[i][0][j], hulls[i][1][j]]
189 |                 hulls_vertices[i].append(vertex)
190 | 
191 | 
192 |     # Use the ConvexHull for all other shapes
193 |     else:
194 |         # Append the hulls
195 |         for i in range(0, len(clusters)):
196 |             hulls[i] = 
ConvexHull(clusters[i]) 197 | 198 | # Append vertices of the hulls 199 | for i in range(0, len(hulls)): 200 | for j in range(0, len(hulls[i].vertices)): 201 | hulls_vertices[i].append(clusters[i][hulls[i].vertices[j]]) 202 | 203 | # Create empty array to append the polygons 204 | polygons = [] 205 | 206 | # Create polygons from hull vertices 207 | for i in range(0, len(hulls_vertices)): 208 | polygon = Polygon(np.array(hulls_vertices[i])) 209 | polygons.append(polygon) 210 | 211 | return polygons -------------------------------------------------------------------------------- /examples/comparisons.py: -------------------------------------------------------------------------------- 1 | print(__doc__) 2 | 3 | import time 4 | import warnings 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | from sklearn import cluster, datasets, mixture 10 | from sklearn.neighbors import kneighbors_graph 11 | from sklearn.preprocessing import StandardScaler 12 | from itertools import cycle, islice 13 | 14 | np.random.seed(0) 15 | 16 | # ============ 17 | # Generate datasets. We choose the size big enough to see the scalability 18 | # of the algorithms, but not too big to avoid too long running times 19 | # ============ 20 | n_samples = 2000 21 | noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, 22 | noise=.05) 23 | noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05) 24 | blobs = datasets.make_blobs(n_samples=n_samples, random_state=8) 25 | no_structure = np.random.rand(n_samples, 2), None 26 | 27 | # Anisotropicly distributed data 28 | random_state = 170 29 | X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state) 30 | transformation = [[0.6, -0.6], [-0.4, 0.8]] 31 | X_aniso = np.dot(X, transformation) 32 | aniso = (X_aniso, y) 33 | 34 | # blobs with varied variances 35 | varied = datasets.make_blobs(n_samples=n_samples, 36 | cluster_std=[1.0, 2.5, 0.5], 37 | random_state=random_state) 38 | 39 | # ============ 40 | # Set up cluster parameters 41 | # ============ 42 | plt.figure(figsize=(9 * 2 + 3, 13)) 43 | plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.95, wspace=.05, 44 | hspace=.01) 45 | 46 | plot_num = 1 47 | 48 | default_base = {'quantile': .3, 49 | 'eps': .3, 50 | 'damping': .9, 51 | 'preference': -200, 52 | 'n_neighbors': 10, 53 | 'n_clusters': 3, 54 | 'min_samples': 20, 55 | 'xi': 0.05, 56 | 'min_cluster_size': 0.1} 57 | 58 | datasets = [ 59 | (noisy_circles, {'damping': .77, 'preference': -240, 60 | 'quantile': .2, 'n_clusters': 2, 61 | 'min_samples': 20, 'xi': 0.25}), 62 | (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}), 63 | (varied, {'eps': .18, 'n_neighbors': 2, 64 | 'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2}), 65 | (aniso, {'eps': .15, 'n_neighbors': 2, 66 | 'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2}), 67 | (blobs, {}), 68 | (no_structure, {})] 69 | 70 | for i_dataset, (dataset, algo_params) in enumerate(datasets): 71 | # update parameters with dataset-specific values 72 | params = default_base.copy() 73 | params.update(algo_params) 74 | 75 | X, y = dataset 76 | 77 | # normalize dataset for easier parameter selection 78 | X = StandardScaler().fit_transform(X) 79 | 80 | # estimate bandwidth for mean shift 81 | bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile']) 82 | 83 | # connectivity matrix for structured Ward 84 | connectivity = kneighbors_graph( 85 | X, n_neighbors=params['n_neighbors'], include_self=False) 86 | # make connectivity symmetric 87 | 
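#kneighbors_graph returns a directed adjacency matrix; averaging it with its
#transpose makes the connectivity symmetric, as the agglomerative algorithms
#expect.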
connectivity = 0.5 * (connectivity + connectivity.T) 88 | 89 | # ============ 90 | # Create cluster objects 91 | # ============ 92 | ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) 93 | two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) 94 | ward = cluster.AgglomerativeClustering( 95 | n_clusters=params['n_clusters'], linkage='ward', 96 | connectivity=connectivity) 97 | spectral = cluster.SpectralClustering( 98 | n_clusters=params['n_clusters'], eigen_solver='arpack', 99 | affinity="nearest_neighbors") 100 | dbscan = cluster.DBSCAN(eps=params['eps']) 101 | optics = cluster.OPTICS(min_samples=params['min_samples'], 102 | xi=params['xi'], 103 | min_cluster_size=params['min_cluster_size']) 104 | affinity_propagation = cluster.AffinityPropagation( 105 | damping=params['damping'], preference=params['preference']) 106 | average_linkage = cluster.AgglomerativeClustering( 107 | linkage="average", affinity="cityblock", 108 | n_clusters=params['n_clusters'], connectivity=connectivity) 109 | birch = cluster.Birch(n_clusters=params['n_clusters']) 110 | gmm = mixture.GaussianMixture( 111 | n_components=params['n_clusters'], covariance_type='full') 112 | visual = 1 113 | 114 | clustering_algorithms = ( 115 | ('Visual\nClustering', visual), 116 | ('MiniBatch\nKMeans', two_means), 117 | ('Affinity\nPropagation', affinity_propagation), 118 | ('MeanShift', ms), 119 | ('Spectral\nClustering', spectral), 120 | ('Ward', ward), 121 | ('Agglomerative\nClustering', average_linkage), 122 | ('DBSCAN', dbscan), 123 | ('OPTICS', optics), 124 | ('BIRCH', birch), 125 | ('Gaussian\nMixture', gmm) 126 | ) 127 | 128 | for name, algorithm in clustering_algorithms: 129 | if name == 'Visual\nClustering': 130 | t0 = time.time() 131 | 132 | max_x = max(dataset[0][:, 0]) 133 | min_x = min(dataset[0][:, 0]) 134 | new_max = 256 135 | new_min = 0 136 | 137 | dataset[0][:, 0] = (((dataset[0][:, 0] - min_x)*(new_max-new_min))/(max_x-min_x))+ new_min 138 | 139 | max_y = max(dataset[0][:, 1]) 140 | min_y = min(dataset[0][:, 1]) 141 | new_max_y = 256 142 | new_min_y = 0 143 | 144 | dataset[0][:, 1] = (((dataset[0][:, 1] - min_y)*(new_max_y-new_min_y))/(max_y-min_y))+ new_min_y 145 | 146 | input = create_input_image(dataset[0]) 147 | result = predict_sample(input) 148 | y_pred = get_instances(result, dataset[0], max_filter_size=20) 149 | t1 = time.time() 150 | 151 | plt.subplot(len(datasets), len(clustering_algorithms), plot_num) 152 | 153 | if i_dataset == 0: 154 | plt.title(name, size=18) 155 | 156 | 157 | colors = np.array(list(islice(cycle(["#000000", '#377eb8', '#ff7f00', '#4daf4a', 158 | '#f781bf', '#a65628', '#984ea3', 159 | '#999999', '#e41a1c', '#dede00' ,'#491010']), 160 | int(max(y_pred) + 1)))) 161 | # add black color for outliers (if any) 162 | colors = np.append(colors, ["#000000"]) 163 | plt.scatter(dataset[0][:, 0], dataset[0][:, 1], s=10, color=colors[y_pred.astype('int8')]) 164 | 165 | plt.xlim(-20, 280) 166 | plt.ylim(-20, 280) 167 | plt.xticks(()) 168 | plt.yticks(()) 169 | plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), 170 | transform=plt.gca().transAxes, size=15, 171 | horizontalalignment='right') 172 | plot_num += 1 173 | 174 | 175 | else: 176 | t0 = time.time() 177 | 178 | # catch warnings related to kneighbors_graph 179 | with warnings.catch_warnings(): 180 | warnings.filterwarnings( 181 | "ignore", 182 | message="the number of connected components of the " + 183 | "connectivity matrix is [0-9]{1,2}" + 184 | " > 1. 
Completing it to avoid stopping the tree early.", 185 | category=UserWarning) 186 | warnings.filterwarnings( 187 | "ignore", 188 | message="Graph is not fully connected, spectral embedding" + 189 | " may not work as expected.", 190 | category=UserWarning) 191 | algorithm.fit(X) 192 | 193 | t1 = time.time() 194 | if hasattr(algorithm, 'labels_'): 195 | y_pred = algorithm.labels_.astype(int) 196 | else: 197 | y_pred = algorithm.predict(X) 198 | 199 | plt.subplot(len(datasets), len(clustering_algorithms), plot_num) 200 | if i_dataset == 0: 201 | plt.title(name, size=18) 202 | 203 | colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a', 204 | '#f781bf', '#a65628', '#984ea3', 205 | '#999999', '#e41a1c', '#dede00']), 206 | int(max(y_pred) + 1)))) 207 | # add black color for outliers (if any) 208 | colors = np.append(colors, ["#000000"]) 209 | plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred]) 210 | 211 | plt.xlim(-2.5, 2.5) 212 | plt.ylim(-2.5, 2.5) 213 | plt.xticks(()) 214 | plt.yticks(()) 215 | plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), 216 | transform=plt.gca().transAxes, size=15, 217 | horizontalalignment='right') 218 | plot_num += 1 219 | 220 | plt.show() --------------------------------------------------------------------------------