"""CluStream data-stream clustering (Aggarwal, Han, Wang, Yu -- VLDB 2003).

Online phase: a fixed budget of micro-clusters absorbs points one at a time.
Offline phase: k-means over the micro-cluster centers yields the final
macro-clusters.
"""
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_array
from sklearn.cluster import KMeans
# Import the MicroCluster *class* from its module.  The original code imported
# a module under the alias `model` and then called the module itself
# (`model(identifier=...)`), which raises TypeError at runtime; the path below
# matches the actual package layout (sklearn/streams/cluStream/model/MicroCluster.py).
from sklearn.streams.cluStream.model.MicroCluster import MicroCluster as model
from scipy.spatial import distance
import math
import numpy as np
import threading
import time
import sys


class CluStream(BaseEstimator, ClusterMixin):
    """Implementation of the CluStream algorithm.

    Parameters
    ----------
    nb_initial_points : int
        Minimum batch size required by ``fit`` to bootstrap the
        micro-clusters with an offline k-means.
    time_window : int
        Horizon used by ``oldest_updated_cluster`` to decide whether a
        micro-cluster is stale enough to delete.
    timestamp : int
        Logical clock, incremented once per point in ``partial_fit``.
    clocktime : int
        Wall-clock bookkeeping (currently unused by the visible code).
    nb_micro_cluster : int
        Number of micro-clusters maintained online.
    nb_macro_cluster : int
        Number of macro-clusters produced by ``predict``.
    micro_clusters : list or None
        Initial micro-cluster list; a fresh list is created when ``None``.
    alpha, l, h :
        Pyramidal-snapshot parameters (snapshot code is commented out in the
        original source and therefore omitted here).
    """

    def __init__(self, nb_initial_points=1000, time_window=1000, timestamp=0, clocktime=0,
                 nb_micro_cluster=100, nb_macro_cluster=5, micro_clusters=None, alpha=2,
                 l=2, h=1000):
        self.start_time = time.time()
        self.nb_initial_points = nb_initial_points
        self.time_window = time_window  # range of the sliding window
        self.timestamp = timestamp
        self.clocktime = clocktime
        # The original default was a mutable `[]`, silently shared between
        # every CluStream instance; use a per-instance list instead.
        self.micro_clusters = [] if micro_clusters is None else micro_clusters
        self.nb_micro_cluster = nb_micro_cluster
        self.nb_macro_cluster = nb_macro_cluster
        self.alpha = alpha
        self.l = l
        self.h = h
        self.nb_created_clusters = 0

    def fit(self, X, Y=None):
        """Bootstrap the micro-clusters from an initial batch.

        Runs k-means with ``nb_micro_cluster`` centroids over ``X`` (only if
        the batch holds at least ``nb_initial_points`` rows) and turns each
        resulting group of points into one micro-cluster.  Returns ``self``
        per scikit-learn convention.
        """
        X = check_array(X, accept_sparse='csr')
        if X.shape[0] >= self.nb_initial_points:
            kmeans = KMeans(n_clusters=self.nb_micro_cluster, random_state=1)
            micro_cluster_labels = kmeans.fit_predict(X, Y)
            # Prepend the label column so each cluster's points can be sliced out.
            labelled = np.column_stack((micro_cluster_labels, X))
            initial_clusters = [labelled[labelled[:, 0] == label][:, 1:]
                                for label in set(micro_cluster_labels) if label != -1]
            for cluster in initial_clusters:
                self.create_micro_cluster(cluster)
            self.start_time = time.time()
        return self

    def create_micro_cluster(self, cluster):
        """Create a new micro-cluster absorbing every point of ``cluster``.

        ``cluster`` is a 2-D array (n_points, n_features); points are inserted
        one by one at the current logical timestamp.
        """
        dim = cluster.shape[1]
        self.nb_created_clusters += 1
        new_m_cluster = model(identifier=self.nb_created_clusters, nb_points=0,
                              linear_sum=np.zeros(dim), squared_sum=np.zeros(dim),
                              update_timestamp=0)
        for point in cluster:
            new_m_cluster.insert(point, self.timestamp)
        self.micro_clusters.append(new_m_cluster)

    def distance_to_cluster(self, x, cluster):
        """Euclidean distance from point ``x`` to ``cluster``'s centroid."""
        return distance.euclidean(x, cluster.get_center())

    def find_closest_cluster(self, x, micro_clusters):
        """Return the micro-cluster in ``micro_clusters`` whose center is
        nearest to ``x`` (``None`` when the list is empty; the original raised
        UnboundLocalError in that case)."""
        closest_cluster = None
        min_distance = sys.float_info.max
        for cluster in micro_clusters:
            distance_cluster = self.distance_to_cluster(x, cluster)
            if distance_cluster < min_distance:
                min_distance = distance_cluster
                closest_cluster = cluster
        return closest_cluster

    def check_fit_in_cluster(self, x, cluster):
        """True when ``x`` falls inside ``cluster``'s maximal boundary.

        For a singleton cluster the RMS radius is undefined, so the distance
        to the next closest micro-cluster's center is used as the boundary,
        as in the CluStream paper.
        """
        if cluster.get_weight() == 1:
            others = self.micro_clusters.copy()
            others.remove(cluster)
            next_cluster = self.find_closest_cluster(x, others)
            radius = distance.euclidean(next_cluster.get_center(), cluster.get_center())
        else:
            radius = cluster.get_radius()
        return self.distance_to_cluster(x, cluster) < radius

    def oldest_updated_cluster(self):
        """Return the micro-cluster with the smallest relevance stamp below
        ``timestamp - time_window``, or ``None`` when every cluster is still
        considered recent."""
        threshold = self.timestamp - self.time_window
        min_relevance_stamp = sys.float_info.max
        oldest_cluster = None
        for cluster in self.micro_clusters:
            relevance_stamp = cluster.get_relevancestamp()
            if relevance_stamp < threshold and relevance_stamp < min_relevance_stamp:
                min_relevance_stamp = relevance_stamp
                oldest_cluster = cluster
        return oldest_cluster

    def merge_closest_clusters(self):
        """Merge the two micro-clusters whose centers are closest, freeing one
        slot in the micro-cluster budget."""
        if len(self.micro_clusters) < 2:
            # Nothing to merge; the original code raised NameError here.
            return
        min_distance = sys.float_info.max
        cluster_1 = cluster_2 = None
        for i, cluster in enumerate(self.micro_clusters):
            center = cluster.get_center()
            for next_cluster in self.micro_clusters[i + 1:]:
                dist = distance.euclidean(center, next_cluster.get_center())
                if dist < min_distance:
                    min_distance = dist
                    cluster_1, cluster_2 = cluster, next_cluster
        assert cluster_1 is not cluster_2
        cluster_1.merge(cluster_2)
        self.micro_clusters.remove(cluster_2)

    def partial_fit(self, x, y=None):
        """Process one streaming point.

        ``x`` is a 2-D array holding a single row (the row itself is unpacked
        for distance computations).  The point is absorbed by the closest
        micro-cluster when it fits inside its boundary; otherwise the stalest
        cluster is dropped (or, failing that, the two closest clusters are
        merged) and a new micro-cluster is created for the point.
        Returns ``self`` per scikit-learn convention.
        """
        self.timestamp += 1
        point_matrix = x  # keep the 2-D form for create_micro_cluster
        point = x[0]
        closest_cluster = self.find_closest_cluster(point, self.micro_clusters)
        if self.check_fit_in_cluster(point, closest_cluster):
            closest_cluster.insert(point, self.timestamp)
        else:
            stale = self.oldest_updated_cluster()
            if stale is not None:
                self.micro_clusters.remove(stale)
            else:
                self.merge_closest_clusters()
            self.create_micro_cluster(point_matrix)
        return self

    def predict(self, X=None):
        """Offline macro-clustering step.

        Runs k-means with ``nb_macro_cluster`` centroids over the current
        micro-cluster centers and returns one label per micro-cluster.

        Parameters
        ----------
        X : ignored
            Present for scikit-learn API compatibility; the macro-clustering
            operates on the micro-cluster centers, not on ``X``.

        Returns
        -------
        y : ndarray of shape (n_micro_clusters,)
            Macro-cluster label of each micro-cluster center.
        """
        cluster_centers = [cluster.get_center() for cluster in self.micro_clusters]
        kmeans = KMeans(n_clusters=self.nb_macro_cluster, random_state=1)
        return kmeans.fit_predict(X=cluster_centers, y=None)
import math
import numpy as np


class MicroCluster:
    """MicroCluster data structure for the CluStream algorithm.

    A micro-cluster is a cluster feature vector: counts plus linear and
    squared sums of the points (and of their timestamps), from which center,
    radius and relevance stamp are derived in O(1).

    Parameters
    ----------
    :parameter nb_points is the number of points in the cluster
    :parameter identifier is the identifier of the cluster (-1 once the cluster results from merging)
    :parameter id_list is the id list of merged clusters
    :parameter linear_sum is the linear sum of the points in the cluster (numpy array expected, so
        ``merge`` adds element-wise)
    :parameter squared_sum is the squared sum of all the points added to the cluster
    :parameter linear_time_sum is the linear sum of all the timestamps of points added to the cluster
    :parameter squared_time_sum is the squared sum of all the timestamps of points added to the cluster
    :parameter m is the number of points considered to determine the relevance stamp of a cluster
    :parameter update_timestamp is the last update time of the cluster
    """

    def __init__(self, nb_points=0, identifier=0, id_list=None, linear_sum=None,
                 squared_sum=None, linear_time_sum=0, squared_time_sum=0,
                 m=100, update_timestamp=0):
        self.nb_points = nb_points
        self.identifier = identifier
        self.id_list = id_list
        self.linear_sum = linear_sum
        self.squared_sum = squared_sum
        self.linear_time_sum = linear_time_sum
        self.squared_time_sum = squared_time_sum
        self.m = m
        self.update_timestamp = update_timestamp
        # Factor applied to the RMS deviation to obtain the maximal boundary.
        self.radius_factor = 1.8
        self.epsilon = 0.00005
        # BUGFIX: the original used math.pow(1, -5), which is 1.0, not 1e-5.
        self.min_variance = 1e-5

    def get_center(self):
        """Centroid of the cluster: linear sum divided by the point count.

        Raises ZeroDivisionError on an empty cluster (as the original did).
        """
        return [component / self.nb_points for component in self.linear_sum]

    def get_weight(self):
        """Number of points absorbed by this cluster."""
        return self.nb_points

    def insert(self, new_point, current_timestamp):
        """Absorb one point, updating all cluster-feature statistics."""
        self.nb_points += 1
        self.update_timestamp = current_timestamp
        for i, value in enumerate(new_point):
            self.linear_sum[i] += value
            self.squared_sum[i] += value * value
        self.linear_time_sum += current_timestamp
        self.squared_time_sum += current_timestamp ** 2

    def merge(self, micro_cluster):
        """Merge ``micro_cluster`` into this one (additivity of CF vectors).

        The caller is responsible for discarding ``micro_cluster`` afterwards.
        A merged cluster takes identifier -1 and records the merged ids in
        ``id_list``.
        """
        self.nb_points += micro_cluster.nb_points
        self.linear_sum += micro_cluster.linear_sum
        self.squared_sum += micro_cluster.squared_sum
        self.linear_time_sum += micro_cluster.linear_time_sum
        self.squared_time_sum += micro_cluster.squared_time_sum

        if self.identifier != -1:
            if micro_cluster.identifier != -1:
                # Two plain clusters: start a fresh id list.
                self.id_list = [self.identifier, micro_cluster.identifier]
            else:
                # Other side already holds a merge history; extend it.
                micro_cluster.id_list.append(self.identifier)
                self.id_list = micro_cluster.id_list.copy()
            self.identifier = -1
        else:
            if micro_cluster.identifier != -1:
                self.id_list.append(micro_cluster.identifier)
            else:
                self.id_list.extend(micro_cluster.id_list)

    def get_relevancestamp(self):
        """Approximate arrival time of the m/(2n)-th timestamp percentile,
        used to decide whether the cluster is stale."""
        if self.nb_points < 2 * self.m:
            return self.get_mutime()
        return self.get_mutime() + self.get_sigmatime() * self.get_quantile(self.m / (2 * self.nb_points))

    def get_mutime(self):
        """Mean of the timestamps of the absorbed points."""
        return self.linear_time_sum / self.nb_points

    def get_sigmatime(self):
        """Standard deviation of the timestamps.

        The variance is clamped at zero: floating-point cancellation can make
        E[t^2] - E[t]^2 slightly negative, which made the original raise a
        math domain error.
        """
        variance = self.squared_time_sum / self.nb_points - (self.linear_time_sum / self.nb_points) ** 2
        return math.sqrt(max(variance, 0.0))

    def get_quantile(self, x):
        """Quantile function of the standard normal via the inverse error function."""
        assert 0 <= x <= 1
        return math.sqrt(2) * self.inverse_error(2 * x - 1)

    def get_radius(self):
        """Maximal boundary: RMS deviation times ``radius_factor``
        (0 for a singleton, whose deviation is undefined)."""
        if self.nb_points == 1:
            return 0
        return self.get_deviation() * self.radius_factor

    def get_clsuter_feature(self):
        # Name kept (with its historical typo) for backward compatibility.
        # BUGFIX: the original returned `self.this` (a Java-ism), which raised
        # AttributeError; the cluster feature vector *is* this object.
        return self

    # Correctly spelled alias for new callers.
    get_cluster_feature = get_clsuter_feature

    def get_deviation(self):
        """Average, over dimensions, of the per-dimension standard deviation."""
        variances = self.get_variance_vec()
        return sum(math.sqrt(v) for v in variances) / len(variances)

    def get_variance_vec(self):
        """Per-dimension variance E[x^2] - E[x]^2.

        BUGFIX: every non-positive value is clamped to ``min_variance``; the
        original clamped only values in (-epsilon, 0], so a larger negative
        variance later crashed ``get_deviation`` with a sqrt domain error.
        """
        variance_vec = []
        for i in range(len(self.linear_sum)):
            ls_mean = self.linear_sum[i] / self.nb_points
            ss_mean = self.squared_sum[i] / self.nb_points
            variance = ss_mean - ls_mean ** 2
            if variance <= 0.0:
                variance = self.min_variance
            variance_vec.append(variance)
        return variance_vec

    def inverse_error(self, x):
        """Maclaurin-series approximation of the inverse error function erfinv(x).

        BUGFIX: the original reset ``z_prod`` to z^2 instead of accumulating
        odd powers, inverted three numerator/denominator pairs, used a mangled
        hex literal, and returned ``z_prod`` instead of the accumulated sum.
        Coefficients follow erfinv(x) = sum c_k * z^(2k+1) with z = sqrt(pi)*x.
        """
        z = math.sqrt(math.pi) * x
        inv_error = z / 2
        z_squared = z * z
        z_prod = z * z_squared                         # z^3
        inv_error += (1 / 24) * z_prod
        z_prod *= z_squared                            # z^5
        inv_error += (7 / 960) * z_prod
        z_prod *= z_squared                            # z^7
        inv_error += (127 / 80640) * z_prod
        z_prod *= z_squared                            # z^9
        inv_error += (4369 / 11612160) * z_prod
        z_prod *= z_squared                            # z^11
        inv_error += (34807 / 364953600) * z_prod
        z_prod *= z_squared                            # z^13
        inv_error += (20036983 / 133523426386944) * z_prod
        return inv_error