├── ClusterSVDD
│   ├── __init__.py
│   ├── cluster_svdd.py
│   ├── kernel.py
│   ├── mlp.py
│   ├── svdd_dual_qp.py
│   └── svdd_primal_sgd.py
├── LICENSE
├── README.md
├── __init__.py
├── scripts
│   ├── __init__.py
│   ├── test_ad_svdd.py
│   ├── test_anom.py
│   ├── test_clustersvdd.py
│   ├── test_exm.py
│   ├── test_impl.py
│   ├── test_real.py
│   ├── test_robust.py
│   └── test_struct.py
└── setup.py

--------------------------------------------------------------------------------
/ClusterSVDD/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/ClusterSvdd/2f61c187a3197c807b239202b72d9c84cb46400c/ClusterSVDD/__init__.py

--------------------------------------------------------------------------------
/ClusterSVDD/cluster_svdd.py:
--------------------------------------------------------------------------------
__author__ = 'nicococo'
import numpy as np


class ClusterSvdd:
    """ Implementation of the cluster support vector data description (ClusterSVDD).
        Author: Nico Goernitz, TU Berlin, 2015
    """
    PRECISION = 1e-4  # This parameter can be important as it affects the threshold,
                      # support vectors and speed!
    clusters = 0      # (scalar) number of clusters
    svdds = None      # (list) list of dual qp svdds
    nu = -1.0         # (scalar) 0 < nu <= 1.0

    def __init__(self, svdds, nu=-1.0):
        self.clusters = len(svdds)
        self.svdds = svdds
        self.nu = nu
        self.use_local_fraction = nu <= 0.
        print('Creating new ClusterSVDD with {0} clusters.'.format(self.clusters))

    def fit(self, X, min_chg=0.0, max_iter=40, max_svdd_iter=2000, init_membership=None):
        """
        :param X: Data matrix is assumed to be feats x samples.
        :param min_chg: Minimum fraction of changed assignments per iteration before stopping.
        :param max_iter: Maximum number of iterations before stopping.
        :param max_svdd_iter: Maximum number of iterations for the nested SVDDs.
        :param init_membership: Integer array with cluster affiliation per
            sample (used for initialization).
        :return: (Integer array) Cluster affiliations for all samples.
        """
        (dims, samples) = X.shape

        # init majorization step
        cinds_old = np.zeros(samples)
        cinds = np.random.randint(0, self.clusters, samples)
        if init_membership is not None:
            print('Using init cluster membership.')
            cinds = init_membership

        # init maximization step
        for c in range(self.clusters):
            inds = np.where(cinds == c)[0]
            self.svdds[c].fit(X[:, inds])

        iter_cnt = 0
        scores = np.zeros((self.clusters, samples))
        while np.sum(np.abs(cinds_old - cinds)) / float(samples) > min_chg and iter_cnt < max_iter:
            print('Iter={0}'.format(iter_cnt))
            # 1. majorization step: assign each sample to its best-fitting cluster
            for c in range(self.clusters):
                scores[c, :] = self.svdds[c].predict(X)
            cinds_old = cinds
            cinds = np.argmin(scores, axis=0)
            # 2. maximization step: re-fit each SVDD on its assigned samples
            for c in range(self.clusters):
                inds = np.where(cinds == c)[0]
                if inds.size > 0:
                    # perc = 2.0*float(inds.size)/float(samples)
                    # self.svdds[c].nu = perc * self.nu
                    self.svdds[c].fit(X[:, inds], max_iter=max_svdd_iter)
            iter_cnt += 1
        print('ClusterSVDD training finished after {0} iterations.'.format(iter_cnt))
        return cinds

    def predict(self, Y):
        """
        :param Y: Data matrix (feats x samples).
        :return: Tuple of (minimum SVDD scores, cluster affiliations) for all samples.
        """
        scores = np.zeros((self.clusters, Y.shape[1]))
        for c in range(self.clusters):
            scores[c, :] = self.svdds[c].predict(Y)
        cinds = np.argmin(scores, axis=0)
        return np.min(scores, axis=0), cinds
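The alternating scheme in fit() above (assign every sample to its best-fitting SVDD, then re-fit each SVDD on its members) needs very little driver code. A minimal usage sketch, not part of the original repository; the data and parameter choices are illustrative:

import numpy as np
from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.cluster_svdd import ClusterSvdd

X = np.random.randn(2, 500)                     # feats x samples, as fit() expects
csvdd = ClusterSvdd([SvddPrimalSGD(0.1) for _ in range(2)])
membership = csvdd.fit(X)                       # cluster index per training sample
scores, cinds = csvdd.predict(X)                # scores > 0 flag anomalies, cinds are clusters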
--------------------------------------------------------------------------------
/ClusterSVDD/kernel.py:
--------------------------------------------------------------------------------
import numpy as np


class Kernel:

    def __init__(self):
        pass

    @staticmethod
    def get_kernel(X, Y, type='linear', param=1.0):
        """Calculates a kernel given the data X and Y (dims x exms)"""
        (Xdims, Xn) = X.shape
        (Ydims, Yn) = Y.shape

        kernel = 1.0
        if type == 'linear':
            print('Calculating linear kernel with size {0}x{1}.'.format(Xn, Yn))
            kernel = X.T.dot(Y)

        if type == 'rbf':
            print('Calculating Gaussian kernel with size {0}x{1} and sigma2={2}.'.format(Xn, Yn, param))
            Dx = (np.ones((Yn, 1)) * np.diag(X.T.dot(X)).reshape(1, Xn)).T
            Dy = (np.ones((Xn, 1)) * np.diag(Y.T.dot(Y)).reshape(1, Yn))
            kernel = Dx - 2.*np.array(X.T.dot(Y)) + Dy  # squared euclidean distances
            kernel = np.exp(-kernel / param)
            print(kernel.shape)

        return kernel

    @staticmethod
    def get_diag_kernel(X, type='linear', param=1.0):
        """Calculates the kernel diagonal given the data X (dims x exms)"""
        (Xdims, Xn) = X.shape

        kernel = 1.0
        if type == 'linear':
            print('Calculating diagonal of a linear kernel with size {0}x{1}.'.format(Xn, Xn))
            kernel = np.sum(X*X, axis=0)

        if type == 'rbf':
            print('Gaussian kernel diagonal is always exp(0)=1.')
            kernel = np.ones(Xn, dtype='d')
        return kernel

    @staticmethod
    def center_kernel(K):
        print('IMPLEMENT ME')
        return K

    @staticmethod
    def normalize_kernel(K):
        print('IMPLEMENT ME')
        return K
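The 'rbf' branch above computes exp(-||x - y||^2 / param), i.e. gamma = 1/param in the common parameterization. A small sanity-check sketch against scikit-learn (an assumption for this snippet only; the repository itself does not depend on it):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from ClusterSVDD.kernel import Kernel

X = np.random.randn(3, 10)                      # dims x exms, as Kernel expects
param = 0.5
K = Kernel.get_kernel(X, X, type='rbf', param=param)
assert np.allclose(K, rbf_kernel(X.T, X.T, gamma=1./param))  # sklearn wants exms x dims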
--------------------------------------------------------------------------------
/ClusterSVDD/mlp.py:
--------------------------------------------------------------------------------
import numpy
import numpy as np

from sklearn.svm import SVR

#--------------------------------------------------------------------
# Hyperparameters
#--------------------------------------------------------------------
lr = 0.001  # learning rate


#--------------------------------------------------------------------
# Multilayer network
#--------------------------------------------------------------------
class Sequential:

    def __init__(self, modules): self.modules = modules

    def forward(self, X):
        for m in self.modules: X = m.forward(X)
        return X

    def backward(self, DY):
        for m in self.modules[::-1]: DY = m.backward(DY)
        return DY

    def update(self):
        for m in self.modules: m.update()

#--------------------------------------------------------------------
# Linear layer
#--------------------------------------------------------------------
class Linear:

    def __init__(self, m, n, last=False):
        self.m = m
        self.n = n

        self.W = numpy.random.uniform(-1/self.m**.5, 1/self.m**.5, [m, n]).astype('float32')
        self.B = numpy.zeros([n]).astype('float32')
        if last: self.W *= 0

    def forward(self, X):
        self.X = X
        return numpy.dot(X, self.W)+self.B

    def backward(self, DY):
        DX = numpy.dot(DY, self.W.T)

        # parameter gradients, rescaled by powers of the layer size
        # (heuristic scaling to keep update magnitudes comparable across layers)
        self.DW = (numpy.dot(self.X.T, DY)) / self.m**.5
        self.DB = (DY.sum(axis=0)) / self.m**.25

        return DX*(self.m**.5/self.n**.5)

    def update(self):
        self.W -= lr*self.DW
        self.B -= lr*self.DB

#--------------------------------------------------------------------
# Hyperbolic tangent layer
#--------------------------------------------------------------------
class Tanh:
    def __init__(self): pass
    def forward(self, X): self.Y = numpy.tanh(X); return self.Y
    def backward(self, DY): return DY*(1-self.Y**2)

    def update(self): pass


#====================================================================
# Test
#====================================================================

# Prepare data
nbsamples = 200
nbinputdims = 100
nboutputdims = 1

# Random regression task
X = numpy.random.normal(0, 1, [nbsamples, nbinputdims])
T = numpy.random.normal(0, 1, [nbsamples])

# Initialize network
nn = Sequential([
    Linear(nbinputdims, 200),
    Tanh(),
    Linear(200, 20),
    Tanh(),
    Linear(20, nboutputdims)
])

# SVR baseline on the same task
clf = SVR(C=1000.0, epsilon=0.0002)
clf.fit(X, T)
ypred = clf.predict(X)
print(((ypred-T)**2).sum())

T = T[:, np.newaxis]

# Training
for t in range(1000):

    Y = nn.forward(X)
    nn.backward(Y-T)
    nn.update()

    if t % 100 == 0: print(t, ((Y-T)**2).sum())
--------------------------------------------------------------------------------
/ClusterSVDD/svdd_dual_qp.py:
--------------------------------------------------------------------------------
from cvxopt import matrix, spmatrix, sparse
from cvxopt.solvers import qp
import numpy as np

from ClusterSVDD.kernel import Kernel


class SvddDualQP:
    """ Dual QP implementation of the support vector data description (SVDD).
        Author: Nico Goernitz, TU Berlin, 2015
    """

    PRECISION = 1e-6  # important: affects the threshold, support vectors and speed!

    kernel = None   # (string) name of the kernel to use
    kparam = None   # (-) kernel parameter
    samples = -1    # (scalar) amount of training data in X

    nu = 0.95       # (scalar) the regularization constant > 0

    X = None        # (matrix) training data
    alphas = None   # (vector) dual solution vector
    svs = None      # (vector) support vector indices
    radius2 = 0.0   # (scalar) the optimized threshold (rho)
    cTc = None      # (scalar) alpha^T * K * alpha

    pobj = 0.0      # (scalar) primal objective value after training

    def __init__(self, kernel, kparam, nu):
        self.kernel = kernel
        self.kparam = kparam
        self.nu = nu
        print('Creating new dual QP SVDD ({0}) with nu={1}.'.format(kernel, nu))

    def fit(self, X, max_iter=-1):
        """
        :param X: Data matrix is assumed to be feats x samples.
        :param max_iter: *ignored*, just for compatibility.
        :return: Alphas and threshold of the fitted dual SVDD.
        """
        self.X = X.copy()
        dims, self.samples = X.shape

        if self.samples < 1:
            print('Invalid training data.')
            return -1

        # number of training examples
        N = self.samples
        C = 1. / float(self.samples*self.nu)

        kernel = Kernel.get_kernel(X, X, self.kernel, self.kparam)
        norms = np.diag(kernel).copy()

        if self.nu >= 1.0:
            print("Center-of-mass solution.")
            self.alphas = np.ones(self.samples) / float(self.samples)
            self.radius2 = 0.0
            self.svs = np.array(range(self.samples), dtype='i')
            self.pobj = 0.0  # TODO: calculate real primal objective
            self.cTc = self.alphas[self.svs].T.dot(kernel[self.svs, :][:, self.svs].dot(self.alphas[self.svs]))
            return self.alphas, self.radius2

        # generate a kernel matrix
        P = 2.0*matrix(kernel)

        # this is the diagonal of the kernel matrix
        q = -matrix(norms)

        # sum_i alpha_i = A alpha = b = 1.0
        A = matrix(1.0, (1, N))
        b = matrix(1.0, (1, 1))

        # 0 <= alpha_i <= h = C
        G1 = spmatrix(1.0, range(N), range(N))
        G = sparse([G1, -G1])
        h1 = matrix(C, (N, 1))
        h2 = matrix(0.0, (N, 1))
        h = matrix([h1, h2])

        sol = qp(P, q, G, h, A, b)

        # store solution
        self.alphas = np.array(sol['x'], dtype=np.float64)
        self.pobj = -sol['primal objective']

        # find support vectors
        self.svs = np.where(self.alphas > self.PRECISION)[0]
        self.cTc = self.alphas.T.dot(kernel.dot(self.alphas))

        # the smallest score among the support vectors defines the threshold
        self.radius2 = 0.
        thres = self.predict(X[:, self.svs])
        self.radius2 = np.min(thres)
        return self.alphas, thres

    def get_radius(self):
        return self.radius2

    def get_alphas(self):
        return self.alphas

    def get_support_inds(self):
        return self.svs

    def get_support(self):
        return self.alphas[self.svs]

    def predict(self, Y):
        # build the test kernel against the support vectors only
        kernel = Kernel.get_kernel(Y, self.X[:, self.svs], self.kernel, self.kparam)
        # for svdd we additionally need the data norms k(y, y)
        norms = Kernel.get_diag_kernel(Y, self.kernel)
        res = self.cTc - 2.*kernel.dot(self.get_support()).T + norms
        return res.reshape(Y.shape[1]) - self.radius2
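For reference, the program handed to cvxopt's qp above (which solves min_x 1/2 x^T P x + q^T x subject to Gx <= h, Ax = b) is the standard SVDD dual with kernel matrix K and box constant C = 1/(n*nu):

\[
\min_{\alpha}\;\; \alpha^\top K \alpha \;-\; \sum_{i=1}^{n} \alpha_i K_{ii}
\qquad \text{s.t.} \quad \sum_{i=1}^{n} \alpha_i = 1,\;\; 0 \le \alpha_i \le \frac{1}{n\nu},
\]

so P = 2K and q = -diag(K). With center c = sum_i alpha_i phi(x_i), the score in predict() is

\[
\lVert \phi(y) - c\rVert^2 - R^2 \;=\; c^\top c \;-\; 2\sum_i \alpha_i\, k(x_i, y) \;+\; k(y, y) \;-\; R^2,
\]

which is exactly cTc - 2*kernel.dot(alphas) + norms - radius2 in the code.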
--------------------------------------------------------------------------------
/ClusterSVDD/svdd_primal_sgd.py:
--------------------------------------------------------------------------------
__author__ = 'nicococo'
import numpy as np

from numba import njit


class SvddPrimalSGD(object):
    """ Primal subgradient descent solver for the support vector data description (SVDD).
        Author: Nico Goernitz, TU Berlin, 2015
    """
    PRECISION = 10**-3  # important: affects the threshold, support vectors and speed!
    nu = 0.95      # (scalar) the regularization constant > 0
    c = None       # (vector) center of the hypersphere
    radius2 = 0.0  # (scalar) the optimized threshold (rho)
    pobj = 0.0     # (scalar) primal objective after training

    def __init__(self, nu):
        self.nu = nu
        print('Creating new primal SVDD with nu={0}.'.format(nu))

    def fit(self, X, max_iter=20000, prec=1e-6, rate=0.01):
        if X.shape[1] < 1:
            print('Invalid training data.')
            return -1, -1
        self.c, self.radius2, self.pobj, n_iter = fit_extern(X, self.nu, max_iter, prec, rate)
        print('Iter={2}: obj={0} T={1}'.format(self.pobj, self.radius2, n_iter+1))
        return self.c, self.radius2

    def get_radius(self):
        return self.radius2

    def predict(self, X):
        # X : (dims x samples)
        dist = self.c.T.dot(self.c) - 2.*self.c.T.dot(X) + np.sum(X*X, axis=0)
        return dist - self.radius2


@njit
def fit_extern(X, nu, max_iter, prec, rate):
    """ Subgradient descent solver for the primal SVDD.
        Optimized for 'numba'.
    """
    (dims, samples) = X.shape

    # number of training examples
    reg = 1./(np.float64(samples)*nu)

    # center of mass and squared norms of all samples
    c = np.zeros(dims, dtype=np.float64)
    sum_XX = np.zeros(samples)
    for s in range(samples):
        foo = 0.0
        for d in range(dims):
            foo += X[d, s]*X[d, s]
            c[d] += X[d, s] / np.float64(samples)
        sum_XX[s] = foo

    dot_2cX = np.zeros(samples, dtype=np.float64)
    for s in range(samples):
        dot_2cX[s] = 2.0 * np.sum(c*X[:, s])
    dist = np.sum(c*c) - dot_2cX + sum_XX

    T = 0.4 * np.max(dist) * (1.0-nu)  # starting heuristic T
    # if nu exceeds 1.0, then T^* is always 0 and c can
    # be computed analytically (as center-of-mass, mean)
    if nu >= 1.0:
        return c, 0.0, 0.0, 0

    is_converged = False
    best_c = c.copy()  # copy: c is updated in place below
    best_radius2 = T
    obj_best = np.float64(1e20)

    obj_bak = -100.
    n_iter = 0

    # subgradient buffer for the center and active-set index buffer
    dc = np.zeros(dims, dtype=np.float64)
    inds = np.zeros(samples, dtype=np.int64)
    while not is_converged and n_iter < max_iter:
        for s in range(samples):
            dot_2cX[s] = 2.0 * np.sum(c*X[:, s])

        # calculate the distances of the center to each datapoint
        dist = np.sum(c*c) - dot_2cX + sum_XX
        inds_size = 0
        for s in range(samples):
            if dist[s]-T >= 1e-12:
                inds[inds_size] = s
                inds_size += 1
        # we need at least 1 entry, hence lower T to the maximum entry
        if inds_size == 0:
            inds_size = 1
            inds[0] = np.argmax(dist)
            T = dist[inds[0]]

        # real objective value given the current center c and threshold T
        ds = 0.0
        for s in range(inds_size):
            ds += dist[inds[s]] - T
        obj = T + reg*ds

        # this is a subgradient method, hence store the best solution so far
        if obj_best >= obj:
            best_c = c.copy()
            best_radius2 = T
            obj_best = obj

        # stop, if progress is too slow
        if obj > 0.:
            if np.abs((obj-obj_bak)/obj) < prec:
                is_converged = True
                continue
        obj_bak = obj

        # decaying stepsize: a fraction (rate) of the maximum value in dist
        max_change = rate * np.max(dist) / np.float64(n_iter+1)*10.

        # subgradient step for the threshold
        dT = 1.0 - reg*np.float64(inds_size)
        T -= np.sign(dT) * max_change

        # normalized subgradient step for the center
        norm_dc = 0.0
        for d in range(dims):
            dc[d] = 0.0
            for s in range(inds_size):
                dc[d] += 2.*reg*(c[d] - X[d, inds[s]])
            norm_dc += dc[d]*dc[d]
        norm_dc = np.sqrt(norm_dc)

        if np.abs(norm_dc) < 1e-12:
            norm_dc = 1.0

        for d in range(dims):
            c[d] -= dc[d]/norm_dc * max_change
        n_iter += 1

    return best_c, best_radius2, obj_best, n_iter
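fit_extern above performs subgradient descent directly on the primal SVDD objective in the center c and the threshold T = R^2,

\[
\min_{c,\,T}\; T + \frac{1}{n\nu} \sum_{i=1}^{n} \max\bigl(0,\; \lVert x_i - c\rVert^2 - T\bigr),
\]

using the subgradients dT = 1 - |A|/(n*nu) and d_c = (2/(n*nu)) * sum_{i in A} (c - x_i) over the active set A = {i : ||x_i - c||^2 > T}. Since subgradient steps are not monotone, the best (c, T) seen so far is tracked and returned.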
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 nico

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ClusterSvdd
Cluster Support Vector Data Description

--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/ClusterSvdd/2f61c187a3197c807b239202b72d9c84cb46400c/__init__.py

--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/ClusterSvdd/2f61c187a3197c807b239202b72d9c84cb46400c/scripts/__init__.py

--------------------------------------------------------------------------------
/scripts/test_ad_svdd.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import sklearn.datasets as datasets
import numpy as np

from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.svdd_dual_qp import SvddDualQP
from ClusterSVDD.cluster_svdd import ClusterSvdd


def generate_data_uniform(datapoints, cluster_dir_alphas=(10, 10, 10), outlier_frac=0.1, feats=2, noise_feats=0):
    cluster = len(cluster_dir_alphas)
    X = np.zeros((feats, datapoints))
    y = np.zeros(datapoints)

    num_noise = int(np.floor(datapoints*outlier_frac))

    samples = np.random.dirichlet(cluster_dir_alphas, 1)[0]
    samples = np.array(samples*(datapoints-num_noise), dtype=int)
    print(samples, sum(samples))
    if np.sum(samples)+num_noise < datapoints:
        print('Add another sample..')
        num_noise += datapoints-(np.sum(samples)+num_noise)
    print(num_noise+np.sum(samples), datapoints)

    cnt = num_noise
    for i in range(cluster):
        m = np.random.randn(feats-noise_feats)*8.
        # cov = np.diag(np.random.rand(feats-noise_feats))
        cov = 2.*np.random.rand() * np.eye(feats-noise_feats)
        print(cov)
        X[:feats-noise_feats, cnt:cnt+samples[i]] = np.random.multivariate_normal(m, cov, samples[i]).T
        y[cnt:cnt+samples[i]] = i+1
        cnt += samples[i]

    mul = np.max(np.abs(X))*2.
    print(mul)
    X[:, :num_noise] = 2.*mul*(np.random.rand(feats, num_noise)-0.5)
    y[:num_noise] = -1

    X[feats-noise_feats:, :] = 2.*mul*np.random.randn(noise_feats, datapoints)

    # normalize each feature [-1,+1]
    X = X / np.repeat(np.max(np.abs(X), axis=1)[:, np.newaxis], datapoints, axis=1)

    return X, y


def generate_data_moons(datapoints, outlier_frac=0.1, noise_feats=0.05):
    X = np.zeros((datapoints, 2))
    y = np.zeros(datapoints)

    num_noise = int(np.floor(datapoints*outlier_frac))

    X[num_noise:, :], y[num_noise:] = datasets.make_moons(n_samples=datapoints-num_noise, noise=noise_feats)
    X = X.T
    y[num_noise:] += 1

    mul = np.max(np.abs(X))*1.5
    print(mul)
    X[:, :num_noise] = 2.*mul*(np.random.rand(2, num_noise)-0.5)
    y[:num_noise] = -1

    # normalize each feature [-1,+1]
    X = X / np.repeat(np.max(np.abs(X), axis=1)[:, np.newaxis], datapoints, axis=1)

    return X, y


def generate_data(datapoints, norm_dir_alpha=10., anom_dir_alpha=4., anom_cluster=[0, 0, 0, 1, 1, 1], feats=2):
    cluster = len(anom_cluster)
    X = np.zeros((feats, datapoints))
    y = np.zeros(datapoints)

    cluster_dir_alphas = np.array(anom_cluster)*anom_dir_alpha + (1-np.array(anom_cluster))*norm_dir_alpha
    samples = np.random.dirichlet(cluster_dir_alphas, 1)[0]
    samples = np.array(samples*datapoints, dtype=int)
    if np.sum(samples) < datapoints:
        print('Add another sample..')
        samples[-1] += 1

    cnt = 0
    anom_lbl = -1
    norm_lbl = 1
    for i in range(cluster):
        sigma = 8.
        if anom_cluster[i] == 1:
            sigma = 1.
        m = np.random.randn(feats)*sigma
        cov = np.diag(np.random.rand(feats))
        print(cov)
        X[:, cnt:cnt+samples[i]] = np.random.multivariate_normal(m, cov, samples[i]).T
        label = norm_lbl
        if anom_cluster[i] == 1:
            label = anom_lbl
            anom_lbl -= 1
        else:
            label = norm_lbl
            norm_lbl += 1
        y[cnt:cnt+samples[i]] = label
        cnt += samples[i]

    # normalize each feature [-1,+1]
    X = X / np.repeat(np.max(np.abs(X), axis=1)[:, np.newaxis], datapoints, axis=1)

    return X, y


def evaluate(nu, k, data, y, train, test, use_kernel=False, kparam=0.1, plot=False):

    # fix the initialization for all methods
    membership = np.random.randint(0, k, y.size)
    svdds = list()
    for l in range(k):
        if use_kernel:
            svdds.append(SvddDualQP('rbf', kparam, nu))
        else:
            svdds.append(SvddPrimalSGD(nu))

    svdd = ClusterSvdd(svdds)
    svdd.fit(data[:, train].copy(), max_iter=60, init_membership=membership[train])
    scores, classes = svdd.predict(data[:, test].copy())

    # normal classes are positive (e.g. 1,2,3,..); the anomalous class is -1
    print(y[test])
    true_lbl = y[test]
    true_lbl[true_lbl < 0] = -1  # convert outliers to a single outlier class
    ari = metrics.cluster.adjusted_rand_score(true_lbl, classes)
    if nu < 1.0:
        classes[scores > 0.] = -1
        ari = metrics.cluster.adjusted_rand_score(true_lbl, classes)
    print('ARI=', ari)

    fpr, tpr, _ = metrics.roc_curve(y[test] < 0., scores, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    print('AUC=', auc)

    if plot:
        plt.figure(1)
        anom_inds = np.where(y == -1)[0]
        plt.plot(data[0, anom_inds], data[1, anom_inds], '.g', markersize=2)
        nom_inds = np.where(y != -1)[0]
        plt.plot(data[0, nom_inds], data[1, nom_inds], '.r', markersize=6)

        an = np.linspace(0, 2*np.pi, 100)
        for l in range(k):
            r = np.sqrt(svdd.svdds[l].radius2)
            if hasattr(svdd.svdds[l], 'c'):
                plt.plot(svdd.svdds[l].c[0], svdd.svdds[l].c[1],
                         'xb', markersize=6, linewidth=2, alpha=0.7)
                plt.plot(r*np.sin(an)+svdd.svdds[l].c[0], r*np.cos(an)+svdd.svdds[l].c[1],
                         '-b', linewidth=2, alpha=0.7)
        plt.show()
    return ari, auc


if __name__ == '__main__':
    num_train = 600
    num_test = 600

    train = np.array(range(num_train), dtype='i')
    test = np.array(range(num_train, num_train + num_test), dtype='i')

    reps = 1
    nus = [0.1, 0.5, 0.8, 1.0]
    ks = [3]
    aris = np.zeros((reps, len(nus), len(ks)))
    aucs = np.zeros((reps, len(nus), len(ks)))

    data, y = generate_data_uniform(num_train + num_test, cluster_dir_alphas=(10, 10, 10), outlier_frac=0.5, feats=2, noise_feats=0)
    # data, y = generate_data(num_train + num_test, norm_dir_alpha=10., anom_dir_alpha=2., anom_cluster=[0, 0, 0, 1, 1, 1, 1, 1, 1], feats=2)
    # data, y = generate_data_moons(num_train + num_test, outlier_frac=0.3, noise_feats=0.05)

    for r in range(reps):
        inds = np.random.permutation((num_test + num_train))
        data = data[:, inds]
        y = y[inds]

        ssseeed = np.random.randint(low=0, high=1101010)
        for nu in range(len(nus)):
            for k in range(len(ks)):
                np.random.seed(ssseeed)
                aris[r, nu, k], aucs[r, nu, k] = evaluate(nus[nu], ks[k], data, y, train, test,
                                                          use_kernel=False, kparam=1., plot=False)

    print('\n')
    for nu in range(len(nus)):
        print('')
        for k in range(len(ks)):
            print('k={0} nu={1}: ARI = {2:1.2f}+/-{4:1.2f} AUC = {3:1.2f}+/-{5:1.2f}'.format(
                ks[k], nus[nu], np.mean(aris[:, nu, k]), np.mean(aucs[:, nu, k]),
                np.std(aris[:, nu, k]), np.std(aucs[:, nu, k])))

    print('\nDONE :)')

--------------------------------------------------------------------------------
/scripts/test_anom.py:
--------------------------------------------------------------------------------
__author__ = 'Nico Goernitz'


import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np

from ClusterSVDD.svdd_dual_qp import SvddDualQP
from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.cluster_svdd import ClusterSvdd


def generate_data(datapoints, outlier_frac=0.1, dims=2):
    X = np.zeros((dims, datapoints))
    y = np.zeros(datapoints)
    num_noise = int(np.floor(datapoints*outlier_frac))
    num_dpc = int(np.floor(float(datapoints-num_noise)/2.0))
    X[:, :num_noise] = 0.15*np.random.randn(dims, num_noise) \
        + np.array([0.7, -0.]).reshape((2, 1)).dot(np.ones((1, num_noise)))
    y[:num_noise] = -1

    cnt = num_noise
    X[:, cnt:cnt+num_dpc] = 0.5*np.random.randn(dims, num_dpc) \
        + np.array([-1.5, -2.]).reshape((2, 1)).dot(np.ones((1, num_dpc)))
    y[cnt:cnt+num_dpc] = 1
    cnt += num_dpc

    num_dpc = datapoints-cnt
    X[:, cnt:] = 0.6*np.random.randn(dims, num_dpc) \
        + np.array([-1.5, +1.]).reshape((2, 1)).dot(np.ones((1, num_dpc)))
    y[cnt:] = 1
    return X, y


def plot_results(fname):
    foo = np.load(fname)
    maucs = foo['maucs']
    saucs = foo['saucs']
    nus = foo['nus']
    ks = foo['ks']
    reps = foo['reps']

    plt.figure(1)
    np.random.seed(10)
    cols = np.random.rand(maucs.shape[1], 3)
    fmts = ['-x', '--o', '--D', '--s', '--H']
    for i in range(maucs.shape[1]):
        plt.errorbar(nus, maucs[:, i], saucs[:, i]/np.sqrt(reps), fmt=fmts[i], color=cols[i, :],
                     ecolor=cols[i, :], linewidth=2.0, elinewidth=1.0, alpha=0.8)
    plt.xlim((-0.0, 0.21))
    plt.ylim((0.35, 1.0))
    plt.xticks(nus, ['1', '2.5', '5', '7.5', '10', '15', '20'])
    plt.grid()
    plt.xlabel(r'regularization parameter $\nu$', fontsize=14)
    plt.ylabel(r'Anomaly Detection Accuracy (in AUROC)', fontsize=14)
    names = ['SVDD']
    for i in range(1, maucs.shape[1]):
        names.append('ClusterSVDD (k={0})'.format(ks[i]))
    plt.legend(names, loc=4, fontsize=14)
    plt.show()


def evaluate(res_filename, nus, sigmas, ks, reps, ntrain, ntest, nval, use_kernels, anom_frac):
    train = np.array(range(ntrain-nval), dtype='i')
    val = np.array(range(ntrain-nval, ntrain), dtype='i')
    test = np.array(range(ntrain, ntrain+ntest), dtype='i')
    aucs = np.zeros((reps, len(nus), len(ks)))
    for n in range(reps):
        # generate new gaussians
        data, y = generate_data(ntrain+ntest, outlier_frac=anom_frac)
        inds = np.random.permutation(range(ntest+ntrain))
        data = data[:, inds]
        y = y[inds]
        for i in range(len(nus)):
            for k in range(len(ks)):
                # fix the initialization for all methods
                membership = np.random.randint(0, ks[k], y.size)

                max_auc = -1.0
                max_val_auc = -1.0
                for sigma in sigmas:
                    # build cluster svdd
                    svdds = list()
                    for l in range(ks[k]):
                        if use_kernels:
                            svdds.append(SvddDualQP('rbf', sigma, nus[i]))
                        else:
                            svdds.append(SvddPrimalSGD(nus[i]))

                    svdd = ClusterSvdd(svdds)
                    svdd.fit(data[:, train], init_membership=membership[train])
                    scores_val, _ = svdd.predict(data[:, val])
                    # model selection on validation data
                    fpr, tpr, _ = metrics.roc_curve(np.array(y[val] < 0., dtype='i'), scores_val, pos_label=1)
                    curr_auc = metrics.auc(fpr, tpr)
                    if curr_auc >= max_val_auc:
                        # store test data accuracy
                        scores, _ = svdd.predict(data[:, test])
                        fpr, tpr, _ = metrics.roc_curve(np.array(y[test] < 0., dtype='i'), scores, pos_label=1)
                        max_auc = metrics.auc(fpr, tpr)
                        max_val_auc = curr_auc
                aucs[n, i, k] = max_auc
    # means and standard deviations
    maucs = np.mean(aucs, axis=0)
    saucs = np.std(aucs, axis=0)
    print('AUCs')
    print(np.mean(aucs, axis=0))
    print('Stds')
    print(np.std(aucs, axis=0))
    # save results
    np.savez(res_filename, maucs=maucs, saucs=saucs, outlier_frac=nus,
             ntrain=ntrain, ntest=ntest, reps=reps, nus=nus, ks=ks, sigmas=sigmas)


if __name__ == '__main__':
    nus = [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2]
    sigmas = [0.1, 0.25, 0.5, 1.0, 2.0]
    ks = [1, 2, 3, 4]

    reps = 50         # number of repetitions for performance measures
    num_train = 1000  # total number of data points is num_train+num_test
    num_test = 2000
    num_val = 400     # num_val is part of ntrain
    use_kernels = False

    anom_frac = 0.05  # fraction of anomalies in the generated dataset

    do_plot = True
    do_evaluation = False

    res_filename = 'res_anom_{0}_{1}_{2}_rbf.npz'.format(reps, len(ks), len(nus))
    if not use_kernels:
        sigmas = [1.0]
        res_filename = 'res_anom_{0}_{1}_{2}.npz'.format(reps, len(ks), len(nus))

    if do_evaluation:
        evaluate(res_filename, nus, sigmas, ks, reps, num_train, num_test, num_val, use_kernels, anom_frac)
    if do_plot:
        plot_results(res_filename)

    print('DONE :)')

--------------------------------------------------------------------------------
/scripts/test_clustersvdd.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import matplotlib.pylab as pl
import numpy as np

from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.cluster_svdd import ClusterSvdd


def generate_gaussians(datapoints, cluster, noise_frac=0.1, dims=2):
    mean_mul = 50.
    vars = [4.1, 4.1]

    num_noise = int(np.floor(datapoints*noise_frac))
    num_dpc = int(np.floor(float(datapoints-num_noise)/float(cluster)))

    X = np.zeros((dims, datapoints))
    X[:, :num_noise] = 100.*(2.*np.random.rand(dims, num_noise)-1.)

    y = np.zeros(datapoints)
    y[:num_noise] = -1
    cnt = num_noise

    for i in range(cluster):
        t = 4.
        v = np.diag((t*vars[0] + (1.-t)*vars[1]) * np.ones(dims))

        # draw the mean
        m = mean_mul * (4.*np.random.rand(dims, 1)-2.)
        if i == cluster-1:
            num_dpc = datapoints-cnt
        m = m.dot(np.ones((1, num_dpc)))
        # generate the cluster gaussian
        X[:, cnt:cnt+num_dpc] = v.dot(4.*np.random.randn(dims, num_dpc)) + m

        y[cnt:cnt+num_dpc] = i
        cnt += num_dpc

    # normalize each feature
    X = X / np.repeat(np.max(np.abs(X), axis=1)[:, np.newaxis]/2., datapoints, axis=1)
    return X, y


def train(cluster, data, nu, membership):
    svdds = []
    for c in range(cluster):
        svdds.append(SvddPrimalSGD(nu))
    svdd = ClusterSvdd(svdds, nu=nu)
    cinds = svdd.fit(data, init_membership=membership, max_svdd_iter=10000, max_iter=40)
    print(cinds)
    return svdd, cinds


if __name__ == '__main__':
    np.random.seed(1000)
    nu = 0.1       # CLUSTER - DUAL, PRIMAL
    n_cluster = 3  # 'k' number of clusters for the methods and data generation

    Dtrain, ytrain = generate_gaussians(1000, n_cluster, noise_frac=0.01)
    membership = np.random.randint(0, n_cluster, ytrain.size)

    # generate test data grid
    delta = 0.1
    x = np.arange(-2.0-delta, 2.0+delta, delta)
    y = np.arange(-2.0-delta, 2.0+delta, delta)
    (X, Y) = np.meshgrid(x, y)
    (sx, sy) = X.shape
    Xf = np.reshape(X, (1, sx*sy))
    Yf = np.reshape(Y, (1, sx*sy))
    Dtest = np.append(Xf, Yf, axis=0)

    # The code below is basically only for beautiful visualizations
    plt.figure(1)

    # train, predict and plot for the chosen nu
    svdd, cinds = train(n_cluster, Dtrain, nu, membership)
    scores, cres = svdd.predict(Dtrain)
    res, cres = svdd.predict(Dtest)

    Z = np.reshape(res, (sx, sy))
    cs = plt.contourf(X, Y, Z, cmap=plt.cm.bone, alpha=0.2)

    cols = np.random.rand(3, n_cluster+1)
    cols[:, 0] = np.array([0.95, 0.1, 0.1])
    cols[:, 1] = np.array([0.9, 0.3, 0.7])
    cols[:, 2] = np.array([0.4, 0.9, 0.3])
    cols[:, 3] = np.array([0.4, 0.4, 0.9])
    for c in range(n_cluster):
        inds = np.where(cinds == c)[0]
        plt.scatter(Dtrain[0, inds], Dtrain[1, inds], 30, alpha=0.7, c=cols[:, c])
        pl.gca().add_patch(pl.Circle((svdd.svdds[c].c[0], svdd.svdds[c].c[1]),
                                     np.sqrt(svdd.svdds[c].radius2), alpha=0.6,
                                     color=cols[:, c], fill=True))

    plt.xlim((-2., 2.))
    plt.ylim((-2., 2.))
    plt.yticks([], [])
    plt.xticks([], [])

    plt.show()
    pl.show()
    print('finished')

--------------------------------------------------------------------------------
/scripts/test_exm.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np

from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.svdd_dual_qp import SvddDualQP
from ClusterSVDD.cluster_svdd import ClusterSvdd


def generate_gaussians(datapoints, cluster, noise_frac=0.1, dims=2):
    mean_mul = 50.
    vars = [4.1, 4.1]

    num_noise = int(np.floor(datapoints*noise_frac))
    num_dpc = int(np.floor(float(datapoints-num_noise)/float(cluster)))

    X = np.zeros((dims, datapoints))
    X[:, :num_noise] = 100.*(2.*np.random.rand(dims, num_noise)-1.)

    y = np.zeros(datapoints)
    y[:num_noise] = -1
    cnt = num_noise

    for i in range(cluster):
        t = np.random.rand()
        v = np.diag((t*vars[0] + (1.-t)*vars[1]) * np.ones(dims))

        # draw the mean
        m = mean_mul * (2.*np.random.rand(dims, 1)-1.)
        if i == cluster-1:
            num_dpc = datapoints-cnt
        m = m.dot(np.ones((1, num_dpc)))
        # generate the cluster gaussian
        X[:, cnt:cnt+num_dpc] = v.dot(np.random.randn(dims, num_dpc)) + m

        y[cnt:cnt+num_dpc] = i
        cnt += num_dpc

    # normalize each feature
    X = X / np.repeat(np.max(np.abs(X), axis=1)[:, np.newaxis]/2., datapoints, axis=1)
    return X, y


def train(cluster, data, nu, membership, use_primal=True):
    svdds = []
    for c in range(cluster):
        if use_primal:
            svdds.append(SvddPrimalSGD(nu))
        else:
            svdds.append(SvddDualQP('rbf', 0.4, nu))
    svdd = ClusterSvdd(svdds, nu=nu)
    cinds = svdd.fit(data, init_membership=membership, max_svdd_iter=10000, max_iter=40)
    print(cinds)
    return svdd, cinds


if __name__ == '__main__':
    np.random.seed(10)
    nus = [0.14]  # ANOM - PRIMAL
    nus = [0.07]  # ANOM - DUAL
    nus = [0.8]   # CLUSTER - DUAL, PRIMAL
    n_cluster = 4       # 'k' number of clusters for the methods and data generation
    use_primal = True   # use primal sgd svdd or dual kernel qp
    ad_setting = True   # either ad or cluster setting

    Dtrain, ytrain = generate_gaussians(1000, n_cluster, noise_frac=0.01)
    membership = np.random.randint(0, n_cluster, ytrain.size)

    # generate test data grid
    delta = 0.1
    x = np.arange(-2.0-delta, 2.0+delta, delta)
    y = np.arange(-2.0-delta, 2.0+delta, delta)
    (X, Y) = np.meshgrid(x, y)
    (sx, sy) = X.shape
    Xf = np.reshape(X, (1, sx*sy))
    Yf = np.reshape(Y, (1, sx*sy))
    Dtest = np.append(Xf, Yf, axis=0)

    # For each \nu in the nus list, train, predict and plot the data
    for i in range(len(nus)+2):
        if 0 < i < len(nus)+1:
            (svdd, cinds) = train(n_cluster, Dtrain, nus[i-1], membership, use_primal=use_primal)
            (scores, cres) = svdd.predict(Dtrain)
            print('Fraction {0}-{1}'.format(nus[i-1], float(np.sum(scores >= 0.)) / float(scores.size)))
            (res, cres) = svdd.predict(Dtest)
        elif i == 0:
            if ad_setting:
                (svdd, cinds) = train(1, Dtrain, nus[i], membership, use_primal=use_primal)
            else:
                (svdd, cinds) = train(n_cluster, Dtrain, 1.0, membership, use_primal=use_primal)
            (scores, cres) = svdd.predict(Dtrain)
            print('Fraction {0}-{1}'.format(nus[i], float(np.sum(scores >= 0.)) / float(scores.size)))
            (res, cres) = svdd.predict(Dtest)
        else:
            scores = ytrain < 0
            cinds = ytrain

        # The code below is basically only for beautiful visualizations
        plt.figure(1)
        plt.subplot(1, len(nus)+2, (i+1) % (len(nus)+2)+1)
        if i < len(nus)+1:
            Z = np.reshape(res, (sx, sy))
            # cs = plt.contourf(X, Y, Z, alpha=0.5, cmap=plt.cm.bone)
            if ad_setting:
                cs2 = plt.contour(X, Y, Z, [0.0], linewidths=2.0, colors='w', alpha=0.8)

        if not ad_setting:
            cols = np.random.rand(3, n_cluster+1)
            cols[:, 0] = np.array([0.95, 0.1, 0.1])
            cols[:, 1] = np.array([0.9, 0.3, 0.7])
            cols[:, 2] = np.array([0.4, 0.9, 0.3])
            cols[:, 3] = np.array([0.4, 0.4, 0.9])
            cols[:, 4] = np.array([0.7, 0.8, 0.99])

            if i > len(nus):
                cols[1, :] = cols[1, np.array([1, 2, 3, 4, 0])]

            for c in range(n_cluster+1):
                inds = np.where(cinds == c-1)[0]
                plt.scatter(Dtrain[0, inds], Dtrain[1, inds], 30, c=cols[:, c])
        else:
            # anomaly detection setting
            inds = np.where(scores > 0.)[0]
            plt.scatter(Dtrain[0, inds], Dtrain[1, inds], 30, c='r')
            inds = np.where(scores <= 0.)[0]
            plt.scatter(Dtrain[0, inds], Dtrain[1, inds], 30, c='g')

        # title
        if i == 0:
            if use_primal:
                if ad_setting:
                    plt.title(r'SVDD', fontsize=16)
                else:
                    plt.title(r'K-Means', fontsize=16)
            else:
                if ad_setting:
                    plt.title(r'Kernel SVDD', fontsize=16)
                else:
                    plt.title(r'Kernel K-Means', fontsize=16)
        elif i < len(nus)+1:
            if use_primal:
                plt.title(r'ClusterSVDD', fontsize=16)
            else:
                plt.title(r'Kernel ClusterSVDD', fontsize=16)
        else:
            plt.title(r'Ground truth', fontsize=16)
        plt.xlim((-2., 2.))
        plt.ylim((-2., 2.))
        plt.yticks(range(-2, 2), [])
        plt.xticks(range(-2, 2), [])

    plt.show()
    print('finished')

--------------------------------------------------------------------------------
/scripts/test_impl.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np

from ClusterSVDD.svdd_dual_qp import SvddDualQP
from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD


if __name__ == '__main__':
    nu = 0.15  # outlier fraction

    # generate raw training data
    Dtrain = np.random.randn(2, 1000)
    Dtrain /= np.max(np.abs(Dtrain))

    # train dual svdd
    svdd = SvddDualQP('linear', 0.1, nu)
    svdd.fit(Dtrain)

    # train primal svdd
    psvdd = SvddPrimalSGD(nu)
    psvdd.fit(Dtrain, max_iter=1000, prec=1e-4)

    # print solutions
    print('\n dual-svdd: obj={0} T={1}.'.format(svdd.pobj, svdd.radius2))
    print('primal-svdd: obj={0} T={1}.\n'.format(psvdd.pobj, psvdd.radius2))

    # generate test data grid
    delta = 0.1
    x = np.arange(-2.0-delta, 2.0+delta, delta)
    y = np.arange(-2.0-delta, 2.0+delta, delta)
    X, Y = np.meshgrid(x, y)
    (sx, sy) = X.shape
    Xf = np.reshape(X, (1, sx*sy))
    Yf = np.reshape(Y, (1, sx*sy))
    Dtest = np.append(Xf, Yf, axis=0)
    if Dtrain.shape[0] > 2:
        Dtest = np.append(Dtest, np.random.randn(Dtrain.shape[0]-2, sx*sy), axis=0)
    print(Dtest.shape)

    res = svdd.predict(Dtest)
    pres = psvdd.predict(Dtest)

    # nice visualization
    plt.figure(1)
    plt.subplot(1, 2, 1)
    plt.title('Dual QP SVDD')
    Z = np.reshape(res, (sx, sy))
    plt.contourf(X, Y, Z)
    plt.contour(X, Y, Z, [0.0], linewidths=3.0, colors='k')
    plt.scatter(Dtrain[0, svdd.get_support_inds()], Dtrain[1, svdd.get_support_inds()], 40, c='k')
    plt.scatter(Dtrain[0, :], Dtrain[1, :], 10)
    plt.xlim((-2., 2.))
    plt.ylim((-2., 2.))
    plt.yticks(range(-2, 2), [])
    plt.xticks(range(-2, 2), [])

    plt.subplot(1, 2, 2)
    plt.title('Primal Subgradient SVDD')
    Z = np.reshape(pres, (sx, sy))
    plt.contourf(X, Y, Z)
    plt.contour(X, Y, Z, [0.0], linewidths=3.0, colors='k')
    plt.scatter(Dtrain[0, :], Dtrain[1, :], 10)
    plt.xlim((-2., 2.))
    plt.ylim((-2., 2.))
    plt.yticks(range(-2, 2), [])
    plt.xticks(range(-2, 2), [])

    plt.show()

    print('finished')
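For the linear kernel used in test_impl.py, the dual solution encodes the hypersphere center explicitly as c = sum_i alpha_i x_i, so the two solvers can be cross-checked directly. A sketch that could be appended to the script above, reusing its variables; agreement is only approximate since both solvers are numerical:

# hypothetical cross-check: recover the center from the dual solution
c_dual = Dtrain[:, svdd.get_support_inds()].dot(svdd.get_support()).ravel()
print(np.linalg.norm(c_dual - psvdd.c))  # should be small for matching nu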
--------------------------------------------------------------------------------
/scripts/test_real.py:
--------------------------------------------------------------------------------
import sklearn.metrics as metrics
import numpy as np

from ClusterSVDD.svdd_dual_qp import SvddDualQP
from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.cluster_svdd import ClusterSvdd


def load_data_set(fname, num_data, outlier_frac, train_inds):
    from sklearn.datasets import load_svmlight_file
    X, y = load_svmlight_file(fname)

    print(X.shape)
    y -= np.min(y)  # classes should start from zero: 0,1,2,3,...
    inds = np.array([], dtype='i')
    for i in range(int(max(y))+1):
        inds = np.append(inds, np.where(y == i)[0])

    print(inds.shape)
    X = X.toarray()
    X = X[inds, :]
    y = y[inds]

    inds = np.random.permutation(range(y.size))
    X = X[inds[:num_data], :].T
    y = y[inds[:num_data]]

    # induce anomalies
    anoms = int(float(num_data)*outlier_frac)
    X[:, :anoms] = 1.*(np.random.rand(X.shape[0], anoms)*2.-1.)
    y[:anoms] = -1

    print(np.unique(y))
    return X, y


def evaluate(res_filename, dataset, nus, ks, outlier_frac,
             reps, num_train, num_val, num_test, use_kernels=False):
    train = np.array(range(num_train-num_val), dtype='i')
    val = np.array(range(num_train-num_val, num_train), dtype='i')
    test = np.array(range(num_train, num_train + num_test), dtype='i')

    aris = np.zeros((reps, len(nus), len(ks)))
    aucs = np.zeros((reps, len(nus), len(ks)))

    val_aris = np.zeros((reps, len(nus), len(ks)))
    val_aucs = np.zeros((reps, len(nus), len(ks)))

    for n in range(reps):
        inds = np.random.permutation(range(num_test + num_train))
        data, y = load_data_set(dataset, num_train + num_test, outlier_frac, inds[:num_train])
        data = data[:, inds]
        y = y[inds]
        for k in range(len(ks)):
            # fix the initialization for all methods
            membership = np.random.randint(0, ks[k], y.size)
            for i in range(len(nus)):
                svdds = list()
                for l in range(ks[k]):
                    if use_kernels:
                        svdds.append(SvddDualQP('rbf', 20.0, nus[i]))
                    else:
                        svdds.append(SvddPrimalSGD(nus[i]))
                svdd = ClusterSvdd(svdds)
                svdd.fit(data[:, train].copy(), init_membership=membership[train])
                # test error
                scores, classes = svdd.predict(data[:, test].copy())

                # evaluate clustering abilities
                ari = metrics.cluster.adjusted_rand_score(y[test], classes)
                if nus[i] < 1.0:
                    inds = np.where(scores <= 0.)[0]
                    ari = metrics.cluster.adjusted_rand_score(y[test[inds]], classes[inds])
                aris[n, i, k] = ari

                # ...and anomaly detection accuracy
                fpr, tpr, _ = metrics.roc_curve(np.array(y[test] < 0., dtype='i'), scores, pos_label=1)
                aucs[n, i, k] = metrics.auc(fpr, tpr)

                # validation error
                scores, classes = svdd.predict(data[:, val].copy())
                # evaluate clustering abilities
                ari = metrics.cluster.adjusted_rand_score(y[val], classes)
                if nus[i] < 1.0:
                    inds = np.where(scores <= 0.)[0]
                    ari = metrics.cluster.adjusted_rand_score(y[val[inds]], classes[inds])
                val_aris[n, i, k] = ari

                # ...and anomaly detection accuracy
                fpr, tpr, _ = metrics.roc_curve(np.array(y[val] < 0., dtype='i'), scores, pos_label=1)
                val_aucs[n, i, k] = metrics.auc(fpr, tpr)
    print('---------------------------------------------------')
    maris = np.mean(aris, axis=0)
    saris = np.std(aris, axis=0)
    print('(Test) ARI:')
    print(np.mean(aris, axis=0))
    print(np.std(aris, axis=0))

    val_maris = np.mean(val_aris, axis=0)
    val_saris = np.std(val_aris, axis=0)
    print('(Val) ARI:')
    print(val_maris)
    print(val_saris)

    print('---------------------------------------------------')
    maucs = np.mean(aucs, axis=0)
    saucs = np.std(aucs, axis=0)
    print('(Test) AUC:')
    print(np.mean(aucs, axis=0))
    print(np.std(aucs, axis=0))

    val_maucs = np.mean(val_aucs, axis=0)
    val_saucs = np.std(val_aucs, axis=0)
    print('(Val) AUC:')
    print(val_maucs)
    print(val_saucs)
    print('---------------------------------------------------')

    res = np.zeros(4)
    res_stds = np.zeros(4)

    # best svdd result (assume col 0 is k=1)
    svdd_ind = np.argmax(val_maucs[:, 0])
    print('SVDD best AUC={0}'.format(maucs[svdd_ind, 0]))
    csvdd_ind = np.argmax(val_maucs)
    i1, i2 = np.unravel_index(csvdd_ind, maucs.shape)
    print('ClusterSVDD best AUC={0}'.format(maucs[i1, i2]))
    res[0] = maucs[svdd_ind, 0]
    res_stds[0] = saucs[svdd_ind, 0]
    res[1] = maucs[i1, i2]
    res_stds[1] = saucs[i1, i2]

    # best k-means result (assume row 0 is nu=1.0)
    km_ind = np.argmax(val_maris[0, :])
    print('k-means best ARI={0}'.format(maris[0, km_ind]))
    csvdd_ind = np.argmax(val_maris)
    i1, i2 = np.unravel_index(csvdd_ind, maris.shape)
    print('ClusterSVDD best ARI={0}'.format(maris[i1, i2]))
    res[2] = maris[0, km_ind]
    res_stds[2] = saris[0, km_ind]
    res[3] = maris[i1, i2]
    res_stds[3] = saris[i1, i2]
    print('---------------------------------------------------')

    return res, res_stds


if __name__ == '__main__':
    dataset_name = "../../segment.scale.txt"  # 7c
    # dataset_name = "../../satimage.scale.txt"  # 6c

    nus = [1.0, 0.95, 0.9, 0.5, 0.1, 0.01]
    outlier_fracs = [0.0, 0.02, 0.05, 0.1, 0.15]  # fraction of uniform noise in the generated data
    reps = 10  # number of repetitions for performance measures

    ks = [1, 5, 7, 10, 14]  # segment
    num_train = 1155
    num_test = 1155
    num_val = 250

    if 'satimage' in dataset_name:
        ks = [1, 3, 6, 9]
        # ks = [1, 3, 5, 6, 7]
        num_train = 2217
        num_test = 2218
        num_val = 400

    res_filename = 'res_real_{0}_{1}.npz'.format(reps, dataset_name[6:])

    # res: 0:AUC-SVDD, 1:AUC-CSVDD, 2:ARI-KMEANS, 3:ARI-CSVDD
    res = np.zeros((len(outlier_fracs), 4))
    res_stds = np.zeros((len(outlier_fracs), 4))
    for i in range(len(outlier_fracs)):
        res[i, :], res_stds[i, :] = evaluate(res_filename, dataset_name,
                                             nus, ks, outlier_fracs[i], reps,
                                             num_train, num_val, num_test, use_kernels=False)

    np.savez(res_filename, dataset=dataset_name, res=res, res_stds=res_stds,
             outlier_fracs=outlier_fracs, ntrain=num_train, nval=num_val, ntest=num_test, reps=reps, nus=nus, ks=ks)

    print('==========================================')
    for i in range(len(outlier_fracs)):
        line = '{0}\\%'.format(int(outlier_fracs[i]*100.))
        for j in range(4):
            line += ' & {0:1.2f}/{1:1.2f}'.format(res[i, j], res_stds[i, j])
        line += ' \\\\'
        print(line)
    print('==========================================')

    print('DONE :)')

--------------------------------------------------------------------------------
/scripts/test_robust.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np

from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.cluster_svdd import ClusterSvdd
from ClusterSVDD.svdd_dual_qp import SvddDualQP


def generate_data(datapoints, outlier_frac=0.1, dims=2):
    X = np.zeros((dims, datapoints))
    y = np.zeros(datapoints)

    num_noise = int(np.floor(datapoints*outlier_frac))
    num_dpc = int(np.floor(float(datapoints-num_noise)/2.0))

    X[:, :num_noise] = 0.5*np.random.randn(dims, num_noise) + 0.
    y[:num_noise] = -1

    cnt = num_noise
    X[:, cnt:cnt+num_dpc] = 1.5*np.random.randn(dims, num_dpc) - 1.
    y[cnt:cnt+num_dpc] = 1
    cnt += num_dpc

    X[:, cnt:] = 0.5*np.random.randn(dims, y.size-cnt) + 1.
    y[cnt:] = 2
    return X, y


def plot_results(res_filename):
    # note: relies on the global ks defined in the __main__ block below
    foo = np.load(res_filename)
    maris = foo['maris']
    saris = foo['saris']
    nus = foo['nus']
    reps = foo['reps']

    plt.figure(1)
    np.random.seed(2)
    cols = np.random.rand(maris.shape[1], 3)
    fmts = ['-->', '-.o', '-D', '--s', '--H']
    for i in range(maris.shape[1]):
        plt.errorbar(nus, maris[:, i], saris[:, i]/np.sqrt(reps), fmt=fmts[i], color=cols[i, :],
                     ecolor=cols[i, :], linewidth=2.0, elinewidth=1.0, alpha=0.8)
    for i in range(maris.shape[1]):
        plt.errorbar(nus[-1], maris[-1, i], saris[-1, i]/np.sqrt(reps),
                     color='r', ecolor='r', fmt=fmts[i][-1], markersize=10, linewidth=4.0, elinewidth=4.0, alpha=0.7)

    plt.xlim((-0.05, 1.05))
    plt.ylim((0.2, .8))
    plt.xticks([0.0, 0.25, 0.5, 0.75, 1.0], ['0.0', '0.25', '0.5', '0.75', '1.0 \n= Kernel $k$-means'], fontsize=14)
    plt.grid()
    plt.xlabel(r'regularization parameter $\nu$', fontsize=14)
    plt.ylabel(r'Adjusted Rand Index (ARI)', fontsize=14)
    names = list()
    for i in range(maris.shape[1]):
        names.append('ClusterSVDD ($k$={0})'.format(ks[i]))
    plt.legend(names, loc=4, fontsize=14)
    plt.show()


def evaluate(res_filename, nus, ks, outlier_frac, reps, num_train, num_test, use_primal=True):
    train = np.array(range(num_train), dtype='i')
    test = np.array(range(num_train, num_train + num_test), dtype='i')

    aris = np.zeros((reps, len(nus), len(ks)))
    for n in range(reps):
        # generate new gaussians
        data, y = generate_data(num_train + num_test, outlier_frac=outlier_frac)
        inds = np.random.permutation(range(num_test + num_train))
        data = data[:, inds]
        y = y[inds]
        for k in range(len(ks)):
            # fix the initialization for all methods
            membership = np.random.randint(0, ks[k], y.size)
            for i in range(len(nus)):
                svdds = list()
                for l in range(ks[k]):
                    if use_primal:
                        svdds.append(SvddPrimalSGD(nus[i]))
                    else:
                        svdds.append(SvddDualQP('rbf', 10.0, nus[i]))
                svdd = ClusterSvdd(svdds)
                svdd.fit(data[:, train].copy(), init_membership=membership[train])
                _, classes = svdd.predict(data[:, test].copy())
                # evaluate clustering abilities
                inds = np.where(y[test] >= 0)[0]
                aris[n, i, k] = metrics.cluster.adjusted_rand_score(y[test[inds]], classes[inds])

    print(aris)
    print('')
    maris = np.mean(aris, axis=0)
    saris = np.std(aris, axis=0)
    print(np.mean(aris, axis=0))
    print(np.std(aris, axis=0))
    np.savez(res_filename, maris=maris, saris=saris, outlier_frac=outlier_frac,
             ntrain=num_train, ntest=num_test, reps=reps, nus=nus)


if __name__ == '__main__':
    nus = (np.arange(1, 21)/20.)
    ks = [2, 3, 4]

    outlier_frac = 0.05  # fraction of uniform noise in the generated data
    # outlier_frac = 0.1
    reps = 50  # number of repetitions for performance measures
    num_train = 1000
    num_test = 2000

    do_plot = True
    do_evaluation = False

    res_filename = 'res_robust_{0}_{1}_{2}.npz'.format(reps, len(ks), len(nus))

    if do_evaluation:
        evaluate(res_filename, nus, ks, outlier_frac, reps, num_train, num_test, use_primal=False)
    if do_plot:
        plot_results(res_filename)

    print('DONE :)')

--------------------------------------------------------------------------------
/scripts/test_struct.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np
import time as time

from numba import njit

from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.cluster_svdd import ClusterSvdd


def generate_seqs(lens, block_len, cluster=3, dims=3):
    classes = np.random.randint(0, cluster)
    seqs = 1.0*np.random.randn(dims, lens)
    states = np.zeros(lens, dtype='i')
    y = classes
    start = np.random.randint(low=0, high=lens-block_len+1)
    states[start:start+block_len] = 1
    # seqs[0, start:start+block_len] = seqs[0, start:start+block_len]+0.5*classes-2.0*float(classes==0)
    seqs[classes, start:start+block_len] = seqs[0, start:start+block_len]+1.0
    return seqs, states, y


def generate_data(datapoints, cluster=3, outlier_frac=0.1, dims=3, plot=True):
    lens = 500
    X = []
    S = []
    y = np.ones(datapoints, dtype='i')
    idx = np.zeros(cluster, dtype='i')
    idx_anom = -1
    for i in range(datapoints):
        exm, states, y[i] = generate_seqs(lens, 250, cluster=cluster, dims=dims)
        prob = np.random.uniform()
        if prob < outlier_frac:
            idx_anom = i
            exm *= np.random.uniform(low=-0.1, high=+0.1, size=(dims, lens))
            exm *= np.exp(10.0*exm)
            y[i] = -1
        else:
            idx[y[i]] = i
        X.append(exm)
        S.append(states)

    if plot:
        plt.figure(1)

        for d in range(dims):
            for i in range(cluster):
                plt.subplot(1, cluster+1, i+1)
                plt.plot(range(lens), X[idx[i]][d, :]+d*6., '-r', alpha=0.7)
                plt.ylim((-5.0, 20.))
                plt.yticks([0.0])
                xinds = np.where(S[idx[i]] == 1)[0]
                plt.fill_between(xinds, -5, 20, color=[0.3, 0.3, 0.3], alpha=0.25)
                plt.title('Class {0}'.format(i), fontsize=14)
                plt.xlabel('Sequence index', fontsize=14)
                plt.ylabel('Feature 0   Feature 1   Feature 2', fontsize=14)

        plt.subplot(1, cluster+1, cluster+1)
        for d in range(dims):
            plt.plot(range(lens), X[idx_anom][d, :]+d*6., '-r', alpha=0.7)
        plt.yticks([0.0])
71 | 
72 | def preprocess_training_data(data_seqs, state_seqs, train_inds):
73 |     # estimate the transition and emission matrix given the training
74 |     # data only. Number of states is 2.
75 |     N = len(data_seqs)
76 |     F, _ = data_seqs[0].shape
77 |     phi = np.zeros((2*2 + F*2, N))
78 |     for n in train_inds:
79 |         phi[:, n] = get_joint_feature_map(data_seqs[n], state_seqs[n])
80 |         phi[:, n] /= np.linalg.norm(phi[:, n], ord=2)
81 |     return phi
82 | 
83 | 
84 | def preprocess_test_data(csvdd, X, S, inds):
85 |     # 1. for all i,k: y_i,k = argmax_y <c_k, psi(x_i, y)>
86 |     # 2. for all i: calculate membership z_i = argmin_k ||c_k - psi(x_i, y_i,k)||^2 - R_k
87 |     # 3. for all i: hamming loss delta(y_i, y_i,z_i)
88 |     N = inds.size
89 |     F, _ = X[0].shape
90 | 
91 |     pred_phis = np.zeros((2*2 + F*2, N))
92 |     true_states = []
93 |     pred_states = []
94 |     states = []
95 |     for n in range(N):
96 |         states.append(S[inds[n]])
97 |         true_states.append(S[inds[n]])
98 |         pred_states.append(S[inds[n]])
99 | 
100 |     min_scores = 1e12*np.ones(N, dtype='d')
101 |     for k in range(csvdd.clusters):
102 |         phis = np.zeros((2*2 + F*2, N))
103 |         for n in range(N):
104 |             sol = csvdd.svdds[k].c
105 |             states[n] = argmax(sol, X[inds[n]])
106 |             phis[:, n] = get_joint_feature_map(X[inds[n]], states[n])
107 |             # states[n] = true_states[n]
108 |             phis[:, n] /= np.linalg.norm(phis[:, n], ord=2)
109 | 
110 |         scores = csvdd.svdds[k].predict(phis)
111 |         minds = np.where(scores <= min_scores)[0]
112 |         pred_phis[:, minds] = phis[:, minds]
113 |         min_scores[minds] = scores[minds]
114 |         for i in minds:
115 |             pred_states[i] = states[i]
116 | 
117 |     return pred_phis, true_states, pred_states
118 | 
119 | def hamming_loss(y_true, y_pred):
120 |     N = len(y_pred)
121 |     loss = 0.0
122 |     for i in range(N):
123 |         loss += float(np.sum(y_true[i] != y_pred[i])) / float(y_pred[i].size)
124 |     return loss / float(N)
125 | 
126 | 
127 | @njit
128 | def argmax(sol, X):
129 |     # Viterbi decoding: computes the highest-scoring state sequence
130 |     # under the linear model in sol (transition + emission weights)
131 |     T = X.shape[1]
132 |     N = 2
133 | 
134 |     # get transition matrix from current solution
135 |     A = np.zeros((N, N), dtype=np.double)
136 |     for i in range(N):
137 |         for j in range(N):
138 |             A[i, j] = sol[i*N+j]
139 | 
140 |     # calc emission matrix from current solution and data points
141 |     F = X.shape[0]
142 |     em = np.zeros((N, T))
143 |     for t in range(T):
144 |         for s in range(N):
145 |             for f in range(F):
146 |                 em[s, t] += sol[N*N + s*F + f] * X[f, t]
147 | 
148 |     delta = np.zeros((N, T))
149 |     psi = np.zeros((N, T), dtype=np.int8)
150 |     # initialization
151 |     for i in range(N):
152 |         # use equal start probs for each state
153 |         delta[i, 0] = 0. + em[i, 0]
154 | 
155 |     # recursion
156 |     for t in range(1, T):
157 |         for i in range(N):
158 |             foo_argmax = 0
159 |             foo_max = -1e16
160 |             for l in range(N):
161 |                 foo = delta[l, t-1] + A[l, i] + em[i, t]
162 |                 if foo > foo_max:
163 |                     foo_max = foo
164 |                     foo_argmax = l
165 |             psi[i, t] = foo_argmax
166 |             delta[i, t] = foo_max
167 | 
168 |     states = np.zeros(T, dtype=np.int8)
169 |     states[T-1] = np.argmax(delta[:, T-1])
170 | 
171 |     # backtracking
172 |     for t in range(T-1, 0, -1):
173 |         states[t-1] = psi[states[t], t]
174 |     return states
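argmax() is a plain Viterbi recursion over N=2 hidden states. As a toy check (hypothetical numbers, assuming the definitions above): with zero transition weights and an emission weight of 1.0 for state 1 on a single feature, the decoder should switch to state 1 exactly where the feature becomes large.

    F = 1
    sol = np.zeros(2*2 + 2*F)
    sol[2*2 + 1*F + 0] = 1.0                 # emission weight: state 1, feature 0
    x = np.array([[-1., -1., 2., 2.]])       # one feature over T=4 time steps
    print(argmax(sol, x))                    # expected: [0 0 1 1]
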
175 | 
176 | 
177 | @njit
178 | def get_joint_feature_map(X, y):
179 |     N = 2
180 |     T = y.size
181 |     F = X.shape[0]
182 |     jfm = np.zeros(N*N + N*F)
183 |     # transition part: index i*N+j counts transitions i -> j (same layout as A[i, j] = sol[i*N+j] in argmax)
184 |     for t in range(T-1):
185 |         for i in range(N):
186 |             for j in range(N):
187 |                 if y[t] == i and y[t+1] == j:
188 |                     jfm[i*N+j] += 1
189 |     # emission parts
190 |     for t in range(T):
191 |         for f in range(F):
192 |             jfm[y[t]*F + f + N*N] += X[f, t]
193 |     return jfm
194 | 
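The layout of the joint feature map explains the magic number 2*2 + F*2 used in preprocess_training_data() and preprocess_test_data(): N*N transition counts followed by N*F summed emissions, i.e. 4 + 2F entries for N=2 states. A small sketch (hypothetical toy inputs):

    y_toy = np.array([0, 0, 1, 1], dtype='i')     # states over T=4 steps
    x_toy = np.random.randn(3, 4)                 # F=3 features x T=4 steps
    jfm = get_joint_feature_map(x_toy, y_toy)
    print(jfm.shape)                              # (10,) = N*N + N*F = 4 + 6
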
195 | 
196 | def plot_results(res_filename):
197 |     data, states, y = generate_data(1000, cluster=3, outlier_frac=0.05, dims=3, plot=False)
198 | 
199 |     foo = np.load(res_filename)
200 |     maris = foo['maris']
201 |     saris = foo['saris']
202 |     mloss = foo['mloss']
203 |     sloss = foo['sloss']
204 |     nus = foo['nus']
205 |     reps = foo['reps']
206 | 
207 |     res = np.zeros((len(nus), 4))
208 |     res_stds = np.zeros((len(nus), 4))
209 | 
210 |     # svdd
211 |     res[:, 0] = mloss[:, 0]
212 |     res_stds[:, 0] = sloss[:, 0]
213 |     # csvdd
214 |     res[:, 1] = mloss[:, 1]
215 |     res_stds[:, 1] = sloss[:, 1]
216 | 
217 |     # kmeans
218 |     res[0, 2] = maris[0, 1]
219 |     res_stds[0, 2] = saris[0, 1]
220 |     # csvdd
221 |     res[:, 3] = maris[:, 1]
222 |     res_stds[:, 3] = saris[:, 1]
223 | 
224 |     print('==========================================')
225 |     for i in range(len(nus)):
226 |         line = '{0:1.2f}\\%'.format(nus[i])
227 |         for j in range(4):
228 |             line += ' & {0:1.2f}/{1:1.2f}'.format(res[i, j], res_stds[i, j])
229 |         line += ' \\\\'
230 |         print(line)
231 |     print('==========================================')
232 | 
233 | 
234 | def evaluate(res_filename, nus, ks, outlier_frac, reps, num_train, num_test):
235 |     train = np.array(range(num_train), dtype='i')
236 |     test = np.array(range(num_train, num_train + num_test), dtype='i')
237 | 
238 |     aris = np.zeros((reps, len(nus), len(ks)))
239 |     loss = np.zeros((reps, len(nus), len(ks)))
240 |     for n in range(reps):
241 |         # generate new sequence data
242 |         X, S, y = generate_data(num_train + num_test, cluster=3, outlier_frac=outlier_frac, dims=3, plot=False)
243 |         inds = np.random.permutation(range(num_test + num_train))
244 |         data = preprocess_training_data(X, S, inds[:num_train])
245 |         data = data[:, inds]
246 |         y = y[inds]
247 |         print(data)
248 |         print(y)
249 |         for k in range(len(ks)):
250 |             # fix the initialization for all methods
251 |             membership = np.random.randint(0, ks[k], y.size)
252 |             for i in range(len(nus)):
253 |                 svdds = list()
254 |                 for l in range(ks[k]):
255 |                     svdds.append(SvddPrimalSGD(nus[i]))
256 |                 svdd = ClusterSvdd(svdds)
257 |                 svdd.fit(data[:, train], init_membership=membership[train])
258 | 
259 |                 stime = time.time()
260 |                 pred_phis, true_states, pred_states = preprocess_test_data(svdd, X, S, inds[num_train:])
261 |                 _, classes = svdd.predict(pred_phis)
262 |                 print('---------------- TIME')
263 |                 print(time.time()-stime)
264 |                 print('----------------')
265 | 
266 |                 # evaluate clustering abilities
267 |                 ninds = np.where(y[test] >= 0)[0]
268 |                 aris[n, i, k] = metrics.cluster.adjusted_rand_score(y[test[ninds]], classes[ninds])
269 |                 # evaluate structured prediction accuracy
270 |                 loss[n, i, k] = hamming_loss(true_states, pred_states)
271 |                 print(loss[n, i, k])
272 | 
273 |     maris = np.mean(aris, axis=0)
274 |     saris = np.std(aris, axis=0)
275 |     print('ARI')
276 |     print(np.mean(aris, axis=0))
277 |     print(np.std(aris, axis=0))
278 | 
279 |     mloss = np.mean(loss, axis=0)
280 |     sloss = np.std(loss, axis=0)
281 |     print('Normalized Hamming Distance')
282 |     print(np.mean(loss, axis=0))
283 |     print(np.std(loss, axis=0))
284 | 
285 |     np.savez(res_filename, maris=maris, saris=saris, mloss=mloss, sloss=sloss,
286 |              outlier_frac=outlier_frac, ntrain=num_train, ntest=num_test, reps=reps, nus=nus)
287 | 
288 | 
289 | if __name__ == '__main__':
290 |     nus = [1.0, 0.9, 0.5, 0.1, 0.01]
291 |     ks = [1, 3]
292 | 
293 |     outlier_frac = 0.05  # fraction of anomalous sequences in the generated data
294 |     reps = 10  # number of repetitions for performance measures
295 |     num_train = 2000
296 |     num_test = 500
297 | 
298 |     do_plot = True
299 |     do_evaluation = True
300 | 
301 |     res_filename = 'res_struct_{0}_{1}_{2}.npz'.format(reps, len(ks), len(nus))
302 | 
303 |     if do_evaluation:
304 |         evaluate(res_filename, nus, ks, outlier_frac, reps, num_train, num_test)
305 |     if do_plot:
306 |         # data, states, y = generate_data(num_train + num_test, outlier_frac=outlier_frac, dims=2, plot=True)
307 |         plot_results(res_filename)
308 | 
309 |     print('DONE :)')
310 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | try:
2 |     from setuptools import setup
3 | except ImportError:
4 |     from distutils.core import setup
5 | 
6 | config = {
7 |     'description': 'ClusterSVDD: latent variable support vector data description',
8 |     'url': 'https://github.com/nicococo/ClusterSVDD',
9 |     'author': 'Nico Goernitz',
10 |     'author_email': 'nico.goernitz@tu-berlin.de',
11 |     'version': '0.1',
12 |     'install_requires': ['numba', 'cvxopt', 'scikit-learn', 'numpy', 'scipy'],
13 |     'packages': ['ClusterSVDD'],
14 |     'package_dir': {'ClusterSVDD': 'ClusterSVDD'},  # key must match the package name above
15 |     #'package_data': {'clusterSVDD': ['*.txt']},
16 |     #'scripts': ['bin/ClusterSVDD.sh'],
17 |     'name': 'ClusterSVDD',
18 |     'classifiers': ['Intended Audience :: Science/Research',
19 |                     'Programming Language :: Python',
20 |                     'Topic :: Scientific/Engineering',
21 |                     'Operating System :: POSIX',
22 |                     'Operating System :: Unix',
23 |                     'Operating System :: MacOS',
24 |                     'Programming Language :: Python :: 2',
25 |                     'Programming Language :: Python :: 2.7']
26 | }
27 | 
28 | setup(**config)
--------------------------------------------------------------------------------
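The .npz files written by the evaluate() functions in the scripts above can be reloaded for custom post-processing; a sketch, assuming test_struct.py has been run with its default settings (reps=10, ks=[1, 3], five nu values, which yield the filename below):

    import numpy as np

    foo = np.load('res_struct_10_2_5.npz')   # filename pattern from test_struct.py's __main__
    print(sorted(foo.keys()))                # maris, mloss, nus, reps, saris, sloss, ...
    print(foo['maris'].shape)                # (len(nus), len(ks)) mean ARIs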