├── ClusterSVDD
│   ├── __init__.py
│   ├── cluster_svdd.py
│   ├── kernel.py
│   ├── mlp.py
│   ├── svdd_dual_qp.py
│   └── svdd_primal_sgd.py
├── LICENSE
├── README.md
├── __init__.py
├── scripts
│   ├── __init__.py
│   ├── test_ad_svdd.py
│   ├── test_anom.py
│   ├── test_clustersvdd.py
│   ├── test_exm.py
│   ├── test_impl.py
│   ├── test_real.py
│   ├── test_robust.py
│   └── test_struct.py
└── setup.py

--------------------------------------------------------------------------------
/ClusterSVDD/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/ClusterSvdd/2f61c187a3197c807b239202b72d9c84cb46400c/ClusterSVDD/__init__.py

--------------------------------------------------------------------------------
/ClusterSVDD/cluster_svdd.py:
--------------------------------------------------------------------------------
__author__ = 'nicococo'
import numpy as np


class ClusterSvdd:
    """ Implementation of the cluster support vector data description (ClusterSVDD).
        Author: Nico Goernitz, TU Berlin, 2015
    """
    PRECISION = 1e-4  # This parameter can be important as it affects the threshold,
                      # support vectors and speed!
    clusters = 0      # (scalar) number of clusters
    svdds = None      # (list) list of dual qp svdds
    nu = -1.0         # (scalar) 0 < nu <= 1.0

    def __init__(self, svdds, nu=-1.0):
        self.clusters = len(svdds)
        self.svdds = svdds
        self.nu = nu
        self.use_local_fraction = nu <= 0.
        print('Creating new ClusterSVDD with {0} clusters.'.format(self.clusters))

    def fit(self, X, min_chg=0.0, max_iter=40, max_svdd_iter=2000, init_membership=None):
        """
        :param X: Data matrix is assumed to be feats x samples.
        :param min_chg: Minimum fraction of changed assignments per iteration before stopping.
        :param max_iter: Maximum number of iterations before stopping.
        :param max_svdd_iter: Maximum number of iterations for the nested SVDDs.
        :param init_membership: Integer array with cluster affiliation per
            sample (used for initialization).
        :return: (Integer array) Cluster affiliations for all samples.
        """
        (dims, samples) = X.shape

        # init majorization step
        cinds_old = np.zeros(samples)
        cinds = np.random.randint(0, self.clusters, samples)
        if init_membership is not None:
            print('Using init cluster membership.')
            cinds = init_membership

        # init maximization step
        for c in range(self.clusters):
            inds = np.where(cinds == c)[0]
            self.svdds[c].fit(X[:, inds])

        iter_cnt = 0
        scores = np.zeros((self.clusters, samples))
        while np.sum(np.abs(cinds_old - cinds)) / float(samples) > min_chg and iter_cnt < max_iter:
            print('Iter={0}'.format(iter_cnt))
            # 1. majorization step: assign each sample to its best-fitting cluster
            for c in range(self.clusters):
                scores[c, :] = self.svdds[c].predict(X)
            cinds_old = cinds
            cinds = np.argmin(scores, axis=0)
            # 2. maximization step: re-fit each SVDD on its assigned samples
            for c in range(self.clusters):
                inds = np.where(cinds == c)[0]
                if inds.size > 0:
                    # perc = 2.0*float(inds.size)/float(samples)
                    # self.svdds[c].nu = perc * self.nu
                    self.svdds[c].fit(X[:, inds], max_iter=max_svdd_iter)
            iter_cnt += 1
        print('ClusterSVDD training finished after {0} iterations.'.format(iter_cnt))
        return cinds

    def predict(self, Y):
        """
        :param Y: Data matrix (feats x samples).
        :return: Tuple of (minimum SVDD scores, cluster affiliations) for all samples.
        """
        scores = np.zeros((self.clusters, Y.shape[1]))
        for c in range(self.clusters):
            scores[c, :] = self.svdds[c].predict(Y)
        cinds = np.argmin(scores, axis=0)
        return np.min(scores, axis=0), cinds
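The alternating scheme in fit() above (assign every sample to its best-fitting SVDD, then re-fit each SVDD on its members) needs very little driver code. A minimal usage sketch, not part of the original repository; the data and parameter choices are illustrative:

import numpy as np
from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.cluster_svdd import ClusterSvdd

X = np.random.randn(2, 500)                     # feats x samples, as fit() expects
csvdd = ClusterSvdd([SvddPrimalSGD(0.1) for _ in range(2)])
membership = csvdd.fit(X)                       # cluster index per training sample
scores, cinds = csvdd.predict(X)                # scores > 0 flag anomalies, cinds are clusters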
--------------------------------------------------------------------------------
/ClusterSVDD/kernel.py:
--------------------------------------------------------------------------------
import numpy as np


class Kernel:

    def __init__(self):
        pass

    @staticmethod
    def get_kernel(X, Y, type='linear', param=1.0):
        """Calculates a kernel given the data X and Y (dims x exms)"""
        (Xdims, Xn) = X.shape
        (Ydims, Yn) = Y.shape

        kernel = 1.0
        if type == 'linear':
            print('Calculating linear kernel with size {0}x{1}.'.format(Xn, Yn))
            kernel = X.T.dot(Y)

        if type == 'rbf':
            print('Calculating Gaussian kernel with size {0}x{1} and sigma2={2}.'.format(Xn, Yn, param))
            Dx = (np.ones((Yn, 1)) * np.diag(X.T.dot(X)).reshape(1, Xn)).T
            Dy = (np.ones((Xn, 1)) * np.diag(Y.T.dot(Y)).reshape(1, Yn))
            kernel = Dx - 2.*np.array(X.T.dot(Y)) + Dy  # squared euclidean distances
            kernel = np.exp(-kernel / param)
            print(kernel.shape)

        return kernel

    @staticmethod
    def get_diag_kernel(X, type='linear', param=1.0):
        """Calculates the kernel diagonal given the data X (dims x exms)"""
        (Xdims, Xn) = X.shape

        kernel = 1.0
        if type == 'linear':
            print('Calculating diagonal of a linear kernel with size {0}x{1}.'.format(Xn, Xn))
            kernel = np.sum(X*X, axis=0)

        if type == 'rbf':
            print('Gaussian kernel diagonal is always exp(0)=1.')
            kernel = np.ones(Xn, dtype='d')
        return kernel

    @staticmethod
    def center_kernel(K):
        print('IMPLEMENT ME')
        return K

    @staticmethod
    def normalize_kernel(K):
        print('IMPLEMENT ME')
        return K
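The 'rbf' branch above computes exp(-||x - y||^2 / param), i.e. gamma = 1/param in the common parameterization. A small sanity-check sketch against scikit-learn (an assumption for this snippet only; the repository itself does not depend on it):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from ClusterSVDD.kernel import Kernel

X = np.random.randn(3, 10)                      # dims x exms, as Kernel expects
param = 0.5
K = Kernel.get_kernel(X, X, type='rbf', param=param)
assert np.allclose(K, rbf_kernel(X.T, X.T, gamma=1./param))  # sklearn wants exms x dims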
--------------------------------------------------------------------------------
/ClusterSVDD/mlp.py:
--------------------------------------------------------------------------------
import numpy
import numpy as np

from sklearn.svm import SVR

#--------------------------------------------------------------------
# Hyperparameters
#--------------------------------------------------------------------
lr = 0.001  # learning rate


#--------------------------------------------------------------------
# Multilayer network
#--------------------------------------------------------------------
class Sequential:

    def __init__(self, modules): self.modules = modules

    def forward(self, X):
        for m in self.modules: X = m.forward(X)
        return X

    def backward(self, DY):
        for m in self.modules[::-1]: DY = m.backward(DY)
        return DY

    def update(self):
        for m in self.modules: m.update()

#--------------------------------------------------------------------
# Linear layer
#--------------------------------------------------------------------
class Linear:

    def __init__(self, m, n, last=False):
        self.m = m
        self.n = n

        self.W = numpy.random.uniform(-1/self.m**.5, 1/self.m**.5, [m, n]).astype('float32')
        self.B = numpy.zeros([n]).astype('float32')
        if last: self.W *= 0

    def forward(self, X):
        self.X = X
        return numpy.dot(X, self.W)+self.B

    def backward(self, DY):
        DX = numpy.dot(DY, self.W.T)

        # parameter gradients, rescaled by powers of the layer size
        # (heuristic scaling to keep update magnitudes comparable across layers)
        self.DW = (numpy.dot(self.X.T, DY)) / self.m**.5
        self.DB = (DY.sum(axis=0)) / self.m**.25

        return DX*(self.m**.5/self.n**.5)

    def update(self):
        self.W -= lr*self.DW
        self.B -= lr*self.DB

#--------------------------------------------------------------------
# Hyperbolic tangent layer
#--------------------------------------------------------------------
class Tanh:
    def __init__(self): pass
    def forward(self, X): self.Y = numpy.tanh(X); return self.Y
    def backward(self, DY): return DY*(1-self.Y**2)

    def update(self): pass


#====================================================================
# Test
#====================================================================

# Prepare data
nbsamples = 200
nbinputdims = 100
nboutputdims = 1

# Random regression task
X = numpy.random.normal(0, 1, [nbsamples, nbinputdims])
T = numpy.random.normal(0, 1, [nbsamples])

# Initialize network
nn = Sequential([
    Linear(nbinputdims, 200),
    Tanh(),
    Linear(200, 20),
    Tanh(),
    Linear(20, nboutputdims)
])

# SVR baseline on the same task
clf = SVR(C=1000.0, epsilon=0.0002)
clf.fit(X, T)
ypred = clf.predict(X)
print(((ypred-T)**2).sum())

T = T[:, np.newaxis]

# Training
for t in range(1000):

    Y = nn.forward(X)
    nn.backward(Y-T)
    nn.update()

    if t % 100 == 0: print(t, ((Y-T)**2).sum())
--------------------------------------------------------------------------------
/ClusterSVDD/svdd_dual_qp.py:
--------------------------------------------------------------------------------
from cvxopt import matrix, spmatrix, sparse
from cvxopt.solvers import qp
import numpy as np

from ClusterSVDD.kernel import Kernel


class SvddDualQP:
    """ Dual QP implementation of the support vector data description (SVDD).
        Author: Nico Goernitz, TU Berlin, 2015
    """

    PRECISION = 1e-6  # important: affects the threshold, support vectors and speed!

    kernel = None   # (string) name of the kernel to use
    kparam = None   # (-) kernel parameter
    samples = -1    # (scalar) amount of training data in X

    nu = 0.95       # (scalar) the regularization constant > 0

    X = None        # (matrix) training data
    alphas = None   # (vector) dual solution vector
    svs = None      # (vector) support vector indices
    radius2 = 0.0   # (scalar) the optimized threshold (rho)
    cTc = None      # (scalar) alpha^T * K * alpha

    pobj = 0.0      # (scalar) primal objective value after training

    def __init__(self, kernel, kparam, nu):
        self.kernel = kernel
        self.kparam = kparam
        self.nu = nu
        print('Creating new dual QP SVDD ({0}) with nu={1}.'.format(kernel, nu))

    def fit(self, X, max_iter=-1):
        """
        :param X: Data matrix is assumed to be feats x samples.
        :param max_iter: *ignored*, just for compatibility.
        :return: Alphas and threshold of the fitted dual SVDD.
        """
        self.X = X.copy()
        dims, self.samples = X.shape

        if self.samples < 1:
            print('Invalid training data.')
            return -1

        # number of training examples
        N = self.samples
        C = 1. / float(self.samples*self.nu)

        kernel = Kernel.get_kernel(X, X, self.kernel, self.kparam)
        norms = np.diag(kernel).copy()

        if self.nu >= 1.0:
            print("Center-of-mass solution.")
            self.alphas = np.ones(self.samples) / float(self.samples)
            self.radius2 = 0.0
            self.svs = np.array(range(self.samples), dtype='i')
            self.pobj = 0.0  # TODO: calculate real primal objective
            self.cTc = self.alphas[self.svs].T.dot(kernel[self.svs, :][:, self.svs].dot(self.alphas[self.svs]))
            return self.alphas, self.radius2

        # generate a kernel matrix
        P = 2.0*matrix(kernel)

        # this is the diagonal of the kernel matrix
        q = -matrix(norms)

        # sum_i alpha_i = A alpha = b = 1.0
        A = matrix(1.0, (1, N))
        b = matrix(1.0, (1, 1))

        # 0 <= alpha_i <= h = C
        G1 = spmatrix(1.0, range(N), range(N))
        G = sparse([G1, -G1])
        h1 = matrix(C, (N, 1))
        h2 = matrix(0.0, (N, 1))
        h = matrix([h1, h2])

        sol = qp(P, q, G, h, A, b)

        # store solution
        self.alphas = np.array(sol['x'], dtype=np.float64)
        self.pobj = -sol['primal objective']

        # find support vectors
        self.svs = np.where(self.alphas > self.PRECISION)[0]
        self.cTc = self.alphas.T.dot(kernel.dot(self.alphas))

        # the smallest score among the support vectors defines the threshold
        self.radius2 = 0.
        thres = self.predict(X[:, self.svs])
        self.radius2 = np.min(thres)
        return self.alphas, thres

    def get_radius(self):
        return self.radius2

    def get_alphas(self):
        return self.alphas

    def get_support_inds(self):
        return self.svs

    def get_support(self):
        return self.alphas[self.svs]

    def predict(self, Y):
        # build the test kernel against the support vectors only
        kernel = Kernel.get_kernel(Y, self.X[:, self.svs], self.kernel, self.kparam)
        # for svdd we additionally need the data norms k(y, y)
        norms = Kernel.get_diag_kernel(Y, self.kernel)
        res = self.cTc - 2.*kernel.dot(self.get_support()).T + norms
        return res.reshape(Y.shape[1]) - self.radius2
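For reference, the program handed to cvxopt's qp above (which solves min_x 1/2 x^T P x + q^T x subject to Gx <= h, Ax = b) is the standard SVDD dual with kernel matrix K and box constant C = 1/(n*nu):

\[
\min_{\alpha}\;\; \alpha^\top K \alpha \;-\; \sum_{i=1}^{n} \alpha_i K_{ii}
\qquad \text{s.t.} \quad \sum_{i=1}^{n} \alpha_i = 1,\;\; 0 \le \alpha_i \le \frac{1}{n\nu},
\]

so P = 2K and q = -diag(K). With center c = sum_i alpha_i phi(x_i), the score in predict() is

\[
\lVert \phi(y) - c\rVert^2 - R^2 \;=\; c^\top c \;-\; 2\sum_i \alpha_i\, k(x_i, y) \;+\; k(y, y) \;-\; R^2,
\]

which is exactly cTc - 2*kernel.dot(alphas) + norms - radius2 in the code.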
--------------------------------------------------------------------------------
/ClusterSVDD/svdd_primal_sgd.py:
--------------------------------------------------------------------------------
__author__ = 'nicococo'
import numpy as np

from numba import njit


class SvddPrimalSGD(object):
    """ Primal subgradient descent solver for the support vector data description (SVDD).
        Author: Nico Goernitz, TU Berlin, 2015
    """
    PRECISION = 10**-3  # important: affects the threshold, support vectors and speed!
    nu = 0.95      # (scalar) the regularization constant > 0
    c = None       # (vector) center of the hypersphere
    radius2 = 0.0  # (scalar) the optimized threshold (rho)
    pobj = 0.0     # (scalar) primal objective after training

    def __init__(self, nu):
        self.nu = nu
        print('Creating new primal SVDD with nu={0}.'.format(nu))

    def fit(self, X, max_iter=20000, prec=1e-6, rate=0.01):
        if X.shape[1] < 1:
            print('Invalid training data.')
            return -1, -1
        self.c, self.radius2, self.pobj, n_iter = fit_extern(X, self.nu, max_iter, prec, rate)
        print('Iter={2}: obj={0} T={1}'.format(self.pobj, self.radius2, n_iter+1))
        return self.c, self.radius2

    def get_radius(self):
        return self.radius2

    def predict(self, X):
        # X : (dims x samples)
        dist = self.c.T.dot(self.c) - 2.*self.c.T.dot(X) + np.sum(X*X, axis=0)
        return dist - self.radius2


@njit
def fit_extern(X, nu, max_iter, prec, rate):
    """ Subgradient descent solver for the primal SVDD.
        Optimized for 'numba'.
    """
    (dims, samples) = X.shape

    # number of training examples
    reg = 1./(np.float64(samples)*nu)

    # center of mass and squared norms of all samples
    c = np.zeros(dims, dtype=np.float64)
    sum_XX = np.zeros(samples)
    for s in range(samples):
        foo = 0.0
        for d in range(dims):
            foo += X[d, s]*X[d, s]
            c[d] += X[d, s] / np.float64(samples)
        sum_XX[s] = foo

    dot_2cX = np.zeros(samples, dtype=np.float64)
    for s in range(samples):
        dot_2cX[s] = 2.0 * np.sum(c*X[:, s])
    dist = np.sum(c*c) - dot_2cX + sum_XX

    T = 0.4 * np.max(dist) * (1.0-nu)  # starting heuristic T
    # if nu exceeds 1.0, then T^* is always 0 and c can
    # be computed analytically (as center-of-mass, mean)
    if nu >= 1.0:
        return c, 0.0, 0.0, 0

    is_converged = False
    best_c = c.copy()  # copy: c is updated in place below
    best_radius2 = T
    obj_best = np.float64(1e20)

    obj_bak = -100.
    n_iter = 0

    # subgradient buffer for the center and active-set index buffer
    dc = np.zeros(dims, dtype=np.float64)
    inds = np.zeros(samples, dtype=np.int64)
    while not is_converged and n_iter < max_iter:
        for s in range(samples):
            dot_2cX[s] = 2.0 * np.sum(c*X[:, s])

        # calculate the distances of the center to each datapoint
        dist = np.sum(c*c) - dot_2cX + sum_XX
        inds_size = 0
        for s in range(samples):
            if dist[s]-T >= 1e-12:
                inds[inds_size] = s
                inds_size += 1
        # we need at least 1 entry, hence lower T to the maximum entry
        if inds_size == 0:
            inds_size = 1
            inds[0] = np.argmax(dist)
            T = dist[inds[0]]

        # real objective value given the current center c and threshold T
        ds = 0.0
        for s in range(inds_size):
            ds += dist[inds[s]] - T
        obj = T + reg*ds

        # this is a subgradient method, hence store the best solution so far
        if obj_best >= obj:
            best_c = c.copy()
            best_radius2 = T
            obj_best = obj

        # stop, if progress is too slow
        if obj > 0.:
            if np.abs((obj-obj_bak)/obj) < prec:
                is_converged = True
                continue
        obj_bak = obj

        # decaying stepsize: a fraction (rate) of the maximum value in dist
        max_change = rate * np.max(dist) / np.float64(n_iter+1)*10.

        # subgradient step for the threshold
        dT = 1.0 - reg*np.float64(inds_size)
        T -= np.sign(dT) * max_change

        # normalized subgradient step for the center
        norm_dc = 0.0
        for d in range(dims):
            dc[d] = 0.0
            for s in range(inds_size):
                dc[d] += 2.*reg*(c[d] - X[d, inds[s]])
            norm_dc += dc[d]*dc[d]
        norm_dc = np.sqrt(norm_dc)

        if np.abs(norm_dc) < 1e-12:
            norm_dc = 1.0

        for d in range(dims):
            c[d] -= dc[d]/norm_dc * max_change
        n_iter += 1

    return best_c, best_radius2, obj_best, n_iter
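fit_extern above performs subgradient descent directly on the primal SVDD objective in the center c and the threshold T = R^2,

\[
\min_{c,\,T}\; T + \frac{1}{n\nu} \sum_{i=1}^{n} \max\bigl(0,\; \lVert x_i - c\rVert^2 - T\bigr),
\]

using the subgradients dT = 1 - |A|/(n*nu) and d_c = (2/(n*nu)) * sum_{i in A} (c - x_i) over the active set A = {i : ||x_i - c||^2 > T}. Since subgradient steps are not monotone, the best (c, T) seen so far is tracked and returned.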
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 nico

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ClusterSvdd
Cluster Support Vector Data Description

--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/ClusterSvdd/2f61c187a3197c807b239202b72d9c84cb46400c/__init__.py

--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicococo/ClusterSvdd/2f61c187a3197c807b239202b72d9c84cb46400c/scripts/__init__.py

--------------------------------------------------------------------------------
/scripts/test_ad_svdd.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import sklearn.datasets as datasets
import numpy as np

from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.svdd_dual_qp import SvddDualQP
from ClusterSVDD.cluster_svdd import ClusterSvdd


def generate_data_uniform(datapoints, cluster_dir_alphas=(10, 10, 10), outlier_frac=0.1, feats=2, noise_feats=0):
    cluster = len(cluster_dir_alphas)
    X = np.zeros((feats, datapoints))
    y = np.zeros(datapoints)

    num_noise = int(np.floor(datapoints*outlier_frac))

    samples = np.random.dirichlet(cluster_dir_alphas, 1)[0]
    samples = np.array(samples*(datapoints-num_noise), dtype=int)
    print(samples, sum(samples))
    if np.sum(samples)+num_noise < datapoints:
        print('Add another sample..')
        num_noise += datapoints-(np.sum(samples)+num_noise)
    print(num_noise+np.sum(samples), datapoints)

    cnt = num_noise
    for i in range(cluster):
        m = np.random.randn(feats-noise_feats)*8.
        # cov = np.diag(np.random.rand(feats-noise_feats))
        cov = 2.*np.random.rand() * np.eye(feats-noise_feats)
        print(cov)
        X[:feats-noise_feats, cnt:cnt+samples[i]] = np.random.multivariate_normal(m, cov, samples[i]).T
        y[cnt:cnt+samples[i]] = i+1
        cnt += samples[i]

    mul = np.max(np.abs(X))*2.
    print(mul)
    X[:, :num_noise] = 2.*mul*(np.random.rand(feats, num_noise)-0.5)
    y[:num_noise] = -1

    X[feats-noise_feats:, :] = 2.*mul*np.random.randn(noise_feats, datapoints)

    # normalize each feature [-1,+1]
    X = X / np.repeat(np.max(np.abs(X), axis=1)[:, np.newaxis], datapoints, axis=1)

    return X, y


def generate_data_moons(datapoints, outlier_frac=0.1, noise_feats=0.05):
    X = np.zeros((datapoints, 2))
    y = np.zeros(datapoints)

    num_noise = int(np.floor(datapoints*outlier_frac))

    X[num_noise:, :], y[num_noise:] = datasets.make_moons(n_samples=datapoints-num_noise, noise=noise_feats)
    X = X.T
    y[num_noise:] += 1

    mul = np.max(np.abs(X))*1.5
    print(mul)
    X[:, :num_noise] = 2.*mul*(np.random.rand(2, num_noise)-0.5)
    y[:num_noise] = -1

    # normalize each feature [-1,+1]
    X = X / np.repeat(np.max(np.abs(X), axis=1)[:, np.newaxis], datapoints, axis=1)

    return X, y


def generate_data(datapoints, norm_dir_alpha=10., anom_dir_alpha=4., anom_cluster=[0, 0, 0, 1, 1, 1], feats=2):
    cluster = len(anom_cluster)
    X = np.zeros((feats, datapoints))
    y = np.zeros(datapoints)

    cluster_dir_alphas = np.array(anom_cluster)*anom_dir_alpha + (1-np.array(anom_cluster))*norm_dir_alpha
    samples = np.random.dirichlet(cluster_dir_alphas, 1)[0]
    samples = np.array(samples*datapoints, dtype=int)
    if np.sum(samples) < datapoints:
        print('Add another sample..')
        samples[-1] += 1

    cnt = 0
    anom_lbl = -1
    norm_lbl = 1
    for i in range(cluster):
        sigma = 8.
        if anom_cluster[i] == 1:
            sigma = 1.
        m = np.random.randn(feats)*sigma
        cov = np.diag(np.random.rand(feats))
        print(cov)
        X[:, cnt:cnt+samples[i]] = np.random.multivariate_normal(m, cov, samples[i]).T
        label = norm_lbl
        if anom_cluster[i] == 1:
            label = anom_lbl
            anom_lbl -= 1
        else:
            label = norm_lbl
            norm_lbl += 1
        y[cnt:cnt+samples[i]] = label
        cnt += samples[i]

    # normalize each feature [-1,+1]
    X = X / np.repeat(np.max(np.abs(X), axis=1)[:, np.newaxis], datapoints, axis=1)

    return X, y


def evaluate(nu, k, data, y, train, test, use_kernel=False, kparam=0.1, plot=False):

    # fix the initialization for all methods
    membership = np.random.randint(0, k, y.size)
    svdds = list()
    for l in range(k):
        if use_kernel:
            svdds.append(SvddDualQP('rbf', kparam, nu))
        else:
            svdds.append(SvddPrimalSGD(nu))

    svdd = ClusterSvdd(svdds)
    svdd.fit(data[:, train].copy(), max_iter=60, init_membership=membership[train])
    scores, classes = svdd.predict(data[:, test].copy())

    # normal classes are positive (e.g. 1,2,3,..); the anomalous class is -1
    print(y[test])
    true_lbl = y[test]
    true_lbl[true_lbl < 0] = -1  # convert outliers to a single outlier class
    ari = metrics.cluster.adjusted_rand_score(true_lbl, classes)
    if nu < 1.0:
        classes[scores > 0.] = -1
        ari = metrics.cluster.adjusted_rand_score(true_lbl, classes)
    print('ARI=', ari)

    fpr, tpr, _ = metrics.roc_curve(y[test] < 0., scores, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    print('AUC=', auc)

    if plot:
        plt.figure(1)
        anom_inds = np.where(y == -1)[0]
        plt.plot(data[0, anom_inds], data[1, anom_inds], '.g', markersize=2)
        nom_inds = np.where(y != -1)[0]
        plt.plot(data[0, nom_inds], data[1, nom_inds], '.r', markersize=6)

        an = np.linspace(0, 2*np.pi, 100)
        for l in range(k):
            r = np.sqrt(svdd.svdds[l].radius2)
            if hasattr(svdd.svdds[l], 'c'):
                plt.plot(svdd.svdds[l].c[0], svdd.svdds[l].c[1],
                         'xb', markersize=6, linewidth=2, alpha=0.7)
                plt.plot(r*np.sin(an)+svdd.svdds[l].c[0], r*np.cos(an)+svdd.svdds[l].c[1],
                         '-b', linewidth=2, alpha=0.7)
        plt.show()
    return ari, auc


if __name__ == '__main__':
    num_train = 600
    num_test = 600

    train = np.array(range(num_train), dtype='i')
    test = np.array(range(num_train, num_train + num_test), dtype='i')

    reps = 1
    nus = [0.1, 0.5, 0.8, 1.0]
    ks = [3]
    aris = np.zeros((reps, len(nus), len(ks)))
    aucs = np.zeros((reps, len(nus), len(ks)))

    data, y = generate_data_uniform(num_train + num_test, cluster_dir_alphas=(10, 10, 10), outlier_frac=0.5, feats=2, noise_feats=0)
    # data, y = generate_data(num_train + num_test, norm_dir_alpha=10., anom_dir_alpha=2., anom_cluster=[0, 0, 0, 1, 1, 1, 1, 1, 1], feats=2)
    # data, y = generate_data_moons(num_train + num_test, outlier_frac=0.3, noise_feats=0.05)

    for r in range(reps):
        inds = np.random.permutation((num_test + num_train))
        data = data[:, inds]
        y = y[inds]

        ssseeed = np.random.randint(low=0, high=1101010)
        for nu in range(len(nus)):
            for k in range(len(ks)):
                np.random.seed(ssseeed)
                aris[r, nu, k], aucs[r, nu, k] = evaluate(nus[nu], ks[k], data, y, train, test,
                                                          use_kernel=False, kparam=1., plot=False)

    print('\n')
    for nu in range(len(nus)):
        print('')
        for k in range(len(ks)):
            print('k={0} nu={1}: ARI = {2:1.2f}+/-{4:1.2f} AUC = {3:1.2f}+/-{5:1.2f}'.format(
                ks[k], nus[nu], np.mean(aris[:, nu, k]), np.mean(aucs[:, nu, k]),
                np.std(aris[:, nu, k]), np.std(aucs[:, nu, k])))

    print('\nDONE :)')

--------------------------------------------------------------------------------
/scripts/test_anom.py:
--------------------------------------------------------------------------------
__author__ = 'Nico Goernitz'


import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np

from ClusterSVDD.svdd_dual_qp import SvddDualQP
from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.cluster_svdd import ClusterSvdd


def generate_data(datapoints, outlier_frac=0.1, dims=2):
    X = np.zeros((dims, datapoints))
    y = np.zeros(datapoints)
    num_noise = int(np.floor(datapoints*outlier_frac))
    num_dpc = int(np.floor(float(datapoints-num_noise)/2.0))
    X[:, :num_noise] = 0.15*np.random.randn(dims, num_noise) \
        + np.array([0.7, -0.]).reshape((2, 1)).dot(np.ones((1, num_noise)))
    y[:num_noise] = -1

    cnt = num_noise
    X[:, cnt:cnt+num_dpc] = 0.5*np.random.randn(dims, num_dpc) \
        + np.array([-1.5, -2.]).reshape((2, 1)).dot(np.ones((1, num_dpc)))
    y[cnt:cnt+num_dpc] = 1
    cnt += num_dpc

    num_dpc = datapoints-cnt
    X[:, cnt:] = 0.6*np.random.randn(dims, num_dpc) \
        + np.array([-1.5, +1.]).reshape((2, 1)).dot(np.ones((1, num_dpc)))
    y[cnt:] = 1
    return X, y


def plot_results(fname):
    foo = np.load(fname)
    maucs = foo['maucs']
    saucs = foo['saucs']
    nus = foo['nus']
    ks = foo['ks']
    reps = foo['reps']

    plt.figure(1)
    np.random.seed(10)
    cols = np.random.rand(maucs.shape[1], 3)
    fmts = ['-x', '--o', '--D', '--s', '--H']
    for i in range(maucs.shape[1]):
        plt.errorbar(nus, maucs[:, i], saucs[:, i]/np.sqrt(reps), fmt=fmts[i], color=cols[i, :],
                     ecolor=cols[i, :], linewidth=2.0, elinewidth=1.0, alpha=0.8)
    plt.xlim((-0.0, 0.21))
    plt.ylim((0.35, 1.0))
    plt.xticks(nus, ['1', '2.5', '5', '7.5', '10', '15', '20'])
    plt.grid()
    plt.xlabel(r'regularization parameter $\nu$', fontsize=14)
    plt.ylabel(r'Anomaly Detection Accuracy (in AUROC)', fontsize=14)
    names = ['SVDD']
    for i in range(1, maucs.shape[1]):
        names.append('ClusterSVDD (k={0})'.format(ks[i]))
    plt.legend(names, loc=4, fontsize=14)
    plt.show()


def evaluate(res_filename, nus, sigmas, ks, reps, ntrain, ntest, nval, use_kernels, anom_frac):
    train = np.array(range(ntrain-nval), dtype='i')
    val = np.array(range(ntrain-nval, ntrain), dtype='i')
    test = np.array(range(ntrain, ntrain+ntest), dtype='i')
    aucs = np.zeros((reps, len(nus), len(ks)))
    for n in range(reps):
        # generate new gaussians
        data, y = generate_data(ntrain+ntest, outlier_frac=anom_frac)
        inds = np.random.permutation(range(ntest+ntrain))
        data = data[:, inds]
        y = y[inds]
        for i in range(len(nus)):
            for k in range(len(ks)):
                # fix the initialization for all methods
                membership = np.random.randint(0, ks[k], y.size)

                max_auc = -1.0
                max_val_auc = -1.0
                for sigma in sigmas:
                    # build cluster svdd
                    svdds = list()
                    for l in range(ks[k]):
                        if use_kernels:
                            svdds.append(SvddDualQP('rbf', sigma, nus[i]))
                        else:
                            svdds.append(SvddPrimalSGD(nus[i]))

                    svdd = ClusterSvdd(svdds)
                    svdd.fit(data[:, train], init_membership=membership[train])
                    scores_val, _ = svdd.predict(data[:, val])
                    # model selection on validation data
                    fpr, tpr, _ = metrics.roc_curve(np.array(y[val] < 0., dtype='i'), scores_val, pos_label=1)
                    curr_auc = metrics.auc(fpr, tpr)
                    if curr_auc >= max_val_auc:
                        # store test data accuracy
                        scores, _ = svdd.predict(data[:, test])
                        fpr, tpr, _ = metrics.roc_curve(np.array(y[test] < 0., dtype='i'), scores, pos_label=1)
                        max_auc = metrics.auc(fpr, tpr)
                        max_val_auc = curr_auc
                aucs[n, i, k] = max_auc
    # means and standard deviations
    maucs = np.mean(aucs, axis=0)
    saucs = np.std(aucs, axis=0)
    print('AUCs')
    print(np.mean(aucs, axis=0))
    print('Stds')
    print(np.std(aucs, axis=0))
    # save results
    np.savez(res_filename, maucs=maucs, saucs=saucs, outlier_frac=nus,
             ntrain=ntrain, ntest=ntest, reps=reps, nus=nus, ks=ks, sigmas=sigmas)


if __name__ == '__main__':
    nus = [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2]
    sigmas = [0.1, 0.25, 0.5, 1.0, 2.0]
    ks = [1, 2, 3, 4]

    reps = 50         # number of repetitions for performance measures
    num_train = 1000  # total number of data points is num_train+num_test
    num_test = 2000
    num_val = 400     # num_val is part of ntrain
    use_kernels = False

    anom_frac = 0.05  # fraction of anomalies in the generated dataset

    do_plot = True
    do_evaluation = False

    res_filename = 'res_anom_{0}_{1}_{2}_rbf.npz'.format(reps, len(ks), len(nus))
    if not use_kernels:
        sigmas = [1.0]
        res_filename = 'res_anom_{0}_{1}_{2}.npz'.format(reps, len(ks), len(nus))

    if do_evaluation:
        evaluate(res_filename, nus, sigmas, ks, reps, num_train, num_test, num_val, use_kernels, anom_frac)
    if do_plot:
        plot_results(res_filename)

    print('DONE :)')

--------------------------------------------------------------------------------
/scripts/test_clustersvdd.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import matplotlib.pylab as pl
import numpy as np

from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.cluster_svdd import ClusterSvdd


def generate_gaussians(datapoints, cluster, noise_frac=0.1, dims=2):
    mean_mul = 50.
    vars = [4.1, 4.1]

    num_noise = int(np.floor(datapoints*noise_frac))
    num_dpc = int(np.floor(float(datapoints-num_noise)/float(cluster)))

    X = np.zeros((dims, datapoints))
    X[:, :num_noise] = 100.*(2.*np.random.rand(dims, num_noise)-1.)

    y = np.zeros(datapoints)
    y[:num_noise] = -1
    cnt = num_noise

    for i in range(cluster):
        t = 4.
        v = np.diag((t*vars[0] + (1.-t)*vars[1]) * np.ones(dims))

        # draw the mean
        m = mean_mul * (4.*np.random.rand(dims, 1)-2.)
        if i == cluster-1:
            num_dpc = datapoints-cnt
        m = m.dot(np.ones((1, num_dpc)))
        # generate the cluster gaussian
        X[:, cnt:cnt+num_dpc] = v.dot(4.*np.random.randn(dims, num_dpc)) + m

        y[cnt:cnt+num_dpc] = i
        cnt += num_dpc

    # normalize each feature
    X = X / np.repeat(np.max(np.abs(X), axis=1)[:, np.newaxis]/2., datapoints, axis=1)
    return X, y


def train(cluster, data, nu, membership):
    svdds = []
    for c in range(cluster):
        svdds.append(SvddPrimalSGD(nu))
    svdd = ClusterSvdd(svdds, nu=nu)
    cinds = svdd.fit(data, init_membership=membership, max_svdd_iter=10000, max_iter=40)
    print(cinds)
    return svdd, cinds


if __name__ == '__main__':
    np.random.seed(1000)
    nu = 0.1       # CLUSTER - DUAL, PRIMAL
    n_cluster = 3  # 'k' number of clusters for the methods and data generation

    Dtrain, ytrain = generate_gaussians(1000, n_cluster, noise_frac=0.01)
    membership = np.random.randint(0, n_cluster, ytrain.size)

    # generate test data grid
    delta = 0.1
    x = np.arange(-2.0-delta, 2.0+delta, delta)
    y = np.arange(-2.0-delta, 2.0+delta, delta)
    (X, Y) = np.meshgrid(x, y)
    (sx, sy) = X.shape
    Xf = np.reshape(X, (1, sx*sy))
    Yf = np.reshape(Y, (1, sx*sy))
    Dtest = np.append(Xf, Yf, axis=0)

    # The code below is basically only for beautiful visualizations
    plt.figure(1)

    # train, predict and plot for the chosen nu
    svdd, cinds = train(n_cluster, Dtrain, nu, membership)
    scores, cres = svdd.predict(Dtrain)
    res, cres = svdd.predict(Dtest)

    Z = np.reshape(res, (sx, sy))
    cs = plt.contourf(X, Y, Z, cmap=plt.cm.bone, alpha=0.2)

    cols = np.random.rand(3, n_cluster+1)
    cols[:, 0] = np.array([0.95, 0.1, 0.1])
    cols[:, 1] = np.array([0.9, 0.3, 0.7])
    cols[:, 2] = np.array([0.4, 0.9, 0.3])
    cols[:, 3] = np.array([0.4, 0.4, 0.9])
    for c in range(n_cluster):
        inds = np.where(cinds == c)[0]
        plt.scatter(Dtrain[0, inds], Dtrain[1, inds], 30, alpha=0.7, c=cols[:, c])
        pl.gca().add_patch(pl.Circle((svdd.svdds[c].c[0], svdd.svdds[c].c[1]),
                                     np.sqrt(svdd.svdds[c].radius2), alpha=0.6,
                                     color=cols[:, c], fill=True))

    plt.xlim((-2., 2.))
    plt.ylim((-2., 2.))
    plt.yticks([], [])
    plt.xticks([], [])

    plt.show()
    pl.show()
    print('finished')

--------------------------------------------------------------------------------
/scripts/test_exm.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np

from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.svdd_dual_qp import SvddDualQP
from ClusterSVDD.cluster_svdd import ClusterSvdd


def generate_gaussians(datapoints, cluster, noise_frac=0.1, dims=2):
    mean_mul = 50.
    vars = [4.1, 4.1]

    num_noise = int(np.floor(datapoints*noise_frac))
    num_dpc = int(np.floor(float(datapoints-num_noise)/float(cluster)))

    X = np.zeros((dims, datapoints))
    X[:, :num_noise] = 100.*(2.*np.random.rand(dims, num_noise)-1.)

    y = np.zeros(datapoints)
    y[:num_noise] = -1
    cnt = num_noise

    for i in range(cluster):
        t = np.random.rand()
        v = np.diag((t*vars[0] + (1.-t)*vars[1]) * np.ones(dims))

        # draw the mean
        m = mean_mul * (2.*np.random.rand(dims, 1)-1.)
        if i == cluster-1:
            num_dpc = datapoints-cnt
        m = m.dot(np.ones((1, num_dpc)))
        # generate the cluster gaussian
        X[:, cnt:cnt+num_dpc] = v.dot(np.random.randn(dims, num_dpc)) + m

        y[cnt:cnt+num_dpc] = i
        cnt += num_dpc

    # normalize each feature
    X = X / np.repeat(np.max(np.abs(X), axis=1)[:, np.newaxis]/2., datapoints, axis=1)
    return X, y


def train(cluster, data, nu, membership, use_primal=True):
    svdds = []
    for c in range(cluster):
        if use_primal:
            svdds.append(SvddPrimalSGD(nu))
        else:
            svdds.append(SvddDualQP('rbf', 0.4, nu))
    svdd = ClusterSvdd(svdds, nu=nu)
    cinds = svdd.fit(data, init_membership=membership, max_svdd_iter=10000, max_iter=40)
    print(cinds)
    return svdd, cinds


if __name__ == '__main__':
    np.random.seed(10)
    nus = [0.14]  # ANOM - PRIMAL
    nus = [0.07]  # ANOM - DUAL
    nus = [0.8]   # CLUSTER - DUAL, PRIMAL
    n_cluster = 4       # 'k' number of clusters for the methods and data generation
    use_primal = True   # use primal sgd svdd or dual kernel qp
    ad_setting = True   # either ad or cluster setting

    Dtrain, ytrain = generate_gaussians(1000, n_cluster, noise_frac=0.01)
    membership = np.random.randint(0, n_cluster, ytrain.size)

    # generate test data grid
    delta = 0.1
    x = np.arange(-2.0-delta, 2.0+delta, delta)
    y = np.arange(-2.0-delta, 2.0+delta, delta)
    (X, Y) = np.meshgrid(x, y)
    (sx, sy) = X.shape
    Xf = np.reshape(X, (1, sx*sy))
    Yf = np.reshape(Y, (1, sx*sy))
    Dtest = np.append(Xf, Yf, axis=0)

    # For each \nu in the nus list, train, predict and plot the data
    for i in range(len(nus)+2):
        if 0 < i < len(nus)+1:
            (svdd, cinds) = train(n_cluster, Dtrain, nus[i-1], membership, use_primal=use_primal)
            (scores, cres) = svdd.predict(Dtrain)
            print('Fraction {0}-{1}'.format(nus[i-1], float(np.sum(scores >= 0.)) / float(scores.size)))
            (res, cres) = svdd.predict(Dtest)
        elif i == 0:
            if ad_setting:
                (svdd, cinds) = train(1, Dtrain, nus[i], membership, use_primal=use_primal)
            else:
                (svdd, cinds) = train(n_cluster, Dtrain, 1.0, membership, use_primal=use_primal)
            (scores, cres) = svdd.predict(Dtrain)
            print('Fraction {0}-{1}'.format(nus[i], float(np.sum(scores >= 0.)) / float(scores.size)))
            (res, cres) = svdd.predict(Dtest)
        else:
            scores = ytrain < 0
            cinds = ytrain

        # The code below is basically only for beautiful visualizations
        plt.figure(1)
        plt.subplot(1, len(nus)+2, (i+1) % (len(nus)+2)+1)
        if i < len(nus)+1:
            Z = np.reshape(res, (sx, sy))
            # cs = plt.contourf(X, Y, Z, alpha=0.5, cmap=plt.cm.bone)
            if ad_setting:
                cs2 = plt.contour(X, Y, Z, [0.0], linewidths=2.0, colors='w', alpha=0.8)

        if not ad_setting:
            cols = np.random.rand(3, n_cluster+1)
            cols[:, 0] = np.array([0.95, 0.1, 0.1])
            cols[:, 1] = np.array([0.9, 0.3, 0.7])
            cols[:, 2] = np.array([0.4, 0.9, 0.3])
            cols[:, 3] = np.array([0.4, 0.4, 0.9])
            cols[:, 4] = np.array([0.7, 0.8, 0.99])

            if i > len(nus):
                cols[1, :] = cols[1, np.array([1, 2, 3, 4, 0])]

            for c in range(n_cluster+1):
                inds = np.where(cinds == c-1)[0]
                plt.scatter(Dtrain[0, inds], Dtrain[1, inds], 30, c=cols[:, c])
        else:
            # anomaly detection setting
            inds = np.where(scores > 0.)[0]
            plt.scatter(Dtrain[0, inds], Dtrain[1, inds], 30, c='r')
            inds = np.where(scores <= 0.)[0]
            plt.scatter(Dtrain[0, inds], Dtrain[1, inds], 30, c='g')

        # title
        if i == 0:
            if use_primal:
                if ad_setting:
                    plt.title(r'SVDD', fontsize=16)
                else:
                    plt.title(r'K-Means', fontsize=16)
            else:
                if ad_setting:
                    plt.title(r'Kernel SVDD', fontsize=16)
                else:
                    plt.title(r'Kernel K-Means', fontsize=16)
        elif i < len(nus)+1:
            if use_primal:
                plt.title(r'ClusterSVDD', fontsize=16)
            else:
                plt.title(r'Kernel ClusterSVDD', fontsize=16)
        else:
            plt.title(r'Ground truth', fontsize=16)
        plt.xlim((-2., 2.))
        plt.ylim((-2., 2.))
        plt.yticks(range(-2, 2), [])
        plt.xticks(range(-2, 2), [])

    plt.show()
    print('finished')

--------------------------------------------------------------------------------
/scripts/test_impl.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np

from ClusterSVDD.svdd_dual_qp import SvddDualQP
from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD


if __name__ == '__main__':
    nu = 0.15  # outlier fraction

    # generate raw training data
    Dtrain = np.random.randn(2, 1000)
    Dtrain /= np.max(np.abs(Dtrain))

    # train dual svdd
    svdd = SvddDualQP('linear', 0.1, nu)
    svdd.fit(Dtrain)

    # train primal svdd
    psvdd = SvddPrimalSGD(nu)
    psvdd.fit(Dtrain, max_iter=1000, prec=1e-4)

    # print solutions
    print('\n dual-svdd: obj={0} T={1}.'.format(svdd.pobj, svdd.radius2))
    print('primal-svdd: obj={0} T={1}.\n'.format(psvdd.pobj, psvdd.radius2))

    # generate test data grid
    delta = 0.1
    x = np.arange(-2.0-delta, 2.0+delta, delta)
    y = np.arange(-2.0-delta, 2.0+delta, delta)
    X, Y = np.meshgrid(x, y)
    (sx, sy) = X.shape
    Xf = np.reshape(X, (1, sx*sy))
    Yf = np.reshape(Y, (1, sx*sy))
    Dtest = np.append(Xf, Yf, axis=0)
    if Dtrain.shape[0] > 2:
        Dtest = np.append(Dtest, np.random.randn(Dtrain.shape[0]-2, sx*sy), axis=0)
    print(Dtest.shape)

    res = svdd.predict(Dtest)
    pres = psvdd.predict(Dtest)

    # nice visualization
    plt.figure(1)
    plt.subplot(1, 2, 1)
    plt.title('Dual QP SVDD')
    Z = np.reshape(res, (sx, sy))
    plt.contourf(X, Y, Z)
    plt.contour(X, Y, Z, [0.0], linewidths=3.0, colors='k')
    plt.scatter(Dtrain[0, svdd.get_support_inds()], Dtrain[1, svdd.get_support_inds()], 40, c='k')
    plt.scatter(Dtrain[0, :], Dtrain[1, :], 10)
    plt.xlim((-2., 2.))
    plt.ylim((-2., 2.))
    plt.yticks(range(-2, 2), [])
    plt.xticks(range(-2, 2), [])

    plt.subplot(1, 2, 2)
    plt.title('Primal Subgradient SVDD')
    Z = np.reshape(pres, (sx, sy))
    plt.contourf(X, Y, Z)
    plt.contour(X, Y, Z, [0.0], linewidths=3.0, colors='k')
    plt.scatter(Dtrain[0, :], Dtrain[1, :], 10)
    plt.xlim((-2., 2.))
    plt.ylim((-2., 2.))
    plt.yticks(range(-2, 2), [])
    plt.xticks(range(-2, 2), [])

    plt.show()

    print('finished')
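For the linear kernel used in test_impl.py, the dual solution encodes the hypersphere center explicitly as c = sum_i alpha_i x_i, so the two solvers can be cross-checked directly. A sketch that could be appended to the script above, reusing its variables; agreement is only approximate since both solvers are numerical:

# hypothetical cross-check: recover the center from the dual solution
c_dual = Dtrain[:, svdd.get_support_inds()].dot(svdd.get_support()).ravel()
print(np.linalg.norm(c_dual - psvdd.c))  # should be small for matching nu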
--------------------------------------------------------------------------------
/scripts/test_real.py:
--------------------------------------------------------------------------------
import sklearn.metrics as metrics
import numpy as np

from ClusterSVDD.svdd_dual_qp import SvddDualQP
from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.cluster_svdd import ClusterSvdd


def load_data_set(fname, num_data, outlier_frac, train_inds):
    from sklearn.datasets import load_svmlight_file
    X, y = load_svmlight_file(fname)

    print(X.shape)
    y -= np.min(y)  # classes should start from zero: 0,1,2,3,...
    inds = np.array([], dtype='i')
    for i in range(int(max(y))+1):
        inds = np.append(inds, np.where(y == i)[0])

    print(inds.shape)
    X = X.toarray()
    X = X[inds, :]
    y = y[inds]

    inds = np.random.permutation(range(y.size))
    X = X[inds[:num_data], :].T
    y = y[inds[:num_data]]

    # induce anomalies
    anoms = int(float(num_data)*outlier_frac)
    X[:, :anoms] = 1.*(np.random.rand(X.shape[0], anoms)*2.-1.)
    y[:anoms] = -1

    print(np.unique(y))
    return X, y


def evaluate(res_filename, dataset, nus, ks, outlier_frac,
             reps, num_train, num_val, num_test, use_kernels=False):
    train = np.array(range(num_train-num_val), dtype='i')
    val = np.array(range(num_train-num_val, num_train), dtype='i')
    test = np.array(range(num_train, num_train + num_test), dtype='i')

    aris = np.zeros((reps, len(nus), len(ks)))
    aucs = np.zeros((reps, len(nus), len(ks)))

    val_aris = np.zeros((reps, len(nus), len(ks)))
    val_aucs = np.zeros((reps, len(nus), len(ks)))

    for n in range(reps):
        inds = np.random.permutation(range(num_test + num_train))
        data, y = load_data_set(dataset, num_train + num_test, outlier_frac, inds[:num_train])
        data = data[:, inds]
        y = y[inds]
        for k in range(len(ks)):
            # fix the initialization for all methods
            membership = np.random.randint(0, ks[k], y.size)
            for i in range(len(nus)):
                svdds = list()
                for l in range(ks[k]):
                    if use_kernels:
                        svdds.append(SvddDualQP('rbf', 20.0, nus[i]))
                    else:
                        svdds.append(SvddPrimalSGD(nus[i]))
                svdd = ClusterSvdd(svdds)
                svdd.fit(data[:, train].copy(), init_membership=membership[train])
                # test error
                scores, classes = svdd.predict(data[:, test].copy())

                # evaluate clustering abilities
                ari = metrics.cluster.adjusted_rand_score(y[test], classes)
                if nus[i] < 1.0:
                    inds = np.where(scores <= 0.)[0]
                    ari = metrics.cluster.adjusted_rand_score(y[test[inds]], classes[inds])
                aris[n, i, k] = ari

                # ...and anomaly detection accuracy
                fpr, tpr, _ = metrics.roc_curve(np.array(y[test] < 0., dtype='i'), scores, pos_label=1)
                aucs[n, i, k] = metrics.auc(fpr, tpr)

                # validation error
                scores, classes = svdd.predict(data[:, val].copy())
                # evaluate clustering abilities
                ari = metrics.cluster.adjusted_rand_score(y[val], classes)
                if nus[i] < 1.0:
                    inds = np.where(scores <= 0.)[0]
                    ari = metrics.cluster.adjusted_rand_score(y[val[inds]], classes[inds])
                val_aris[n, i, k] = ari

                # ...and anomaly detection accuracy
                fpr, tpr, _ = metrics.roc_curve(np.array(y[val] < 0., dtype='i'), scores, pos_label=1)
                val_aucs[n, i, k] = metrics.auc(fpr, tpr)
    print('---------------------------------------------------')
    maris = np.mean(aris, axis=0)
    saris = np.std(aris, axis=0)
    print('(Test) ARI:')
    print(np.mean(aris, axis=0))
    print(np.std(aris, axis=0))

    val_maris = np.mean(val_aris, axis=0)
    val_saris = np.std(val_aris, axis=0)
    print('(Val) ARI:')
    print(val_maris)
    print(val_saris)

    print('---------------------------------------------------')
    maucs = np.mean(aucs, axis=0)
    saucs = np.std(aucs, axis=0)
    print('(Test) AUC:')
    print(np.mean(aucs, axis=0))
    print(np.std(aucs, axis=0))

    val_maucs = np.mean(val_aucs, axis=0)
    val_saucs = np.std(val_aucs, axis=0)
    print('(Val) AUC:')
    print(val_maucs)
    print(val_saucs)
    print('---------------------------------------------------')

    res = np.zeros(4)
    res_stds = np.zeros(4)

    # best svdd result (assume col 0 is k=1)
    svdd_ind = np.argmax(val_maucs[:, 0])
    print('SVDD best AUC={0}'.format(maucs[svdd_ind, 0]))
    csvdd_ind = np.argmax(val_maucs)
    i1, i2 = np.unravel_index(csvdd_ind, maucs.shape)
    print('ClusterSVDD best AUC={0}'.format(maucs[i1, i2]))
    res[0] = maucs[svdd_ind, 0]
    res_stds[0] = saucs[svdd_ind, 0]
    res[1] = maucs[i1, i2]
    res_stds[1] = saucs[i1, i2]

    # best k-means result (assume row 0 is nu=1.0)
    km_ind = np.argmax(val_maris[0, :])
    print('k-means best ARI={0}'.format(maris[0, km_ind]))
    csvdd_ind = np.argmax(val_maris)
    i1, i2 = np.unravel_index(csvdd_ind, maris.shape)
    print('ClusterSVDD best ARI={0}'.format(maris[i1, i2]))
    res[2] = maris[0, km_ind]
    res_stds[2] = saris[0, km_ind]
    res[3] = maris[i1, i2]
    res_stds[3] = saris[i1, i2]
    print('---------------------------------------------------')

    return res, res_stds


if __name__ == '__main__':
    dataset_name = "../../segment.scale.txt"  # 7c
    # dataset_name = "../../satimage.scale.txt"  # 6c

    nus = [1.0, 0.95, 0.9, 0.5, 0.1, 0.01]
    outlier_fracs = [0.0, 0.02, 0.05, 0.1, 0.15]  # fraction of uniform noise in the generated data
    reps = 10  # number of repetitions for performance measures

    ks = [1, 5, 7, 10, 14]  # segment
    num_train = 1155
    num_test = 1155
    num_val = 250

    if 'satimage' in dataset_name:
        ks = [1, 3, 6, 9]
        # ks = [1, 3, 5, 6, 7]
        num_train = 2217
        num_test = 2218
        num_val = 400

    res_filename = 'res_real_{0}_{1}.npz'.format(reps, dataset_name[6:])

    # res: 0:AUC-SVDD, 1:AUC-CSVDD, 2:ARI-KMEANS, 3:ARI-CSVDD
    res = np.zeros((len(outlier_fracs), 4))
    res_stds = np.zeros((len(outlier_fracs), 4))
    for i in range(len(outlier_fracs)):
        res[i, :], res_stds[i, :] = evaluate(res_filename, dataset_name,
                                             nus, ks, outlier_fracs[i], reps,
                                             num_train, num_val, num_test, use_kernels=False)

    np.savez(res_filename, dataset=dataset_name, res=res, res_stds=res_stds,
             outlier_fracs=outlier_fracs, ntrain=num_train, nval=num_val, ntest=num_test, reps=reps, nus=nus, ks=ks)

    print('==========================================')
    for i in range(len(outlier_fracs)):
        line = '{0}\\%'.format(int(outlier_fracs[i]*100.))
        for j in range(4):
            line += ' & {0:1.2f}/{1:1.2f}'.format(res[i, j], res_stds[i, j])
        line += ' \\\\'
        print(line)
    print('==========================================')

    print('DONE :)')

--------------------------------------------------------------------------------
/scripts/test_robust.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np

from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.cluster_svdd import ClusterSvdd
from ClusterSVDD.svdd_dual_qp import SvddDualQP


def generate_data(datapoints, outlier_frac=0.1, dims=2):
    X = np.zeros((dims, datapoints))
    y = np.zeros(datapoints)

    num_noise = int(np.floor(datapoints*outlier_frac))
    num_dpc = int(np.floor(float(datapoints-num_noise)/2.0))

    X[:, :num_noise] = 0.5*np.random.randn(dims, num_noise) + 0.
    y[:num_noise] = -1

    cnt = num_noise
    X[:, cnt:cnt+num_dpc] = 1.5*np.random.randn(dims, num_dpc) - 1.
    y[cnt:cnt+num_dpc] = 1
    cnt += num_dpc

    X[:, cnt:] = 0.5*np.random.randn(dims, y.size-cnt) + 1.
    y[cnt:] = 2
    return X, y


def plot_results(res_filename):
    # note: relies on the global ks defined in the __main__ block below
    foo = np.load(res_filename)
    maris = foo['maris']
    saris = foo['saris']
    nus = foo['nus']
    reps = foo['reps']

    plt.figure(1)
    np.random.seed(2)
    cols = np.random.rand(maris.shape[1], 3)
    fmts = ['-->', '-.o', '-D', '--s', '--H']
    for i in range(maris.shape[1]):
        plt.errorbar(nus, maris[:, i], saris[:, i]/np.sqrt(reps), fmt=fmts[i], color=cols[i, :],
                     ecolor=cols[i, :], linewidth=2.0, elinewidth=1.0, alpha=0.8)
    for i in range(maris.shape[1]):
        plt.errorbar(nus[-1], maris[-1, i], saris[-1, i]/np.sqrt(reps),
                     color='r', ecolor='r', fmt=fmts[i][-1], markersize=10, linewidth=4.0, elinewidth=4.0, alpha=0.7)

    plt.xlim((-0.05, 1.05))
    plt.ylim((0.2, .8))
    plt.xticks([0.0, 0.25, 0.5, 0.75, 1.0], ['0.0', '0.25', '0.5', '0.75', '1.0 \n= Kernel $k$-means'], fontsize=14)
    plt.grid()
    plt.xlabel(r'regularization parameter $\nu$', fontsize=14)
    plt.ylabel(r'Adjusted Rand Index (ARI)', fontsize=14)
    names = list()
    for i in range(maris.shape[1]):
        names.append('ClusterSVDD ($k$={0})'.format(ks[i]))
    plt.legend(names, loc=4, fontsize=14)
    plt.show()


def evaluate(res_filename, nus, ks, outlier_frac, reps, num_train, num_test, use_primal=True):
    train = np.array(range(num_train), dtype='i')
    test = np.array(range(num_train, num_train + num_test), dtype='i')

    aris = np.zeros((reps, len(nus), len(ks)))
    for n in range(reps):
        # generate new gaussians
        data, y = generate_data(num_train + num_test, outlier_frac=outlier_frac)
        inds = np.random.permutation(range(num_test + num_train))
        data = data[:, inds]
        y = y[inds]
        for k in range(len(ks)):
            # fix the initialization for all methods
            membership = np.random.randint(0, ks[k], y.size)
            for i in range(len(nus)):
                svdds = list()
                for l in range(ks[k]):
                    if use_primal:
                        svdds.append(SvddPrimalSGD(nus[i]))
                    else:
                        svdds.append(SvddDualQP('rbf', 10.0, nus[i]))
                svdd = ClusterSvdd(svdds)
                svdd.fit(data[:, train].copy(), init_membership=membership[train])
                _, classes = svdd.predict(data[:, test].copy())
                # evaluate clustering abilities
                inds = np.where(y[test] >= 0)[0]
                aris[n, i, k] = metrics.cluster.adjusted_rand_score(y[test[inds]], classes[inds])

    print(aris)
    print('')
    maris = np.mean(aris, axis=0)
    saris = np.std(aris, axis=0)
    print(np.mean(aris, axis=0))
    print(np.std(aris, axis=0))
    np.savez(res_filename, maris=maris, saris=saris, outlier_frac=outlier_frac,
             ntrain=num_train, ntest=num_test, reps=reps, nus=nus)


if __name__ == '__main__':
    nus = (np.arange(1, 21)/20.)
    ks = [2, 3, 4]

    outlier_frac = 0.05  # fraction of uniform noise in the generated data
    # outlier_frac = 0.1
    reps = 50  # number of repetitions for performance measures
    num_train = 1000
    num_test = 2000

    do_plot = True
    do_evaluation = False

    res_filename = 'res_robust_{0}_{1}_{2}.npz'.format(reps, len(ks), len(nus))

    if do_evaluation:
        evaluate(res_filename, nus, ks, outlier_frac, reps, num_train, num_test, use_primal=False)
    if do_plot:
        plot_results(res_filename)

    print('DONE :)')

--------------------------------------------------------------------------------
/scripts/test_struct.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np
import time as time

from numba import njit

from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD
from ClusterSVDD.cluster_svdd import ClusterSvdd


def generate_seqs(lens, block_len, cluster=3, dims=3):
    classes = np.random.randint(0, cluster)
    seqs = 1.0*np.random.randn(dims, lens)
    states = np.zeros(lens, dtype='i')
    y = classes
    start = np.random.randint(low=0, high=lens-block_len+1)
    states[start:start+block_len] = 1
    # seqs[0, start:start+block_len] = seqs[0, start:start+block_len]+0.5*classes-2.0*float(classes==0)
    seqs[classes, start:start+block_len] = seqs[0, start:start+block_len]+1.0
    return seqs, states, y


def generate_data(datapoints, cluster=3, outlier_frac=0.1, dims=3, plot=True):
    lens = 500
    X = []
    S = []
    y = np.ones(datapoints, dtype='i')
    idx = np.zeros(cluster, dtype='i')
    idx_anom = -1
    for i in range(datapoints):
        exm, states, y[i] = generate_seqs(lens, 250, cluster=cluster, dims=dims)
        prob = np.random.uniform()
        if prob < outlier_frac:
            idx_anom = i
            exm *= np.random.uniform(low=-0.1, high=+0.1, size=(dims, lens))
            exm *= np.exp(10.0*exm)
            y[i] = -1
        else:
            idx[y[i]] = i
        X.append(exm)
        S.append(states)

    if plot:
        plt.figure(1)

        for d in range(dims):
            for i in range(cluster):
                plt.subplot(1, cluster+1, i+1)
                plt.plot(range(lens), X[idx[i]][d, :]+d*6., '-r', alpha=0.7)
                plt.ylim((-5.0, 20.))
                plt.yticks([0.0])
                xinds = np.where(S[idx[i]] == 1)[0]
                plt.fill_between(xinds, -5, 20, color=[0.3, 0.3, 0.3], alpha=0.25)
                plt.title('Class {0}'.format(i), fontsize=14)
                plt.xlabel('Sequence index', fontsize=14)
                plt.ylabel('Feature 0   Feature 1   Feature 2', fontsize=14)

        plt.subplot(1, cluster+1, cluster+1)
        for d in range(dims):
            plt.plot(range(lens), X[idx_anom][d, :]+d*6., '-r', alpha=0.7)
        plt.yticks([0.0])
71 | 
72 | def preprocess_training_data(data_seqs, state_seqs, train_inds):
73 |     # estimate the transition and emission matrix given the training
74 |     # data only. Number of states is 2.
75 |     N = len(data_seqs)
76 |     F, _ = data_seqs[0].shape
77 |     phi = np.zeros((2*2 + F*2, N))
78 |     for n in train_inds:
79 |         phi[:, n] = get_joint_feature_map(data_seqs[n], state_seqs[n])
80 |         phi[:, n] /= np.linalg.norm(phi[:, n], ord=2)
81 |     return phi
82 | 
83 | 
84 | def preprocess_test_data(csvdd, X, S, inds):
85 |     # 1. for all i,k: y_i,k = argmax_y <c_k, psi(x_i, y)>
86 |     # 2. for all i: calculate membership z_i = argmin_k ||c_k - psi(x_i, y_i,k)||^2 - R_k
87 |     # 3. for all i: hamming loss delta(y_i, y_i,z_i)
88 |     N = inds.size
89 |     F, _ = X[0].shape
90 | 
91 |     pred_phis = np.zeros((2*2 + F*2, N))
92 |     true_states = []
93 |     pred_states = []
94 |     states = []
95 |     for n in range(N):
96 |         states.append(S[inds[n]])
97 |         true_states.append(S[inds[n]])
98 |         pred_states.append(S[inds[n]])
99 | 
100 |     min_scores = 1e12*np.ones(N, dtype='d')
101 |     for k in range(csvdd.clusters):
102 |         phis = np.zeros((2*2 + F*2, N))
103 |         for n in range(N):
104 |             sol = csvdd.svdds[k].c
105 |             states[n] = argmax(sol, X[inds[n]])
106 |             phis[:, n] = get_joint_feature_map(X[inds[n]], states[n])
107 |             # states[n] = true_states[n]
108 |             phis[:, n] /= np.linalg.norm(phis[:, n], ord=2)
109 | 
110 |         scores = csvdd.svdds[k].predict(phis)
111 |         minds = np.where(scores <= min_scores)[0]
112 |         pred_phis[:, minds] = phis[:, minds]
113 |         min_scores[minds] = scores[minds]
114 |         for i in minds:
115 |             pred_states[i] = states[i]
116 | 
117 |     return pred_phis, true_states, pred_states
118 | 
119 | def hamming_loss(y_true, y_pred):
120 |     N = len(y_pred)
121 |     loss = 0.0
122 |     for i in range(N):
123 |         loss += float(np.sum(y_true[i] != y_pred[i])) / float(y_pred[i].size)
124 |     return loss / float(N)
125 | 
126 | 
127 | @njit
128 | def argmax(sol, X):
129 |     # Viterbi decoding: computes the highest-scoring state sequence
130 |     # under the linear model in sol (transition + emission weights)
131 |     T = X.shape[1]
132 |     N = 2
133 | 
134 |     # get transition matrix from current solution
135 |     A = np.zeros((N, N), dtype=np.double)
136 |     for i in range(N):
137 |         for j in range(N):
138 |             A[i, j] = sol[i*N+j]
139 | 
140 |     # calc emission matrix from current solution and data points
141 |     F = X.shape[0]
142 |     em = np.zeros((N, T))
143 |     for t in range(T):
144 |         for s in range(N):
145 |             for f in range(F):
146 |                 em[s, t] += sol[N*N + s*F + f] * X[f, t]
147 | 
148 |     delta = np.zeros((N, T))
149 |     psi = np.zeros((N, T), dtype=np.int8)
150 |     # initialization
151 |     for i in range(N):
152 |         # use equal start probs for each state
153 |         delta[i, 0] = 0. + em[i, 0]
154 | 
155 |     # recursion
156 |     for t in range(1, T):
157 |         for i in range(N):
158 |             foo_argmax = 0
159 |             foo_max = -1e16
160 |             for l in range(N):
161 |                 foo = delta[l, t-1] + A[l, i] + em[i, t]
162 |                 if foo > foo_max:
163 |                     foo_max = foo
164 |                     foo_argmax = l
165 |             psi[i, t] = foo_argmax
166 |             delta[i, t] = foo_max
167 | 
168 |     states = np.zeros(T, dtype=np.int8)
169 |     states[T-1] = np.argmax(delta[:, T-1])
170 | 
171 |     # backtracking
172 |     for t in range(T-1, 0, -1):
173 |         states[t-1] = psi[states[t], t]
174 |     return states
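argmax() is a plain Viterbi recursion over N=2 hidden states. As a toy check (hypothetical numbers, assuming the definitions above): with zero transition weights and an emission weight of 1.0 for state 1 on a single feature, the decoder should switch to state 1 exactly where the feature becomes large.

    F = 1
    sol = np.zeros(2*2 + 2*F)
    sol[2*2 + 1*F + 0] = 1.0                 # emission weight: state 1, feature 0
    x = np.array([[-1., -1., 2., 2.]])       # one feature over T=4 time steps
    print(argmax(sol, x))                    # expected: [0 0 1 1]
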
175 | 
176 | 
177 | @njit
178 | def get_joint_feature_map(X, y):
179 |     N = 2
180 |     T = y.size
181 |     F = X.shape[0]
182 |     jfm = np.zeros(N*N + N*F)
183 |     # transition part: index i*N+j counts transitions i -> j (same layout as A[i, j] = sol[i*N+j] in argmax)
184 |     for t in range(T-1):
185 |         for i in range(N):
186 |             for j in range(N):
187 |                 if y[t] == i and y[t+1] == j:
188 |                     jfm[i*N+j] += 1
189 |     # emission parts
190 |     for t in range(T):
191 |         for f in range(F):
192 |             jfm[y[t]*F + f + N*N] += X[f, t]
193 |     return jfm
194 | 
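The layout of the joint feature map explains the magic number 2*2 + F*2 used in preprocess_training_data() and preprocess_test_data(): N*N transition counts followed by N*F summed emissions, i.e. 4 + 2F entries for N=2 states. A small sketch (hypothetical toy inputs):

    y_toy = np.array([0, 0, 1, 1], dtype='i')     # states over T=4 steps
    x_toy = np.random.randn(3, 4)                 # F=3 features x T=4 steps
    jfm = get_joint_feature_map(x_toy, y_toy)
    print(jfm.shape)                              # (10,) = N*N + N*F = 4 + 6
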
195 | 
196 | def plot_results(res_filename):
197 |     data, states, y = generate_data(1000, cluster=3, outlier_frac=0.05, dims=3, plot=False)
198 | 
199 |     foo = np.load(res_filename)
200 |     maris = foo['maris']
201 |     saris = foo['saris']
202 |     mloss = foo['mloss']
203 |     sloss = foo['sloss']
204 |     nus = foo['nus']
205 |     reps = foo['reps']
206 | 
207 |     res = np.zeros((len(nus), 4))
208 |     res_stds = np.zeros((len(nus), 4))
209 | 
210 |     # svdd
211 |     res[:, 0] = mloss[:, 0]
212 |     res_stds[:, 0] = sloss[:, 0]
213 |     # csvdd
214 |     res[:, 1] = mloss[:, 1]
215 |     res_stds[:, 1] = sloss[:, 1]
216 | 
217 |     # kmeans
218 |     res[0, 2] = maris[0, 1]
219 |     res_stds[0, 2] = saris[0, 1]
220 |     # csvdd
221 |     res[:, 3] = maris[:, 1]
222 |     res_stds[:, 3] = saris[:, 1]
223 | 
224 |     print('==========================================')
225 |     for i in range(len(nus)):
226 |         line = '{0:1.2f}\\%'.format(nus[i])
227 |         for j in range(4):
228 |             line += ' & {0:1.2f}/{1:1.2f}'.format(res[i, j], res_stds[i, j])
229 |         line += ' \\\\'
230 |         print(line)
231 |     print('==========================================')
232 | 
233 | 
234 | def evaluate(res_filename, nus, ks, outlier_frac, reps, num_train, num_test):
235 |     train = np.array(range(num_train), dtype='i')
236 |     test = np.array(range(num_train, num_train + num_test), dtype='i')
237 | 
238 |     aris = np.zeros((reps, len(nus), len(ks)))
239 |     loss = np.zeros((reps, len(nus), len(ks)))
240 |     for n in range(reps):
241 |         # generate new sequence data
242 |         X, S, y = generate_data(num_train + num_test, cluster=3, outlier_frac=outlier_frac, dims=3, plot=False)
243 |         inds = np.random.permutation(range(num_test + num_train))
244 |         data = preprocess_training_data(X, S, inds[:num_train])
245 |         data = data[:, inds]
246 |         y = y[inds]
247 |         print(data)
248 |         print(y)
249 |         for k in range(len(ks)):
250 |             # fix the initialization for all methods
251 |             membership = np.random.randint(0, ks[k], y.size)
252 |             for i in range(len(nus)):
253 |                 svdds = list()
254 |                 for l in range(ks[k]):
255 |                     svdds.append(SvddPrimalSGD(nus[i]))
256 |                 svdd = ClusterSvdd(svdds)
257 |                 svdd.fit(data[:, train], init_membership=membership[train])
258 | 
259 |                 stime = time.time()
260 |                 pred_phis, true_states, pred_states = preprocess_test_data(svdd, X, S, inds[num_train:])
261 |                 _, classes = svdd.predict(pred_phis)
262 |                 print('---------------- TIME')
263 |                 print(time.time()-stime)
264 |                 print('----------------')
265 | 
266 |                 # evaluate clustering abilities
267 |                 ninds = np.where(y[test] >= 0)[0]
268 |                 aris[n, i, k] = metrics.cluster.adjusted_rand_score(y[test[ninds]], classes[ninds])
269 |                 # evaluate structured prediction accuracy
270 |                 loss[n, i, k] = hamming_loss(true_states, pred_states)
271 |                 print(loss[n, i, k])
272 | 
273 |     maris = np.mean(aris, axis=0)
274 |     saris = np.std(aris, axis=0)
275 |     print('ARI')
276 |     print(np.mean(aris, axis=0))
277 |     print(np.std(aris, axis=0))
278 | 
279 |     mloss = np.mean(loss, axis=0)
280 |     sloss = np.std(loss, axis=0)
281 |     print('Normalized Hamming Distance')
282 |     print(np.mean(loss, axis=0))
283 |     print(np.std(loss, axis=0))
284 | 
285 |     np.savez(res_filename, maris=maris, saris=saris, mloss=mloss, sloss=sloss,
286 |              outlier_frac=outlier_frac, ntrain=num_train, ntest=num_test, reps=reps, nus=nus)
287 | 
288 | 
289 | if __name__ == '__main__':
290 |     nus = [1.0, 0.9, 0.5, 0.1, 0.01]
291 |     ks = [1, 3]
292 | 
293 |     outlier_frac = 0.05  # fraction of anomalous sequences in the generated data
294 |     reps = 10  # number of repetitions for performance measures
295 |     num_train = 2000
296 |     num_test = 500
297 | 
298 |     do_plot = True
299 |     do_evaluation = True
300 | 
301 |     res_filename = 'res_struct_{0}_{1}_{2}.npz'.format(reps, len(ks), len(nus))
302 | 
303 |     if do_evaluation:
304 |         evaluate(res_filename, nus, ks, outlier_frac, reps, num_train, num_test)
305 |     if do_plot:
306 |         # data, states, y = generate_data(num_train + num_test, outlier_frac=outlier_frac, dims=2, plot=True)
307 |         plot_results(res_filename)
308 | 
309 |     print('DONE :)')
310 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | try:
2 |     from setuptools import setup
3 | except ImportError:
4 |     from distutils.core import setup
5 | 
6 | config = {
7 |     'description': 'ClusterSVDD: latent variable support vector data description',
8 |     'url': 'https://github.com/nicococo/ClusterSVDD',
9 |     'author': 'Nico Goernitz',
10 |     'author_email': 'nico.goernitz@tu-berlin.de',
11 |     'version': '0.1',
12 |     'install_requires': ['numba', 'cvxopt', 'scikit-learn', 'numpy', 'scipy'],
13 |     'packages': ['ClusterSVDD'],
14 |     'package_dir': {'ClusterSVDD': 'ClusterSVDD'},  # key must match the package name above
15 |     #'package_data': {'clusterSVDD': ['*.txt']},
16 |     #'scripts': ['bin/ClusterSVDD.sh'],
17 |     'name': 'ClusterSVDD',
18 |     'classifiers': ['Intended Audience :: Science/Research',
19 |                     'Programming Language :: Python',
20 |                     'Topic :: Scientific/Engineering',
21 |                     'Operating System :: POSIX',
22 |                     'Operating System :: Unix',
23 |                     'Operating System :: MacOS',
24 |                     'Programming Language :: Python :: 2',
25 |                     'Programming Language :: Python :: 2.7']
26 | }
27 | 
28 | setup(**config)
--------------------------------------------------------------------------------
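The .npz files written by the evaluate() functions in the scripts above can be reloaded for custom post-processing; a sketch, assuming test_struct.py has been run with its default settings (reps=10, ks=[1, 3], five nu values, which yield the filename below):

    import numpy as np

    foo = np.load('res_struct_10_2_5.npz')   # filename pattern from test_struct.py's __main__
    print(sorted(foo.keys()))                # maris, mloss, nus, reps, saris, sloss, ...
    print(foo['maris'].shape)                # (len(nus), len(ks)) mean ARIs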