├── README.md
└── sample_svdd.py


/README.md:
--------------------------------------------------------------------------------
1 | A sampling based SVDD implementation. Look at the run_main() function in sample_svdd.py for a simple comparison with the
2 | standard method.
3 | 
4 | 


--------------------------------------------------------------------------------
/sample_svdd.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Tue Apr 19 16:31:41 2016
  4 | This script runs under python3.
  5 | Code written to accompany sampling based svdd paper.
  6 | """
  7 | 
  8 | #  Some general comments.
  9 | # 1. One Class SVM formulation (OCSVM) is identical to the SVDD formulation for the Gaussian Kernel.
 10 | # 2. The feasible set  for the optimization in SVDD/OCSVM computation is
 11 | #        0 <= alpha_i <= 1/(n * f),
 12 | #        \sum a\lpha_i = 1,
 13 | #    which is equivalent to
 14 | #       0 <= alpha_i <= min(1,1/(n*f)),
 15 | #       \sum alpha_i = 1.
 16 | #    So a value of f less than 1/n can be replaced by 1/n. For some reason explicitly replacing f
 17 | #    gives much better results than passing in tiny values of f.
 18 | # For the paper we used the C++ SVDD implementation from LIBSVM here:
 19 | # https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/#libsvm_for_svdd_and_finding_the_smallest_sphere_containing_all_data.
 20 | # Even though Scikit-learn's OCSVM implementation is also based on LIBSVM, their performance characteristics are different.
 21 | #  LIBSVM probably uses different solvers for OCSVM and SVDD, and in may cases this python OCSVM implmentation outperformed
 22 | # the SVDD one significantly.
 23 | 
 24 | 
 25 | from collections import namedtuple
 26 | import numpy as np
 27 | from numpy.random import choice
 28 | from sklearn import svm
 29 | from sklearn.metrics.pairwise import rbf_kernel
 30 | 
 31 | #Compute the radius and center from a svdd result
 32 | def _compute_radius_center(clf, method=1):
 33 |     sv, coef = clf.support_vectors_, clf.dual_coef_
 34 |     sv_pos = np.where((coef < 1)[0, ...])[0]
 35 |     coef.shape = (coef.shape[1], )
 36 |     coef = coef/np.sum(coef)
 37 |     center = np.dot(coef, sv)
 38 |     #method 1 is a fast approximation of the radius which is good enough for our purpose
 39 |     if method == 0:
 40 |         m = rbf_kernel(sv, sv, gamma=clf.gamma)
 41 |         radius = 1 - 2 * np.dot(m[sv_pos[0], ...], coef) + np.dot(coef, np.dot(m, coef))
 42 |     else:
 43 |         v = sv[sv_pos[0], ...].reshape(1, sv.shape[1])
 44 |         m = rbf_kernel(v, sv, gamma=clf.gamma)
 45 |         radius = 1 - np.dot(m, coef)
 46 |     return radius, center
 47 | 
 48 | #compute svdd given the indices of the sample 
 49 | def _do_one_class_svm_sample(gamma, nu, x_train, sample_indices, compute_rc=True):
 50 |     x_train_sample = x_train[sample_indices, ...]
 51 |     nsample = x_train_sample.shape[0]
 52 |     nu_1 = nu  if nu * nsample > 1 else 1/nsample
 53 |     clf = svm.OneClassSVM(gamma=gamma, nu=nu_1)
 54 |     clf.fit(x_train_sample)
 55 |     if compute_rc:
 56 |         radius, center = _compute_radius_center(clf)
 57 |         return sample_indices[clf.support_], radius, center
 58 |     else:
 59 |         return sample_indices[clf.support_]
 60 | 
 61 | # draw a random sample from the original data and peform svdd on it
 62 | def _do_one_class_svm_random(gamma, nu, x_train, sample_size, compute_rc=True):
 63 |     sample = choice(x_train.shape[0], sample_size)
 64 |     return _do_one_class_svm_sample(gamma, nu, x_train, sample, compute_rc=compute_rc)
 65 | 
 66 | 
 67 | # the sampling svdd implementation, see the __main__ section for an example
 68 | def sample_svdd(x_train,
 69 |                 outlier_fraction=0.001,
 70 |                 kernel_s=2,
 71 |                 maxiter=1000,
 72 |                 sample_size=10,
 73 |                 resample_n=3,
 74 |                 stop_tol=1e-6,
 75 |                 n_iter=30,
 76 |                 iter_history=True,
 77 |                 seed=2513646):
 78 |     """
 79 |     Perform sampling based approximate svdd.
 80 |     Input Parameters:
 81 |         x_train : input data to train, must be a two-dim numpy array 
 82 |         kernel_s: the bandwidth for the Gaussian kernel, the Gaussian kernel is 
 83 |                   assumed to be of the form exp( -||x - y||^2 / (2 *kernel_s^2))
 84 |         sample_size: the size of each random sample 
 85 |         resample_n: take these many samples in each iteration, and merge the union of their support vectors with the
 86 |                     master, the method documented in the paper corresponds to resample_n = 1
 87 |         stop_tol: the tolerance value to detect convergence
 88 |         n_iter: the raidus and center must be close to each other for this many consecutive iterations
 89 |                 for convergence to be declared
 90 |         iter: flag to determine whether convergence history will be stored
 91 |         seed: seed value for the random number generator    
 92 |     Output:
 93 |         The output is a named tuple. If the output is denoted by res then:
 94 |             res.IterHist: a named tuple containing the iteration history
 95 |                 res.IterHist.niter_ : number of iterations till convergence
 96 |                 res.IterHist.radius_history_ : the iteration history for the radius
 97 |                 res.IterHist.center_history_: the iteration history of the center
 98 |                 res.IterHist.converged_ : convergence status flag
 99 |             res.Params: a named tuple containing the output parameters of the suggested SVDD 
100 |                 res.Params.sv_: the indices of the fitted support vectors
101 |                 res.Params.center_: final center point
102 |                 res.Params.radius_ : final radius
103 |             res.OneClassSVM:
104 |                 A sklearn.svm.OneClassSVM instance corresponding to the result. Can be used for scoring.                                
105 |     """    
106 |     
107 |     # Only matrix input allowed
108 |     if len(x_train.shape) != 2:
109 |         print("ERROR: invalid x_train input found, expecting a matrix")
110 |         raise ValueError
111 | 
112 |     #sanity checks
113 |     if maxiter <= 0:
114 |         print("ERROR: maxiter must be positive integer")
115 |         raise ValueError
116 | 
117 |     nobs = x_train.shape[0]
118 | 
119 |     if nobs <= sample_size:
120 |         print("ERROR: sample size must be strictly smaller than number of observations in input data")
121 |         raise ValueError
122 | 
123 |     # convert kernel_s to gamma
124 |     gamma, nu = 0.5/(kernel_s*kernel_s), outlier_fraction
125 | 
126 |     if np.isfinite(gamma) != True or np.isfinite(nu) != True or (nu < 0) or (nu > 1):
127 |         print("ERROR: Invalid kernel_s or outlier_fraction input")
128 |         raise ValueError
129 | 
130 |     #if negative seed is provided use a system chosen seed
131 |     np.random.seed(seed=seed if seed >= 0 else None)
132 | 
133 |     if iter_history:
134 |         radius_history, center_history = np.empty(maxiter+1), list()
135 | 
136 |     clf = None
137 |     sv_ind_prev, radius_prev, center_prev = _do_one_class_svm_random(gamma, nu, x_train, sample_size)
138 | 
139 |     if iter_history:
140 |         radius_history[0] = radius_prev
141 |         center_history.append(center_prev)
142 | 
143 |     i, converged, iter_n = 0, 0, 0
144 |     while i < maxiter:
145 |         if converged: break
146 | 
147 |         sv_ind_local = _do_one_class_svm_random(gamma, nu, x_train, sample_size, compute_rc=False)
148 |         for dummy1 in range(resample_n-1):
149 |             sv_ind_locals = _do_one_class_svm_random(gamma, nu, x_train, sample_size, compute_rc=False)
150 |             sv_ind_local = np.union1d(sv_ind_locals, sv_ind_local)
151 | 
152 |         sv_ind_merge = np.union1d(sv_ind_local, sv_ind_prev)
153 |         sv_ind_master, radius_master, center_master = _do_one_class_svm_sample(gamma, nu, x_train, sv_ind_merge)
154 | 
155 | 
156 |         if iter_history:
157 |             radius_history[i+1] = radius_master
158 |             center_history.append(center_master)
159 | 
160 |         iter_n = iter_n + 1 if np.fabs(radius_master - radius_prev) <= stop_tol * np.fabs(radius_prev) else 0
161 |         if iter_n >= n_iter:
162 |             converged = 1
163 |         else:
164 |             sv_ind_prev, center_prev, radius_prev = sv_ind_master, center_master, radius_master
165 |         i += 1
166 | 
167 |     if iter_history:
168 |         radius_history = radius_history[0:i+1]
169 |     niter = i + 1
170 | 
171 |     SampleSVDDRes      = namedtuple("SampleSVDDRes", "Params  IterHist OneClassSVM")
172 |     SampleSVDDParams   = namedtuple("SampleSVDDParams", "sv_ center_ radius_")
173 |     SampleSVDDIterHist = namedtuple("SampleSVDDIterHist", "niter_ radius_history_ center_history_ converged_")
174 | 
175 |     params = SampleSVDDParams(sv_ind_master, center_master, radius_master)
176 | 
177 |     iterhist = None
178 |     if iter_history:
179 |         iterhist = SampleSVDDIterHist(niter, radius_history, center_history, converged)
180 | 
181 |     nsv = sv_ind_master.shape[0]
182 |     clf = svm.OneClassSVM(gamma=gamma, nu=nu if nu * nsv > 1 else 1./nsv)
183 |     clf.fit(x_train[sv_ind_master, ...])
184 | 
185 |     return SampleSVDDRes(params, iterhist, clf)
186 | 
187 | if __name__ == "__main__":
188 |     def run_main():
189 |         import matplotlib.pyplot as plt
190 |         import time
191 |         #create a donut data.
192 |         def one_donut(rmin, rmax, origin, nobs):
193 |             """
194 |                 rmin: inner radius
195 |                 rmax: outer radis
196 |                 origin: origin
197 |                 nobs: number of observations in the data
198 |             """
199 |             r = np.sqrt(rmin*rmin + (rmax - rmin) * (rmax + rmin) * np.random.ranf(nobs))
200 |             theta = 2 * np.pi * np.random.ranf(nobs)
201 |             res = np.array([(r_*np.cos(theta_), r_*np.sin(theta_)) for r_, theta_ in zip(r, theta)])
202 |             return res + origin
203 | 
204 |         seed = 24215125
205 |         np.random.seed(seed)
206 |         
207 |         #store time taken by the two methods
208 |         tsample, tfull = list(),list()
209 |         
210 |         #run the method over data sets of these sizes
211 |         dsize_list = [5000,10000,100000,500000,1000000,1250000,2000000] 
212 |         
213 |         #this will take about 10mins to run
214 |         for ndat in dsize_list:
215 |             
216 |             #parameters of the two donuts           
217 |             r_min1, r_max1, origin1, nobs1 = 3, 5, (0, 0), np.floor(0.75 * ndat)
218 |             r_min2, r_max2, origin2, nobs2 = 2, 4, (10, 10), ndat - nobs1
219 |     
220 |             #create the training data        
221 |             test_data = np.append(one_donut(r_min1, r_max1, origin1, nobs1), one_donut(r_min2, r_max2, origin2, nobs2), axis=0)
222 |             
223 |             print('the test data has {0} observations'.format(test_data.shape[0]))
224 |             
225 |             #parameters of the training SVDD. Tweak for performance/accuracy.
226 |             outlier_fraction, kernel_s = 0.0001, 1.3
227 |             sample_size, resample_n, n_iter = 10, 1, 10
228 |             stop_tol, maxiter = 1e-4, 5000
229 |             
230 |             #train using sampling svdd
231 |             start = time.time()
232 |             result = sample_svdd(test_data,
233 |                                  outlier_fraction=outlier_fraction, 
234 |                                  kernel_s=kernel_s, 
235 |                                  resample_n=resample_n, 
236 |                                  maxiter=maxiter, 
237 |                                  sample_size=sample_size, 
238 |                                  stop_tol=stop_tol, 
239 |                                  n_iter=n_iter, 
240 |                                  iter_history=True, 
241 |                                  seed=seed)
242 |             end = time.time()
243 |             tsample.append( end-start )
244 |             print("sample svdd took {0} seconds to train, iteration history stored".format(end-start))
245 |             radius_history = result.IterHist.radius_history_
246 |             sv_indices = result.Params.sv_
247 |     
248 |             #train using full svdd
249 |             start = time.time()
250 |             clf1 = svm.OneClassSVM(nu=outlier_fraction if test_data.shape[0] * outlier_fraction > 1 else 1./test_data.shape[0], kernel="rbf", gamma=0.5/(kernel_s*kernel_s))
251 |             clf1.fit(test_data)
252 |             end = time.time()
253 |             tfull.append(end-start)
254 |             print("full svdd took {0} seconds to train".format(end-start))
255 |     
256 |             
257 |             #plot the support vectors
258 |             plt.figure(1)
259 |             plt.grid(True)
260 |             plt.title('Support Vectors (Sampling Method)')
261 |             plt.scatter(test_data[sv_indices, 0], test_data[sv_indices, 1])
262 |             plt.show()
263 |     
264 |             plt.figure(2)
265 |             plt.grid(True)
266 |             plt.title('Support Vectors (Full SVDD))')
267 |             plt.scatter(clf1.support_vectors_[..., 0], clf1.support_vectors_[..., 1])
268 |             plt.show()
269 |     
270 |             plt.figure(3)
271 |             plt.title('Iteration History for Sampling Method')
272 |             plt.plot(radius_history)
273 |             plt.show()
274 |     
275 |             #create a 200 x 200 grid on the bounding rectangle of the training data
276 |             # for scoring
277 |             ngrid=200
278 |             max_x, max_y = np.amax(test_data, axis=0)
279 |             min_x, min_y = np.amin(test_data, axis=0)
280 |     
281 |             x_ = np.linspace(min_x, max_x, ngrid)
282 |             y_ = np.linspace(min_y, max_y, ngrid)
283 |     
284 |             x, y = np.meshgrid(x_, y_)
285 |     
286 |             score_data = np.array([(x1, y1) for x1, y1 in zip(x.ravel(), y.ravel())])
287 |     
288 |             #the OneClasSVM result corresponding to the sample method
289 |             clf2 = result.OneClassSVM
290 |     
291 |             scores1 = clf1.predict(score_data)
292 |             scores2 = clf2.predict(score_data)
293 |             
294 |             #plot the scored data
295 |             plt.figure(4)
296 |             p2 = np.where(scores2 == 1)
297 |             plt.grid(True)
298 |             plt.title("Scoring Results : Inside Points Colored green (using sampling svdd)")
299 |             plt.scatter(score_data[p2, 0], score_data[p2, 1], color='g', s=0.75)
300 |             plt.show()
301 |     
302 |             plt.figure(5)
303 |             p1 = np.where(scores1 == 1)
304 |             plt.grid(True)
305 |             plt.title("Scoring Results : Inside Points Colored (using full svdd)")
306 |             plt.scatter(score_data[p1, 0], score_data[p1, 1], color='g', s=0.75)
307 |             plt.show()
308 | 
309 |         plt.figure(6)
310 |         plt.grid(True)
311 |         plt.title("Sampling SVDD Performance. Sample Size {0}".format(sample_size))
312 |         plt.xlabel("Input Data Size")
313 |         plt.ylabel("Time Taken (in seconds)")
314 |         plt.plot(dsize_list,tsample)
315 |         
316 |         plt.figure(7)
317 |         plt.grid(True)
318 |         plt.title("Full SVDD Performance")
319 |         plt.xlabel("Input Data Size")
320 |         plt.ylabel("Time Taken (in seconds)")
321 |         plt.plot(dsize_list,tfull)
322 |     
323 |     run_main()    
324 |     
325 |     
326 |         
327 |         
328 |         
329 |         
330 |     
331 |     
332 | 
333 | 


--------------------------------------------------------------------------------