├── README.md └── sample_svdd.py /README.md: -------------------------------------------------------------------------------- 1 | A sampling based SVDD implementation. Look at the run_main() function in sample_svdd.py for a simple comparison with the 2 | standard method. 3 | 4 | -------------------------------------------------------------------------------- /sample_svdd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 19 16:31:41 2016 4 | This script runs under python3. 5 | Code written to accompany sampling based svdd paper. 6 | """ 7 | 8 | # Some general comments. 9 | # 1. One Class SVM formulation (OCSVM) is identical to the SVDD formulation for the Gaussian Kernel. 10 | # 2. The feasible set for the optimization in SVDD/OCSVM computation is 11 | # 0 <= alpha_i <= 1/(n * f), 12 | # \sum a\lpha_i = 1, 13 | # which is equivalent to 14 | # 0 <= alpha_i <= min(1,1/(n*f)), 15 | # \sum alpha_i = 1. 16 | # So a value of f less than 1/n can be replaced by 1/n. For some reason explicitly replacing f 17 | # gives much better results than passing in tiny values of f. 18 | # For the paper we used the C++ SVDD implementation from LIBSVM here: 19 | # https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/#libsvm_for_svdd_and_finding_the_smallest_sphere_containing_all_data. 20 | # Even though Scikit-learn's OCSVM implementation is also based on LIBSVM, their performance characteristics are different. 21 | # LIBSVM probably uses different solvers for OCSVM and SVDD, and in may cases this python OCSVM implmentation outperformed 22 | # the SVDD one significantly. 23 | 24 | 25 | from collections import namedtuple 26 | import numpy as np 27 | from numpy.random import choice 28 | from sklearn import svm 29 | from sklearn.metrics.pairwise import rbf_kernel 30 | 31 | #Compute the radius and center from a svdd result 32 | def _compute_radius_center(clf, method=1): 33 | sv, coef = clf.support_vectors_, clf.dual_coef_ 34 | sv_pos = np.where((coef < 1)[0, ...])[0] 35 | coef.shape = (coef.shape[1], ) 36 | coef = coef/np.sum(coef) 37 | center = np.dot(coef, sv) 38 | #method 1 is a fast approximation of the radius which is good enough for our purpose 39 | if method == 0: 40 | m = rbf_kernel(sv, sv, gamma=clf.gamma) 41 | radius = 1 - 2 * np.dot(m[sv_pos[0], ...], coef) + np.dot(coef, np.dot(m, coef)) 42 | else: 43 | v = sv[sv_pos[0], ...].reshape(1, sv.shape[1]) 44 | m = rbf_kernel(v, sv, gamma=clf.gamma) 45 | radius = 1 - np.dot(m, coef) 46 | return radius, center 47 | 48 | #compute svdd given the indices of the sample 49 | def _do_one_class_svm_sample(gamma, nu, x_train, sample_indices, compute_rc=True): 50 | x_train_sample = x_train[sample_indices, ...] 51 | nsample = x_train_sample.shape[0] 52 | nu_1 = nu if nu * nsample > 1 else 1/nsample 53 | clf = svm.OneClassSVM(gamma=gamma, nu=nu_1) 54 | clf.fit(x_train_sample) 55 | if compute_rc: 56 | radius, center = _compute_radius_center(clf) 57 | return sample_indices[clf.support_], radius, center 58 | else: 59 | return sample_indices[clf.support_] 60 | 61 | # draw a random sample from the original data and peform svdd on it 62 | def _do_one_class_svm_random(gamma, nu, x_train, sample_size, compute_rc=True): 63 | sample = choice(x_train.shape[0], sample_size) 64 | return _do_one_class_svm_sample(gamma, nu, x_train, sample, compute_rc=compute_rc) 65 | 66 | 67 | # the sampling svdd implementation, see the __main__ section for an example 68 | def sample_svdd(x_train, 69 | outlier_fraction=0.001, 70 | kernel_s=2, 71 | maxiter=1000, 72 | sample_size=10, 73 | resample_n=3, 74 | stop_tol=1e-6, 75 | n_iter=30, 76 | iter_history=True, 77 | seed=2513646): 78 | """ 79 | Perform sampling based approximate svdd. 80 | Input Parameters: 81 | x_train : input data to train, must be a two-dim numpy array 82 | kernel_s: the bandwidth for the Gaussian kernel, the Gaussian kernel is 83 | assumed to be of the form exp( -||x - y||^2 / (2 *kernel_s^2)) 84 | sample_size: the size of each random sample 85 | resample_n: take these many samples in each iteration, and merge the union of their support vectors with the 86 | master, the method documented in the paper corresponds to resample_n = 1 87 | stop_tol: the tolerance value to detect convergence 88 | n_iter: the raidus and center must be close to each other for this many consecutive iterations 89 | for convergence to be declared 90 | iter: flag to determine whether convergence history will be stored 91 | seed: seed value for the random number generator 92 | Output: 93 | The output is a named tuple. If the output is denoted by res then: 94 | res.IterHist: a named tuple containing the iteration history 95 | res.IterHist.niter_ : number of iterations till convergence 96 | res.IterHist.radius_history_ : the iteration history for the radius 97 | res.IterHist.center_history_: the iteration history of the center 98 | res.IterHist.converged_ : convergence status flag 99 | res.Params: a named tuple containing the output parameters of the suggested SVDD 100 | res.Params.sv_: the indices of the fitted support vectors 101 | res.Params.center_: final center point 102 | res.Params.radius_ : final radius 103 | res.OneClassSVM: 104 | A sklearn.svm.OneClassSVM instance corresponding to the result. Can be used for scoring. 105 | """ 106 | 107 | # Only matrix input allowed 108 | if len(x_train.shape) != 2: 109 | print("ERROR: invalid x_train input found, expecting a matrix") 110 | raise ValueError 111 | 112 | #sanity checks 113 | if maxiter <= 0: 114 | print("ERROR: maxiter must be positive integer") 115 | raise ValueError 116 | 117 | nobs = x_train.shape[0] 118 | 119 | if nobs <= sample_size: 120 | print("ERROR: sample size must be strictly smaller than number of observations in input data") 121 | raise ValueError 122 | 123 | # convert kernel_s to gamma 124 | gamma, nu = 0.5/(kernel_s*kernel_s), outlier_fraction 125 | 126 | if np.isfinite(gamma) != True or np.isfinite(nu) != True or (nu < 0) or (nu > 1): 127 | print("ERROR: Invalid kernel_s or outlier_fraction input") 128 | raise ValueError 129 | 130 | #if negative seed is provided use a system chosen seed 131 | np.random.seed(seed=seed if seed >= 0 else None) 132 | 133 | if iter_history: 134 | radius_history, center_history = np.empty(maxiter+1), list() 135 | 136 | clf = None 137 | sv_ind_prev, radius_prev, center_prev = _do_one_class_svm_random(gamma, nu, x_train, sample_size) 138 | 139 | if iter_history: 140 | radius_history[0] = radius_prev 141 | center_history.append(center_prev) 142 | 143 | i, converged, iter_n = 0, 0, 0 144 | while i < maxiter: 145 | if converged: break 146 | 147 | sv_ind_local = _do_one_class_svm_random(gamma, nu, x_train, sample_size, compute_rc=False) 148 | for dummy1 in range(resample_n-1): 149 | sv_ind_locals = _do_one_class_svm_random(gamma, nu, x_train, sample_size, compute_rc=False) 150 | sv_ind_local = np.union1d(sv_ind_locals, sv_ind_local) 151 | 152 | sv_ind_merge = np.union1d(sv_ind_local, sv_ind_prev) 153 | sv_ind_master, radius_master, center_master = _do_one_class_svm_sample(gamma, nu, x_train, sv_ind_merge) 154 | 155 | 156 | if iter_history: 157 | radius_history[i+1] = radius_master 158 | center_history.append(center_master) 159 | 160 | iter_n = iter_n + 1 if np.fabs(radius_master - radius_prev) <= stop_tol * np.fabs(radius_prev) else 0 161 | if iter_n >= n_iter: 162 | converged = 1 163 | else: 164 | sv_ind_prev, center_prev, radius_prev = sv_ind_master, center_master, radius_master 165 | i += 1 166 | 167 | if iter_history: 168 | radius_history = radius_history[0:i+1] 169 | niter = i + 1 170 | 171 | SampleSVDDRes = namedtuple("SampleSVDDRes", "Params IterHist OneClassSVM") 172 | SampleSVDDParams = namedtuple("SampleSVDDParams", "sv_ center_ radius_") 173 | SampleSVDDIterHist = namedtuple("SampleSVDDIterHist", "niter_ radius_history_ center_history_ converged_") 174 | 175 | params = SampleSVDDParams(sv_ind_master, center_master, radius_master) 176 | 177 | iterhist = None 178 | if iter_history: 179 | iterhist = SampleSVDDIterHist(niter, radius_history, center_history, converged) 180 | 181 | nsv = sv_ind_master.shape[0] 182 | clf = svm.OneClassSVM(gamma=gamma, nu=nu if nu * nsv > 1 else 1./nsv) 183 | clf.fit(x_train[sv_ind_master, ...]) 184 | 185 | return SampleSVDDRes(params, iterhist, clf) 186 | 187 | if __name__ == "__main__": 188 | def run_main(): 189 | import matplotlib.pyplot as plt 190 | import time 191 | #create a donut data. 192 | def one_donut(rmin, rmax, origin, nobs): 193 | """ 194 | rmin: inner radius 195 | rmax: outer radis 196 | origin: origin 197 | nobs: number of observations in the data 198 | """ 199 | r = np.sqrt(rmin*rmin + (rmax - rmin) * (rmax + rmin) * np.random.ranf(nobs)) 200 | theta = 2 * np.pi * np.random.ranf(nobs) 201 | res = np.array([(r_*np.cos(theta_), r_*np.sin(theta_)) for r_, theta_ in zip(r, theta)]) 202 | return res + origin 203 | 204 | seed = 24215125 205 | np.random.seed(seed) 206 | 207 | #store time taken by the two methods 208 | tsample, tfull = list(),list() 209 | 210 | #run the method over data sets of these sizes 211 | dsize_list = [5000,10000,100000,500000,1000000,1250000,2000000] 212 | 213 | #this will take about 10mins to run 214 | for ndat in dsize_list: 215 | 216 | #parameters of the two donuts 217 | r_min1, r_max1, origin1, nobs1 = 3, 5, (0, 0), np.floor(0.75 * ndat) 218 | r_min2, r_max2, origin2, nobs2 = 2, 4, (10, 10), ndat - nobs1 219 | 220 | #create the training data 221 | test_data = np.append(one_donut(r_min1, r_max1, origin1, nobs1), one_donut(r_min2, r_max2, origin2, nobs2), axis=0) 222 | 223 | print('the test data has {0} observations'.format(test_data.shape[0])) 224 | 225 | #parameters of the training SVDD. Tweak for performance/accuracy. 226 | outlier_fraction, kernel_s = 0.0001, 1.3 227 | sample_size, resample_n, n_iter = 10, 1, 10 228 | stop_tol, maxiter = 1e-4, 5000 229 | 230 | #train using sampling svdd 231 | start = time.time() 232 | result = sample_svdd(test_data, 233 | outlier_fraction=outlier_fraction, 234 | kernel_s=kernel_s, 235 | resample_n=resample_n, 236 | maxiter=maxiter, 237 | sample_size=sample_size, 238 | stop_tol=stop_tol, 239 | n_iter=n_iter, 240 | iter_history=True, 241 | seed=seed) 242 | end = time.time() 243 | tsample.append( end-start ) 244 | print("sample svdd took {0} seconds to train, iteration history stored".format(end-start)) 245 | radius_history = result.IterHist.radius_history_ 246 | sv_indices = result.Params.sv_ 247 | 248 | #train using full svdd 249 | start = time.time() 250 | clf1 = svm.OneClassSVM(nu=outlier_fraction if test_data.shape[0] * outlier_fraction > 1 else 1./test_data.shape[0], kernel="rbf", gamma=0.5/(kernel_s*kernel_s)) 251 | clf1.fit(test_data) 252 | end = time.time() 253 | tfull.append(end-start) 254 | print("full svdd took {0} seconds to train".format(end-start)) 255 | 256 | 257 | #plot the support vectors 258 | plt.figure(1) 259 | plt.grid(True) 260 | plt.title('Support Vectors (Sampling Method)') 261 | plt.scatter(test_data[sv_indices, 0], test_data[sv_indices, 1]) 262 | plt.show() 263 | 264 | plt.figure(2) 265 | plt.grid(True) 266 | plt.title('Support Vectors (Full SVDD))') 267 | plt.scatter(clf1.support_vectors_[..., 0], clf1.support_vectors_[..., 1]) 268 | plt.show() 269 | 270 | plt.figure(3) 271 | plt.title('Iteration History for Sampling Method') 272 | plt.plot(radius_history) 273 | plt.show() 274 | 275 | #create a 200 x 200 grid on the bounding rectangle of the training data 276 | # for scoring 277 | ngrid=200 278 | max_x, max_y = np.amax(test_data, axis=0) 279 | min_x, min_y = np.amin(test_data, axis=0) 280 | 281 | x_ = np.linspace(min_x, max_x, ngrid) 282 | y_ = np.linspace(min_y, max_y, ngrid) 283 | 284 | x, y = np.meshgrid(x_, y_) 285 | 286 | score_data = np.array([(x1, y1) for x1, y1 in zip(x.ravel(), y.ravel())]) 287 | 288 | #the OneClasSVM result corresponding to the sample method 289 | clf2 = result.OneClassSVM 290 | 291 | scores1 = clf1.predict(score_data) 292 | scores2 = clf2.predict(score_data) 293 | 294 | #plot the scored data 295 | plt.figure(4) 296 | p2 = np.where(scores2 == 1) 297 | plt.grid(True) 298 | plt.title("Scoring Results : Inside Points Colored green (using sampling svdd)") 299 | plt.scatter(score_data[p2, 0], score_data[p2, 1], color='g', s=0.75) 300 | plt.show() 301 | 302 | plt.figure(5) 303 | p1 = np.where(scores1 == 1) 304 | plt.grid(True) 305 | plt.title("Scoring Results : Inside Points Colored (using full svdd)") 306 | plt.scatter(score_data[p1, 0], score_data[p1, 1], color='g', s=0.75) 307 | plt.show() 308 | 309 | plt.figure(6) 310 | plt.grid(True) 311 | plt.title("Sampling SVDD Performance. Sample Size {0}".format(sample_size)) 312 | plt.xlabel("Input Data Size") 313 | plt.ylabel("Time Taken (in seconds)") 314 | plt.plot(dsize_list,tsample) 315 | 316 | plt.figure(7) 317 | plt.grid(True) 318 | plt.title("Full SVDD Performance") 319 | plt.xlabel("Input Data Size") 320 | plt.ylabel("Time Taken (in seconds)") 321 | plt.plot(dsize_list,tfull) 322 | 323 | run_main() 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | --------------------------------------------------------------------------------