├── models ├── __init__.py ├── sklearn_base.py ├── knn.py ├── combination.py ├── lof.py ├── base.py └── feature_bagging.py ├── utils ├── __init__.py ├── stat_models.py └── utility.py ├── datasets └── cardio.mat ├── figs └── flowchart2.png ├── requirements.txt ├── results ├── cardio_lof_20181006_152659.txt └── cardio_lof_20181006_152659.csv ├── demo_lof.py └── README.md /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/cardio.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/LSCP/HEAD/datasets/cardio.mat -------------------------------------------------------------------------------- /figs/flowchart2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yzhao062/LSCP/HEAD/figs/flowchart2.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.13 2 | numba>=0.35 3 | scipy>=0.19.1 4 | scikit_learn>=0.19.1 5 | -------------------------------------------------------------------------------- /results/cardio_lof_20181006_152659.txt: -------------------------------------------------------------------------------- 1 | 2 | n_ite: 20 3 | test_size: 0.4 4 | n_baselines: 10 5 | 6 | loc_region_perc: 0.1 7 | loc_region_ite: 20 8 | loc_region_threshold: 10 9 | loc_min_features: 0.5 10 | loc_region_size: 100 11 | loc_region_min: 30 12 | loc_region_max: 100 13 | 14 | n_clf: 50 15 | k_min: 5 16 | k_max: 200 17 | n_bins: 10 18 | n_selected: 1 19 | n_buckets: 5 20 | execution_time: 180.2803671360016 -------------------------------------------------------------------------------- /results/cardio_lof_20181006_152659.csv: -------------------------------------------------------------------------------- 1 | method, roc, best_roc, diff_roc,ap, best_ap, diff_ap,best roc, best ap 2 | GG_a,0.9061,0.9241,1.9865,0.4282,0.4942,15.4134,SCP_aom,SCP_aom 3 | GG_m,0.8829,0.9241,4.6664,0.4016,0.4942,23.0578,SCP_aom,SCP_aom 4 | GG_wa,0.9075,0.9241,1.8292,0.4326,0.4942,14.2395,SCP_aom,SCP_aom 5 | GG_thresh,0.9099,0.9241,1.5606,0.438,0.4942,12.8311,SCP_aom,SCP_aom 6 | GG_aom,0.9102,0.9241,1.5271,0.4558,0.4942,8.4247,SCP_aom,SCP_aom 7 | GG_moa,0.9138,0.9241,1.1272,0.4513,0.4942,9.5059,SCP_aom,SCP_aom 8 | SCP_a,0.8974,0.9241,2.9753,0.4089,0.4942,20.8608,SCP_aom,SCP_aom 9 | SCP_moa,0.9158,0.9241,0.9063,0.473,0.4942,4.482,SCP_aom,SCP_aom 10 | SCP_m,0.8057,0.9241,14.6953,0.3291,0.4942,50.1671,SCP_aom,SCP_aom 11 | SCP_aom,0.9241,0.9241,0.0,0.4942,0.4942,0.0,SCP_aom,SCP_aom -------------------------------------------------------------------------------- /utils/stat_models.py: -------------------------------------------------------------------------------- 1 | from numba import njit 2 | import numpy as np 3 | from scipy.special import betainc 4 | 5 | 6 | # from scipy.stats import pearsonr 7 | 8 | def pearsonr(x, y): 9 | """ Calculate Pearson Correlation between x and y 10 | :param x: 11 | :param y: 12 | :return: 13 | """ 14 | x = np.asarray(x) 15 | y = np.asarray(y) 16 | 17 | # if np.unique(x).shape[0] == 1 or np.unique(y).shape[0] == 1: 18 | # return 1.0 19 | r = pearsonr_helper(x, y) 20 | 21 | # Presumably, if abs(r) > 1, then it is only some small artifact of 22 | # floating point arithmetic. 23 | r = max(min(r, 1.0), -1.0) 24 | return r 25 | 26 | 27 | @njit 28 | def pearsonr_helper(x, y): 29 | """ Optimized version for Pearson correlation calculation 30 | :param x: 31 | :param y: 32 | :return: 33 | """ 34 | # x and y should have same length. 35 | # n = len(x) 36 | mx = x.mean() 37 | my = y.mean() 38 | xm, ym = x - mx, y - my 39 | # r_num = np.add.reduce(xm * ym) 40 | 41 | r_num = np.sum(xm * ym) 42 | r_den = np.sqrt(np.sum(xm * xm, axis=0) * np.sum(ym * ym, axis=0)) 43 | 44 | # only use in case of overflow 45 | # if r_den == 0: 46 | # return 1 47 | r = r_num / r_den 48 | 49 | return r 50 | 51 | 52 | def wpearsonr(x, y, w=None): 53 | """ Weighted Pearson Correlation 54 | :param x: 55 | :param y: 56 | :param w: 57 | :return: 58 | """ 59 | # https://stats.stackexchange.com/questions/221246/such-thing-as-a-weighted-correlation 60 | 61 | # unweighted version 62 | if w is None: 63 | return pearsonr(x, y) 64 | 65 | x = np.asarray(x) 66 | y = np.asarray(y) 67 | w = np.asarray(w) 68 | 69 | n = len(x) 70 | 71 | w_sum = w.sum() 72 | mx = np.sum(x * w) / w_sum 73 | my = np.sum(y * w) / w_sum 74 | 75 | xm, ym = (x - mx), (y - my) 76 | 77 | r_num = np.sum(xm * ym * w) / w_sum 78 | 79 | xm2 = np.sum(xm * xm * w) / w_sum 80 | ym2 = np.sum(ym * ym * w) / w_sum 81 | 82 | r_den = np.sqrt(xm2 * ym2) 83 | r = r_num / r_den 84 | 85 | r = max(min(r, 1.0), -1.0) 86 | # df = n - 2 87 | # 88 | # if abs(r) == 1.0: 89 | # prob = 0.0 90 | # else: 91 | # t_squared = r ** 2 * (df / ((1.0 - r) * (1.0 + r))) 92 | # prob = _betai(0.5 * df, 0.5, df / (df + t_squared)) 93 | return r # , prob 94 | 95 | 96 | ##################################### 97 | # PROBABILITY CALCULATIONS # 98 | ##################################### 99 | 100 | 101 | def _betai(a, b, x): 102 | x = np.asarray(x) 103 | x = np.where(x < 1.0, x, 1.0) # if x > 1 then return 1.0 104 | return betainc(a, b, x) 105 | 106 | 107 | def pearsonr_mat(mat, w=None): 108 | n_row = mat.shape[0] 109 | n_col = mat.shape[1] 110 | pear_mat = np.full([n_row, n_row], 1).astype(float) 111 | 112 | if w is not None: 113 | for cx in range(n_row): 114 | for cy in range(cx + 1, n_row): 115 | curr_pear = wpearsonr(mat[cx, :], mat[cy, :], w) 116 | pear_mat[cx, cy] = curr_pear 117 | pear_mat[cy, cx] = curr_pear 118 | else: 119 | for cx in range(n_col): 120 | for cy in range(cx + 1, n_row): 121 | curr_pear = pearsonr(mat[cx, :], mat[cy, :])[0] 122 | pear_mat[cx, cy] = curr_pear 123 | pear_mat[cy, cx] = curr_pear 124 | 125 | return pear_mat 126 | -------------------------------------------------------------------------------- /models/sklearn_base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Utility function copied over from sklearn/base.py 3 | """ 4 | # Author: Yue Zhao 5 | # License: BSD 2 clause 6 | 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from sklearn.externals import six 12 | from sklearn.externals.joblib import cpu_count 13 | 14 | 15 | def _get_n_jobs(n_jobs): 16 | """Get number of jobs for the computation. 17 | See sklearn/utils/__init__.py for more information. 18 | 19 | This function reimplements the logic of joblib to determine the actual 20 | number of jobs depending on the cpu count. If -1 all CPUs are used. 21 | If 1 is given, no parallel computing code is used at all, which is useful 22 | for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. 23 | Thus for n_jobs = -2, all CPUs but one are used. 24 | Parameters 25 | ---------- 26 | n_jobs : int 27 | Number of jobs stated in joblib convention. 28 | Returns 29 | ------- 30 | n_jobs : int 31 | The actual number of jobs as positive integer. 32 | Examples 33 | -------- 34 | >>> from sklearn.utils import _get_n_jobs 35 | >>> _get_n_jobs(4) 36 | 4 37 | >>> jobs = _get_n_jobs(-2) 38 | >>> assert jobs == max(cpu_count() - 1, 1) 39 | >>> _get_n_jobs(0) 40 | Traceback (most recent call last): 41 | ... 42 | ValueError: Parameter n_jobs == 0 has no meaning. 43 | """ 44 | if n_jobs < 0: 45 | return max(cpu_count() + 1 + n_jobs, 1) 46 | elif n_jobs == 0: 47 | raise ValueError('Parameter n_jobs == 0 has no meaning.') 48 | else: 49 | return n_jobs 50 | 51 | 52 | def _partition_estimators(n_estimators, n_jobs): 53 | """Private function used to partition estimators between jobs. 54 | See sklearn/ensemble/base.py for more information. 55 | """ 56 | # Compute the number of jobs 57 | n_jobs = min(_get_n_jobs(n_jobs), n_estimators) 58 | 59 | # Partition estimators between jobs 60 | n_estimators_per_job = (n_estimators // n_jobs) * np.ones(n_jobs, 61 | dtype=np.int) 62 | n_estimators_per_job[:n_estimators % n_jobs] += 1 63 | starts = np.cumsum(n_estimators_per_job) 64 | 65 | return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist() 66 | 67 | 68 | def _pprint(params, offset=0, printer=repr): 69 | # noinspection PyPep8 70 | """Pretty print the dictionary 'params' 71 | 72 | See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html 73 | and sklearn/base.py for more information. 74 | 75 | :param params: The dictionary to pretty print 76 | :type params: dict 77 | 78 | :param offset: The offset in characters to add at the begin of each line. 79 | :type offset: int 80 | 81 | :param printer: The function to convert entries to strings, typically 82 | the builtin str or repr 83 | :type printer: callable 84 | 85 | :return: None 86 | """ 87 | 88 | # Do a multi-line justified repr: 89 | options = np.get_printoptions() 90 | np.set_printoptions(precision=5, threshold=64, edgeitems=2) 91 | params_list = list() 92 | this_line_length = offset 93 | line_sep = ',\n' + (1 + offset // 2) * ' ' 94 | for i, (k, v) in enumerate(sorted(six.iteritems(params))): 95 | if type(v) is float: 96 | # use str for representing floating point numbers 97 | # this way we get consistent representation across 98 | # architectures and versions. 99 | this_repr = '%s=%s' % (k, str(v)) 100 | else: 101 | # use repr of the rest 102 | this_repr = '%s=%s' % (k, printer(v)) 103 | if len(this_repr) > 500: 104 | this_repr = this_repr[:300] + '...' + this_repr[-100:] 105 | if i > 0: 106 | if this_line_length + len(this_repr) >= 75 or '\n' in this_repr: 107 | params_list.append(line_sep) 108 | this_line_length = len(line_sep) 109 | else: 110 | params_list.append(', ') 111 | this_line_length += 2 112 | params_list.append(this_repr) 113 | this_line_length += len(this_repr) 114 | 115 | np.set_printoptions(**options) 116 | lines = ''.join(params_list) 117 | # Strip trailing space to avoid nightmare in doctests 118 | lines = '\n'.join(l.rstrip(' ') for l in lines.split('\n')) 119 | return lines 120 | -------------------------------------------------------------------------------- /models/knn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.preprocessing import MinMaxScaler 3 | from sklearn.neighbors import NearestNeighbors 4 | from sklearn.neighbors import KDTree 5 | from sklearn.exceptions import NotFittedError 6 | from scipy.stats import scoreatpercentile 7 | from scipy.stats import rankdata 8 | from scipy.special import erf 9 | 10 | 11 | class Knn(object): 12 | ''' 13 | Knn class for outlier detection 14 | support original knn, average knn, and median knn 15 | ''' 16 | 17 | def __init__(self, n_neighbors=1, contamination=0.05, method='largest'): 18 | self.n_neighbors = n_neighbors 19 | self.contamination = contamination 20 | self.method = method 21 | 22 | def fit(self, X_train): 23 | self.X_train = X_train 24 | self._isfitted = True 25 | self.tree = KDTree(X_train) 26 | 27 | neigh = NearestNeighbors(n_neighbors=self.n_neighbors) 28 | neigh.fit(self.X_train) 29 | 30 | result = neigh.kneighbors(n_neighbors=self.n_neighbors, 31 | return_distance=True) 32 | dist_arr = result[0] 33 | 34 | if self.method == 'largest': 35 | dist = dist_arr[:, -1] 36 | elif self.method == 'mean': 37 | dist = np.mean(dist_arr, axis=1) 38 | elif self.method == 'median': 39 | dist = np.median(dist_arr, axis=1) 40 | 41 | self.threshold = scoreatpercentile(dist, 42 | 100 * (1 - self.contamination)) 43 | self.decision_scores = dist.ravel() 44 | self.y_pred = (self.decision_scores > self.threshold).astype('int') 45 | 46 | self.mu = np.mean(self.decision_scores) 47 | self.sigma = np.std(self.decision_scores) 48 | 49 | def decision_function(self, X_test): 50 | 51 | if not self._isfitted: 52 | NotFittedError('Knn is not fitted yet') 53 | 54 | # initialize the output score 55 | pred_score = np.zeros([X_test.shape[0], 1]) 56 | 57 | for i in range(X_test.shape[0]): 58 | x_i = X_test[i, :] 59 | x_i = np.asarray(x_i).reshape(1, x_i.shape[0]) 60 | 61 | # get the distance of the current point 62 | dist_arr, ind_arr = self.tree.query(x_i, k=self.n_neighbors) 63 | 64 | if self.method == 'largest': 65 | dist = dist_arr[:, -1] 66 | elif self.method == 'mean': 67 | dist = np.mean(dist_arr, axis=1) 68 | elif self.method == 'median': 69 | dist = np.median(dist_arr, axis=1) 70 | 71 | pred_score_i = dist[-1] 72 | 73 | # record the current item 74 | pred_score[i, :] = pred_score_i 75 | 76 | return pred_score 77 | 78 | def predict(self, X_test): 79 | pred_score = self.decision_function(X_test) 80 | return (pred_score > self.threshold).astype('int') 81 | 82 | def predict_proba(self, X_test, method='linear'): 83 | test_scores = self.decision_function(X_test) 84 | train_scores = self.decision_scores 85 | 86 | if method == 'linear': 87 | scaler = MinMaxScaler().fit(train_scores.reshape(-1, 1)) 88 | proba = scaler.transform(test_scores.reshape(-1, 1)) 89 | return proba.clip(0, 1) 90 | else: 91 | # turn output into probability 92 | pre_erf_score = (test_scores - self.mu) / (self.sigma * np.sqrt(2)) 93 | erf_score = erf(pre_erf_score) 94 | proba = erf_score.clip(0) 95 | 96 | # TODO: move to testing code 97 | assert (proba.min() >= 0) 98 | assert (proba.max() <= 1) 99 | return proba 100 | 101 | def predict_rank(self, X_test): 102 | test_scores = self.decision_function(X_test) 103 | train_scores = self.decision_scores 104 | 105 | ranks = np.zeros([X_test.shape[0], 1]) 106 | 107 | for i in range(test_scores.shape[0]): 108 | train_scores_i = np.append(train_scores.reshape(-1, 1), 109 | test_scores[i]) 110 | 111 | ranks[i] = rankdata(train_scores_i)[-1] 112 | 113 | # return normalized ranks 114 | ranks_norm = ranks / ranks.max() 115 | return ranks_norm 116 | 117 | ############################################################################## 118 | # samples = [[-1, 0], [0., 0.], [1., 1], [2., 5.], [3, 1]] 119 | # 120 | # clf = Knn() 121 | # clf.fit(samples) 122 | # 123 | # scores = clf.decision_function(np.asarray([[2, 3], [6, 8]])).ravel() 124 | # assert (scores[0] == [2]) 125 | # assert (scores[1] == [5]) 126 | # # 127 | # labels = clf.predict(np.asarray([[2, 3], [6, 8]])).ravel() 128 | # assert (labels[0] == [0]) 129 | # assert (labels[1] == [1]) 130 | -------------------------------------------------------------------------------- /models/combination.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.utils.validation import check_array 3 | from sklearn.utils.validation import column_or_1d 4 | from sklearn.utils.testing import assert_equal 5 | 6 | 7 | def aom(scores, n_buckets, n_estimators, standard=True): 8 | ''' 9 | Average of Maximum - An ensemble method for outlier detection 10 | 11 | Aggarwal, C.C. and Sathe, S., 2015. Theoretical foundations and algorithms 12 | for outlier ensembles. ACM SIGKDD Explorations Newsletter, 17(1), pp.24-47. 13 | 14 | :param scores: 15 | :param n_buckets: 16 | :param n_estimators: 17 | :param standard: 18 | :return: 19 | ''' 20 | scores = np.asarray(scores) 21 | if scores.shape[1] != n_estimators: 22 | raise ValueError('score matrix should be n_samples by n_estimaters') 23 | 24 | scores_aom = np.zeros([scores.shape[0], n_buckets]) 25 | 26 | n_estimators_per_bucket = int(n_estimators / n_buckets) 27 | if n_estimators % n_buckets != 0: 28 | Warning('n_estimators / n_buckets leads to a remainder') 29 | 30 | # shuffle the estimator order 31 | estimators_list = list(range(0, n_estimators, 1)) 32 | np.random.shuffle(estimators_list) 33 | 34 | head = 0 35 | for i in range(0, n_estimators, n_estimators_per_bucket): 36 | tail = i + n_estimators_per_bucket 37 | batch_ind = int(i / n_estimators_per_bucket) 38 | 39 | scores_aom[:, batch_ind] = np.max( 40 | scores[:, estimators_list[head:tail]], axis=1) 41 | 42 | head = head + n_estimators_per_bucket 43 | 44 | return np.mean(scores_aom, axis=1) 45 | 46 | 47 | def moa(scores, n_buckets, n_estimators): 48 | ''' 49 | Maximum of Average - An ensemble method for outlier detection 50 | 51 | Aggarwal, C.C. and Sathe, S., 2015. Theoretical foundations and algorithms 52 | for outlier ensembles. ACM SIGKDD Explorations Newsletter, 17(1), pp.24-47. 53 | 54 | :param scores: 55 | :param n_buckets: 56 | :param n_estimators: 57 | :param standard: 58 | :return: 59 | ''' 60 | scores = np.asarray(scores) 61 | if scores.shape[1] != n_estimators: 62 | raise ValueError('score matrix should be n_samples by n_estimaters') 63 | 64 | scores_moa = np.zeros([scores.shape[0], n_buckets]) 65 | 66 | n_estimators_per_bucket = int(n_estimators / n_buckets) 67 | if n_estimators % n_buckets != 0: 68 | Warning('n_estimators / n_buckets leads to a remainder') 69 | 70 | # shuffle the estimator order 71 | estimators_list = list(range(0, n_estimators, 1)) 72 | np.random.shuffle(estimators_list) 73 | 74 | head = 0 75 | for i in range(0, n_estimators, n_estimators_per_bucket): 76 | tail = i + n_estimators_per_bucket 77 | batch_ind = int(i / n_estimators_per_bucket) 78 | 79 | scores_moa[:, batch_ind] = np.mean( 80 | scores[:, estimators_list[head:tail]], axis=1) 81 | 82 | head = head + n_estimators_per_bucket 83 | 84 | return np.max(scores_moa, axis=1) 85 | 86 | 87 | def average(scores, estimator_weight=None): 88 | """Combination method to merge the outlier scores from multiple estimators 89 | by taking the average. 90 | 91 | Parameters 92 | ---------- 93 | scores : numpy array of shape (n_samples, n_estimators) 94 | Score matrix from multiple estimators on the same samples. 95 | 96 | estimator_weight : list of shape (1, n_estimators) 97 | If specified, using weighted average 98 | 99 | Returns 100 | ------- 101 | combined_scores : numpy array of shape (n_samples, ) 102 | The combined outlier scores. 103 | 104 | """ 105 | scores = check_array(scores) 106 | 107 | if estimator_weight is not None: 108 | estimator_weight = column_or_1d(estimator_weight).reshape(1, -1) 109 | assert_equal(scores.shape[1], estimator_weight.shape[1]) 110 | 111 | # (d1*w1 + d2*w2 + ...+ dn*wn)/(w1+w2+...+wn) 112 | # generated weighted scores 113 | scores = np.sum(np.multiply(scores, estimator_weight), 114 | axis=1) / np.sum( 115 | estimator_weight) 116 | return scores.ravel() 117 | 118 | else: 119 | return np.mean(scores, axis=1).ravel() 120 | 121 | 122 | def maximization(scores): 123 | """Combination method to merge the outlier scores from multiple estimators 124 | by taking the maximum. 125 | 126 | Parameters 127 | ---------- 128 | scores : numpy array of shape (n_samples, n_estimators) 129 | Score matrix from multiple estimators on the same samples. 130 | 131 | Returns 132 | ------- 133 | combined_scores : numpy array of shape (n_samples, ) 134 | The combined outlier scores. 135 | 136 | """ 137 | 138 | scores = check_array(scores) 139 | return np.max(scores, axis=1).ravel() 140 | -------------------------------------------------------------------------------- /models/lof.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Local Outlier Factor (LOF). Implemented on scikit-learn library. 3 | """ 4 | # Author: Yue Zhao 5 | # License: BSD 2 clause 6 | 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import sklearn 11 | from sklearn.neighbors import LocalOutlierFactor 12 | from sklearn.utils.validation import check_is_fitted 13 | from sklearn.utils.validation import check_array 14 | from sklearn.utils.validation import column_or_1d 15 | 16 | from .base import BaseDetector 17 | 18 | 19 | def invert_order(scores, method='multiplication'): 20 | """ Invert the order of a list of values. The smallest value becomes 21 | the largest in the inverted list. This is useful while combining 22 | multiple detectors since their score order could be different. 23 | 24 | Parameters 25 | ---------- 26 | scores : list, array or numpy array with shape (n_samples,) 27 | The list of values to be inverted 28 | 29 | method : str, optional (default='multiplication') 30 | Methods used for order inversion. Valid methods are: 31 | 32 | - 'multiplication': multiply by -1 33 | - 'subtraction': max(scores) - scores 34 | 35 | Returns 36 | ------- 37 | inverted_scores : numpy array of shape (n_samples,) 38 | The inverted list 39 | 40 | Examples 41 | -------- 42 | >>> scores1 = [0.1, 0.3, 0.5, 0.7, 0.2, 0.1] 43 | >>> invert_order(scores1) 44 | >>> array[-0.1, -0.3, -0.5, -0.7, -0.2, -0.1] 45 | >>> invert_order(scores1, method='subtraction') 46 | >>> array[0.6, 0.4, 0.2, 0, 0.5, 0.6] 47 | """ 48 | 49 | scores = column_or_1d(scores) 50 | 51 | if method == 'multiplication': 52 | return scores.ravel() * -1 53 | 54 | if method == 'subtraction': 55 | return (scores.max() - scores).ravel() 56 | 57 | 58 | def _sklearn_version_20(): 59 | """ Utility function to decide the version of sklearn 60 | In sklearn 20.0, LOF is changed. Specifically, _decision_function 61 | is replaced by _score_samples 62 | 63 | Returns 64 | ------- 65 | sklearn_20_flag : bool 66 | True if sklearn.__version__ is newer than 0.20.0 67 | 68 | """ 69 | sklearn_version = str(sklearn.__version__) 70 | if int(sklearn_version.split(".")[1]) > 19: 71 | return True 72 | else: 73 | return False 74 | 75 | 76 | class LOF(BaseDetector): 77 | """Wrapper of scikit-learn LOF Class with more functionalities. 78 | Unsupervised Outlier Detection using Local Outlier Factor (LOF). 79 | 80 | The anomaly score of each sample is called Local Outlier Factor. 81 | It measures the local deviation of density of a given sample with 82 | respect to its neighbors. 83 | It is local in that the anomaly score depends on how isolated the object 84 | is with respect to the surrounding neighborhood. 85 | More precisely, locality is given by k-nearest neighbors, whose distance 86 | is used to estimate the local density. 87 | By comparing the local density of a sample to the local densities of 88 | its neighbors, one can identify samples that have a substantially lower 89 | density than their neighbors. These are considered outliers. 90 | See :cite:`breunig2000lof` for details. 91 | 92 | Parameters 93 | ---------- 94 | n_neighbors : int, optional (default=20) 95 | Number of neighbors to use by default for `kneighbors` queries. 96 | If n_neighbors is larger than the number of samples provided, 97 | all samples will be used. 98 | 99 | algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional 100 | Algorithm used to compute the nearest neighbors: 101 | 102 | - 'ball_tree' will use BallTree 103 | - 'kd_tree' will use KDTree 104 | - 'brute' will use a brute-force search. 105 | - 'auto' will attempt to decide the most appropriate algorithm 106 | based on the values passed to :meth:`fit` method. 107 | 108 | Note: fitting on sparse input will override the setting of 109 | this parameter, using brute force. 110 | 111 | leaf_size : int, optional (default=30) 112 | Leaf size passed to `BallTree` or `KDTree`. This can 113 | affect the speed of the construction and query, as well as the memory 114 | required to store the tree. The optimal value depends on the 115 | nature of the problem. 116 | 117 | metric : string or callable, default 'minkowski' 118 | metric used for the distance computation. Any metric from scikit-learn 119 | or scipy.spatial.distance can be used. 120 | 121 | If 'precomputed', the training input X is expected to be a distance 122 | matrix. 123 | 124 | If metric is a callable function, it is called on each 125 | pair of instances (rows) and the resulting value recorded. The callable 126 | should take two arrays as input and return one value indicating the 127 | distance between them. This works for Scipy's metrics, but is less 128 | efficient than passing the metric name as a string. 129 | 130 | Valid values for metric are: 131 | 132 | - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 133 | 'manhattan'] 134 | 135 | - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 136 | 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 137 | 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 138 | 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 139 | 'sqeuclidean', 'yule'] 140 | 141 | See the documentation for scipy.spatial.distance for details on these 142 | metrics: 143 | http://docs.scipy.org/doc/scipy/reference/spatial.distance.html 144 | 145 | p : integer, optional (default = 2) 146 | Parameter for the Minkowski metric from 147 | sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is 148 | equivalent to using manhattan_distance (l1), and euclidean_distance 149 | (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. 150 | See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances 151 | 152 | metric_params : dict, optional (default = None) 153 | Additional keyword arguments for the metric function. 154 | 155 | contamination : float in (0., 0.5), optional (default=0.1) 156 | The amount of contamination of the data set, i.e. the proportion 157 | of outliers in the data set. When fitting this is used to define the 158 | threshold on the decision function. 159 | 160 | n_jobs : int, optional (default = 1) 161 | The number of parallel jobs to run for neighbors search. 162 | If ``-1``, then the number of jobs is set to the number of CPU cores. 163 | Affects only kneighbors and kneighbors_graph methods. 164 | 165 | Attributes 166 | ---------- 167 | n_neighbors_ : int 168 | The actual number of neighbors used for `kneighbors` queries. 169 | 170 | decision_scores_ : numpy array of shape (n_samples,) 171 | The outlier scores of the training data. 172 | The higher, the more abnormal. Outliers tend to have higher 173 | scores. This value is available once the detector is 174 | fitted. 175 | 176 | threshold_ : float 177 | The threshold is based on ``contamination``. It is the 178 | ``n_samples * contamination`` most abnormal samples in 179 | ``decision_scores_``. The threshold is calculated for generating 180 | binary outlier labels. 181 | 182 | labels_ : int, either 0 or 1 183 | The binary labels of the training data. 0 stands for inliers 184 | and 1 for outliers/anomalies. It is generated by applying 185 | ``threshold_`` on ``decision_scores_``. 186 | """ 187 | 188 | def __init__(self, n_neighbors=20, algorithm='auto', leaf_size=30, 189 | metric='minkowski', p=2, metric_params=None, 190 | contamination=0.1, n_jobs=1): 191 | super(LOF, self).__init__(contamination=contamination) 192 | self.n_neighbors = n_neighbors 193 | self.algorithm = algorithm 194 | self.leaf_size = leaf_size 195 | self.metric = metric 196 | self.p = p 197 | self.metric_params = metric_params 198 | self.n_jobs = n_jobs 199 | 200 | # noinspection PyIncorrectDocstring 201 | def fit(self, X, y=None): 202 | """Fit detector. y is optional for unsupervised methods. 203 | 204 | Parameters 205 | ---------- 206 | X : numpy array of shape (n_samples, n_features) 207 | The input samples. 208 | 209 | y : numpy array of shape (n_samples,), optional (default=None) 210 | The ground truth of the input samples (labels). 211 | """ 212 | # validate inputs X and y (optional) 213 | X = check_array(X) 214 | self._set_n_classes(y) 215 | 216 | self.detector_ = LocalOutlierFactor(n_neighbors=self.n_neighbors, 217 | algorithm=self.algorithm, 218 | leaf_size=self.leaf_size, 219 | metric=self.metric, 220 | p=self.p, 221 | metric_params=self.metric_params, 222 | contamination=self.contamination, 223 | n_jobs=self.n_jobs) 224 | self.detector_.fit(X=X, y=y) 225 | 226 | # Invert decision_scores_. Outliers comes with higher outlier scores 227 | self.decision_scores_ = invert_order( 228 | self.detector_.negative_outlier_factor_) 229 | self._process_decision_scores() 230 | return self 231 | 232 | def decision_function(self, X): 233 | """Predict raw anomaly score of X using the fitted detector. 234 | 235 | The anomaly score of an input sample is computed based on different 236 | detector algorithms. For consistency, outliers are assigned with 237 | larger anomaly scores. 238 | 239 | Parameters 240 | ---------- 241 | X : numpy array of shape (n_samples, n_features) 242 | The training input samples. Sparse matrices are accepted only 243 | if they are supported by the base estimator. 244 | 245 | Returns 246 | ------- 247 | anomaly_scores : numpy array of shape (n_samples,) 248 | The anomaly score of the input samples. 249 | """ 250 | 251 | check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) 252 | 253 | # Invert outlier scores. Outliers comes with higher outlier scores 254 | # noinspection PyProtectedMember 255 | if _sklearn_version_20(): 256 | return invert_order(self.detector_._score_samples(X)) 257 | else: 258 | return invert_order(self.detector_._decision_function(X)) 259 | 260 | @property 261 | def n_neighbors_(self): 262 | """The actual number of neighbors used for kneighbors queries. 263 | Decorator for scikit-learn LOF attributes. 264 | """ 265 | return self.detector_.n_neighbors_ 266 | -------------------------------------------------------------------------------- /demo_lof.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | 4 | import numpy as np 5 | 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.metrics import roc_auc_score 8 | from sklearn.metrics import average_precision_score 9 | 10 | from models.lof import LOF 11 | from models.feature_bagging import FeatureBagging 12 | from models.combination import aom, moa 13 | from utils.stat_models import pearsonr 14 | from utils.utility import get_local_region 15 | from utils.utility import get_competent_detectors 16 | from utils.utility import train_predict_lof, generate_bagging_indices 17 | from utils.utility import print_save_result, save_script 18 | from utils.utility import loaddata, precision_n_score, standardizer 19 | 20 | # access the timestamp for logging purpose 21 | today = datetime.datetime.now() 22 | timestamp = today.strftime("%Y%m%d_%H%M%S") 23 | 24 | # set numpy parameters 25 | np.set_printoptions(suppress=True, precision=4) 26 | 27 | ############################################################################### 28 | # parameter settings 29 | 30 | data = 'cardio' 31 | # data = 'letter' 32 | 33 | 34 | base_detector = 'lof' 35 | n_ite = 30 # number of iterations 36 | test_size = 0.4 # training = 60%, testing = 40% 37 | n_baselines = 11 # the number of baseline algorithms, DO NOT CHANGE 38 | 39 | # reference pearson size: 40 | # https://www.researchgate.net/post/What_is_the_minimum_sample_size_to_run_Pearsons_R 41 | loc_region_size = 0 42 | loc_region_min = 30 # min local region size 43 | loc_region_max = 100 # max local region size 44 | ############################################################################### 45 | # adjustable parameters 46 | loc_region_perc = 0.1 47 | loc_region_ite = 20 # the number of iterations in defining local region 48 | loc_region_threshold = int(loc_region_ite / 2) # the threshold to keep a point 49 | loc_min_features = 0.5 # the lower bound of the number of features to use 50 | 51 | n_bins = 10 52 | n_selected = 1 # actually not a parameter to tweak 53 | 54 | n_clf = 50 55 | k_min = 5 56 | k_max = 200 57 | 58 | # for SG_AOM and SG_MOA, choose the right number of buckets 59 | n_buckets = 5 60 | n_clf_bucket = int(n_clf / n_buckets) 61 | assert (n_clf % n_buckets == 0) # in case wrong number of buckets 62 | 63 | # flag for printing and output saving 64 | verbose = True 65 | 66 | # record of feature bagging detector 67 | fb_n_neighbors = [] 68 | ############################################################################### 69 | 70 | if __name__ == '__main__': 71 | 72 | start_time = time.time() 73 | X_orig, y_orig = loaddata(data) 74 | 75 | # initialize the matrix for storing scores 76 | roc_mat = np.zeros([n_ite, n_baselines]) # receiver operating curve 77 | ap_mat = np.zeros([n_ite, n_baselines]) # average precision 78 | 79 | for t in range(n_ite): 80 | print('\nn_ite', t + 1, data) # print status 81 | 82 | random_state = np.random.RandomState() 83 | 84 | # split the data into training and testing 85 | X_train, X_test, y_train, y_test = train_test_split(X_orig, y_orig, 86 | test_size=test_size, 87 | random_state=random_state) 88 | # in case of small datasets 89 | if k_max > X_train.shape[0]: 90 | k_max = X_train.shape[0] 91 | k_list = random_state.randint(k_min, k_max, size=n_clf).tolist() 92 | k_list.sort() 93 | 94 | # normalized the data 95 | X_train_norm, X_test_norm = standardizer(X_train, X_test) 96 | 97 | train_scores = np.zeros([X_train.shape[0], n_clf]) 98 | test_scores = np.zeros([X_test.shape[0], n_clf]) 99 | 100 | # initialized the list to store the results 101 | test_target_list = [] 102 | method_list = [] 103 | 104 | # generate a pool of detectors and predict on test instances 105 | train_scores, test_scores = train_predict_lof(k_list, X_train_norm, 106 | X_test_norm, 107 | train_scores, 108 | test_scores) 109 | 110 | ####################################################################### 111 | # fit feature bagging using median of k_list 112 | # n_neighbors = int(np.median(k_list)) 113 | n_neighbors = random_state.randint(low=k_min, high=k_max) 114 | clf = FeatureBagging(base_estimator=LOF(n_neighbors=n_neighbors), 115 | n_estimators=len(k_list), check_estimator=False) 116 | print(clf) 117 | fb_n_neighbors.append(n_neighbors) 118 | clf.fit(X_train_norm) 119 | 120 | # generate scores 121 | target_test_feature_bagging = clf.decision_function(X_test_norm) 122 | test_target_list.append(target_test_feature_bagging) 123 | method_list.append('FB') 124 | ####################################################################### 125 | # generate normalized scores 126 | train_scores_norm, test_scores_norm = standardizer(train_scores, 127 | test_scores) 128 | # generate mean and max outputs 129 | # SG_A and SG_M 130 | target_test_mean = np.mean(test_scores_norm, axis=1) 131 | target_test_max = np.max(test_scores_norm, axis=1) 132 | test_target_list.extend([target_test_mean, target_test_max]) 133 | method_list.extend(['GG_a', 'GG_m']) 134 | 135 | # generate pseudo target for training -> for calculating weights 136 | target_mean = np.mean(train_scores_norm, axis=1).reshape(-1, 1) 137 | target_max = np.max(train_scores_norm, axis=1).reshape(-1, 1) 138 | 139 | # generate weighted mean 140 | # weights are distance or pearson in different modes 141 | clf_weights_pear = np.zeros([n_clf, 1]) 142 | for i in range(n_clf): 143 | clf_weights_pear[i] = pearsonr( 144 | target_mean, train_scores_norm[:, i].reshape(-1, 1)) 145 | 146 | # generate weighted mean 147 | target_test_weighted_pear = np.sum( 148 | test_scores_norm * clf_weights_pear.reshape(1, -1) / 149 | clf_weights_pear.sum(), axis=1) 150 | 151 | test_target_list.append(target_test_weighted_pear) 152 | method_list.append('GG_wa') 153 | 154 | # generate threshold sum 155 | target_test_threshold = np.sum(test_scores_norm.clip(0), axis=1) 156 | test_target_list.append(target_test_threshold) 157 | method_list.append('GG_thresh') 158 | 159 | # generate average of maximum (SG_AOM) and maximum of average (SG_MOA) 160 | target_test_aom = aom(test_scores_norm, n_buckets, n_clf) 161 | target_test_moa = moa(test_scores_norm, n_buckets, n_clf) 162 | test_target_list.extend([target_test_aom, target_test_moa]) 163 | method_list.extend(['GG_aom', 'GG_moa']) 164 | ################################################################## 165 | 166 | # define the local region size 167 | loc_region_size = int(X_train_norm.shape[0] * loc_region_perc) 168 | if loc_region_size < loc_region_min: 169 | loc_region_size = loc_region_min 170 | if loc_region_size > loc_region_max: 171 | loc_region_size = loc_region_max 172 | 173 | # define local region 174 | ind_arr = get_local_region(X_train_norm, X_test_norm, 175 | loc_region_size, 176 | loc_region_ite=loc_region_ite, 177 | local_region_strength=loc_region_threshold, 178 | loc_min_features=loc_min_features, 179 | random_state=random_state) 180 | 181 | pred_scores_best = np.zeros([X_test.shape[0], ]) 182 | pred_scores_ens = np.zeros([X_test.shape[0], ]) 183 | 184 | for i in range(X_test.shape[0]): # iterate all test instance 185 | 186 | ind_k = ind_arr[i] 187 | 188 | # get the pseudo target: mean 189 | target_k = target_mean[ind_k,].ravel() 190 | 191 | # get the current scores from all clf 192 | curr_train_k = train_scores_norm[ind_k, :] 193 | 194 | # initialize containers for correlation 195 | corr_pear_n = np.zeros([n_clf, ]) 196 | 197 | for d in range(n_clf): 198 | corr_pear_n[d,] = pearsonr(target_k, curr_train_k[:, d]) 199 | 200 | # pick the best one 201 | best_clf_ind = np.nanargmax(corr_pear_n) 202 | pred_scores_best[i,] = test_scores_norm[i, best_clf_ind] 203 | 204 | pred_scores_ens[i,] = np.max( 205 | test_scores_norm[ 206 | i, get_competent_detectors(corr_pear_n, n_bins, 207 | n_selected)]) 208 | 209 | test_target_list.extend([pred_scores_best, 210 | pred_scores_ens]) 211 | method_list.extend(['LSCP_a', 212 | 'LSCP_moa']) 213 | ###################################################################### 214 | 215 | pred_scores_best = np.zeros([X_test.shape[0], ]) 216 | pred_scores_ens = np.zeros([X_test.shape[0], ]) 217 | 218 | for i in range(X_test.shape[0]): # iterate all test instance 219 | # get the neighbor idx of the current point 220 | ind_k = ind_arr[i] 221 | # get the pseudo target: mean 222 | target_k = target_max[ind_k,].ravel() 223 | 224 | # get the current scores from all clf 225 | curr_train_k = train_scores_norm[ind_k, :] 226 | 227 | # initialize containers for correlation 228 | corr_pear_n = np.zeros([n_clf, ]) 229 | 230 | for d in range(n_clf): 231 | corr_pear_n[d,] = pearsonr(target_k, curr_train_k[:, d]) 232 | 233 | # pick the best one 234 | best_clf_ind = np.nanargmax(corr_pear_n) 235 | pred_scores_best[i,] = test_scores_norm[i, best_clf_ind] 236 | 237 | pred_scores_ens[i,] = np.mean( 238 | test_scores_norm[ 239 | i, get_competent_detectors(corr_pear_n, n_bins, 240 | n_selected)]) 241 | 242 | test_target_list.extend([pred_scores_best, 243 | pred_scores_ens]) 244 | method_list.extend(['LSCP_m', 245 | 'LSCP_aom']) 246 | 247 | ###################################################################### 248 | 249 | # store performance information and print result 250 | for i in range(n_baselines): 251 | roc_mat[t, i] = roc_auc_score(y_test, test_target_list[i]) 252 | ap_mat[t, i] = average_precision_score(y_test, 253 | test_target_list[i]) 254 | print(method_list[i], roc_mat[t, i]) 255 | print('local region size:', loc_region_size) 256 | 257 | print("--- %s seconds ---" % (time.time() - start_time)) 258 | execution_time = time.time() - start_time 259 | 260 | # save parameters 261 | save_script(data, base_detector, timestamp, n_ite, test_size, n_baselines, 262 | loc_region_perc, loc_region_ite, loc_region_threshold, 263 | loc_min_features, loc_region_size, loc_region_min, 264 | loc_region_max, n_clf, k_min, k_max, n_bins, n_selected, 265 | n_buckets, fb_n_neighbors, execution_time) 266 | 267 | # print and save the result 268 | # default location is /results/***.csv 269 | print_save_result(data, base_detector, n_baselines, roc_mat, 270 | ap_mat, method_list, timestamp, verbose) 271 | -------------------------------------------------------------------------------- /utils/utility.py: -------------------------------------------------------------------------------- 1 | import os 2 | import collections 3 | import pathlib 4 | 5 | import numpy as np 6 | import scipy.io as scio 7 | from scipy.stats import scoreatpercentile 8 | 9 | import sklearn 10 | from sklearn.neighbors import KDTree 11 | from sklearn.metrics import precision_score 12 | from sklearn.preprocessing import StandardScaler 13 | from sklearn.utils import column_or_1d 14 | from sklearn.utils import check_random_state 15 | from sklearn.utils.random import sample_without_replacement 16 | 17 | from models.lof import LOF 18 | from models.knn import Knn 19 | 20 | 21 | def argmaxp(a, p): 22 | """Utlity function to return the index of top p values in a 23 | :param a: list variable 24 | :param p: number of elements to select 25 | :return: index of top p elements in a 26 | """ 27 | 28 | a = np.asarray(a).ravel() 29 | length = a.shape[0] 30 | pth = np.argpartition(a, length - p) 31 | return pth[length - p:] 32 | 33 | 34 | # @njit("i8[:](i8[:], u8, b1)") 35 | def argmaxn(value_list, n, desc=True): 36 | """ 37 | Return the index of top n elements in the list if order is set to 'desc', 38 | otherwise return the index of n smallest elements 39 | 40 | :param value_list: a list containing all values 41 | :type value_list: list, array 42 | :param n: the number of the elements to select 43 | :type n: int 44 | :param order: the order to sort {'desc', 'asc'} 45 | :type order: str, optional (default='desc') 46 | :return: the index of the top n elements 47 | :rtype: list 48 | """ 49 | value_list = column_or_1d(value_list) 50 | length = len(value_list) 51 | 52 | # for the smallest n, flip the value 53 | if not desc: 54 | n = length - n 55 | 56 | # partition is not part of numba 57 | value_sorted = np.partition(value_list, length - n) 58 | threshold = value_sorted[int(length - n)] 59 | 60 | if desc: 61 | return np.where(np.greater_equal(value_list, threshold))[0] 62 | else: # return the index of n smallest elements 63 | return np.where(np.less(value_list, threshold))[0] 64 | 65 | 66 | def get_label_n(y, y_pred): 67 | """ Infer the binary label of the top n samples with highest scores 68 | :param y: 69 | :param y_pred: 70 | :return: 71 | """ 72 | out_perc = np.count_nonzero(y) / len(y) 73 | threshold = scoreatpercentile(y_pred, 100 * (1 - out_perc)) 74 | y_pred = (y_pred > threshold).astype('int') 75 | return y_pred 76 | 77 | 78 | def standardizer(X_train, X_test): 79 | """ 80 | normalization function wrapper 81 | :param X_train: 82 | :param X_test: 83 | :return: X_train and X_test after the Z-score normalization 84 | """ 85 | scaler = StandardScaler().fit(X_train) 86 | return scaler.transform(X_train), scaler.transform(X_test) 87 | 88 | 89 | def precision_n_score(y, y_pred): 90 | """ 91 | Utlity function to calculate precision@n 92 | :param y: ground truth 93 | :param y_pred: number of outliers 94 | :return: score 95 | """ 96 | # calculate the percentage of outliers 97 | out_perc = np.count_nonzero(y) / len(y) 98 | 99 | threshold = scoreatpercentile(y_pred, 100 * (1 - out_perc)) 100 | y_pred = (y_pred > threshold).astype('int') 101 | return precision_score(y, y_pred) 102 | 103 | 104 | def loaddata(filename): 105 | """ 106 | load data 107 | :param filename: 108 | :return: 109 | """ 110 | mat = scio.loadmat(os.path.join('datasets', filename + '.mat')) 111 | X_orig = mat['X'] 112 | y_orig = mat['y'].ravel() 113 | 114 | return X_orig, y_orig 115 | 116 | 117 | def train_predict_lof(k_list, X_train_norm, X_test_norm, train_scores, 118 | test_scores): 119 | # initialize base detectors 120 | clf_list = [] 121 | for k in k_list: 122 | clf = LOF(n_neighbors=k) 123 | clf.fit(X_train_norm) 124 | train_score = clf.decision_scores_ 125 | test_score = clf.decision_function(X_test_norm) 126 | clf_name = 'lof_' + str(k) 127 | 128 | clf_list.append(clf_name) 129 | curr_ind = len(clf_list) - 1 130 | 131 | train_scores[:, curr_ind] = train_score.ravel() 132 | test_scores[:, curr_ind] = test_score.ravel() 133 | 134 | return train_scores, test_scores 135 | 136 | 137 | def train_predict_knn(k_list, X_train_norm, X_test_norm, train_scores, 138 | test_scores): 139 | # initialize base detectors 140 | clf_list = [] 141 | for k in k_list: 142 | clf = Knn(n_neighbors=k, method='largest') 143 | clf.fit(X_train_norm) 144 | train_score = clf.decision_scores 145 | test_score = clf.decision_function(X_test_norm) 146 | clf_name = 'knn_' + str(k) 147 | 148 | clf_list.append(clf_name) 149 | curr_ind = len(clf_list) - 1 150 | 151 | train_scores[:, curr_ind] = train_score.ravel() 152 | test_scores[:, curr_ind] = test_score.ravel() 153 | 154 | return train_scores, test_scores 155 | 156 | 157 | def save_script(data, base_detector, timestamp, n_ite, test_size, n_baselines, 158 | loc_region_perc, loc_region_ite, loc_region_strength, 159 | loc_min_features, loc_region_size, loc_region_min, 160 | loc_region_max, n_clf, k_min, k_max, n_bins, n_selected, 161 | n_buckets, fb_n_neighbors, execution_time): 162 | # initialize the log directory if it does not exist 163 | pathlib.Path('results').mkdir(parents=True, exist_ok=True) 164 | f = open( 165 | 'results\\' + data + '_' + base_detector + '_' + timestamp + '.txt', 166 | 'a') 167 | 168 | f.writelines("\n n_ite: " + str(n_ite)) 169 | f.writelines("\n test_size: " + str(test_size)) 170 | f.writelines("\n n_baselines: " + str(n_baselines)) 171 | f.writelines("\n") 172 | 173 | f.writelines("\n loc_region_perc: " + str(loc_region_perc)) 174 | f.writelines("\n loc_region_ite: " + str(loc_region_ite)) 175 | f.writelines("\n loc_region_threshold: " + str(loc_region_strength)) 176 | f.writelines("\n loc_min_features: " + str(loc_min_features)) 177 | f.writelines("\n loc_region_size: " + str(loc_region_size)) 178 | f.writelines("\n loc_region_min: " + str(loc_region_min)) 179 | f.writelines("\n loc_region_max: " + str(loc_region_max)) 180 | f.writelines("\n") 181 | 182 | f.writelines("\n n_clf: " + str(n_clf)) 183 | 184 | f.writelines("\n k_min: " + str(k_min)) 185 | f.writelines("\n k_max: " + str(k_max)) 186 | f.writelines("\n n_bins: " + str(n_bins)) 187 | f.writelines("\n n_selected: " + str(n_selected)) 188 | f.writelines("\n n_buckets: " + str(n_buckets)) 189 | f.writelines("\n") 190 | 191 | f.writelines("\n fb n_neighbors: ") 192 | for n_neighnors in fb_n_neighbors: 193 | f.writelines(str(n_neighnors) + ", ") 194 | f.writelines("\n") 195 | 196 | f.writelines("\n execution_time: " + str(execution_time)) 197 | f.close() 198 | 199 | 200 | def print_save_result(data, base_detector, n_baselines, roc_mat, 201 | ap_mat, method_list, timestamp, verbose): 202 | """ 203 | :param data: 204 | :param base_detector: 205 | :param n_baselines: 206 | :param n_clf: 207 | :param n_ite: 208 | :param roc_mat: 209 | :param ap_mat: 210 | :param prc_mat: 211 | :param method_list: 212 | :param timestamp: 213 | :param verbose: 214 | :return: None 215 | """ 216 | 217 | roc_scores = np.round(np.mean(roc_mat, axis=0), decimals=4) 218 | ap_scores = np.round(np.mean(ap_mat, axis=0), decimals=4) 219 | 220 | method_np = np.asarray(method_list) 221 | 222 | top_roc_ind = argmaxp(roc_scores, 1) 223 | top_ap_ind = argmaxp(ap_scores, 1) 224 | 225 | top_roc_clf = method_np[top_roc_ind].tolist()[0] 226 | top_ap_clf = method_np[top_ap_ind].tolist()[0] 227 | 228 | top_roc = np.round(roc_scores[top_roc_ind][0], decimals=4) 229 | top_ap = np.round(ap_scores[top_ap_ind][0], decimals=4) 230 | 231 | roc_diff = np.round(100 * (top_roc - roc_scores) / roc_scores, decimals=4) 232 | ap_diff = np.round(100 * (top_ap - ap_scores) / ap_scores, decimals=4) 233 | 234 | # initialize the log directory if it does not exist 235 | pathlib.Path('results').mkdir(parents=True, exist_ok=True) 236 | 237 | # create the file if it does not exist 238 | f = open( 239 | 'results\\' + data + '_' + base_detector + '_' + timestamp + '.csv', 240 | 'a') 241 | 242 | if verbose: 243 | f.writelines('method, ' 244 | 'roc, best_roc, diff_roc,' 245 | 'ap, best_ap, diff_ap,' 246 | 'best roc, best ap') 247 | else: 248 | f.writelines('method, ' 249 | 'roc, ap, p@m,' 250 | 'best roc, best ap') 251 | 252 | print('method, roc, ap, p@m, best roc, best ap') 253 | delim = ',' 254 | for i in range(n_baselines): 255 | print(method_list[i], roc_scores[i], ap_scores[i], 256 | top_roc_clf, top_ap_clf) 257 | 258 | if verbose: 259 | f.writelines( 260 | '\n' + str(method_list[i]) + delim + 261 | str(roc_scores[i]) + delim + str(top_roc) + delim + str( 262 | roc_diff[i]) + delim + 263 | str(ap_scores[i]) + delim + str(top_ap) + delim + str( 264 | ap_diff[i]) + delim + 265 | top_roc_clf + delim + top_ap_clf) 266 | else: 267 | f.writelines( 268 | '\n' + str(method_list[i]) + delim + 269 | str(roc_scores[i]) + delim + 270 | str(ap_scores[i]) + delim + 271 | top_roc_clf + delim + top_ap_clf) 272 | 273 | f.close() 274 | 275 | 276 | def generate_bagging_indices(random_state, bootstrap_features, n_features, 277 | min_features, max_features): 278 | """ 279 | Randomly draw feature indices. Internal use only. 280 | 281 | Modified from sklearn/ensemble/bagging.py 282 | """ 283 | # Get valid random state 284 | random_state = check_random_state(random_state) 285 | 286 | # decide number of features to draw 287 | random_n_features = random_state.randint(min_features, max_features) 288 | 289 | # Draw indices 290 | feature_indices = _generate_indices(random_state, bootstrap_features, 291 | n_features, random_n_features) 292 | 293 | return feature_indices 294 | 295 | 296 | def _generate_indices(random_state, bootstrap, n_population, n_samples): 297 | """ 298 | Draw randomly sampled indices. Internal use only. 299 | 300 | See sklearn/ensemble/bagging.py 301 | """ 302 | # Draw sample indices 303 | if bootstrap: 304 | indices = random_state.randint(0, n_population, n_samples) 305 | else: 306 | indices = sample_without_replacement(n_population, n_samples, 307 | random_state=random_state) 308 | 309 | return indices 310 | 311 | 312 | def get_local_region(X_train_norm, X_test_norm, loc_region_size, 313 | loc_region_ite, local_region_strength, 314 | loc_min_features, random_state): 315 | # Initialize the local region list 316 | grid = [[]] * X_test_norm.shape[0] 317 | 318 | for t in range(loc_region_ite): 319 | features = generate_bagging_indices(random_state, 320 | bootstrap_features=False, 321 | n_features=X_train_norm.shape[1], 322 | min_features=int( 323 | X_train_norm.shape[ 324 | 1] * loc_min_features), 325 | max_features=X_train_norm.shape[1]) 326 | 327 | tree = KDTree(X_train_norm[:, features]) 328 | dist_arr, ind_arr = tree.query(X_test_norm[:, features], 329 | k=loc_region_size) 330 | 331 | for j in range(X_test_norm.shape[0]): 332 | grid[j] = grid[j] + ind_arr[j, :].tolist() 333 | 334 | grid_f = [[]] * X_test_norm.shape[0] 335 | for j in range(X_test_norm.shape[0]): 336 | grid_f[j] = [item for item, count in 337 | collections.Counter(grid[j]).items() if 338 | count > local_region_strength] 339 | 340 | return grid_f 341 | 342 | 343 | def get_competent_detectors(scores, n_bins=10, n_selected=5): 344 | """ algorithm for selecting the most competent detectors 345 | :param scores: 346 | :param n_bins: 347 | :param n_selected: 348 | :return: 349 | """ 350 | scores = scores.reshape(-1, 1) 351 | hist, bin_edges = np.histogram(scores, bins=n_bins) 352 | # dense_bin = np.argmax(hist) 353 | max_bins = argmaxn(hist, n=n_selected, desc=True) 354 | candidates = [] 355 | # print(hist) 356 | for max_bin in max_bins: 357 | # print(bin_edges[max_bin], bin_edges[max_bin+1]) 358 | selected = np.where((scores >= bin_edges[max_bin]) 359 | & (scores <= bin_edges[max_bin + 1])) 360 | # print(selected) 361 | candidates = candidates + selected[0].tolist() 362 | 363 | # print(np.mean(scores[candidates,:]), np.mean(scores)) 364 | # return np.mean(scores[candidates, :]) 365 | return candidates 366 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | **L**ocally **S**elective **C**ombination in **P**arallel Outlier Ensembles (LSCP): 3 | **a fully unsupervised framework to selectively combine base detectors by emphasizing data locality.** 4 | 5 | ------------ 6 | 7 | Zhao, Y., Nasrullah, Z., Hryniewicki, M.K. and Li, Z. LSCP: Locally Selective Combination in Parallel Outlier Ensembles. 8 | *SIAM International Conference on Data Mining (SDM)*, 2019. 9 | 10 | Please cite the paper as: 11 | 12 | @inproceedings{zhao2019lscp, 13 | title={{LSCP:} Locally Selective Combination in Parallel Outlier Ensembles}, 14 | author={Zhao, Yue and Nasrullah, Zain and Hryniewicki, Maciej K and Li, Zheng}, 15 | booktitle={Proceedings of the 2019 {SIAM} International Conference on Data Mining, {SDM} 2019}, 16 | pages={585--593}, 17 | month = {May}, 18 | year={2019}, 19 | address = {Calgary, Canada}, 20 | organization={SIAM}, 21 | url={https://doi.org/10.1137/1.9781611975673.66}, 22 | doi={10.1137/1.9781611975673.66} 23 | } 24 | 25 | 26 | [PDF for Personal Use](https://epubs.siam.org/doi/pdf/10.1137/1.9781611975673.66) | 27 | [SIAM Page](https://epubs.siam.org/doi/abs/10.1137/1.9781611975673.66) | 28 | [Presentation Slides](http://www.andrew.cmu.edu/user/yuezhao2/papers/19-sdm-lscp-slides.pdf) | 29 | [API Documentation](https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.lscp) | 30 | [Example with PyOD](https://github.com/yzhao062/pyod/blob/master/examples/lscp_example.py) 31 | 32 | **Update** (May 9th, 2019): [Published version](https://epubs.siam.org/doi/pdf/10.1137/1.9781611975673.66) is available for download. 33 | 34 | **Update** (Jan 23th, 2019): [Camera-ready version](https://arxiv.org/abs/1812.01528) is available for download. 35 | 36 | **Update** (Dec 25th, 2018): LSCP has been officially released in **[Python Outlier Detection (PyOD)](https://github.com/yzhao062/pyod)** V0.6.6. 37 | 38 | **Update** (Dec 21th, 2018): LSCP has been accepted at SDM 2019. Acceptance rate 22.7% (90/397). 39 | 40 | **Update** (Dec 6th, 2018): LSCP has been included as part of **[Python Outlier Detection (PyOD)](https://github.com/yzhao062/pyod)**, 41 | to be released in pyod V0.6.6. 42 | 43 | ------------ 44 | 45 | ### Additional notes 46 | 47 | 1. Two versions of codes are provided: 48 | 1. **Demo version** (demo_lof.py) is created for the fast reproduction of the experiment results. The demo version only compares the baseline algorithms with LSCP algorithms. 49 | 2. **Production version** ([Python Outlier Detection (PyOD)](https://github.com/yzhao062/pyod)) is released with full optimization and testing as a framework. The purpose of this version is to be used in real applications, which should require fewer dependencies and faster execution. 50 | 2. It is understood that there are **small variations** in the results due to the random process, e.g., splitting the training and test sets. Thus, running demo codes would only result in similar results to the paper but not the exactly same results. 51 | 52 | ------------ 53 | 54 | ## Introduction 55 | In unsupervised outlier ensembles, the absence of ground truth makes the combination of base outlier detectors a challenging task. 56 | Specifically, existing parallel outlier ensembles lack a reliable way of selecting competent base detectors, affecting accuracy and stability, during model combination. 57 | In this paper, we propose a framework---called Locally Selective Combination in Parallel Outlier Ensembles (LSCP)---which addresses the issue by defining a local region around a test instance using the consensus of its nearest neighbors in randomly selected feature subspaces. 58 | The top-performing base detectors in this local region are selected and combined as the model's final output. 59 | Four variants of the LSCP framework are compared with seven widely used parallel frameworks. Experimental results demonstrate that one of these variants, LSCP_AOM, consistently outperforms baselines on the majority of twenty real-world datasets. 60 | 61 | ![LSCP Flowchart](https://github.com/yzhao062/LSCP/blob/master/figs/flowchart2.png) 62 | 63 | ## Dependency 64 | The experiment codes are writen in Python 3.6 and built on a number of Python packages: 65 | - numpy>=1.13 66 | - numba>=0.35 67 | - scipy>=0.19 68 | - scikit_learn>=0.19 69 | 70 | Batch installation is possible using the supplied "requirements.txt" with pip or conda. 71 | 72 | ````cmd 73 | pip install -r requirements.txt 74 | ```` 75 | 76 | ## Datasets 77 | 20 datasets are used (see dataset folder): 78 | 79 | | Datasets | #Sample Dimension | Dimension | #Outliers | # Outlier Perc| 80 | | -----------| ------------------ | ---------- | ---------- | ------------- | 81 | | Annthyroid | 7200 | 6 | 534 | 7.41 | 82 | | Arrhythmia | 452 | 274 | 66 | 14.60 | 83 | | Breastw | 683 | 9 | 239 | 34.99 | 84 | | Cardio | 1831 | 21 | 176 | 9.61 | 85 | | Letter | 1600 | 32 | 100 | 6.25 | 86 | | MNIST | 7603 | 100 | 700 | 9.21 | 87 | | Musk | 3062 | 166 | 97 | 3.17 | 88 | | PageBlocks | 5393 | 10 | 510 | 9.46 | 89 | | Pendigits | 6870 | 16 | 156 | 2.27 | 90 | | Pima | 768 | 8 | 268 | 34.90 | 91 | | Satellite | 6435 | 36 | 2036 | 31.64 | 92 | | Satimage-2 | 5803 | 36 | 71 | 1.22 | 93 | | Shuttle | 49097 | 9 | 3511 | 7.15 | 94 | | SpamSpace | 4207 | 57 | 1679 | 39.91 | 95 | | Stamps | 340 | 9 | 31 | 9.12 | 96 | | Thyroid | 3772 | 6 | 93 | 2.47 | 97 | | Vertebral | 240 | 6 | 30 | 12.50 | 98 | | Vowels | 1456 | 12 | 50 | 3.43 | 99 | | Wbc | 378 | 30 | 21 | 5.56 | 100 | | Wilt | 4819 | 5 | 257 | 5.33 | 101 | 102 | All datasets are accessible from http://odds.cs.stonybrook.edu/ and 103 | http://www.dbs.ifi.lmu.de/research/outlier-evaluation/DAMI/. 104 | 105 | Citation Suggestion for the datasets please refer to: 106 | > Shebuti Rayana (2016). ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science. 107 | 108 | > Campos, G.O., Zimek, A., Sander, J., Campello, R.J., Micenková, B., Schubert, E., Assent, I. and Houle, M.E., 2016. On the evaluation of unsupervised outlier detection: measures, datasets, and an empirical study. *Data Mining and Knowledge Discovery*, 30(4), pp.891-927. 109 | 110 | ## Usage and Sample Output (Demo Version) 111 | Experiments could be reproduced by running **demo_lof.py** directly. You could simply download/clone the entire repository and execute the code by 112 | 113 | ```cmd 114 | python demo_lof.py 115 | ``` 116 | Two evaluation methods are introduced and the result would be saved into "results" folder: 117 | 1. The area under receiver operating characteristic curve (**ROC**) 118 | 2. mean Average Precision (**mAP**) 119 | 120 | ## Results 121 | 122 | **Table 2: ROC-AUC scores (average of 30 independent trials, highest score highlighted in bold)** 123 | 124 | | Datasets | LSCP_A | LSCP_MOA | LSCP_M | LSCP_AOM | GG_A | GG_MOA | GG_M | GG_AOM | GG_WA | GG_TH | GG_FB | 125 | | -----------| ------ | ------ | -------| ------ | -------| ------ | ------ | ------ | -------- | -------- | ------ | 126 | | Annthyroid | 0.7548 | 0.7590 | 0.7849 | 0.7520 | 0.7642 | 0.7660 | 0.7769 | 0.7730 | 0.7632 | 0.7552 | **0.7854** | 127 | | Arrhythmia | 0.7746 | 0.7715 | 0.7729 | **0.7763** | 0.7758 | 0.7749 | 0.7656 | 0.7690 | 0.7758 | 0.7313 | 0.7709 | 128 | | Breastw | 0.6553 | 0.7044 | 0.7236 | **0.7845** | 0.7362 | 0.7140 | 0.6590 | 0.6838 | 0.7453 | 0.6285 | 0.3935 | 129 | | Cardio | 0.8691 | 0.8908 | 0.8491 | **0.9013** | 0.8770 | 0.8865 | 0.8798 | 0.8903 | 0.8782 | 0.8830 | 0.8422 | 130 | | Letter | 0.7818 | 0.7954 | 0.8361 | 0.7867 | 0.7925 | 0.8031 | **0.8434** | 0.8300 | 0.7908 | 0.8001 | 0.7640 | 131 | | MNIST | 0.8576 | 0.8623 | 0.7812 | **0.8633** | 0.8557 | 0.8588 | 0.8349 | 0.8553 | 0.8563 | 0.8272 | 0.8468 | 132 | | Musk | 0.9950 | 0.9970 | 0.9931 | **0.9981** | 0.9937 | 0.9960 | 0.9960 | 0.9970 | 0.9953 | 0.9958 | 0.7344 | 133 | | PageBlocks | 0.9349 | 0.9343 | 0.8687 | **0.9488** | 0.9443 | 0.9440 | 0.9240 | 0.9371 | 0.9453 | 0.9418 | 0.9284 | 134 | | Pendigits | 0.8238 | 0.8656 | 0.7238 | **0.8744** | 0.8378 | 0.8509 | 0.8488 | 0.8622 | 0.8425 | 0.8548 | 0.8034 | 135 | | Pima | 0.7059 | 0.6991 | 0.6640 | **0.7061** | 0.7030 | 0.7003 | 0.6730 | 0.6856 | 0.7037 | 0.6349 | 0.6989 | 136 | | Satellite | 0.5814 | 0.6106 | 0.6006 | 0.6015 | 0.5881 | 0.5992 | **0.6258** | 0.6220 | 0.5876 | 0.6101 | 0.5818 | 137 | | Satimage-2 | 0.9852 | 0.9931 | 0.9878 | **0.9935** | 0.9872 | 0.9907 | 0.9909 | 0.9925 | 0.9880 | 0.9881 | 0.9181 | 138 | | Shuttle | 0.5392 | 0.5551 | 0.5373 | 0.5514 | 0.5439 | 0.5504 | **0.5612** | 0.5602 | 0.5413 | 0.5561 | 0.3702 | 139 | | SpamSpace | 0.3792 | 0.4594 | 0.4305 | **0.4744** | 0.4487 | 0.4377 | 0.4060 | 0.4128 | 0.4580 | 0.4104 | 0.3312 | 140 | | Stamps | 0.8888 | 0.8719 | 0.8525 | **0.8985** | 0.8946 | 0.8927 | 0.8559 | 0.8763 | 0.8953 | 0.8904 | 0.8715 | 141 | | Thyroid | 0.9579 | 0.9624 | 0.9413 | **0.9700** | 0.9656 | 0.9647 | 0.9385 | 0.9510 | 0.9665 | 0.9644 | 0.8510 | 142 | | Vertebral | 0.3324 | 0.3662 | **0.4306** | 0.3478 | 0.3433 | 0.3467 | 0.3662 | 0.3614 | 0.3442 | 0.3678 | 0.3385 | 143 | | Vowels | 0.9276 | 0.9185 | 0.9238 | 0.9199 | 0.9265 | 0.9275 | **0.9313** | 0.9271 | 0.9261 | 0.9299 | 0.9148 | 144 | | WBC | 0.9379 | 0.9344 | 0.9242 | **0.9451** | 0.9421 | 0.9409 | 0.9321 | 0.9367 | 0.9420 | 0.9314 | 0.9407 | 145 | | Wilt | 0.5275 | 0.5517 | **0.6550** | 0.4286 | 0.5101 | 0.5358 | 0.6384 | 0.6056 | 0.5037 | 0.5586 | 0.5868 | 146 | 147 | **Table 3: mAP scores (average of 30 independent trials, highest score highlighted in bold)** 148 | 149 | | Datasets | LSCP_A | LSCP_MOA | LSCP_M | LSCP_AOM | GG_A | GG_MOA | GG_M | GG_AOM | GG_WA | GG_TH | GG_FB | 150 | | -----------| ------ | ------ | -------| ------ | -------| ------ | ------ | ------ | -------- | -------- | ------ | 151 | | Annthyroid | 0.2283 | 0.2375 | 0.2349 | 0.2453 | 0.2301 | 0.2395 | 0.2413 | **0.2516** | 0.2306 | 0.2277 | 0.1864 | 152 | | Arrhythmia | 0.3780 | 0.3744 | 0.3790 | **0.3796** | 0.3766 | 0.3769 | 0.3690 | 0.3722 | 0.3766 | 0.3468 | 0.3707 | 153 | | Breastw | 0.4334 | 0.4766 | 0.4728 | **0.5655** | 0.4995 | 0.4849 | 0.4249 | 0.4577 | 0.5085 | 0.4366 | 0.2854 | 154 | | Cardio | 0.3375 | 0.3960 | 0.3197 | **0.4117** | 0.3516 | 0.3708 | 0.3666 | 0.3864 | 0.3535 | 0.3629 | 0.3643 | 155 | | Letter | 0.2302 | 0.2396 | **0.3346** | 0.2407 | 0.2388 | 0.2473 | 0.3160 | 0.2867 | 0.2372 | 0.2416 | 0.2193 | 156 | | MNIST | 0.3933 | 0.3974 | 0.3353 | **0.3979** | 0.3911 | 0.3941 | 0.3701 | 0.3896 | 0.3918 | 0.3836 | 0.3928 | 157 | | Musk | 0.8478 | 0.8773 | 0.8433 | **0.9240** | 0.8245 | 0.8718 | 0.8479 | 0.8806 | 0.8608 | 0.8629 | 0.5806 | 158 | | PageBlocks | 0.5805 | 0.5707 | 0.4684 | **0.6360** | 0.6043 | 0.6016 | 0.5297 | 0.5733 | 0.6077 | 0.6064 | 0.6094 | 159 | | Pendigits | 0.0709 | 0.0893 | 0.0625 | **0.0944** | 0.0777 | 0.0823 | 0.0834 | 0.0895 | 0.0780 | 0.0832 | 0.0834 | 160 | | Pima | 0.5092 | 0.5045 | 0.4716 | **0.5142** | 0.5089 | 0.5054 | 0.4813 | 0.4920 | 0.5095 | 0.4599 | 0.5094 | 161 | | Satellite | 0.4077 | 0.4268 | 0.4223 | 0.4196 | 0.4047 | 0.4139 | **0.4385** | 0.4352 | 0.4047 | 0.4031 | 0.4049 | 162 | | Satimage-2 | 0.3477 | 0.6248 | 0.3994 | **0.6249** | 0.3959 | 0.5089 | 0.5344 | 0.5922 | 0.4159 | 0.4114 | 0.4851 | 163 | | Shuttle | 0.1228 | 0.1296 | 0.1167 | **0.1330** | 0.1297 | 0.1316 | 0.1239 | 0.1294 | 0.1293 | 0.1316 | 0.0549 | 164 | | SpamSpace | 0.3326 | 0.3615 | 0.3592 | **0.3665** | 0.3572 | 0.3521 | 0.3379 | 0.3413 | 0.3612 | 0.3601 | 0.3079 | 165 | | Stamps | 0.3596 | 0.3310 | 0.3193 | **0.3779** | 0.3694 | 0.3660 | 0.3144 | 0.3387 | 0.3706 | 0.3638 | 0.3535 | 166 | | Thyroid | 0.3544 | 0.3955 | 0.2638 | **0.4651** | 0.4045 | 0.4123 | 0.2850 | 0.3488 | 0.4130 | 0.4071 | 0.1186 | 167 | | Vertebral | 0.0948 | 0.1020 | **0.1230** | 0.0988 | 0.0971 | 0.0975 | 0.1029 | 0.1000 | 0.0972 | 0.1067 | 0.0965 | 168 | | Vowels | **0.3913** | 0.3678 | 0.3482 | 0.3539 | 0.3783 | 0.3790 | 0.3760 | 0.3732 | 0.3784 | 0.3783 | 0.3340 | 169 | | WBC | 0.6033 | 0.5983 | 0.5472 | **0.6131** | 0.6097 | 0.6069 | 0.5579 | 0.5925 | 0.6105 | 0.6045 | 0.5933 | 170 | | Wilt | 0.0518 | 0.0557 | **0.0770** | 0.0423 | 0.0493 | 0.0523 | 0.0715 | 0.0633 | 0.0486 | 0.0537 | 0.0591 | 171 | 172 | ## Conclusions 173 | 174 | In this work, we propose four variants of a novel unsupervised outlier detection framework called Locally Selective Combination in Parallel Outlier Ensembles (LSCP). 175 | Unlike traditional combination approaches, LSCP identifies the top-performing base detectors for each test instance relative to its local region. 176 | To validate its effectiveness, the proposed framework is assessed on 20 real-world datasets and is shown to be superior to baseline algorithms. 177 | The ensemble approach *LSCP_AOM* demonstrates the best performance achieving the highest detection score on 13/20 datasets with respect to ROC-AUC and 14/20 datasets with respect to mAP. 178 | Theoretical considerations under the bias-variance framework and visualizations are also provided for LSCP to provide a holistic overview of the framework. 179 | Since LSCP demonstrates the promise of data locality, future work can extend this exploration by investigating the use of heterogeneous base detectors and more reliable pseudo ground truth generation methods. -------------------------------------------------------------------------------- /models/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Base class for all outlier detector models 3 | """ 4 | # Author: Yue Zhao 5 | # License: BSD 2 clause 6 | 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import warnings 11 | from collections import defaultdict 12 | import abc 13 | import sklearn 14 | 15 | 16 | def _sklearn_version_21(): # pragma: no cover 17 | """ Utility function to decide the version of sklearn 18 | In sklearn 21.0, LOF is changed. Specifically, _decision_function 19 | is replaced by _score_samples 20 | 21 | Returns 22 | ------- 23 | sklearn_21_flag : bool 24 | True if sklearn.__version__ is newer than 0.21.0 25 | 26 | """ 27 | sklearn_version = str(sklearn.__version__) 28 | if int(sklearn_version.split(".")[1]) > 20: 29 | return True 30 | else: 31 | return False 32 | 33 | 34 | if _sklearn_version_21(): 35 | from inspect import signature 36 | else: 37 | from sklearn.externals.funcsigs import signature 38 | 39 | from sklearn.externals import six 40 | 41 | import numpy as np 42 | from scipy.special import erf 43 | from scipy.stats import scoreatpercentile 44 | from sklearn.preprocessing import MinMaxScaler 45 | from sklearn.utils.validation import check_is_fitted 46 | from sklearn.utils.multiclass import check_classification_targets 47 | 48 | from .sklearn_base import _pprint 49 | 50 | 51 | @six.add_metaclass(abc.ABCMeta) 52 | class BaseDetector(object): 53 | """Abstract class for all outlier detection algorithms. 54 | 55 | Parameters 56 | ---------- 57 | contamination : float in (0., 0.5), optional (default=0.1) 58 | The amount of contamination of the data set, 59 | i.e. the proportion of outliers in the data set. Used when fitting to 60 | define the threshold on the decision function. 61 | 62 | Attributes 63 | ---------- 64 | decision_scores_ : numpy array of shape (n_samples,) 65 | The outlier scores of the training data. 66 | The higher, the more abnormal. Outliers tend to have higher 67 | scores. This value is available once the detector is fitted. 68 | 69 | threshold_ : float 70 | The threshold is based on ``contamination``. It is the 71 | ``n_samples * contamination`` most abnormal samples in 72 | ``decision_scores_``. The threshold is calculated for generating 73 | binary outlier labels. 74 | 75 | labels_ : int, either 0 or 1 76 | The binary labels of the training data. 0 stands for inliers 77 | and 1 for outliers/anomalies. It is generated by applying 78 | ``threshold_`` on ``decision_scores_``. 79 | """ 80 | 81 | @abc.abstractmethod 82 | def __init__(self, contamination=0.1): 83 | 84 | if not (0. < contamination <= 0.5): 85 | raise ValueError("contamination must be in (0, 0.5], " 86 | "got: %f" % contamination) 87 | 88 | self.contamination = contamination 89 | 90 | # noinspection PyIncorrectDocstring 91 | @abc.abstractmethod 92 | def fit(self, X, y=None): 93 | """Fit detector. y is optional for unsupervised methods. 94 | 95 | Parameters 96 | ---------- 97 | X : numpy array of shape (n_samples, n_features) 98 | The input samples. 99 | 100 | y : numpy array of shape (n_samples,), optional (default=None) 101 | The ground truth of the input samples (labels). 102 | """ 103 | pass 104 | 105 | @abc.abstractmethod 106 | def decision_function(self, X): 107 | """Predict raw anomaly score of X using the fitted detector. 108 | 109 | The anomaly score of an input sample is computed based on different 110 | detector algorithms. For consistency, outliers are assigned with 111 | larger anomaly scores. 112 | 113 | Parameters 114 | ---------- 115 | X : numpy array of shape (n_samples, n_features) 116 | The training input samples. Sparse matrices are accepted only 117 | if they are supported by the base estimator. 118 | 119 | Returns 120 | ------- 121 | anomaly_scores : numpy array of shape (n_samples,) 122 | The anomaly score of the input samples. 123 | """ 124 | pass 125 | 126 | def fit_predict(self, X, y=None): 127 | """Fit detector first and then predict whether a particular sample 128 | is an outlier or not. 129 | 130 | Parameters 131 | ---------- 132 | X : numpy array of shape (n_samples, n_features) 133 | The input samples. 134 | 135 | y : numpy array of shape (n_samples,), optional (default=None) 136 | The ground truth of the input samples (labels). 137 | 138 | Returns 139 | ------- 140 | outlier_labels : numpy array of shape (n_samples,) 141 | For each observation, tells whether or not 142 | it should be considered as an outlier according to the 143 | fitted model. 0 stands for inliers and 1 for outliers. 144 | """ 145 | 146 | self.fit(X, y) 147 | return self.labels_ 148 | 149 | def predict(self, X): 150 | """Predict if a particular sample is an outlier or not. 151 | 152 | Parameters 153 | ---------- 154 | X : numpy array of shape (n_samples, n_features) 155 | The input samples. 156 | 157 | Returns 158 | ------- 159 | outlier_labels : numpy array of shape (n_samples,) 160 | For each observation, tells whether or not 161 | it should be considered as an outlier according to the 162 | fitted model. 0 stands for inliers and 1 for outliers. 163 | """ 164 | 165 | check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) 166 | 167 | pred_score = self.decision_function(X) 168 | return (pred_score > self.threshold_).astype('int').ravel() 169 | 170 | def predict_proba(self, X, method='linear'): 171 | """Predict the probability of a sample being outlier. Two approaches 172 | are possible: 173 | 174 | 1. simply use Min-max conversion to linearly transform the outlier 175 | scores into the range of [0,1]. The model must be 176 | fitted first. 177 | 2. use unifying scores, see :cite:`kriegel2011interpreting`. 178 | 179 | Parameters 180 | ---------- 181 | X : numpy array of shape (n_samples, n_features) 182 | The input samples. 183 | 184 | method : str, optional (default='linear') 185 | probability conversion method. It must be one of 186 | 'linear' or 'unify'. 187 | 188 | Returns 189 | ------- 190 | outlier_labels : numpy array of shape (n_samples,) 191 | For each observation, tells whether or not 192 | it should be considered as an outlier according to the 193 | fitted model. Return the outlier probability, ranging 194 | in [0,1]. 195 | """ 196 | 197 | check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) 198 | train_scores = self.decision_scores_ 199 | 200 | test_scores = self.decision_function(X) 201 | 202 | probs = np.zeros([X.shape[0], int(self._classes)]) 203 | if method == 'linear': 204 | scaler = MinMaxScaler().fit(train_scores.reshape(-1, 1)) 205 | probs[:, 1] = scaler.transform( 206 | test_scores.reshape(-1, 1)).ravel().clip(0, 1) 207 | probs[:, 0] = 1 - probs[:, 1] 208 | return probs 209 | 210 | elif method == 'unify': 211 | # turn output into probability 212 | pre_erf_score = (test_scores - self._mu) / ( 213 | self._sigma * np.sqrt(2)) 214 | erf_score = erf(pre_erf_score) 215 | probs[:, 1] = erf_score.clip(0, 1).ravel() 216 | probs[:, 0] = 1 - probs[:, 1] 217 | return probs 218 | else: 219 | raise ValueError(method, 220 | 'is not a valid probability conversion method') 221 | 222 | def _predict_rank(self, X, normalized=False): 223 | """Predict the outlyingness rank of a sample by a fitted model. The 224 | method is for outlier detector score combination. 225 | 226 | Parameters 227 | ---------- 228 | X : numpy array of shape (n_samples, n_features) 229 | The input samples. 230 | 231 | normalized : bool, optional (default=False) 232 | If set to True, all ranks are normalized to [0,1]. 233 | 234 | Returns 235 | ------- 236 | ranks : array, shape (n_samples,) 237 | Outlying rank of a sample according to the training data. 238 | 239 | """ 240 | 241 | check_is_fitted(self, ['decision_scores_']) 242 | 243 | test_scores = self.decision_function(X) 244 | train_scores = self.decision_scores_ 245 | 246 | sorted_train_scores = np.sort(train_scores) 247 | ranks = np.searchsorted(sorted_train_scores, test_scores) 248 | 249 | if normalized: 250 | # return normalized ranks 251 | ranks = ranks / ranks.max() 252 | return ranks 253 | 254 | # def score(self, X, y, scoring='roc_auc_score'): 255 | # """Returns the evaluation resulted on the given test data and labels. 256 | # ROC is chosen as the default evaluation metric 257 | # 258 | # :param X: The input samples 259 | # :type X: numpy array of shape (n_samples, n_features) 260 | # 261 | # :param y: Outlier labels of the input samples 262 | # :type y: array, shape (n_samples,) 263 | # 264 | # :param scoring: Evaluation metric 265 | # 266 | # -' roc_auc_score': ROC score 267 | # - 'prc_n_score': Precision @ rank n score 268 | # :type scoring: str, optional (default='roc_auc_score') 269 | # 270 | # :return: Evaluation score 271 | # :rtype: float 272 | # """ 273 | # check_is_fitted(self, ['decision_scores_']) 274 | # if scoring == 'roc_auc_score': 275 | # score = roc_auc_score(y, self.decision_function(X)) 276 | # elif scoring == 'prc_n_score': 277 | # score = precision_n_scores(y, self.decision_function(X)) 278 | # else: 279 | # raise NotImplementedError('PyOD built-in scoring only supports ' 280 | # 'ROC and Precision @ rank n') 281 | # 282 | # print("{metric}: {score}".format(metric=scoring, score=score)) 283 | # 284 | # return score 285 | 286 | def _set_n_classes(self, y): 287 | """Set the number of classes if `y` is presented, which is not 288 | expected. It could be useful for multi-class outlier detection. 289 | 290 | Parameters 291 | ---------- 292 | y : numpy array of shape (n_samples,) 293 | Ground truth. 294 | 295 | Returns 296 | ------- 297 | self 298 | """ 299 | 300 | self._classes = 2 # default as binary classification 301 | if y is not None: 302 | check_classification_targets(y) 303 | self._classes = len(np.unique(y)) 304 | return self 305 | 306 | def _process_decision_scores(self): 307 | """Internal function to calculate key attributes: 308 | 309 | - threshold_: used to decide the binary label 310 | - labels_: binary labels of training data 311 | 312 | Returns 313 | ------- 314 | self 315 | """ 316 | 317 | self.threshold_ = scoreatpercentile(self.decision_scores_, 318 | 100 * (1 - self.contamination)) 319 | self.labels_ = (self.decision_scores_ > self.threshold_).astype( 320 | 'int').ravel() 321 | 322 | # calculate for predict_proba() 323 | 324 | self._mu = np.mean(self.decision_scores_) 325 | self._sigma = np.std(self.decision_scores_) 326 | 327 | return self 328 | 329 | # noinspection PyMethodParameters 330 | def _get_param_names(cls): 331 | # noinspection PyPep8 332 | """Get parameter names for the estimator 333 | 334 | See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html 335 | and sklearn/base.py for more information. 336 | """ 337 | 338 | # fetch the constructor or the original constructor before 339 | # deprecation wrapping if any 340 | init = getattr(cls.__init__, 'deprecated_original', cls.__init__) 341 | if init is object.__init__: 342 | # No explicit constructor to introspect 343 | return [] 344 | 345 | # introspect the constructor arguments to find the model parameters 346 | # to represent 347 | init_signature = signature(init) 348 | # Consider the constructor parameters excluding 'self' 349 | parameters = [p for p in init_signature.parameters.values() 350 | if p.name != 'self' and p.kind != p.VAR_KEYWORD] 351 | for p in parameters: 352 | if p.kind == p.VAR_POSITIONAL: 353 | raise RuntimeError("scikit-learn estimators should always " 354 | "specify their parameters in the signature" 355 | " of their __init__ (no varargs)." 356 | " %s with constructor %s doesn't " 357 | " follow this convention." 358 | % (cls, init_signature)) 359 | # Extract and sort argument names excluding 'self' 360 | return sorted([p.name for p in parameters]) 361 | 362 | # noinspection PyPep8 363 | def get_params(self, deep=True): 364 | """Get parameters for this estimator. 365 | 366 | See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html 367 | and sklearn/base.py for more information. 368 | 369 | Parameters 370 | ---------- 371 | deep : boolean, optional 372 | If True, will return the parameters for this estimator and 373 | contained subobjects that are estimators. 374 | 375 | Returns 376 | ------- 377 | params : mapping of string to any 378 | Parameter names mapped to their values. 379 | """ 380 | 381 | out = dict() 382 | for key in self._get_param_names(): 383 | # We need deprecation warnings to always be on in order to 384 | # catch deprecated param values. 385 | # This is set in utils/__init__.py but it gets overwritten 386 | # when running under python3 somehow. 387 | warnings.simplefilter("always", DeprecationWarning) 388 | try: 389 | with warnings.catch_warnings(record=True) as w: 390 | value = getattr(self, key, None) 391 | if len(w) and w[0].category == DeprecationWarning: 392 | # if the parameter is deprecated, don't show it 393 | continue 394 | finally: 395 | warnings.filters.pop(0) 396 | 397 | # XXX: should we rather test if instance of estimator? 398 | if deep and hasattr(value, 'get_params'): 399 | deep_items = value.get_params().items() 400 | out.update((key + '__' + k, val) for k, val in deep_items) 401 | out[key] = value 402 | return out 403 | 404 | def set_params(self, **params): 405 | # noinspection PyPep8 406 | """Set the parameters of this estimator. 407 | The method works on simple estimators as well as on nested objects 408 | (such as pipelines). The latter have parameters of the form 409 | ``__`` so that it's possible to update each 410 | component of a nested object. 411 | 412 | See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html 413 | and sklearn/base.py for more information. 414 | 415 | Returns 416 | ------- 417 | self : object 418 | """ 419 | 420 | if not params: 421 | # Simple optimization to gain speed (inspect is slow) 422 | return self 423 | valid_params = self.get_params(deep=True) 424 | 425 | nested_params = defaultdict(dict) # grouped by prefix 426 | for key, value in params.items(): 427 | key, delim, sub_key = key.partition('__') 428 | if key not in valid_params: 429 | raise ValueError('Invalid parameter %s for estimator %s. ' 430 | 'Check the list of available parameters ' 431 | 'with `estimator.get_params().keys()`.' % 432 | (key, self)) 433 | 434 | if delim: 435 | nested_params[key][sub_key] = value 436 | else: 437 | setattr(self, key, value) 438 | 439 | for key, sub_params in nested_params.items(): 440 | valid_params[key].set_params(**sub_params) 441 | 442 | return self 443 | 444 | def __repr__(self): 445 | # noinspection PyPep8 446 | """ 447 | See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html 448 | and sklearn/base.py for more information. 449 | """ 450 | 451 | class_name = self.__class__.__name__ 452 | return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False), 453 | offset=len(class_name), ),) 454 | -------------------------------------------------------------------------------- /models/feature_bagging.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Feature bagging detector 3 | """ 4 | # Author: Yue Zhao 5 | # License: BSD 2 clause 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import numpy as np 10 | import numbers 11 | from sklearn.base import clone 12 | from sklearn.utils import check_random_state 13 | from sklearn.utils import check_array 14 | from sklearn.utils.validation import check_is_fitted 15 | from sklearn.utils.estimator_checks import check_estimator 16 | from sklearn.utils.random import sample_without_replacement 17 | 18 | from .lof import LOF 19 | from .base import BaseDetector 20 | 21 | from .combination import average, maximization 22 | 23 | MAX_INT = np.iinfo(np.int32).max 24 | MIN_INT = -1 * MAX_INT 25 | 26 | 27 | def generate_bagging_indices(random_state, bootstrap_features, n_features, 28 | min_features, max_features): 29 | """ Randomly draw feature indices. Internal use only. 30 | 31 | Modified from sklearn/ensemble/bagging.py 32 | 33 | Parameters 34 | ---------- 35 | random_state : RandomState 36 | A random number generator instance to define the state of the random 37 | permutations generator. 38 | 39 | bootstrap_features : bool 40 | Specifies whether to bootstrap indice generation 41 | 42 | n_features : int 43 | Specifies the population size when generating indices 44 | 45 | min_features : int 46 | Lower limit for number of features to randomly sample 47 | 48 | max_features : int 49 | Upper limit for number of features to randomly sample 50 | 51 | Returns 52 | ------- 53 | feature_indices : numpy array, shape (n_samples,) 54 | Indices for features to bag 55 | 56 | """ 57 | 58 | # Get valid random state 59 | random_state = check_random_state(random_state) 60 | 61 | # decide number of features to draw 62 | random_n_features = random_state.randint(min_features, max_features) 63 | 64 | # Draw indices 65 | feature_indices = generate_indices(random_state, bootstrap_features, 66 | n_features, random_n_features) 67 | 68 | return feature_indices 69 | 70 | 71 | def generate_indices(random_state, bootstrap, n_population, n_samples): 72 | """ Draw randomly sampled indices. Internal use only. 73 | 74 | See sklearn/ensemble/bagging.py 75 | 76 | Parameters 77 | ---------- 78 | random_state : RandomState 79 | A random number generator instance to define the state of the random 80 | permutations generator. 81 | 82 | bootstrap : bool 83 | Specifies whether to bootstrap indice generation 84 | 85 | n_population : int 86 | Specifies the population size when generating indices 87 | 88 | n_samples : int 89 | Specifies number of samples to draw 90 | 91 | Returns 92 | ------- 93 | indices : numpy array, shape (n_samples,) 94 | randomly drawn indices 95 | """ 96 | 97 | # Draw sample indices 98 | if bootstrap: 99 | indices = random_state.randint(0, n_population, n_samples) 100 | else: 101 | indices = sample_without_replacement(n_population, n_samples, 102 | random_state=random_state) 103 | 104 | return indices 105 | 106 | 107 | def check_parameter(param, low=MIN_INT, high=MAX_INT, param_name='', 108 | include_left=False, include_right=False): 109 | """Check if an input is within the defined range. 110 | 111 | Parameters 112 | ---------- 113 | param : int, float 114 | The input parameter to check. 115 | 116 | low : int, float 117 | The lower bound of the range. 118 | 119 | high : int, float 120 | The higher bound of the range. 121 | 122 | param_name : str, optional (default='') 123 | The name of the parameter. 124 | 125 | include_left : bool, optional (default=False) 126 | Whether includes the lower bound (lower bound <=). 127 | 128 | include_right : bool, optional (default=False) 129 | Whether includes the higher bound (<= higher bound). 130 | 131 | Returns 132 | ------- 133 | within_range : bool or raise errors 134 | Whether the parameter is within the range of (low, high) 135 | 136 | """ 137 | 138 | # param, low and high should all be numerical 139 | if not isinstance(param, (numbers.Integral, np.integer, np.float)): 140 | raise TypeError('{param_name} is set to {param} Not numerical'.format( 141 | param=param, param_name=param_name)) 142 | 143 | if not isinstance(low, (numbers.Integral, np.integer, np.float)): 144 | raise TypeError('low is set to {low}. Not numerical'.format(low=low)) 145 | 146 | if not isinstance(high, (numbers.Integral, np.integer, np.float)): 147 | raise TypeError('high is set to {high}. Not numerical'.format( 148 | high=high)) 149 | 150 | # at least one of the bounds should be specified 151 | if low is MIN_INT and high is MAX_INT: 152 | raise ValueError('Neither low nor high bounds is undefined') 153 | 154 | # if wrong bound values are used 155 | if low > high: 156 | raise ValueError( 157 | 'Lower bound > Higher bound') 158 | 159 | # value check under different bound conditions 160 | if (include_left and include_right) and (param < low or param > high): 161 | raise ValueError( 162 | '{param_name} is set to {param}. ' 163 | 'Not in the range of [{low}, {high}].'.format( 164 | param=param, low=low, high=high, param_name=param_name)) 165 | 166 | elif (include_left and not include_right) and ( 167 | param < low or param >= high): 168 | raise ValueError( 169 | '{param_name} is set to {param}. ' 170 | 'Not in the range of [{low}, {high}).'.format( 171 | param=param, low=low, high=high, param_name=param_name)) 172 | 173 | elif (not include_left and include_right) and ( 174 | param <= low or param > high): 175 | raise ValueError( 176 | '{param_name} is set to {param}. ' 177 | 'Not in the range of ({low}, {high}].'.format( 178 | param=param, low=low, high=high, param_name=param_name)) 179 | 180 | elif (not include_left and not include_right) and ( 181 | param <= low or param >= high): 182 | raise ValueError( 183 | '{param_name} is set to {param}. ' 184 | 'Not in the range of ({low}, {high}).'.format( 185 | param=param, low=low, high=high, param_name=param_name)) 186 | else: 187 | return True 188 | 189 | 190 | def _set_random_states(estimator, random_state=None): 191 | """Sets fixed random_state parameters for an estimator. Internal use only. 192 | Modified from sklearn/base.py 193 | 194 | Finds all parameters ending ``random_state`` and sets them to integers 195 | derived from ``random_state``. 196 | 197 | Parameters 198 | ---------- 199 | estimator : estimator supporting get/set_params 200 | Estimator with potential randomness managed by random_state 201 | parameters. 202 | 203 | random_state : int, RandomState instance or None, optional (default=None) 204 | If int, random_state is the seed used by the random number generator; 205 | If RandomState instance, random_state is the random number generator; 206 | If None, the random number generator is the RandomState instance used 207 | by `np.random`. 208 | 209 | Notes 210 | ----- 211 | This does not necessarily set *all* ``random_state`` attributes that 212 | control an estimator's randomness, only those accessible through 213 | ``estimator.get_params()``. ``random_state``s not controlled include 214 | those belonging to: 215 | 216 | * cross-validation splitters 217 | * ``scipy.stats`` rvs 218 | """ 219 | random_state = check_random_state(random_state) 220 | to_set = {} 221 | for key in sorted(estimator.get_params(deep=True)): 222 | if key == 'random_state' or key.endswith('__random_state'): 223 | to_set[key] = random_state.randint(MAX_INT) 224 | 225 | if to_set: 226 | estimator.set_params(**to_set) 227 | 228 | 229 | def _parallel_decision_function(estimators, estimators_features, X): 230 | n_samples = X.shape[0] 231 | scores = np.zeros((n_samples, len(estimators))) 232 | 233 | for i, (estimator, features) in enumerate( 234 | zip(estimators, estimators_features)): 235 | if hasattr(estimator, 'decision_function'): 236 | estimator_score = estimator.decision_function( 237 | X[:, features]) 238 | scores[:, i] = estimator_score 239 | else: 240 | raise NotImplementedError( 241 | 'current base detector has no decision_function') 242 | return scores 243 | 244 | 245 | # TODO: should support parallelization at the model level 246 | class FeatureBagging(BaseDetector): 247 | """ A feature bagging detector is a meta estimator that fits a number of 248 | base detectors on various sub-samples of the dataset and use averaging 249 | or other combination methods to improve the predictive accuracy and 250 | control over-fitting. 251 | 252 | The sub-sample size is always the same as the original input sample size 253 | but the features are randomly sampled from half of the features to all 254 | features. 255 | 256 | By default, LOF is used as the base estimator. However, any estimator 257 | could be used as the base estimator, such as kNN and ABOD. 258 | 259 | Feature bagging first construct n subsamples by random selecting a subset 260 | of features, which induces the diversity of base estimators. 261 | 262 | Finally, the prediction score is generated by averaging/taking the maximum 263 | of all base detectors. See :cite:`lazarevic2005feature` for details. 264 | 265 | Parameters 266 | ---------- 267 | base_estimator : object or None, optional (default=None) 268 | The base estimator to fit on random subsets of the dataset. 269 | If None, then the base estimator is a LOF detector. 270 | 271 | n_estimators : int, optional (default=10) 272 | The number of base estimators in the ensemble. 273 | 274 | contamination : float in (0., 0.5), optional (default=0.1) 275 | The amount of contamination of the data set, 276 | i.e. the proportion of outliers in the data set. Used when fitting to 277 | define the threshold on the decision function. 278 | 279 | max_features : int or float, optional (default=1.0) 280 | The number of features to draw from X to train each base estimator. 281 | 282 | - If int, then draw `max_features` features. 283 | - If float, then draw `max_features * X.shape[1]` features. 284 | 285 | bootstrap_features : bool, optional (default=False) 286 | Whether features are drawn with replacement. 287 | 288 | check_estimator : bool, optional (default=True) 289 | If set to True, check whether the base estimator is consistent with 290 | sklearn standard. 291 | 292 | n_jobs : optional (default=1) 293 | The number of jobs to run in parallel for both `fit` and 294 | `predict`. If -1, then the number of jobs is set to the 295 | number of cores. 296 | 297 | random_state : int, RandomState or None, optional (default=None) 298 | If int, random_state is the seed used by the random 299 | number generator; If RandomState instance, random_state is the random 300 | number generator; If None, the random number generator is the 301 | RandomState instance used by `np.random`. 302 | 303 | combination : str, optional (default='average') 304 | the method of combination: 305 | 306 | - if 'average': take the average of all detectors 307 | - if 'max': take the maximum scores of all detectors 308 | 309 | verbose : int, optional (default=0) 310 | Controls the verbosity of the building process. 311 | 312 | estimator_params : dict, optional (default=None) 313 | The list of attributes to use as parameters 314 | when instantiating a new base estimator. If none are given, 315 | default parameters are used. 316 | 317 | Attributes 318 | ---------- 319 | decision_scores_ : numpy array of shape (n_samples,) 320 | The outlier scores of the training data. 321 | The higher, the more abnormal. Outliers tend to have higher 322 | scores. This value is available once the detector is 323 | fitted. 324 | 325 | threshold_ : float 326 | The threshold is based on ``contamination``. It is the 327 | ``n_samples * contamination`` most abnormal samples in 328 | ``decision_scores_``. The threshold is calculated for generating 329 | binary outlier labels. 330 | 331 | labels_ : int, either 0 or 1 332 | The binary labels of the training data. 0 stands for inliers 333 | and 1 for outliers/anomalies. It is generated by applying 334 | ``threshold_`` on ``decision_scores_``. 335 | 336 | """ 337 | 338 | def __init__(self, base_estimator=None, n_estimators=10, contamination=0.1, 339 | max_features=1.0, bootstrap_features=False, 340 | check_estimator=True, n_jobs=1, random_state=None, 341 | combination='average', verbose=0, estimator_params=None): 342 | 343 | super(FeatureBagging, self).__init__(contamination=contamination) 344 | self.base_estimator = base_estimator 345 | self.n_estimators = n_estimators 346 | self.max_features = max_features 347 | self.bootstrap_features = bootstrap_features 348 | self.check_estimator = check_estimator 349 | self.combination = combination 350 | self.n_jobs = n_jobs 351 | self.random_state = random_state 352 | self.verbose = verbose 353 | if estimator_params is not None: 354 | self.estimator_params = estimator_params 355 | else: 356 | self.estimator_params = {} 357 | 358 | def fit(self, X, y=None): 359 | """Fit detector. y is optional for unsupervised methods. 360 | 361 | Parameters 362 | ---------- 363 | X : numpy array of shape (n_samples, n_features) 364 | The input samples. 365 | 366 | y : numpy array of shape (n_samples,), optional (default=None) 367 | The ground truth of the input samples (labels). 368 | """ 369 | random_state = check_random_state(self.random_state) 370 | 371 | X = check_array(X) 372 | self.n_samples_, self.n_features_ = X.shape[0], X.shape[1] 373 | 374 | self._set_n_classes(y) 375 | 376 | # expect at least 2 features, does not make sense if only have 377 | # 1 feature 378 | check_parameter(self.n_features_, low=2, include_left=True, 379 | param_name='n_features') 380 | 381 | # check parameters 382 | self._validate_estimator(default=LOF(n_jobs=self.n_jobs)) 383 | 384 | # use at least half of the features 385 | self.min_features_ = int(0.5 * self.n_features_) 386 | 387 | # Validate max_features 388 | if isinstance(self.max_features, (numbers.Integral, np.integer)): 389 | self.max_features_ = self.max_features 390 | else: # float 391 | self.max_features_ = int(self.max_features * self.n_features_) 392 | 393 | # min_features and max_features could equal 394 | check_parameter(self.max_features_, low=self.min_features_, 395 | param_name='max_features', high=self.n_features_, 396 | include_left=True, include_right=True) 397 | 398 | self.estimators_ = [] 399 | self.estimators_features_ = [] 400 | 401 | n_more_estimators = self.n_estimators - len(self.estimators_) 402 | 403 | if n_more_estimators < 0: 404 | raise ValueError('n_estimators=%d must be larger or equal to ' 405 | 'len(estimators_)=%d when warm_start==True' 406 | % (self.n_estimators, len(self.estimators_))) 407 | 408 | seeds = random_state.randint(MAX_INT, size=n_more_estimators) 409 | self._seeds = seeds 410 | 411 | for i in range(self.n_estimators): 412 | random_state = np.random.RandomState(seeds[i]) 413 | 414 | # max_features is incremented by one since random 415 | # function is [min_features, max_features) 416 | features = generate_bagging_indices(random_state, 417 | self.bootstrap_features, 418 | self.n_features_, 419 | self.min_features_, 420 | self.max_features_ + 1) 421 | # initialize and append estimators 422 | estimator = self._make_estimator(append=False, 423 | random_state=random_state) 424 | estimator.fit(X[:, features]) 425 | 426 | self.estimators_.append(estimator) 427 | self.estimators_features_.append(features) 428 | 429 | # decision score matrix from all estimators 430 | all_decision_scores = self._get_decision_scores() 431 | 432 | if self.combination == 'average': 433 | self.decision_scores_ = average(all_decision_scores) 434 | else: 435 | self.decision_scores_ = maximization(all_decision_scores) 436 | 437 | self._process_decision_scores() 438 | 439 | return self 440 | 441 | def decision_function(self, X): 442 | """Predict raw anomaly score of X using the fitted detector. 443 | 444 | The anomaly score of an input sample is computed based on different 445 | detector algorithms. For consistency, outliers are assigned with 446 | larger anomaly scores. 447 | 448 | Parameters 449 | ---------- 450 | X : numpy array of shape (n_samples, n_features) 451 | The training input samples. Sparse matrices are accepted only 452 | if they are supported by the base estimator. 453 | 454 | Returns 455 | ------- 456 | anomaly_scores : numpy array of shape (n_samples,) 457 | The anomaly score of the input samples. 458 | """ 459 | check_is_fitted(self, ['estimators_', 'estimators_features_', 460 | 'decision_scores_', 'threshold_', 'labels_']) 461 | X = check_array(X) 462 | 463 | if self.n_features_ != X.shape[1]: 464 | raise ValueError("Number of features of the model must " 465 | "match the input. Model n_features is {0} and " 466 | "input n_features is {1}." 467 | "".format(self.n_features_, X.shape[1])) 468 | 469 | # Parallel loop 470 | # n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators, 471 | # self.n_jobs) 472 | # all_pred_scores = Parallel(n_jobs=n_jobs, verbose=self.verbose)( 473 | # delayed(_parallel_decision_function)( 474 | # self.estimators_[starts[i]:starts[i + 1]], 475 | # self.estimators_features_[starts[i]:starts[i + 1]], 476 | # X) 477 | # for i in range(n_jobs)) 478 | # 479 | # # Reduce 480 | # all_pred_scores = np.concatenate(all_pred_scores, axis=1) 481 | all_pred_scores = self._predict_decision_scores(X) 482 | 483 | if self.combination == 'average': 484 | return average(all_pred_scores) 485 | else: 486 | return maximization(all_pred_scores) 487 | 488 | def _predict_decision_scores(self, X): 489 | all_pred_scores = np.zeros([X.shape[0], self.n_estimators]) 490 | for i in range(self.n_estimators): 491 | features = self.estimators_features_[i] 492 | all_pred_scores[:, i] = self.estimators_[i].decision_function( 493 | X[:, features]) 494 | return all_pred_scores 495 | 496 | def _get_decision_scores(self): 497 | all_decision_scores = np.zeros([self.n_samples_, self.n_estimators]) 498 | for i in range(self.n_estimators): 499 | all_decision_scores[:, i] = self.estimators_[i].decision_scores_ 500 | return all_decision_scores 501 | 502 | def _validate_estimator(self, default=None): 503 | """Check the estimator and the n_estimator attribute, set the 504 | `base_estimator_` attribute.""" 505 | if not isinstance(self.n_estimators, (numbers.Integral, np.integer)): 506 | raise ValueError("n_estimators must be an integer, " 507 | "got {0}.".format(type(self.n_estimators))) 508 | 509 | if self.n_estimators <= 0: 510 | raise ValueError("n_estimators must be greater than zero, " 511 | "got {0}.".format(self.n_estimators)) 512 | 513 | if self.base_estimator is not None: 514 | self.base_estimator_ = self.base_estimator 515 | else: 516 | self.base_estimator_ = default 517 | 518 | if self.base_estimator_ is None: 519 | raise ValueError("base_estimator cannot be None") 520 | 521 | # make sure estimator is consistent with sklearn 522 | if self.check_estimator: 523 | check_estimator(self.base_estimator_) 524 | 525 | def _make_estimator(self, append=True, random_state=None): 526 | """Make and configure a copy of the `base_estimator_` attribute. 527 | 528 | sklearn/base.py 529 | 530 | Warning: This method should be used to properly instantiate new 531 | sub-estimators. 532 | """ 533 | 534 | # TODO: add a check for estimator_param 535 | estimator = clone(self.base_estimator_) 536 | estimator.set_params(**self.estimator_params) 537 | 538 | if random_state is not None: 539 | _set_random_states(estimator, random_state) 540 | 541 | if append: 542 | self.estimators_.append(estimator) 543 | 544 | return estimator 545 | 546 | def __len__(self): 547 | """Returns the number of estimators in the ensemble.""" 548 | return len(self.estimators_) 549 | 550 | def __getitem__(self, index): 551 | """Returns the index'th estimator in the ensemble.""" 552 | return self.estimators_[index] 553 | 554 | def __iter__(self): 555 | """Returns iterator over estimators in the ensemble.""" 556 | return iter(self.estimators_) 557 | --------------------------------------------------------------------------------