├── models
    ├── __init__.py
    ├── sklearn_base.py
    ├── knn.py
    ├── combination.py
    ├── lof.py
    ├── base.py
    └── feature_bagging.py
├── utils
    ├── __init__.py
    ├── stat_models.py
    └── utility.py
├── datasets
    └── cardio.mat
├── figs
    └── flowchart2.png
├── requirements.txt
├── results
    ├── cardio_lof_20181006_152659.txt
    └── cardio_lof_20181006_152659.csv
├── demo_lof.py
└── README.md


/models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/datasets/cardio.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzhao062/LSCP/HEAD/datasets/cardio.mat


--------------------------------------------------------------------------------
/figs/flowchart2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzhao062/LSCP/HEAD/figs/flowchart2.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.13
2 | numba>=0.35
3 | scipy>=0.19.1
4 | scikit_learn>=0.19.1
5 | 


--------------------------------------------------------------------------------
/results/cardio_lof_20181006_152659.txt:
--------------------------------------------------------------------------------
 1 | 
 2 |  n_ite: 20
 3 |  test_size: 0.4
 4 |  n_baselines: 10
 5 | 
 6 |  loc_region_perc: 0.1
 7 |  loc_region_ite: 20
 8 |  loc_region_threshold: 10
 9 |  loc_min_features: 0.5
10 |  loc_region_size: 100
11 |  loc_region_min: 30
12 |  loc_region_max: 100
13 | 
14 |  n_clf: 50
15 |  k_min: 5
16 |  k_max: 200
17 |  n_bins: 10
18 |  n_selected: 1
19 |  n_buckets: 5
20 |  execution_time: 180.2803671360016


--------------------------------------------------------------------------------
/results/cardio_lof_20181006_152659.csv:
--------------------------------------------------------------------------------
 1 | method, roc, best_roc, diff_roc,ap, best_ap, diff_ap,best roc, best ap
 2 | GG_a,0.9061,0.9241,1.9865,0.4282,0.4942,15.4134,SCP_aom,SCP_aom
 3 | GG_m,0.8829,0.9241,4.6664,0.4016,0.4942,23.0578,SCP_aom,SCP_aom
 4 | GG_wa,0.9075,0.9241,1.8292,0.4326,0.4942,14.2395,SCP_aom,SCP_aom
 5 | GG_thresh,0.9099,0.9241,1.5606,0.438,0.4942,12.8311,SCP_aom,SCP_aom
 6 | GG_aom,0.9102,0.9241,1.5271,0.4558,0.4942,8.4247,SCP_aom,SCP_aom
 7 | GG_moa,0.9138,0.9241,1.1272,0.4513,0.4942,9.5059,SCP_aom,SCP_aom
 8 | SCP_a,0.8974,0.9241,2.9753,0.4089,0.4942,20.8608,SCP_aom,SCP_aom
 9 | SCP_moa,0.9158,0.9241,0.9063,0.473,0.4942,4.482,SCP_aom,SCP_aom
10 | SCP_m,0.8057,0.9241,14.6953,0.3291,0.4942,50.1671,SCP_aom,SCP_aom
11 | SCP_aom,0.9241,0.9241,0.0,0.4942,0.4942,0.0,SCP_aom,SCP_aom


--------------------------------------------------------------------------------
/utils/stat_models.py:
--------------------------------------------------------------------------------
  1 | from numba import njit
  2 | import numpy as np
  3 | from scipy.special import betainc
  4 | 
  5 | 
  6 | # from scipy.stats import pearsonr
  7 | 
  8 | def pearsonr(x, y):
  9 |     """ Calculate Pearson Correlation between x and y
 10 |     :param x:
 11 |     :param y:
 12 |     :return:
 13 |     """
 14 |     x = np.asarray(x)
 15 |     y = np.asarray(y)
 16 | 
 17 |     # if np.unique(x).shape[0] == 1 or np.unique(y).shape[0] == 1:
 18 |     #     return 1.0
 19 |     r = pearsonr_helper(x, y)
 20 | 
 21 |     #    Presumably, if abs(r) > 1, then it is only some small artifact of
 22 |     #    floating point arithmetic.
 23 |     r = max(min(r, 1.0), -1.0)
 24 |     return r
 25 | 
 26 | 
 27 | @njit
 28 | def pearsonr_helper(x, y):
 29 |     """ Optimized version for Pearson correlation calculation
 30 |     :param x:
 31 |     :param y:
 32 |     :return:
 33 |     """
 34 |     # x and y should have same length.
 35 |     #    n = len(x)
 36 |     mx = x.mean()
 37 |     my = y.mean()
 38 |     xm, ym = x - mx, y - my
 39 |     # r_num = np.add.reduce(xm * ym)
 40 | 
 41 |     r_num = np.sum(xm * ym)
 42 |     r_den = np.sqrt(np.sum(xm * xm, axis=0) * np.sum(ym * ym, axis=0))
 43 | 
 44 |     # only use in case of overflow
 45 |     # if r_den == 0:
 46 |     #     return 1
 47 |     r = r_num / r_den
 48 | 
 49 |     return r
 50 | 
 51 | 
 52 | def wpearsonr(x, y, w=None):
 53 |     """ Weighted Pearson Correlation
 54 |     :param x:
 55 |     :param y:
 56 |     :param w:
 57 |     :return:
 58 |     """
 59 |     # https://stats.stackexchange.com/questions/221246/such-thing-as-a-weighted-correlation
 60 | 
 61 |     # unweighted version
 62 |     if w is None:
 63 |         return pearsonr(x, y)
 64 | 
 65 |     x = np.asarray(x)
 66 |     y = np.asarray(y)
 67 |     w = np.asarray(w)
 68 | 
 69 |     n = len(x)
 70 | 
 71 |     w_sum = w.sum()
 72 |     mx = np.sum(x * w) / w_sum
 73 |     my = np.sum(y * w) / w_sum
 74 | 
 75 |     xm, ym = (x - mx), (y - my)
 76 | 
 77 |     r_num = np.sum(xm * ym * w) / w_sum
 78 | 
 79 |     xm2 = np.sum(xm * xm * w) / w_sum
 80 |     ym2 = np.sum(ym * ym * w) / w_sum
 81 | 
 82 |     r_den = np.sqrt(xm2 * ym2)
 83 |     r = r_num / r_den
 84 | 
 85 |     r = max(min(r, 1.0), -1.0)
 86 |     #    df = n - 2
 87 |     #
 88 |     #    if abs(r) == 1.0:
 89 |     #        prob = 0.0
 90 |     #    else:
 91 |     #        t_squared = r ** 2 * (df / ((1.0 - r) * (1.0 + r)))
 92 |     #        prob = _betai(0.5 * df, 0.5, df / (df + t_squared))
 93 |     return r  # , prob
 94 | 
 95 | 
 96 | #####################################
 97 | #      PROBABILITY CALCULATIONS     #
 98 | #####################################
 99 | 
100 | 
def _betai(a, b, x):
    """ Evaluate scipy's ``betainc(a, b, x)`` with x capped at 1.0.

    :param a: first parameter forwarded to ``betainc``
    :param b: second parameter forwarded to ``betainc``
    :param x: evaluation point(s); scalar or array-like
    :return: ``betainc`` evaluated at the capped points
    """
    points = np.asarray(x)
    # every entry that is not strictly below 1.0 is replaced by exactly 1.0
    capped = np.where(points < 1.0, points, 1.0)
    return betainc(a, b, capped)
105 | 
106 | 
def pearsonr_mat(mat, w=None):
    """ Pairwise Pearson correlation between the rows of a matrix.

    :param mat: 2-d numpy array; every row is treated as one sample vector
    :param w: optional weights (one per column of ``mat``). When given, the
        weighted correlation ``wpearsonr`` is used, otherwise ``pearsonr``.
    :return: symmetric float array of shape (n_row, n_row) with ones on the
        diagonal and the correlation of rows i and j at [i, j] and [j, i]
    """
    n_row = mat.shape[0]
    pear_mat = np.ones((n_row, n_row), dtype=float)

    # Only the upper triangle is computed; the result is mirrored.
    for cx in range(n_row):
        for cy in range(cx + 1, n_row):
            if w is None:
                # pearsonr returns a plain scalar, not an (r, p) tuple
                curr_pear = pearsonr(mat[cx, :], mat[cy, :])
            else:
                curr_pear = wpearsonr(mat[cx, :], mat[cy, :], w)
            pear_mat[cx, cy] = curr_pear
            pear_mat[cy, cx] = curr_pear

    return pear_mat
126 | 


--------------------------------------------------------------------------------
/models/sklearn_base.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """Utility function copied over from sklearn/base.py
  3 | """
  4 | # Author: Yue Zhao <yuezhao@cs.toronto.edu>
  5 | # License: BSD 2 clause
  6 | 
  7 | from __future__ import division
  8 | from __future__ import print_function
  9 | 
 10 | import numpy as np
 11 | from sklearn.externals import six
 12 | from sklearn.externals.joblib import cpu_count
 13 | 
 14 | 
 15 | def _get_n_jobs(n_jobs):
 16 |     """Get number of jobs for the computation.
 17 |     See sklearn/utils/__init__.py for more information.
 18 | 
 19 |     This function reimplements the logic of joblib to determine the actual
 20 |     number of jobs depending on the cpu count. If -1 all CPUs are used.
 21 |     If 1 is given, no parallel computing code is used at all, which is useful
 22 |     for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
 23 |     Thus for n_jobs = -2, all CPUs but one are used.
 24 |     Parameters
 25 |     ----------
 26 |     n_jobs : int
 27 |         Number of jobs stated in joblib convention.
 28 |     Returns
 29 |     -------
 30 |     n_jobs : int
 31 |         The actual number of jobs as positive integer.
 32 |     Examples
 33 |     --------
 34 |     >>> from sklearn.utils import _get_n_jobs
 35 |     >>> _get_n_jobs(4)
 36 |     4
 37 |     >>> jobs = _get_n_jobs(-2)
 38 |     >>> assert jobs == max(cpu_count() - 1, 1)
 39 |     >>> _get_n_jobs(0)
 40 |     Traceback (most recent call last):
 41 |     ...
 42 |     ValueError: Parameter n_jobs == 0 has no meaning.
 43 |     """
 44 |     if n_jobs < 0:
 45 |         return max(cpu_count() + 1 + n_jobs, 1)
 46 |     elif n_jobs == 0:
 47 |         raise ValueError('Parameter n_jobs == 0 has no meaning.')
 48 |     else:
 49 |         return n_jobs
 50 | 
 51 | 
 52 | def _partition_estimators(n_estimators, n_jobs):
 53 |     """Private function used to partition estimators between jobs.
 54 |     See sklearn/ensemble/base.py for more information.
 55 |     """
 56 |     # Compute the number of jobs
 57 |     n_jobs = min(_get_n_jobs(n_jobs), n_estimators)
 58 | 
 59 |     # Partition estimators between jobs
 60 |     n_estimators_per_job = (n_estimators // n_jobs) * np.ones(n_jobs,
 61 |                                                               dtype=np.int)
 62 |     n_estimators_per_job[:n_estimators % n_jobs] += 1
 63 |     starts = np.cumsum(n_estimators_per_job)
 64 | 
 65 |     return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()
 66 | 
 67 | 
 68 | def _pprint(params, offset=0, printer=repr):
 69 |     # noinspection PyPep8
 70 |     """Pretty print the dictionary 'params'
 71 | 
 72 |     See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
 73 |     and sklearn/base.py for more information.
 74 | 
 75 |     :param params: The dictionary to pretty print
 76 |     :type params: dict
 77 | 
 78 |     :param offset: The offset in characters to add at the begin of each line.
 79 |     :type offset: int
 80 | 
 81 |     :param printer: The function to convert entries to strings, typically
 82 |         the builtin str or repr
 83 |     :type printer: callable
 84 | 
 85 |     :return: None
 86 |     """
 87 | 
 88 |     # Do a multi-line justified repr:
 89 |     options = np.get_printoptions()
 90 |     np.set_printoptions(precision=5, threshold=64, edgeitems=2)
 91 |     params_list = list()
 92 |     this_line_length = offset
 93 |     line_sep = ',\n' + (1 + offset // 2) * ' '
 94 |     for i, (k, v) in enumerate(sorted(six.iteritems(params))):
 95 |         if type(v) is float:
 96 |             # use str for representing floating point numbers
 97 |             # this way we get consistent representation across
 98 |             # architectures and versions.
 99 |             this_repr = '%s=%s' % (k, str(v))
100 |         else:
101 |             # use repr of the rest
102 |             this_repr = '%s=%s' % (k, printer(v))
103 |         if len(this_repr) > 500:
104 |             this_repr = this_repr[:300] + '...' + this_repr[-100:]
105 |         if i > 0:
106 |             if this_line_length + len(this_repr) >= 75 or '\n' in this_repr:
107 |                 params_list.append(line_sep)
108 |                 this_line_length = len(line_sep)
109 |             else:
110 |                 params_list.append(', ')
111 |                 this_line_length += 2
112 |         params_list.append(this_repr)
113 |         this_line_length += len(this_repr)
114 | 
115 |     np.set_printoptions(**options)
116 |     lines = ''.join(params_list)
117 |     # Strip trailing space to avoid nightmare in doctests
118 |     lines = '\n'.join(l.rstrip(' ') for l in lines.split('\n'))
119 |     return lines
120 | 


--------------------------------------------------------------------------------
/models/knn.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from sklearn.preprocessing import MinMaxScaler
  3 | from sklearn.neighbors import NearestNeighbors
  4 | from sklearn.neighbors import KDTree
  5 | from sklearn.exceptions import NotFittedError
  6 | from scipy.stats import scoreatpercentile
  7 | from scipy.stats import rankdata
  8 | from scipy.special import erf
  9 | 
 10 | 
 11 | class Knn(object):
 12 |     '''
 13 |     Knn class for outlier detection
 14 |     support original knn, average knn, and median knn
 15 |     '''
 16 | 
 17 |     def __init__(self, n_neighbors=1, contamination=0.05, method='largest'):
 18 |         self.n_neighbors = n_neighbors
 19 |         self.contamination = contamination
 20 |         self.method = method
 21 | 
 22 |     def fit(self, X_train):
 23 |         self.X_train = X_train
 24 |         self._isfitted = True
 25 |         self.tree = KDTree(X_train)
 26 | 
 27 |         neigh = NearestNeighbors(n_neighbors=self.n_neighbors)
 28 |         neigh.fit(self.X_train)
 29 | 
 30 |         result = neigh.kneighbors(n_neighbors=self.n_neighbors,
 31 |                                   return_distance=True)
 32 |         dist_arr = result[0]
 33 | 
 34 |         if self.method == 'largest':
 35 |             dist = dist_arr[:, -1]
 36 |         elif self.method == 'mean':
 37 |             dist = np.mean(dist_arr, axis=1)
 38 |         elif self.method == 'median':
 39 |             dist = np.median(dist_arr, axis=1)
 40 | 
 41 |         self.threshold = scoreatpercentile(dist,
 42 |                                            100 * (1 - self.contamination))
 43 |         self.decision_scores = dist.ravel()
 44 |         self.y_pred = (self.decision_scores > self.threshold).astype('int')
 45 | 
 46 |         self.mu = np.mean(self.decision_scores)
 47 |         self.sigma = np.std(self.decision_scores)
 48 | 
 49 |     def decision_function(self, X_test):
 50 | 
 51 |         if not self._isfitted:
 52 |             NotFittedError('Knn is not fitted yet')
 53 | 
 54 |         # initialize the output score
 55 |         pred_score = np.zeros([X_test.shape[0], 1])
 56 | 
 57 |         for i in range(X_test.shape[0]):
 58 |             x_i = X_test[i, :]
 59 |             x_i = np.asarray(x_i).reshape(1, x_i.shape[0])
 60 | 
 61 |             # get the distance of the current point
 62 |             dist_arr, ind_arr = self.tree.query(x_i, k=self.n_neighbors)
 63 | 
 64 |             if self.method == 'largest':
 65 |                 dist = dist_arr[:, -1]
 66 |             elif self.method == 'mean':
 67 |                 dist = np.mean(dist_arr, axis=1)
 68 |             elif self.method == 'median':
 69 |                 dist = np.median(dist_arr, axis=1)
 70 | 
 71 |             pred_score_i = dist[-1]
 72 | 
 73 |             # record the current item
 74 |             pred_score[i, :] = pred_score_i
 75 | 
 76 |         return pred_score
 77 | 
 78 |     def predict(self, X_test):
 79 |         pred_score = self.decision_function(X_test)
 80 |         return (pred_score > self.threshold).astype('int')
 81 | 
 82 |     def predict_proba(self, X_test, method='linear'):
 83 |         test_scores = self.decision_function(X_test)
 84 |         train_scores = self.decision_scores
 85 | 
 86 |         if method == 'linear':
 87 |             scaler = MinMaxScaler().fit(train_scores.reshape(-1, 1))
 88 |             proba = scaler.transform(test_scores.reshape(-1, 1))
 89 |             return proba.clip(0, 1)
 90 |         else:
 91 |             # turn output into probability
 92 |             pre_erf_score = (test_scores - self.mu) / (self.sigma * np.sqrt(2))
 93 |             erf_score = erf(pre_erf_score)
 94 |             proba = erf_score.clip(0)
 95 | 
 96 |             # TODO: move to testing code
 97 |             assert (proba.min() >= 0)
 98 |             assert (proba.max() <= 1)
 99 |             return proba
100 | 
101 |     def predict_rank(self, X_test):
102 |         test_scores = self.decision_function(X_test)
103 |         train_scores = self.decision_scores
104 | 
105 |         ranks = np.zeros([X_test.shape[0], 1])
106 | 
107 |         for i in range(test_scores.shape[0]):
108 |             train_scores_i = np.append(train_scores.reshape(-1, 1),
109 |                                        test_scores[i])
110 | 
111 |             ranks[i] = rankdata(train_scores_i)[-1]
112 | 
113 |         # return normalized ranks
114 |         ranks_norm = ranks / ranks.max()
115 |         return ranks_norm
116 | 
117 | ##############################################################################
118 | # samples = [[-1, 0], [0., 0.], [1., 1], [2., 5.], [3, 1]]
119 | #
120 | # clf = Knn()
121 | # clf.fit(samples)
122 | #
123 | # scores = clf.decision_function(np.asarray([[2, 3], [6, 8]])).ravel()
124 | # assert (scores[0] == [2])
125 | # assert (scores[1] == [5])
126 | # #
127 | # labels = clf.predict(np.asarray([[2, 3], [6, 8]])).ravel()
128 | # assert (labels[0] == [0])
129 | # assert (labels[1] == [1])
130 | 


--------------------------------------------------------------------------------
/models/combination.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from sklearn.utils.validation import check_array
  3 | from sklearn.utils.validation import column_or_1d
  4 | from sklearn.utils.testing import assert_equal
  5 | 
  6 | 
  7 | def aom(scores, n_buckets, n_estimators, standard=True):
  8 |     '''
  9 |     Average of Maximum - An ensemble method for outlier detection
 10 | 
 11 |     Aggarwal, C.C. and Sathe, S., 2015. Theoretical foundations and algorithms
 12 |     for outlier ensembles. ACM SIGKDD Explorations Newsletter, 17(1), pp.24-47.
 13 | 
 14 |     :param scores:
 15 |     :param n_buckets:
 16 |     :param n_estimators:
 17 |     :param standard:
 18 |     :return:
 19 |     '''
 20 |     scores = np.asarray(scores)
 21 |     if scores.shape[1] != n_estimators:
 22 |         raise ValueError('score matrix should be n_samples by n_estimaters')
 23 | 
 24 |     scores_aom = np.zeros([scores.shape[0], n_buckets])
 25 | 
 26 |     n_estimators_per_bucket = int(n_estimators / n_buckets)
 27 |     if n_estimators % n_buckets != 0:
 28 |         Warning('n_estimators / n_buckets leads to a remainder')
 29 | 
 30 |     # shuffle the estimator order
 31 |     estimators_list = list(range(0, n_estimators, 1))
 32 |     np.random.shuffle(estimators_list)
 33 | 
 34 |     head = 0
 35 |     for i in range(0, n_estimators, n_estimators_per_bucket):
 36 |         tail = i + n_estimators_per_bucket
 37 |         batch_ind = int(i / n_estimators_per_bucket)
 38 | 
 39 |         scores_aom[:, batch_ind] = np.max(
 40 |             scores[:, estimators_list[head:tail]], axis=1)
 41 | 
 42 |         head = head + n_estimators_per_bucket
 43 | 
 44 |     return np.mean(scores_aom, axis=1)
 45 | 
 46 | 
 47 | def moa(scores, n_buckets, n_estimators):
 48 |     '''
 49 |     Maximum of Average - An ensemble method for outlier detection
 50 | 
 51 |     Aggarwal, C.C. and Sathe, S., 2015. Theoretical foundations and algorithms
 52 |     for outlier ensembles. ACM SIGKDD Explorations Newsletter, 17(1), pp.24-47.
 53 | 
 54 |     :param scores:
 55 |     :param n_buckets:
 56 |     :param n_estimators:
 57 |     :param standard:
 58 |     :return:
 59 |     '''
 60 |     scores = np.asarray(scores)
 61 |     if scores.shape[1] != n_estimators:
 62 |         raise ValueError('score matrix should be n_samples by n_estimaters')
 63 | 
 64 |     scores_moa = np.zeros([scores.shape[0], n_buckets])
 65 | 
 66 |     n_estimators_per_bucket = int(n_estimators / n_buckets)
 67 |     if n_estimators % n_buckets != 0:
 68 |         Warning('n_estimators / n_buckets leads to a remainder')
 69 | 
 70 |     # shuffle the estimator order
 71 |     estimators_list = list(range(0, n_estimators, 1))
 72 |     np.random.shuffle(estimators_list)
 73 | 
 74 |     head = 0
 75 |     for i in range(0, n_estimators, n_estimators_per_bucket):
 76 |         tail = i + n_estimators_per_bucket
 77 |         batch_ind = int(i / n_estimators_per_bucket)
 78 | 
 79 |         scores_moa[:, batch_ind] = np.mean(
 80 |             scores[:, estimators_list[head:tail]], axis=1)
 81 | 
 82 |         head = head + n_estimators_per_bucket
 83 | 
 84 |     return np.max(scores_moa, axis=1)
 85 | 
 86 | 
 87 | def average(scores, estimator_weight=None):
 88 |     """Combination method to merge the outlier scores from multiple estimators
 89 |     by taking the average.
 90 | 
 91 |     Parameters
 92 |     ----------
 93 |     scores : numpy array of shape (n_samples, n_estimators)
 94 |         Score matrix from multiple estimators on the same samples.
 95 | 
 96 |     estimator_weight : list of shape (1, n_estimators)
 97 |         If specified, using weighted average
 98 | 
 99 |     Returns
100 |     -------
101 |     combined_scores : numpy array of shape (n_samples, )
102 |         The combined outlier scores.
103 | 
104 |     """
105 |     scores = check_array(scores)
106 | 
107 |     if estimator_weight is not None:
108 |         estimator_weight = column_or_1d(estimator_weight).reshape(1, -1)
109 |         assert_equal(scores.shape[1], estimator_weight.shape[1])
110 | 
111 |         # (d1*w1 + d2*w2 + ...+ dn*wn)/(w1+w2+...+wn)
112 |         # generated weighted scores
113 |         scores = np.sum(np.multiply(scores, estimator_weight),
114 |                         axis=1) / np.sum(
115 |             estimator_weight)
116 |         return scores.ravel()
117 | 
118 |     else:
119 |         return np.mean(scores, axis=1).ravel()
120 | 
121 | 
def maximization(scores):
    """Merge the outlier scores of several estimators into one score per
    sample by keeping the largest score in each row.

    Parameters
    ----------
    scores : numpy array of shape (n_samples, n_estimators)
        Score matrix from multiple estimators on the same samples.

    Returns
    -------
    combined_scores : numpy array of shape (n_samples, )
        The combined outlier scores.

    """
    validated = check_array(scores)
    row_maxima = np.max(validated, axis=1)
    return row_maxima.ravel()
140 | 


--------------------------------------------------------------------------------
/models/lof.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """Local Outlier Factor (LOF). Implemented on scikit-learn library.
  3 | """
  4 | # Author: Yue Zhao <yuezhao@cs.toronto.edu>
  5 | # License: BSD 2 clause
  6 | 
  7 | from __future__ import division
  8 | from __future__ import print_function
  9 | 
 10 | import sklearn
 11 | from sklearn.neighbors import LocalOutlierFactor
 12 | from sklearn.utils.validation import check_is_fitted
 13 | from sklearn.utils.validation import check_array
 14 | from sklearn.utils.validation import column_or_1d
 15 | 
 16 | from .base import BaseDetector
 17 | 
 18 | 
 19 | def invert_order(scores, method='multiplication'):
 20 |     """ Invert the order of a list of values. The smallest value becomes
 21 |     the largest in the inverted list. This is useful while combining
 22 |     multiple detectors since their score order could be different.
 23 | 
 24 |     Parameters
 25 |     ----------
 26 |     scores : list, array or numpy array with shape (n_samples,)
 27 |         The list of values to be inverted
 28 | 
 29 |     method : str, optional (default='multiplication')
 30 |         Methods used for order inversion. Valid methods are:
 31 | 
 32 |         - 'multiplication': multiply by -1
 33 |         - 'subtraction': max(scores) - scores
 34 | 
 35 |     Returns
 36 |     -------
 37 |     inverted_scores : numpy array of shape (n_samples,)
 38 |         The inverted list
 39 | 
 40 |     Examples
 41 |     --------
 42 |     >>> scores1 = [0.1, 0.3, 0.5, 0.7, 0.2, 0.1]
 43 |     >>> invert_order(scores1)
 44 |     >>> array[-0.1, -0.3, -0.5, -0.7, -0.2, -0.1]
 45 |     >>> invert_order(scores1, method='subtraction')
 46 |     >>> array[0.6, 0.4, 0.2, 0, 0.5, 0.6]
 47 |     """
 48 | 
 49 |     scores = column_or_1d(scores)
 50 | 
 51 |     if method == 'multiplication':
 52 |         return scores.ravel() * -1
 53 | 
 54 |     if method == 'subtraction':
 55 |         return (scores.max() - scores).ravel()
 56 | 
 57 | 
 58 | def _sklearn_version_20():
 59 |     """ Utility function to decide the version of sklearn
 60 |     In sklearn 20.0, LOF is changed. Specifically, _decision_function
 61 |     is replaced by _score_samples
 62 | 
 63 |     Returns
 64 |     -------
 65 |     sklearn_20_flag : bool
 66 |         True if sklearn.__version__ is newer than 0.20.0
 67 | 
 68 |     """
 69 |     sklearn_version = str(sklearn.__version__)
 70 |     if int(sklearn_version.split(".")[1]) > 19:
 71 |         return True
 72 |     else:
 73 |         return False
 74 | 
 75 | 
 76 | class LOF(BaseDetector):
 77 |     """Wrapper of scikit-learn LOF Class with more functionalities.
 78 |     Unsupervised Outlier Detection using Local Outlier Factor (LOF).
 79 | 
 80 |     The anomaly score of each sample is called Local Outlier Factor.
 81 |     It measures the local deviation of density of a given sample with
 82 |     respect to its neighbors.
 83 |     It is local in that the anomaly score depends on how isolated the object
 84 |     is with respect to the surrounding neighborhood.
 85 |     More precisely, locality is given by k-nearest neighbors, whose distance
 86 |     is used to estimate the local density.
 87 |     By comparing the local density of a sample to the local densities of
 88 |     its neighbors, one can identify samples that have a substantially lower
 89 |     density than their neighbors. These are considered outliers.
 90 |     See :cite:`breunig2000lof` for details.
 91 | 
 92 |     Parameters
 93 |     ----------
 94 |     n_neighbors : int, optional (default=20)
 95 |         Number of neighbors to use by default for `kneighbors` queries.
 96 |         If n_neighbors is larger than the number of samples provided,
 97 |         all samples will be used.
 98 | 
 99 |     algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
100 |         Algorithm used to compute the nearest neighbors:
101 | 
102 |         - 'ball_tree' will use BallTree
103 |         - 'kd_tree' will use KDTree
104 |         - 'brute' will use a brute-force search.
105 |         - 'auto' will attempt to decide the most appropriate algorithm
106 |           based on the values passed to :meth:`fit` method.
107 | 
108 |         Note: fitting on sparse input will override the setting of
109 |         this parameter, using brute force.
110 | 
111 |     leaf_size : int, optional (default=30)
112 |         Leaf size passed to `BallTree` or `KDTree`. This can
113 |         affect the speed of the construction and query, as well as the memory
114 |         required to store the tree. The optimal value depends on the
115 |         nature of the problem.
116 | 
117 |     metric : string or callable, default 'minkowski'
118 |         metric used for the distance computation. Any metric from scikit-learn
119 |         or scipy.spatial.distance can be used.
120 | 
121 |         If 'precomputed', the training input X is expected to be a distance
122 |         matrix.
123 | 
124 |         If metric is a callable function, it is called on each
125 |         pair of instances (rows) and the resulting value recorded. The callable
126 |         should take two arrays as input and return one value indicating the
127 |         distance between them. This works for Scipy's metrics, but is less
128 |         efficient than passing the metric name as a string.
129 | 
130 |         Valid values for metric are:
131 | 
132 |         - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
133 |           'manhattan']
134 | 
135 |         - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
136 |           'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
137 |           'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
138 |           'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
139 |           'sqeuclidean', 'yule']
140 | 
141 |         See the documentation for scipy.spatial.distance for details on these
142 |         metrics:
143 |         http://docs.scipy.org/doc/scipy/reference/spatial.distance.html
144 | 
145 |     p : integer, optional (default = 2)
146 |         Parameter for the Minkowski metric from
147 |         sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
148 |         equivalent to using manhattan_distance (l1), and euclidean_distance
149 |         (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
150 |         See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances
151 | 
152 |     metric_params : dict, optional (default = None)
153 |         Additional keyword arguments for the metric function.
154 | 
155 |     contamination : float in (0., 0.5), optional (default=0.1)
156 |         The amount of contamination of the data set, i.e. the proportion
157 |         of outliers in the data set. When fitting this is used to define the
158 |         threshold on the decision function.
159 | 
160 |     n_jobs : int, optional (default = 1)
161 |         The number of parallel jobs to run for neighbors search.
162 |         If ``-1``, then the number of jobs is set to the number of CPU cores.
163 |         Affects only kneighbors and kneighbors_graph methods.
164 | 
165 |     Attributes
166 |     ----------
167 |     n_neighbors_ : int
168 |         The actual number of neighbors used for `kneighbors` queries.
169 | 
170 |     decision_scores_ : numpy array of shape (n_samples,)
171 |         The outlier scores of the training data.
172 |         The higher, the more abnormal. Outliers tend to have higher
173 |         scores. This value is available once the detector is
174 |         fitted.
175 | 
176 |     threshold_ : float
177 |         The threshold is based on ``contamination``. It is the
178 |         ``n_samples * contamination`` most abnormal samples in
179 |         ``decision_scores_``. The threshold is calculated for generating
180 |         binary outlier labels.
181 | 
182 |     labels_ : int, either 0 or 1
183 |         The binary labels of the training data. 0 stands for inliers
184 |         and 1 for outliers/anomalies. It is generated by applying
185 |         ``threshold_`` on ``decision_scores_``.
186 |     """
187 | 
188 |     def __init__(self, n_neighbors=20, algorithm='auto', leaf_size=30,
189 |                  metric='minkowski', p=2, metric_params=None,
190 |                  contamination=0.1, n_jobs=1):
191 |         super(LOF, self).__init__(contamination=contamination)
192 |         self.n_neighbors = n_neighbors
193 |         self.algorithm = algorithm
194 |         self.leaf_size = leaf_size
195 |         self.metric = metric
196 |         self.p = p
197 |         self.metric_params = metric_params
198 |         self.n_jobs = n_jobs
199 | 
200 |     # noinspection PyIncorrectDocstring
201 |     def fit(self, X, y=None):
202 |         """Fit detector. y is optional for unsupervised methods.
203 | 
204 |         Parameters
205 |         ----------
206 |         X : numpy array of shape (n_samples, n_features)
207 |             The input samples.
208 | 
209 |         y : numpy array of shape (n_samples,), optional (default=None)
210 |             The ground truth of the input samples (labels).
211 |         """
212 |         # validate inputs X and y (optional)
213 |         X = check_array(X)
214 |         self._set_n_classes(y)
215 | 
216 |         self.detector_ = LocalOutlierFactor(n_neighbors=self.n_neighbors,
217 |                                             algorithm=self.algorithm,
218 |                                             leaf_size=self.leaf_size,
219 |                                             metric=self.metric,
220 |                                             p=self.p,
221 |                                             metric_params=self.metric_params,
222 |                                             contamination=self.contamination,
223 |                                             n_jobs=self.n_jobs)
224 |         self.detector_.fit(X=X, y=y)
225 | 
226 |         # Invert decision_scores_. Outliers comes with higher outlier scores
227 |         self.decision_scores_ = invert_order(
228 |             self.detector_.negative_outlier_factor_)
229 |         self._process_decision_scores()
230 |         return self
231 | 
232 |     def decision_function(self, X):
233 |         """Predict raw anomaly score of X using the fitted detector.
234 | 
235 |         The anomaly score of an input sample is computed based on different
236 |         detector algorithms. For consistency, outliers are assigned with
237 |         larger anomaly scores.
238 | 
239 |         Parameters
240 |         ----------
241 |         X : numpy array of shape (n_samples, n_features)
242 |             The training input samples. Sparse matrices are accepted only
243 |             if they are supported by the base estimator.
244 | 
245 |         Returns
246 |         -------
247 |         anomaly_scores : numpy array of shape (n_samples,)
248 |             The anomaly score of the input samples.
249 |         """
250 | 
251 |         check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
252 | 
253 |         # Invert outlier scores. Outliers comes with higher outlier scores
254 |         # noinspection PyProtectedMember
255 |         if _sklearn_version_20():
256 |             return invert_order(self.detector_._score_samples(X))
257 |         else:
258 |             return invert_order(self.detector_._decision_function(X))
259 | 
260 |     @property
261 |     def n_neighbors_(self):
262 |         """The actual number of neighbors used for kneighbors queries.
263 |         Decorator for scikit-learn LOF attributes.
264 |         """
265 |         return self.detector_.n_neighbors_
266 | 


--------------------------------------------------------------------------------
/demo_lof.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import time
  3 | 
  4 | import numpy as np
  5 | 
  6 | from sklearn.model_selection import train_test_split
  7 | from sklearn.metrics import roc_auc_score
  8 | from sklearn.metrics import average_precision_score
  9 | 
 10 | from models.lof import LOF
 11 | from models.feature_bagging import FeatureBagging
 12 | from models.combination import aom, moa
 13 | from utils.stat_models import pearsonr
 14 | from utils.utility import get_local_region
 15 | from utils.utility import get_competent_detectors
 16 | from utils.utility import train_predict_lof, generate_bagging_indices
 17 | from utils.utility import print_save_result, save_script
 18 | from utils.utility import loaddata, precision_n_score, standardizer
 19 | 
# Capture the start timestamp once; it is later passed to save_script and
# print_save_result, which embed it in the output file names.
today = datetime.datetime.now()
timestamp = today.strftime("%Y%m%d_%H%M%S")

# print numpy arrays without scientific notation, 4 decimal places
np.set_printoptions(suppress=True, precision=4)

###############################################################################
# parameter settings

# dataset name; passed to loaddata(), which reads datasets/<data>.mat
data = 'cardio'
# data = 'letter'


base_detector = 'lof'  # used only for naming the output files
n_ite = 30  # number of independent train/test repetitions
test_size = 0.4  # training = 60%, testing = 40%
# The number of baseline algorithms, DO NOT CHANGE: the main loop appends
# exactly 11 entries to method_list and scores the first n_baselines of them.
n_baselines = 11

# reference pearson size:
# https://www.researchgate.net/post/What_is_the_minimum_sample_size_to_run_Pearsons_R
loc_region_size = 0  # placeholder; recomputed inside the main loop
loc_region_min = 30  # min local region size
loc_region_max = 100  # max local region size
###############################################################################
# adjustable parameters
loc_region_perc = 0.1  # local region size as a fraction of the training set
loc_region_ite = 20  # the number of iterations in defining local region
loc_region_threshold = int(loc_region_ite / 2)  # the threshold to keep a point
loc_min_features = 0.5  # the lower bound of the number of features to use

n_bins = 10  # histogram bins used by get_competent_detectors
n_selected = 1  # actually not a parameter to tweak

n_clf = 50  # number of base LOF detectors in the pool
k_min = 5  # lower bound (inclusive) for the random n_neighbors draws
k_max = 200  # upper bound (exclusive) for the draws; capped in the main loop

# for SG_AOM and SG_MOA, choose the right number of buckets
n_buckets = 5
# NOTE: n_clf_bucket is not referenced again in this script
n_clf_bucket = int(n_clf / n_buckets)
assert (n_clf % n_buckets == 0)  # in case wrong number of buckets

# flag for printing and output saving
verbose = True

# record of the n_neighbors drawn for the feature bagging detector,
# one entry per iteration
fb_n_neighbors = []
###############################################################################
 69 | 
# Experiment driver: for n_ite random train/test splits, train a pool of LOF
# detectors, build 11 combined score vectors (feature bagging, 6 global
# combinations, 4 LSCP variants), evaluate each with ROC-AUC and average
# precision, then save the parameters and the averaged results.
if __name__ == '__main__':

    start_time = time.time()
    X_orig, y_orig = loaddata(data)

    # initialize the matrix for storing scores
    # one row per iteration, one column per combination method
    roc_mat = np.zeros([n_ite, n_baselines])  # receiver operating curve
    ap_mat = np.zeros([n_ite, n_baselines])  # average precision

    for t in range(n_ite):
        print('\nn_ite', t + 1, data)  # print status

        # fresh, unseeded generator per iteration; it drives the split, the
        # n_neighbors draws and the local region feature sampling
        random_state = np.random.RandomState()

        # split the data into training and testing
        X_train, X_test, y_train, y_test = train_test_split(X_orig, y_orig,
                                                            test_size=test_size,
                                                            random_state=random_state)
        # in case of small datasets
        # (this rebinds the module-level k_max, so the cap persists across
        # iterations and is what save_script records at the end)
        if k_max > X_train.shape[0]:
            k_max = X_train.shape[0]
        # randint excludes the upper bound, so every k lies in [k_min, k_max)
        k_list = random_state.randint(k_min, k_max, size=n_clf).tolist()
        k_list.sort()

        # normalized the data
        X_train_norm, X_test_norm = standardizer(X_train, X_test)

        # score matrices: one row per sample, one column per base detector
        train_scores = np.zeros([X_train.shape[0], n_clf])
        test_scores = np.zeros([X_test.shape[0], n_clf])

        # initialized the list to store the results
        # test_target_list[i] is the test score vector of method_list[i]
        test_target_list = []
        method_list = []

        # generate a pool of detectors and predict on test instances
        train_scores, test_scores = train_predict_lof(k_list, X_train_norm,
                                                      X_test_norm,
                                                      train_scores,
                                                      test_scores)

        #######################################################################
        # fit feature bagging with a randomly drawn n_neighbors
        # (the median of k_list was used previously, see the disabled line)
        # n_neighbors = int(np.median(k_list))
        n_neighbors = random_state.randint(low=k_min, high=k_max)
        clf = FeatureBagging(base_estimator=LOF(n_neighbors=n_neighbors),
                             n_estimators=len(k_list), check_estimator=False)
        print(clf)
        fb_n_neighbors.append(n_neighbors)
        clf.fit(X_train_norm)

        # generate scores
        target_test_feature_bagging = clf.decision_function(X_test_norm)
        test_target_list.append(target_test_feature_bagging)
        method_list.append('FB')
        #######################################################################
        # generate normalized scores
        # (the scaler is fit on the training scores and applied to both)
        train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                           test_scores)
        # generate mean and max outputs
        # reported under the labels GG_a and GG_m
        target_test_mean = np.mean(test_scores_norm, axis=1)
        target_test_max = np.max(test_scores_norm, axis=1)
        test_target_list.extend([target_test_mean, target_test_max])
        method_list.extend(['GG_a', 'GG_m'])

        # generate pseudo target for training -> for calculating weights
        # both are column vectors of shape (n_train, 1)
        target_mean = np.mean(train_scores_norm, axis=1).reshape(-1, 1)
        target_max = np.max(train_scores_norm, axis=1).reshape(-1, 1)

        # compute one weight per detector: the pearson correlation between
        # its training scores and the mean pseudo target
        clf_weights_pear = np.zeros([n_clf, 1])
        for i in range(n_clf):
            clf_weights_pear[i] = pearsonr(
                target_mean, train_scores_norm[:, i].reshape(-1, 1))

        # generate weighted mean (weights normalized by their sum)
        target_test_weighted_pear = np.sum(
            test_scores_norm * clf_weights_pear.reshape(1, -1) /
            clf_weights_pear.sum(), axis=1)

        test_target_list.append(target_test_weighted_pear)
        method_list.append('GG_wa')

        # generate threshold sum: negative normalized scores are clipped to
        # zero before summing across detectors
        target_test_threshold = np.sum(test_scores_norm.clip(0), axis=1)
        test_target_list.append(target_test_threshold)
        method_list.append('GG_thresh')

        # generate average of maximum (GG_aom) and maximum of average (GG_moa)
        target_test_aom = aom(test_scores_norm, n_buckets, n_clf)
        target_test_moa = moa(test_scores_norm, n_buckets, n_clf)
        test_target_list.extend([target_test_aom, target_test_moa])
        method_list.extend(['GG_aom', 'GG_moa'])
        ##################################################################

        # define the local region size: a fraction of the training set,
        # clamped to [loc_region_min, loc_region_max]
        loc_region_size = int(X_train_norm.shape[0] * loc_region_perc)
        if loc_region_size < loc_region_min:
            loc_region_size = loc_region_min
        if loc_region_size > loc_region_max:
            loc_region_size = loc_region_max

        # define local region
        # ind_arr[i] is the list of training indices forming the local
        # region of test instance i
        ind_arr = get_local_region(X_train_norm, X_test_norm,
                                   loc_region_size,
                                   loc_region_ite=loc_region_ite,
                                   local_region_strength=loc_region_threshold,
                                   loc_min_features=loc_min_features,
                                   random_state=random_state)

        # ---- LSCP with the *mean* pseudo target --------------------------
        pred_scores_best = np.zeros([X_test.shape[0], ])
        pred_scores_ens = np.zeros([X_test.shape[0], ])

        for i in range(X_test.shape[0]):  # iterate all test instance

            # get the local region (training indices) of the current point
            ind_k = ind_arr[i]

            # get the pseudo target: mean
            target_k = target_mean[ind_k,].ravel()

            # get the current scores from all clf
            curr_train_k = train_scores_norm[ind_k, :]

            # initialize containers for correlation
            corr_pear_n = np.zeros([n_clf, ])

            # correlation of each detector with the pseudo target, computed
            # on the local region only
            for d in range(n_clf):
                corr_pear_n[d,] = pearsonr(target_k, curr_train_k[:, d])

            # pick the best one (nanargmax skips NaN correlations)
            best_clf_ind = np.nanargmax(corr_pear_n)
            pred_scores_best[i,] = test_scores_norm[i, best_clf_ind]

            # ensemble variant: max over the detectors returned by
            # get_competent_detectors
            pred_scores_ens[i,] = np.max(
                test_scores_norm[
                    i, get_competent_detectors(corr_pear_n, n_bins,
                                               n_selected)])

        test_target_list.extend([pred_scores_best,
                                 pred_scores_ens])
        method_list.extend(['LSCP_a',
                            'LSCP_moa'])
        ######################################################################

        # ---- LSCP with the *max* pseudo target ---------------------------
        pred_scores_best = np.zeros([X_test.shape[0], ])
        pred_scores_ens = np.zeros([X_test.shape[0], ])

        for i in range(X_test.shape[0]):  # iterate all test instance
            # get the neighbor idx of the current point
            ind_k = ind_arr[i]
            # get the pseudo target: max
            target_k = target_max[ind_k,].ravel()

            # get the current scores from all clf
            curr_train_k = train_scores_norm[ind_k, :]

            # initialize containers for correlation
            corr_pear_n = np.zeros([n_clf, ])

            for d in range(n_clf):
                corr_pear_n[d,] = pearsonr(target_k, curr_train_k[:, d])

            # pick the best one (nanargmax skips NaN correlations)
            best_clf_ind = np.nanargmax(corr_pear_n)
            pred_scores_best[i,] = test_scores_norm[i, best_clf_ind]

            # ensemble variant: mean over the detectors returned by
            # get_competent_detectors
            pred_scores_ens[i,] = np.mean(
                test_scores_norm[
                    i, get_competent_detectors(corr_pear_n, n_bins,
                                               n_selected)])

        test_target_list.extend([pred_scores_best,
                                 pred_scores_ens])
        method_list.extend(['LSCP_m',
                            'LSCP_aom'])

        ######################################################################

        # store performance information and print result
        # (only the first n_baselines entries of test_target_list are scored)
        for i in range(n_baselines):
            roc_mat[t, i] = roc_auc_score(y_test, test_target_list[i])
            ap_mat[t, i] = average_precision_score(y_test,
                                                   test_target_list[i])
            print(method_list[i], roc_mat[t, i])
        print('local region size:', loc_region_size)

    print("--- %s seconds ---" % (time.time() - start_time))
    execution_time = time.time() - start_time

    # save parameters
    # (loc_region_size and k_max carry the values from the last iteration)
    save_script(data, base_detector, timestamp, n_ite, test_size, n_baselines,
                loc_region_perc, loc_region_ite, loc_region_threshold,
                loc_min_features, loc_region_size, loc_region_min,
                loc_region_max, n_clf, k_min, k_max, n_bins, n_selected,
                n_buckets, fb_n_neighbors, execution_time)

    # print and save the result
    # default location is /results/***.csv
    print_save_result(data, base_detector, n_baselines, roc_mat,
                      ap_mat, method_list, timestamp, verbose)
271 | 


--------------------------------------------------------------------------------
/utils/utility.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import collections
  3 | import pathlib
  4 | 
  5 | import numpy as np
  6 | import scipy.io as scio
  7 | from scipy.stats import scoreatpercentile
  8 | 
  9 | import sklearn
 10 | from sklearn.neighbors import KDTree
 11 | from sklearn.metrics import precision_score
 12 | from sklearn.preprocessing import StandardScaler
 13 | from sklearn.utils import column_or_1d
 14 | from sklearn.utils import check_random_state
 15 | from sklearn.utils.random import sample_without_replacement
 16 | 
 17 | from models.lof import LOF
 18 | from models.knn import Knn
 19 | 
 20 | 
 21 | def argmaxp(a, p):
 22 |     """Utlity function to return the index of top p values in a
 23 |     :param a: list variable
 24 |     :param p: number of elements to select
 25 |     :return: index of top p elements in a
 26 |     """
 27 | 
 28 |     a = np.asarray(a).ravel()
 29 |     length = a.shape[0]
 30 |     pth = np.argpartition(a, length - p)
 31 |     return pth[length - p:]
 32 | 
 33 | 
 34 | # @njit("i8[:](i8[:], u8, b1)")
 35 | def argmaxn(value_list, n, desc=True):
 36 |     """
 37 |     Return the index of top n elements in the list if order is set to 'desc',
 38 |     otherwise return the index of n smallest elements
 39 | 
 40 |     :param value_list: a list containing all values
 41 |     :type value_list: list, array
 42 |     :param n: the number of the elements to select
 43 |     :type n: int
 44 |     :param order: the order to sort {'desc', 'asc'}
 45 |     :type order: str, optional (default='desc')
 46 |     :return: the index of the top n elements
 47 |     :rtype: list
 48 |     """
 49 |     value_list = column_or_1d(value_list)
 50 |     length = len(value_list)
 51 | 
 52 |     # for the smallest n, flip the value
 53 |     if not desc:
 54 |         n = length - n
 55 | 
 56 |     # partition is not part of numba
 57 |     value_sorted = np.partition(value_list, length - n)
 58 |     threshold = value_sorted[int(length - n)]
 59 | 
 60 |     if desc:
 61 |         return np.where(np.greater_equal(value_list, threshold))[0]
 62 |     else:  # return the index of n smallest elements
 63 |         return np.where(np.less(value_list, threshold))[0]
 64 | 
 65 | 
 66 | def get_label_n(y, y_pred):
 67 |     """ Infer the binary label of the top n samples with highest scores    
 68 |     :param y: 
 69 |     :param y_pred: 
 70 |     :return: 
 71 |     """
 72 |     out_perc = np.count_nonzero(y) / len(y)
 73 |     threshold = scoreatpercentile(y_pred, 100 * (1 - out_perc))
 74 |     y_pred = (y_pred > threshold).astype('int')
 75 |     return y_pred
 76 | 
 77 | 
 78 | def standardizer(X_train, X_test):
 79 |     """
 80 |     normalization function wrapper
 81 |     :param X_train:
 82 |     :param X_test:
 83 |     :return: X_train and X_test after the Z-score normalization
 84 |     """
 85 |     scaler = StandardScaler().fit(X_train)
 86 |     return scaler.transform(X_train), scaler.transform(X_test)
 87 | 
 88 | 
 89 | def precision_n_score(y, y_pred):
 90 |     """
 91 |     Utlity function to calculate precision@n
 92 |     :param y: ground truth
 93 |     :param y_pred: number of outliers
 94 |     :return: score
 95 |     """
 96 |     # calculate the percentage of outliers
 97 |     out_perc = np.count_nonzero(y) / len(y)
 98 | 
 99 |     threshold = scoreatpercentile(y_pred, 100 * (1 - out_perc))
100 |     y_pred = (y_pred > threshold).astype('int')
101 |     return precision_score(y, y_pred)
102 | 
103 | 
def loaddata(filename):
    """
    Load a dataset from ``datasets/<filename>.mat``.

    :param filename: dataset name without the ``.mat`` extension
    :return: tuple of the ``'X'`` entry and the flattened ``'y'`` entry of
        the .mat file
    """
    mat_path = os.path.join('datasets', filename + '.mat')
    contents = scio.loadmat(mat_path)
    return contents['X'], contents['y'].ravel()
115 | 
116 | 
def train_predict_lof(k_list, X_train_norm, X_test_norm, train_scores,
                      test_scores):
    """Fit one LOF detector per entry of k_list and record its scores.

    Column ``j`` of ``train_scores`` / ``test_scores`` is overwritten in
    place with the training / test scores of the detector built with
    ``n_neighbors=k_list[j]``.

    :param k_list: n_neighbors values, one per detector
    :param X_train_norm: training data used to fit each detector
    :param X_test_norm: test data scored with ``decision_function``
    :param train_scores: pre-allocated array, one column per detector
    :param test_scores: pre-allocated array, one column per detector
    :return: the (mutated) train_scores and test_scores
    """
    for column, n_neighbors in enumerate(k_list):
        detector = LOF(n_neighbors=n_neighbors)
        detector.fit(X_train_norm)

        train_scores[:, column] = detector.decision_scores_.ravel()
        test_scores[:, column] = detector.decision_function(
            X_test_norm).ravel()

    return train_scores, test_scores
135 | 
136 | 
def train_predict_knn(k_list, X_train_norm, X_test_norm, train_scores,
                      test_scores):
    """Fit one Knn detector per entry of k_list and record its scores.

    Column ``j`` of ``train_scores`` / ``test_scores`` is overwritten in
    place with the training / test scores of the detector built with
    ``n_neighbors=k_list[j]`` and ``method='largest'``.

    :param k_list: n_neighbors values, one per detector
    :param X_train_norm: training data used to fit each detector
    :param X_test_norm: test data scored with ``decision_function``
    :param train_scores: pre-allocated array, one column per detector
    :param test_scores: pre-allocated array, one column per detector
    :return: the (mutated) train_scores and test_scores
    """
    for column, n_neighbors in enumerate(k_list):
        detector = Knn(n_neighbors=n_neighbors, method='largest')
        detector.fit(X_train_norm)

        # NOTE(review): the LOF counterpart reads ``decision_scores_`` (with
        # a trailing underscore); confirm ``decision_scores`` is the correct
        # attribute name on Knn.
        train_scores[:, column] = detector.decision_scores.ravel()
        test_scores[:, column] = detector.decision_function(
            X_test_norm).ravel()

    return train_scores, test_scores
155 | 
156 | 
def save_script(data, base_detector, timestamp, n_ite, test_size, n_baselines,
                loc_region_perc, loc_region_ite, loc_region_strength,
                loc_min_features, loc_region_size, loc_region_min,
                loc_region_max, n_clf, k_min, k_max, n_bins, n_selected,
                n_buckets, fb_n_neighbors, execution_time):
    """Append the experiment settings to a text log under ``results/``.

    The log is written to ``results/<data>_<base_detector>_<timestamp>.txt``
    (the directory is created if missing). The path is built with
    ``pathlib`` so that it resolves inside ``results/`` on every platform;
    a hard-coded backslash separator only works on Windows.

    :param data: dataset name, used in the file name
    :param base_detector: base detector name, used in the file name
    :param timestamp: run timestamp string, used in the file name
    :param loc_region_strength: logged under the label
        ``loc_region_threshold``
    :param fb_n_neighbors: iterable of n_neighbors values, logged as a
        comma separated list
    :param execution_time: elapsed time to log

    All remaining parameters are logged verbatim as ``name: value`` lines.

    :return: None
    """
    # initialize the log directory if it does not exist
    out_dir = pathlib.Path('results')
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / (
        data + '_' + base_detector + '_' + timestamp + '.txt')

    chunks = [
        "\n n_ite: " + str(n_ite),
        "\n test_size: " + str(test_size),
        "\n n_baselines: " + str(n_baselines),
        "\n",

        "\n loc_region_perc: " + str(loc_region_perc),
        "\n loc_region_ite: " + str(loc_region_ite),
        "\n loc_region_threshold: " + str(loc_region_strength),
        "\n loc_min_features: " + str(loc_min_features),
        "\n loc_region_size: " + str(loc_region_size),
        "\n loc_region_min: " + str(loc_region_min),
        "\n loc_region_max: " + str(loc_region_max),
        "\n",

        "\n n_clf: " + str(n_clf),

        "\n k_min: " + str(k_min),
        "\n k_max: " + str(k_max),
        "\n n_bins: " + str(n_bins),
        "\n n_selected: " + str(n_selected),
        "\n n_buckets: " + str(n_buckets),
        "\n",

        "\n fb n_neighbors: ",
    ]
    chunks.extend(str(n_neighbors) + ", " for n_neighbors in fb_n_neighbors)
    chunks.append("\n")
    chunks.append("\n execution_time: " + str(execution_time))

    # the context manager closes the file even if a write fails
    with out_path.open('a') as f:
        f.writelines(chunks)
198 | 
199 | 
def print_save_result(data, base_detector, n_baselines, roc_mat,
                      ap_mat, method_list, timestamp, verbose):
    """Print the averaged scores and append them to a CSV under ``results/``.

    Scores are averaged over the rows (iterations) of ``roc_mat`` and
    ``ap_mat`` and rounded to 4 decimals. The best method per metric is the
    one with the highest averaged score; ties resolve to the method listed
    first. The CSV is written to
    ``results/<data>_<base_detector>_<timestamp>.csv`` (the directory is
    created if missing) using a platform-independent path.

    :param data: dataset name, used in the file name
    :param base_detector: base detector name, used in the file name
    :param n_baselines: number of methods (leading columns) to report
    :param roc_mat: array of ROC-AUC scores, one row per iteration and one
        column per method
    :param ap_mat: array of average precision scores, same layout
    :param method_list: method names, aligned with the matrix columns
    :param timestamp: run timestamp string, used in the file name
    :param verbose: if True, each row also carries the best score and the
        relative gap (in percent) to it for both metrics
    :return: None
    """

    roc_scores = np.round(np.mean(roc_mat, axis=0), decimals=4)
    ap_scores = np.round(np.mean(ap_mat, axis=0), decimals=4)

    # index of the best averaged score per metric
    top_roc_ind = int(np.argmax(roc_scores))
    top_ap_ind = int(np.argmax(ap_scores))

    top_roc_clf = str(method_list[top_roc_ind])
    top_ap_clf = str(method_list[top_ap_ind])

    top_roc = np.round(roc_scores[top_roc_ind], decimals=4)
    top_ap = np.round(ap_scores[top_ap_ind], decimals=4)

    # relative gap (in percent) between the best score and each method
    roc_diff = np.round(100 * (top_roc - roc_scores) / roc_scores, decimals=4)
    ap_diff = np.round(100 * (top_ap - ap_scores) / ap_scores, decimals=4)

    # initialize the log directory if it does not exist
    out_dir = pathlib.Path('results')
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / (
        data + '_' + base_detector + '_' + timestamp + '.csv')

    delim = ','

    # create the file if it does not exist; the context manager closes it
    # even if a write fails
    with out_path.open('a') as f:
        if verbose:
            f.writelines('method, '
                         'roc, best_roc, diff_roc,'
                         'ap, best_ap, diff_ap,'
                         'best roc, best ap')
        else:
            # header matches the five fields written per row below
            f.writelines('method, '
                         'roc, ap,'
                         'best roc, best ap')

        print('method, roc, ap, best roc, best ap')
        for i in range(n_baselines):
            print(method_list[i], roc_scores[i], ap_scores[i],
                  top_roc_clf, top_ap_clf)

            if verbose:
                fields = [str(method_list[i]),
                          str(roc_scores[i]), str(top_roc), str(roc_diff[i]),
                          str(ap_scores[i]), str(top_ap), str(ap_diff[i]),
                          top_roc_clf, top_ap_clf]
            else:
                fields = [str(method_list[i]),
                          str(roc_scores[i]),
                          str(ap_scores[i]),
                          top_roc_clf, top_ap_clf]

            f.writelines('\n' + delim.join(fields))
274 | 
275 | 
def generate_bagging_indices(random_state, bootstrap_features, n_features,
                             min_features, max_features):
    """
    Randomly draw feature indices. Internal use only.

    The number of features to draw is itself random: it is sampled with
    ``randint(min_features, max_features)``, whose upper bound is exclusive.

    Modified from sklearn/ensemble/bagging.py

    :param random_state: seed or RandomState, normalized with
        ``check_random_state``
    :param bootstrap_features: whether features are drawn with replacement
    :param n_features: size of the feature population to draw from
    :param min_features: lower bound (inclusive) on the number drawn
    :param max_features: upper bound (exclusive) on the number drawn
    :return: array of drawn feature indices
    """
    # Get valid random state
    rng = check_random_state(random_state)

    # decide number of features to draw, then draw the indices
    n_drawn = rng.randint(min_features, max_features)
    return _generate_indices(rng, bootstrap_features, n_features, n_drawn)
294 | 
295 | 
def _generate_indices(random_state, bootstrap, n_population, n_samples):
    """
    Draw randomly sampled indices. Internal use only.

    See sklearn/ensemble/bagging.py

    :param random_state: RandomState instance used for the draw
    :param bootstrap: if true, draw with replacement via ``randint``;
        otherwise draw via ``sample_without_replacement``
    :param n_population: indices are drawn from ``[0, n_population)``
    :param n_samples: number of indices to draw
    :return: array of drawn indices
    """
    if not bootstrap:
        return sample_without_replacement(n_population, n_samples,
                                          random_state=random_state)

    return random_state.randint(0, n_population, n_samples)
310 | 
311 | 
def get_local_region(X_train_norm, X_test_norm, loc_region_size,
                     loc_region_ite, local_region_strength,
                     loc_min_features, random_state):
    """Define the local region of every test instance by neighbor consensus.

    For ``loc_region_ite`` rounds, a random feature subset is drawn, a
    KDTree is built on the training data restricted to it, and the
    ``loc_region_size`` nearest training points of each test instance are
    recorded. A training point belongs to a test instance's local region
    when it was recorded in more than ``local_region_strength`` rounds.

    :param X_train_norm: training data, samples in rows
    :param X_test_norm: test data, samples in rows
    :param loc_region_size: number of neighbors queried per round
    :param loc_region_ite: number of feature-subset rounds
    :param local_region_strength: a neighbor is kept only if its count is
        strictly greater than this value
    :param loc_min_features: fraction of the features used as the lower
        bound of the subset size
    :param random_state: random state passed to generate_bagging_indices
    :return: list with one list of training indices per test instance
    """
    n_test = X_test_norm.shape[0]
    n_features = X_train_norm.shape[1]
    min_features = int(n_features * loc_min_features)

    # one tally per test instance: training index -> number of rounds in
    # which it appeared among the nearest neighbors
    tallies = [collections.Counter() for _ in range(n_test)]

    for _ in range(loc_region_ite):
        features = generate_bagging_indices(random_state,
                                            bootstrap_features=False,
                                            n_features=n_features,
                                            min_features=min_features,
                                            max_features=n_features)

        tree = KDTree(X_train_norm[:, features])
        _, neighbor_idx = tree.query(X_test_norm[:, features],
                                     k=loc_region_size)

        for j, tally in enumerate(tallies):
            tally.update(neighbor_idx[j, :].tolist())

    # keep only the training points that appeared often enough
    return [[item for item, count in tally.items()
             if count > local_region_strength]
            for tally in tallies]
341 | 
342 | 
def get_competent_detectors(scores, n_bins=10, n_selected=5):
    """ algorithm for selecting the most competent detectors

    The scores are binned into a histogram, the most populated bins are
    chosen through ``argmaxn``, and the positions of all scores falling
    inside the chosen bins are returned.

    :param scores: numpy array with one score per detector
    :param n_bins: number of histogram bins
    :param n_selected: number of bins to select, passed to ``argmaxn``
    :return: list of detector indices whose score lies in a selected bin
    """
    column = scores.reshape(-1, 1)
    hist, bin_edges = np.histogram(column, bins=n_bins)
    chosen_bins = argmaxn(hist, n=n_selected, desc=True)

    candidates = []
    for bin_id in chosen_bins:
        lower, upper = bin_edges[bin_id], bin_edges[bin_id + 1]
        # NOTE(review): both edges are inclusive, so a score lying exactly
        # on the edge shared by two selected bins is listed twice.
        inside = (column >= lower) & (column <= upper)
        # np.where on the (n, 1) column yields (rows, cols); the rows are
        # the detector indices
        candidates.extend(np.where(inside)[0].tolist())

    return candidates
366 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | **L**ocally **S**elective **C**ombination in **P**arallel Outlier Ensembles (LSCP): 
  3 | **a fully unsupervised framework to selectively combine base detectors by emphasizing data locality.**
  4 | 
  5 | ------------
  6 | 
  7 | Zhao, Y., Nasrullah, Z., Hryniewicki, M.K. and Li, Z. LSCP: Locally Selective Combination in Parallel Outlier Ensembles. 
  8 | *SIAM International Conference on Data Mining (SDM)*, 2019.
  9 | 
 10 | Please cite the paper as:
 11 | 
 12 |     @inproceedings{zhao2019lscp,
 13 |       title={{LSCP:} Locally Selective Combination in Parallel Outlier Ensembles},
 14 |       author={Zhao, Yue and Nasrullah, Zain and Hryniewicki, Maciej K and Li, Zheng},
 15 |       booktitle={Proceedings of the 2019 {SIAM} International Conference on Data Mining, {SDM} 2019},
 16 |       pages={585--593},
 17 |       month = {May},
 18 |       year={2019},
 19 |       address = {Calgary, Canada},
 20 |       organization={SIAM},
 21 |       url={https://doi.org/10.1137/1.9781611975673.66},
 22 |       doi={10.1137/1.9781611975673.66}
 23 |     }
 24 |         
 25 | 
 26 | [PDF for Personal Use](https://epubs.siam.org/doi/pdf/10.1137/1.9781611975673.66) | 
 27 | [SIAM Page](https://epubs.siam.org/doi/abs/10.1137/1.9781611975673.66) | 
 28 | [Presentation Slides](http://www.andrew.cmu.edu/user/yuezhao2/papers/19-sdm-lscp-slides.pdf) | 
 29 | [API Documentation](https://pyod.readthedocs.io/en/latest/pyod.models.html#module-pyod.models.lscp) | 
 30 | [Example with PyOD](https://github.com/yzhao062/pyod/blob/master/examples/lscp_example.py) 
 31 | 
 32 | **Update** (May 9th, 2019): [Published version](https://epubs.siam.org/doi/pdf/10.1137/1.9781611975673.66) is available for download. 
 33 | 
 34 | **Update** (Jan 23rd, 2019): [Camera-ready version](https://arxiv.org/abs/1812.01528) is available for download. 
 35 | 
 36 | **Update** (Dec 25th, 2018): LSCP has been officially released in **[Python Outlier Detection (PyOD)](https://github.com/yzhao062/pyod)** V0.6.6.
 37 | 
 38 | **Update** (Dec 21st, 2018): LSCP has been accepted at SDM 2019. Acceptance rate 22.7% (90/397).
 39 | 
 40 | **Update** (Dec 6th, 2018): LSCP has been included as part of **[Python Outlier Detection (PyOD)](https://github.com/yzhao062/pyod)**, 
 41 | to be released in pyod V0.6.6.
 42 |  
 43 | ------------
 44 | 
 45 | ### Additional notes
 46 | 
 47 | 1. Two versions of the code are provided:
 48 |    1. **Demo version** (demo_lof.py) is created for the fast reproduction of the experiment results. The demo version only compares the baseline algorithms with LSCP algorithms.
 49 |    2. **Production version** ([Python Outlier Detection (PyOD)](https://github.com/yzhao062/pyod)) is released with full optimization and testing as a framework. The purpose of this version is to be used in real applications, which should require fewer dependencies and faster execution.
 50 | 2. It is understood that there are **small variations** in the results due to the random process, e.g., splitting the training and test sets. Thus, running the demo code yields results that are similar to, but not exactly the same as, those reported in the paper.
 51 | 
 52 | ------------
 53 | 
 54 | ##  Introduction
 55 | In unsupervised outlier ensembles, the absence of ground truth makes the combination of base outlier detectors a challenging task. 
 56 | Specifically, existing parallel outlier ensembles lack a reliable way of selecting competent base detectors, affecting accuracy and stability, during model combination. 
 57 | In this paper, we propose a framework---called Locally Selective Combination in Parallel Outlier Ensembles (LSCP)---which addresses the issue by defining a local region around a test instance using the consensus of its nearest neighbors in randomly selected feature subspaces. 
 58 | The top-performing base detectors in this local region are selected and combined as the model's final output. 
 59 | Four variants of the LSCP framework are compared with seven widely used parallel frameworks. Experimental results demonstrate that one of these variants, LSCP_AOM, consistently outperforms baselines on the majority of twenty real-world datasets.
 60 | 
 61 | ![LSCP Flowchart](https://github.com/yzhao062/LSCP/blob/master/figs/flowchart2.png)
 62 | 
 63 | ## Dependency
 64 | The experiment code is written in Python 3.6 and built on a number of Python packages:
 65 | - numpy>=1.13
 66 | - numba>=0.35
 67 | - scipy>=0.19
 68 | - scikit_learn>=0.19
 69 | 
 70 | Batch installation is possible using the supplied "requirements.txt" with pip or conda.
 71 | 
 72 | ````cmd
 73 | pip install -r requirements.txt
 74 | ````
 75 | 
 76 | ## Datasets
 77 | 20 datasets are used (see dataset folder):
 78 | 
 79 | | Datasets   | #Samples           | Dimension  | #Outliers  | # Outlier Perc|
 80 | | -----------| ------------------ | ---------- | ---------- | ------------- |
 81 | | Annthyroid | 7200               | 6          | 534        | 7.41          |        
 82 | | Arrhythmia | 452                | 274        | 66         | 14.60         |
 83 | | Breastw    | 683                | 9          | 239        | 34.99         |
 84 | | Cardio     | 1831               | 21         | 176        | 9.61          |
 85 | | Letter     | 1600               | 32         | 100        | 6.25          |
 86 | | MNIST      | 7603               | 100        | 700        | 9.21          |
 87 | | Musk       | 3062               | 166        | 97         | 3.17          |
 88 | | PageBlocks | 5393               | 10         | 510        | 9.46          |
 89 | | Pendigits  | 6870               | 16         | 156        | 2.27          |
 90 | | Pima       | 768                | 8          | 268        | 34.90         |
 91 | | Satellite  | 6435               | 36         | 2036       | 31.64         |
 92 | | Satimage-2 | 5803               | 36         | 71         | 1.22          |
 93 | | Shuttle    | 49097              | 9          | 3511       | 7.15          |
 94 | | SpamSpace  | 4207               | 57         | 1679       | 39.91         |
 95 | | Stamps     | 340                | 9          | 31         | 9.12          |
 96 | | Thyroid    | 3772               | 6          | 93         | 2.47          |
 97 | | Vertebral  | 240                | 6          | 30         | 12.50         |
 98 | | Vowels     | 1456               | 12         | 50         | 3.43          |
 99 | | Wbc        | 378                | 30         | 21         | 5.56          |
100 | | Wilt       | 4819               | 5          | 257        | 5.33          |
101 | 
102 | All datasets are accessible from http://odds.cs.stonybrook.edu/ and
103 | http://www.dbs.ifi.lmu.de/research/outlier-evaluation/DAMI/.
104 | 
105 | To cite the datasets, please refer to: 
106 | > Shebuti Rayana (2016).  ODDS Library [http://odds.cs.stonybrook.edu]. Stony Brook, NY: Stony Brook University, Department of Computer Science.
107 | 
108 | > Campos, G.O., Zimek, A., Sander, J., Campello, R.J., Micenková, B., Schubert, E., Assent, I. and Houle, M.E., 2016. On the evaluation of unsupervised outlier detection: measures, datasets, and an empirical study. *Data Mining and Knowledge Discovery*, 30(4), pp.891-927.
109 | 
110 | ## Usage and Sample Output (Demo Version)
111 | Experiments can be reproduced by running **demo_lof.py** directly. Simply download/clone the entire repository and execute the code by 
112 | 
113 | ```cmd
114 | python demo_lof.py
115 | ```
116 | Two evaluation methods are introduced and the result would be saved into "results" folder:
117 | 1.  The area under receiver operating characteristic curve (**ROC**)
118 | 2.  mean Average Precision (**mAP**) 
119 | 
120 | ## Results
121 | 
122 | **Table 2: ROC-AUC scores (average of 30 independent trials, highest score highlighted in bold)**
123 | 
124 | | Datasets   | LSCP_A | LSCP_MOA | LSCP_M | LSCP_AOM | GG_A | GG_MOA  | GG_M  | GG_AOM | GG_WA  | GG_TH    | GG_FB  |
125 | | -----------| ------ | ------ | -------| ------ | -------| ------ | ------ | ------ | -------- | -------- | ------ |	
126 | | Annthyroid | 0.7548 | 0.7590 | 0.7849 | 0.7520 | 0.7642 | 0.7660 | 0.7769 | 0.7730 | 0.7632 | 0.7552 | **0.7854** |
127 | | Arrhythmia | 0.7746 | 0.7715 | 0.7729 | **0.7763** | 0.7758 | 0.7749 | 0.7656 | 0.7690 | 0.7758 | 0.7313 | 0.7709 |
128 | | Breastw    | 0.6553 | 0.7044 | 0.7236 | **0.7845** | 0.7362 | 0.7140 | 0.6590 | 0.6838 | 0.7453 | 0.6285 | 0.3935 |
129 | | Cardio     | 0.8691 | 0.8908 | 0.8491 | **0.9013** | 0.8770 | 0.8865 | 0.8798 | 0.8903 | 0.8782 | 0.8830 | 0.8422 |
130 | | Letter     | 0.7818 | 0.7954 | 0.8361 | 0.7867 | 0.7925 | 0.8031 | **0.8434** | 0.8300 | 0.7908 | 0.8001 | 0.7640 |
131 | | MNIST      | 0.8576 | 0.8623 | 0.7812 | **0.8633** | 0.8557 | 0.8588 | 0.8349 | 0.8553 | 0.8563 | 0.8272 | 0.8468 |
132 | | Musk       | 0.9950 | 0.9970 | 0.9931 | **0.9981** | 0.9937 | 0.9960 | 0.9960 | 0.9970 | 0.9953 | 0.9958 | 0.7344 |
133 | | PageBlocks | 0.9349 | 0.9343 | 0.8687 | **0.9488** | 0.9443 | 0.9440 | 0.9240 | 0.9371 | 0.9453 | 0.9418 | 0.9284 |
134 | | Pendigits  | 0.8238 | 0.8656 | 0.7238 | **0.8744** | 0.8378 | 0.8509 | 0.8488 | 0.8622 | 0.8425 | 0.8548 | 0.8034 |
135 | | Pima       | 0.7059 | 0.6991 | 0.6640 | **0.7061** | 0.7030 | 0.7003 | 0.6730 | 0.6856 | 0.7037 | 0.6349 | 0.6989 |
136 | | Satellite  | 0.5814 | 0.6106 | 0.6006 | 0.6015 | 0.5881 | 0.5992 | **0.6258** | 0.6220 | 0.5876 | 0.6101 | 0.5818 |
137 | | Satimage-2 | 0.9852 | 0.9931 | 0.9878 | **0.9935** | 0.9872 | 0.9907 | 0.9909 | 0.9925 | 0.9880 | 0.9881 | 0.9181 |
138 | | Shuttle    | 0.5392 | 0.5551 | 0.5373 | 0.5514 | 0.5439 | 0.5504 | **0.5612** | 0.5602 | 0.5413 | 0.5561 | 0.3702 |
139 | | SpamSpace  | 0.3792 | 0.4594 | 0.4305 | **0.4744** | 0.4487 | 0.4377 | 0.4060 | 0.4128 | 0.4580 | 0.4104 | 0.3312 |
140 | | Stamps     | 0.8888 | 0.8719 | 0.8525 | **0.8985** | 0.8946 | 0.8927 | 0.8559 | 0.8763 | 0.8953 | 0.8904 | 0.8715 |
141 | | Thyroid    | 0.9579 | 0.9624 | 0.9413 | **0.9700** | 0.9656 | 0.9647 | 0.9385 | 0.9510 | 0.9665 | 0.9644 | 0.8510 |
142 | | Vertebral  | 0.3324 | 0.3662 | **0.4306** | 0.3478 | 0.3433 | 0.3467 | 0.3662 | 0.3614 | 0.3442 | 0.3678 | 0.3385 |
143 | | Vowels     | 0.9276 | 0.9185 | 0.9238 | 0.9199 | 0.9265 | 0.9275 | **0.9313** | 0.9271 | 0.9261 | 0.9299 | 0.9148 |
144 | | WBC        | 0.9379 | 0.9344 | 0.9242 | **0.9451** | 0.9421 | 0.9409 | 0.9321 | 0.9367 | 0.9420 | 0.9314 | 0.9407 |
145 | | Wilt       | 0.5275 | 0.5517 | **0.6550** | 0.4286 | 0.5101 | 0.5358 | 0.6384 | 0.6056 | 0.5037 | 0.5586 | 0.5868 |
146 | 
147 | **Table 3: mAP scores (average of 30 independent trials, highest score highlighted in bold)**
148 | 
149 | | Datasets   | LSCP_A | LSCP_MOA | LSCP_M | LSCP_AOM | GG_A | GG_MOA  | GG_M  | GG_AOM | GG_WA  | GG_TH    | GG_FB  |
150 | | -----------| ------ | ------ | -------| ------ | -------| ------ | ------ | ------ | -------- | -------- | ------ |
151 | | Annthyroid | 0.2283 | 0.2375 | 0.2349 | 0.2453 | 0.2301 | 0.2395 | 0.2413 | **0.2516** | 0.2306 | 0.2277 | 0.1864 |
152 | | Arrhythmia | 0.3780 | 0.3744 | 0.3790 | **0.3796** | 0.3766 | 0.3769 | 0.3690 | 0.3722 | 0.3766 | 0.3468 | 0.3707 |
153 | | Breastw    | 0.4334 | 0.4766 | 0.4728 | **0.5655** | 0.4995 | 0.4849 | 0.4249 | 0.4577 | 0.5085 | 0.4366 | 0.2854 |
154 | | Cardio     | 0.3375 | 0.3960 | 0.3197 | **0.4117** | 0.3516 | 0.3708 | 0.3666 | 0.3864 | 0.3535 | 0.3629 | 0.3643 |
155 | | Letter     | 0.2302 | 0.2396 | **0.3346** | 0.2407 | 0.2388 | 0.2473 | 0.3160 | 0.2867 | 0.2372 | 0.2416 | 0.2193 |
156 | | MNIST      | 0.3933 | 0.3974 | 0.3353 | **0.3979** | 0.3911 | 0.3941 | 0.3701 | 0.3896 | 0.3918 | 0.3836 | 0.3928 |
157 | | Musk       | 0.8478 | 0.8773 | 0.8433 | **0.9240** | 0.8245 | 0.8718 | 0.8479 | 0.8806 | 0.8608 | 0.8629 | 0.5806 |
158 | | PageBlocks | 0.5805 | 0.5707 | 0.4684 | **0.6360** | 0.6043 | 0.6016 | 0.5297 | 0.5733 | 0.6077 | 0.6064 | 0.6094 |
159 | | Pendigits  | 0.0709 | 0.0893 | 0.0625 | **0.0944** | 0.0777 | 0.0823 | 0.0834 | 0.0895 | 0.0780 | 0.0832 | 0.0834 |
160 | | Pima       | 0.5092 | 0.5045 | 0.4716 | **0.5142** | 0.5089 | 0.5054 | 0.4813 | 0.4920 | 0.5095 | 0.4599 | 0.5094 |
161 | | Satellite  | 0.4077 | 0.4268 | 0.4223 | 0.4196 | 0.4047 | 0.4139 | **0.4385** | 0.4352 | 0.4047 | 0.4031 | 0.4049 |
162 | | Satimage-2 | 0.3477 | 0.6248 | 0.3994 | **0.6249** | 0.3959 | 0.5089 | 0.5344 | 0.5922 | 0.4159 | 0.4114 | 0.4851 |
163 | | Shuttle    | 0.1228 | 0.1296 | 0.1167 | **0.1330** | 0.1297 | 0.1316 | 0.1239 | 0.1294 | 0.1293 | 0.1316 | 0.0549 |
164 | | SpamSpace  | 0.3326 | 0.3615 | 0.3592 | **0.3665** | 0.3572 | 0.3521 | 0.3379 | 0.3413 | 0.3612 | 0.3601 | 0.3079 |
165 | | Stamps     | 0.3596 | 0.3310 | 0.3193 | **0.3779** | 0.3694 | 0.3660 | 0.3144 | 0.3387 | 0.3706 | 0.3638 | 0.3535 |
166 | | Thyroid    | 0.3544 | 0.3955 | 0.2638 | **0.4651** | 0.4045 | 0.4123 | 0.2850 | 0.3488 | 0.4130 | 0.4071 | 0.1186 |
167 | | Vertebral  | 0.0948 | 0.1020 | **0.1230** | 0.0988 | 0.0971 | 0.0975 | 0.1029 | 0.1000 | 0.0972 | 0.1067 | 0.0965 |
168 | | Vowels     | **0.3913** | 0.3678 | 0.3482 | 0.3539 | 0.3783 | 0.3790 | 0.3760 | 0.3732 | 0.3784 | 0.3783 | 0.3340 |
169 | | WBC        | 0.6033 | 0.5983 | 0.5472 | **0.6131** | 0.6097 | 0.6069 | 0.5579 | 0.5925 | 0.6105 | 0.6045 | 0.5933 |
170 | | Wilt       | 0.0518 | 0.0557 | **0.0770** | 0.0423 | 0.0493 | 0.0523 | 0.0715 | 0.0633 | 0.0486 | 0.0537 | 0.0591 |
171 | 
172 | ## Conclusions
173 | 
174 | In this work, we propose four variants of a novel unsupervised outlier detection framework called Locally Selective Combination in Parallel Outlier Ensembles (LSCP). 
175 | Unlike traditional combination approaches, LSCP identifies the top-performing base detectors for each test instance relative to its local region. 
176 | To validate its effectiveness, the proposed framework is assessed on 20 real-world datasets and is shown to be superior to baseline algorithms. 
177 | The ensemble approach *LSCP_AOM* demonstrates the best performance achieving the highest detection score on 13/20 datasets with respect to ROC-AUC and 14/20 datasets with respect to mAP. 
178 | Theoretical considerations under the bias-variance framework and visualizations are also provided for LSCP to provide a holistic overview of the framework. 
179 | Since LSCP demonstrates the promise of data locality, future work can extend this exploration by investigating the use of heterogeneous base detectors and more reliable pseudo ground truth generation methods. 


--------------------------------------------------------------------------------
/models/base.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """Base class for all outlier detector models
  3 | """
  4 | # Author: Yue Zhao <yuezhao@cs.toronto.edu>
  5 | # License: BSD 2 clause
  6 | 
  7 | from __future__ import division
  8 | from __future__ import print_function
  9 | 
 10 | import warnings
 11 | from collections import defaultdict
 12 | import abc
 13 | import sklearn
 14 | 
 15 | 
 16 | def _sklearn_version_21():  # pragma: no cover
 17 |     """ Utility function to decide the version of sklearn
 18 |     In sklearn 21.0, LOF is changed. Specifically, _decision_function
 19 |     is replaced by _score_samples
 20 | 
 21 |     Returns
 22 |     -------
 23 |     sklearn_21_flag : bool
 24 |         True if sklearn.__version__ is newer than 0.21.0
 25 | 
 26 |     """
 27 |     sklearn_version = str(sklearn.__version__)
 28 |     if int(sklearn_version.split(".")[1]) > 20:
 29 |         return True
 30 |     else:
 31 |         return False
 32 | 
 33 | 
 34 | if _sklearn_version_21():
 35 |     from inspect import signature
 36 | else:
 37 |     from sklearn.externals.funcsigs import signature
 38 | 
 39 | from sklearn.externals import six
 40 | 
 41 | import numpy as np
 42 | from scipy.special import erf
 43 | from scipy.stats import scoreatpercentile
 44 | from sklearn.preprocessing import MinMaxScaler
 45 | from sklearn.utils.validation import check_is_fitted
 46 | from sklearn.utils.multiclass import check_classification_targets
 47 | 
 48 | from .sklearn_base import _pprint
 49 | 
 50 | 
 51 | @six.add_metaclass(abc.ABCMeta)
 52 | class BaseDetector(object):
 53 |     """Abstract class for all outlier detection algorithms.
 54 | 
 55 |     Parameters
 56 |     ----------
 57 |     contamination : float in (0., 0.5), optional (default=0.1)
 58 |         The amount of contamination of the data set,
 59 |         i.e. the proportion of outliers in the data set. Used when fitting to
 60 |         define the threshold on the decision function.
 61 | 
 62 |     Attributes
 63 |     ----------
 64 |     decision_scores_ : numpy array of shape (n_samples,)
 65 |         The outlier scores of the training data.
 66 |         The higher, the more abnormal. Outliers tend to have higher
 67 |         scores. This value is available once the detector is fitted.
 68 | 
 69 |     threshold_ : float
 70 |         The threshold is based on ``contamination``. It is the
 71 |         ``n_samples * contamination`` most abnormal samples in
 72 |         ``decision_scores_``. The threshold is calculated for generating
 73 |         binary outlier labels.
 74 | 
 75 |     labels_ : int, either 0 or 1
 76 |         The binary labels of the training data. 0 stands for inliers
 77 |         and 1 for outliers/anomalies. It is generated by applying
 78 |         ``threshold_`` on ``decision_scores_``.
 79 |     """
 80 | 
 81 |     @abc.abstractmethod
 82 |     def __init__(self, contamination=0.1):
 83 | 
 84 |         if not (0. < contamination <= 0.5):
 85 |             raise ValueError("contamination must be in (0, 0.5], "
 86 |                              "got: %f" % contamination)
 87 | 
 88 |         self.contamination = contamination
 89 | 
 90 |     # noinspection PyIncorrectDocstring
 91 |     @abc.abstractmethod
 92 |     def fit(self, X, y=None):
 93 |         """Fit detector. y is optional for unsupervised methods.
 94 | 
 95 |         Parameters
 96 |         ----------
 97 |         X : numpy array of shape (n_samples, n_features)
 98 |             The input samples.
 99 | 
100 |         y : numpy array of shape (n_samples,), optional (default=None)
101 |             The ground truth of the input samples (labels).
102 |         """
103 |         pass
104 | 
105 |     @abc.abstractmethod
106 |     def decision_function(self, X):
107 |         """Predict raw anomaly score of X using the fitted detector.
108 | 
109 |         The anomaly score of an input sample is computed based on different
110 |         detector algorithms. For consistency, outliers are assigned with
111 |         larger anomaly scores.
112 | 
113 |         Parameters
114 |         ----------
115 |         X : numpy array of shape (n_samples, n_features)
116 |             The training input samples. Sparse matrices are accepted only
117 |             if they are supported by the base estimator.
118 | 
119 |         Returns
120 |         -------
121 |         anomaly_scores : numpy array of shape (n_samples,)
122 |             The anomaly score of the input samples.
123 |         """
124 |         pass
125 | 
126 |     def fit_predict(self, X, y=None):
127 |         """Fit detector first and then predict whether a particular sample
128 |         is an outlier or not.
129 | 
130 |         Parameters
131 |         ----------
132 |         X : numpy array of shape (n_samples, n_features)
133 |             The input samples.
134 | 
135 |         y : numpy array of shape (n_samples,), optional (default=None)
136 |             The ground truth of the input samples (labels).
137 | 
138 |         Returns
139 |         -------
140 |         outlier_labels : numpy array of shape (n_samples,)
141 |             For each observation, tells whether or not
142 |             it should be considered as an outlier according to the
143 |             fitted model. 0 stands for inliers and 1 for outliers.
144 |         """
145 | 
146 |         self.fit(X, y)
147 |         return self.labels_
148 | 
149 |     def predict(self, X):
150 |         """Predict if a particular sample is an outlier or not.
151 | 
152 |         Parameters
153 |         ----------
154 |         X : numpy array of shape (n_samples, n_features)
155 |             The input samples.
156 | 
157 |         Returns
158 |         -------
159 |         outlier_labels : numpy array of shape (n_samples,)
160 |             For each observation, tells whether or not
161 |             it should be considered as an outlier according to the
162 |             fitted model. 0 stands for inliers and 1 for outliers.
163 |         """
164 | 
165 |         check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
166 | 
167 |         pred_score = self.decision_function(X)
168 |         return (pred_score > self.threshold_).astype('int').ravel()
169 | 
170 |     def predict_proba(self, X, method='linear'):
171 |         """Predict the probability of a sample being outlier. Two approaches
172 |         are possible:
173 | 
174 |         1. simply use Min-max conversion to linearly transform the outlier
175 |            scores into the range of [0,1]. The model must be
176 |            fitted first.
177 |         2. use unifying scores, see :cite:`kriegel2011interpreting`.
178 | 
179 |         Parameters
180 |         ----------
181 |         X : numpy array of shape (n_samples, n_features)
182 |             The input samples.
183 | 
184 |         method : str, optional (default='linear')
185 |             probability conversion method. It must be one of
186 |             'linear' or 'unify'.
187 | 
188 |         Returns
189 |         -------
190 |         outlier_labels : numpy array of shape (n_samples,)
191 |             For each observation, tells whether or not
192 |             it should be considered as an outlier according to the
193 |             fitted model. Return the outlier probability, ranging
194 |             in [0,1].
195 |         """
196 | 
197 |         check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
198 |         train_scores = self.decision_scores_
199 | 
200 |         test_scores = self.decision_function(X)
201 | 
202 |         probs = np.zeros([X.shape[0], int(self._classes)])
203 |         if method == 'linear':
204 |             scaler = MinMaxScaler().fit(train_scores.reshape(-1, 1))
205 |             probs[:, 1] = scaler.transform(
206 |                 test_scores.reshape(-1, 1)).ravel().clip(0, 1)
207 |             probs[:, 0] = 1 - probs[:, 1]
208 |             return probs
209 | 
210 |         elif method == 'unify':
211 |             # turn output into probability
212 |             pre_erf_score = (test_scores - self._mu) / (
213 |                     self._sigma * np.sqrt(2))
214 |             erf_score = erf(pre_erf_score)
215 |             probs[:, 1] = erf_score.clip(0, 1).ravel()
216 |             probs[:, 0] = 1 - probs[:, 1]
217 |             return probs
218 |         else:
219 |             raise ValueError(method,
220 |                              'is not a valid probability conversion method')
221 | 
222 |     def _predict_rank(self, X, normalized=False):
223 |         """Predict the outlyingness rank of a sample by a fitted model. The
224 |         method is for outlier detector score combination.
225 | 
226 |         Parameters
227 |         ----------
228 |         X : numpy array of shape (n_samples, n_features)
229 |             The input samples.
230 | 
231 |         normalized : bool, optional (default=False)
232 |             If set to True, all ranks are normalized to [0,1].
233 | 
234 |         Returns
235 |         -------
236 |         ranks : array, shape (n_samples,)
237 |             Outlying rank of a sample according to the training data.
238 | 
239 |         """
240 | 
241 |         check_is_fitted(self, ['decision_scores_'])
242 | 
243 |         test_scores = self.decision_function(X)
244 |         train_scores = self.decision_scores_
245 | 
246 |         sorted_train_scores = np.sort(train_scores)
247 |         ranks = np.searchsorted(sorted_train_scores, test_scores)
248 | 
249 |         if normalized:
250 |             # return normalized ranks
251 |             ranks = ranks / ranks.max()
252 |         return ranks
253 | 
254 |     # def score(self, X, y, scoring='roc_auc_score'):
255 |     #     """Returns the evaluation resulted on the given test data and labels.
256 |     #     ROC is chosen as the default evaluation metric
257 |     #
258 |     #     :param X: The input samples
259 |     #     :type X: numpy array of shape (n_samples, n_features)
260 |     #
261 |     #     :param y: Outlier labels of the input samples
262 |     #     :type y: array, shape (n_samples,)
263 |     #
264 |     #     :param scoring: Evaluation metric
265 |     #
266 |     #             -' roc_auc_score': ROC score
267 |     #             - 'prc_n_score': Precision @ rank n score
268 |     #     :type scoring: str, optional (default='roc_auc_score')
269 |     #
270 |     #     :return: Evaluation score
271 |     #     :rtype: float
272 |     #     """
273 |     #     check_is_fitted(self, ['decision_scores_'])
274 |     #     if scoring == 'roc_auc_score':
275 |     #         score = roc_auc_score(y, self.decision_function(X))
276 |     #     elif scoring == 'prc_n_score':
277 |     #         score = precision_n_scores(y, self.decision_function(X))
278 |     #     else:
279 |     #         raise NotImplementedError('PyOD built-in scoring only supports '
280 |     #                                   'ROC and Precision @ rank n')
281 |     #
282 |     #     print("{metric}: {score}".format(metric=scoring, score=score))
283 |     #
284 |     #     return score
285 | 
286 |     def _set_n_classes(self, y):
287 |         """Set the number of classes if `y` is presented, which is not
288 |         expected. It could be useful for multi-class outlier detection.
289 | 
290 |         Parameters
291 |         ----------
292 |         y : numpy array of shape (n_samples,)
293 |             Ground truth.
294 | 
295 |         Returns
296 |         -------
297 |         self
298 |         """
299 | 
300 |         self._classes = 2  # default as binary classification
301 |         if y is not None:
302 |             check_classification_targets(y)
303 |             self._classes = len(np.unique(y))
304 |         return self
305 | 
306 |     def _process_decision_scores(self):
307 |         """Internal function to calculate key attributes:
308 | 
309 |         - threshold_: used to decide the binary label
310 |         - labels_: binary labels of training data
311 | 
312 |         Returns
313 |         -------
314 |         self
315 |         """
316 | 
317 |         self.threshold_ = scoreatpercentile(self.decision_scores_,
318 |                                             100 * (1 - self.contamination))
319 |         self.labels_ = (self.decision_scores_ > self.threshold_).astype(
320 |             'int').ravel()
321 | 
322 |         # calculate for predict_proba()
323 | 
324 |         self._mu = np.mean(self.decision_scores_)
325 |         self._sigma = np.std(self.decision_scores_)
326 | 
327 |         return self
328 | 
329 |     # noinspection PyMethodParameters
330 |     def _get_param_names(cls):
331 |         # noinspection PyPep8
332 |         """Get parameter names for the estimator
333 | 
334 |         See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
335 |         and sklearn/base.py for more information.
336 |         """
337 | 
338 |         # fetch the constructor or the original constructor before
339 |         # deprecation wrapping if any
340 |         init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
341 |         if init is object.__init__:
342 |             # No explicit constructor to introspect
343 |             return []
344 | 
345 |         # introspect the constructor arguments to find the model parameters
346 |         # to represent
347 |         init_signature = signature(init)
348 |         # Consider the constructor parameters excluding 'self'
349 |         parameters = [p for p in init_signature.parameters.values()
350 |                       if p.name != 'self' and p.kind != p.VAR_KEYWORD]
351 |         for p in parameters:
352 |             if p.kind == p.VAR_POSITIONAL:
353 |                 raise RuntimeError("scikit-learn estimators should always "
354 |                                    "specify their parameters in the signature"
355 |                                    " of their __init__ (no varargs)."
356 |                                    " %s with constructor %s doesn't "
357 |                                    " follow this convention."
358 |                                    % (cls, init_signature))
359 |         # Extract and sort argument names excluding 'self'
360 |         return sorted([p.name for p in parameters])
361 | 
362 |     # noinspection PyPep8
363 |     def get_params(self, deep=True):
364 |         """Get parameters for this estimator.
365 | 
366 |         See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
367 |         and sklearn/base.py for more information.
368 | 
369 |         Parameters
370 |         ----------
371 |         deep : boolean, optional
372 |             If True, will return the parameters for this estimator and
373 |             contained subobjects that are estimators.
374 | 
375 |         Returns
376 |         -------
377 |         params : mapping of string to any
378 |             Parameter names mapped to their values.
379 |         """
380 | 
381 |         out = dict()
382 |         for key in self._get_param_names():
383 |             # We need deprecation warnings to always be on in order to
384 |             # catch deprecated param values.
385 |             # This is set in utils/__init__.py but it gets overwritten
386 |             # when running under python3 somehow.
387 |             warnings.simplefilter("always", DeprecationWarning)
388 |             try:
389 |                 with warnings.catch_warnings(record=True) as w:
390 |                     value = getattr(self, key, None)
391 |                 if len(w) and w[0].category == DeprecationWarning:
392 |                     # if the parameter is deprecated, don't show it
393 |                     continue
394 |             finally:
395 |                 warnings.filters.pop(0)
396 | 
397 |             # XXX: should we rather test if instance of estimator?
398 |             if deep and hasattr(value, 'get_params'):
399 |                 deep_items = value.get_params().items()
400 |                 out.update((key + '__' + k, val) for k, val in deep_items)
401 |             out[key] = value
402 |         return out
403 | 
404 |     def set_params(self, **params):
405 |         # noinspection PyPep8
406 |         """Set the parameters of this estimator.
407 |         The method works on simple estimators as well as on nested objects
408 |         (such as pipelines). The latter have parameters of the form
409 |         ``<component>__<parameter>`` so that it's possible to update each
410 |         component of a nested object.
411 | 
412 |         See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
413 |         and sklearn/base.py for more information.
414 | 
415 |         Returns
416 |         -------
417 |         self : object
418 |         """
419 | 
420 |         if not params:
421 |             # Simple optimization to gain speed (inspect is slow)
422 |             return self
423 |         valid_params = self.get_params(deep=True)
424 | 
425 |         nested_params = defaultdict(dict)  # grouped by prefix
426 |         for key, value in params.items():
427 |             key, delim, sub_key = key.partition('__')
428 |             if key not in valid_params:
429 |                 raise ValueError('Invalid parameter %s for estimator %s. '
430 |                                  'Check the list of available parameters '
431 |                                  'with `estimator.get_params().keys()`.' %
432 |                                  (key, self))
433 | 
434 |             if delim:
435 |                 nested_params[key][sub_key] = value
436 |             else:
437 |                 setattr(self, key, value)
438 | 
439 |         for key, sub_params in nested_params.items():
440 |             valid_params[key].set_params(**sub_params)
441 | 
442 |         return self
443 | 
444 |     def __repr__(self):
445 |         # noinspection PyPep8
446 |         """
447 |         See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
448 |         and sklearn/base.py for more information.
449 |         """
450 | 
451 |         class_name = self.__class__.__name__
452 |         return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False),
453 |                                                offset=len(class_name), ),)
454 | 


--------------------------------------------------------------------------------
/models/feature_bagging.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """Feature bagging detector
  3 | """
  4 | # Author: Yue Zhao <yuezhao@cs.toronto.edu>
  5 | # License: BSD 2 clause
  6 | from __future__ import division
  7 | from __future__ import print_function
  8 | 
  9 | import numpy as np
 10 | import numbers
 11 | from sklearn.base import clone
 12 | from sklearn.utils import check_random_state
 13 | from sklearn.utils import check_array
 14 | from sklearn.utils.validation import check_is_fitted
 15 | from sklearn.utils.estimator_checks import check_estimator
 16 | from sklearn.utils.random import sample_without_replacement
 17 | 
 18 | from .lof import LOF
 19 | from .base import BaseDetector
 20 | 
 21 | from .combination import average, maximization
 22 | 
# Largest value of a 32-bit signed integer and its negation; used as the
# default ``high`` / ``low`` bounds of check_parameter.
MAX_INT = np.iinfo(np.int32).max
MIN_INT = -1 * MAX_INT
 25 | 
 26 | 
 27 | def generate_bagging_indices(random_state, bootstrap_features, n_features,
 28 |                              min_features, max_features):
 29 |     """ Randomly draw feature indices. Internal use only.
 30 | 
 31 |     Modified from sklearn/ensemble/bagging.py
 32 | 
 33 |     Parameters
 34 |     ----------
 35 |     random_state : RandomState
 36 |         A random number generator instance to define the state of the random
 37 |         permutations generator.
 38 | 
 39 |     bootstrap_features : bool
 40 |         Specifies whether to bootstrap indice generation
 41 | 
 42 |     n_features : int
 43 |         Specifies the population size when generating indices
 44 | 
 45 |     min_features : int
 46 |         Lower limit for number of features to randomly sample
 47 | 
 48 |     max_features : int
 49 |         Upper limit for number of features to randomly sample
 50 | 
 51 |     Returns
 52 |     -------
 53 |     feature_indices : numpy array, shape (n_samples,)
 54 |         Indices for features to bag
 55 | 
 56 |     """
 57 | 
 58 |     # Get valid random state
 59 |     random_state = check_random_state(random_state)
 60 | 
 61 |     # decide number of features to draw
 62 |     random_n_features = random_state.randint(min_features, max_features)
 63 | 
 64 |     # Draw indices
 65 |     feature_indices = generate_indices(random_state, bootstrap_features,
 66 |                                        n_features, random_n_features)
 67 | 
 68 |     return feature_indices
 69 | 
 70 | 
 71 | def generate_indices(random_state, bootstrap, n_population, n_samples):
 72 |     """ Draw randomly sampled indices. Internal use only.
 73 | 
 74 |     See sklearn/ensemble/bagging.py
 75 | 
 76 |     Parameters
 77 |     ----------
 78 |     random_state : RandomState
 79 |         A random number generator instance to define the state of the random
 80 |         permutations generator.
 81 | 
 82 |     bootstrap :  bool
 83 |         Specifies whether to bootstrap indice generation
 84 | 
 85 |     n_population : int
 86 |         Specifies the population size when generating indices
 87 | 
 88 |     n_samples : int
 89 |         Specifies number of samples to draw
 90 | 
 91 |     Returns
 92 |     -------
 93 |     indices : numpy array, shape (n_samples,)
 94 |         randomly drawn indices
 95 |     """
 96 | 
 97 |     # Draw sample indices
 98 |     if bootstrap:
 99 |         indices = random_state.randint(0, n_population, n_samples)
100 |     else:
101 |         indices = sample_without_replacement(n_population, n_samples,
102 |                                              random_state=random_state)
103 | 
104 |     return indices
105 | 
106 | 
def check_parameter(param, low=MIN_INT, high=MAX_INT, param_name='',
                    include_left=False, include_right=False):
    """Check if an input is within the defined range.

    Parameters
    ----------
    param : int, float
        The input parameter to check.

    low : int, float
        The lower bound of the range.

    high : int, float
        The higher bound of the range.

    param_name : str, optional (default='')
        The name of the parameter; only used in error messages.

    include_left : bool, optional (default=False)
        Whether includes the lower bound (lower bound <=).

    include_right : bool, optional (default=False)
        Whether includes the higher bound (<= higher bound).

    Returns
    -------
    within_range : bool
        Always True. A violation is reported by raising, never by
        returning False.

    Raises
    ------
    TypeError
        If ``param``, ``low`` or ``high`` is not numerical.

    ValueError
        If neither bound is specified, if ``low > high``, or if ``param``
        lies outside the requested range (a NaN is never inside a range).

    """
    # ``np.float`` was only an alias of the builtin ``float`` and no longer
    # exists in current NumPy releases, so the accepted types are spelled
    # out. ``np.floating`` additionally admits scalars such as np.float32.
    numeric_types = (numbers.Integral, np.integer, float, np.floating)

    # param, low and high should all be numerical
    if not isinstance(param, numeric_types):
        raise TypeError('{param_name} is set to {param} Not numerical'.format(
            param=param, param_name=param_name))

    if not isinstance(low, numeric_types):
        raise TypeError('low is set to {low}. Not numerical'.format(low=low))

    if not isinstance(high, numeric_types):
        raise TypeError('high is set to {high}. Not numerical'.format(
            high=high))

    # at least one of the bounds should be specified; compare by value,
    # since identity of two equal integers is an implementation detail
    if low == MIN_INT and high == MAX_INT:
        raise ValueError('Neither low nor high bound is defined')

    # if wrong bound values are used
    if low > high:
        raise ValueError(
            'Lower bound > Higher bound')

    # Positive formulation of the range test: every comparison with NaN is
    # False, so a NaN parameter is rejected instead of slipping through.
    above_low = param >= low if include_left else param > low
    below_high = param <= high if include_right else param < high

    if not (above_low and below_high):
        raise ValueError(
            '{param_name} is set to {param}. '
            'Not in the range of {left}{low}, {high}{right}.'.format(
                param=param, low=low, high=high, param_name=param_name,
                left='[' if include_left else '(',
                right=']' if include_right else ')'))

    return True
188 | 
189 | 
def _set_random_states(estimator, random_state=None):
    """Seed every ``random_state`` parameter of an estimator. Internal use
    only. Modified from sklearn/base.py

    Each parameter named ``random_state`` or ending in ``__random_state``
    is replaced by an integer drawn from ``random_state``.

    Parameters
    ----------
    estimator : estimator supporting get/set_params
        Estimator whose randomness is controlled through ``random_state``
        parameters. It is modified in place.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Notes
    -----
    Only parameters reachable through ``estimator.get_params(deep=True)``
    are touched. Sources of randomness that are not exposed that way, for
    instance cross-validation splitters or ``scipy.stats`` rvs, keep their
    own state.
    """
    rng = check_random_state(random_state)

    # Walk the names in sorted order so that the seed each parameter
    # receives does not depend on dictionary ordering.
    seeded = {
        name: rng.randint(MAX_INT)
        for name in sorted(estimator.get_params(deep=True))
        if name == 'random_state' or name.endswith('__random_state')
    }

    if seeded:
        estimator.set_params(**seeded)
227 | 
228 | 
def _parallel_decision_function(estimators, estimators_features, X):
    """Score ``X`` with every estimator on its own feature subset.

    Parameters
    ----------
    estimators : list
        Fitted detectors, each exposing ``decision_function``.

    estimators_features : list
        For each detector, the column indices of ``X`` it is applied to.

    X : numpy array of shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    scores : numpy array of shape (n_samples, n_estimators)
        Column ``i`` holds the scores produced by ``estimators[i]``.

    Raises
    ------
    NotImplementedError
        If a detector has no ``decision_function`` attribute.
    """
    score_matrix = np.zeros((X.shape[0], len(estimators)))

    for col, (detector, feature_idx) in enumerate(
            zip(estimators, estimators_features)):
        if not hasattr(detector, 'decision_function'):
            raise NotImplementedError(
                'current base detector has no decision_function')

        score_matrix[:, col] = detector.decision_function(X[:, feature_idx])

    return score_matrix
243 | 
244 | 
245 | # TODO: should support parallelization at the model level
class FeatureBagging(BaseDetector):
    """ A feature bagging detector is a meta estimator that fits a number of
    base detectors on random feature subsets of the dataset and uses
    averaging or maximization to combine their scores, which improves the
    predictive accuracy and controls over-fitting.

    Every base detector sees all input samples; only the features are
    sub-sampled. The number of features per detector is drawn at random
    between half of the features and ``max_features``.

    By default, LOF is used as the base estimator. However, any estimator
    could be used as the base estimator, such as kNN and ABOD.

    Feature bagging first constructs n subsamples by randomly selecting a
    subset of features, which induces the diversity of base estimators.

    Finally, the prediction score is generated by averaging/taking the maximum
    of all base detectors. See :cite:`lazarevic2005feature` for details.

    Parameters
    ----------
    base_estimator : object or None, optional (default=None)
        The base estimator to fit on random subsets of the dataset.
        If None, then the base estimator is a LOF detector.

    n_estimators : int, optional (default=10)
        The number of base estimators in the ensemble.

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    max_features : int or float, optional (default=1.0)
        The largest number of features drawn from X to train a base
        estimator.

        - If int, then at most `max_features` features are drawn.
        - If float, then at most `int(max_features * X.shape[1])` features
          are drawn.

        The resulting limit must lie between half of the features
        (``int(0.5 * X.shape[1])``) and the total number of features.

    bootstrap_features : bool, optional (default=False)
        Whether features are drawn with replacement.

    check_estimator : bool, optional (default=True)
        If set to True, check whether the base estimator is consistent with
        sklearn standard.

    n_jobs : optional (default=1)
        The number of jobs handed to the default LOF base estimator. The
        ensemble itself fits and scores its base estimators sequentially.

    random_state : int, RandomState or None, optional (default=None)
        If int, random_state is the seed used by the random
        number generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by `np.random`.

    combination : str, optional (default='average')
        the method of combination:

        - if 'average': take the average of all detectors
        - if 'max': take the maximum scores of all detectors

        Any value other than 'average' selects the maximum.

    verbose : int, optional (default=0)
        Controls the verbosity of the building process.

    estimator_params : dict, optional (default=None)
        The list of attributes to use as parameters
        when instantiating a new base estimator. If none are given,
        default parameters are used.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.

    estimators_ : list
        The fitted base estimators.

    estimators_features_ : list of numpy arrays
        The feature indices each base estimator was fitted on.

    """

    def __init__(self, base_estimator=None, n_estimators=10, contamination=0.1,
                 max_features=1.0, bootstrap_features=False,
                 check_estimator=True, n_jobs=1, random_state=None,
                 combination='average', verbose=0, estimator_params=None):

        super(FeatureBagging, self).__init__(contamination=contamination)
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.bootstrap_features = bootstrap_features
        self.check_estimator = check_estimator
        self.combination = combination
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        # never share one dict between instances
        if estimator_params is not None:
            self.estimator_params = estimator_params
        else:
            self.estimator_params = {}

    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).

        Returns
        -------
        self : object
            The fitted detector.
        """
        random_state = check_random_state(self.random_state)

        X = check_array(X)
        self.n_samples_, self.n_features_ = X.shape[0], X.shape[1]

        self._set_n_classes(y)

        # expect at least 2 features, does not make sense if only have
        # 1 feature
        check_parameter(self.n_features_, low=2, include_left=True,
                        param_name='n_features')

        # check parameters
        self._validate_estimator(default=LOF(n_jobs=self.n_jobs))

        # use at least half of the features
        self.min_features_ = int(0.5 * self.n_features_)

        # Validate max_features
        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            self.max_features_ = self.max_features
        else:  # float
            self.max_features_ = int(self.max_features * self.n_features_)

        # min_features and max_features could equal
        check_parameter(self.max_features_, low=self.min_features_,
                        param_name='max_features', high=self.n_features_,
                        include_left=True, include_right=True)

        self.estimators_ = []
        self.estimators_features_ = []

        # one independent seed per base estimator; n_estimators has been
        # validated as a positive integer by _validate_estimator above
        seeds = random_state.randint(MAX_INT, size=self.n_estimators)
        self._seeds = seeds

        for seed in seeds:
            random_state = np.random.RandomState(seed)

            # max_features is incremented by one since random
            # function is [min_features, max_features)
            features = generate_bagging_indices(random_state,
                                                self.bootstrap_features,
                                                self.n_features_,
                                                self.min_features_,
                                                self.max_features_ + 1)
            # initialize and append estimators
            estimator = self._make_estimator(append=False,
                                             random_state=random_state)
            estimator.fit(X[:, features])

            self.estimators_.append(estimator)
            self.estimators_features_.append(features)

        # decision score matrix from all estimators
        self.decision_scores_ = self._combine(self._get_decision_scores())

        self._process_decision_scores()

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. The number of features must match the data
            the detector was fitted on.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.

        Raises
        ------
        ValueError
            If the number of features of X differs from the fitted data.
        """
        check_is_fitted(self, ['estimators_', 'estimators_features_',
                               'decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)

        if self.n_features_ != X.shape[1]:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {0} and "
                             "input n_features is {1}."
                             "".format(self.n_features_, X.shape[1]))

        # base estimators are scored one after another (no parallel loop)
        return self._combine(self._predict_decision_scores(X))

    def _combine(self, scores):
        """Reduce an (n_samples, n_estimators) score matrix to one score per
        sample according to ``combination``."""
        if self.combination == 'average':
            return average(scores)
        # every other value falls back to the maximum
        return maximization(scores)

    def _predict_decision_scores(self, X):
        """Return the (n_samples, n_estimators) matrix of scores the fitted
        base estimators assign to X, each on its own feature subset."""
        # size the matrix by the fitted estimators, not by the n_estimators
        # hyper-parameter, which may have been changed after fit
        all_pred_scores = np.zeros([X.shape[0], len(self.estimators_)])
        for i, (estimator, features) in enumerate(
                zip(self.estimators_, self.estimators_features_)):
            all_pred_scores[:, i] = estimator.decision_function(
                X[:, features])
        return all_pred_scores

    def _get_decision_scores(self):
        """Return the (n_samples, n_estimators) matrix of training scores
        stored on the fitted base estimators."""
        all_decision_scores = np.zeros(
            [self.n_samples_, len(self.estimators_)])
        for i, estimator in enumerate(self.estimators_):
            all_decision_scores[:, i] = estimator.decision_scores_
        return all_decision_scores

    def _validate_estimator(self, default=None):
        """Check the estimator and the n_estimator attribute, set the
        `base_estimator_` attribute.

        Raises
        ------
        ValueError
            If ``n_estimators`` is not a positive integer or no base
            estimator is available.
        """
        if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
            raise ValueError("n_estimators must be an integer, "
                             "got {0}.".format(type(self.n_estimators)))

        if self.n_estimators <= 0:
            raise ValueError("n_estimators must be greater than zero, "
                             "got {0}.".format(self.n_estimators))

        if self.base_estimator is not None:
            self.base_estimator_ = self.base_estimator
        else:
            self.base_estimator_ = default

        if self.base_estimator_ is None:
            raise ValueError("base_estimator cannot be None")

        # make sure estimator is consistent with sklearn
        if self.check_estimator:
            check_estimator(self.base_estimator_)

    def _make_estimator(self, append=True, random_state=None):
        """Make and configure a copy of the `base_estimator_` attribute.

        sklearn/base.py

        Warning: This method should be used to properly instantiate new
        sub-estimators.

        Parameters
        ----------
        append : bool, optional (default=True)
            Whether the new estimator is appended to ``estimators_``.

        random_state : int, RandomState instance or None, optional
            If not None, used to seed the ``random_state`` parameters of
            the new estimator.

        Returns
        -------
        estimator : object
            The unfitted clone of ``base_estimator_``.
        """

        # TODO: add a check for estimator_param
        estimator = clone(self.base_estimator_)
        estimator.set_params(**self.estimator_params)

        if random_state is not None:
            _set_random_states(estimator, random_state)

        if append:
            self.estimators_.append(estimator)

        return estimator

    def __len__(self):
        """Returns the number of estimators in the ensemble."""
        return len(self.estimators_)

    def __getitem__(self, index):
        """Returns the index'th estimator in the ensemble."""
        return self.estimators_[index]

    def __iter__(self):
        """Returns iterator over estimators in the ensemble."""
        return iter(self.estimators_)
557 | 


--------------------------------------------------------------------------------