├── Algo Contrast ├── Pics │ ├── Oversample_contrast.jpg │ ├── f1_contrast.png │ ├── f1_contrast_v1.png │ ├── pul_three.jpg │ ├── semi_contrast.jpg │ ├── timecost_contrast.png │ ├── timecost_contrast_v1.png │ └── unsupervised_contrast.jpg ├── semi_detection_contrast.py └── unsupervised_detection_contrast.py ├── README.md ├── SemiSupervised-ADOA ├── Anomaly Detection with Partially Observed Anomalies.pdf ├── Pics │ ├── Isolation Score.jpg │ ├── Similarity Score.jpg │ ├── Total Score.jpg │ ├── adoa_zoom_in.jpg │ ├── 异常权重.jpg │ ├── 正常权重.jpg │ └── 结构风险.jpg ├── ReadMe.md ├── adoa.py └── cluster_centers.py ├── SemiSupervised-KADOA-Original ├── ReadMe.md ├── adoa_series_contrast.png ├── compare_adoa_kadoa.py └── kadoa.py ├── SemiSupervised-PU Learning ├── Papers │ ├── A Survey on Postive and Unlabelled Learning.pdf │ ├── Building Text Classifiers Using Positive and Unlabeled Examples.pdf │ ├── Learning From Positive and Unlabeled Data:A Survey.pdf │ ├── Learning from Positive and Unlabeled Examples with Different Data Distributions_2005_A_EM.pdf │ ├── Learning with Positive and Unlabeled Examples Using Weighted Logistic Regression.pdf │ ├── POSTER_ A PU Learning based System for Potential Malicious URL Detection.pdf │ └── Partially Supervised Classification of Text Documents.pdf ├── Pics │ ├── BiasedSVM.jpg │ ├── Incorporation of the Class Prior.jpg │ ├── Spy technique.jpg │ ├── minC.jpg │ ├── optimal.jpg │ ├── param.jpg │ ├── post_prob.jpg │ ├── relation.jpg │ ├── sample_ratio_rf.png │ ├── three cate.jpg │ └── three_cate.jpg ├── ReadMe.md ├── biased_svm.py ├── pu_learning.py └── weighted_lr.py ├── UnSupervised-Based on PCA ├── Papers │ ├── A Novel Anomaly Detection Scheme Based on Principal Component Classifier.pdf │ └── AI2 _ Training a big data machine to defend.pdf ├── Pics │ ├── classify_outlier.jpg │ ├── indices_max_decrease.jpg │ ├── last_pp.jpg │ ├── major_minor.jpg │ ├── max_ev_decrease.jpg │ ├── outliers_high_error.jpg │ ├── outlierscore.jpg │ └── recon_matrix.jpg ├── ReadMe.md ├── max_ev_decrease.py ├── recon_error_kpca.py ├── recon_error_pca.py ├── recon_error_pca_svd.py └── robustpcc.py ├── UnSupervised-Isolation Forest ├── Isolation Forest.pdf ├── IsolationForest.py ├── Pics │ ├── Algorithm2.jpg │ ├── Isolation Score.jpg │ └── fdfd └── ReadMe.md ├── UnSupervised-Local Outlier Factor ├── LOF:Identifying Density-Based Local Outliers.pdf ├── LocalOutlierFactor.py ├── Pics │ ├── 1.K-dist.jpg │ ├── 2.reach_dist.jpg │ ├── 3.lrd.jpg │ └── 4.LOF.jpg └── ReadMe.md └── UnSupervised-Mahalanobis Distance ├── Pics ├── Mahdist_verify_result.jpg ├── mahal_dist.jpg ├── 变体参数含义.jpg └── 马氏距离变体.jpg ├── ReadMe.md ├── mahal_dist.py ├── mahal_dist_variant.py └── verify_mahal_equivalence.py /Algo Contrast/Pics/Oversample_contrast.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/Algo Contrast/Pics/Oversample_contrast.jpg -------------------------------------------------------------------------------- /Algo Contrast/Pics/f1_contrast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/Algo Contrast/Pics/f1_contrast.png -------------------------------------------------------------------------------- /Algo Contrast/Pics/f1_contrast_v1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/Algo Contrast/Pics/f1_contrast_v1.png -------------------------------------------------------------------------------- /Algo Contrast/Pics/pul_three.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/Algo Contrast/Pics/pul_three.jpg -------------------------------------------------------------------------------- /Algo Contrast/Pics/semi_contrast.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/Algo Contrast/Pics/semi_contrast.jpg -------------------------------------------------------------------------------- /Algo Contrast/Pics/timecost_contrast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/Algo Contrast/Pics/timecost_contrast.png -------------------------------------------------------------------------------- /Algo Contrast/Pics/timecost_contrast_v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/Algo Contrast/Pics/timecost_contrast_v1.png -------------------------------------------------------------------------------- /Algo Contrast/Pics/unsupervised_contrast.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/Algo Contrast/Pics/unsupervised_contrast.jpg -------------------------------------------------------------------------------- /Algo Contrast/semi_detection_contrast.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import time 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from ADOA import ADOA 10 | from pu_learning import PUL as pul 11 | from coverage import weighted_coverage 12 | from xgboost import XGBClassifier 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.metrics import * 16 | from sklearn.svm import SVC 17 | 18 | 19 | # 函数generate_pudata用于生成适用于PU_Learning的数据集 20 | # 参数seed为随机数种子,positive_size表示P集在整个数据集中的占比 21 | 22 | def generate_pudata(seed, positive_size=0.25): 23 | rdg = np.random.RandomState(seed) 24 | # row, col分别为数据集的行数与列数 25 | row = rdg.randint(6000, 8000) 26 | col = rdg.randint(45, 55) 27 | 28 | # contamination为U集中正样本的占比 29 | contamination = rdg.uniform(0.025, 0.035) 30 | 31 | # p_num、u_num分别为P集、U集包含的样本数 32 | p_num = int(np.ceil(row * positive_size)) 33 | u_num = row - p_num 34 | 35 | # pos_u_num为U集中包含的正样本数 36 | pos_u_num = int(np.ceil(u_num * contamination)) 37 | # 将异常样本分为3个簇,分别服从卡方分布,标准伽马分布,指数分布 38 | pos_num = p_num + pos_u_num 39 | row_sub = pos_num // 3 40 | 41 | col_1 = int(col * 0.65) 42 | col_2 = col - col_1 43 | a = rdg.uniform(-1, 0, size=(row_sub, col_1)) 44 | b = rdg.rayleigh(1, size=(row_sub, col_2)) 45 | anomalies_1 = np.c_[a, b] 46 | anomalies_2 = rdg.uniform(0, 1, size=(row_sub, col)) 47 | anomalies_3 = rdg.exponential(2.5, size=(row_sub, col)) 
#1.5 #2 ADOA最佳 48 | anomalies_ = np.r_[anomalies_1, anomalies_2, anomalies_3] 49 | 50 | rd_indices = rdg.permutation(len(anomalies_)) 51 | anomalies_U = anomalies_[rd_indices[:pos_u_num]] 52 | 53 | # 生成最终的正样本集,由观测到的anomalies构成 54 | P = anomalies_[rd_indices[pos_u_num:]] 55 | 56 | # 生成最终的无标签样本集,其中包含contamination比例的正样本 57 | U_neg = rdg.rand(u_num-pos_u_num, col) 58 | U = np.r_[U_neg, anomalies_U] 59 | U_label = np.r_[np.zeros(len(U_neg)), np.ones(len(anomalies_U))] 60 | return P, U, U_label 61 | 62 | def model_perfomance(y_pred, y_prob, y_true): 63 | tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() 64 | recall = tp / (tp+fn) 65 | specificity = tn / (tn+fp) 66 | gmean = np.sqrt(recall * specificity) 67 | 68 | auc = roc_auc_score(y_true, y_prob) 69 | f_score = f1_score(y_true, y_pred) 70 | coverage = weighted_coverage(y_true, y_prob) 71 | acc = accuracy_score(y_true, y_pred) 72 | performance = [auc, coverage, f_score, gmean, recall, acc] 73 | return np.array(performance) 74 | 75 | 76 | rf = RandomForestClassifier(n_estimators=350, max_depth=6, random_state=2018) 77 | xgb = XGBClassifier(n_estimators=350, learning_rate=0.25, max_depth=6, n_jobs=-1, random_state=2018) 78 | svm = SVC(C=1.0, kernel='rbf', gamma='auto', probability=True, degree=3, random_state=2018) 79 | lr = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=1200, random_state=2019, n_jobs=-1) 80 | 81 | def performance_contrast(seed, cost_fn=1.5, cost_fp=1.0, clf_one=xgb, clf_two=rf): 82 | start = time.time() 83 | P, U, U_label = generate_pudata(seed) 84 | print('Seed:{:}, P Shape:{:}, U Shape:{:}'.format(seed, P.shape, U.shape)) 85 | 86 | # ADOA 87 | adoa = ADOA(P, U, clf_one) 88 | y_pred, y_prob = adoa.predict() 89 | adoa_performance = model_perfomance(y_pred, y_prob, U_label) 90 | 91 | # BiasedSVM 92 | X_train = np.r_[P, U] 93 | y_train = np.r_[np.ones(len(P)), np.zeros(len(U))] 94 | svm.fit(X_train, y_train, sample_weight=[cost_fn if i else cost_fp for i in y_train]) 95 | y_pred, y_prob = svm.predict(U), svm.predict_proba(U)[:, -1] 96 | svm_performance = model_perfomance(y_pred, y_prob, U_label) 97 | 98 | # Weighted LR 99 | pos_weight = len(U) / len(X_train) 100 | neg_weight = 1 - pos_weight 101 | lr.fit(X_train, y_train, sample_weight=[pos_weight if i else neg_weight for i in y_train]) 102 | y_pred, y_prob = lr.predict(U), lr.predict_proba(U)[:, -1] 103 | lr_performance = model_perfomance(y_pred, y_prob, U_label) 104 | 105 | # PUL CostSensitive 106 | pul_csl= pul(P, U, cost_fn=cost_fn, cost_fp=cost_fp, clf_one=clf_one, clf_two=clf_two, over_sample=False) 107 | y_pred, y_prob = pul_csl.predict() 108 | pul_csl_performance = model_perfomance(y_pred, y_prob, U_label) 109 | 110 | #pul_oversampled = pul(P, U, cost_fn=1, cost_fp=1, clf_one=clf_one, clf_two=clf_two, over_sample=True) 111 | #y_pred, y_prob = pul_oversampled.predict() 112 | #pul_sampled_performance = model_perfomance(y_pred, y_prob, U_label) 113 | 114 | metrics = ['AUC', 'Coverage', 'F1_Score', 'G_Mean', 'Recall', 'ACC'] 115 | models = ['ADOA', 'Biased_SVM', 'Weighted_LR', 'PUL_CostSensitive'] 116 | list_ = [adoa_performance, svm_performance, lr_performance, pul_csl_performance] 117 | performance = pd.DataFrame(list_, columns=metrics, index=models) 118 | 119 | algorithms = [performance[i].idxmax() for i in metrics] 120 | performance.loc['The Best Algorithm', :] = algorithms 121 | print(algorithms) 122 | 123 | decription = 'The evaluation of the algorithm has been completed.' 
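# Report how long evaluating all four models took on this randomly generated dataset.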
124 | print(decription, 'Running_Time:{:.2f}s\n'.format(time.time()-start)) 125 | return performance 126 | 127 | 128 | seeds = np.random.RandomState(2019).choice(range(1000), size=10, replace=False) 129 | contrast = [performance_contrast(seed) for seed in seeds] 130 | contrast_concat = np.concatenate([contrast[i] for i in range(len(contrast))]) 131 | 132 | data_names = np.array([['Dataset_' + str(i)]*5 for i in range(len(seeds))]).ravel() 133 | models = ['ADOA', 'Biased_SVM', 'Weighted_LR', 'PUL_CostSensitive', 'The Best Algorithm'] * len(seeds) # CostSensitive 134 | metrics = ['AUC', 'Coverage', 'F1_Score', 'G_Mean', 'Recall', 'ACC'] 135 | arrays = [data_names, models] 136 | idx = pd.MultiIndex.from_arrays(arrays, names=('VerifyData', 'Algorithm')) 137 | contrast_result = pd.DataFrame(contrast_concat, index=idx, columns=metrics) 138 | print(contrast_result) 139 | 140 | best_algo = contrast_result.query("Algorithm == 'The Best Algorithm'") 141 | 142 | # 处理某列出现多个等频次的众数的情况 143 | def real_mode(df): 144 | mode = df.mode(axis=0) 145 | if len(mode) == 1: 146 | return mode.values.ravel() 147 | else: 148 | target_idx = np.where(df.columns==mode.notnull().sum().idxmax())[0][0] 149 | target_name = df.columns[target_idx] 150 | target_col = mode.iloc[:, target_idx] 151 | 152 | target_df = contrast_result[target_name].swaplevel() 153 | mean_value = [target_df[model].mean() for model in target_col] 154 | idx_max = np.argmax(mean_value) 155 | 156 | # 去掉first_row中在target_idx索引处的值,成为first_row_trimmed 157 | first_row = mode.iloc[0, :] 158 | first_row[target_idx] = target_col[idx_max] 159 | return first_row.values 160 | 161 | algo_best = best_algo.copy() 162 | algo_best.loc[('All Datesets', 'Algorithm Mode(众数)'), :] = real_mode(algo_best) 163 | print(algo_best) 164 | 165 | 166 | # 对众数标黄,仅对Jupyter有效 167 | def show(row): 168 | color = 'yellow' 169 | return 'background-color: %s' % color 170 | 171 | algo_best.style.applymap(show, subset=pd.IndexSlice[('All Datesets', 'Algorithm Mode(众数)'):, :]) 172 | -------------------------------------------------------------------------------- /Algo Contrast/unsupervised_detection_contrast.py: -------------------------------------------------------------------------------- 1 | # Author:Maxiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import time 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sns 9 | 10 | import mahal_dist as md 11 | import RobustPCC as rp 12 | import PCA_Recon_Error as rep 13 | import KPCA_Recon_Error as rek 14 | 15 | from sklearn.metrics import * 16 | from sklearn.datasets import * 17 | from sklearn.ensemble import IsolationForest 18 | from sklearn.neighbors import LocalOutlierFactor 19 | 20 | from matplotlib import pyplot as plt 21 | from pandas.plotting import parallel_coordinates 22 | %matplotlib inline 23 | 24 | def predict_anomaly_indices(X, contamination): 25 | 26 | # 孤立森林 27 | iforest = IsolationForest(n_estimators=125, contamination=contamination, 28 | behaviour='new', random_state=2018, n_jobs=-1) 29 | # Returns -1 for outliers and 1 for inliers. 30 | iforest_pred = iforest.fit_predict(X) 31 | iforest_result = np.array([1 if pred==-1 else 0 for pred in iforest_pred]) 32 | 33 | # LOF 34 | lof = LocalOutlierFactor(contamination=contamination, p=2, novelty=False, n_jobs=-1) 35 | # Returns -1 for outliers and 1 for inliers. 
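# With novelty=False, LocalOutlierFactor is intended for outlier detection on the data it is
# fitted to, so fit_predict is used directly; contamination sets the fraction flagged as -1.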
36 | lof_pred = lof.fit_predict(X) 37 | lof_result = np.array([1 if pred==-1 else 0 for pred in lof_pred]) 38 | 39 | # 马氏距离 40 | dist = md.mahal_dist(X) 41 | anomaly_num = int(np.ceil(contamination * len(X))) 42 | md_idx = np.argsort(-dist)[:anomaly_num] 43 | mahal_result = np.array([1 if i in md_idx else 0 for i in range(len(X))]) 44 | 45 | # RobustPCC 46 | rpcc = rp.RobustPCC(X, X, gamma=0.01, quantile=99, contamination=contamination) 47 | rpcc_result = rpcc.predict() 48 | 49 | #LinearPCA重构 50 | pre = rep.PCA_Recon_Error(X, contamination=contamination) 51 | pre_result = pre.predict() 52 | 53 | ##KernelPCA重构 54 | kre = rek.KPCA_Recon_Error(X, contamination=contamination, kernel='linear') 55 | print('KernelPCA starts.') 56 | start = time.time() 57 | kre_result = kre.predict() 58 | end = time.time() 59 | print("KernelPCA cost time: {:.2f}s".format(end-start)) 60 | 61 | anomaly_pred = [iforest_result, lof_result, mahal_result, pre_result, kre_result, rpcc_result] 62 | return np.array(anomaly_pred) 63 | 64 | 65 | def evaluate_model(y_true, y_pred): 66 | assert len(y_true) == len(y_pred) 67 | acc = accuracy_score(y_true, y_pred).round(4) 68 | f1 = f1_score(y_true, y_pred).round(4) 69 | recall = recall_score(y_true, y_pred).round(4) 70 | precision = precision_score(y_true, y_pred).round(4) 71 | decription = 'F1:{:.3f}, ACC:{:.3F}, Recall:{:.3f}, Precision:{:.3f}' 72 | df_temp = pd.DataFrame([f1, acc, recall, precision]).T 73 | df_temp.columns = ['F1', 'ACC', 'Recall', 'Precision'] 74 | return df_temp 75 | 76 | 77 | def contrast_models(X, y_true, metric=['f1']): 78 | contamination = sum(y_true) / len(X) 79 | anomaly_pred = predict_anomaly_indices(X, contamination) 80 | df_res = pd.concat([evaluate_model(y_true, i) for i in anomaly_pred]) 81 | df_res.index = ['Isolation Forest', 'LOF', 'Mahalanobis Dist', 'PCA_Recon_Error', 'KPCA_Recon_Error', 'Robust PCC'] 82 | cols1 = np.array(['f1', 'acc', 'recall', 'precision']) 83 | cols2 = np.array(['F1', 'ACC', 'Recall', 'Precision']) 84 | display_metrics = cols2[[np.argwhere(cols1==i)[0][0] for i in metric]] 85 | return pd.DataFrame(df_res.loc[:, display_metrics]).T 86 | 87 | 88 | def generate_dataset(seed): 89 | rdg = np.random.RandomState(seed) 90 | row = rdg.randint(2500, 3000) #rdg.randint(2500, 3000) 91 | col = rdg.randint(30, 35) 92 | contamination = rdg.uniform(0.015, 0.025) 93 | 94 | outlier_num = int(row*contamination) 95 | inlier_num = row - outlier_num 96 | 97 | # 正常样本集服从标准正态分布 98 | inliers = rdg.randn(inlier_num, col) 99 | 100 | # 如果outlier_num为奇数,row_1=outlier_num//2,否则row_1=int(outlier_num/2) 101 | row_1 = outlier_num//2 if np.mod(outlier_num, 2) else int(outlier_num/2) 102 | row_2 = outlier_num - row_1 103 | 104 | # outliers_sub_1服从伽玛分布;outliers_sub_2服从指数分布 105 | outliers_sub_1 = rdg.gamma(shape=2, scale=0.5, size=(row_1 , col)) 106 | outliers_sub_2 = rdg.exponential(1.5, size=(row_2, col)) 107 | outliers = np.r_[outliers_sub_1, outliers_sub_2] 108 | 109 | # 将inliers与outliers在axis=0方向上予以整合,构成实验数据集 110 | X = np.r_[inliers, outliers] 111 | y = np.r_[np.zeros(len(inliers)), np.ones(len(outliers))] 112 | return X, y 113 | 114 | seeds = np.random.RandomState(2018).choice(range(1000), size=10, replace=False) 115 | datasets = [generate_dataset(seed) for seed in seeds] 116 | 117 | 118 | def get_metric_df(datasets, metric): 119 | df_metrics = pd.concat([contrast_models(i[0], i[1], metric=metric) for i in datasets]) 120 | df_metrics['dataset'] = np.array([['Dataset_' + str(i)]*len(metric) for i in range(len(datasets))]).ravel() 121 | return 
df_metrics 122 | 123 | 124 | def plot_parallel(df): 125 | plt.figure(figsize=(12, 6)) 126 | plt.title(df.index[0]+' score of different algorithms', fontsize=15) 127 | parallel_coordinates(df, 'dataset') 128 | plt.grid(lw=0.1) 129 | plt.legend(loc=4) 130 | plt.ylabel(df.index[0], fontsize=14) 131 | plt.show() 132 | 133 | df_metrics = get_metric_df(datasets, ['f1', 'acc', 'recall', 'precision']) 134 | plot_parallel(df_metrics.loc['F1']) 135 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Anomaly-Detection 2 | - **Author:** MaXiao 3 | - **E-Mail:** maxiaoscut@aliyun.com 4 | - **备注:** 若文档无法正常显示图片,请参考右方链接: [github图片不显示的问题](https://zhuanlan.zhihu.com/p/107196957) 5 | --- 6 | 7 | # 第一部分:无监督异常检测 (Unsupervised Detection) 8 | ## 1. 算法 9 | ### 1.1 Isolation Forest 10 | - **算法论文:** [Isolation Forest [Liu et.al, 2008]](https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf) 11 | - **算法解析:** [Isolation Forest算法解析](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Isolation%20Forest/ReadMe.md) 12 | - **算法应用:** [isolationforest.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Isolation%20Forest/IsolationForest.py) 13 | 14 | ### 1.2 基于PCA的异常检测 15 | - **方法1:基于样本的重构误差** 16 | - **算法论文:** [AI^2 : Training a big data machine to defend [Veeramachaneni et.al, 2016]](https://people.csail.mit.edu/kalyan/AI2/) 17 | - **算法解析:** [Chapter 1:基于样本的重构误差](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/ReadMe.md#chapter-1基于样本的重构误差) 18 | 19 | - **算法实现** 20 | - **基于KernelPCA重构误差的异常检测:** [recon_error_kpca.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/recon_error_kpca.py) 21 | - **基于LinearPCA重构误差的异常检测:** [recon_error_pca.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/recon_error_pca.py) 22 | - **只调用Numpy实现LinearPCA异常检测:** [recon_error_pca_svd.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/recon_error_pca_svd.py) 23 | - 不调用scikit-learn,只调用Numpy,通过矩阵的奇异值分解(SVD)实现PCA,再进行异常检测 24 | - 返回结果与recon_error_pca.py完全一致 25 | 26 | - **方法2:基于样本在Major/Minor主成分上的偏差** 27 | - **算法论文:** [A Novel Anomaly Detection Scheme Based on Principal Component [Shyu et.al, 2003]](https://cn.bing.com/academic/profile?id=6ffacfce89595db316f3fd3bfeea1c1e&encoded=0&v=paper_preview&mkt=zh-cn) 28 | - **算法解析:** [Chapter 2:基于样本在major/minor主成分上的偏离程度](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/ReadMe.md#chapter-2基于样本在majorminor主成分上的偏离程度) 29 | 30 | - **算法实现:** [robustpcc.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/robustpcc.py) 31 | 32 | - **术语定义** 33 | - **major_eigen_vectors:将特征值降序排列后,累计之和占比约50%的前若干个特征值对应的特征向量** 34 | - refers to the eigenvectors corresponding to the first few eigenvalues whose cumulative eigenvalues account for about 50% after the eigenvalues are arranged in descending order 35 | - **minor_eigen_vectors:特征值小于0.2对应的特征向量** 36 | - refers to the eigenvectors corresponding to the eigenvalue less than 0.2 37 | 38 | - **实证分析: 异常样本在最前、最后的若干主成分上具有最大的方差** 39 | 40 | - **分析:** [Chapter 3. 
实证分析:异常样本在最前、最后的若干主成分上具有最大的方差](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/ReadMe.md#chapter-3-实证分析异常样本在最前最后的若干主成分上具有最大的方差) 41 | - **验证代码:** [max_ev_decrease.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/max_ev_decrease.py) 42 | - **验证结果:** 43 | - Multiple random data sets prove that abnormal samples have the maximum variance on the first and last principal components. 44 | 45 |  46 | 47 | 48 | ### 1.3 马氏距离(Mahalabonas Distance) 49 | - **算法解析:** 50 | - [Mahalanobis Distance](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Mahalanobis%20Distance/ReadMe.md#1-马氏距离) 51 | - [Mahalanobis Distance's Variant](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Mahalanobis%20Distance/ReadMe.md#3-马氏距离的变体及其代码实现) 52 | - In fact, the square of Mahalanobis distance is equal to the variation of Mahalanobis distance. 53 | 54 | - **算法实现:** 55 | - **马氏距离的初始定义实现:** [mahal_dist.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Mahalanobis%20Distance/mahal_dist.py) 56 | - **马氏距离的变体实现:** [mahal_dist_variant.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Mahalanobis%20Distance/mahal_dist_variant.py) 57 | 58 | - **实证分析:** 59 | - **结论:** 马氏距离的平方等于马氏距离的变体 60 | - **Conclusion:** the square of Mahalanobis distance is equal to the variation of Mahalanobis distance 61 | - **推论:** 马氏距离及其变体对样本的异常程度有一致的判定 62 | - **Inference:** Mahalanobis distance and its variants are consistent in determining the abnormal degree of the sample 63 | - **验证代码:** [verify_mahal_equivalence.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Mahalanobis%20Distance/verify_mahal_equivalence.py) 64 | - **验证结果:** 65 | 66 |  67 | 68 | ### 1.4 局部异常因子(Local Outlier Factor) 69 | - **算法论文:** [LOF:Identifying Density-Based Local Outliers](https://cn.bing.com/academic/profile?id=95956f2ccd5a6941f3e71ccfb2988419&encoded=0&v=paper_preview&mkt=zh-cn) 70 | - **算法解析:** [Local Outlier Factor算法解析](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Local%20Outlier%20Factor/ReadMe.md) 71 | - **算法应用:** [lof.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Local%20Outlier%20Factor/LocalOutlierFactor.py) 72 | 73 | --- 74 | 75 | ## 2. 
性能对比 76 | ### 2.1 对比方案 77 | - **步骤一:** 生成一系列随机数据集,每个数据集的行数(row)、列数(col)、污染率(contamination)均从某区间内随机抽取 78 | - **步骤二:** 各个无监督异常检测算法根据指定的contamination返回异常样本的索引(anomalies_indices) 79 | - **步骤三:** 确定baseline 80 | - 如果数据集中异常样本的索引已知(记为observed_anomaly_indices),则以此作为baseline 81 | - 如果数据集中异常样本的索引未知,则以Isolation Forest返回的异常样本索引作为baseline 82 | - **步骤四:** 确定性能评判标准 83 | - 若异常样本的索引已知(记为observed_anomaly_indices),则以F1-Score作为评判标准 84 | - 若异常样本的索引未知,则比较算法预测的异常样本索引与baseline的共同索引个数,个数越多则认为效果相对越好 85 | - **步骤五:** 不同的数据集对异常检测算法的性能可能会有不同的评估,因此可生成多个数据集来判定各算法的性能 86 | 87 | ### 2.2 对比代码 88 | - **Python代码:** [unsupervised_detection_contrast.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/Algo%20Contrast/unsupervised_detection_contrast.py) (Jupyter交互式运行代码能更直观地展示验证过程) 89 | 90 | ### 2.3 对比结果 91 | - **根据算法在特定数据集上的异常检测性能降序排列,10个随机数据集的对比结果如下图所示:** 92 | - **F1 Score** 93 | 94 |  95 | 96 | - **Time Cost** 97 | 98 |  99 | 100 | 101 | ### 2.4 对比分析 102 | #### 1)RobustPCC 103 | - RobustPCC重点考察了样本在major/minor Principal Component上的偏差,论文作者认为异常样本在主成分空间内的投影主要集中在上述两类主成分上 104 | - RobustPCC在构建过程中,需要通过马氏距离(变体)检测并剔除数据集中一定比例(gamma)的潜在异常样本,以保证RobustPCC的有效性 105 | - RobustPCC需要根据指定的分位点参数(quantile)来设定样本异常与否的阈值,**个人在实验中适度增大了gamma、quantile的取值,进一步降低FPR,提升鲁棒性** 106 | - 实验结果表明,RobustPCC具有优良的异常检测性能 107 | 108 | #### 2)Recon_Error_PCA/KPCA (Reconstruction Error Based on PCA/KernelPCA) 109 | - Recon_Error_KPCA引入核函数(对比实验取Linear、RBF),无需显式定义映射函数,通过Kernel Trick计算样本在高维特征空间(希尔伯特空间)内的重构误差; 110 | - KernelPCA的核函数需要根据数据集进行调整,在核函数适宜的情况下,高维(或无穷维)主成分空间对样本具有更强的表出能力 111 | - 低维空间内线性不可分的异常样本在高维空间内的投影将显著区别于正常样本; 112 | - 相应地,异常样本在高维(或无穷维)主成分空间内的重构误差将明显区分于正常样本; 113 | 114 | #### 3)Isolation Forest 115 | - Isolation Forest(孤立森林)表现稳定,在验证数据的异常索引未知情况下,个人将其预测值作为baseline,用于衡量其它算法的性能 116 | 117 | #### 4)Mahalabonas Distance 118 | - Mahalabonas Distance(马氏距离)实际上考虑了样本在所有主成分上的偏离度,检测性能紧跟Recon_Error_KPCA之后 119 | 120 | #### 5)Local Outlier Factor 121 | - LOF考虑了局部相邻密度,它存在一定的局限性:对于相关性结构较特殊的异常样本(anomalies in terms of different correlation structures)的检测能力不足 122 | 123 | #### **备注** 124 | - **上述实验结论受到实验数据集的样本构成、样本数量等多方面因素的影响,不一定具备普适性** 125 | - **在实际运用中,需要根据数据集本身的特点予以灵活选择相应的异常检测算法** 126 | 127 | --- 128 | 129 | # 第二部分:半监督异常检测 (Semi-supervised Detection) 130 | ## 1. 
算法 131 | ### 1.1 算法一:ADOA 132 | - **算法论文:** [Anomaly Detection with Partially Observed Anomalies [Zhang et.al]](https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/www18bw.pdf) 133 | - **算法解读:** [ADOA算法解读](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-ADOA/ReadMe.md) 134 | - **算法实现:** [adoa.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-ADOA/adoa.py) 【其中包含:用于返回聚类中心子模块 [cluster_centers.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-ADOA/cluster_centers.py)】 135 | 136 |  137 | 138 | ### 1.2 算法二: 个人原创 KADOA (personal originality) 139 | - **思路简介** 140 | - ADOA采用孤立森林与聚类相结合,KADOA运用KernelPCA重构误差替代孤立森林进行异常检测,其它思路与ADOA一致 141 | 142 | - **KADOA代码** 143 | - [kadoa.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-KADOA-Original/kadoa.py) 144 | 145 | - **KADOA与ADOA的性能对比** 146 | - **对比代码:** [compare_adoa_kadoa.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-KADOA-Original/compare_adoa_kadoa.py) 147 | - **对比结果:** 在数据集、参数设置完全一致的情况下,KADOA的性能显著优于ADOA,但此结论有待更多数据集予以验证 148 | 149 |  150 | 151 | ### 1.3 算法二:PU Learning 152 | #### 1) PU Learning三大处理方法:[PU Learning三大处理方法详细解读](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-PU%20Learning/ReadMe.md) 153 | 154 | - **方法一:Two step techniques** 155 | - **第一步:** 从U集中筛选可靠负样本RN(identifying reliable negative examples) 156 | - **第二步:** 基于已知正样本P、可靠负样本RN训练分类器(learning based on the labeled positives and reliable negatives) 157 | 158 |  159 | 160 | - **方法二:Biased Learning** 161 | - 将U集视为带有噪音(即正样本)的负样本集(treat the unlabeled examples as negatives examples with class label noise) 162 | - 将正样本P、负样本U分别赋予相对更高、更低的正则化参数,使得混杂在U的noise允许被误分,这些被误分的noise实际上是真实的正样本 163 | - 常见的算法是Biased SVM,其优化问题如下 164 | 165 |  166 | 167 | - **方法三:Class Prior Incorporation** 168 | - Class prior incorporation modifies standard learning methods by applying the mathematics from the SCAR(Selected Completely At Random) assumption directly, using the provided class prior. 
Additionally, methods for learning from relational PU data are discussed 169 | 170 |  171 | 172 | #### 2) 思路详解与代码实现 173 | - **思路一:Two Step Strategy + Cost-Sensitive Learning** 174 | - **算法论文:** [POSTER_ A PU Learning based System for Potential Malicious URL Detection [Zhang et.al]](https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/ccs17poster.pdf) 175 | 176 | - **算法解读:** [Two Step Strategy与 Cost-Sensitive Learning的结合](http://note.youdao.com/noteshare?id=594e87d6ac9e03d0bb461eb5160ffd7b&sub=781142FA6A634B44B3837CADCCCF74AC) 177 | - **Two Step Strategy:** [Two Step Strategy详解](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-PU%20Learning/ReadMe.md#1-方法一two-step-strategy) 178 | - **Cost-Sensitive Learning:** [Cost-Sensitive Learning详解](https://github.com/Albertsr/Class-Imbalance/blob/master/ReadMe.md) 179 | 180 | - **算法实现:** [pu_learning.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-PU%20Learning/pu_learning.py) 181 | - **对sample_ratio的研究:** [sample_ratio](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-PU%20Learning/ReadMe.md#附录关于spy-technique中抽样比例sample_ratio的总结) 182 | 183 | - **思路二:Biased Learning** 184 | - **算法论文:** [Building Text Classifiers Using Positive and Unlabeled Examples [Liu et.al]](https://cn.bing.com/academic/profile?id=1252dfd9254eaa6059c5a1202548ee40&encoded=0&v=paper_preview&mkt=zh-cn) 185 | - **算法解读:** [Biased Learning解读](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-PU%20Learning/ReadMe.md#2-方法二biased-learning) 186 | - **算法实现:** [biased_svm.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-PU%20Learning/biased_svm.py) 187 | 188 | 189 | - **思路三:Class Prior Incorporation** 190 | - **算法论文:** [Learning with Positive and Unlabeled Examples Using Weighted Logistic Regression [Lee et.al]](https://cn.bing.com/academic/profile?id=b4da94afa8e9a1e8d33ac97332c98b64&encoded=0&v=paper_preview&mkt=zh-cn) 191 | - **算法解读:** [Class Prior Incorporation解读](https://github.com/Albertsr/Anomaly-Detection/tree/master/SemiSupervised-PU%20Learning#3-方法三class-prior-incorporation) 192 | - **算法实现:** [weighted_lr.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-PU%20Learning/weighted_lr.py) 193 | 194 | --- 195 | 196 | ## 2. 
性能对比 197 | ### 2.1 对比算法 198 | - **算法一:ADOA** 199 | - **算法二:Biased SVM** 200 | - **算法三:Weighted LR** 201 | - **算法四:PU Learning + Cost-Sensitive Learning** 202 | 203 | --- 204 | 205 | ### 2.2 模型评估指标 206 | - **选取的模型评估指标:** AUC、F1_Score、**Coverage**、**G-Mean**、Recall、ACC 207 | 208 | - **Coverage详解** 209 | - **出处:** [蚂蚁金服-风险大脑-支付风险识别大赛(第一赛季)](https://dc.cloud.alipay.com/index#/topic/data?id=4) 210 | 211 | - **代码实现:** [coverage.py](https://github.com/Albertsr/Class-Imbalance/blob/master/5.%20Appropriate%20Metrics/coverage.py) 212 | 213 | - **定义:** 214 |  215 | 216 | 217 | - **G-Mean** 218 | - **出处:** [Addressing the Curse of Imbalanced Training Sets: One-Sided Selection [Miroslav Kubat, Stan Matwin; 1997]](https://cn.bing.com/academic/profile?id=32c7b83b5988bbcad21fdeb24360d5c4&encoded=0&v=paper_preview&mkt=zh-cn) 219 | 220 | - **代码实现:** [gmean.py](https://github.com/Albertsr/Class-Imbalance/blob/master/5.%20Appropriate%20Metrics/gmean.py) 221 | 222 | - **定义:** 223 | 224 |  225 | 226 | 227 | ### 2.3 对比方案与代码 228 | - **对比代码:** [semi_detection_contrast.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/Algo%20Contrast/semi_detection_contrast.py) 229 | 230 | - **对比思路:** 231 | - **步骤一:** 生成一系列随机数据集,其中包含已观察到的**异常样本集(记为正样本集P)**,**无标签样本集(记为U)** 232 | 233 | - **步骤二:** 各个半监督异常检测算法**对U集进行预测并返回预测值y_pred** 234 | 235 | - **步骤三:** 生成U集时,其真实标签y_true是已知的,**根据y_true、y_pred计算半监督异常检测算法的性能** 236 | 237 | - **步骤四:** 不同的模型评估指标、不同的数据集对算法的性能有不同的评估,因此**根据多个随机数据返回多个模型评估指标对应的值,再进行比较** 238 | 239 | 240 | ### 2.4 验证结果 241 | 242 | - **对比结果:** 243 | - 备注:每一列表示以对应列名为模型评估指标时,在相应数据集上表现最优的算法 244 | - 示例:第1列以AUC作为评估指标,根据10个随机数据集的结果取众数,Biased_SVM的表现是最佳的 245 | 246 |  247 | 248 | - **解析** 249 | - **对比实验证明:各半监督异常检测算法均有各自的优势,但PUL CostSensitive的Recall最高,表明FN的高代价起到了一定效果** 250 | 251 | - **备注** 252 | - **上述实验结论受到实验数据集的样本构成、样本数量等多方面因素的影响,不一定具备普适性** 253 | - **在实际运用中,需要根据数据集本身的特点予以灵活选择相应的异常检测算法** 254 | -------------------------------------------------------------------------------- /SemiSupervised-ADOA/Anomaly Detection with Partially Observed Anomalies.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-ADOA/Anomaly Detection with Partially Observed Anomalies.pdf -------------------------------------------------------------------------------- /SemiSupervised-ADOA/Pics/Isolation Score.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-ADOA/Pics/Isolation Score.jpg -------------------------------------------------------------------------------- /SemiSupervised-ADOA/Pics/Similarity Score.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-ADOA/Pics/Similarity Score.jpg -------------------------------------------------------------------------------- /SemiSupervised-ADOA/Pics/Total Score.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-ADOA/Pics/Total Score.jpg -------------------------------------------------------------------------------- /SemiSupervised-ADOA/Pics/adoa_zoom_in.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-ADOA/Pics/adoa_zoom_in.jpg -------------------------------------------------------------------------------- /SemiSupervised-ADOA/Pics/异常权重.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-ADOA/Pics/异常权重.jpg -------------------------------------------------------------------------------- /SemiSupervised-ADOA/Pics/正常权重.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-ADOA/Pics/正常权重.jpg -------------------------------------------------------------------------------- /SemiSupervised-ADOA/Pics/结构风险.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-ADOA/Pics/结构风险.jpg -------------------------------------------------------------------------------- /SemiSupervised-ADOA/ReadMe.md: -------------------------------------------------------------------------------- 1 | ## ADOA :Anomaly Detection with Partially Observed Anomalies 2 | 3 | ## 1. 论文地址与代码实现 4 | - **论文地址:** [Anomaly Detection with Partially Observed Anomalies](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-ADOA/Anomaly%20Detection%20with%20Partially%20Observed%20Anomalies.pdf) 5 | 6 | - **Python实现:** [ADOA](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-ADOA/ADOA.py) 7 | 8 | - **计算聚类中心的子模块:** [cluster_centers](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-ADOA/cluster_centers.py) 9 | 10 | --- 11 | 12 | ## 2. ADOA的适用场景 13 | - 在只有极少量的已知异常样本(Partially Observed Anomalies)和大量的无标记数据(Unable Observations)的情况下,来进行的异常检测问题 14 | 15 | --- 16 | 17 | ## 3. 无监督方法、监督方法与PU Learning的弊端 18 | 19 | - **若简单的形式化为无监督学习**:丢弃已有的部分标记信息会带来信息的极大损失,且效果不理想 20 | 21 | - **若将无标记的数据完全当作正常样本**:采用监督学习的模型来处理,则会因为引入的大量噪音导致效果欠佳 22 | 23 | - **PU Learning**:适用于异常值基本相似的场景,而异常样本往往千差万别,因此PU Learning的应用受到限制 24 | 25 | --- 26 | 27 | ## 4. ADOA的处理过程 28 | 29 | #### 4.1 阶段一:对已知异常样本聚类,并从无标签样本中过滤出潜在异常样本(Potential anomalies)**以及**可靠正常样本(Reliable Normals) 30 | 31 | - 对于已知的异常样本进行聚类,聚类后的每一簇之间具有较高的相似性 32 | 33 | - 对于异常样本而言,一方面,它有着容易被隔离的特点,另一方面,它往往与某些已知的异常样本有着较高的相似性 34 | 35 | - 计算无标记样本的**隔离得分(Isolation Score)** 36 |  37 | 38 | 39 | - 计算无标记样本与异常样本簇的**相似得分(Similarity Score)** 40 | 41 |  42 | 43 | - 计算一个样本的**异常程度总得分** 44 | 45 |  46 | 47 | --- 48 | 49 | #### 4.2 阶段二:构建带权重的多分类模型 50 | 51 | - 令所有已知的异常样本的权重为1 52 | - 对于潜在异常样本,其TS(x)越高,则其作为异常样本的置信度越高,权重越大 53 | 54 |  55 | 56 | - 对于可靠正常样本,其TS(x)越低,则其作为正常样本的置信度越高,权重越大 57 |  58 | 59 | --- 60 | 61 | ## 5. 
构建分类模型 62 | 63 | #### 5.1 目标函数与结构风险最小化 64 |  65 | 66 | #### 5.2 异常样本的判定 67 | - 对于未来的待预测样本,通过该模型预测其所属类别,若样本被分类到任何异常类,则将其视为异常样本,否则,视为正常样本 68 | 69 | --- 70 | -------------------------------------------------------------------------------- /SemiSupervised-ADOA/adoa.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | 4 | import numpy as np 5 | from sklearn.ensemble import IsolationForest 6 | from cluster_centers import get_cluster_centers 7 | from sklearn.preprocessing import StandardScaler, minmax_scale 8 | 9 | class ADOA: 10 | """Implementation of ADOA (Anomaly Detection with Partially Observed Anomalies)""" 11 | def __init__(self, anomalies, unlabel, classifer, cluster_algo='kmeans', n_clusters='auto', 12 | contamination=0.01, theta=0.85, alpha='auto', beta='auto', return_proba=False, 13 | random_state=2018): 14 | ''' 15 | :param anomalies: Observed anomaly data sets 16 | 17 | :param unlabel: Unlabeled data sets. 18 | 19 | :param classifer: A Classifer fitting weighted samples and labels to predict unlabel samples. 20 | 21 | :param cluster_algo: str, {'kmeans'、'spectral'、'birch'、'dbscan'}, default = 'kmeans' 22 | Clustering algorithm for clustering anomaly samples. 23 | 24 | :param n_clusters: int, default=5 25 | The number of clusters to form as well as the number of centroids to generate. 26 | 27 | :param contamination : float, range (0, 0.5). 28 | The proportion of outliers in the data set. 29 | 30 | :param theta : float, range [0, 1]. 31 | The weights of isolation_score and similarity_score are theta and 1-theta respectively. 32 | 33 | :param alpha : float, should be positive number, default = mean value of anomalies's score 34 | Threshold value for determining unlabel sample as potential anomaly 35 | 36 | :param beta : float, should be positive number 37 | Threshold value for determining unlabel sample as reliable normal sample 38 | 39 | :param return_proba : bool, default=False 40 | Whether return the predicted probability for positive(anomaly) class for each sample. 41 | Need classifer to provide predict_proba method. 42 | ''' 43 | dataset_scaled = StandardScaler().fit_transform(np.r_[anomalies, unlabel]) 44 | self.anomalies = dataset_scaled[:len(anomalies), :] 45 | self.unlabel = dataset_scaled[len(anomalies):, :] 46 | self.contamination = contamination 47 | self.classifer = classifer 48 | self.n_clusters = n_clusters 49 | self.cluster_algo = cluster_algo 50 | self.theta = theta 51 | self.alpha = alpha 52 | self.beta = beta 53 | self.return_proba = return_proba 54 | self.random_state = random_state 55 | self.centers, self.cluster_score = get_cluster_centers(self.anomalies, self.n_clusters, self.cluster_algo) 56 | 57 | def cal_weighted_score(self): 58 | dataset = np.r_[self.anomalies, self.unlabel] 59 | iforest = IsolationForest(n_estimators=100, contamination=self.contamination, 60 | random_state=self.random_state, n_jobs=-1) 61 | iforest.fit(dataset) 62 | # Paper:The higher is the score IS(x) (close to 1), the more likely that x being an anomaly. 63 | # Scikit-learn API : decision_function(X): The lower, the more abnormal. 
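# Negating decision_function flips the ordering so that larger values mean more anomalous,
# matching the paper's IS(x); minmax_scale then maps the scores into [0, 1] before they are
# blended with the similarity score via theta.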
64 | isolation_score = -iforest.decision_function(dataset) 65 | isolation_score_scaled = minmax_scale(isolation_score) 66 | 67 | def cal_similarity_score(arr, centers=self.centers): 68 | min_dist = np.min([np.square(arr - center).sum() for center in centers]) 69 | similarity_score = np.exp(-min_dist/len(arr)) 70 | ''' 71 | In the paper, when calculating similarity_score, min_dist is not divided by the number of features 72 | (len(arr)), but when the number of features is large, the value of np.exp(min_dist) is very large, 73 | so that similarity_score is close to 0, which lacks weighted meaning. Dividing by the number of 74 | features helps to alleviate this phenomenon and does not affect the ordering of similarity_score. 75 | ''' 76 | return similarity_score 77 | similarity_score = [cal_similarity_score(arr) for arr in dataset] 78 | similarity_score_scaled = minmax_scale(similarity_score) 79 | weighted_score = self.theta * isolation_score_scaled + (1-self.theta) * similarity_score_scaled 80 | return weighted_score 81 | 82 | def determine_trainset(self): 83 | weighted_score = self.cal_weighted_score() 84 | min_score, max_score, median_score = [func(weighted_score) for func in (np.min, np.max, np.median)] 85 | anomalies_score = weighted_score[:len(self.anomalies)] 86 | unlabel_scores = weighted_score[len(self.anomalies):] 87 | 88 | # determine the value of alpha、beta 89 | self.alpha = np.mean(anomalies_score) if self.alpha == 'auto' else self.alpha 90 | percent = 45 91 | self.beta = median_score if median_score < self.alpha else np.percentile(weighted_score, percent) 92 | while self.beta >= self.alpha: 93 | percent -= 5 94 | self.beta = np.percentile(weighted_score, percent) 95 | assert self.beta < self.alpha, 'beta should be smaller than alpha.' 96 | 97 | # rlb:reliabel, ptt:potential 98 | rlb_bool, ptt_bool = unlabel_scores<=self.beta, unlabel_scores>=self.alpha 99 | rlb_normal, ptt_anomalies = self.unlabel[rlb_bool], self.unlabel[ptt_bool] 100 | rlb_normal_score, ptt_anomalies_score = unlabel_scores[rlb_bool], unlabel_scores[ptt_bool] 101 | rlb_normal_weight = (max_score-rlb_normal_score) / (max_score-min_score) 102 | ptt_anomalies_weight = ptt_anomalies_score / max_score 103 | 104 | anomalies_weight = anomalies_label = np.ones(len(self.anomalies)) 105 | X_train = np.r_[self.anomalies, ptt_anomalies, rlb_normal] 106 | weights = np.r_[anomalies_weight, ptt_anomalies_weight, rlb_normal_weight] 107 | y_train = np.r_[anomalies_label, np.ones(len(ptt_anomalies)), np.zeros(len(rlb_normal))].astype(int) 108 | return X_train, y_train, weights 109 | 110 | def predict(self): 111 | X_train, y_train, weights = self.determine_trainset() 112 | clf = self.classifer 113 | clf.fit(X_train, y_train, sample_weight=weights) 114 | y_pred = clf.predict(self.unlabel) 115 | if self.return_proba: 116 | y_prob = clf.predict_proba(self.unlabel)[:, 1] 117 | return y_pred, y_prob 118 | else: 119 | return y_pred 120 | 121 | def __repr__(self): 122 | info_1 = \ 123 | '1) The Observed Anomalies is divided into {:} clusters, and the calinski_harabasz_score is {:.2f}.\n'.\ 124 | format(len(self.centers), self.cluster_score) 125 | 126 | y_train = self.determine_trainset()[1] 127 | rll_num = np.sum(y_train==0) 128 | ptt_num = sum(y_train)-len(self.anomalies) 129 | 130 | info_2 = "2) Reliable Normals's number = {:}, accounts for {:.2%} within the Unlabel dataset.\n".\ 131 | format(rll_num, rll_num/len(self.unlabel)) 132 | 133 | info_3 = "3) Potential Anomalies's number = {:}, accounts for {:.2%} within the Unlabel 
dataset.".\ 134 | format(ptt_num, ptt_num/len(self.unlabel)) 135 | return info_1 + info_2 + info_3 136 | -------------------------------------------------------------------------------- /SemiSupervised-ADOA/cluster_centers.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | 4 | import time 5 | import itertools 6 | import numpy as np 7 | from sklearn.cluster import KMeans, SpectralClustering, DBSCAN, Birch 8 | from sklearn.metrics import calinski_harabasz_score 9 | 10 | 11 | def timer(func): 12 | def wrapper(*args, **kwargs): 13 | start = time.time() 14 | result = func(*args,**kwargs) 15 | end = time.time() 16 | print(func.__name__+' running time:{:.2f}s'.format(end-start)) 17 | return result 18 | return wrapper 19 | 20 | @timer 21 | def get_cluster_centers(dataset, n_clusters='auto', cluster_algo='kmeans', params_grid='auto', random_state=2018): 22 | if n_clusters == 'auto': 23 | clusters_range = range(2, 8) 24 | else: 25 | assertion = 'n_clusters should be an integer greater than or equal to 2' 26 | assert isinstance(n_clusters, int) and n_clusters>=2, assertion 27 | clusters_range = [n_clusters] 28 | 29 | # get_centers函数可根据聚类算法对无标签数据集的预测结果(cluster_pred)返回各聚类簇的中心 30 | def get_centers(cluster_label, data=dataset): 31 | centers = [] 32 | for label in np.unique(cluster_label): 33 | subset = data[cluster_label==label] 34 | center = np.mean(subset, axis=0) 35 | centers.append(center) 36 | return np.sort(centers, axis=0) 37 | 38 | # 通过生成参数的笛卡尔积,寻求谱聚类算法的最优参数 39 | if cluster_algo == 'spectral': 40 | if params_grid == 'auto': 41 | params_grid = {'n_clusters':clusters_range, 'gamma':np.linspace(0.5, 1.5, 3)} 42 | params, score, y_pred_set = [], [], [] 43 | for i, j in itertools.product(params_grid['n_clusters'], params_grid['gamma']): 44 | params.append((i, j)) 45 | spectral = SpectralClustering(n_clusters=i, gamma=j, n_jobs=-1, random_state=random_state) 46 | y_pred_spectral = spectral.fit_predict(dataset) 47 | y_pred_set.append(y_pred_spectral) 48 | ch_score = calinski_harabasz_score(dataset, y_pred_spectral) 49 | score.append(ch_score) 50 | # 获取calinski_harabasz_score取最大值时对应的参数与预测聚类类标 51 | best_param = params[np.argmax(score)] 52 | y_pred = y_pred_set[np.argmax(score)] 53 | return get_centers(y_pred), np.max(score) 54 | 55 | # 通过生成参数的笛卡尔积,寻求Birch聚类算法的最优参数 56 | elif cluster_algo == 'birch': 57 | if params_grid == 'auto': 58 | params_grid = {'n_clusters':clusters_range, 'branching_factor':range(2, 10), 59 | 'threshold':np.linspace(0, 0.8, num=10)} 60 | 61 | params, score, y_pred_set = [], [], [] 62 | for i, j, k in itertools.product(params_grid['n_clusters'], params_grid['branching_factor'], params_grid['threshold']): 63 | params.append((i, j, k)) 64 | birch = Birch(n_clusters=i, branching_factor=j, threshold=k) 65 | y_pred_birch = birch.fit_predict(dataset) 66 | y_pred_set.append(y_pred_birch) 67 | ch_score = calinski_harabasz_score(dataset, y_pred_birch) 68 | score.append(ch_score) 69 | best_param = params[np.argmax(score)] 70 | y_pred = y_pred_set[np.argmax(score)] 71 | return get_centers(y_pred), np.max(score) 72 | 73 | 74 | # 通过生成参数的笛卡尔积,寻求DBSCAN聚类算法的最优参数 75 | elif cluster_algo == 'dbscan': 76 | if params_grid == 'auto': 77 | params_grid = {'eps':np.linspace(0.1, 10, num=50), 'min_samples':range(1, 10)} 78 | 79 | params, unlabeled_set, y_pred_set, score = [], [], [], [] 80 | for i, j in itertools.product(params_grid['eps'], params_grid['min_samples']): 81 | dbscan = DBSCAN(eps=i, min_samples=j, 
n_jobs=-1) 82 | y_pred_dbscan = dbscan.fit_predict(dataset) 83 | 84 | # DBSCAN视预测结果为-1的样本为噪声,因此需要将“噪音样本”予以排除 85 | y_pred_new = y_pred_dbscan[y_pred_dbscan != -1] 86 | dataset_new = dataset[y_pred_dbscan != -1] 87 | 88 | # 计算剔除“噪音样本”后无标签样本的剩余比例以及聚类簇的数目 89 | ratio = dataset_new.shape[0] / dataset.shape[0] 90 | n_clusters = len(np.unique(y_pred_new)) 91 | # 剩余样本的聚类簇数以及剩余比例满足一定要求,才能对参数及预测结果予以保留 92 | if n_clusters in range(2, 8) and ratio>=0.8: 93 | params.append((i, j)) 94 | unlabeled_set.append(dataset_new) 95 | y_pred_set.append(y_pred_new) 96 | ch_score = calinski_harabasz_score(dataset_new, y_pred_new) 97 | score.append(ch_score) 98 | if len(score) > 0: 99 | best_param = params[np.argmax(score)] 100 | unlabeled_final = unlabeled_set[np.argmax(score)] 101 | y_pred = y_pred_set[np.argmax(score)] 102 | return get_centers(y_pred, data=unlabeled_final), np.max(score) 103 | else: 104 | descript = 'It is difficult for dbscan to determine the number of clusters in this dataset. \ 105 | Please switch to another clustering algorithm.' 106 | print(descript) 107 | 108 | # 寻求Kmeans聚类算法的最优参数 109 | else: 110 | if params_grid == 'auto': params_grid = {'n_clusters': clusters_range} 111 | params, score, y_pred_set = [], [], [] 112 | for i in params_grid['n_clusters']: 113 | params.append(i) 114 | kmeans = KMeans(n_clusters=i, random_state=2018) 115 | y_pred_kms = kmeans.fit_predict(dataset) 116 | y_pred_set.append(y_pred_kms) 117 | ch_score = calinski_harabasz_score(dataset, y_pred_kms) 118 | score.append(ch_score) 119 | best_param = params[np.argmax(score)] 120 | y_pred = y_pred_set[np.argmax(score)] 121 | return get_centers(y_pred), np.max(score) 122 | -------------------------------------------------------------------------------- /SemiSupervised-KADOA-Original/ReadMe.md: -------------------------------------------------------------------------------- 1 | - **Author:** 马肖 2 | - **E-Mail:** maxiaoscut@aliyun.com 3 | - **GitHub:** https://github.com/Albertsr 4 | 5 | 6 | #### 1. 思路简介 7 | - ADOA采用孤立森林与聚类相结合,KADOA运用KernelPCA重构误差替代孤立森林进行异常检测,其它思路与ADOA一致 8 | 9 | #### 2. KADOA代码 10 | - [KADOA.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-KADOA-Original/KADOA.py) 11 | 12 | #### 3. 
KADOA与ADOA的性能对比 13 | - **对比代码:** [adoa_kadoa_contrast.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-KADOA-Original/adoa_kadoa_contrast.py) 14 | 15 | - **对比结果:** 在数据集、参数设置完全一致的情况下,KADOA的性能显著优于ADOA,但此结论有待更多数据集予以验证 16 | 17 |  18 | -------------------------------------------------------------------------------- /SemiSupervised-KADOA-Original/adoa_series_contrast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-KADOA-Original/adoa_series_contrast.png -------------------------------------------------------------------------------- /SemiSupervised-KADOA-Original/compare_adoa_kadoa.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from adoa import ADOA 8 | from kadoa import KADOA 9 | from collections import Counter 10 | from lightgbm import LGBMClassifier 11 | np.set_printoptions(precision=3, suppress=True) 12 | pd.set_option('precision', 3) 13 | 14 | from sklearn.metrics import * 15 | def evaluate_model(y_true, y_pred, y_prob, index='model'): 16 | assert len(y_true) == len(y_pred) 17 | assert len(y_true) == len(y_prob) 18 | 19 | acc = accuracy_score(y_true, y_pred) 20 | f1 = f1_score(y_true, y_pred) 21 | auc = roc_auc_score(y_true, y_prob) 22 | recall = recall_score(y_true, y_pred) 23 | precision = precision_score(y_true, y_pred) 24 | gmean = np.sqrt(recall * precision) 25 | eval_frame = pd.DataFrame({'AUC':auc, 'F1':f1, 'G-Mean':gmean, 'ACC':acc, 26 | 'Recall':recall, 'Precision':precision}, index=[index]) 27 | return eval_frame 28 | 29 | # 对高分值予以标黄,仅对Jupyter有效 30 | def highlight_bg_max(s): 31 | is_max = s == s.max() # is_max是一个布尔型变量构成的矩阵 32 | bg_op = 'background-color: yellow' 33 | bg = [bg_op if v else '' for v in is_max] 34 | return bg 35 | 36 | def generate_pudata(seed, anomaly_size=0.25): 37 | rdg = np.random.RandomState(seed) 38 | # row, col分别为数据集的行数与列数 39 | row = rdg.randint(6000, 8000) 40 | col = rdg.randint(10, 15) 41 | 42 | # anomaly_num、unlabel_num分别为P集、U集包含的样本数 43 | anomaly_num = int(row * anomaly_size) 44 | unlabel_num = row - anomaly_num 45 | 46 | # contamination为U集中异常样本的占比 47 | # pos_u_num为U集中包含的正样本数 48 | contamination = rdg.uniform(0.1, 0.2) 49 | anomaly_unlabel_num = int(unlabel_num * contamination) 50 | 51 | # 异常样本同时分布于Unlabel set、Label set 52 | # 假设所有的异常样本分为3个簇,均不服从正态分布 53 | anomaly_total_num = anomaly_num + anomaly_unlabel_num 54 | anomaly_sub_num = anomaly_total_num // 3 55 | 56 | anomalies_1 = rdg.uniform(-1.8, 1.8, size=(anomaly_sub_num, col)) 57 | anomalies_2 = rdg.uniform(0, 1, size=(anomaly_sub_num, col)) 58 | anomalies_3 = rdg.exponential(1.5, size=(anomaly_sub_num, col)) 59 | anomalies_ = np.r_[anomalies_1, anomalies_2, anomalies_3] 60 | 61 | # anomalies_U是U集的子集 62 | rd_indices = rdg.permutation(len(anomalies_)) 63 | anomalies_U = anomalies_[rd_indices[:anomaly_unlabel_num]] 64 | 65 | # 生成最终的P集,由观测到的anomalies构成 66 | P = anomalies_[rd_indices[anomaly_num:]] 67 | 68 | # 生成最终的无标签样本集,其中包含contamination比例的正样本 69 | # 假设正常样本服从标准正态分布 70 | U_neg = rdg.normal(loc=0, scale=1, size=(unlabel_num-anomaly_unlabel_num, col)) 71 | U = np.r_[U_neg, anomalies_U] 72 | U_label = np.r_[np.zeros(len(U_neg)), np.ones(len(anomalies_U))].astype(int) 73 | return P, U, U_label 74 | 75 | seed = 2020 76 | P, U, U_label = generate_pudata(seed) 77 | P.shape, U.shape, 
Counter(U_label) 78 | 79 | clf = LGBMClassifier(num_leaves=64, n_estimators=100) 80 | a = KADOA(P, U, clf, kernel='linear', return_proba=True, verbose=2) 81 | b = ADOA(P, U, clf, return_proba=True) 82 | 83 | a_pred, a_prob = a.predict() 84 | b_pred, b_prob = b.predict() 85 | metrics_kadoa = evaluate_model(U_label, a_pred, a_prob, index='KADOA') 86 | metrics_adoa = evaluate_model(U_label, b_pred, b_prob, index='ADOA') 87 | metrics_contrast = pd.concat([metrics_adoa, metrics_kadoa], axis=0).round(3) 88 | print(metrics_contrast) 89 | -------------------------------------------------------------------------------- /SemiSupervised-KADOA-Original/kadoa.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | 4 | import numpy as np 5 | from recon_error_kpca import KPCA_Recon_Error 6 | from cluster_centers import get_cluster_centers 7 | from sklearn.preprocessing import StandardScaler, minmax_scale 8 | 9 | class KADOA: 10 | """Implementation of ADOA (Anomaly Detection with Partially Observed Anomalies)""" 11 | def __init__(self, anomalies, unlabel, classifer, cluster_algo='kmeans', n_clusters='auto', 12 | kernel='rbf', verbose=3, contamination=0.01, theta=0.85, alpha='auto', beta='auto', 13 | return_proba=False, random_state=2018): 14 | ''' 15 | :param anomalies: Observed anomaly datasets. 16 | 17 | :param unlabel: Unlabeled datasets. 18 | 19 | :param classifer: A Classifer fitting weighted samples and labels to predict unlabel samples. 20 | 21 | :param cluster_algo: str, {'kmeans'、'spectral'、'birch'、'dbscan'}, default = 'kmeans' 22 | Clustering algorithm for clustering anomaly samples. 23 | 24 | :param n_clusters: int, default=5 25 | The number of clusters to form as well as the number of centroids to generate. 26 | 27 | :param kernel: str, default='rbf'. 28 | 'linear' | 'poly' | 'rbf' | 'sigmoid' | 'cosine' | 'precomputed' kernel. 29 | 30 | :param verbose: int, default=3, Verbosity mode. the higher, the less messages. 31 | KernelPCA is time-consuming, and the verbose parameter helps to check the progress 32 | of the reconstruction. If verbose = m, information is printed every m rounds. 33 | 34 | :param contamination : float, range (0, 0.5). The proportion of outliers in the data set. 35 | 36 | :param theta : float, range [0, 1]. 37 | The weights of isolation_score and similarity_score are theta and 1-theta respectively. 38 | 39 | :param alpha : float, should be positive number, default = mean value of anomalies's score 40 | Threshold value for determining unlabel sample as potential anomaly 41 | 42 | :param beta : float, should be positive number 43 | Threshold value for determining unlabel sample as reliable normal sample 44 | 45 | :param return_proba : bool, default=False 46 | Whether return the predicted probability for positive(anomaly) class for each sample. 47 | Need classifer to provide predict_proba method. 
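:param random_state : int, default=2018
    Random seed forwarded to the KernelPCA reconstruction (KPCA_Recon_Error) for reproducible results.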
48 | ''' 49 | self.dataset = StandardScaler().fit_transform(np.r_[anomalies, unlabel]) 50 | self.anomalies = self.dataset[:len(anomalies), :] 51 | self.unlabel = self.dataset[len(anomalies):, :] 52 | self.contamination = contamination 53 | self.verbose = verbose 54 | self.kernel = kernel 55 | self.classifer = classifer 56 | self.n_clusters = n_clusters 57 | self.cluster_algo = cluster_algo 58 | self.theta = theta 59 | self.alpha = alpha 60 | self.beta = beta 61 | self.return_proba = return_proba 62 | self.random_state = random_state 63 | self.centers, self.cluster_score = get_cluster_centers(self.anomalies, self.n_clusters, self.cluster_algo) 64 | 65 | def cal_weighted_score(self): 66 | dataset = np.r_[self.anomalies, self.unlabel] 67 | 68 | # Paper:The higher is the score IS(x) (close to 1), the more likely that x being an anomaly. 69 | kre = KPCA_Recon_Error(matrix=self.dataset, contamination=self.contamination, verbose=self.verbose, 70 | kernel=self.kernel, random_state=self.random_state) 71 | anomaly_score = kre.get_anomaly_score() 72 | isolation_score_scaled = minmax_scale(anomaly_score) 73 | 74 | def cal_similarity_score(arr, centers=self.centers): 75 | min_dist = np.min([np.square(arr - center).sum() for center in centers]) 76 | similarity_score = np.exp(-min_dist/len(arr)) 77 | ''' 78 | In the paper, when calculating similarity_score, min_dist is not divided by the number of features 79 | (len(arr)), but when the number of features is large, the value of np.exp(min_dist) is very large, 80 | so that similarity_score is close to 0, which lacks weighted meaning. Dividing by the number of 81 | features helps to alleviate this phenomenon and does not affect the ordering of similarity_score. 82 | ''' 83 | return similarity_score 84 | similarity_score = [cal_similarity_score(arr) for arr in dataset] 85 | similarity_score_scaled = minmax_scale(similarity_score) 86 | weighted_score = self.theta * isolation_score_scaled + (1-self.theta) * similarity_score_scaled 87 | return weighted_score 88 | 89 | def determine_trainset(self): 90 | weighted_score = self.cal_weighted_score() 91 | min_score, max_score, median_score = [func(weighted_score) for func in (np.min, np.max, np.median)] 92 | anomalies_score = weighted_score[:len(self.anomalies)] 93 | unlabel_scores = weighted_score[len(self.anomalies):] 94 | 95 | # determine the value of alpha、beta 96 | self.alpha = np.mean(anomalies_score) if self.alpha == 'auto' else self.alpha 97 | percent = 45 98 | self.beta = median_score if median_score < self.alpha else np.percentile(weighted_score, percent) 99 | while self.beta >= self.alpha: 100 | percent -= 5 101 | self.beta = np.percentile(weighted_score, percent) 102 | assert self.beta < self.alpha, 'beta should be smaller than alpha.' 
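# Unlabeled samples scoring at most beta are kept as reliable normals and those scoring at
# least alpha as potential anomalies; the weights computed below grow as reliable normals
# score lower and as potential anomalies score higher.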
103 | 104 | # rlb:reliabel, ptt:potential 105 | rlb_bool, ptt_bool = unlabel_scores<=self.beta, unlabel_scores>=self.alpha 106 | rlb_normal, ptt_anomalies = self.unlabel[rlb_bool], self.unlabel[ptt_bool] 107 | rlb_normal_score, ptt_anomalies_score = unlabel_scores[rlb_bool], unlabel_scores[ptt_bool] 108 | rlb_normal_weight = (max_score-rlb_normal_score) / (max_score-min_score) 109 | ptt_anomalies_weight = ptt_anomalies_score / max_score 110 | 111 | anomalies_weight = anomalies_label = np.ones(len(self.anomalies)) 112 | X_train = np.r_[self.anomalies, ptt_anomalies, rlb_normal] 113 | weights = np.r_[anomalies_weight, ptt_anomalies_weight, rlb_normal_weight] 114 | y_train = np.r_[anomalies_label, np.ones(len(ptt_anomalies)), np.zeros(len(rlb_normal))].astype(int) 115 | return X_train, y_train, weights 116 | 117 | def predict(self): 118 | X_train, y_train, weights = self.determine_trainset() 119 | clf = self.classifer 120 | clf.fit(X_train, y_train, sample_weight=weights) 121 | y_pred = clf.predict(self.unlabel) 122 | if self.return_proba: 123 | y_prob = clf.predict_proba(self.unlabel)[:, 1] 124 | return y_pred, y_prob 125 | else: 126 | return y_pred 127 | 128 | def __repr__(self): 129 | info_1 = '1) The Observed Anomalies is divided into {:} clusters, and the calinski_harabasz_score is {:.2f}.\n'.\ 130 | format(len(self.centers), self.cluster_score) 131 | 132 | y_train = self.determine_trainset()[1] 133 | rll_num = np.sum(y_train==0) 134 | ptt_num = sum(y_train)-len(self.anomalies) 135 | 136 | info_2 = "2) Reliable Normals's number = {:}, accounts for {:.2%} within the Unlabel dataset.\n".\ 137 | format(rll_num, rll_num/len(self.unlabel)) 138 | 139 | info_3 = "3) Potential Anomalies's number = {:}, accounts for {:.2%} within the Unlabel dataset.".\ 140 | format(ptt_num, ptt_num/len(self.unlabel)) 141 | return info_1 + info_2 + info_3 142 | -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Papers/A Survey on Postive and Unlabelled Learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Papers/A Survey on Postive and Unlabelled Learning.pdf -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Papers/Building Text Classifiers Using Positive and Unlabeled Examples.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Papers/Building Text Classifiers Using Positive and Unlabeled Examples.pdf -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Papers/Learning From Positive and Unlabeled Data:A Survey.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Papers/Learning From Positive and Unlabeled Data:A Survey.pdf -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Papers/Learning from Positive and Unlabeled Examples with Different Data Distributions_2005_A_EM.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Papers/Learning from Positive and Unlabeled Examples with Different Data Distributions_2005_A_EM.pdf -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Papers/Learning with Positive and Unlabeled Examples Using Weighted Logistic Regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Papers/Learning with Positive and Unlabeled Examples Using Weighted Logistic Regression.pdf -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Papers/POSTER_ A PU Learning based System for Potential Malicious URL Detection.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Papers/POSTER_ A PU Learning based System for Potential Malicious URL Detection.pdf -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Papers/Partially Supervised Classification of Text Documents.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Papers/Partially Supervised Classification of Text Documents.pdf -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Pics/BiasedSVM.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Pics/BiasedSVM.jpg -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Pics/Incorporation of the Class Prior.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Pics/Incorporation of the Class Prior.jpg -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Pics/Spy technique.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Pics/Spy technique.jpg -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Pics/minC.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Pics/minC.jpg -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Pics/optimal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Pics/optimal.jpg 
-------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Pics/param.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Pics/param.jpg -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Pics/post_prob.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Pics/post_prob.jpg -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Pics/relation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Pics/relation.jpg -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Pics/sample_ratio_rf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Pics/sample_ratio_rf.png -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Pics/three cate.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Pics/three cate.jpg -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/Pics/three_cate.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/SemiSupervised-PU Learning/Pics/three_cate.jpg -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/ReadMe.md: -------------------------------------------------------------------------------- 1 | ## 第一部分:PU Learning概述 2 | ### 1. PU Learning的定义 3 | - P:Positive,表示正样本集 4 | - U:Unlabeled,表示无标签样本集 5 | - 即存在正样本集、无标签样本集,不存在负样本集情况下的分类模型问题 6 | 7 | --- 8 | 9 | ### 2. PU Learning的三大处理方法 10 | 11 | - **方法一:** Two Step Strategy 12 | - **方法二:** Class Prior Incorporation 13 | - **方法三:** Biased Learning 14 | 15 | - 论文出处:[Learning From Positive and Unlabeled Data:A Survey](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-PU%20Learning/Papers/Learning%20From%20Positive%20and%20Unlabeled%20Data%EF%BC%9AA%20Survey.pdf) 16 | 17 |  18 | 19 | 20 | --- 21 | 22 | ## 第二部分:PU Learning处理方法详述 23 | 24 | ### 1. 方法一:Two Step Strategy 25 | 26 | #### 1.1 核心思想 27 | - **Step 1:** 从无标签数据集U中选出可靠的负样本集RN(Reliable Negatives) 28 | - **Step 2:** 将P、RN作为训练集,训练一个分类器,然后对U集进行预测 29 | 30 | #### 1.2 第一阶段(Step1) 31 | 32 | - **目的:** 筛选可靠负样本集**RN(Reliable Negatives)** 33 | 34 | - **常用的算法:** 35 | - **Spy technique** 36 | - **The 1-DNF Technique:** 通过对比P和U,从P中抽取一些在正类样本中高频出现的特征,U中没有或只有极少量高频特征的样本可视为可靠负样本 37 | - **Rocchio:** 主要适用于文本分类算法 38 | - **NB Classifer:** 若P(1|x)
cost_fp > 0, '对FN应赋予更高的代价' 10 | 11 | X_train = np.r_[P, U] 12 | y_train = np.r_[np.ones(len(P)), np.zeros(len(U))] 13 | weight = [cost_fn if i else cost_fp for i in y_train] 14 | svm.fit(X_train, y_train, sample_weight=weight) 15 | y_pred = svm.predict(U) 16 | if return_proba: 17 | y_prob = svm.predict_proba(U)[:, -1] 18 | return y_pred, y_prob 19 | else: 20 | return y_pred -------------------------------------------------------------------------------- /SemiSupervised-PU Learning/pu_learning.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.coms 3 | # Paper: https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/ccs17poster.pdf 4 | # 论文解析:http://note.youdao.com/noteshare?id=594e87d6ac9e03d0bb461eb5160ffd7b&sub=781142FA6A634B44B3837CADCCCF74AC 5 | 6 | import numpy as np 7 | from lightgbm import LGBMClassifier 8 | from sklearn.linear_model import LogisticRegression 9 | 10 | class PULearning: 11 | """Implementation of PULearning (two-step strategy & cost-sensitive strategy)""" 12 | def __init__(self, P, U, clf_one, clf_two, Cplus, Cminus=1, sample_ratio=0.15, 13 | theta='auto', random_state=2018): 14 | """ 15 | :param P: Observed positive samples. 16 | :param U: Unlabeled datasets. 17 | :param clf_one: A Classifer used to determine reliable negative samples must be able to predict probability. 18 | :param clf_two: A Classifer fit positive samples and reliable negative samples, and predict the unlabeled. 19 | :param Cplus: the cost of not identifying positive samples, cost(FN) 20 | :param Cminus: the cost of not identifying negative samples, cost(FP) 21 | :param sample_ratio: the proportion of spy samples 22 | :param theta: the probability threshold of judging an unlabeled sample as a reliable negative sample 23 | """ 24 | self.P = P 25 | self.U = U 26 | assert clf_one.predict_proba, 'need predict_proba method to return probability estimates' 27 | self.clf_one = clf_one 28 | self.clf_two = clf_two 29 | self.Cplus = Cplus 30 | self.Cminus = Cminus 31 | self.theta = theta 32 | self.sample_ratio = 0.15 if sample_ratio=='auto' else sample_ratio 33 | self.random_state = random_state 34 | 35 | 36 | # Two-Stage Strategy: Select Reliable Negative Instances 37 | def select_reliable_negative(self): 38 | pos_num = len(self.P) 39 | spy_num = int(pos_num * self.sample_ratio) 40 | pos_random_indices = np.random.RandomState(self.random_state).permutation(pos_num) 41 | spy_indices, unspy_indices = pos_random_indices[:spy_num], pos_random_indices[spy_num:] 42 | spy_set, unspy_set = self.P[spy_indices, :], self.P[unspy_indices, :] 43 | 44 | negative_set = np.r_[self.U, spy_set] 45 | positive_set = unspy_set 46 | negative_label = np.zeros(len(negative_set)).astype(int) 47 | positive_label = np.ones(len(positive_set)).astype(int) 48 | 49 | X_train_one = np.r_[negative_set, positive_set] 50 | y_train_one = np.r_[negative_label, positive_label].astype(int) 51 | clf_one = self.clf_one.fit(X_train_one, y_train_one) 52 | 53 | y_prob_U = clf_one.predict_proba(self.U)[:, 1] 54 | y_prob_spy = clf_one.predict_proba(spy_set)[:, 1] 55 | 56 | theta = np.min(y_prob_spy) if self.theta == 'auto' else self.theta 57 | assertion = 'theta must not be greater than the minimum value of spy_prob so that \ 58 | all spy are predicted to be positive samples' 59 | assert theta <= np.min(y_prob_spy), assertion 60 | 61 | # rn: reliable_negative 62 | rn = self.U[y_prob_U <= theta, :] 63 | return rn 64 | 65 | def predict(self): 66 | # 对可靠负样本集的赋予标签0 67 | rn = 
self.select_reliable_negative()
68 |         X_train_two = np.r_[self.P, rn]
69 |         y_train_two = np.r_[np.ones(len(self.P)), np.zeros(len(rn))].astype(int)
70 |         weights = np.array([self.Cplus if i else self.Cminus for i in y_train_two])
71 | 
72 |         clf_two = self.clf_two
73 |         clf_two.fit(X_train_two, y_train_two, sample_weight=weights)
74 |         y_pred = clf_two.predict(self.U)
75 | 
76 |         # return probabilities only if the second-stage classifier actually provides predict_proba
77 |         if hasattr(clf_two, 'predict_proba'):
78 |             y_prob = clf_two.predict_proba(self.U)[:, -1]
79 |             return y_pred, y_prob
80 |         return y_pred
81 | 
--------------------------------------------------------------------------------
/SemiSupervised-PU Learning/weighted_lr.py:
--------------------------------------------------------------------------------
1 | # Author:马肖
2 | # E-mail:maxiaoscut@aliyun.com
3 | # Github:https://github.com/Albertsr
4 | 
5 | import numpy as np
6 | from sklearn.linear_model import LogisticRegression
7 | 
8 | lr = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=1200, random_state=2019, n_jobs=-1)
9 | 
10 | def weighted_lr(P, U, lr=lr, return_proba=True):
11 |     X_train = np.r_[P, U]
12 |     y_train = np.r_[np.ones(len(P)), np.zeros(len(U))]
13 | 
14 |     # positive samples are weighted by the share of unlabeled samples, and vice versa
15 |     pos_weight = len(U) / len(X_train)
16 |     neg_weight = 1 - pos_weight
17 |     assert pos_weight > neg_weight > 0, 'Normally, the U set should contain more samples than the P set'
18 | 
19 |     weight = [pos_weight if i else neg_weight for i in y_train]
20 |     lr.fit(X_train, y_train, sample_weight=weight)
21 |     y_pred = lr.predict(U)
22 | 
23 |     if return_proba:
24 |         y_prob = lr.predict_proba(U)[:, -1]
25 |         return y_pred, y_prob
26 |     else:
27 |         return y_pred
--------------------------------------------------------------------------------
/UnSupervised-Based on PCA/Papers/A Novel Anomaly Detection Scheme Based on Principal Component Classifier.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Based on PCA/Papers/A Novel Anomaly Detection Scheme Based on Principal Component Classifier.pdf
--------------------------------------------------------------------------------
/UnSupervised-Based on PCA/Papers/AI2 _ Training a big data machine to defend.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Based on PCA/Papers/AI2 _ Training a big data machine to defend.pdf
--------------------------------------------------------------------------------
/UnSupervised-Based on PCA/Pics/classify_outlier.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Based on PCA/Pics/classify_outlier.jpg
--------------------------------------------------------------------------------
/UnSupervised-Based on PCA/Pics/indices_max_decrease.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Based on PCA/Pics/indices_max_decrease.jpg
--------------------------------------------------------------------------------
/UnSupervised-Based on PCA/Pics/last_pp.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Based on PCA/Pics/last_pp.jpg
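# A minimal usage sketch for the two PU-learning helpers above (pu_learning.py and weighted_lr.py).
# The synthetic data, the classifier choices and the cost values below are illustrative assumptions
# only, not part of the original scripts.
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

from pu_learning import PULearning
from weighted_lr import weighted_lr

rng = np.random.RandomState(0)
P = rng.normal(loc=2.0, size=(200, 10))                  # observed positives
U = np.r_[rng.normal(size=(2000, 10)),                   # unlabeled, mostly negative ...
          rng.normal(loc=2.0, size=(50, 10))]            # ... with a few hidden positives

# Two-step strategy: clf_one (must provide predict_proba) finds reliable negatives via the spy trick,
# clf_two is then fitted on P plus the reliable negatives, with Cplus/Cminus as sample weights.
pul = PULearning(P, U, clf_one=LogisticRegression(max_iter=1000),
                 clf_two=LGBMClassifier(n_estimators=100), Cplus=5, Cminus=1)
pul_pred, pul_prob = pul.predict()

# Cost-sensitive baseline: a single weighted logistic regression on P and U.
lr_pred, lr_prob = weighted_lr(P, U)
print('PULearning flags {} samples, weighted LR flags {}'.format(int(pul_pred.sum()), int(lr_pred.sum())))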
-------------------------------------------------------------------------------- /UnSupervised-Based on PCA/Pics/major_minor.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Based on PCA/Pics/major_minor.jpg -------------------------------------------------------------------------------- /UnSupervised-Based on PCA/Pics/max_ev_decrease.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Based on PCA/Pics/max_ev_decrease.jpg -------------------------------------------------------------------------------- /UnSupervised-Based on PCA/Pics/outliers_high_error.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Based on PCA/Pics/outliers_high_error.jpg -------------------------------------------------------------------------------- /UnSupervised-Based on PCA/Pics/outlierscore.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Based on PCA/Pics/outlierscore.jpg -------------------------------------------------------------------------------- /UnSupervised-Based on PCA/Pics/recon_matrix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Based on PCA/Pics/recon_matrix.jpg -------------------------------------------------------------------------------- /UnSupervised-Based on PCA/ReadMe.md: -------------------------------------------------------------------------------- 1 | - **Author:** 马肖 2 | - **E-Mail:** maxiaoscut@aliyun.com 3 | - **GitHub:** https://github.com/Albertsr 4 | 5 | --- 6 | 7 | ## Chapter 1:基于样本的重构误差 8 | 9 | #### 1.1 论文与代码实现 10 | 11 | - **算法论文:** [AI^2 : Training a big data machine to defend [Veeramachaneni et.al, 2016]](https://people.csail.mit.edu/kalyan/AI2/) 12 | 13 | 14 | - **算法实现** 15 | - **基于KernelPCA重构误差的异常检测:** [recon_error_kpca.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/recon_error_kpca.py) 16 | - **基于LinearPCA重构误差的异常检测:** [recon_error_pca.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/recon_error_pca.py) 17 | - **纯Numpy版本:** [recon_error_pca_svd.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/recon_error_pca_svd.py) 18 | - 不调用scikit-learn,只调用numpy,通过SVD实现PCA,再进行异常检测 19 | - 返回结果与recon_error_pca.py完全一致 20 | 21 | #### 1.2 思路解析 22 | - **靠前的主成分主要解释了大部分正常样本的方差,而靠后的主成分主要解释了异常样本的方差** 23 | - 靠前的主成分是指对应于更大特征值的特征向量,靠后的主成分是指对应于更小特征值的特征向量 24 | - 上述特征值、特征向量可根据协方差矩阵的特征分解求得 25 | 26 |  27 | 28 | - **异常样本在靠前主成分上的投影较小,在靠后主成分上投影较大;只依靠排在前面的主成分是无法完整地重构异常样本的** 29 | - 只有少量排在前面的主成分被用于矩阵重构时,异常样本引起的重构误差是要远高于正常样本的 30 | - 重构误差越高的样本越有可能是异常样本 31 | 32 |  33 | 34 | - **样本在靠后主成分上的偏差应赋予更高的权重** 35 | - 令k为重构矩阵所用到的主成分数量,则随着k的逐步增加,更多靠后的主成分被用于矩阵重构 36 | - 这些靠后的主成分对异常样本具有更高的线性表出能力,因此样本在这些靠后的主成分上的偏差应赋予更高的权重 37 | 38 | #### 1.3 重构矩阵的生成方式 39 | - **重构矩阵** 40 | 41 |  42 | 43 | - **参数含义** 44 | - R为m * n型重构矩阵,与原样本矩阵X的shape保持一致 45 | - k为重构矩阵过程中用到的主成分个数 
46 | - Q为投影矩阵,其k个列向量为前k个主成分(按特征值降序排列) 47 | 48 | #### 1.4 重构误差与异常分数 49 | - **异常得分** 50 | 51 |  52 | - k表示重构矩阵所用到的主成分数,n表示主成分总数 53 | - ev(k)表示前k个主成分多大程度上解释了总体方差,与k值成正比 54 | 55 | - **越靠后的主成分其对应的重构误差的权重也越大** 56 | - 重构矩阵所用到的主成分越多(k值越大),样本在靠后的主成分上的误差对应的权重ev(k)也越大 57 | - 靠后主成分对异常样本具有更强的表达能力,从而对应的误差应赋予更高的权重 58 | 59 | --- 60 | 61 | ## Chapter 2:基于样本在major/minor主成分上的偏离程度 62 | #### 2.1 论文与代码实现 63 | - **论文地址:** [A Novel Anomaly Detection Scheme Based on Principal Component Classifier](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/Papers/A%20Novel%20Anomaly%20Detection%20Scheme%20Based%20on%20Principal%20Component%20Classifier.pdf) 64 | 65 | - **Python实现:** [RobustPCC.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/RobustPCC.py) 66 | 67 | #### 2.2 术语定义 68 | - **major principal components** 69 | - 将特征值降序排列后,**累计特征值之和约占50%** 的前几个特征值对应的特征向量 70 | - **在major principal components 上偏差较大的样本,对应于在原始特征上取极值的异常样本** 71 | - the observations that are outliers with respect to major principal components usually correspond to outliers on one or more 72 | of the original variables. 73 | 74 | - **minor principal components** 75 | - 指**特征值小于0.2**对应的特征向量 76 | - 在minor principal components上偏差较大的样本,对应于那些**与正常样本相关性结构(the correlation structure)不一致的异常样本** 77 | - minor principal components are sensitive to **the observations that are inconsistent with the correlation structure of the data but 78 | are not outliers with respect to the original variables** 79 | 80 | - **样本在单个主成分上的偏差** 81 | - 样本在此特征向量上的**偏离程度定义为样本在此特征向量上投影的平方与特征值之商** 82 | - 其中除以特征值是为了起到归一化的作用,使得样本在不同特征向量上的偏差具有可比性 83 | 84 | - **样本在所有方向上的偏差之和等价于它与样本中心之间的马氏距离** 85 | 86 |  87 | 88 | #### 2.3 算法流程 89 | - **第一步:** 通过马氏距离筛选一定比例的极值样本从训练集中剔除,以获得鲁棒性更高的主成分及对应的特征值 90 | - **First, we use the Mahalanobis metric to identify the 100*gamma% extreme observations that are to be trimmed** 91 | - 令剩余样本构成的矩阵为remain_matrix 92 | 93 | - **第二步:** 对remain_matrix进行主成分分析,得到principal components及对应的特征值 94 | - **第三步:** 根据特征值的取值以及相关定义,确定major principal components与minor principal components 95 | - **第四步:** 求remain_matrix中所有样本在major principal components与minor principal components上的偏离度 96 | - **第五步:** 根据上一步求出的两个偏离度,以及指定的分位点,求出判定样本是否异常的阈值c1与c2 97 | - **第六步:** 对于一个待检测样本,计算它在major principal components与minor principal components上的偏离度,若其中之一超出相应的阈值则判定为异常,否则为正常样本 98 | 99 |  100 | 101 | #### 2.4 进一步提升RobustPCC性能的方法 102 | - 在样本数较多的情况下,可适当提高gamma,进一步剔除训练集中较异常的样本,以提升PCC的鲁棒性 103 | - 适当提高quantile的取值,以提升将样本判定为异常的阈值,有助于降低RobustPCC的FPR 104 | 105 | --- 106 | 107 | ## Chapter 3. 
实证分析:异常样本在最前、最后的若干主成分上具有最大的方差 108 | 109 | #### 3.1 理论分析 110 | - 异常样本在最大以及最小的几个特征值对应的主成分上应具有更大的投影 111 | - 若最大以及最小的几个特征值对应的主成分构成的坐标轴不存在,则**异常样本在主成分空间(the principal components’ space)无法被完整地线性表出** 112 | - [A Novel Anomaly Detection Scheme Based on Principal Component Classifier](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/Papers/A%20Novel%20Anomaly%20Detection%20Scheme%20Based%20on%20Principal%20Component%20Classifier.pdf) [Shyu, et.al, 2003]明确提出: 113 | - 在major principal components上偏差较大的样本,对应于在原始特征上取极值的异常样本 114 | - 在minor principal components上偏差较大的样本,对应于那些与正常样本相关性结构不一致的异常样本 115 | - 论文截图 116 | 117 |  118 | 119 | #### 3.2 验证方法 120 | - 对数据集进行PCA,各主成分对应的特征值记为variance_original 121 | - 从原数据集中剔除孤立森林(或其他异常检测算法)检测出的若干异常样本,再进行PCA,对应的特征值向量记为variance_revised 122 | - 计算因剔除异常样本导致的特征值变动比例delta_ratio,其中delta_ratio = (variance_revised - variance_original) / variance_original 123 | - 找出降幅比例最大的前k(例如k=3)个特征值对应的索引indices_top_k 124 | - 若indices_top_k中包含最小或最大的索引,则可以认为异常样本在最前与最后的少数几个主成分上具有最大的方差 125 | 126 | #### 3.3 验证代码与结果 127 | - **验证代码:** [max_ev_decrease.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/max_ev_decrease.py) 128 | - **验证结果:** 经过随机生成的10个数据集的实验结果表明上述结论是正确的 129 | - **实验细节** 130 | - 每个数据集均为5000 * 20型矩阵,即包含5000个样本,20个特征 131 | - 正常样本均服从标准正态分布,异常样本由标准伽玛分布、指数分布组合构成 132 | - 验证结果由下图所示,可见**降幅最大的特征值索引中至少包含最小的索引值0或最大的索引值19** 133 | 134 |  135 | -------------------------------------------------------------------------------- /UnSupervised-Based on PCA/max_ev_decrease.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | 4 | import warnings 5 | warnings.filterwarnings("ignore") 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.decomposition import PCA 9 | from sklearn.ensemble import IsolationForest 10 | from sklearn.preprocessing import StandardScaler 11 | 12 | 13 | # compare_variance: used to observe the influence of eliminating abnormal sample on the corresponding eigenvalues of PCA. 14 | 15 | def compare_variance(X, k=3, contamination=0.01): 16 | """ 17 | :param X: {array-like, sparse matrix, dataframe} of shape (n_samples, n_features) 18 | :param k: int, the number of retained eigenvalues 19 | :param contanination: float, range (0, 0.5), The proportion of outliers in the data set. 20 | """ 21 | X = X.values if isinstance(X, pd.DataFrame) else np.array(X) 22 | X_scaled = StandardScaler().fit_transform(X) 23 | pca = PCA(n_components=None, random_state=2018) 24 | pca.fit(X_scaled) 25 | variance_original = pca.explained_variance_ 26 | 27 | # IsolationForest is used for anomaly detection, and the anomaly_indices is obtained. 28 | iforest = IsolationForest(contamination=contamination, random_state=2018, n_jobs=-1) 29 | anomaly_label = iforest.fit_predict(X_scaled) 30 | anomaly_indices = np.argwhere(anomaly_label==-1).ravel() 31 | 32 | # delete the exception sample and get the matrix X_filter 33 | X_filter = X_scaled[np.isin(range(len(X_scaled)), anomaly_indices, invert=True), :] 34 | pca.fit(X_filter) 35 | variance_filter = pca.explained_variance_ 36 | 37 | # compare the eigenvalues before and after deleting the abnormal sample. 38 | # Only negative numbers in delta_ratio are selected to ensure that the corresponding eigenvalues are reduced. 
39 | delta_ratio = (variance_filter - variance_original) / variance_original 40 | target_ratio = delta_ratio[delta_ratio < 0] 41 | 42 | # select the index with the largest decrease in eigenvalues 43 | if len(target_ratio) >= k: 44 | indices_topk = np.argsort(target_ratio)[:k] 45 | else: 46 | indices_topk = np.argsort(target_ratio)[:len(target_ratio)] 47 | 48 | # verify that any one of the maximum or minimum index appears in the indices_topk 49 | indices_min_max = [0, X.shape[1]-1] 50 | bool_result = any(np.isin(indices_min_max, indices_topk)) 51 | return indices_topk, bool_result 52 | 53 | 54 | # generate_dataset用于生成实验数据集 55 | def generate_dataset(seed, row=5000, col=20, contamination=0.01): 56 | rdg = np.random.RandomState(seed) 57 | outlier_num = int(row*contamination) 58 | inlier_num = row - outlier_num 59 | 60 | # construct a normal sample set that obeys standard normal distribution 61 | inliers = rdg.randn(inlier_num, col) 62 | 63 | # If col is odd,col_1=col//2,else col_1=int(col/2) 64 | col_1 = col//2 if np.mod(col, 2) else int(col/2) 65 | col_2 = col - col_1 66 | 67 | # outliers_sub_1 obeys standard gamma distribution 68 | # outliers_sub_2 obeys exponential distribution. 69 | outliers_sub_1 = rdg.standard_gamma(1, (outlier_num, col_1)) 70 | outliers_sub_2 = rdg.exponential(5, (outlier_num, col_2)) 71 | outliers = np.c_[outliers_sub_1, outliers_sub_2] 72 | 73 | matrix = np.r_[inliers, outliers] 74 | return matrix 75 | 76 | # generate 10 non-repetitive random seeds and corresponding data sets 77 | seeds = np.random.RandomState(2018).choice(range(100), size=10, replace=False) 78 | matrices = [generate_dataset(seed) for seed in seeds] 79 | 80 | # output verification results 81 | contrast_result = [compare_variance(matrix) for matrix in matrices] 82 | verify_result = pd.DataFrame(contrast_result, columns=['target_index', 'contain_minmax']) 83 | print(verify_result) 84 | -------------------------------------------------------------------------------- /UnSupervised-Based on PCA/recon_error_kpca.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import time 6 | import numpy as np 7 | from sklearn.decomposition import KernelPCA 8 | from sklearn.preprocessing import StandardScaler 9 | 10 | 11 | class KPCA_Recon_Error: 12 | """Implementation of anomaly detection base on KernelPCA reconstruction error.""" 13 | def __init__(self, matrix, contamination=0.01, kernel='rbf', verbose=3, gamma=None, random_state=2018): 14 | 15 | ''' 16 | Parameters 17 | -------------------------- 18 | :param matrix : dataset, shape = [n_samples, n_features] 19 | 20 | :param contamination : float, should be in the range [0, 0.5], default=0.01 21 | The amount of contamination of the data set, i.e. the proportion of outliers in the data set. 22 | Used when fitting to define the threshold on the scores of the samples. 23 | 24 | :param kernel : 'linear' | 'poly' | 'rbf' | 'sigmoid' | 'cosine' | 'precomputed', default='rbf'. 25 | 26 | :param verbose: int, default=3, Verbosity mode. the higher, the less messages. 27 | the matrix reconstruction of KernelPCA is time-consuming, this parameter helps to check 28 | the progress of the reconstruction. If verbose = m, process information is printed every m rounds. 29 | 30 | :param gamma : float, default=1/n_features 31 | Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other kernels. 
32 | ''' 33 | self.matrix = StandardScaler().fit_transform(matrix) 34 | self.contamination = contamination 35 | self.kernel = kernel 36 | self.gamma = gamma 37 | self.verbose = verbose 38 | self.random_state = random_state 39 | 40 | def get_ev_ratio(self): 41 | transformer = KernelPCA(n_components=None, kernel=self.kernel, gamma=self.gamma, 42 | fit_inverse_transform=True, random_state=self.random_state, n_jobs=-1) 43 | transformer.fit_transform(self.matrix) 44 | # ev_ratio is the cumulative proportion of eigenvalues and the weight of 45 | # reconstruction error corresponding to different number of principal components 46 | ev_ratio = np.cumsum(transformer.lambdas_) / np.sum(transformer.lambdas_) 47 | return ev_ratio 48 | 49 | def reconstruct_matrix(self): 50 | # the parameter recon_pc_num is the number of top principal components used in the reconstruction matrix. 51 | def reconstruct(recon_pc_num): 52 | transformer = KernelPCA(n_components=recon_pc_num, kernel=self.kernel, gamma=self.gamma, 53 | fit_inverse_transform=True, n_jobs=-1, random_state=self.random_state) 54 | X_transformed = transformer.fit_transform(self.matrix) 55 | recon_matrix = transformer.inverse_transform(X_transformed) 56 | assert_description = 'The shape of the reconstruction matrix should be equal to that of the initial matrix.' 57 | assert recon_matrix.shape == self.matrix.shape, assert_description 58 | return recon_matrix 59 | 60 | # generating a series of reconstruction matrices 61 | # the matrix reconstruction of KernelPCA is time-consuming, and the parameter verbose helps to check 62 | # the progress of the reconstruction, process information is printed every verbose rounds. 63 | cols_num = self.matrix.shape[1] 64 | if not self.verbose: 65 | recon_matrices = [reconstruct(i) for i in range(1, cols_num+1)] 66 | else: 67 | recon_matrices = [] 68 | time_cost = 0 69 | start = time.time() 70 | for i in range(1, cols_num+1): 71 | recon_matrices.append(reconstruct(i)) 72 | if i % int(self.verbose) == 0: 73 | running_time = time.time()-start 74 | time_cost += running_time 75 | print('{} feature(s) participate in reconstruction, running time: {:.2f}s'.format(i, running_time)) 76 | start = time.time() 77 | if i == cols_num: 78 | running_time = time.time()-start 79 | time_cost += running_time 80 | print('A total of {} matrices have been reconstructed, total time: {:.2f}s'.format(cols_num, time_cost)) 81 | 82 | # randomly select two reconstruction matrices to verify that they are different 83 | i, j = np.random.choice(range(cols_num), size=2, replace=False) 84 | description = 'The reconstruction matrices generated by different number of principal components are different.' 
85 | assert not np.all(recon_matrices[i] == recon_matrices[j]), description 86 | return recon_matrices 87 | 88 | def get_anomaly_score(self): 89 | # calculate the modulus of a vector 90 | def compute_vector_length(vector): 91 | square_sum = np.square(vector).sum() 92 | return np.sqrt(square_sum) 93 | 94 | # calculate the anomaly score generated by a single reconstruction matrix for all samples 95 | def compute_sub_score(recon_matrix, ev): 96 | delta_matrix = self.matrix - recon_matrix 97 | score = np.apply_along_axis(compute_vector_length, axis=1, arr=delta_matrix) * ev 98 | return score 99 | 100 | ev_ratio = self.get_ev_ratio() 101 | reconstruct_matrices = self.reconstruct_matrix() 102 | # summarize the anomaly scores generated by all reconstruction matrices 103 | anomaly_scores = list(map(compute_sub_score, reconstruct_matrices, ev_ratio)) 104 | return np.sum(anomaly_scores, axis=0) 105 | 106 | # returns indices with the highest anomaly score based on a specific contamination 107 | def get_anomaly_indices(self): 108 | indices_desc = np.argsort(-self.get_anomaly_score()) 109 | anomaly_num = int(np.ceil(len(self.matrix) * self.contamination)) 110 | anomaly_indices = indices_desc[:anomaly_num] 111 | return anomaly_indices 112 | 113 | # returns 1 if the prediction is an anomaly, otherwise returns 0 114 | def predict(self): 115 | anomaly_indices = self.get_anomaly_indices() 116 | pred_result = np.isin(range(len(self.matrix)), anomaly_indices).astype(int) 117 | return pred_result 118 | -------------------------------------------------------------------------------- /UnSupervised-Based on PCA/recon_error_pca.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | from sklearn.decomposition import PCA 7 | from sklearn.preprocessing import StandardScaler 8 | 9 | 10 | class PCA_Recon_Error: 11 | """Implementation of anomaly detection base on LinearPCA reconstruction error.""" 12 | def __init__(self, matrix, contamination=0.01, random_state=2018): 13 | """ 14 | Parameters 15 | -------------------------- 16 | - matrix : data set, shape = [n_samples, n_features]. 17 | - contamination : float, should be in the range [0, 0.5], default=0.005 18 | The amount of contamination of the data set, i.e. the proportion of outliers in the data set. 19 | Used when fitting to define the threshold on the scores of the samples. 20 | """ 21 | self.matrix = StandardScaler().fit_transform(matrix) 22 | self.contamination = contamination 23 | self.random_state = random_state 24 | 25 | def get_ev_ratio(self): 26 | pca = PCA(n_components=None, random_state=self.random_state) 27 | pca_result = pca.fit_transform(self.matrix) 28 | eigenvalues = pca.explained_variance_ 29 | # ev_ratio is the cumulative proportion of eigenvalues and the weight of 30 | # reconstruction error corresponding to different number of principal components 31 | ev_ratio = np.cumsum(eigenvalues) / np.sum(eigenvalues) 32 | return ev_ratio 33 | 34 | # using different numbers of principal components to generate a series of reconstruction matrices 35 | def reconstruct_matrix(self): 36 | # the parameter recon_pc_num is the number of top principal components used in the reconstruction matrix. 
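        # Taken together with get_ev_ratio and get_anomaly_score below, the score of sample x_i is
        #     score(x_i) = sum_{k=1}^{n} ||x_i - recon_k(x_i)||_2 * ev(k),
        # where recon_k is the reconstruction from the top-k principal components and ev(k) is the
        # cumulative explained-variance ratio of those k components, so deviations that only the
        # later (smaller-eigenvalue) components can explain receive larger weights.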
37 | def reconstruct(recon_pc_num): 38 | pca_recon = PCA(n_components=recon_pc_num, random_state=self.random_state) 39 | pca_reduction = pca_recon.fit_transform(self.matrix) 40 | recon_matrix = pca_recon.inverse_transform(pca_reduction) 41 | assert_description = 'The shape of the reconstruction matrix should be equal to that of the initial matrix.' 42 | assert recon_matrix.shape == self.matrix.shape, assert_description 43 | return recon_matrix 44 | 45 | # generating a series of reconstruction matrices 46 | col = self.matrix.shape[1] 47 | recon_matrices = [reconstruct(i) for i in range(1, col+1)] 48 | 49 | # randomly select two reconstruction matrices to verify that they are different 50 | i, j = np.random.choice(range(col), size=2, replace=False) 51 | description = 'The reconstruction matrices generated by different number of principal components are different.' 52 | assert not np.all(recon_matrices[i] == recon_matrices[j]), description 53 | return recon_matrices 54 | 55 | # calculate the final anomaly score 56 | def get_anomaly_score(self): 57 | # calculate the modulus of a vector 58 | def compute_vector_length(vector): 59 | square_sum = np.square(vector).sum() 60 | return np.sqrt(square_sum) 61 | 62 | # calculate the anomaly score generated by a single reconstruction matrix for all samples 63 | def compute_sub_score(recon_matrix, ev): 64 | delta_matrix = self.matrix - recon_matrix 65 | score = np.apply_along_axis(compute_vector_length, axis=1, arr=delta_matrix) * ev 66 | return score 67 | 68 | ev_ratio = self.get_ev_ratio() 69 | reconstruct_matrices = self.reconstruct_matrix() 70 | # summarize the anomaly scores generated by all reconstruction matrices 71 | anomaly_scores = list(map(compute_sub_score, reconstruct_matrices, ev_ratio)) 72 | return np.sum(anomaly_scores, axis=0) 73 | 74 | # returns indices with the highest anomaly score based on a specific contamination 75 | def get_anomaly_indices(self): 76 | indices_desc = np.argsort(-self.get_anomaly_score()) 77 | anomaly_num = int(np.ceil(len(self.matrix) * self.contamination)) 78 | anomaly_indices = indices_desc[:anomaly_num] 79 | return anomaly_indices 80 | 81 | # returns 1 if the prediction is an anomaly, otherwise returns 0 82 | def predict(self): 83 | anomaly_indices = self.get_anomaly_indices() 84 | pred_result = np.isin(range(len(self.matrix)), anomaly_indices).astype(int) 85 | return pred_result 86 | -------------------------------------------------------------------------------- /UnSupervised-Based on PCA/recon_error_pca_svd.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | from numpy import linalg 7 | 8 | 9 | class PCA_Via_SVD: 10 | def __init__(self, matrix, n_components=None): 11 | self.matrix = matrix 12 | self.n_components = matrix.shape[1] if n_components==None else n_components 13 | U, sigma, Vh = self.svd_matrix() 14 | # cov_eigvalue : eigenvalues of covariance matrix 15 | cov_eigvalue = np.square(sigma) / (self.matrix.shape[0] - 1) 16 | self.components_ = Vh[:n_components, ] 17 | self.explained_variance_ = cov_eigvalue[:n_components] 18 | self.Vh = Vh 19 | 20 | def scale_matrix(self): 21 | def scale_vector(vector): 22 | delta = vector - np.mean(vector) 23 | std = np.std(vector, ddof=0) 24 | return delta / std 25 | matrix_scaled = np.apply_along_axis(arr=self.matrix, func1d=scale_vector, axis=0) 26 | return matrix_scaled 27 | 28 | # Singular value decomposition 
of scaled matrix 29 | def svd_matrix(self): 30 | U, sigma, Vh = linalg.svd(self.scale_matrix()) 31 | assert len(sigma) == min(self.matrix.shape) 32 | return U, sigma, Vh 33 | 34 | # Obtaining projection matrix Q through matrix V 35 | def implement_pca(self): 36 | # Q : The transpose of the first n_components row vectors of Vh 37 | Q = self.Vh[:self.n_components, :].T 38 | pca_result = np.dot(self.scale_matrix(), Q) 39 | assert pca_result.shape[1] == self.n_components 40 | return pca_result 41 | 42 | 43 | class PCA_Recon_Error(PCA_Via_SVD): 44 | def __init__(self, matrix, contamination=0.01, n_components=None): 45 | super(PCA_Recon_Error, self).__init__(matrix, n_components) 46 | self.contamination = contamination 47 | 48 | def get_ev_ratio(self): 49 | pca_result = self.implement_pca() 50 | eigenvalues = self.explained_variance_ 51 | # ev_ratio is the cumulative proportion of eigenvalues and the weight of 52 | # reconstruction error corresponding to different number of principal components 53 | ev_ratio = np.cumsum(eigenvalues) / np.sum(eigenvalues) 54 | return ev_ratio 55 | 56 | # using different numbers of principal components to generate a series of reconstruction matrices 57 | def reconstruct_matrix(self): 58 | # the parameter recon_pc_num is the number of top principal components used in the reconstruction matrix. 59 | def reconstruct(recon_pc_num): 60 | instance = PCA_Via_SVD(self.matrix, n_components=recon_pc_num) 61 | recon_matrix = np.dot(instance.implement_pca(), instance.Vh[:recon_pc_num, :]) 62 | assert_description = 'The shape of the reconstruction matrix should be equal to that of the initial matrix.' 63 | assert np.all(recon_matrix.shape == self.matrix.shape), assert_description 64 | return recon_matrix 65 | 66 | # generating a series of reconstruction matrices 67 | col = self.matrix.shape[1] 68 | recon_matrices = [reconstruct(i) for i in range(1, col+1)] 69 | 70 | # randomly select two reconstruction matrices to verify that they are different 71 | i, j = np.random.choice(range(col), size=2, replace=False) 72 | description = 'The reconstruction matrices generated by different number of principal components are different.' 
73 | assert not np.all(recon_matrices[i] == recon_matrices[j]), description 74 | return recon_matrices 75 | 76 | # calculate the final anomaly score 77 | def get_anomaly_score(self): 78 | # calculate the modulus of a vector 79 | def compute_vector_length(vector): 80 | square_sum = np.square(vector).sum() 81 | return np.sqrt(square_sum) 82 | 83 | # calculate the anomaly score generated by a single reconstruction matrix for all samples 84 | def compute_sub_score(recon_matrix, ev): 85 | delta_matrix = self.scale_matrix() - recon_matrix 86 | score = np.apply_along_axis(compute_vector_length, axis=1, arr=delta_matrix) * ev 87 | return score 88 | 89 | ev_ratio = self.get_ev_ratio() 90 | reconstruct_matrices = self.reconstruct_matrix() 91 | # summarize the anomaly scores generated by all reconstruction matrices 92 | anomaly_scores = list(map(compute_sub_score, reconstruct_matrices, ev_ratio)) 93 | return np.sum(anomaly_scores, axis=0) 94 | 95 | # returns indices with the highest anomaly score based on a specific contamination 96 | def get_anomaly_indices(self): 97 | indices_desc = np.argsort(-self.get_anomaly_score()) 98 | anomaly_num = int(np.ceil(len(self.matrix) * self.contamination)) 99 | anomaly_indices = indices_desc[:anomaly_num] 100 | return anomaly_indices 101 | 102 | # returns 1 if the prediction is an anomaly, otherwise returns 0 103 | def predict(self): 104 | anomaly_indices = self.get_anomaly_indices() 105 | pred_result = np.isin(range(len(self.matrix)), anomaly_indices).astype(int) 106 | return pred_result 107 | -------------------------------------------------------------------------------- /UnSupervised-Based on PCA/robustpcc.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | 4 | import numpy as np 5 | from sklearn.decomposition import PCA 6 | from sklearn.preprocessing import StandardScaler 7 | 8 | 9 | class Mahalanobis: 10 | """Implementation of Mahalanobis distance's variant.""" 11 | 12 | def __init__(self, train_matrix, gamma=0.005, random_state=2018): 13 | """ 14 | :param train_matrix: training set, shape = [n_samples, n_features]. 15 | :param gamma: float, default=0.005 16 | The proportion of abnormal samples to be eliminated in the training set. 17 | Increasing gamma helps to improve the sensitivity of the algorithm to abnormal samples. 
18 | """ 19 | self.scaler = StandardScaler().fit(train_matrix) 20 | self.train_matrix = self.scaler.transform(train_matrix) 21 | self.gamma = gamma 22 | self.random_state = random_state 23 | 24 | def decompose_train_matrix(self): 25 | pca = PCA(n_components=None, random_state=self.random_state) 26 | pca.fit(self.train_matrix) 27 | eigenvalues = pca.explained_variance_ 28 | components = pca.components_ 29 | return eigenvalues, components 30 | 31 | # the return value of compute_mahal_dist function is similar to Mahalanobis distance 32 | def compute_mahal_dist(self): 33 | eigenvalues, components = self.decompose_train_matrix() 34 | def get_score(pc_idx): 35 | # param pc_idx: the index of the principal components 36 | inner_product = np.dot(self.train_matrix, components[pc_idx]) 37 | score = np.square(inner_product) / eigenvalues[pc_idx] 38 | return score 39 | # calculate the score of each sample of the training set on all principal components and sum it 40 | mahal_dist = sum([get_score(idx) for idx in range(len(eigenvalues))]) 41 | return mahal_dist 42 | 43 | # return the indices of the anomaly samples in the training set 44 | def search_original_anomaly_indices(self): 45 | indices_sort = np.argsort(-self.compute_mahal_dist()) 46 | anomaly_num = int(np.ceil(len(self.train_matrix) * self.gamma)) 47 | original_anomaly_indices = indices_sort[:anomaly_num] 48 | return original_anomaly_indices 49 | 50 | def eliminate_original_anomalies(self): 51 | original_anomaly_indices = self.search_original_anomaly_indices() 52 | train_matrix_indices = range(len(self.train_matrix)) 53 | remain_bool = np.isin(train_matrix_indices, original_anomaly_indices, invert=True) 54 | remain_matrix = self.train_matrix[remain_bool] # np.extract(remain_bool, self.train_matrix) 55 | return remain_matrix 56 | 57 | 58 | class RobustPCC(Mahalanobis): 59 | """Implementation of RobustPCC Algorithm""" 60 | def __init__(self, train_matrix, gamma=0.005, quantile=98.99, random_state=2018): 61 | """ 62 | :param quantile: float, default=98.99, threshold quantile of whether it is abnormal or not. 63 | Increasing quantile helps to reduce the FPR(False Positive Rate) of the algorithm. 64 | """ 65 | super(RobustPCC, self).__init__(train_matrix, gamma, random_state) 66 | self.quantile = quantile 67 | 68 | def decompose_remain_matrix(self): 69 | remain_matrix = self.eliminate_original_anomalies() 70 | pca = PCA(n_components=None, random_state=self.random_state) 71 | pca.fit(remain_matrix) 72 | components = pca.components_ 73 | eigenvalues = pca.explained_variance_ 74 | cumsum_ratio = np.cumsum(eigenvalues) / np.sum(eigenvalues) 75 | return components, eigenvalues, cumsum_ratio 76 | 77 | def compute_matrix_score(self, matrix, components, eigenvalues): 78 | """ 79 | :func compute_matrix_score : calculate the score of matrix on any set of eigenvalues and components. 80 | :func get_observation_score : calculate the score of a single sample on any set of eigenvalues and components. 
81 | :param observation: a single sample(row) 82 | """ 83 | def get_observation_score(observation): 84 | def sub_score(component, eigenvalue): 85 | inner_product = np.dot(observation, component) 86 | score = np.square(inner_product) / eigenvalue 87 | return score 88 | total_score = sum(map(sub_score, components, eigenvalues)) 89 | return total_score 90 | matrix_scores = np.apply_along_axis(arr=matrix, axis=1, func1d=get_observation_score) 91 | return matrix_scores 92 | 93 | def compute_major_minor_scores(self, matrix): 94 | components, eigenvalues, cumsum_ratio = self.decompose_remain_matrix() 95 | 96 | major_pc_num = len(np.argwhere(cumsum_ratio < 0.5)) + 1 97 | # major_components:corresponding to the first few principal components whose cumulative eigenvalues 98 | # account for about 50% after the eigenvalues are arranged in descending order. 99 | major_components = components[:major_pc_num, :] 100 | major_eigenvalues = eigenvalues[:major_pc_num] 101 | 102 | minor_pc_num = len(np.argwhere(eigenvalues < 0.2)) 103 | # minor_components:the principal components corresponding to the eigenvalue less than 0.2 104 | minor_components = components[-minor_pc_num:, :] 105 | minor_eigenvalues = eigenvalues[-minor_pc_num:] 106 | 107 | # compute_matrix_score:calculate the score of the given matrix corresponding to major/minor principal components. 108 | major_scores = self.compute_matrix_score(matrix, major_components, major_eigenvalues) 109 | minor_scores = self.compute_matrix_score(matrix, minor_components, minor_eigenvalues) 110 | return major_scores, minor_scores 111 | 112 | def determine_thresholds(self): 113 | remain_matrix = self.eliminate_original_anomalies() 114 | major_scores, minor_scores = self.compute_major_minor_scores(remain_matrix) 115 | # c1, c2: the anomaly thresholds corresponding to major/minor principal components 116 | c1 = np.percentile(major_scores, self.quantile) 117 | c2 = np.percentile(minor_scores, self.quantile) 118 | return c1, c2 119 | 120 | # predict the testset, 1 for anomaly, 0 for normal 121 | def predict(self, test_matrix): 122 | test_matrix = self.scaler.transform(test_matrix) 123 | test_major_scores, test_minor_scores = self.compute_major_minor_scores(test_matrix) 124 | c1, c2 = self.determine_thresholds() 125 | 126 | # determining the deduplicated indices of abnormal samples in test set according to scores and thresholds 127 | anomaly_indices_major = np.argwhere(test_major_scores > c1) 128 | anomaly_indices_minor = np.argwhere(test_minor_scores > c2) 129 | test_anomaly_indices = np.union1d(anomaly_indices_major, anomaly_indices_minor) 130 | 131 | # descending arrangement of the indices of abnormal samples according to the score 132 | test_scores = test_major_scores + test_minor_scores 133 | test_anomaly_scores = test_scores[test_anomaly_indices] 134 | test_anomaly_indices_desc = test_anomaly_indices[np.argsort(-test_anomaly_scores)] 135 | pred_result = np.isin(range(len(test_matrix)), test_anomaly_indices_desc).astype(int) 136 | return pred_result 137 | -------------------------------------------------------------------------------- /UnSupervised-Isolation Forest/Isolation Forest.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Isolation Forest/Isolation Forest.pdf -------------------------------------------------------------------------------- /UnSupervised-Isolation Forest/IsolationForest.py: 
-------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.ensemble import IsolationForest 7 | 8 | 9 | ''' 10 | API简要说明: 11 | sklearn.ensemble.IsolationForest(n_estimators=100, max_samples='auto', contamination='legacy', max_features=1.0, 12 | bootstrap=False, n_jobs=None, behaviour=’old’, random_state=None, verbose=0) 13 | n_estimators:iTree的个数; 14 | max_samples:构建单颗iTree的样本数; 15 | contamination:异常值的比例; 16 | max_features:构建单颗iTree的特征数; 17 | bootstrap:布尔型参数,默认取False,表示构建iTree时有放回地进行抽样; 18 | ''' 19 | 20 | # 设置训练样本数及异常样本比例 21 | n_samples = 10000 22 | outliers_fraction = 0.25 23 | n_inliers = int((1. - outliers_fraction) * n_samples) 24 | n_outliers = int(outliers_fraction * n_samples) 25 | 26 | # //表示整数除法 27 | rng = np.random.RandomState(123) 28 | X = 0.3 * rng.randn(n_inliers // 2, 2) 29 | 30 | # 构建正常样本与异常样本 31 | X_train = np.r_[X + 2, X - 2] 32 | outliers = rng.uniform(low=-6, high=6, size=(n_outliers, 2)) 33 | 34 | # 正常样本与异常样本的融合 35 | X_train = np.r_[X_train, outliers] 36 | 37 | clf = IsolationForest(contamination=outliers_fraction, random_state=2018, n_jobs=-1, behaviour="new") 38 | # predict / fit_predict方法返回每个样本是否为正常值,若返回1表示正常值,返回-1表示异常值 39 | y_pred_train = clf.fit_predict(X_train) 40 | pred = np.array(['正常' if i==1 else '异常' for i in y_pred_train]) 41 | 42 | # 分数越小于0,越有可能是异常值 43 | scores_pred = clf.decision_function(X_train) 44 | dict_ = {'anomaly_score':scores_pred, 'y_pred':y_pred_train, 'result':pred} 45 | scores = pd.DataFrame(dict_) 46 | print(scores.sample(5)) 47 | -------------------------------------------------------------------------------- /UnSupervised-Isolation Forest/Pics/Algorithm2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Isolation Forest/Pics/Algorithm2.jpg -------------------------------------------------------------------------------- /UnSupervised-Isolation Forest/Pics/Isolation Score.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Isolation Forest/Pics/Isolation Score.jpg -------------------------------------------------------------------------------- /UnSupervised-Isolation Forest/Pics/fdfd: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /UnSupervised-Isolation Forest/ReadMe.md: -------------------------------------------------------------------------------- 1 | - **Author:** 马肖 2 | - **E-Mail:** maxiaoscut@aliyun.com 3 | - **GitHub:** https://github.com/Albertsr 4 | 5 | --- 6 | 7 | #### 1. iForest的核心思想 8 | - 异常点相比其他数据点较为疏离,只需少数几次切分就可以被隔离,即异常样本更容易被划分至叶结点,从而使得异常样本所属的叶结点距离根节点有更短的路径长度。 9 | - 在iTree中,异常点被isolated之后更加靠近树的根部,而正常数据isolated之后在树中更深 10 | 11 | --- 12 | 13 | #### 2. 
概述 14 | - iForest 适用于**连续性数据**的异常值检测,属于**无监督、非参数(对样本的总体分布不做假设)模型** 15 | - **iForest利用了异常样本的两个特点**: 16 | - **few** : 异常样本在样本集中占比较小 17 | - **different** : 异常样本的某些特征的取值明显区别于正常样本 18 | - iForest只有两个参数:**iTree的个数、 训练每棵iTree的样本数** 19 | - iTree是**二叉树结构**,iTree的个数默认取100,论文原文:path lengths usually converge well before t = 100 20 | - **运用小样本集训练单颗itree有助于减轻swamping and masking effect** 21 | - **swamping**:是指将正常样本识别为异常样本,类似于FP; 22 | - **masking**:是指异常样本没有被识别出来,类似于FN; 23 | - swamping与masking更容易在数据量较大的情况下出现,因此训练单棵iTree的样本数不宜过多,默认不超过256 24 | - 大样本集不一定增强其性能,反而会增加计算量和内存占用 25 | 26 | - 在n个训练样本均不相同的情况下,训练出的iTree具有n个叶结点,n-1个内部结点(非叶结点),总结点数为2n-1 27 | 28 | --- 29 | 30 | #### 3. iForest的训练过程 31 | 32 | - 抽取若干个样本构成子样本集,放置于根节点,用于训练单颗iTree 33 | - 随机选择一个特征q作为起始结点,然后在特征q的最大值和最小值之间随机选择一个值p作为分割点 34 | - 根据属性q的取值进行分枝,把q
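<p的样本划分至左子节点,把q>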
=p的样本划分至右子节点 35 | - 重复上两步,递归地构造左子节点和右子节点,直到满足以下条件之一: 36 | - 数据不可再分,即:只包含单个样本,或全部样本的取值相同 37 | - 二叉树达到了限定的最大深度 38 | - 获得t个iTree之后,iForest训练就结束了 39 | 40 | --- 41 | 42 | #### 4. 运用iForest判断样本是否异常 43 | 44 | - **将训练数据x遍历每一棵iTree,然后计算h(x)、E(h(x))** 45 | - **h(x):** 样本x从iTree的根节点到达叶结点所途径的路径长度,等价于样本x落入叶结点所需的划分次数 46 | - **E(h(x)):** 样本x在整个iForest上的平均路径长度 47 | 48 | - **计算:c(n) = 2H(n-1) - 2(n-1)/n** 49 | - 其中n为训练单颗iTree的样本数,H(i)为调和级数,且H(i)=In(i)+0.577(欧拉常数) 50 | - c(n)用于对h(x)进行标准化 51 | 52 | - **根据下列公式求异常分数** 53 |  54 | 55 | - **根据异常分数判断样本是否异常** 56 | - **异常分数与E(h(x))成反比,与样本异常程度成正比** 57 | - 当E(h(x))趋近于c(n)时,s趋近于0.5,若所有样本的异常分数均接近0.5,则表明数据中无明显异常值 58 | - 当E(h(x))趋近于0时,s趋近于1,此时样本x极可能是异常值 59 | - 当E(h(x))趋近于n-1时(即趋于最大划分次数),s趋近于0,此时样本x极可能是正常值 60 | 61 | --- 62 | 63 | #### 5. 算法优势 64 | - **缓解swamping and masking的出现** 65 | - **swamping**:是指将正常样本识别为异常样本;**masking**:是指异常样本没有被识别出来。这两种情况都是发生在数据量较大的情况下。 66 | - **iForest算法能有效地减缓上述两种情况发生的原因:** 67 | - 子采样限制了训练单颗iTree的样本数,有助于增强iTree的区分能力 68 | - 每一棵iTree的样本集和划分点都是随机产生的,因此每一棵iTree都具有独立性 69 | 70 | - **相比基于距离或密度的算法,iForest节省了大量的计算成本**:iForest utilizes no distance or density measures to detect anomalies.This eliminates major computational cost of distance calculation in all distance-based methods and density-based methods 71 | 72 | - **iForest的时间复杂度、内存占用较少,线性增长于样本个数**:iForest has a linear time complexity with a low 73 | constant and a low memory requirement 74 | 75 | - **iForest具备处理高维大数据集的能力**:iForest has the capacity to scale up to handle extremely 76 | large data size and high-dimensional problems with a 77 | large number of irrelevant attributes 78 | 79 |  80 | -------------------------------------------------------------------------------- /UnSupervised-Local Outlier Factor/LOF:Identifying Density-Based Local Outliers.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Local Outlier Factor/LOF:Identifying Density-Based Local Outliers.pdf -------------------------------------------------------------------------------- /UnSupervised-Local Outlier Factor/LocalOutlierFactor.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | from sklearn.neighbors import LocalOutlierFactor 7 | 8 | 9 | # 设置训练样本数及异常样本比例 10 | n_samples = 1000 11 | contamination = 0.01 12 | n_inliers = int((1. - contamination) * n_samples) 13 | n_outliers = int(contamination * n_samples) 14 | 15 | rng = np.random.RandomState(2017) 16 | X = 0.3 * rng.randn(n_inliers // 2, 2) 17 | 18 | # 构建正常样本与异常样本 19 | inliers = np.r_[X + 2, X - 2] 20 | outliers = rng.uniform(low=-6, high=6, size=(n_outliers, 2)) 21 | 22 | # 正常样本与异常样本的融合 23 | X_train = np.r_[inliers, outliers] 24 | 25 | lof = LocalOutlierFactor(contamination=contamination, n_jobs=-1) 26 | # fit_predict返回-1则表示为异常值;返回1表示非异常值 27 | y_train_pred = lof.fit_predict(X_train) 28 | # 返回异常样本的索引 29 | outliers_indices = np.argwhere(y_train_pred==-1).ravel() 30 | print('训练集异常样本索引 : {}'.format(outliers_indices)) 31 | 32 | # 属性negative_outlier_factor_:返回负的LOF score,其绝对值越大,样本越可能异常 33 | lof_score = -lof.negative_outlier_factor_ 34 | outliers_indices_desc = np.argsort(-lof_score)[:len(outliers)] 35 | print('按异常程度降序排列的训练集异常样本索引 : {}'.format(outliers_indices_desc)) 36 | 37 | # 生成测试集 38 | n_samples = 1000 39 | contamination = 0.01 40 | n_inliers = int((1. 
- contamination) * n_samples) 41 | n_outliers = int(contamination * n_samples) 42 | 43 | rng = np.random.RandomState(2018) 44 | X = 0.3 * rng.randn(n_inliers // 2, 2) 45 | 46 | # Construct the inliers and the outliers 47 | inliers = np.r_[X + 2, X - 2] 48 | outliers = rng.uniform(low=-6, high=6, size=(n_outliers, 2)) 49 | X_test = np.r_[inliers, outliers] 50 | 51 | # The parameter novelty must be set to True in order to call predict on new data 52 | lof = LocalOutlierFactor(n_neighbors=20, contamination=contamination, n_jobs=-1, novelty=True) 53 | # When novelty=True, lof.fit_predict(X_test) raises an error; fit on the training set and then call predict 54 | lof.fit(X_train) 55 | y_test_pred = lof.predict(X_test) 56 | 57 | # Indices of the samples flagged as outliers 58 | outliers_indices = np.argwhere(y_test_pred==-1).ravel() 59 | print('Outlier indices in the test set : {}'.format(outliers_indices)) -------------------------------------------------------------------------------- /UnSupervised-Local Outlier Factor/Pics/1.K-dist.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Local Outlier Factor/Pics/1.K-dist.jpg -------------------------------------------------------------------------------- /UnSupervised-Local Outlier Factor/Pics/2.reach_dist.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Local Outlier Factor/Pics/2.reach_dist.jpg -------------------------------------------------------------------------------- /UnSupervised-Local Outlier Factor/Pics/3.lrd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Local Outlier Factor/Pics/3.lrd.jpg -------------------------------------------------------------------------------- /UnSupervised-Local Outlier Factor/Pics/4.LOF.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Local Outlier Factor/Pics/4.LOF.jpg -------------------------------------------------------------------------------- /UnSupervised-Local Outlier Factor/ReadMe.md: -------------------------------------------------------------------------------- 1 | - **Author:** 马肖 2 | - **E-Mail:** maxiaoscut@aliyun.com 3 | - **GitHub:** https://github.com/Albertsr 4 | 5 | --- 6 | 7 | ## 1. k-distance (distance to the k-th nearest neighbor) 8 | 9 |  10 | 11 | --- 12 | 13 | ## 2. Reachability distance 14 | 15 |  16 | 17 | --- 18 | 19 | ## 3. Local reachability density (lrd) 20 | 21 |  22 |
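The three quantities above are defined only in the linked figures, so the following minimal NumPy sketch makes them concrete (illustrative only, not one of the repository's scripts; the toy points, the value of k, and the helper names are assumptions). It computes the k-distance, the reachability distances, and the local reachability density for a tiny 2-D dataset; the LOF defined in the next section is then simply the ratio between the average lrd of a point's neighbors and the point's own lrd.

```python
import numpy as np

def k_distance_and_neighbors(X, i, k):
    """k-distance of point i and the indices of its k-distance neighborhood."""
    dist = np.linalg.norm(X - X[i], axis=1)
    dist[i] = np.inf                       # exclude the point itself
    k_dist = np.sort(dist)[k - 1]          # distance to the k-th nearest neighbor
    neighbors = np.where(dist <= k_dist)[0]
    return k_dist, neighbors

def reach_dist(X, k, a, b):
    """Reachability distance of point a w.r.t. point b: max(k-distance(b), d(a, b))."""
    k_dist_b, _ = k_distance_and_neighbors(X, b, k)
    return max(k_dist_b, np.linalg.norm(X[a] - X[b]))

def lrd(X, i, k):
    """Local reachability density: inverse of the mean reachability distance from i to its neighbors."""
    _, neighbors = k_distance_and_neighbors(X, i, k)
    return 1.0 / np.mean([reach_dist(X, k, i, j) for j in neighbors])

# Toy data: a tight cluster plus one isolated point (index 4)
X = np.array([[0.0, 0.0], [0.1, 0.0], [0.0, 0.1], [0.1, 0.1], [3.0, 3.0]])
for i in range(len(X)):
    print(i, round(lrd(X, i, k=2), 3))     # the isolated point gets a much smaller lrd
```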
23 | --- 24 | 25 | ## 4. Local Outlier Factor (LOF) 26 | 27 |  28 | 29 | - LOF(A) is defined as the ratio of the average local reachability density of the points in A's k-distance neighborhood to the local reachability density (lrd) of point A itself 30 | - **The LOF algorithm measures how anomalous a data point is not by its absolute local density, but by its density relative to that of its neighboring points** 31 | - The further LOF(A) falls below 1, the more likely point A lies in a relatively dense region, and the more likely it is an inlier 32 | - The closer LOF(A) is to 1, the more similar point A's local reachability density is to that of its k-nearest neighbors, and the less likely it is an outlier 33 | - The further LOF(A) rises above 1, the more isolated point A is from the other points, and the more likely it is an outlier 34 | -------------------------------------------------------------------------------- /UnSupervised-Mahalanobis Distance/Pics/Mahdist_verify_result.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Mahalanobis Distance/Pics/Mahdist_verify_result.jpg -------------------------------------------------------------------------------- /UnSupervised-Mahalanobis Distance/Pics/mahal_dist.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Mahalanobis Distance/Pics/mahal_dist.jpg -------------------------------------------------------------------------------- /UnSupervised-Mahalanobis Distance/Pics/变体参数含义.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Mahalanobis Distance/Pics/变体参数含义.jpg -------------------------------------------------------------------------------- /UnSupervised-Mahalanobis Distance/Pics/马氏距离变体.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Anomaly-Detection/89a4a0720bcfcd1ee68c8d13f305ac996e745b9f/UnSupervised-Mahalanobis Distance/Pics/马氏距离变体.jpg -------------------------------------------------------------------------------- /UnSupervised-Mahalanobis Distance/ReadMe.md: -------------------------------------------------------------------------------- 1 | - **Author:** MaXiao 2 | - **E-Mail:** maxiaoscut@aliyun.com 3 | 4 | --- 5 | 6 | ## 1. Mahalanobis distance 7 | 8 | #### 1.1 The Mahalanobis distance is equivalent to the Euclidean distance in the normalized principal component space 9 | 10 | - **Normalized principal component space** 11 | - Perform principal component analysis on the dataset, i.e. eigendecompose its covariance matrix to obtain the principal components 12 | - Normalize all principal components; these normalized principal components form the coordinate axes of the normalized principal component space 13 | 14 | - **Mapping the samples into the normalized principal component space turns the data from an ellipsoidal distribution into a spherical one** 15 | - The projection (coordinate component) of a sample on each axis of the normalized principal component space is obtained as the inner product of the sample vector with the corresponding normalized principal component 16 | 17 | - **The Mahalanobis distance between two vectors is equivalent to their Euclidean distance in the normalized principal component space** 18 | - If each of these axes is re-scaled to have unit variance, then the Mahalanobis distance corresponds to standard Euclidean distance in the transformed space. 19 | 20 | 21 | #### 1.2 Properties of the Mahalanobis distance 22 | - **Property 1: the Mahalanobis distance is unitless and scale-invariant, and it intrinsically takes the correlations between the dataset's coordinate axes into account** 23 | - The Mahalanobis distance is thus unitless and scale-invariant, and takes into account the correlations of the data set. 24 | 25 | - **Property 2: the Mahalanobis distance grows with the sample's deviation along each principal component** 26 | - This distance is zero if P is at the mean of D, and grows as P moves away from the mean along each principal component axis 27 | 28 | - The Mahalanobis distance measures the number of standard deviations from P to the mean of D. 29 | 30 | - Reference: [Wikipedia : Mahalanobis distance](https://en.wikipedia.org/wiki/Mahalanobis_distance) 31 | 32 | --- 33 | 34 | ## 2. Computing the Mahalanobis distance and its implementation 35 | #### 2.1 Python implementation: [mahal_dist.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Mahalanobis%20Distance/mahal_dist.py) 36 | 37 | #### 2.2 Formula for the Mahalanobis distance between a sample x and the center of the dataset 38 |  39 |
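The equivalence described in section 1.1 can be checked numerically. Below is a minimal sketch (illustrative only, not one of the repository's scripts; the toy covariance matrix and the variable names are assumptions): it compares the quadratic-form formula from section 2.2 with the Euclidean norm of the sample coordinates in the unit-variance principal component space.

```python
import numpy as np
from numpy import linalg as LA

rng = np.random.RandomState(0)
# toy 3-dimensional data with correlated features
X = rng.multivariate_normal(mean=[0.0, 0.0, 0.0],
                            cov=[[4.0, 1.0, 0.0], [1.0, 2.0, 0.5], [0.0, 0.5, 1.0]],
                            size=500)
delta = X - X.mean(axis=0)
cov = np.cov(X, rowvar=False, ddof=1)

# Mahalanobis distance as in section 2.2: sqrt((x - mu)^T * inv(cov) * (x - mu))
mahal = np.sqrt(np.sum(delta @ LA.inv(cov) * delta, axis=1))

# Euclidean distance in the normalized principal component space:
# project the centered samples onto the eigenvectors, then rescale each axis to unit variance
eig_values, eig_vectors = LA.eigh(cov)
coords = (delta @ eig_vectors) / np.sqrt(eig_values)
euclid_pc = LA.norm(coords, axis=1)

print(np.allclose(mahal, euclid_pc))  # expected to print True
```

The quadratic form in the first computation is exactly what cal_mahal_dist in mahal_dist.py evaluates row by row.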
40 | --- 41 | 42 | ## 3. The Mahalanobis distance variant and its implementation 43 | #### 3.1 Python implementation: [mahal_dist_variant.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Mahalanobis%20Distance/mahal_dist_variant.py) 44 | 45 | #### 3.2 Source paper: [A Novel Anomaly Detection Scheme Based on Principal Component [Shyu et al., 2003]](https://cn.bing.com/academic/profile?id=6ffacfce89595db316f3fd3bfeea1c1e&encoded=0&v=paper_preview&mkt=zh-cn) 46 | 47 | #### 3.3 Computation 48 | 49 |  50 | 51 | - **Meaning of the parameters** 52 | 53 |  54 | 55 | - **Identifying anomalous samples:** when Score(x) exceeds a chosen threshold, sample x is labelled as an anomaly 56 | 57 | --- 58 | 59 | ## 4. The Mahalanobis distance and its variant assess the abnormality of samples in a completely consistent way 60 | 61 | #### 4.1 Verification method 62 | - Generate several experimental datasets with different random seeds 63 | - Sort the sample indices in ascending or descending order of the scores returned by each of the two methods, e.g. the index of the sample with the largest value comes first, and so on 64 | - If the index sequences obtained by sorting in descending order of the Mahalanobis distance and of its variant are identical, then the two methods assess the degree of abnormality of every sample in the dataset in exactly the same way 65 | - In other words, for any two distinct samples a and b drawn from the dataset, if the Mahalanobis distance indicates that a deviates further from the center of the dataset than b does, the variant reaches the same verdict on their relative order 66 | 67 | #### 4.2 Verification code: [verify_mahal_equivalence.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Mahalanobis%20Distance/verify_mahal_equivalence.py) 68 | 69 | #### 4.3 Conclusions 70 | - The Mahalanobis distance and its variant are **entirely consistent in how they order the abnormality of the samples in a dataset** 71 | - Verification on several randomly generated datasets **shows that the above conclusion holds** 72 | - The number of rows, the number of columns, and the proportion of anomalies of each dataset are drawn at random from fixed intervals 73 | - Normal samples follow the standard normal distribution; anomalous samples consist of two subsets following a gamma distribution and an exponential distribution respectively 74 | - See the verification code above for further details 75 | 76 |  77 | -------------------------------------------------------------------------------- /UnSupervised-Mahalanobis Distance/mahal_dist.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | 4 | import numpy as np 5 | from numpy import linalg as LA 6 | 7 | def cal_mahal_dist(matrix): 8 | matrix_center = np.mean(matrix, axis=0) 9 | delta = matrix - matrix_center 10 | 11 | # calculate the covariance matrix and its inverse matrix 12 | cov_matrix = np.cov(matrix, rowvar=False, ddof=1) 13 | cov_matrix_inv = LA.inv(cov_matrix) 14 | 15 | # calculate the Mahalanobis distance between a single vector and the center of the dataset 16 | def md_vector(vector): 17 | inner_prod = np.dot(vector, cov_matrix_inv) 18 | dist = np.sqrt(np.dot(inner_prod, vector)) 19 | return dist 20 | 21 | mahal_dist = np.apply_along_axis(arr=delta, axis=1, func1d=md_vector) 22 | assert len(mahal_dist) == len(matrix) 23 | return mahal_dist 24 | -------------------------------------------------------------------------------- /UnSupervised-Mahalanobis Distance/mahal_dist_variant.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | 4 | import numpy as np 5 | from numpy import linalg as LA 6 | from sklearn.preprocessing import StandardScaler 7 | 8 | def cal_mahal_dist_variant(matrix): 9 | matrix = StandardScaler().fit_transform(matrix) 10 | cov_matrix = np.cov(matrix, rowvar=False, ddof=1) 11 | eigen_values, eigen_vectors = LA.eig(cov_matrix) 12 | 13 | def get_score(idx): 14 | inner_product = np.dot(matrix, eigen_vectors[:, idx]) 15 | score = np.square(inner_product) / eigen_values[idx] 16 | return score 17 | mahal_dist_variant = sum([get_score(i) for i in range(len(eigen_values))]) 18 | assert len(mahal_dist_variant) == len(matrix) 19 | return mahal_dist_variant 20 | -------------------------------------------------------------------------------- /UnSupervised-Mahalanobis Distance/verify_mahal_equivalence.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from mahal_dist import
cal_mahal_dist 7 | from mahal_dist_variant import cal_mahal_dist_variant 8 | 9 | def generate_dataset(seed): 10 | rdg = np.random.RandomState(seed) 11 | row = rdg.randint(8000, 10000) 12 | col = rdg.randint(30, 35) 13 | contamination = rdg.uniform(0.015, 0.025) 14 | 15 | outlier_num = int(row*contamination) 16 | inlier_num = row - outlier_num 17 | 18 | # the normal sample set obeys the standard normal distribution. 19 | inliers = rdg.randn(inlier_num, col) 20 | 21 | # split the outliers into two subsets of (nearly) equal size 22 | row_1 = outlier_num // 2 23 | row_2 = outlier_num - row_1 24 | 25 | # outliers_sub_1 obeys a gamma distribution and outliers_sub_2 obeys an exponential distribution. 26 | outliers_sub_1 = rdg.gamma(shape=2, scale=0.5, size=(row_1, col)) 27 | outliers_sub_2 = rdg.exponential(1.5, size=(row_2, col)) 28 | outliers = np.r_[outliers_sub_1, outliers_sub_2] 29 | 30 | dataset = np.r_[inliers, outliers] 31 | outliers_indices = range(len(dataset))[inlier_num:] 32 | return dataset 33 | 34 | def verify_maldist_equivalence(dataset): 35 | mahal_dist = cal_mahal_dist(dataset) 36 | indices_desc = np.argsort(-mahal_dist) 37 | 38 | mahal_dist_variant = cal_mahal_dist_variant(dataset) 39 | indices_desc_variant = np.argsort(-mahal_dist_variant) 40 | 41 | square_bool = np.allclose(mahal_dist**2, mahal_dist_variant) 42 | indices_bool = np.all(indices_desc==indices_desc_variant) 43 | return square_bool and indices_bool 44 | 45 | 46 | seeds = np.random.choice(range(1000), size=10, replace=False) 47 | datasets = [generate_dataset(seed) for seed in seeds] 48 | bool_results = [verify_maldist_equivalence(dataset) for dataset in datasets] 49 | 50 | ''' 51 | relevant conclusions 52 | - the square of the Mahalanobis distance is equal to its variant 53 | - they are consistent in assessing the abnormal degree of each sample 54 | ''' 55 | 56 | if all(bool_results): 57 | print('Right! The relevant conclusions about Mahalanobis distance are correct.') 58 | else: 59 | print('Wrong! The relevant conclusions about Mahalanobis distance are incorrect.') 60 | 61 | dataset_name = ['Dataset_' + str(i) for i in range(len(seeds))] 62 | verify_result = pd.DataFrame(bool_results, index=dataset_name, columns=['Equivalence']) 63 | print(verify_result.T) 64 | --------------------------------------------------------------------------------
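A closing remark on why the check `np.allclose(mahal_dist**2, mahal_dist_variant)` above is expected to succeed — a brief sketch of the argument, using standard eigendecomposition notation that is not spelled out in the repository:

```latex
% Let \Sigma = V \Lambda V^{\top} be the eigendecomposition of the covariance matrix,
% with eigenvectors v_j (the columns of V) and eigenvalues \lambda_j.
d_M^2(x) = (x-\mu)^{\top}\Sigma^{-1}(x-\mu)
         = (x-\mu)^{\top} V \Lambda^{-1} V^{\top} (x-\mu)
         = \sum_{j} \frac{\left( v_j^{\top}(x-\mu) \right)^{2}}{\lambda_j}
```

The right-hand side is exactly the score summed by cal_mahal_dist_variant; the preliminary StandardScaler step is an invertible axis-wise rescaling plus a shift, and the Mahalanobis distance is invariant under such transformations, so the squared distance and the variant coincide and, in particular, induce the same ranking of the samples.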