├── checkpoints
│   └── BieqAbdL_checkpoint.pt
├── model_sinne
│   ├── utils.py
│   └── SiNNE.py
├── data_od_evaluation
│   ├── wbc_pca_gt_hbos.csv
│   ├── wbc_pca_gt_iforest.csv
│   ├── vertebral_gt_hbos.csv
│   ├── wbc_pca_gt_copod.csv
│   ├── vertebral_gt_iforest.csv
│   ├── vertebral_gt_copod.csv
│   ├── speech_pca_gt_hbos.csv
│   ├── SPECT_pca_gt_hbos.csv
│   ├── arrhythmia_pca_gt_hbos.csv
│   ├── SPECT_pca_gt_iforest.csv
│   ├── SPECT_pca_gt_copod.csv
│   ├── satimage-2_pca_gt_hbos.csv
│   ├── wineQualityReds-od2_gt_hbos.csv
│   ├── arrhythmia_pca_gt_iforest.csv
│   ├── arrhythmia_pca_gt_copod.csv
│   ├── wineQualityReds-od2_gt_iforest.csv
│   ├── speech_pca_gt_iforest.csv
│   ├── speech_pca_gt_copod.csv
│   ├── wineQualityReds-od2_gt_copod.csv
│   ├── satimage-2_pca_gt_iforest.csv
│   ├── satimage-2_pca_gt_copod.csv
│   ├── letter_pca_gt_hbos.csv
│   ├── ionosphere_pca_gt_hbos.csv
│   ├── letter_pca_gt_copod.csv
│   ├── letter_pca_gt_iforest.csv
│   ├── ionosphere_pca_gt_iforest.csv
│   ├── optdigits_pca_gt_hbos.csv
│   ├── ionosphere_pca_gt_copod.csv
│   ├── wineQualityWhites-od2_gt_hbos.csv
│   ├── wineQualityWhites-od2_gt_iforest.csv
│   ├── optdigits_pca_gt_copod.csv
│   ├── optdigits_pca_gt_iforest.csv
│   ├── wineQualityWhites-od2_gt_copod.csv
│   ├── pima_gt_hbos.csv
│   ├── pima_gt_iforest.csv
│   └── pima_gt_copod.csv
├── utils
│   ├── eval_print_utils.py
│   ├── model_utils.py
│   └── synthetic_generator.py
├── eval
│   ├── evaluation_utils.py
│   ├── eva_main.py
│   └── evaluation_od.py
├── model_coin
│   ├── utils.py
│   ├── prediction_strength.py
│   └── COIN.py
├── model_iml
│   ├── LIME.py
│   ├── Anchor.py
│   ├── IntGrad.py
│   └── SHAP.py
├── model_aton
│   ├── utils.py
│   ├── ATON_ablation.py
│   ├── networks.py
│   ├── ATON_ablation2.py
│   ├── ATON.py
│   ├── ATON_ablation3.py
│   └── datasets.py
├── config.py
├── main2.py
├── README.md
├── main.py
├── data
│   └── 01-vertebral.csv
└── LICENSE
/checkpoints/BieqAbdL_checkpoint.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuhongzuo/outlier-interpretation/HEAD/checkpoints/BieqAbdL_checkpoint.pt -------------------------------------------------------------------------------- /model_sinne/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def powerset(s): 5 | x = len(s) 6 | pw_set = [] 7 | for i in range(1 << x): 8 | pw_set.append([s[j] for j in range(x) if (i & (1 << j))]) 9 | return pw_set 10 | 11 | -------------------------------------------------------------------------------- /data_od_evaluation/wbc_pca_gt_hbos.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 357,"[2, 3]" 3 | 358,"[1, 4]" 4 | 359,"[1, 2, 4]" 5 | 360,[5] 6 | 361,"[0, 2]" 7 | 362,[1] 8 | 363,"[2, 3, 5]" 9 | 364,"[1, 2, 4, 6]" 10 | 365,"[0, 5, 7, 8]" 11 | 366,[0] 12 | 367,[0] 13 | 368,[4] 14 | 369,[6] 15 | 370,"[0, 2, 3]" 16 | 371,"[0, 1, 3, 7]" 17 | 372,"[0, 1]" 18 | 373,"[3, 4, 5, 9]" 19 | 374,"[2, 3]" 20 | 375,[0] 21 | 376,"[0, 2, 6]" 22 | 377,[0] 23 | -------------------------------------------------------------------------------- /data_od_evaluation/wbc_pca_gt_iforest.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 357,[3] 3 | 358,"[1, 5]" 4 | 359,"[2, 5, 8]" 5 | 360,[5] 6 | 361,"[1, 2]" 7 | 362,"[1, 4, 5, 7, 9]" 8 | 363,"[5, 8]" 9 | 364,[1] 10 | 365,"[1, 2, 7, 8]" 11 | 366,"[0, 1, 5]" 12 | 367,"[0, 2]" 13 | 368,[4] 14 | 369,"[2, 6, 8]" 15 | 370,"[0, 1, 3, 5]" 16 | 371,[1] 17 | 372,"[1, 4, 7, 9]" 18 | 373,"[3, 4, 5, 8]" 19 | 374,[3] 20 | 375,[0] 21 | 376,"[0, 2]" 22 | 377,"[0, 2, 3, 5, 7]" 23 | 
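A note on the bit-mask trick in model_sinne/utils.py above: each integer i in [0, 2**len(s)) acts as an inclusion mask, and bit j of i decides whether s[j] joins the subset, so the loop enumerates all 2**len(s) subsets. A minimal usage sketch, assuming the module is importable from the repository root:

from model_sinne.utils import powerset

subsets = powerset([0, 1, 2])
# subsets come out in bitmask order, starting with the empty set
assert subsets[0] == []
assert subsets[-1] == [0, 1, 2]
print(subsets)  # [[], [0], [1], [0, 1], [2], [0, 2], [1, 2], [0, 1, 2]]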
-------------------------------------------------------------------------------- /data_od_evaluation/vertebral_gt_hbos.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 210,"[0, 2]" 3 | 211,[4] 4 | 212,[0] 5 | 213,[4] 6 | 214,[4] 7 | 215,[1] 8 | 216,"[2, 3]" 9 | 217,[1] 10 | 218,[4] 11 | 219,[1] 12 | 220,[4] 13 | 221,[4] 14 | 222,[0] 15 | 223,[1] 16 | 224,[1] 17 | 225,[4] 18 | 226,[4] 19 | 227,"[0, 1]" 20 | 228,[4] 21 | 229,[4] 22 | 230,[1] 23 | 231,"[2, 3]" 24 | 232,[1] 25 | 233,[4] 26 | 234,[4] 27 | 235,[0] 28 | 236,[3] 29 | 237,[1] 30 | 238,[4] 31 | 239,[4] 32 | -------------------------------------------------------------------------------- /data_od_evaluation/wbc_pca_gt_copod.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 357,"[2, 6]" 3 | 358,"[0, 1, 4, 5]" 4 | 359,"[2, 8]" 5 | 360,[5] 6 | 361,"[1, 2]" 7 | 362,"[7, 9]" 8 | 363,[8] 9 | 364,"[2, 4, 8, 9]" 10 | 365,"[4, 6, 8]" 11 | 366,"[0, 2, 3, 5]" 12 | 367,"[0, 3, 7]" 13 | 368,"[0, 2, 4, 5]" 14 | 369,"[2, 6]" 15 | 370,"[0, 1, 2, 7, 8]" 16 | 371,"[0, 1, 2, 7, 8]" 17 | 372,"[2, 4, 9]" 18 | 373,"[0, 2, 5]" 19 | 374,"[2, 6]" 20 | 375,[0] 21 | 376,"[0, 2, 4, 6, 7]" 22 | 377,"[0, 2]" 23 | -------------------------------------------------------------------------------- /data_od_evaluation/vertebral_gt_iforest.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 210,[0] 3 | 211,[1] 4 | 212,[0] 5 | 213,"[1, 4]" 6 | 214,"[1, 4]" 7 | 215,[1] 8 | 216,"[1, 2, 4]" 9 | 217,"[0, 5]" 10 | 218,"[0, 2]" 11 | 219,[1] 12 | 220,"[1, 2, 4]" 13 | 221,[4] 14 | 222,"[0, 4]" 15 | 223,"[1, 4]" 16 | 224,"[0, 2]" 17 | 225,"[3, 5]" 18 | 226,[5] 19 | 227,[1] 20 | 228,"[0, 2]" 21 | 229,"[0, 4]" 22 | 230,"[1, 3, 5]" 23 | 231,"[2, 4]" 24 | 232,"[1, 2, 3]" 25 | 233,"[3, 5]" 26 | 234,"[1, 4]" 27 | 235,"[3, 5]" 28 | 236,[4] 29 | 237,"[1, 4]" 30 | 238,[5] 31 | 239,"[0, 4]" 32 | -------------------------------------------------------------------------------- /data_od_evaluation/vertebral_gt_copod.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 210,[4] 3 | 211,"[1, 4]" 4 | 212,"[1, 4]" 5 | 213,"[1, 4]" 6 | 214,[4] 7 | 215,"[1, 5]" 8 | 216,"[1, 2, 3, 5]" 9 | 217,"[0, 1]" 10 | 218,[2] 11 | 219,"[1, 5]" 12 | 220,"[0, 1, 2, 4]" 13 | 221,[4] 14 | 222,"[3, 4]" 15 | 223,"[1, 4]" 16 | 224,"[1, 2, 3, 5]" 17 | 225,"[3, 4]" 18 | 226,[4] 19 | 227,"[0, 1, 2, 4]" 20 | 228,"[0, 4]" 21 | 229,"[0, 4]" 22 | 230,"[1, 5]" 23 | 231,"[2, 3, 4]" 24 | 232,"[1, 5]" 25 | 233,[3] 26 | 234,[4] 27 | 235,[5] 28 | 236,"[3, 4]" 29 | 237,"[1, 4]" 30 | 238,"[4, 5]" 31 | 239,"[3, 4]" 32 | -------------------------------------------------------------------------------- /utils/eval_print_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def print_eval_runs2(runs_metric_lst, data_name, algo_name): 5 | runs_metric_lst = np.array(runs_metric_lst) 6 | precision, recall, jaccard, t = np.average(runs_metric_lst, axis=0) 7 | txt = "%s, od_eval, [p,r,j], %.4f, %.4f, %.4f, time, %.2f, %s" % \ 8 | (data_name, precision, recall, jaccard, t, algo_name) 9 | return txt 10 | 11 | 12 | def print_eval_runs(runs_metric_lst, data_name, algo_name): 13 | runs_metric_lst = np.array(runs_metric_lst) 14 | precision, recall, jaccard, aupr, auroc, t = np.average(runs_metric_lst, axis=0) 15 | txt = "%s, [p 
r j aupr auroc], %.4f, %.4f, %.4f, %.4f, %.4f, time, %.2f, %s" % \ 16 | (data_name, precision, recall, jaccard, aupr, auroc, t, algo_name) 17 | return txt 18 | 19 | -------------------------------------------------------------------------------- /eval/evaluation_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def powerset(s): 5 | x = len(s) 6 | pw_set = [] 7 | for i in range(1 << x): 8 | pw_set.append([s[j] for j in range(x) if (i & (1 << j))]) 9 | return pw_set 10 | 11 | 12 | def min_max_norm(array): 13 | if np.min(array) == np.max(array): 14 | return array * 0 15 | else: 16 | return (array - np.min(array))/(np.max(array) - np.min(array)) 17 | 18 | 19 | def get_subset_candidate(dim, chosen_subspace=None): 20 | if chosen_subspace is not None: 21 | f_subsets = [] 22 | for subset in chosen_subspace: 23 | subset = list(subset) 24 | if subset not in f_subsets: 25 | f_subsets.append(list(subset)) 26 | else: 27 | full_set = np.arange(dim) 28 | f_subsets = powerset(full_set) 29 | f_subsets.remove([]) 30 | return f_subsets 31 | -------------------------------------------------------------------------------- /data_od_evaluation/speech_pca_gt_hbos.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 0,[1] 3 | 1,[7] 4 | 2,[3] 5 | 3,[3] 6 | 4,[3] 7 | 5,[8] 8 | 6,"[0, 1, 2, 4]" 9 | 7,[6] 10 | 8,"[0, 1, 7, 8]" 11 | 9,[1] 12 | 10,"[1, 2, 4, 6]" 13 | 11,[1] 14 | 12,[1] 15 | 13,"[1, 4]" 16 | 14,"[2, 3, 5, 7]" 17 | 15,"[0, 1, 4, 6]" 18 | 16,[1] 19 | 17,[3] 20 | 18,"[1, 8, 9]" 21 | 19,[8] 22 | 20,[2] 23 | 21,[1] 24 | 22,[1] 25 | 23,[1] 26 | 24,[9] 27 | 25,[7] 28 | 26,[1] 29 | 27,[6] 30 | 28,"[1, 2, 3, 9]" 31 | 29,"[1, 4, 7]" 32 | 30,"[2, 3, 4]" 33 | 31,[4] 34 | 32,[1] 35 | 33,"[1, 9]" 36 | 34,"[0, 4, 5, 8]" 37 | 35,"[0, 2, 4, 5, 8, 9]" 38 | 36,"[0, 2]" 39 | 37,[0] 40 | 38,[5] 41 | 39,[7] 42 | 40,[1] 43 | 41,"[2, 5]" 44 | 42,[6] 45 | 43,[2] 46 | 44,"[2, 4, 5, 6, 7]" 47 | 45,[2] 48 | 46,[4] 49 | 47,"[3, 8]" 50 | 48,[2] 51 | 49,"[0, 2, 5, 6, 9]" 52 | 50,"[0, 2, 4, 9]" 53 | 51,[2] 54 | 52,"[2, 3, 5]" 55 | 53,[2] 56 | 54,"[3, 6]" 57 | 55,[7] 58 | 56,[1] 59 | 57,"[0, 4, 6, 7, 8]" 60 | 58,[8] 61 | 59,"[0, 1, 2, 6, 7, 8]" 62 | 60,"[1, 4, 6, 9]" 63 | -------------------------------------------------------------------------------- /data_od_evaluation/SPECT_pca_gt_hbos.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 40,"[1, 2, 4, 6]" 3 | 41,"[1, 7]" 4 | 42,"[2, 3, 4, 7, 8]" 5 | 43,"[2, 3, 8, 9]" 6 | 44,"[2, 5]" 7 | 45,[6] 8 | 46,"[0, 4, 6, 7, 9]" 9 | 47,"[1, 2, 4, 6, 8]" 10 | 48,[0] 11 | 49,"[4, 9]" 12 | 50,"[2, 3, 8]" 13 | 51,[0] 14 | 52,[0] 15 | 53,"[0, 1, 3, 6]" 16 | 54,"[1, 2, 4, 5, 6]" 17 | 55,"[1, 2]" 18 | 56,"[1, 4, 8, 9]" 19 | 57,"[0, 1, 4, 9]" 20 | 58,[0] 21 | 59,[0] 22 | 60,"[0, 1, 2, 8, 9]" 23 | 61,[0] 24 | 62,"[1, 2, 4, 6]" 25 | 63,[0] 26 | 64,[0] 27 | 65,[0] 28 | 66,"[1, 2, 4, 6]" 29 | 67,[0] 30 | 68,[1] 31 | 69,[0] 32 | 70,[6] 33 | 71,"[1, 2, 3, 5, 9]" 34 | 72,"[1, 2, 3, 4, 8, 9]" 35 | 73,[3] 36 | 74,[0] 37 | 75,"[1, 3]" 38 | 76,[1] 39 | 77,"[0, 1, 3, 8, 9]" 40 | 78,[1] 41 | 79,[1] 42 | 252,[0] 43 | 253,"[1, 2, 4, 6]" 44 | 254,[0] 45 | 255,"[2, 4, 7, 9]" 46 | 256,"[2, 3, 4, 7, 8]" 47 | 257,[0] 48 | 258,[0] 49 | 259,"[1, 8]" 50 | 260,"[6, 8, 9]" 51 | 261,[0] 52 | 262,[0] 53 | 263,"[2, 7]" 54 | 264,"[2, 3, 4, 6, 7, 9]" 55 | 265,"[0, 1, 2, 8, 9]" 56 | 266,[0] 57 | 
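The data_od_evaluation files above and below share one schema: ano_idx gives the index of an anomalous instance in the corresponding dataset, and exp_subspace holds its ground-truth explanation subspace, serialized as a Python-style list of feature indices. A minimal loading sketch, assuming pandas is installed and the script runs from the repository root:

import ast

import pandas as pd

gt = pd.read_csv("data_od_evaluation/SPECT_pca_gt_hbos.csv")
# exp_subspace arrives as strings such as "[1, 2, 4, 6]"; parse them into lists
gt["exp_subspace"] = gt["exp_subspace"].apply(ast.literal_eval)
print(gt.loc[0, "ano_idx"], gt.loc[0, "exp_subspace"])  # 40 [1, 2, 4, 6]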
-------------------------------------------------------------------------------- /data_od_evaluation/arrhythmia_pca_gt_hbos.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 0,[9] 3 | 4,[9] 4 | 5,"[1, 6, 9]" 5 | 10,[1] 6 | 26,[9] 7 | 34,[2] 8 | 45,[4] 9 | 46,[6] 10 | 60,[2] 11 | 61,[1] 12 | 76,"[1, 5, 7]" 13 | 83,[1] 14 | 85,[1] 15 | 87,[3] 16 | 88,[1] 17 | 89,[2] 18 | 91,[4] 19 | 93,[2] 20 | 100,[2] 21 | 105,[9] 22 | 141,[5] 23 | 168,[2] 24 | 169,[5] 25 | 174,"[1, 3]" 26 | 183,"[1, 5, 6, 8]" 27 | 185,[7] 28 | 188,[4] 29 | 189,[2] 30 | 204,[1] 31 | 207,[2] 32 | 214,[6] 33 | 217,"[1, 3]" 34 | 218,"[0, 1, 5]" 35 | 225,[7] 36 | 231,[7] 37 | 243,"[1, 3]" 38 | 248,[7] 39 | 251,[7] 40 | 252,[0] 41 | 253,"[0, 5]" 42 | 257,[9] 43 | 258,[3] 44 | 285,[5] 45 | 300,"[3, 4, 6]" 46 | 303,"[5, 8, 9]" 47 | 309,[3] 48 | 316,[2] 49 | 320,[2] 50 | 327,[0] 51 | 348,[2] 52 | 356,"[0, 3, 5, 7]" 53 | 361,"[0, 1, 5, 6]" 54 | 370,"[2, 3, 5]" 55 | 374,"[1, 4, 5, 6]" 56 | 376,"[0, 7]" 57 | 381,"[1, 5]" 58 | 387,[8] 59 | 388,[2] 60 | 395,[1] 61 | 398,"[0, 5]" 62 | 401,"[3, 4, 6]" 63 | 403,[2] 64 | 410,"[1, 5, 6]" 65 | 420,[3] 66 | 424,[2] 67 | 433,"[0, 3, 4, 5]" 68 | -------------------------------------------------------------------------------- /data_od_evaluation/SPECT_pca_gt_iforest.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 40,"[0, 6]" 3 | 41,"[0, 3, 7]" 4 | 42,"[0, 2, 8]" 5 | 43,"[0, 8]" 6 | 44,"[0, 1, 2, 4, 5]" 7 | 45,"[0, 1, 6]" 8 | 46,"[0, 2, 3, 4, 6, 7, 8, 9]" 9 | 47,"[0, 1, 3, 6, 8]" 10 | 48,[0] 11 | 49,"[0, 9]" 12 | 50,"[0, 3, 6, 7, 8]" 13 | 51,[0] 14 | 52,[0] 15 | 53,"[0, 6, 8]" 16 | 54,"[1, 2, 4, 5, 6]" 17 | 55,"[0, 1, 2, 3, 9]" 18 | 56,"[0, 2, 4, 5, 7, 8]" 19 | 57,"[0, 1, 2, 3, 4, 9]" 20 | 58,[0] 21 | 59,[0] 22 | 60,"[0, 1, 2, 8, 9]" 23 | 61,[0] 24 | 62,"[0, 6]" 25 | 63,[0] 26 | 64,[0] 27 | 65,[0] 28 | 66,"[0, 6]" 29 | 67,[0] 30 | 68,"[0, 1, 3]" 31 | 69,[0] 32 | 70,"[0, 6, 8]" 33 | 71,"[0, 1, 5]" 34 | 72,"[1, 2, 4, 5, 9]" 35 | 73,"[1, 3]" 36 | 74,[0] 37 | 75,"[0, 1, 3, 4]" 38 | 76,"[1, 2]" 39 | 77,"[1, 3]" 40 | 78,"[1, 4, 6]" 41 | 79,"[0, 1, 3]" 42 | 252,[0] 43 | 253,"[1, 2, 4, 6, 8]" 44 | 254,[0] 45 | 255,"[0, 2, 3, 6, 9]" 46 | 256,"[0, 2, 8]" 47 | 257,[0] 48 | 258,[0] 49 | 259,"[0, 1, 8]" 50 | 260,"[2, 3, 6, 8, 9]" 51 | 261,[0] 52 | 262,[0] 53 | 263,"[1, 2, 7, 8]" 54 | 264,"[3, 4, 9]" 55 | 265,"[2, 8, 9]" 56 | 266,[0] 57 | -------------------------------------------------------------------------------- /data_od_evaluation/SPECT_pca_gt_copod.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 40,"[0, 6]" 3 | 41,"[3, 6, 7, 8]" 4 | 42,"[2, 4, 7, 8]" 5 | 43,"[3, 4, 8, 9]" 6 | 44,"[2, 4, 5]" 7 | 45,"[1, 3, 6]" 8 | 46,"[6, 9]" 9 | 47,"[6, 8]" 10 | 48,"[0, 6]" 11 | 49,"[1, 6, 9]" 12 | 50,"[3, 8, 9]" 13 | 51,"[0, 6]" 14 | 52,"[0, 6]" 15 | 53,"[1, 3, 5, 6]" 16 | 54,"[1, 3, 4, 5, 6]" 17 | 55,"[1, 2, 3, 5, 6, 8, 9]" 18 | 56,"[1, 3, 4, 6, 7, 8, 9]" 19 | 57,"[0, 1, 6, 8, 9]" 20 | 58,"[0, 6]" 21 | 59,"[0, 6]" 22 | 60,"[1, 3, 5, 8, 9]" 23 | 61,"[0, 6]" 24 | 62,"[0, 6]" 25 | 63,"[0, 6]" 26 | 64,"[0, 6]" 27 | 65,"[0, 6]" 28 | 66,"[0, 6]" 29 | 67,"[0, 6]" 30 | 68,"[1, 3, 8, 9]" 31 | 69,"[0, 6]" 32 | 70,"[3, 6]" 33 | 71,"[2, 5, 9]" 34 | 72,"[1, 2, 3, 4, 5]" 35 | 73,"[3, 6]" 36 | 74,"[0, 6]" 37 | 75,"[1, 3]" 38 | 76,"[1, 7]" 39 | 77,"[1, 3]" 40 | 78,"[1, 4, 6]" 41 | 79,"[1, 3, 8, 9]" 42 | 252,"[0, 6]" 43 | 253,"[1, 4, 5, 6, 
9]" 44 | 254,"[0, 6]" 45 | 255,"[3, 4, 5, 6, 7, 9]" 46 | 256,"[2, 4, 7, 8]" 47 | 257,"[0, 6]" 48 | 258,"[0, 6]" 49 | 259,"[1, 3, 5, 7, 8]" 50 | 260,"[6, 9]" 51 | 261,"[0, 6]" 52 | 262,"[0, 6]" 53 | 263,"[1, 2, 8]" 54 | 264,"[3, 6, 8]" 55 | 265,"[1, 6, 8, 9]" 56 | 266,"[0, 4]" 57 | -------------------------------------------------------------------------------- /data_od_evaluation/satimage-2_pca_gt_hbos.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 5732,[2] 3 | 5733,[1] 4 | 5734,"[2, 9]" 5 | 5735,[5] 6 | 5736,[5] 7 | 5737,[1] 8 | 5738,[1] 9 | 5739,[1] 10 | 5740,[1] 11 | 5741,"[1, 2, 3, 4, 9]" 12 | 5742,[1] 13 | 5743,[1] 14 | 5744,[1] 15 | 5745,[1] 16 | 5746,[5] 17 | 5747,[1] 18 | 5748,"[1, 9]" 19 | 5749,[7] 20 | 5750,[1] 21 | 5751,[1] 22 | 5752,[1] 23 | 5753,[1] 24 | 5754,"[0, 3, 5]" 25 | 5755,[1] 26 | 5756,[1] 27 | 5757,[1] 28 | 5758,[5] 29 | 5759,"[1, 3]" 30 | 5760,[1] 31 | 5761,[1] 32 | 5762,"[1, 5, 9]" 33 | 5763,"[1, 2, 3, 4, 9]" 34 | 5764,[1] 35 | 5765,[1] 36 | 5766,"[5, 6]" 37 | 5767,[1] 38 | 5768,[1] 39 | 5769,[5] 40 | 5770,[1] 41 | 5771,[5] 42 | 5772,[1] 43 | 5773,[1] 44 | 5774,[1] 45 | 5775,"[1, 3]" 46 | 5776,[1] 47 | 5777,[5] 48 | 5778,"[7, 9]" 49 | 5779,[1] 50 | 5780,"[2, 9]" 51 | 5781,"[2, 3, 5, 7, 9]" 52 | 5782,"[3, 5]" 53 | 5783,[1] 54 | 5784,[1] 55 | 5785,[1] 56 | 5786,[5] 57 | 5787,"[2, 4]" 58 | 5788,[1] 59 | 5789,[1] 60 | 5790,[1] 61 | 5791,[7] 62 | 5792,"[1, 3]" 63 | 5793,[1] 64 | 5794,[1] 65 | 5795,[1] 66 | 5796,[1] 67 | 5797,[1] 68 | 5798,[7] 69 | 5799,"[0, 3, 5]" 70 | 5800,[1] 71 | 5801,[1] 72 | 5802,"[2, 9]" 73 | -------------------------------------------------------------------------------- /data_od_evaluation/wineQualityReds-od2_gt_hbos.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 18,[3] 3 | 38,"[0, 1]" 4 | 41,"[3, 6]" 5 | 45,"[8, 10]" 6 | 73,[6] 7 | 79,[6] 8 | 94,"[0, 1]" 9 | 151,[2] 10 | 161,[9] 11 | 167,"[5, 10]" 12 | 170,"[1, 9]" 13 | 199,"[1, 5, 9, 10]" 14 | 224,[6] 15 | 261,[6] 16 | 266,"[1, 3, 5, 7, 8]" 17 | 409,"[0, 3, 5]" 18 | 459,[10] 19 | 517,"[0, 4, 10]" 20 | 573,[0] 21 | 576,[0] 22 | 600,"[1, 5, 9, 10]" 23 | 633,"[1, 7]" 24 | 647,"[0, 1, 10]" 25 | 659,[3] 26 | 690,"[1, 3]" 27 | 703,[6] 28 | 704,"[0, 1]" 29 | 724,[1] 30 | 813,[10] 31 | 830,[3] 32 | 832,"[0, 4, 5]" 33 | 833,"[0, 5]" 34 | 872,[5] 35 | 876,[10] 36 | 899,"[1, 3, 9, 10]" 37 | 927,[6] 38 | 937,[0] 39 | 1124,[0] 40 | 1176,"[0, 1, 3, 5, 10]" 41 | 1189,"[1, 9]" 42 | 1233,"[0, 1, 5, 6, 9]" 43 | 1235,[3] 44 | 1238,"[1, 3, 5, 9]" 45 | 1239,[3] 46 | 1261,"[0, 1, 10]" 47 | 1263,"[1, 5, 9]" 48 | 1276,[3] 49 | 1293,"[1, 5, 9]" 50 | 1299,[1] 51 | 1307,[3] 52 | 1363,[6] 53 | 1369,"[0, 7, 9]" 54 | 1374,[4] 55 | 1423,[3] 56 | 1461,"[0, 1, 8, 10]" 57 | 1467,"[0, 1, 10]" 58 | 1469,"[1, 6]" 59 | 1478,[3] 60 | 1480,[0] 61 | 1482,"[2, 10]" 62 | 1484,"[0, 1, 10]" 63 | 1505,"[0, 1, 8, 10]" 64 | 1521,[10] 65 | -------------------------------------------------------------------------------- /data_od_evaluation/arrhythmia_pca_gt_iforest.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 0,"[0, 9]" 3 | 4,[9] 4 | 5,"[1, 7, 9]" 5 | 10,[1] 6 | 26,"[0, 9]" 7 | 34,"[5, 8]" 8 | 45,[4] 9 | 46,[8] 10 | 60,[2] 11 | 61,[1] 12 | 76,[9] 13 | 83,"[1, 2]" 14 | 85,"[1, 5]" 15 | 87,"[3, 8]" 16 | 88,"[1, 3, 6]" 17 | 89,"[2, 6, 8]" 18 | 91,[4] 19 | 93,"[4, 9]" 20 | 100,"[1, 7, 8, 9]" 21 | 105,"[8, 9]" 22 | 141,[5] 23 | 168,"[2, 6]" 24 | 
169,"[5, 8]" 25 | 174,"[1, 3]" 26 | 183,"[2, 5, 8]" 27 | 185,"[1, 6, 7]" 28 | 188,"[2, 3]" 29 | 189,"[0, 2, 3, 4, 5]" 30 | 204,"[1, 5, 9]" 31 | 207,[2] 32 | 214,"[2, 6]" 33 | 217,"[3, 7]" 34 | 218,"[0, 1, 5, 9]" 35 | 225,"[1, 7]" 36 | 231,"[1, 4, 5, 7]" 37 | 243,"[1, 3]" 38 | 248,"[2, 9]" 39 | 251,"[1, 7, 8, 9]" 40 | 252,[0] 41 | 253,"[0, 5]" 42 | 257,[8] 43 | 258,[3] 44 | 285,"[0, 4, 5]" 45 | 300,"[3, 8]" 46 | 303,"[0, 5, 8, 9]" 47 | 309,[3] 48 | 316,"[2, 3, 5]" 49 | 320,[2] 50 | 327,"[0, 8]" 51 | 348,[2] 52 | 356,"[0, 5]" 53 | 361,"[0, 1, 5, 9]" 54 | 370,"[1, 2, 3, 5]" 55 | 374,"[0, 1, 6]" 56 | 376,"[0, 4]" 57 | 381,"[1, 5]" 58 | 387,"[3, 8]" 59 | 388,[2] 60 | 395,"[1, 2, 3, 4, 6]" 61 | 398,"[0, 5]" 62 | 401,"[6, 8]" 63 | 403,"[0, 2, 9]" 64 | 410,"[1, 6, 7]" 65 | 420,"[2, 3]" 66 | 424,"[2, 4, 6, 8]" 67 | 433,"[0, 4]" 68 | -------------------------------------------------------------------------------- /data_od_evaluation/arrhythmia_pca_gt_copod.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 0,"[6, 9]" 3 | 4,[9] 4 | 5,"[1, 7]" 5 | 10,"[1, 7]" 6 | 26,"[3, 4, 5, 9]" 7 | 34,"[1, 3, 5, 6]" 8 | 45,"[3, 4, 9]" 9 | 46,"[6, 9]" 10 | 60,"[2, 3]" 11 | 61,"[3, 5]" 12 | 76,"[1, 5, 8, 9]" 13 | 83,"[0, 1]" 14 | 85,[1] 15 | 87,"[3, 4, 6]" 16 | 88,"[1, 6]" 17 | 89,"[2, 8]" 18 | 91,"[4, 5, 9]" 19 | 93,"[0, 1, 3, 9]" 20 | 100,"[3, 7, 9]" 21 | 105,"[8, 9]" 22 | 141,[5] 23 | 168,"[2, 3, 9]" 24 | 169,[6] 25 | 174,"[1, 3, 9]" 26 | 183,"[1, 3, 6, 9]" 27 | 185,[6] 28 | 188,"[2, 3, 4]" 29 | 189,"[0, 7]" 30 | 204,"[1, 5, 8]" 31 | 207,"[1, 2, 6, 9]" 32 | 214,[6] 33 | 217,[3] 34 | 218,"[0, 1, 5, 9]" 35 | 225,"[0, 1, 5, 7]" 36 | 231,[6] 37 | 243,"[3, 8]" 38 | 248,"[0, 3, 6, 9]" 39 | 251,"[6, 9]" 40 | 252,"[0, 5, 9]" 41 | 253,"[0, 5]" 42 | 257,"[1, 5, 8, 9]" 43 | 258,"[3, 7, 9]" 44 | 285,"[0, 5]" 45 | 300,"[3, 4, 6]" 46 | 303,"[0, 5, 9]" 47 | 309,"[1, 3, 7]" 48 | 316,"[3, 5, 6]" 49 | 320,"[2, 3, 5]" 50 | 327,"[0, 9]" 51 | 348,"[1, 5, 8]" 52 | 356,"[0, 5]" 53 | 361,"[0, 1, 5, 7]" 54 | 370,"[2, 3]" 55 | 374,"[2, 5]" 56 | 376,"[0, 7]" 57 | 381,"[1, 5, 8]" 58 | 387,"[3, 6, 8]" 59 | 388,[2] 60 | 395,"[0, 1]" 61 | 398,"[0, 5]" 62 | 401,"[3, 4, 6]" 63 | 403,"[0, 9]" 64 | 410,[6] 65 | 420,"[3, 4]" 66 | 424,"[1, 7]" 67 | 433,"[0, 3, 5]" 68 | -------------------------------------------------------------------------------- /data_od_evaluation/wineQualityReds-od2_gt_iforest.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 18,"[3, 10]" 3 | 38,"[0, 1, 4, 6]" 4 | 41,[9] 5 | 45,"[1, 8, 10]" 6 | 73,"[1, 10]" 7 | 79,"[5, 6, 9]" 8 | 94,"[0, 1, 8]" 9 | 151,[2] 10 | 161,"[1, 2, 6, 9, 10]" 11 | 167,"[2, 10]" 12 | 170,[9] 13 | 199,"[1, 7, 9]" 14 | 224,"[1, 2, 10]" 15 | 261,"[1, 2, 10]" 16 | 266,"[2, 7]" 17 | 409,"[0, 5]" 18 | 459,"[1, 2, 10]" 19 | 517,"[4, 7, 10]" 20 | 573,"[0, 1]" 21 | 576,"[0, 2, 6]" 22 | 600,"[1, 2]" 23 | 633,"[1, 7, 10]" 24 | 647,"[1, 5, 10]" 25 | 659,"[1, 3]" 26 | 690,"[1, 8]" 27 | 703,"[2, 6]" 28 | 704,"[0, 1, 2, 5]" 29 | 724,[1] 30 | 813,[6] 31 | 830,[6] 32 | 832,"[4, 5]" 33 | 833,"[0, 4, 5]" 34 | 872,"[1, 5, 9, 10]" 35 | 876,"[1, 2]" 36 | 899,"[1, 7]" 37 | 927,"[5, 6, 8]" 38 | 937,"[0, 3]" 39 | 1124,"[5, 8]" 40 | 1176,"[1, 3, 8]" 41 | 1189,"[0, 1, 2]" 42 | 1233,"[1, 10]" 43 | 1235,"[0, 3, 8]" 44 | 1238,"[0, 1, 2, 7, 9]" 45 | 1239,"[2, 3, 8, 10]" 46 | 1261,"[1, 2, 7, 10]" 47 | 1263,"[1, 2, 9]" 48 | 1276,"[3, 6]" 49 | 1293,"[1, 2, 5, 10]" 50 | 1299,[1] 51 | 1307,"[2, 3, 10]" 52 | 
1363,"[1, 2]" 53 | 1369,[9] 54 | 1374,"[1, 4, 7]" 55 | 1423,"[3, 4, 8]" 56 | 1461,"[0, 1, 6, 8]" 57 | 1467,"[1, 2]" 58 | 1469,"[1, 5]" 59 | 1478,"[1, 3, 8]" 60 | 1480,"[8, 10]" 61 | 1482,"[0, 2, 5, 10]" 62 | 1484,"[1, 5, 10]" 63 | 1505,"[0, 1, 6, 8]" 64 | 1521,"[9, 10]" 65 | -------------------------------------------------------------------------------- /data_od_evaluation/speech_pca_gt_iforest.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 0,"[0, 1]" 3 | 1,"[1, 7]" 4 | 2,"[2, 3, 4]" 5 | 3,"[0, 2, 3, 5, 8]" 6 | 4,"[0, 2, 3]" 7 | 5,"[0, 1, 6, 8]" 8 | 6,"[0, 1]" 9 | 7,"[1, 6]" 10 | 8,"[1, 3, 7]" 11 | 9,"[1, 8, 9]" 12 | 10,"[1, 2, 6]" 13 | 11,"[1, 6, 8]" 14 | 12,"[1, 9]" 15 | 13,"[1, 9]" 16 | 14,"[2, 3, 7]" 17 | 15,"[1, 6]" 18 | 16,"[1, 6]" 19 | 17,"[3, 6]" 20 | 18,"[1, 8, 9]" 21 | 19,"[1, 2, 3, 5, 7, 8]" 22 | 20,"[1, 2]" 23 | 21,[1] 24 | 22,"[1, 8, 9]" 25 | 23,"[1, 8, 9]" 26 | 24,"[6, 9]" 27 | 25,"[7, 9]" 28 | 26,"[1, 4, 5]" 29 | 27,"[0, 1, 6]" 30 | 28,"[0, 1, 2, 3, 6, 7, 9]" 31 | 29,"[0, 1, 4, 6, 7, 9]" 32 | 30,"[0, 1, 3, 4, 5, 9]" 33 | 31,"[0, 2, 3, 4]" 34 | 32,"[1, 4]" 35 | 33,"[1, 9]" 36 | 34,"[0, 1, 4, 6, 8]" 37 | 35,"[0, 2, 3, 4, 5]" 38 | 36,"[0, 2]" 39 | 37,"[0, 2, 3]" 40 | 38,"[0, 1, 2, 3, 5, 6, 8]" 41 | 39,"[4, 7]" 42 | 40,"[1, 8, 9]" 43 | 41,"[2, 4, 5, 7]" 44 | 42,[6] 45 | 43,"[1, 2, 3, 6, 7, 8]" 46 | 44,"[4, 5, 6, 7]" 47 | 45,"[0, 2, 3]" 48 | 46,"[2, 3, 4, 6]" 49 | 47,"[1, 3, 4, 5, 7, 8, 9]" 50 | 48,"[1, 2, 3, 4, 7, 8]" 51 | 49,"[2, 3, 5, 6, 9]" 52 | 50,"[0, 2, 5, 8, 9]" 53 | 51,"[0, 2]" 54 | 52,"[0, 2, 3, 5]" 55 | 53,[2] 56 | 54,"[0, 3, 6, 8]" 57 | 55,"[0, 2, 3, 7]" 58 | 56,"[0, 1, 2, 3, 8]" 59 | 57,"[0, 1, 6, 7, 8]" 60 | 58,"[5, 7, 8]" 61 | 59,"[1, 2, 6, 7, 8]" 62 | 60,"[1, 2, 4, 6, 9]" 63 | -------------------------------------------------------------------------------- /data_od_evaluation/speech_pca_gt_copod.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 0,"[1, 4]" 3 | 1,"[0, 1, 2, 7, 9]" 4 | 2,"[0, 1, 3, 4]" 5 | 3,"[0, 1, 3, 6, 8]" 6 | 4,"[0, 2, 3, 5]" 7 | 5,"[0, 1, 6]" 8 | 6,"[0, 1, 4, 7]" 9 | 7,"[1, 6]" 10 | 8,"[1, 7, 8]" 11 | 9,"[1, 7, 8]" 12 | 10,"[1, 3, 6]" 13 | 11,"[1, 6, 8]" 14 | 12,"[1, 7]" 15 | 13,"[1, 4, 9]" 16 | 14,"[1, 2, 3, 7]" 17 | 15,"[0, 1, 4, 5]" 18 | 16,"[0, 1, 7]" 19 | 17,"[1, 3, 5]" 20 | 18,"[1, 4]" 21 | 19,"[1, 2, 3, 5, 7, 8, 9]" 22 | 20,"[1, 2, 3, 5, 8, 9]" 23 | 21,"[1, 7, 8]" 24 | 22,"[1, 4]" 25 | 23,"[1, 4, 6, 9]" 26 | 24,"[1, 6, 7]" 27 | 25,"[0, 1, 7]" 28 | 26,"[0, 1, 4, 6]" 29 | 27,"[0, 1, 6, 7]" 30 | 28,"[0, 1, 2, 7, 9]" 31 | 29,"[1, 7, 8]" 32 | 30,"[0, 1, 2, 3, 4, 7, 9]" 33 | 31,"[0, 1, 2, 4, 7, 9]" 34 | 32,"[1, 7, 8]" 35 | 33,"[1, 6]" 36 | 34,"[4, 5, 6, 8]" 37 | 35,"[0, 2, 3, 5]" 38 | 36,"[0, 2, 3, 5]" 39 | 37,"[0, 2, 3, 5]" 40 | 38,"[2, 3, 5, 6]" 41 | 39,"[1, 7]" 42 | 40,"[1, 7, 8]" 43 | 41,"[2, 5, 6, 7]" 44 | 42,"[1, 6]" 45 | 43,"[1, 2, 5, 6, 8]" 46 | 44,"[2, 5, 6, 7]" 47 | 45,"[0, 2, 3, 7]" 48 | 46,"[3, 4, 6, 8]" 49 | 47,"[1, 3, 8]" 50 | 48,"[0, 1, 2, 7, 8]" 51 | 49,"[2, 6, 8, 9]" 52 | 50,"[0, 2, 4, 9]" 53 | 51,"[0, 2, 9]" 54 | 52,"[0, 2, 3, 5]" 55 | 53,"[0, 1, 2, 4]" 56 | 54,"[0, 3, 6]" 57 | 55,"[2, 3, 7]" 58 | 56,"[0, 1, 2, 3]" 59 | 57,"[0, 4, 7]" 60 | 58,"[1, 2, 3, 6, 8]" 61 | 59,"[0, 1, 2, 6, 7, 8, 9]" 62 | 60,"[1, 4, 6]" 63 | -------------------------------------------------------------------------------- /data_od_evaluation/wineQualityReds-od2_gt_copod.csv: 
-------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 18,"[1, 3, 4, 10]" 3 | 38,"[0, 1, 4]" 4 | 41,"[1, 3, 9]" 5 | 45,"[0, 8]" 6 | 73,"[1, 7, 10]" 7 | 79,"[6, 9, 10]" 8 | 94,"[0, 1, 5]" 9 | 151,[2] 10 | 161,"[1, 3, 9]" 11 | 167,"[5, 9, 10]" 12 | 170,"[1, 6, 9, 10]" 13 | 199,"[1, 8, 9, 10]" 14 | 224,"[0, 1, 4, 6]" 15 | 261,"[0, 1]" 16 | 266,"[1, 7, 8]" 17 | 409,"[0, 3, 5, 8]" 18 | 459,"[7, 10]" 19 | 517,"[1, 4, 7, 10]" 20 | 573,"[0, 6, 7]" 21 | 576,"[0, 4, 6]" 22 | 600,"[0, 1, 9]" 23 | 633,"[1, 7, 10]" 24 | 647,"[1, 5, 10]" 25 | 659,"[1, 3, 4]" 26 | 690,"[1, 3, 8]" 27 | 703,"[1, 2, 5, 6]" 28 | 704,"[0, 1, 5]" 29 | 724,"[1, 3]" 30 | 813,"[4, 6, 8, 10]" 31 | 830,"[1, 4, 6, 10]" 32 | 832,"[0, 4, 5]" 33 | 833,"[0, 4, 5]" 34 | 872,"[5, 8]" 35 | 876,"[4, 6, 8, 10]" 36 | 899,"[1, 7, 10]" 37 | 927,"[1, 4, 6, 10]" 38 | 937,"[0, 1]" 39 | 1124,"[4, 5, 8, 10]" 40 | 1176,"[1, 3, 8]" 41 | 1189,"[0, 1, 9]" 42 | 1233,"[0, 1, 9]" 43 | 1235,"[0, 1, 3, 4, 6]" 44 | 1238,"[1, 7, 9]" 45 | 1239,"[1, 3, 8, 10]" 46 | 1261,"[1, 8]" 47 | 1263,"[1, 7, 9]" 48 | 1276,"[3, 4, 10]" 49 | 1293,"[1, 2]" 50 | 1299,[1] 51 | 1307,"[1, 3]" 52 | 1363,"[1, 6, 9]" 53 | 1369,"[0, 6, 9]" 54 | 1374,"[1, 3, 4]" 55 | 1423,"[3, 4, 8]" 56 | 1461,"[1, 2, 8]" 57 | 1467,"[1, 5, 10]" 58 | 1469,"[1, 5]" 59 | 1478,"[1, 3]" 60 | 1480,[8] 61 | 1482,"[4, 6]" 62 | 1484,"[1, 8]" 63 | 1505,"[1, 8]" 64 | 1521,"[9, 10]" 65 | -------------------------------------------------------------------------------- /eval/eva_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | from eval import evaluation_od 5 | from config import root 6 | 7 | 8 | def main(path): 9 | data_name = path.split("/")[-1].split(".")[0] 10 | df = pd.read_csv(path) 11 | X = df.values[:, :-1] 12 | y = np.array(df.values[:, -1], dtype=int) 13 | print("Data name: [%s]" % data_name) 14 | 15 | model_name1 = "hbos" 16 | path1 = "data_od_evaluation/" + data_name + "_gt_" + model_name1 + ".csv" 17 | path2 = "data_od_evaluation/" + data_name + "_score_" + model_name1 + ".csv" 18 | if not (os.path.exists(path1) and os.path.exists(path2)): 19 | print("OD evaluation model training is processing...") 20 | evaluation_od.evaluation_od_train(X, y, data_name, model_name1) 21 | 22 | return 23 | 24 | 25 | if __name__ == '__main__': 26 | input_root_list = [root + "data/"] 27 | runs = 1 28 | 29 | for input_root in input_root_list: 30 | if os.path.isdir(input_root): 31 | for file_name in sorted(os.listdir(input_root)): 32 | if file_name.endswith(".csv"): 33 | input_path = str(os.path.join(input_root, file_name)) 34 | name = input_path.split("/")[-1].split('.')[0] 35 | main(input_path) 36 | 37 | else: 38 | input_path = input_root 39 | name = input_path.split("/")[-1].split(".")[0] 40 | main(input_path) -------------------------------------------------------------------------------- /data_od_evaluation/satimage-2_pca_gt_iforest.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 5732,"[1, 2, 4, 8]" 3 | 5733,[1] 4 | 5734,"[1, 5, 7]" 5 | 5735,[5] 6 | 5736,[5] 7 | 5737,"[0, 1, 2, 4]" 8 | 5738,[1] 9 | 5739,"[0, 1, 6]" 10 | 5740,"[0, 1, 2, 8, 9]" 11 | 5741,"[0, 1, 7]" 12 | 5742,"[2, 3]" 13 | 5743,[1] 14 | 5744,[1] 15 | 5745,"[0, 1]" 16 | 5746,"[2, 4, 5, 6, 7, 8]" 17 | 5747,"[0, 2]" 18 | 5748,"[2, 4, 9]" 19 | 5749,"[0, 3, 4, 7]" 20 | 5750,"[1, 2, 4, 8]" 21 | 5751,"[0, 1, 6]" 22 | 5752,[2] 23 | 
5753,"[1, 9]" 24 | 5754,"[0, 1, 3]" 25 | 5755,"[1, 2, 8]" 26 | 5756,"[0, 1, 5]" 27 | 5757,"[0, 1, 2, 8]" 28 | 5758,"[5, 7]" 29 | 5759,"[0, 1, 3, 5]" 30 | 5760,"[0, 2, 3, 4, 5]" 31 | 5761,[1] 32 | 5762,"[1, 7]" 33 | 5763,"[0, 1, 7]" 34 | 5764,"[1, 7]" 35 | 5765,"[0, 1, 2, 3]" 36 | 5766,"[4, 5, 6]" 37 | 5767,"[0, 1, 2, 3]" 38 | 5768,"[1, 2, 4, 7]" 39 | 5769,"[5, 7]" 40 | 5770,"[0, 2]" 41 | 5771,"[1, 5, 6]" 42 | 5772,"[0, 1, 6]" 43 | 5773,"[1, 3]" 44 | 5774,"[1, 2, 6]" 45 | 5775,"[0, 1, 3, 4, 5]" 46 | 5776,"[0, 2]" 47 | 5777,"[5, 9]" 48 | 5778,"[5, 7, 9]" 49 | 5779,"[0, 2]" 50 | 5780,"[0, 1, 9]" 51 | 5781,"[2, 4, 5]" 52 | 5782,"[0, 1, 3, 4]" 53 | 5783,[1] 54 | 5784,[1] 55 | 5785,"[0, 2]" 56 | 5786,"[0, 1, 5]" 57 | 5787,"[4, 5]" 58 | 5788,[7] 59 | 5789,"[0, 2]" 60 | 5790,"[1, 2, 6]" 61 | 5791,"[1, 3, 4, 5, 8]" 62 | 5792,"[1, 3]" 63 | 5793,"[0, 1, 6]" 64 | 5794,"[0, 2]" 65 | 5795,"[0, 1, 6]" 66 | 5796,"[0, 1, 2, 8]" 67 | 5797,"[1, 2, 8, 9]" 68 | 5798,"[3, 5, 6, 7, 8]" 69 | 5799,"[1, 3, 5]" 70 | 5800,[1] 71 | 5801,"[0, 2]" 72 | 5802,"[1, 5]" 73 | -------------------------------------------------------------------------------- /data_od_evaluation/satimage-2_pca_gt_copod.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 5732,"[2, 3]" 3 | 5733,"[0, 1, 3, 9]" 4 | 5734,"[2, 3, 5, 8, 9]" 5 | 5735,[5] 6 | 5736,[5] 7 | 5737,"[1, 2, 5, 6, 9]" 8 | 5738,"[1, 3]" 9 | 5739,"[0, 1, 3]" 10 | 5740,"[0, 1, 3, 8, 9]" 11 | 5741,"[1, 2, 3, 7, 8, 9]" 12 | 5742,"[1, 3, 6]" 13 | 5743,"[2, 3]" 14 | 5744,"[1, 3]" 15 | 5745,"[1, 2, 5, 6, 9]" 16 | 5746,"[1, 5, 6]" 17 | 5747,"[0, 2, 5]" 18 | 5748,"[0, 1, 2, 4, 7, 9]" 19 | 5749,"[3, 4, 5, 7, 8, 9]" 20 | 5750,"[1, 3, 4, 5]" 21 | 5751,"[0, 1, 3, 9]" 22 | 5752,[2] 23 | 5753,"[0, 1, 3, 8, 9]" 24 | 5754,"[1, 3, 6]" 25 | 5755,"[1, 3, 9]" 26 | 5756,"[0, 2, 5]" 27 | 5757,"[2, 3, 8]" 28 | 5758,"[1, 4, 5]" 29 | 5759,"[0, 1, 3]" 30 | 5760,"[0, 1, 3, 9]" 31 | 5761,"[1, 3, 9]" 32 | 5762,"[1, 6, 7]" 33 | 5763,"[1, 2, 3, 7, 8, 9]" 34 | 5764,"[1, 3, 9]" 35 | 5765,"[1, 3, 9]" 36 | 5766,"[2, 4, 5, 6, 8]" 37 | 5767,"[0, 1, 2, 3, 6, 8, 9]" 38 | 5768,"[0, 1, 2, 6, 7]" 39 | 5769,"[0, 1, 3, 5]" 40 | 5770,"[0, 2, 3, 5, 6]" 41 | 5771,"[2, 5, 6]" 42 | 5772,"[0, 1, 2, 7]" 43 | 5773,"[0, 1, 3]" 44 | 5774,"[0, 2, 3, 5]" 45 | 5775,"[1, 3, 9]" 46 | 5776,"[1, 2, 3, 4, 6, 7, 8]" 47 | 5777,"[7, 9]" 48 | 5778,"[0, 3, 5, 6, 7, 9]" 49 | 5779,"[0, 1, 3]" 50 | 5780,"[1, 3, 9]" 51 | 5781,"[1, 3, 4, 5, 9]" 52 | 5782,"[3, 7]" 53 | 5783,"[0, 1, 3, 9]" 54 | 5784,[1] 55 | 5785,"[2, 3, 5, 6]" 56 | 5786,"[1, 5, 9]" 57 | 5787,"[2, 4, 7]" 58 | 5788,[7] 59 | 5789,"[0, 1, 3, 5]" 60 | 5790,"[1, 3, 9]" 61 | 5791,"[1, 8]" 62 | 5792,"[0, 1, 3]" 63 | 5793,"[0, 1, 2, 3, 7]" 64 | 5794,"[1, 4, 5]" 65 | 5795,"[0, 1, 2, 5, 9]" 66 | 5796,"[2, 3, 8, 9]" 67 | 5797,"[0, 1, 2, 3, 5, 6, 9]" 68 | 5798,"[3, 4, 5, 6, 7, 8]" 69 | 5799,"[1, 3, 7]" 70 | 5800,"[1, 3, 8]" 71 | 5801,"[0, 2, 3, 5]" 72 | 5802,"[1, 2, 3, 7, 9]" 73 | -------------------------------------------------------------------------------- /model_coin/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.neighbors import LocalOutlierFactor 5 | from sklearn.ensemble import IsolationForest 6 | 7 | 8 | def detect_lof(args, X): 9 | num_inst = X.shape[0] 10 | num_nbr = int(num_inst * args.ratio_nbr) 11 | clf = LocalOutlierFactor(n_neighbors=num_nbr) 12 | y_pred = clf.fit_predict(X) 13 | outlier_scores = 
-clf.negative_outlier_factor_ 14 | 15 | return y_pred 16 | 17 | 18 | def detect_isoforest(args, X): 19 | num_inst = X.shape[0] 20 | clf = IsolationForest(max_samples=num_inst, random_state=0)  # the deprecated 'behaviour' kwarg was removed in sklearn 0.24 21 | clf.fit(X) 22 | y_pred = clf.predict(X) 23 | outlier_scores = -clf.decision_function(X) 24 | 25 | return y_pred 26 | 27 | 28 | def get_dataset_basic_info(path): 29 | data_name = path.split("/")[-1].split(".")[0] 30 | df = pd.read_csv(path) 31 | x = df.values[:, :-1] 32 | y = np.array(df.values[:, -1], dtype=int) 33 | n = x.shape[0] 34 | dim = x.shape[1] 35 | n_ano = len(np.where(y == 1)[0]) 36 | ratio_ano = n_ano / n 37 | 38 | print("%s, %d, %d, %d, %.4f " % (data_name, n, dim, n_ano, ratio_ano)) 39 | 40 | return 41 | 42 | 43 | if __name__ == '__main__': 44 | input_root_list = ["E:/OneDrive/work/0data/odds/integer/"] 45 | 46 | seed = -1 47 | 48 | for input_root in input_root_list: 49 | if os.path.isdir(input_root): 50 | for file_name in sorted(os.listdir(input_root)): 51 | if file_name.endswith(".csv"): 52 | input_path = str(os.path.join(input_root, file_name)) 53 | name = input_path.split("/")[-1].split('.')[0] 54 | get_dataset_basic_info(input_path) 55 | 56 | else: 57 | input_path = input_root 58 | name = input_path.split("/")[-1].split(".")[0] 59 | get_dataset_basic_info(input_path) -------------------------------------------------------------------------------- /model_iml/LIME.py: -------------------------------------------------------------------------------- 1 | import lime 2 | import lime.lime_tabular 3 | import numpy as np 4 | import sklearn 5 | import sklearn.metrics 6 | from tqdm import tqdm 7 | import sklearn.svm 8 | 9 | 10 | class LIME: 11 | def __init__(self, discretize_continuous=True, discretizer='quartile'): 12 | """ 13 | 14 | :param discretize_continuous: if True, all non-categorical features will be discretized into quartiles. 15 | :param discretizer: only matters if discretize_continuous is True and data is not sparse. 16 | Options are 'quartile', 'decile', 'entropy' or a BaseDiscretizer instance. 
17 | """ 18 | self.discretize_continuous = discretize_continuous 19 | self.discretizer = discretizer 20 | 21 | self.dim = None 22 | self.ano_idx = None 23 | return 24 | 25 | def fit(self, x, y, ano_class=1): 26 | self.ano_idx = np.where(y == 1)[0] 27 | ano_idx = self.ano_idx 28 | self.dim = x.shape[1] 29 | svm = sklearn.svm.SVC(kernel="rbf", probability=True) 30 | svm.fit(x, y) 31 | 32 | y_pred = svm.predict(x) 33 | print("Clf model accuracy: [{:.4f}]".format(sklearn.metrics.accuracy_score(y, y_pred))) 34 | 35 | explainer = lime.lime_tabular.LimeTabularExplainer(x, discretize_continuous=self.discretize_continuous, 36 | discretizer=self.discretizer) 37 | ano_f_weights = np.zeros([len(ano_idx), self.dim]) 38 | 39 | print(len(ano_idx)) 40 | 41 | for ii in tqdm(range(len(ano_idx))): 42 | idx = ano_idx[ii] 43 | exp = explainer.explain_instance(x[idx], svm.predict_proba, labels=(ano_class,), num_features=self.dim) 44 | tuples = exp.as_map()[1] 45 | for tuple in tuples: 46 | f_id, weight = tuple 47 | ano_f_weights[ii][f_id] = weight 48 | return ano_f_weights 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /data_od_evaluation/letter_pca_gt_hbos.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 1500,[2] 3 | 1501,"[4, 8]" 4 | 1502,"[0, 5]" 5 | 1503,"[0, 4]" 6 | 1504,"[0, 1]" 7 | 1505,"[0, 1, 4, 5, 9]" 8 | 1506,"[1, 4, 6, 7, 8, 9]" 9 | 1507,[9] 10 | 1508,"[1, 2, 9]" 11 | 1509,"[1, 8, 9]" 12 | 1510,"[3, 4, 6, 7, 8]" 13 | 1511,[4] 14 | 1512,[0] 15 | 1513,"[0, 2, 3, 5, 6]" 16 | 1514,"[0, 4, 7]" 17 | 1515,"[0, 3, 5, 6]" 18 | 1516,[4] 19 | 1517,"[0, 4]" 20 | 1518,[4] 21 | 1519,[8] 22 | 1520,"[6, 7, 8]" 23 | 1521,"[0, 1, 2, 4, 6, 9]" 24 | 1522,"[0, 9]" 25 | 1523,[2] 26 | 1524,"[0, 1, 2, 5, 6, 8]" 27 | 1525,"[3, 5, 6, 7]" 28 | 1526,[0] 29 | 1527,"[1, 2, 7]" 30 | 1528,"[1, 2, 4, 9]" 31 | 1529,"[2, 4, 6, 7, 8]" 32 | 1530,"[2, 4, 8, 9]" 33 | 1531,[5] 34 | 1532,"[0, 1, 6, 9]" 35 | 1533,[9] 36 | 1534,"[3, 6, 7]" 37 | 1535,"[2, 9]" 38 | 1536,[7] 39 | 1537,[6] 40 | 1538,[0] 41 | 1539,[0] 42 | 1540,"[0, 1, 7]" 43 | 1541,"[0, 1, 4, 7, 9]" 44 | 1542,"[1, 2, 5, 8]" 45 | 1543,"[2, 4, 5]" 46 | 1544,[6] 47 | 1545,[6] 48 | 1546,[5] 49 | 1547,[0] 50 | 1548,[2] 51 | 1549,"[2, 9]" 52 | 1550,[5] 53 | 1551,[4] 54 | 1552,[9] 55 | 1553,"[0, 1]" 56 | 1554,[0] 57 | 1555,"[0, 3, 5]" 58 | 1556,"[3, 6, 9]" 59 | 1557,"[1, 2, 3, 5, 7, 8]" 60 | 1558,"[2, 3, 4, 5, 6]" 61 | 1559,[0] 62 | 1560,"[0, 1, 5]" 63 | 1561,[4] 64 | 1562,"[2, 4, 5, 6]" 65 | 1563,"[4, 9]" 66 | 1564,[0] 67 | 1565,[9] 68 | 1566,[9] 69 | 1567,[7] 70 | 1568,[2] 71 | 1569,"[0, 1]" 72 | 1570,[5] 73 | 1571,[7] 74 | 1572,"[0, 9]" 75 | 1573,[0] 76 | 1574,[0] 77 | 1575,"[1, 4]" 78 | 1576,[4] 79 | 1577,"[1, 6, 7, 9]" 80 | 1578,"[1, 4, 8, 9]" 81 | 1579,[4] 82 | 1580,[0] 83 | 1581,[9] 84 | 1582,[9] 85 | 1583,"[2, 3, 4, 9]" 86 | 1584,"[1, 2, 7]" 87 | 1585,[5] 88 | 1586,[0] 89 | 1587,[7] 90 | 1588,"[4, 6, 8, 9]" 91 | 1589,"[1, 2, 4, 5, 7, 9]" 92 | 1590,[4] 93 | 1591,[8] 94 | 1592,[9] 95 | 1593,[5] 96 | 1594,[7] 97 | 1595,"[1, 2, 5]" 98 | 1596,[9] 99 | 1597,"[0, 4, 5, 7, 9]" 100 | 1598,"[2, 9]" 101 | 1599,"[0, 1, 2, 3, 6]" 102 | -------------------------------------------------------------------------------- /utils/model_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | 4 | 5 | def weight2subspace(weight, ratio=0.7, num=-1): 6 | """ 7 | this function is to transfer feature weight 
list into a subspace of the highest-weight features, 8 | where the given ratio sets the fraction of the total weight that the selected subspace must cover 9 | :param weight: per-feature weight list 10 | :param ratio: fraction of the total weight the subspace should cover 11 | :param num: if not -1, return exactly the num top-weighted features instead 12 | :return: sorted list of the selected feature indices 13 | """ 14 | dim = len(weight) 15 | 16 | threshold = ratio * np.sum(weight) 17 | 18 | sorted_idx = np.argsort(weight) 19 | sorted_idx = [sorted_idx[dim - i - 1] for i in range(dim)] 20 | 21 | if num != -1: 22 | exp_subspace = sorted_idx[:num] 23 | exp_subspace = list(np.sort(exp_subspace)) 24 | return exp_subspace 25 | 26 | tmp_s = 0 27 | exp_subspace = [] 28 | for idx in sorted_idx: 29 | tmp_s += weight[idx] 30 | exp_subspace.append(idx) 31 | if tmp_s >= threshold: 32 | break 33 | exp_subspace = list(np.sort(exp_subspace)) 34 | return exp_subspace 35 | 36 | 37 | def weight2subspace_pn(weight): 38 | exp_subspace = [] 39 | for i in range(len(weight)): 40 | if weight[i] > 0: 41 | exp_subspace.append(i) 42 | if len(exp_subspace) == 0: 43 | exp_subspace.append(np.argsort(weight)[len(weight) - 1]) 44 | exp_subspace = list(np.sort(exp_subspace)) 45 | return exp_subspace 46 | 47 | 48 | def get_exp_subspace(fea_weight_lst, w2s_ratio, real_exp_len=None): 49 | exp_subspace_lst = [] 50 | n_ano = len(fea_weight_lst) 51 | dim = len(fea_weight_lst[0]) 52 | 53 | for ii in range(n_ano): 54 | fea_weight = fea_weight_lst[ii] 55 | if w2s_ratio == "real_len": 56 | if real_exp_len is None: 57 | raise ValueError("real_exp_len must be given when w2s_ratio is 'real_len'") 58 | exp_subspace_lst.append(weight2subspace(fea_weight, num=real_exp_len[ii])) 59 | 60 | elif w2s_ratio == "auto": 61 | r = math.sqrt(2 / dim) 62 | exp_subspace_lst.append(weight2subspace(fea_weight, ratio=r)) 63 | 64 | elif w2s_ratio == "pn": 65 | exp_subspace_lst.append(weight2subspace_pn(fea_weight)) 66 | 67 | else: 68 | exp_subspace_lst.append(weight2subspace(fea_weight, ratio=w2s_ratio)) 69 | return exp_subspace_lst 70 | 71 | 72 | -------------------------------------------------------------------------------- /model_iml/Anchor.py: -------------------------------------------------------------------------------- 1 | import sklearn.metrics 2 | from sklearn.ensemble import RandomForestClassifier 3 | import numpy as np 4 | from alibi.explainers import AnchorTabular 5 | from tqdm import tqdm 6 | 7 | 8 | class Anchor: 9 | def __init__(self, kernel="rbf"): 10 | """ 11 | 12 | :param kernel: kernel for the commented-out svm classifier; fit() uses a RandomForest instead. 13 | note: fit() asks AnchorTabular for anchors at a fixed precision threshold of 0.95, 14 | falling back to the full feature set for anomalies whose anchor comes back empty. 
15 | """ 16 | self.ano_idx = None 17 | 18 | self.kernel = kernel 19 | 20 | self.dim = None 21 | return 22 | 23 | def fit(self, x, y): 24 | 25 | self.dim = x.shape[1] 26 | 27 | # clf = sklearn.svm.SVC(kernel=self.kernel, probability=True) 28 | clf = RandomForestClassifier() 29 | clf.fit(x, y) 30 | 31 | y_pred = clf.predict(x) 32 | print("Clf model accuracy: [{:.4f}]".format(sklearn.metrics.accuracy_score(y, y_pred))) 33 | 34 | self.ano_idx = np.where(y == 1)[0] 35 | print(self.ano_idx.shape) 36 | 37 | n_f = x.shape[1] 38 | feature_names = ["A"+str(i) for i in range(n_f)] 39 | # use anchor 40 | predict_fn = lambda xx: clf.predict_proba(xx) 41 | explainer = AnchorTabular(predict_fn, feature_names) 42 | explainer.fit(x, disc_perc=(25, 50, 75)) 43 | 44 | exp_sub_lst = [] 45 | for i in tqdm(range(len(self.ano_idx))): 46 | ano = x[self.ano_idx[i]] 47 | explanation = explainer.explain(ano, threshold=0.95) 48 | anchor = explanation['anchor'] 49 | f_sub = [] 50 | for a in anchor: 51 | for item in a.split(" "): 52 | if item.startswith("A"): 53 | item = int(item[1:]) 54 | f_sub.append(item) 55 | # print(anchor, f_sub) 56 | if len(f_sub) == 0: 57 | f_sub = np.arange(n_f) 58 | exp_sub_lst.append(f_sub) 59 | 60 | return exp_sub_lst 61 | 62 | 63 | import pandas as pd 64 | path = "../data/00-pima.csv" 65 | df = pd.read_csv(path) 66 | X = df.values[:, :-1] 67 | y = np.array(df.values[:, -1], dtype=int) 68 | model = Anchor() 69 | exp_sub_lst = model.fit(X, y) 70 | print(len(exp_sub_lst)) -------------------------------------------------------------------------------- /data_od_evaluation/ionosphere_pca_gt_hbos.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 1,[9] 3 | 3,"[1, 2, 3, 5, 6, 7, 8, 9]" 4 | 5,[2] 5 | 7,"[0, 8]" 6 | 9,"[0, 3]" 7 | 11,"[3, 4, 8]" 8 | 13,"[6, 7, 9]" 9 | 15,[9] 10 | 17,[9] 11 | 19,"[0, 1, 4, 5]" 12 | 21,"[0, 3, 4]" 13 | 23,"[1, 3, 4, 5, 7, 9]" 14 | 25,[1] 15 | 27,[7] 16 | 29,"[1, 4]" 17 | 31,[4] 18 | 33,"[2, 3, 5, 7, 8, 9]" 19 | 35,[4] 20 | 37,"[1, 7, 9]" 21 | 39,"[1, 6]" 22 | 41,"[1, 5, 9]" 23 | 43,"[7, 8]" 24 | 45,"[0, 4, 6]" 25 | 47,"[0, 5, 6]" 26 | 49,"[2, 3, 5, 7, 8, 9]" 27 | 51,[9] 28 | 53,[4] 29 | 55,[0] 30 | 57,"[4, 7]" 31 | 59,[4] 32 | 61,"[0, 2, 3, 4, 6]" 33 | 63,[6] 34 | 65,"[6, 8]" 35 | 67,"[6, 7]" 36 | 69,"[1, 8]" 37 | 71,"[1, 7]" 38 | 73,"[1, 6]" 39 | 75,[6] 40 | 77,"[1, 4, 5]" 41 | 79,"[1, 2, 4, 8]" 42 | 81,"[2, 3, 5, 6, 7, 8]" 43 | 83,[1] 44 | 85,"[0, 6, 7, 8, 9]" 45 | 87,"[2, 4, 6, 9]" 46 | 89,"[0, 1, 6]" 47 | 91,"[2, 5, 8]" 48 | 93,"[0, 5, 6]" 49 | 95,[1] 50 | 98,"[1, 2, 4]" 51 | 100,"[0, 1, 2, 3, 5]" 52 | 102,"[0, 2, 7]" 53 | 104,"[0, 1, 2, 6, 8]" 54 | 106,[7] 55 | 108,"[1, 4]" 56 | 110,"[0, 1, 4, 7]" 57 | 112,"[0, 7]" 58 | 114,"[0, 6]" 59 | 116,[3] 60 | 118,"[2, 5, 6, 8]" 61 | 120,"[0, 2, 8]" 62 | 122,[6] 63 | 124,[7] 64 | 126,[8] 65 | 128,[8] 66 | 130,[5] 67 | 132,"[4, 9]" 68 | 134,"[0, 4]" 69 | 136,"[0, 5, 6]" 70 | 138,[2] 71 | 140,"[2, 5, 6, 8]" 72 | 142,"[1, 4, 5]" 73 | 144,"[0, 1]" 74 | 146,[2] 75 | 148,"[1, 2, 9]" 76 | 150,"[0, 1, 6, 7]" 77 | 152,"[0, 3]" 78 | 154,"[0, 3]" 79 | 156,"[0, 3]" 80 | 158,"[0, 3]" 81 | 160,"[0, 3]" 82 | 162,[3] 83 | 164,"[1, 9]" 84 | 166,"[2, 7]" 85 | 168,"[2, 3]" 86 | 170,"[1, 2, 7, 8]" 87 | 172,"[4, 9]" 88 | 174,"[0, 6, 8]" 89 | 176,"[2, 3, 4, 5]" 90 | 178,"[0, 3]" 91 | 180,"[0, 5, 6]" 92 | 182,"[2, 4]" 93 | 184,"[0, 2, 4, 8]" 94 | 186,"[0, 8]" 95 | 188,[3] 96 | 190,"[1, 2, 5, 6, 8]" 97 | 192,"[1, 2, 6, 8]" 98 | 194,"[2, 3, 5]" 99 | 196,"[1, 4]" 100 | 198,"[2, 3, 5, 
7, 9]" 101 | 200,"[0, 2, 3, 4, 5, 6, 7, 8, 9]" 102 | 202,[5] 103 | 204,[1] 104 | 206,"[0, 5, 6]" 105 | 208,"[0, 4, 6]" 106 | 210,[6] 107 | 212,"[3, 7, 8, 9]" 108 | 214,"[2, 3, 4]" 109 | 216,"[4, 7]" 110 | 218,"[0, 1, 4, 5]" 111 | 220,"[3, 4]" 112 | 222,[7] 113 | 224,"[0, 6, 8]" 114 | 226,[6] 115 | 228,"[2, 8]" 116 | 230,"[0, 6, 8]" 117 | 232,"[2, 4]" 118 | 234,[4] 119 | 236,[1] 120 | 238,"[0, 3]" 121 | 240,"[2, 3, 8]" 122 | 242,[7] 123 | 244,[5] 124 | 246,"[1, 2, 3, 8]" 125 | 248,"[0, 2, 7]" 126 | 250,[5] 127 | 252,"[0, 5, 6]" 128 | -------------------------------------------------------------------------------- /data_od_evaluation/letter_pca_gt_copod.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 1500,"[1, 2, 6, 9]" 3 | 1501,"[1, 4, 6, 8, 9]" 4 | 1502,"[2, 4, 5]" 5 | 1503,"[1, 3, 4, 6]" 6 | 1504,"[1, 4, 6, 9]" 7 | 1505,"[1, 4, 9]" 8 | 1506,"[1, 4, 7, 8]" 9 | 1507,"[4, 8, 9]" 10 | 1508,"[1, 2, 8, 9]" 11 | 1509,"[1, 6, 8, 9]" 12 | 1510,"[1, 3, 6, 7, 8]" 13 | 1511,"[2, 3, 4, 5, 7]" 14 | 1512,"[8, 9]" 15 | 1513,"[2, 3, 5, 6, 7]" 16 | 1514,"[0, 4, 5, 6, 7]" 17 | 1515,"[1, 3, 6]" 18 | 1516,"[2, 3, 4, 6]" 19 | 1517,"[2, 4, 5]" 20 | 1518,"[2, 3, 4, 6]" 21 | 1519,[8] 22 | 1520,"[6, 7, 8]" 23 | 1521,"[0, 1, 2, 4, 6]" 24 | 1522,"[7, 8, 9]" 25 | 1523,"[1, 2, 4, 6, 9]" 26 | 1524,"[2, 6, 8]" 27 | 1525,"[3, 5, 6, 7]" 28 | 1526,"[1, 3, 5]" 29 | 1527,"[1, 7]" 30 | 1528,"[1, 4, 9]" 31 | 1529,"[2, 4, 7, 8, 9]" 32 | 1530,"[2, 8, 9]" 33 | 1531,"[2, 3, 5, 6]" 34 | 1532,"[1, 3, 6]" 35 | 1533,"[7, 9]" 36 | 1534,"[6, 7, 8]" 37 | 1535,"[2, 4, 5]" 38 | 1536,"[6, 7, 9]" 39 | 1537,"[6, 8]" 40 | 1538,"[0, 3, 4, 6]" 41 | 1539,"[2, 4, 7, 8]" 42 | 1540,"[1, 7, 8, 9]" 43 | 1541,"[1, 4, 6]" 44 | 1542,"[1, 2, 6, 8]" 45 | 1543,"[1, 2, 4, 5, 7]" 46 | 1544,"[6, 7, 8]" 47 | 1545,"[6, 8]" 48 | 1546,"[0, 2, 5, 6]" 49 | 1547,"[2, 4, 5, 7, 9]" 50 | 1548,"[1, 2, 5, 6]" 51 | 1549,"[2, 5]" 52 | 1550,[5] 53 | 1551,"[4, 6, 8]" 54 | 1552,"[1, 6, 7, 9]" 55 | 1553,"[1, 3, 6]" 56 | 1554,"[0, 2, 3, 7]" 57 | 1555,"[2, 3, 5]" 58 | 1556,"[3, 4, 9]" 59 | 1557,"[7, 8, 9]" 60 | 1558,"[1, 2, 3, 4, 5, 6]" 61 | 1559,"[8, 9]" 62 | 1560,"[1, 7, 8, 9]" 63 | 1561,"[2, 4, 5]" 64 | 1562,"[0, 2, 4, 5, 6]" 65 | 1563,"[1, 2, 4, 6, 9]" 66 | 1564,"[0, 1, 3, 4, 6]" 67 | 1565,"[2, 4, 9]" 68 | 1566,"[2, 3, 4, 9]" 69 | 1567,"[1, 6, 7, 9]" 70 | 1568,"[1, 2, 6, 8]" 71 | 1569,"[1, 9]" 72 | 1570,"[2, 5]" 73 | 1571,"[7, 8]" 74 | 1572,"[1, 7, 8, 9]" 75 | 1573,"[1, 3, 6]" 76 | 1574,"[5, 7]" 77 | 1575,"[1, 2, 4, 6]" 78 | 1576,"[7, 8, 9]" 79 | 1577,"[1, 6, 7, 9]" 80 | 1578,"[0, 1, 2, 4, 7, 9]" 81 | 1579,"[4, 5, 6, 7, 9]" 82 | 1580,"[2, 4, 5]" 83 | 1581,"[3, 7, 9]" 84 | 1582,"[7, 9]" 85 | 1583,"[1, 3, 4, 6, 9]" 86 | 1584,"[2, 4, 7]" 87 | 1585,"[1, 5]" 88 | 1586,"[0, 2, 3, 5]" 89 | 1587,"[1, 2, 6, 7, 8, 9]" 90 | 1588,"[4, 6, 8]" 91 | 1589,"[2, 4, 5]" 92 | 1590,"[0, 2, 4, 6, 7]" 93 | 1591,"[3, 6, 8]" 94 | 1592,"[1, 6, 8, 9]" 95 | 1593,"[1, 2, 5]" 96 | 1594,"[6, 7]" 97 | 1595,"[1, 2, 5, 6]" 98 | 1596,"[2, 3, 4, 7, 9]" 99 | 1597,"[5, 7]" 100 | 1598,"[1, 2, 6, 7, 8, 9]" 101 | 1599,"[2, 3, 4, 6, 7]" 102 | -------------------------------------------------------------------------------- /data_od_evaluation/letter_pca_gt_iforest.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 1500,"[2, 3, 9]" 3 | 1501,"[4, 8]" 4 | 1502,[5] 5 | 1503,"[0, 3, 4, 6]" 6 | 1504,"[1, 6, 9]" 7 | 1505,"[1, 3, 4, 5, 9]" 8 | 1506,"[1, 4, 7, 8]" 9 | 1507,"[4, 9]" 10 | 1508,"[1, 
2, 4, 9]" 11 | 1509,"[1, 4, 6, 8, 9]" 12 | 1510,"[3, 4, 5, 6, 8, 9]" 13 | 1511,"[0, 3, 4, 5, 6]" 14 | 1512,"[0, 8]" 15 | 1513,"[2, 3, 4, 5, 6]" 16 | 1514,"[0, 3, 4, 6, 7]" 17 | 1515,"[0, 1, 3, 5, 6]" 18 | 1516,"[0, 4]" 19 | 1517,"[0, 4, 5, 6]" 20 | 1518,"[2, 3, 4]" 21 | 1519,[8] 22 | 1520,"[6, 7, 8]" 23 | 1521,"[0, 1, 2, 4, 6, 9]" 24 | 1522,"[0, 9]" 25 | 1523,"[2, 4, 5, 6, 9]" 26 | 1524,"[1, 5, 6, 8]" 27 | 1525,"[3, 5, 6, 7]" 28 | 1526,"[0, 1, 2, 5]" 29 | 1527,"[1, 2, 3, 4, 7]" 30 | 1528,"[4, 9]" 31 | 1529,"[2, 4, 5, 6, 7, 8]" 32 | 1530,"[2, 4, 8, 9]" 33 | 1531,"[3, 5, 6]" 34 | 1532,"[0, 3, 4, 6]" 35 | 1533,"[1, 3, 5, 7, 9]" 36 | 1534,"[3, 4, 5, 6, 7]" 37 | 1535,"[2, 4, 9]" 38 | 1536,"[1, 7, 9]" 39 | 1537,"[3, 4, 6, 8]" 40 | 1538,"[0, 4, 7]" 41 | 1539,"[0, 4]" 42 | 1540,"[1, 7, 8, 9]" 43 | 1541,"[0, 1, 4, 6, 7]" 44 | 1542,"[1, 2, 4, 5, 6, 8]" 45 | 1543,"[1, 2, 4, 5]" 46 | 1544,"[2, 6, 7, 8, 9]" 47 | 1545,[6] 48 | 1546,"[0, 2, 5, 6, 7, 8]" 49 | 1547,"[0, 4]" 50 | 1548,"[2, 5]" 51 | 1549,"[2, 4, 5, 9]" 52 | 1550,[5] 53 | 1551,"[4, 9]" 54 | 1552,"[6, 9]" 55 | 1553,"[0, 1, 5, 6, 7]" 56 | 1554,"[0, 1, 2, 3, 7]" 57 | 1555,"[0, 3, 5, 8]" 58 | 1556,"[3, 4, 9]" 59 | 1557,"[3, 5, 6, 7, 9]" 60 | 1558,"[2, 3, 4, 5, 6, 9]" 61 | 1559,"[0, 8, 9]" 62 | 1560,"[0, 1, 5]" 63 | 1561,"[4, 9]" 64 | 1562,"[0, 2, 4, 5, 6]" 65 | 1563,"[2, 4, 5, 6, 9]" 66 | 1564,"[0, 3, 6, 7]" 67 | 1565,"[0, 1, 2, 6, 9]" 68 | 1566,[9] 69 | 1567,"[1, 4, 5, 7]" 70 | 1568,"[1, 2, 3, 4, 7, 8]" 71 | 1569,"[0, 1]" 72 | 1570,[5] 73 | 1571,"[4, 7]" 74 | 1572,"[0, 4, 9]" 75 | 1573,"[0, 1]" 76 | 1574,"[7, 8]" 77 | 1575,"[0, 1, 4, 6]" 78 | 1576,"[4, 7, 9]" 79 | 1577,"[1, 2, 3, 4, 7, 8, 9]" 80 | 1578,"[0, 1, 3, 4, 9]" 81 | 1579,"[0, 4, 7]" 82 | 1580,"[1, 2, 5, 6, 8, 9]" 83 | 1581,"[0, 7, 8, 9]" 84 | 1582,"[4, 9]" 85 | 1583,"[1, 3, 4, 6, 8, 9]" 86 | 1584,"[2, 3, 4, 7]" 87 | 1585,"[1, 5]" 88 | 1586,"[0, 2]" 89 | 1587,"[2, 3, 7, 8, 9]" 90 | 1588,"[2, 4, 5, 6, 9]" 91 | 1589,"[2, 5, 6, 7]" 92 | 1590,"[2, 4]" 93 | 1591,"[3, 4, 8]" 94 | 1592,"[1, 3, 4, 6, 8, 9]" 95 | 1593,"[3, 5]" 96 | 1594,"[5, 7]" 97 | 1595,"[1, 2, 5, 6, 9]" 98 | 1596,[9] 99 | 1597,"[4, 7]" 100 | 1598,"[2, 3, 9]" 101 | 1599,"[0, 2, 3, 4, 6, 7]" 102 | -------------------------------------------------------------------------------- /data_od_evaluation/ionosphere_pca_gt_iforest.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 1,"[2, 9]" 3 | 3,"[1, 2, 3, 7, 9]" 4 | 5,"[2, 9]" 5 | 7,"[6, 8]" 6 | 9,"[2, 3]" 7 | 11,"[0, 3, 6, 8]" 8 | 13,"[0, 6, 9]" 9 | 15,"[4, 9]" 10 | 17,[5] 11 | 19,"[1, 6, 8]" 12 | 21,"[0, 1, 2, 7]" 13 | 23,"[4, 7, 9]" 14 | 25,"[0, 1]" 15 | 27,[7] 16 | 29,"[1, 4, 9]" 17 | 31,"[0, 4, 9]" 18 | 33,"[0, 3, 5, 8, 9]" 19 | 35,"[0, 4, 7]" 20 | 37,"[0, 1, 4, 8, 9]" 21 | 39,"[0, 1, 6]" 22 | 41,"[0, 1, 5, 7]" 23 | 43,"[0, 7, 8]" 24 | 45,"[0, 6, 8]" 25 | 47,"[0, 1, 5, 6]" 26 | 49,"[2, 5, 7]" 27 | 51,"[4, 7, 9]" 28 | 53,[4] 29 | 55,[0] 30 | 57,"[4, 7]" 31 | 59,"[1, 2, 4, 9]" 32 | 61,[4] 33 | 63,"[1, 2, 4, 6, 7]" 34 | 65,"[0, 6, 9]" 35 | 67,"[0, 1, 3, 6, 7]" 36 | 69,"[0, 8]" 37 | 71,"[1, 4, 5, 6, 7, 8]" 38 | 73,[6] 39 | 75,[6] 40 | 77,"[1, 4]" 41 | 79,"[1, 2, 4]" 42 | 81,"[0, 1, 5, 6, 7]" 43 | 83,[1] 44 | 85,"[7, 9]" 45 | 87,"[2, 5]" 46 | 89,[6] 47 | 91,"[5, 6, 8]" 48 | 93,[6] 49 | 95,"[0, 1, 2, 3]" 50 | 98,"[1, 2, 7]" 51 | 100,"[0, 1, 3]" 52 | 102,"[2, 9]" 53 | 104,"[2, 6, 7, 8]" 54 | 106,"[4, 7]" 55 | 108,"[1, 4, 5, 7]" 56 | 110,"[6, 7]" 57 | 112,"[0, 7, 8]" 58 | 114,[6] 59 | 116,[8] 60 | 118,[5] 61 | 120,"[8, 9]" 
62 | 122,"[6, 7]" 63 | 124,"[7, 9]" 64 | 126,"[2, 8]" 65 | 128,"[2, 4, 8, 9]" 66 | 130,"[0, 5, 6]" 67 | 132,"[4, 8, 9]" 68 | 134,"[0, 3, 4]" 69 | 136,"[0, 1, 5, 6]" 70 | 138,"[2, 3]" 71 | 140,"[0, 5]" 72 | 142,"[0, 1, 5]" 73 | 144,[1] 74 | 146,[7] 75 | 148,"[0, 1, 2, 8, 9]" 76 | 150,[9] 77 | 152,"[2, 3]" 78 | 154,"[2, 3]" 79 | 156,"[2, 3]" 80 | 158,"[2, 3]" 81 | 160,"[2, 3]" 82 | 162,[3] 83 | 164,"[1, 9]" 84 | 166,"[6, 7]" 85 | 168,"[2, 3]" 86 | 170,"[1, 2, 7, 8]" 87 | 172,"[2, 9]" 88 | 174,"[6, 7]" 89 | 176,"[2, 4]" 90 | 178,"[2, 3]" 91 | 180,"[6, 9]" 92 | 182,"[1, 2, 4, 9]" 93 | 184,"[4, 9]" 94 | 186,[8] 95 | 188,"[0, 1, 2, 3, 6]" 96 | 190,"[1, 2, 3, 4, 6, 8]" 97 | 192,"[1, 2, 6]" 98 | 194,"[1, 5, 6]" 99 | 196,"[1, 2, 4]" 100 | 198,"[2, 7, 9]" 101 | 200,"[4, 9]" 102 | 202,[5] 103 | 204,"[1, 6, 7]" 104 | 206,"[0, 1, 6]" 105 | 208,"[6, 9]" 106 | 210,"[1, 6]" 107 | 212,"[0, 3, 9]" 108 | 214,"[0, 2, 3, 4]" 109 | 216,"[0, 4, 6, 7]" 110 | 218,"[1, 6]" 111 | 220,"[0, 1, 5]" 112 | 222,"[2, 7]" 113 | 224,"[6, 8]" 114 | 226,"[6, 8]" 115 | 228,"[0, 2, 7]" 116 | 230,"[2, 3]" 117 | 232,[2] 118 | 234,"[0, 4]" 119 | 236,[1] 120 | 238,"[2, 3]" 121 | 240,"[2, 3]" 122 | 242,"[3, 7]" 123 | 244,"[0, 5]" 124 | 246,"[1, 8]" 125 | 248,"[2, 9]" 126 | 250,"[5, 7]" 127 | 252,"[0, 5, 6, 7]" 128 | -------------------------------------------------------------------------------- /data_od_evaluation/optdigits_pca_gt_hbos.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 5066,"[0, 4, 9]" 3 | 5067,[6] 4 | 5068,[6] 5 | 5069,"[0, 6]" 6 | 5070,[6] 7 | 5071,[6] 8 | 5072,[6] 9 | 5073,"[2, 6]" 10 | 5074,[6] 11 | 5075,[6] 12 | 5076,[6] 13 | 5077,[6] 14 | 5078,[6] 15 | 5079,[6] 16 | 5080,[6] 17 | 5081,"[1, 3, 6, 7]" 18 | 5082,[6] 19 | 5083,[6] 20 | 5084,[6] 21 | 5085,"[0, 6]" 22 | 5086,[6] 23 | 5087,[6] 24 | 5088,[6] 25 | 5089,[6] 26 | 5090,"[0, 2, 6]" 27 | 5091,[6] 28 | 5092,"[0, 3, 6]" 29 | 5093,"[0, 3, 6]" 30 | 5094,[6] 31 | 5095,[6] 32 | 5096,[6] 33 | 5097,"[0, 6]" 34 | 5098,[6] 35 | 5099,[6] 36 | 5100,[6] 37 | 5101,[6] 38 | 5102,"[1, 3, 6, 7]" 39 | 5103,[6] 40 | 5104,[6] 41 | 5105,"[0, 2, 6]" 42 | 5106,"[0, 2, 3, 6, 8]" 43 | 5107,[6] 44 | 5108,[3] 45 | 5109,[6] 46 | 5110,[6] 47 | 5111,[6] 48 | 5112,[6] 49 | 5113,[6] 50 | 5114,"[0, 2, 6]" 51 | 5115,[6] 52 | 5116,[6] 53 | 5117,[6] 54 | 5118,"[2, 6]" 55 | 5119,"[0, 6]" 56 | 5120,[6] 57 | 5121,"[0, 1, 2, 3, 6, 8]" 58 | 5122,[6] 59 | 5123,[6] 60 | 5124,[6] 61 | 5125,[6] 62 | 5126,"[0, 1, 2, 6]" 63 | 5127,[6] 64 | 5128,[6] 65 | 5129,"[6, 8]" 66 | 5130,[6] 67 | 5131,[3] 68 | 5132,[6] 69 | 5133,[3] 70 | 5134,[3] 71 | 5135,"[2, 3, 6]" 72 | 5136,[6] 73 | 5137,"[0, 2, 5, 6]" 74 | 5138,"[3, 6]" 75 | 5139,[6] 76 | 5140,[6] 77 | 5141,"[0, 3, 6, 7]" 78 | 5142,[6] 79 | 5143,[6] 80 | 5144,"[1, 2, 4, 6, 7]" 81 | 5145,[6] 82 | 5146,"[0, 4, 9]" 83 | 5147,[6] 84 | 5148,"[0, 1, 6, 8]" 85 | 5149,[6] 86 | 5150,"[0, 2, 4, 6, 7]" 87 | 5151,[6] 88 | 5152,"[0, 1, 6]" 89 | 5153,[6] 90 | 5154,[6] 91 | 5155,[6] 92 | 5156,[6] 93 | 5157,"[0, 2, 4, 6, 7]" 94 | 5158,"[3, 8, 9]" 95 | 5159,[6] 96 | 5160,"[0, 3, 6]" 97 | 5161,"[0, 6]" 98 | 5162,"[0, 2, 6]" 99 | 5163,[6] 100 | 5164,[6] 101 | 5165,"[0, 6]" 102 | 5166,[3] 103 | 5167,[6] 104 | 5168,[6] 105 | 5169,[6] 106 | 5170,[6] 107 | 5171,"[0, 3, 6]" 108 | 5172,[0] 109 | 5173,[6] 110 | 5174,"[2, 4, 6]" 111 | 5175,[6] 112 | 5176,[6] 113 | 5177,"[0, 6]" 114 | 5178,[6] 115 | 5179,[3] 116 | 5180,[6] 117 | 5181,[6] 118 | 5182,"[0, 6]" 119 | 5183,[6] 120 | 5184,"[0, 6, 9]" 121 | 5185,"[0, 3, 5, 6, 7, 
8]" 122 | 5186,[6] 123 | 5187,[6] 124 | 5188,"[0, 6]" 125 | 5189,[6] 126 | 5190,"[0, 6]" 127 | 5191,[6] 128 | 5192,[6] 129 | 5193,"[0, 6]" 130 | 5194,[6] 131 | 5195,[6] 132 | 5196,"[6, 7]" 133 | 5197,"[6, 8]" 134 | 5198,[6] 135 | 5199,"[0, 6]" 136 | 5200,"[0, 1, 2, 6]" 137 | 5201,[6] 138 | 5202,"[3, 7, 8, 9]" 139 | 5203,"[0, 4, 9]" 140 | 5204,"[2, 3, 6, 8, 9]" 141 | 5205,"[0, 2, 3, 6, 8]" 142 | 5206,[3] 143 | 5207,[6] 144 | 5208,[9] 145 | 5209,[6] 146 | 5210,"[0, 2, 6, 8]" 147 | 5211,[6] 148 | 5212,[6] 149 | 5213,[6] 150 | 5214,[6] 151 | 5215,[6] 152 | -------------------------------------------------------------------------------- /utils/synthetic_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def generate_data(n_nor, n_ano, dim, rate=0, n_nor_c=1, n_ano_c=1): 6 | if rate > 0: 7 | n_ano = int(n_nor * rate) 8 | 9 | # normal class with "n_nor_c" clusters 10 | x_nor = np.zeros([n_nor, dim]) 11 | for i in range(dim): 12 | size = round(n_nor / n_nor_c) 13 | for j in range(n_nor_c): 14 | loc = np.random.rand() 15 | scale = float(np.random.rand()) 16 | print("Inlier: dim"+str(i), "cluster"+str(j), round(loc, 1), round(scale, 2)) 17 | # last c 18 | if j == n_nor_c - 1: 19 | last_size = n_nor - (n_nor_c-1)*size 20 | x_nor[j * size:, i] = np.random.normal(loc, scale, last_size) 21 | else: 22 | x_nor[j * size: (j+1)*size, i] = np.random.normal(loc, scale, size) 23 | 24 | x_ano = np.zeros([n_ano, dim]) 25 | for i in range(dim): 26 | size = round(n_ano / n_ano_c) 27 | for j in range(n_ano_c): 28 | loc = np.random.rand() + 1 29 | scale = float(np.random.rand()) 30 | print("anomaly: dim"+str(i), "cluster"+str(j), round(loc, 1), round(scale, 2)) 31 | 32 | # last c 33 | if j != n_ano_c - 1: 34 | x_ano[j*size: (j+1)*size, i] = np.random.normal(loc, scale, size) 35 | else: 36 | last_size = n_ano - (n_ano_c - 1) * size 37 | x_ano[j*size:, i] = np.random.normal(loc, scale, last_size) 38 | # x_ano[:, i] = np.random.normal(loc, scale, n_ano) 39 | 40 | x = np.concatenate([x_ano, x_nor], axis=0) 41 | y = np.append(np.ones(n_ano, dtype=int), np.zeros(n_nor, dtype=int)) 42 | matrix = np.concatenate([x, y.reshape([x.shape[0], 1])], axis=1) 43 | 44 | columns = ["A"+str(i) for i in range(dim)] 45 | columns.append("class") 46 | df = pd.DataFrame(matrix, columns=columns) 47 | df['class'] = df['class'].astype(int) 48 | return df 49 | 50 | 51 | # Scal-up Test 52 | dim_range = [8, 32, 128, 512, 2048] 53 | size_range = [1000, 4000, 16000, 64000, 256000] 54 | 55 | 56 | root = "../scal_data/" 57 | for ii, dim in enumerate(dim_range): 58 | n_nor = 995 59 | n_ano = 5 60 | size = n_nor + n_ano 61 | df = generate_data(n_nor=n_nor, n_ano=n_ano, dim=dim) 62 | name = "scal_dim" + str(ii) + "_" + str(size) + "-" + str(dim) + ".csv" 63 | df.to_csv(root + name, index=False) 64 | 65 | for ii, size in enumerate(size_range): 66 | dim = 32 67 | n_nor = int(size * 0.995) 68 | n_ano = int(size * 0.005) 69 | df = generate_data(n_nor=n_nor, n_ano=n_ano, dim=dim) 70 | name = "scal_size" + str(ii) + "_" + str(size) + "-" + str(dim) + ".csv" 71 | df.to_csv(root + name, index=False) 72 | -------------------------------------------------------------------------------- /model_coin/prediction_strength.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.cluster import KMeans 3 | 4 | 5 | def ClosestCenter(point, centroids): 6 | # Find the closest center over all centroids 7 | 
min_index = -1 8 | min_dist = float('inf') 9 | for i in range(len(centroids)): 10 | center = centroids[i] 11 | dist_cur = np.linalg.norm(point - center) 12 | if dist_cur < min_dist: 13 | min_index = i 14 | min_dist = dist_cur 15 | 16 | return min_index 17 | 18 | 19 | def PredictionStrength(data_test, test_labels, train_centers, c): 20 | # Compute prediction strength under c clusters 21 | pred_strength = np.zeros(c) 22 | for cc in range(c): 23 | num_cc = test_labels.tolist().count(cc) 24 | count = 0. 25 | for i in range(len(test_labels)-1): 26 | for j in range(i+1, len(test_labels)): 27 | if test_labels[i] == test_labels[j] == cc: 28 | pi = data_test[i] 29 | pj = data_test[j] 30 | if ClosestCenter(pi, train_centers) == ClosestCenter(pj, train_centers): 31 | count += 1 32 | 33 | if num_cc <= 1: 34 | pred_strength[cc] = float('inf') 35 | else: 36 | pred_strength[cc] = count/(num_cc * (num_cc-1)/2.) 37 | 38 | return min(pred_strength) 39 | 40 | 41 | def optimalK(data, num_fold, maxClusters=5, THRE_PS=0.90): 42 | # Find the best number of clusters using prediction strength 43 | num_data = data.shape[0] 44 | num_feat = data.shape[1] 45 | 46 | pred_strength_avg = np.zeros(maxClusters+1) 47 | for nf in range(num_fold): 48 | # Split into training and testing samples 49 | inds_train = np.random.choice(num_data, int(num_data*0.5), replace=False) 50 | inds_test = list(set(range(num_data)).difference(inds_train)) 51 | data_train = data[inds_train] 52 | data_test = data[inds_test] 53 | 54 | pred_strength_cur = np.zeros(maxClusters+1) 55 | for c in range(1, maxClusters+1): 56 | train_cluster = KMeans(n_clusters=c).fit(data_train) 57 | test_cluster = KMeans(n_clusters=c).fit(data_test) 58 | pred_strength_cur[c] = PredictionStrength(data_test, test_cluster.labels_, train_cluster.cluster_centers_, c) 59 | 60 | pred_strength_avg += pred_strength_cur 61 | 62 | pred_strength_avg /= num_fold 63 | # print("Prediction Strength vec: ", pred_strength_avg) 64 | 65 | # the largest k whose averaged prediction strength exceeds the threshold 66 | k_optimal = max([i for i, j in enumerate(pred_strength_avg) if j > THRE_PS]) 67 | 68 | return k_optimal 69 | 70 | 71 | # if __name__ == "__main__": 72 | # from sklearn.datasets import make_blobs 73 | # import matplotlib.pyplot as plt 74 | # 75 | # x, y = make_blobs(1000, n_features=5, centers=3) 76 | # plt.scatter(x[:, 0], x[:, 1]) 77 | # plt.show() 78 | # 79 | # k = optimalK(x, 10) 80 | # print('Optimal k is: ', k) -------------------------------------------------------------------------------- /data_od_evaluation/ionosphere_pca_gt_copod.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 1,"[2, 5, 9]" 3 | 3,"[2, 3, 6, 7, 8]" 4 | 5,"[0, 2, 3, 8]" 5 | 7,"[2, 3, 8]" 6 | 9,"[0, 1, 3, 6]" 7 | 11,"[0, 8]" 8 | 13,"[0, 2, 5, 6, 7, 8]" 9 | 15,"[2, 5, 6, 7, 8, 9]" 10 | 17,[9] 11 | 19,"[5, 8]" 12 | 21,"[0, 3, 4]" 13 | 23,"[4, 7, 9]" 14 | 25,"[0, 1, 2, 4, 5, 7]" 15 | 27,"[2, 3, 6, 7, 8]" 16 | 29,"[1, 3, 6, 7, 9]" 17 | 31,"[0, 3, 4]" 18 | 33,"[4, 5, 8]" 19 | 35,"[7, 9]" 20 | 37,"[1, 3, 4, 8]" 21 | 39,"[1, 6, 7]" 22 | 41,"[0, 5, 7, 8, 9]" 23 | 43,"[7, 8]" 24 | 45,"[0, 4, 6]" 25 | 47,"[5, 6, 7]" 26 | 49,"[0, 2, 5, 7]" 27 | 51,"[4, 7, 9]" 28 | 53,"[2, 3, 4, 6]" 29 | 55,[0] 30 | 57,"[3, 4]" 31 | 59,"[1, 2, 3, 6, 8]" 32 | 61,"[2, 6, 7]" 33 | 63,"[4, 6]" 34 | 65,"[1, 2, 7, 8]" 35 | 67,"[5, 6, 7]" 36 | 69,[8] 37 | 71,"[1, 5, 6, 7]" 38 | 73,"[1, 3, 6]" 39 | 75,[6] 40 | 77,[4] 41 | 79,"[2, 4]" 42 | 81,"[2, 5, 6, 7, 8]" 43 | 83,"[1, 5]" 44 | 85,"[1, 9]" 45 | 87,"[2, 6, 8, 9]" 46 | 89,"[0, 1, 8]" 47 | 91,"[4, 5, 6, 8]" 48 | 93,"[2, 3, 4, 6]" 49 | 95,"[0, 1]" 50 | 98,"[1, 2, 5, 6, 7]" 51 | 100,"[1, 5]" 52 | 
102,"[2, 4, 6, 7, 8, 9]" 53 | 104,"[2, 5, 8]" 54 | 106,"[3, 4]" 55 | 108,"[1, 4, 5, 7, 9]" 56 | 110,"[1, 2, 5, 7, 9]" 57 | 112,"[5, 8]" 58 | 114,"[0, 1, 3, 6]" 59 | 116,"[4, 7, 8]" 60 | 118,"[2, 5, 6, 7, 8]" 61 | 120,"[7, 8]" 62 | 122,"[6, 7, 9]" 63 | 124,"[4, 5, 6, 7]" 64 | 126,"[2, 6, 8, 9]" 65 | 128,"[2, 6, 7, 8, 9]" 66 | 130,"[4, 5, 7, 9]" 67 | 132,"[4, 9]" 68 | 134,"[4, 9]" 69 | 136,"[0, 4]" 70 | 138,"[0, 2, 3, 4, 6]" 71 | 140,[5] 72 | 142,"[1, 4, 5, 7, 9]" 73 | 144,"[1, 7]" 74 | 146,"[0, 2, 5, 7]" 75 | 148,"[1, 2, 3, 9]" 76 | 150,"[1, 7, 9]" 77 | 152,"[0, 2, 3, 7, 8]" 78 | 154,"[0, 1, 3, 8]" 79 | 156,"[0, 3]" 80 | 158,"[0, 2, 3, 6]" 81 | 160,"[0, 3]" 82 | 162,[3] 83 | 164,"[1, 4, 6, 7, 8, 9]" 84 | 166,"[0, 4, 6]" 85 | 168,"[0, 2, 3, 4, 8]" 86 | 170,"[2, 4, 7, 9]" 87 | 172,"[2, 8, 9]" 88 | 174,"[6, 8]" 89 | 176,"[2, 3, 6, 8]" 90 | 178,"[0, 3, 6]" 91 | 180,"[6, 7, 8, 9]" 92 | 182,"[1, 2, 4, 7, 9]" 93 | 184,"[2, 8, 9]" 94 | 186,"[1, 2, 3, 4, 6, 7, 8]" 95 | 188,"[3, 4, 6]" 96 | 190,"[1, 2, 3, 4, 6, 8]" 97 | 192,"[1, 2, 3, 6, 8]" 98 | 194,"[3, 6]" 99 | 196,"[1, 2, 3, 4, 6]" 100 | 198,"[5, 9]" 101 | 200,"[4, 7, 8, 9]" 102 | 202,"[0, 1, 2, 3, 4, 5]" 103 | 204,"[0, 1, 3, 6]" 104 | 206,"[0, 1, 6]" 105 | 208,"[1, 6, 7, 9]" 106 | 210,"[2, 3, 6, 7, 9]" 107 | 212,"[0, 3, 8]" 108 | 214,"[1, 2, 3, 4, 9]" 109 | 216,"[0, 1, 7, 8]" 110 | 218,"[4, 5, 6, 7]" 111 | 220,"[0, 3, 5, 6]" 112 | 222,"[6, 7]" 113 | 224,"[6, 7, 8, 9]" 114 | 226,"[0, 1, 3, 6]" 115 | 228,"[2, 9]" 116 | 230,"[1, 3, 4, 8]" 117 | 232,"[2, 4, 5]" 118 | 234,"[1, 2, 4, 6, 8]" 119 | 236,"[0, 1]" 120 | 238,"[2, 3, 8]" 121 | 240,"[2, 4, 6, 7, 8, 9]" 122 | 242,"[5, 6, 7]" 123 | 244,"[4, 5, 7, 9]" 124 | 246,"[0, 1, 3, 8]" 125 | 248,"[2, 4, 6, 7, 8, 9]" 126 | 250,"[7, 9]" 127 | 252,"[4, 5, 6, 7]" 128 | -------------------------------------------------------------------------------- /model_aton/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script implements an outlier interpretation method of the following paper: 3 | "Beyond Outlier Detection: Outlier Interpretation by Attention-Guided Triplet Deviation Network". in WWW'21. 4 | @ Author: Hongzuo Xu 5 | @ email: hongzuo.xu@gmail.com or leogarcia@126.com or xuhongzuo13@nudt.edu.cn 6 | """ 7 | 8 | 9 | import numpy as np 10 | import torch 11 | import random, string 12 | import os 13 | mask = ''.join(random.sample(string.ascii_letters, 8)) 14 | 15 | 16 | def min_max_normalize(x): 17 | n, dim = x.shape 18 | x_n = np.zeros(x.shape) 19 | for i in range(dim): 20 | array = x[:, i] 21 | _min, _max = np.min(array), np.max(array) 22 | if _min == _max: 23 | x_n[:, i] = np.zeros(n) 24 | else: 25 | x_n[:, i] = (array - _min) / (_max - _min) 26 | 27 | return x_n 28 | 29 | 30 | class EarlyStopping: 31 | """Early stops the training if validation loss doesn't improve after a given patience.""" 32 | def __init__(self, patience=7, verbose=False, delta=0, path="checkpoints/" + mask + '_checkpoint.pt', trace_func=print): 33 | """ 34 | Args: 35 | patience (int): How long to wait after last time validation loss improved. 36 | Default: 7 37 | verbose (bool): If True, prints a message for each validation loss improvement. 38 | Default: False 39 | delta (float): Minimum change in the monitored quantity to qualify as an improvement. 40 | Default: 0 41 | path (str): Path for the checkpoint to be saved to. 42 | Default: 'checkpoint.pt' 43 | trace_func (function): trace print function. 
44 | Default: print 45 | """ 46 | self.patience = patience 47 | self.verbose = verbose 48 | self.counter = 0 49 | self.best_score = None 50 | self.early_stop = False 51 | self.val_loss_min = np.inf 52 | self.delta = delta 53 | self.path = path 54 | self.trace_func = trace_func 55 | 56 | def __call__(self, val_loss, model): 57 | score = -val_loss 58 | if self.best_score is None: 59 | self.best_score = score 60 | self.save_checkpoint(val_loss, model) 61 | elif score < self.best_score + self.delta: 62 | self.counter += 1 63 | # self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}') 64 | if self.counter >= self.patience: 65 | self.early_stop = True 66 | else: 67 | self.best_score = score 68 | self.save_checkpoint(val_loss, model) 69 | self.counter = 0 70 | 71 | def save_checkpoint(self, val_loss, model): 72 | """Saves the model when the validation loss decreases.""" 73 | if self.verbose: 74 | self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...') 75 | torch.save(model.state_dict(), self.path) 76 | self.val_loss_min = val_loss -------------------------------------------------------------------------------- /model_iml/IntGrad.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | import numpy as np 3 | from alibi.explainers import IntegratedGradients 4 | from tensorflow.keras.layers import Dense, Input 5 | from tensorflow.keras.models import Model 6 | from tensorflow.keras.utils import to_categorical 7 | from tensorflow.keras import optimizers 8 | 9 | 10 | class IntGrad: 11 | def __init__(self, n_steps=50, method="gausslegendre"): 12 | """ 13 | 14 | :param n_steps: number of steps in the path integral approximation 15 | :param method: integration method used by alibi's IntegratedGradients (e.g., "gausslegendre") 16 | """ 17 | self.clf_batch_size = 64 18 | self.clf_epochs = 30 19 | 20 | self.n_steps = n_steps 21 | self.method = method 22 | 23 | self.ano_idx = None 24 | self.nor_idx = None 25 | 26 | self.dim = None 27 | return 28 | 29 | def fit(self, x, y): 30 | self.dim = x.shape[1] 31 | x = min_max_normalize(x) 32 | # x = z_score_normalize(x) 33 | y_oh = to_categorical(y, 2) 34 | clf = self.nn_model() 35 | clf.fit(x, y_oh, batch_size=self.clf_batch_size, epochs=self.clf_epochs, verbose=1) 36 | y_pred = clf(x).numpy().argmax(axis=1) 37 | print("Clf model accuracy: [{:.4f}]".format(sklearn.metrics.accuracy_score(y, y_pred))) 38 | 39 | # Initialize IntegratedGradients instance 40 | ig = IntegratedGradients(clf, n_steps=self.n_steps, method=self.method) 41 | 42 | # Calculate attributions for all detected anomalies w.r.t. the outlier class 43 | self.ano_idx = np.where(y == 1)[0] 44 | x_ano = x[self.ano_idx] 45 | # predictions = clf(x_ano).numpy().argmax(axis=1) 46 | predictions = np.ones(len(self.ano_idx), dtype=int) 47 | 48 | self.nor_idx = np.where(y == 0)[0] 49 | x_nor = x[self.nor_idx] 50 | x_nor_avg = np.average(x_nor, axis=0) 51 | baselines = np.array([x_nor_avg] * len(self.ano_idx)) 52 | explanation = ig.explain(x_ano, baselines=baselines, target=predictions) 53 | 54 | fea_weight_lst = explanation.data['attributions'] 55 | return fea_weight_lst 56 | 57 | def nn_model(self): 58 | x_in = Input(shape=(self.dim,)) 59 | x = Dense(10, activation='relu')(x_in) 60 | # x = Dense(10, activation='relu')(x) 61 | x_out = Dense(2, activation='softmax')(x) 62 | nn = Model(inputs=x_in, outputs=x_out) 63 | sgd = optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True) 64 | nn.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy']) 65 | return nn 66 | 67 | 68 | def min_max_normalize(x): 69 | n, dim = 
x.shape 70 | x_n = np.zeros(x.shape) 71 | for i in range(dim): 72 | array = x[:, i] 73 | _min, _max = np.min(array), np.max(array) 74 | if _min == _max: 75 | x_n[:, i] = np.zeros(n) 76 | else: 77 | x_n[:, i] = (array - _min) / (_max - _min) 78 | 79 | return x_n 80 | 81 | 82 | def z_score_normalize(x): 83 | n, dim = x.shape 84 | x_n = np.zeros(x.shape) 85 | for i in range(dim): 86 | array = x[:, i] 87 | avg = np.average(array) 88 | std = np.std(array) 89 | if std != 0: 90 | x_n[:, i] = (array - avg) / std 91 | else: 92 | x_n[:, i] = array 93 | return x_n 94 | -------------------------------------------------------------------------------- /data_od_evaluation/wineQualityWhites-od2_gt_hbos.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 46,[5] 3 | 98,[0] 4 | 115,"[2, 8]" 5 | 147,"[1, 2, 5]" 6 | 172,"[1, 5]" 7 | 176,[6] 8 | 178,"[1, 2, 5]" 9 | 189,[6] 10 | 204,[0] 11 | 207,"[0, 2]" 12 | 230,"[1, 2, 5, 6]" 13 | 250,[8] 14 | 251,"[0, 3]" 15 | 253,"[0, 5, 8]" 16 | 259,"[0, 6]" 17 | 278,[6] 18 | 282,[6] 19 | 294,[10] 20 | 433,"[1, 2, 3, 4, 5, 10]" 21 | 445,[6] 22 | 496,"[5, 10]" 23 | 499,"[5, 10]" 24 | 526,[6] 25 | 540,"[3, 5]" 26 | 626,"[1, 2, 3, 4, 5, 10]" 27 | 641,[6] 28 | 646,"[2, 5]" 29 | 659,[5] 30 | 662,"[1, 2, 3, 4, 5, 6, 10]" 31 | 687,"[1, 4]" 32 | 690,"[5, 9]" 33 | 702,[6] 34 | 740,[6] 35 | 780,"[2, 5]" 36 | 831,"[1, 6, 8]" 37 | 873,"[0, 6]" 38 | 905,[0] 39 | 906,"[0, 6]" 40 | 908,"[0, 3, 5]" 41 | 914,"[0, 6]" 42 | 948,"[1, 5, 6]" 43 | 991,"[0, 6]" 44 | 993,[6] 45 | 1027,"[1, 3, 5, 6, 9]" 46 | 1029,[6] 47 | 1034,"[4, 9]" 48 | 1040,"[1, 5]" 49 | 1042,"[1, 5]" 50 | 1053,[0] 51 | 1059,"[0, 4, 10]" 52 | 1109,[0] 53 | 1114,[6] 54 | 1152,"[1, 2, 5, 6]" 55 | 1154,"[1, 6, 8]" 56 | 1155,"[5, 8]" 57 | 1229,[0] 58 | 1245,"[0, 1, 2, 3, 5]" 59 | 1293,"[5, 9]" 60 | 1294,"[5, 9]" 61 | 1349,[0] 62 | 1363,"[2, 5]" 63 | 1405,"[1, 5]" 64 | 1417,[6] 65 | 1420,[0] 66 | 1423,[6] 67 | 1430,"[1, 5]" 68 | 1474,[6] 69 | 1483,[6] 70 | 1484,[0] 71 | 1541,"[1, 5, 6]" 72 | 1558,"[1, 5]" 73 | 1559,"[0, 6]" 74 | 1574,"[0, 3, 5]" 75 | 1577,"[1, 5]" 76 | 1579,[6] 77 | 1649,[8] 78 | 1652,[0] 79 | 1664,"[0, 3]" 80 | 1688,[5] 81 | 1690,[0] 82 | 1702,[10] 83 | 1708,"[1, 5]" 84 | 1718,[0] 85 | 1739,"[0, 6]" 86 | 1781,"[5, 6, 8]" 87 | 1817,"[1, 2]" 88 | 1856,"[0, 1]" 89 | 1924,[0] 90 | 1931,[5] 91 | 1951,[1] 92 | 1990,"[1, 5]" 93 | 2050,[0] 94 | 2079,"[1, 5]" 95 | 2116,[6] 96 | 2119,[0] 97 | 2154,"[0, 1, 3]" 98 | 2156,[6] 99 | 2159,[6] 100 | 2225,[3] 101 | 2237,[3] 102 | 2246,[3] 103 | 2275,[3] 104 | 2318,"[2, 3]" 105 | 2337,[5] 106 | 2346,"[2, 3]" 107 | 2372,[6] 108 | 2373,[6] 109 | 2379,"[4, 5, 6, 8]" 110 | 2380,"[4, 5, 6, 8]" 111 | 2386,[6] 112 | 2387,[6] 113 | 2388,[10] 114 | 2400,[0] 115 | 2401,[0] 116 | 2409,"[5, 9]" 117 | 2412,"[4, 6, 7, 9]" 118 | 2413,"[5, 9]" 119 | 2414,"[3, 4, 5, 6, 7, 8, 9, 10]" 120 | 2435,[6] 121 | 2493,"[3, 5]" 122 | 2494,"[3, 5]" 123 | 2502,[0] 124 | 2503,[0] 125 | 2531,"[1, 5]" 126 | 2532,"[1, 5]" 127 | 2589,"[1, 2, 5, 6]" 128 | 2656,[5] 129 | 2818,[5] 130 | 2888,[6] 131 | 2920,[6] 132 | 2935,"[5, 6, 9]" 133 | 3021,[6] 134 | 3050,[5] 135 | 3067,[6] 136 | 3087,[6] 137 | 3109,[6] 138 | 3179,[0] 139 | 3186,"[0, 6]" 140 | 3218,"[4, 5, 10]" 141 | 3265,"[0, 5, 6, 10]" 142 | 3275,[2] 143 | 3307,[5] 144 | 3409,"[2, 5, 8, 10]" 145 | 3417,"[1, 5, 6]" 146 | 3528,[6] 147 | 3559,[6] 148 | 3571,[6] 149 | 3578,[10] 150 | 3650,"[2, 8]" 151 | 3662,"[1, 2, 5, 6]" 152 | 3714,[8] 153 | 3736,"[3, 5, 6, 9, 10]" 154 | 3770,[10] 155 | 3810,[3] 156 | 
3872,"[5, 10]" 157 | 3879,"[1, 5, 6]" 158 | 3901,"[0, 6]" 159 | 3933,"[0, 6]" 160 | 3965,[10] 161 | 3967,[6] 162 | 3973,[6] 163 | 4020,[10] 164 | 4039,[1] 165 | 4074,[6] 166 | 4212,[6] 167 | 4213,"[2, 4, 5, 6, 10]" 168 | 4217,"[2, 5]" 169 | 4222,"[2, 5]" 170 | 4223,[6] 171 | 4253,"[0, 2, 5, 6, 10]" 172 | 4278,[6] 173 | 4389,"[3, 5]" 174 | 4483,[0] 175 | 4508,[5] 176 | 4609,[6] 177 | 4680,"[1, 2, 5]" 178 | 4686,"[1, 2, 5]" 179 | 4745,[5] 180 | 4774,[10] 181 | 4779,"[1, 2, 5, 6]" 182 | 4804,[6] 183 | 4839,[6] 184 | 4878,"[1, 2, 5, 6]" 185 | -------------------------------------------------------------------------------- /data_od_evaluation/wineQualityWhites-od2_gt_iforest.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 46,"[1, 5]" 3 | 98,"[0, 3]" 4 | 115,"[2, 5, 8]" 5 | 147,"[1, 8]" 6 | 172,"[1, 5, 8]" 7 | 176,"[0, 2, 6, 10]" 8 | 178,"[1, 2, 5]" 9 | 189,"[6, 10]" 10 | 204,"[0, 10]" 11 | 207,"[0, 2]" 12 | 230,"[1, 2]" 13 | 250,"[7, 8]" 14 | 251,"[0, 3]" 15 | 253,"[5, 8, 10]" 16 | 259,[5] 17 | 278,[6] 18 | 282,"[1, 5, 6, 9]" 19 | 294,"[0, 1, 10]" 20 | 433,"[1, 4, 5]" 21 | 445,"[3, 6]" 22 | 496,"[2, 5]" 23 | 499,"[2, 5]" 24 | 526,[6] 25 | 540,"[3, 5, 6, 8]" 26 | 626,"[0, 1, 2, 4, 10]" 27 | 641,"[0, 3]" 28 | 646,"[2, 5]" 29 | 659,"[2, 5]" 30 | 662,"[1, 2, 4, 6]" 31 | 687,"[1, 4]" 32 | 690,"[6, 9]" 33 | 702,[2] 34 | 740,[6] 35 | 780,"[2, 5, 6, 8]" 36 | 831,"[3, 8, 9, 10]" 37 | 873,"[0, 6]" 38 | 905,"[0, 2, 5]" 39 | 906,"[0, 2, 5]" 40 | 908,"[0, 3, 7, 8]" 41 | 914,"[0, 5, 6, 10]" 42 | 948,"[1, 6, 8]" 43 | 991,"[0, 2, 6, 9]" 44 | 993,"[2, 5, 6]" 45 | 1027,"[1, 6, 9]" 46 | 1029,"[1, 6]" 47 | 1034,"[0, 4, 6]" 48 | 1040,"[1, 6, 9]" 49 | 1042,[1] 50 | 1053,"[0, 7]" 51 | 1059,"[0, 4, 10]" 52 | 1109,"[0, 7]" 53 | 1114,[6] 54 | 1152,[2] 55 | 1154,"[0, 8, 10]" 56 | 1155,"[0, 8]" 57 | 1229,"[0, 10]" 58 | 1245,"[1, 2, 3, 6]" 59 | 1293,[9] 60 | 1294,[9] 61 | 1349,"[0, 6, 8, 9]" 62 | 1363,"[2, 5]" 63 | 1405,"[4, 5, 9]" 64 | 1417,"[6, 7, 8]" 65 | 1420,"[0, 3]" 66 | 1423,"[0, 2, 6, 9]" 67 | 1430,"[3, 5, 9]" 68 | 1474,"[6, 10]" 69 | 1483,"[6, 10]" 70 | 1484,"[5, 10]" 71 | 1541,"[1, 6, 9, 10]" 72 | 1558,"[1, 2, 5, 10]" 73 | 1559,"[0, 2, 6]" 74 | 1574,"[3, 8]" 75 | 1577,"[1, 8]" 76 | 1579,"[2, 3, 4, 6, 7, 8, 9]" 77 | 1649,"[7, 8]" 78 | 1652,"[0, 2, 3, 9]" 79 | 1664,"[0, 3, 7, 8]" 80 | 1688,[5] 81 | 1690,"[0, 1, 5]" 82 | 1702,"[5, 10]" 83 | 1708,"[1, 8]" 84 | 1718,"[0, 7]" 85 | 1739,"[0, 5, 6, 8]" 86 | 1781,"[3, 6, 8]" 87 | 1817,[2] 88 | 1856,"[0, 1]" 89 | 1924,[5] 90 | 1931,"[1, 5, 6]" 91 | 1951,"[0, 1]" 92 | 1990,[5] 93 | 2050,"[0, 7]" 94 | 2079,"[1, 8]" 95 | 2116,"[6, 7, 8, 9]" 96 | 2119,"[0, 1, 5, 8]" 97 | 2154,"[0, 1]" 98 | 2156,"[3, 5, 9]" 99 | 2159,"[5, 6, 7, 9, 10]" 100 | 2225,"[0, 3, 4]" 101 | 2237,"[0, 2, 3, 4, 7]" 102 | 2246,"[0, 2, 3, 4, 7]" 103 | 2275,"[0, 3, 4, 10]" 104 | 2318,[2] 105 | 2337,[4] 106 | 2346,[2] 107 | 2372,[5] 108 | 2373,"[1, 5, 6, 8]" 109 | 2379,"[4, 5, 8]" 110 | 2380,"[4, 8, 9]" 111 | 2386,"[3, 6]" 112 | 2387,"[3, 6]" 113 | 2388,[5] 114 | 2400,"[0, 7]" 115 | 2401,"[0, 7]" 116 | 2409,"[8, 9]" 117 | 2412,"[0, 3, 4, 7, 8, 9]" 118 | 2413,"[8, 9]" 119 | 2414,"[0, 3, 4, 7, 8, 9]" 120 | 2435,"[5, 6]" 121 | 2493,"[2, 3, 5, 7]" 122 | 2494,"[2, 3, 5, 7]" 123 | 2502,"[0, 2, 5]" 124 | 2503,"[0, 2, 5]" 125 | 2531,"[1, 5]" 126 | 2532,"[1, 5]" 127 | 2589,"[1, 2]" 128 | 2656,"[6, 9]" 129 | 2818,"[1, 5, 8]" 130 | 2888,[6] 131 | 2920,"[3, 5]" 132 | 2935,[5] 133 | 3021,[5] 134 | 3050,"[3, 5, 6, 8, 10]" 135 | 3067,[2] 136 | 3087,"[0, 6]" 
137 | 3109,"[6, 8]" 138 | 3179,"[0, 9]" 139 | 3186,"[0, 2, 6]" 140 | 3218,[4] 141 | 3265,"[0, 10]" 142 | 3275,[2] 143 | 3307,"[0, 5]" 144 | 3409,[3] 145 | 3417,"[1, 6]" 146 | 3528,"[1, 3, 6, 9]" 147 | 3559,[6] 148 | 3571,"[1, 2, 6]" 149 | 3578,[5] 150 | 3650,"[2, 5]" 151 | 3662,"[1, 6, 8]" 152 | 3714,[8] 153 | 3736,"[3, 6, 9, 10]" 154 | 3770,[4] 155 | 3810,"[3, 10]" 156 | 3872,[10] 157 | 3879,"[1, 5, 6, 9]" 158 | 3901,"[1, 2, 4, 5, 10]" 159 | 3933,"[0, 2, 6]" 160 | 3965,[1] 161 | 3967,[2] 162 | 3973,[2] 163 | 4020,[10] 164 | 4039,[1] 165 | 4074,"[2, 3, 5, 6]" 166 | 4212,"[5, 7, 10]" 167 | 4213,"[2, 4, 5, 6]" 168 | 4217,"[2, 9]" 169 | 4222,"[2, 9]" 170 | 4223,"[0, 1, 2, 5, 6, 7, 8]" 171 | 4253,"[0, 1, 2, 5, 8]" 172 | 4278,"[6, 10]" 173 | 4389,[9] 174 | 4483,"[2, 5]" 175 | 4508,"[5, 10]" 176 | 4609,"[1, 2, 6]" 177 | 4680,"[1, 2]" 178 | 4686,"[1, 2]" 179 | 4745,[5] 180 | 4774,[5] 181 | 4779,[2] 182 | 4804,"[2, 5, 6, 7, 10]" 183 | 4839,"[6, 8]" 184 | 4878,"[2, 5]" 185 | -------------------------------------------------------------------------------- /data_od_evaluation/optdigits_pca_gt_copod.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 5066,"[0, 4, 6, 8, 9]" 3 | 5067,"[0, 3, 6, 8, 9]" 4 | 5068,"[3, 4, 5, 6, 7, 8]" 5 | 5069,"[0, 2, 4, 6]" 6 | 5070,"[2, 3, 6]" 7 | 5071,"[1, 2, 3, 5, 6]" 8 | 5072,"[2, 3, 4, 5, 6, 7]" 9 | 5073,"[0, 2, 3, 6, 8, 9]" 10 | 5074,[6] 11 | 5075,"[2, 3, 6]" 12 | 5076,"[0, 2, 5, 6, 8]" 13 | 5077,"[0, 3, 6, 8, 9]" 14 | 5078,"[2, 3, 4, 6, 9]" 15 | 5079,"[0, 3, 4, 6, 8]" 16 | 5080,"[2, 3, 5, 6, 7]" 17 | 5081,"[3, 4, 6, 7, 8]" 18 | 5082,"[2, 3, 6]" 19 | 5083,"[2, 3, 6]" 20 | 5084,[6] 21 | 5085,"[0, 6, 8, 9]" 22 | 5086,"[2, 3, 5, 6, 7]" 23 | 5087,"[2, 3, 6]" 24 | 5088,"[0, 6, 8, 9]" 25 | 5089,"[1, 2, 3, 4, 5, 6, 7]" 26 | 5090,"[0, 3, 6, 8, 9]" 27 | 5091,[6] 28 | 5092,"[3, 5, 6, 7, 8]" 29 | 5093,"[0, 3, 6, 8, 9]" 30 | 5094,"[0, 6, 8, 9]" 31 | 5095,"[3, 4, 5, 6, 7, 8]" 32 | 5096,"[1, 4, 6, 7]" 33 | 5097,"[0, 2, 3, 6, 8, 9]" 34 | 5098,"[5, 6, 7, 8]" 35 | 5099,"[2, 3, 6]" 36 | 5100,"[5, 6]" 37 | 5101,"[1, 2, 3, 6, 7]" 38 | 5102,"[3, 4, 5, 6, 7, 8]" 39 | 5103,"[1, 3, 6, 8]" 40 | 5104,"[2, 3, 6]" 41 | 5105,"[0, 2, 6]" 42 | 5106,"[0, 3, 6, 8, 9]" 43 | 5107,"[0, 3, 6, 8]" 44 | 5108,"[0, 3, 6, 8, 9]" 45 | 5109,"[2, 5, 6, 7]" 46 | 5110,"[1, 2, 3, 4, 5, 6, 7]" 47 | 5111,"[2, 3, 6]" 48 | 5112,"[0, 3, 6, 8, 9]" 49 | 5113,"[2, 3, 6]" 50 | 5114,"[0, 2, 4, 6]" 51 | 5115,"[2, 3, 6]" 52 | 5116,"[2, 3, 5, 6, 7]" 53 | 5117,[6] 54 | 5118,"[2, 3, 6]" 55 | 5119,"[0, 4, 6, 9]" 56 | 5120,"[2, 5, 6, 7]" 57 | 5121,"[0, 1, 2, 3, 5, 6, 8, 9]" 58 | 5122,"[2, 3, 5, 6, 7]" 59 | 5123,"[1, 2, 3, 5, 6, 7]" 60 | 5124,"[2, 3, 6]" 61 | 5125,"[2, 3, 6]" 62 | 5126,"[2, 5, 6]" 63 | 5127,"[2, 3, 5, 6, 7]" 64 | 5128,"[2, 3, 6]" 65 | 5129,"[0, 3, 6, 8, 9]" 66 | 5130,"[3, 4, 5, 6, 7, 8]" 67 | 5131,"[1, 2, 3, 6, 7]" 68 | 5132,"[1, 2, 6, 7]" 69 | 5133,"[1, 2, 3, 6, 9]" 70 | 5134,"[2, 3, 4, 6, 7]" 71 | 5135,"[2, 3, 4, 6, 7]" 72 | 5136,[6] 73 | 5137,"[0, 2, 5, 6, 8, 9]" 74 | 5138,"[2, 3, 4, 6, 7]" 75 | 5139,"[2, 3, 6]" 76 | 5140,"[2, 3, 4, 5, 6, 7]" 77 | 5141,"[3, 4, 5, 6, 7]" 78 | 5142,"[0, 3, 6, 8, 9]" 79 | 5143,"[2, 3, 6]" 80 | 5144,"[3, 4, 6, 7, 8]" 81 | 5145,"[2, 3, 6]" 82 | 5146,"[0, 4, 6, 8, 9]" 83 | 5147,"[0, 3, 4, 6, 9]" 84 | 5148,"[0, 1, 4, 6, 8]" 85 | 5149,"[5, 6]" 86 | 5150,"[4, 6, 7, 8]" 87 | 5151,"[0, 6, 8, 9]" 88 | 5152,"[0, 6, 8, 9]" 89 | 5153,"[2, 3, 5, 6, 7]" 90 | 5154,"[2, 3, 6]" 91 | 5155,"[4, 5, 6, 7]" 92 | 5156,"[2, 4, 6, 7, 9]" 93 | 
5157,"[4, 6, 7, 8]" 94 | 5158,"[3, 6, 8, 9]" 95 | 5159,[6] 96 | 5160,"[0, 3, 6, 8, 9]" 97 | 5161,"[0, 3, 6, 8, 9]" 98 | 5162,"[0, 2, 3, 4, 5, 6]" 99 | 5163,"[2, 3, 6]" 100 | 5164,"[1, 2, 6, 7]" 101 | 5165,"[0, 2, 3, 6, 8, 9]" 102 | 5166,"[3, 6, 8, 9]" 103 | 5167,"[0, 3, 6, 8, 9]" 104 | 5168,"[5, 6, 7, 8]" 105 | 5169,"[2, 3, 6]" 106 | 5170,"[5, 6, 8]" 107 | 5171,"[0, 3, 6, 8]" 108 | 5172,"[0, 3, 4, 6, 8]" 109 | 5173,"[2, 3, 5, 6, 7]" 110 | 5174,"[2, 3, 6]" 111 | 5175,"[0, 3, 6, 8, 9]" 112 | 5176,"[1, 2, 3, 4, 5, 6, 7]" 113 | 5177,"[0, 6, 8, 9]" 114 | 5178,"[6, 8]" 115 | 5179,"[3, 4, 5, 6, 7]" 116 | 5180,"[1, 2, 6]" 117 | 5181,"[5, 6]" 118 | 5182,"[0, 3, 6, 8, 9]" 119 | 5183,"[1, 2, 3, 4, 5, 6, 7]" 120 | 5184,"[0, 6, 8, 9]" 121 | 5185,"[3, 5, 6, 7, 8]" 122 | 5186,"[3, 5, 6, 7, 8]" 123 | 5187,"[0, 3, 6, 8, 9]" 124 | 5188,"[0, 6, 8]" 125 | 5189,"[0, 3, 6, 8, 9]" 126 | 5190,"[0, 2, 3, 6, 9]" 127 | 5191,"[2, 3, 5, 6, 7]" 128 | 5192,"[2, 3, 6]" 129 | 5193,"[0, 4, 6]" 130 | 5194,"[6, 8]" 131 | 5195,"[2, 3, 6]" 132 | 5196,"[1, 2, 6, 7]" 133 | 5197,"[0, 3, 6, 8, 9]" 134 | 5198,"[0, 3, 6, 8, 9]" 135 | 5199,"[0, 6, 8, 9]" 136 | 5200,"[2, 5, 6]" 137 | 5201,"[2, 3, 5, 6, 7]" 138 | 5202,"[3, 5, 6, 7, 8, 9]" 139 | 5203,"[0, 4, 6, 8, 9]" 140 | 5204,"[0, 3, 6, 8, 9]" 141 | 5205,"[0, 3, 6, 8, 9]" 142 | 5206,"[3, 6, 9]" 143 | 5207,"[2, 3, 6]" 144 | 5208,"[0, 3, 6, 8, 9]" 145 | 5209,"[2, 3, 4, 6, 7]" 146 | 5210,"[0, 2, 3, 5, 6, 8, 9]" 147 | 5211,"[0, 6, 9]" 148 | 5212,"[0, 3, 6, 8, 9]" 149 | 5213,"[1, 4, 6, 7]" 150 | 5214,[6] 151 | 5215,[6] 152 | -------------------------------------------------------------------------------- /data_od_evaluation/optdigits_pca_gt_iforest.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 5066,"[2, 4, 6, 9]" 3 | 5067,"[0, 1, 2, 3, 6, 8]" 4 | 5068,"[1, 3, 4, 6, 7, 8]" 5 | 5069,"[0, 1, 2, 5, 6]" 6 | 5070,"[0, 2, 4, 6]" 7 | 5071,"[1, 3, 6]" 8 | 5072,"[0, 1, 2, 4, 6]" 9 | 5073,"[0, 2, 4, 6, 8]" 10 | 5074,[6] 11 | 5075,"[0, 1, 2, 3, 6, 8]" 12 | 5076,"[0, 2, 4, 6]" 13 | 5077,"[1, 2, 4, 6]" 14 | 5078,"[1, 2, 4, 6]" 15 | 5079,"[0, 1, 2, 3, 6, 8]" 16 | 5080,"[0, 1, 2, 3, 6, 8]" 17 | 5081,"[0, 2, 3, 6, 8]" 18 | 5082,"[0, 1, 2, 3, 6, 8]" 19 | 5083,"[0, 1, 2, 3, 6, 8]" 20 | 5084,"[1, 6]" 21 | 5085,"[0, 1, 2, 5, 6]" 22 | 5086,"[3, 6, 7]" 23 | 5087,"[2, 4, 6]" 24 | 5088,"[0, 6, 7, 8]" 25 | 5089,"[3, 6, 7]" 26 | 5090,"[0, 1, 2, 3, 6, 8]" 27 | 5091,"[1, 6]" 28 | 5092,"[0, 2, 6, 7, 8]" 29 | 5093,"[0, 2, 3, 5, 6]" 30 | 5094,"[0, 2, 6]" 31 | 5095,"[3, 6, 7]" 32 | 5096,"[4, 6, 7]" 33 | 5097,"[0, 2, 6, 8]" 34 | 5098,"[0, 2, 4, 6, 8]" 35 | 5099,"[2, 6]" 36 | 5100,"[1, 3, 6]" 37 | 5101,"[0, 1, 2, 3, 6, 8]" 38 | 5102,"[0, 3, 4, 6, 7, 8]" 39 | 5103,"[3, 6, 8]" 40 | 5104,"[0, 1, 2, 6]" 41 | 5105,"[0, 2, 4, 6]" 42 | 5106,"[0, 1, 2, 3, 6, 8]" 43 | 5107,"[6, 8]" 44 | 5108,"[2, 3, 5, 6, 8]" 45 | 5109,"[6, 9]" 46 | 5110,"[3, 6, 7]" 47 | 5111,"[0, 1, 2, 3, 6, 8]" 48 | 5112,"[0, 1, 2, 3, 6, 8]" 49 | 5113,"[0, 1, 2, 3, 6, 8]" 50 | 5114,"[0, 1, 2, 6]" 51 | 5115,"[0, 1, 2, 3, 6, 8]" 52 | 5116,"[0, 1, 2, 3, 6, 8]" 53 | 5117,"[0, 6, 9]" 54 | 5118,"[0, 1, 2, 3, 6, 8]" 55 | 5119,"[0, 2, 3, 6, 8, 9]" 56 | 5120,"[6, 9]" 57 | 5121,"[0, 1, 2, 3, 6, 8]" 58 | 5122,"[0, 2, 4, 6]" 59 | 5123,"[1, 6]" 60 | 5124,"[0, 1, 2, 6]" 61 | 5125,"[0, 1, 2, 3, 6, 8]" 62 | 5126,"[0, 1, 2, 4, 6]" 63 | 5127,"[1, 2, 6, 7]" 64 | 5128,"[0, 2, 3, 4, 6]" 65 | 5129,"[6, 8]" 66 | 5130,"[1, 3, 4, 6, 7, 8]" 67 | 5131,"[1, 2, 3, 6]" 68 | 5132,"[3, 4, 6, 7]" 69 | 5133,"[1, 2, 3, 6]" 70 | 
5134,"[0, 1, 2, 3, 6, 8]" 71 | 5135,"[2, 3, 4, 5, 6]" 72 | 5136,"[1, 4, 6]" 73 | 5137,"[0, 2, 3, 5, 6]" 74 | 5138,"[2, 3, 4, 5, 6]" 75 | 5139,"[0, 1, 2, 3, 6, 8]" 76 | 5140,"[6, 9]" 77 | 5141,"[3, 4, 6, 7]" 78 | 5142,"[0, 1, 2, 3, 6, 8]" 79 | 5143,"[2, 6, 8, 9]" 80 | 5144,"[0, 3, 4, 6, 7, 8]" 81 | 5145,"[2, 6]" 82 | 5146,"[2, 4, 6, 9]" 83 | 5147,"[0, 1, 2, 4, 6]" 84 | 5148,"[0, 1, 6, 8]" 85 | 5149,"[1, 3, 6]" 86 | 5150,"[0, 3, 4, 6, 7]" 87 | 5151,"[1, 3, 6]" 88 | 5152,"[0, 2, 6, 8]" 89 | 5153,"[6, 8]" 90 | 5154,"[0, 2, 6]" 91 | 5155,"[4, 6, 7]" 92 | 5156,"[1, 2, 6, 7]" 93 | 5157,"[0, 3, 4, 6, 7]" 94 | 5158,"[1, 2, 3, 6, 7, 8, 9]" 95 | 5159,"[0, 1, 2, 5, 6]" 96 | 5160,"[0, 1, 2, 3, 6, 8]" 97 | 5161,"[0, 1, 2, 3, 6, 8]" 98 | 5162,"[0, 2, 3, 5, 6]" 99 | 5163,"[3, 6]" 100 | 5164,"[3, 4, 6, 7]" 101 | 5165,"[0, 2, 6, 8]" 102 | 5166,"[1, 2, 3, 6, 7, 8, 9]" 103 | 5167,"[0, 2, 4, 6]" 104 | 5168,"[5, 6, 9]" 105 | 5169,"[0, 2, 4, 6]" 106 | 5170,"[6, 8]" 107 | 5171,"[0, 3, 6, 8]" 108 | 5172,"[0, 3, 4, 6]" 109 | 5173,"[3, 5, 6]" 110 | 5174,"[0, 2, 4, 6]" 111 | 5175,"[0, 1, 2, 3, 6, 8]" 112 | 5176,"[3, 5, 6, 7]" 113 | 5177,"[0, 1, 2, 5, 6]" 114 | 5178,"[1, 2, 6]" 115 | 5179,"[3, 6, 7]" 116 | 5180,"[1, 3, 6]" 117 | 5181,"[1, 3, 6]" 118 | 5182,"[3, 6, 9]" 119 | 5183,"[0, 2, 4, 6]" 120 | 5184,"[0, 1, 2, 6, 9]" 121 | 5185,"[0, 3, 5, 6, 8, 9]" 122 | 5186,"[1, 3, 5, 6, 7, 8]" 123 | 5187,"[0, 1, 2, 3, 6, 8]" 124 | 5188,"[0, 2, 4, 6, 8]" 125 | 5189,"[0, 2, 4, 6, 8]" 126 | 5190,"[0, 1, 2, 4, 6]" 127 | 5191,"[0, 2, 3, 5, 6, 7]" 128 | 5192,"[0, 1, 2, 3, 6, 8]" 129 | 5193,"[0, 1, 4, 6]" 130 | 5194,"[1, 2, 6]" 131 | 5195,"[0, 1, 2, 3, 6, 8]" 132 | 5196,"[1, 2, 6, 7]" 133 | 5197,"[0, 3, 6, 7, 8, 9]" 134 | 5198,"[6, 8]" 135 | 5199,"[0, 6, 8]" 136 | 5200,"[0, 1, 2, 4, 6]" 137 | 5201,"[3, 4, 6, 7]" 138 | 5202,"[1, 2, 3, 6, 7, 8, 9]" 139 | 5203,"[2, 4, 6, 9]" 140 | 5204,"[1, 3, 6, 8, 9]" 141 | 5205,"[0, 1, 2, 3, 6, 8]" 142 | 5206,"[0, 3, 4, 6, 9]" 143 | 5207,"[0, 1, 2, 3, 6, 8]" 144 | 5208,"[0, 2, 3, 4, 6, 8, 9]" 145 | 5209,"[0, 2, 3, 6, 7]" 146 | 5210,"[0, 2, 3, 5, 8, 9]" 147 | 5211,"[0, 2, 4, 6]" 148 | 5212,"[2, 6, 8, 9]" 149 | 5213,"[6, 9]" 150 | 5214,"[1, 6]" 151 | 5215,[6] 152 | -------------------------------------------------------------------------------- /model_iml/SHAP.py: -------------------------------------------------------------------------------- 1 | import shap 2 | import math 3 | import random 4 | import sklearn 5 | import numpy as np 6 | 7 | 8 | class SHAP: 9 | def __init__(self, kernel="rbf", n_sample=100, threshold=0.8): 10 | """ 11 | 12 | :param kernel: clf model svm parameter 13 | :param threshold: threshold is used to filter feature subset for each data, the shap values of selected feature 14 | subspace accounts for [threshold] of the sum of the shap values of feature full space. 
15 | """ 16 | self.ano_idx = None 17 | 18 | self.kernel = kernel 19 | self.threshold = threshold 20 | self.n_sample = n_sample 21 | self.dim = None 22 | return 23 | 24 | def fit(self, x, y): 25 | 26 | self.dim = x.shape[1] 27 | 28 | # metric_lst = [] 29 | # clf_lst = [] 30 | # for model in classifiers.keys(): 31 | # clf = classifiers[model] 32 | # clf.fit(x, y) 33 | # y_pred = clf.predict(x) 34 | # clf_lst.append(clf) 35 | # metric_lst.append(sklearn.metrics.f1_score(y, y_pred)) 36 | # choose_idx = int(np.argmax(metric_lst)) 37 | # clf = clf_lst[choose_idx] 38 | # print("Choosing Clf: [%s]" % list(classifiers.keys())[choose_idx]) 39 | 40 | clf = sklearn.svm.SVC(kernel=self.kernel, probability=True) 41 | clf.fit(x, y) 42 | 43 | y_pred = clf.predict(x) 44 | print("Clf model accuracy: [{:.4f}]".format(sklearn.metrics.accuracy_score(y, y_pred))) 45 | 46 | self.ano_idx = np.where(y == 1)[0] 47 | 48 | # use Kernel SHAP to explain test set predictions 49 | # As instructed by SHAP, Using many background data samples could cause slower run times. 50 | # we use shap.kmeans(data, K) to summarize the background as 100 samples. 51 | 52 | x_kmean = shap.kmeans(x, self.n_sample) 53 | explainer = shap.KernelExplainer(clf.predict_proba, x_kmean, link="logit") 54 | anomaly_shap_values = explainer.shap_values(x[self.ano_idx], nsamples="auto") 55 | 56 | anomaly_shap_values = anomaly_shap_values[1] 57 | return anomaly_shap_values 58 | 59 | def weight2subspace(self, weight, r=0.7, num=-1): 60 | threshold = r * np.sum(weight) 61 | tmp_s = 0 62 | exp_subspace = [] 63 | sorted_idx1 = np.argsort(weight) 64 | sorted_idx = [sorted_idx1[self.dim - i -1] for i in range(self.dim)] 65 | if num != -1: 66 | exp_subspace = sorted_idx[:num] 67 | exp_subspace = list(np.sort(exp_subspace)) 68 | return exp_subspace 69 | 70 | for idx in sorted_idx: 71 | tmp_s += weight[idx] 72 | exp_subspace.append(idx) 73 | if tmp_s >= threshold: 74 | break 75 | exp_subspace = list(np.sort(exp_subspace)) 76 | return exp_subspace 77 | 78 | def weight2subspace_pn(self, weight): 79 | exp_subspace = [] 80 | for i in range(len(weight)): 81 | if weight[i] > 0: 82 | exp_subspace.append(i) 83 | exp_subspace = list(np.sort(exp_subspace)) 84 | return exp_subspace 85 | 86 | def get_exp_subspace(self, fea_weight_lst, w2s_ratio, real_exp_len=None): 87 | exp_subspace_lst = [] 88 | for ii, idx in enumerate(self.ano_idx): 89 | fea_weight = fea_weight_lst[ii] 90 | if w2s_ratio == "real_len": 91 | exp_subspace_lst.append(self.weight2subspace(fea_weight, num=real_exp_len[ii])) 92 | elif w2s_ratio == "auto": 93 | r = math.sqrt(2 / self.dim) 94 | exp_subspace_lst.append(self.weight2subspace(fea_weight, r=r)) 95 | elif w2s_ratio == "pn": 96 | exp_subspace_lst.append(self.weight2subspace_pn(fea_weight)) 97 | else: 98 | exp_subspace_lst.append(self.weight2subspace(fea_weight, r=w2s_ratio)) 99 | return exp_subspace_lst 100 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # ------------------- path of datasets and annotations ----------------- # 2 | root = '' 3 | eva_root = '' 4 | 5 | 6 | def get_parser(algorithm_name, parser): 7 | if algorithm_name == "aton": 8 | parser.add_argument('--nbrs_num', type=int, default=30, help='') 9 | parser.add_argument('--rand_num', type=int, default=30, help='') 10 | parser.add_argument('--alpha1', type=float, default=0.8, help='triplet loss factor in loss function') 11 | parser.add_argument('--alpha2', 
type=float, default=0.2, help='dis loss factor in loss function') 12 | parser.add_argument('--n_epoch', type=int, default=10, help='') 13 | parser.add_argument('--batch_size', type=int, default=512, help='') 14 | parser.add_argument('--lr', type=float, default=0.1, help='') 15 | parser.add_argument('--n_linear', type=int, default=64, help='') 16 | parser.add_argument('--margin', type=float, default=5., help='') 17 | elif algorithm_name == "shap": 18 | parser.add_argument('--kernel', type=str, default='rbf', help='') 19 | parser.add_argument("--n_sample", type=int, default=100, help='') 20 | parser.add_argument("--threshold", type=int, default=-1, help='') 21 | elif algorithm_name == "lime": 22 | parser.add_argument('--discretize_continuous', type=bool, default=True, help='') 23 | parser.add_argument("--discretizer", type=str, default="quartile", help='') 24 | elif algorithm_name == "intgrad": 25 | parser.add_argument('--n_steps', type=int, default=40, help='') 26 | parser.add_argument('--method', type=str, default="gausslegendre", help='') 27 | elif algorithm_name == "coin": 28 | parser.add_argument('--AUG', type=float, default=10, help='an additional attribute value as augmentation') 29 | parser.add_argument('--ratio_nbr', type=float, default=0.08, 30 | help='controls number of neighbors to use in kneighbors queries') 31 | parser.add_argument('--MIN_CLUSTER_SIZE', type=int, default=5, 32 | help='minimum number of samples required in a cluster') 33 | parser.add_argument('--MAX_NUM_CLUSTER', type=int, default=4, 34 | help='maximum number of clusters for each context') 35 | parser.add_argument('--VAL_TIMES', type=int, default=10, 36 | help='number of iterations for computing prediction strength') 37 | parser.add_argument('--C_SVM', type=float, default=1., help='penalty parameter for svm') 38 | parser.add_argument('--DEFK', type=int, default=0, 39 | help='pre-determined number of clusters in each context (use prediction strength if 0)') 40 | parser.add_argument('--THRE_PS', type=float, default=0.85, 41 | help='threshold for deciding the best cluster value in prediction strength') 42 | elif algorithm_name == "aton_ablation" or algorithm_name == "aton_ablation2" or algorithm_name == "aton_ablation3": 43 | parser.add_argument('--nbrs_num', type=int, default=30, help='') 44 | parser.add_argument('--rand_num', type=int, default=30, help='') 45 | parser.add_argument('--n_epoch', type=int, default=10, help='') 46 | parser.add_argument('--batch_size', type=int, default=64, help='') 47 | parser.add_argument('--lr', type=float, default=0.1, help='') 48 | parser.add_argument('--n_linear', type=int, default=64, help='') 49 | parser.add_argument('--margin', type=float, default=5., help='') 50 | elif algorithm_name == "sinne": 51 | parser.add_argument('--max_level', default='full', help='') 52 | parser.add_argument("--width", type=int, default=10, help='') 53 | parser.add_argument("--ensemble_num", type=int, default=100, help='') 54 | parser.add_argument("--sample_num", type=int, default=8, help='') 55 | parser.add_argument("--pretrain", type=bool, default=False, help='') 56 | parser.add_argument("--verbose", type=bool, default=False, help='') 57 | else: 58 | raise NotImplementedError("not supported algorithm") 59 | return parser 60 | 61 | -------------------------------------------------------------------------------- /data_od_evaluation/wineQualityWhites-od2_gt_copod.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 46,"[1, 4, 5, 8]" 3 | 
98,"[0, 2, 5, 8]" 4 | 115,"[1, 2, 4, 5, 8]" 5 | 147,"[1, 4, 8]" 6 | 172,"[1, 5, 8]" 7 | 176,"[0, 6, 8, 10]" 8 | 178,"[1, 2, 4, 10]" 9 | 189,"[3, 6, 9, 10]" 10 | 204,"[0, 2, 4, 8, 9]" 11 | 207,"[0, 2]" 12 | 230,"[1, 2, 4, 8]" 13 | 250,"[8, 9]" 14 | 251,"[0, 3, 4, 6, 8]" 15 | 253,"[2, 5, 8]" 16 | 259,"[3, 5, 10]" 17 | 278,"[1, 6, 8]" 18 | 282,"[1, 6, 9]" 19 | 294,"[0, 1, 4, 10]" 20 | 433,"[1, 2, 4]" 21 | 445,"[0, 3, 6, 8, 10]" 22 | 496,"[1, 2, 5]" 23 | 499,"[1, 2, 5]" 24 | 526,"[5, 10]" 25 | 540,"[3, 6, 8, 9, 10]" 26 | 626,"[0, 1, 2, 4, 8]" 27 | 641,"[4, 6, 7, 8, 10]" 28 | 646,"[2, 4, 5, 9, 10]" 29 | 659,"[2, 5]" 30 | 662,"[1, 4, 8]" 31 | 687,"[1, 4]" 32 | 690,"[8, 9]" 33 | 702,"[1, 2]" 34 | 740,"[4, 6, 8, 10]" 35 | 780,"[1, 2, 5, 8]" 36 | 831,"[6, 9, 10]" 37 | 873,"[0, 6, 8]" 38 | 905,"[0, 2, 4, 5, 9]" 39 | 906,"[0, 2, 4, 5, 9]" 40 | 908,"[0, 4, 7, 8]" 41 | 914,"[0, 1, 4, 5, 8]" 42 | 948,"[1, 6, 8]" 43 | 991,"[0, 2, 5, 9]" 44 | 993,"[2, 5, 8]" 45 | 1027,"[0, 1, 2, 6, 9]" 46 | 1029,"[1, 6]" 47 | 1034,"[1, 2, 4, 6]" 48 | 1040,"[1, 5, 9]" 49 | 1042,[1] 50 | 1053,"[0, 2, 4, 9]" 51 | 1059,"[0, 2, 4, 5, 8, 10]" 52 | 1109,"[0, 4]" 53 | 1114,"[1, 6, 8]" 54 | 1152,"[1, 2]" 55 | 1154,"[0, 6, 8, 10]" 56 | 1155,"[0, 6, 8]" 57 | 1229,"[0, 10]" 58 | 1245,"[1, 2, 6, 9]" 59 | 1293,"[1, 5, 9]" 60 | 1294,"[1, 5, 9]" 61 | 1349,"[0, 3, 6, 8, 9]" 62 | 1363,"[1, 2, 4, 5]" 63 | 1405,"[1, 4, 5, 9]" 64 | 1417,"[1, 6]" 65 | 1420,"[0, 2, 7]" 66 | 1423,"[0, 2, 4, 6]" 67 | 1430,"[1, 5, 9]" 68 | 1474,"[4, 6, 8]" 69 | 1483,"[4, 6, 8]" 70 | 1484,"[0, 4, 5, 10]" 71 | 1541,"[1, 6, 9]" 72 | 1558,"[1, 2, 5, 9]" 73 | 1559,"[0, 2, 6, 8, 9]" 74 | 1574,"[0, 2, 4, 8, 9]" 75 | 1577,"[1, 2]" 76 | 1579,"[2, 4, 5, 6, 7, 9, 10]" 77 | 1649,"[2, 8]" 78 | 1652,"[0, 2, 3]" 79 | 1664,"[0, 4, 7]" 80 | 1688,"[5, 8, 10]" 81 | 1690,"[0, 1, 4, 5]" 82 | 1702,"[5, 10]" 83 | 1708,"[1, 8]" 84 | 1718,"[0, 10]" 85 | 1739,"[0, 3, 5, 8, 10]" 86 | 1781,"[4, 6, 8]" 87 | 1817,"[1, 2, 5]" 88 | 1856,"[0, 1, 4]" 89 | 1924,"[0, 2, 4, 5, 9]" 90 | 1931,"[1, 5]" 91 | 1951,"[0, 1]" 92 | 1990,"[1, 5, 8]" 93 | 2050,"[0, 7]" 94 | 2079,"[1, 8]" 95 | 2116,"[3, 6, 9, 10]" 96 | 2119,"[0, 2, 4, 5, 9]" 97 | 2154,"[0, 1, 2, 4]" 98 | 2156,"[0, 5, 9, 10]" 99 | 2159,"[2, 4, 5, 6, 7, 8, 9, 10]" 100 | 2225,"[0, 3, 4]" 101 | 2237,"[3, 4, 6]" 102 | 2246,"[3, 4, 6]" 103 | 2275,"[3, 4, 6]" 104 | 2318,"[0, 2, 5, 7, 10]" 105 | 2337,"[2, 4, 5, 9, 10]" 106 | 2346,"[0, 2, 5, 7, 10]" 107 | 2372,"[2, 5, 8, 10]" 108 | 2373,"[1, 2, 5]" 109 | 2379,"[4, 8, 9]" 110 | 2380,"[4, 8, 9]" 111 | 2386,"[4, 6, 8]" 112 | 2387,"[4, 6, 8]" 113 | 2388,"[0, 3, 5, 10]" 114 | 2400,"[0, 2, 5, 9]" 115 | 2401,"[0, 2, 9]" 116 | 2409,"[8, 9]" 117 | 2412,"[4, 6, 7, 8, 9]" 118 | 2413,"[8, 9]" 119 | 2414,"[4, 8, 9]" 120 | 2435,"[1, 2, 5]" 121 | 2493,"[0, 4, 7]" 122 | 2494,"[0, 4, 7]" 123 | 2502,"[0, 3, 5]" 124 | 2503,"[0, 3, 5]" 125 | 2531,"[1, 5]" 126 | 2532,"[1, 5]" 127 | 2589,"[1, 2, 4]" 128 | 2656,"[2, 5, 9]" 129 | 2818,"[1, 2, 5, 8]" 130 | 2888,"[0, 3, 5]" 131 | 2920,"[0, 2, 3, 5, 8, 10]" 132 | 2935,"[2, 4, 5, 9, 10]" 133 | 3021,"[5, 8, 10]" 134 | 3050,"[2, 5, 8]" 135 | 3067,"[0, 2, 3, 10]" 136 | 3087,"[3, 6]" 137 | 3109,"[4, 6, 8]" 138 | 3179,"[0, 2, 4, 8, 9]" 139 | 3186,"[0, 2, 5, 9]" 140 | 3218,"[4, 8]" 141 | 3265,"[0, 5, 10]" 142 | 3275,"[1, 2]" 143 | 3307,"[0, 5]" 144 | 3409,"[3, 8]" 145 | 3417,"[1, 5, 10]" 146 | 3528,"[1, 6, 9]" 147 | 3559,"[0, 6, 8, 10]" 148 | 3571,"[1, 2, 6, 8]" 149 | 3578,"[5, 10]" 150 | 3650,"[2, 5, 9]" 151 | 3662,"[1, 5, 8]" 152 | 3714,"[0, 8]" 153 | 3736,"[2, 4, 5, 9, 10]" 154 | 
3770,"[4, 9, 10]" 155 | 3810,"[3, 4, 6]" 156 | 3872,"[0, 2, 5, 7, 10]" 157 | 3879,"[1, 6, 9]" 158 | 3901,"[4, 6, 10]" 159 | 3933,"[0, 6, 8, 10]" 160 | 3965,"[0, 1, 2, 4, 9]" 161 | 3967,"[2, 4, 6, 9]" 162 | 3973,"[2, 4, 6, 9]" 163 | 4020,"[2, 4, 5, 10]" 164 | 4039,[1] 165 | 4074,"[1, 2, 4, 5]" 166 | 4212,"[4, 6, 7, 8, 10]" 167 | 4213,"[1, 2, 4, 5]" 168 | 4217,"[1, 2, 4]" 169 | 4222,"[1, 2, 4]" 170 | 4223,"[1, 6, 8]" 171 | 4253,"[0, 1, 2, 5, 8]" 172 | 4278,"[0, 1, 5, 6]" 173 | 4389,"[3, 9, 10]" 174 | 4483,"[0, 2, 4, 5, 9]" 175 | 4508,"[5, 10]" 176 | 4609,"[1, 2, 5]" 177 | 4680,"[1, 2, 4]" 178 | 4686,"[1, 2, 4]" 179 | 4745,[5] 180 | 4774,"[5, 10]" 181 | 4779,"[1, 2]" 182 | 4804,"[2, 3, 5, 10]" 183 | 4839,"[0, 6, 8, 10]" 184 | 4878,"[1, 2, 5]" 185 | -------------------------------------------------------------------------------- /data_od_evaluation/pima_gt_hbos.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 0,[7] 3 | 2,[1] 4 | 4,"[2, 6]" 5 | 6,"[1, 4]" 6 | 8,"[1, 4]" 7 | 9,"[1, 5, 7]" 8 | 11,"[0, 1, 7]" 9 | 13,[4] 10 | 14,"[1, 4]" 11 | 15,[2] 12 | 16,[4] 13 | 17,[0] 14 | 19,[4] 15 | 22,[1] 16 | 23,[0] 17 | 24,"[0, 1, 2, 4]" 18 | 25,"[0, 4, 7]" 19 | 26,"[1, 7]" 20 | 31,[4] 21 | 37,[7] 22 | 38,[3] 23 | 39,"[3, 4, 6, 7]" 24 | 43,"[2, 4]" 25 | 45,"[1, 6]" 26 | 48,[0] 27 | 53,"[4, 7]" 28 | 56,"[1, 4]" 29 | 61,[7] 30 | 64,[7] 31 | 66,"[2, 3, 7]" 32 | 70,[4] 33 | 72,"[0, 1, 2]" 34 | 78,[2] 35 | 84,"[2, 5]" 36 | 88,"[0, 4]" 37 | 93,[7] 38 | 99,"[3, 4]" 39 | 100,"[1, 6, 7]" 40 | 109,"[1, 3]" 41 | 110,"[1, 4]" 42 | 111,[4] 43 | 114,"[1, 4]" 44 | 115,[7] 45 | 116,[7] 46 | 120,"[1, 3, 4]" 47 | 124,[2] 48 | 125,"[1, 2, 3, 5]" 49 | 128,"[2, 3, 4, 7]" 50 | 129,[7] 51 | 130,"[1, 4]" 52 | 131,"[0, 2, 6]" 53 | 132,"[1, 4]" 54 | 143,"[0, 7]" 55 | 152,"[0, 2, 3, 4, 5, 6, 7]" 56 | 154,"[1, 5]" 57 | 155,"[1, 3, 5, 7]" 58 | 159,[0] 59 | 164,[2] 60 | 165,"[0, 2, 3, 4, 7]" 61 | 170,"[0, 7]" 62 | 171,"[0, 4]" 63 | 175,"[1, 4]" 64 | 177,[5] 65 | 179,[7] 66 | 185,[1] 67 | 186,"[1, 4, 7]" 68 | 187,"[2, 3]" 69 | 188,[4] 70 | 189,[4] 71 | 192,"[1, 7]" 72 | 193,"[0, 5]" 73 | 195,[4] 74 | 197,[3] 75 | 198,"[3, 4]" 76 | 199,[4] 77 | 206,"[1, 4]" 78 | 207,"[1, 2, 7]" 79 | 209,[1] 80 | 213,"[1, 4, 5]" 81 | 214,[4] 82 | 215,[4] 83 | 216,"[3, 4]" 84 | 218,"[3, 6, 7]" 85 | 219,[7] 86 | 220,[4] 87 | 221,[7] 88 | 227,[1] 89 | 230,"[1, 2, 5]" 90 | 231,[4] 91 | 235,[1] 92 | 236,"[1, 4]" 93 | 237,"[1, 2, 5]" 94 | 238,[1] 95 | 242,[2] 96 | 243,"[0, 2, 3, 4, 6]" 97 | 245,"[0, 1, 3, 6, 7]" 98 | 254,[4] 99 | 255,[3] 100 | 259,"[0, 6]" 101 | 261,[2] 102 | 264,[7] 103 | 266,[2] 104 | 269,[2] 105 | 270,"[0, 2, 3, 5, 6, 7]" 106 | 276,"[0, 2, 3]" 107 | 280,[1] 108 | 283,"[1, 2, 7]" 109 | 284,[7] 110 | 287,[4] 111 | 291,"[3, 6]" 112 | 292,[4] 113 | 293,"[2, 3, 4]" 114 | 296,[4] 115 | 298,"[0, 4, 7]" 116 | 300,[2] 117 | 301,"[1, 4]" 118 | 303,"[2, 5]" 119 | 306,"[1, 4, 7]" 120 | 308,[4] 121 | 309,[4] 122 | 312,"[1, 4]" 123 | 314,"[0, 3, 5, 6, 7]" 124 | 317,[1] 125 | 319,"[1, 7]" 126 | 321,"[0, 2, 3]" 127 | 322,[7] 128 | 323,"[0, 2, 3]" 129 | 326,[4] 130 | 328,"[4, 5]" 131 | 332,"[1, 2]" 132 | 337,[7] 133 | 338,[4] 134 | 339,[1] 135 | 349,[1] 136 | 355,"[1, 2, 7]" 137 | 356,"[2, 3, 4]" 138 | 357,"[0, 2, 3]" 139 | 359,"[1, 4]" 140 | 360,"[1, 4]" 141 | 363,[7] 142 | 366,[0] 143 | 369,[2] 144 | 370,[6] 145 | 375,"[4, 7]" 146 | 378,[5] 147 | 386,[7] 148 | 387,[2] 149 | 388,"[4, 7]" 150 | 391,[1] 151 | 394,"[0, 1, 2, 6]" 152 | 397,[3] 153 | 399,[1] 154 | 400,[7] 155 | 402,"[3, 4, 7]" 
156 | 404,[1] 157 | 406,[7] 158 | 408,"[0, 1, 5, 6, 7]" 159 | 409,[4] 160 | 414,[4] 161 | 415,[4] 162 | 417,"[1, 7]" 163 | 419,[4] 164 | 424,[4] 165 | 425,"[1, 4]" 166 | 427,"[1, 4]" 167 | 429,[4] 168 | 435,[2] 169 | 440,[2] 170 | 443,"[0, 6]" 171 | 444,[3] 172 | 445,"[1, 5]" 173 | 448,[3] 174 | 451,[1] 175 | 455,"[0, 1]" 176 | 458,"[4, 7]" 177 | 468,[2] 178 | 476,[4] 179 | 480,[4] 180 | 484,[2] 181 | 485,[4] 182 | 493,"[3, 4, 6, 7]" 183 | 498,"[1, 4, 7]" 184 | 502,[1] 185 | 506,"[1, 4]" 186 | 510,"[0, 7]" 187 | 515,"[1, 4]" 188 | 516,"[4, 7]" 189 | 523,"[0, 7]" 190 | 535,[2] 191 | 539,"[2, 3, 4]" 192 | 540,[4] 193 | 541,[4] 194 | 542,[7] 195 | 545,"[1, 4]" 196 | 546,"[1, 4, 7]" 197 | 560,[7] 198 | 561,"[1, 4]" 199 | 569,[4] 200 | 577,[5] 201 | 579,[3] 202 | 580,"[1, 2, 3]" 203 | 584,[4] 204 | 586,"[1, 7]" 205 | 588,"[1, 4, 7]" 206 | 590,"[0, 2, 3]" 207 | 592,[7] 208 | 595,"[1, 4]" 209 | 598,[1] 210 | 603,"[4, 7]" 211 | 604,"[1, 2]" 212 | 606,"[1, 4]" 213 | 611,"[1, 4]" 214 | 612,[4] 215 | 614,"[0, 4, 7]" 216 | 618,"[0, 2, 3, 6, 7]" 217 | 619,[2] 218 | 630,[7] 219 | 635,[0] 220 | 638,"[0, 4, 5, 6]" 221 | 642,[7] 222 | 646,"[1, 4]" 223 | 647,"[1, 4]" 224 | 648,"[0, 4]" 225 | 655,[4] 226 | 659,[6] 227 | 661,"[1, 2, 3, 6]" 228 | 662,"[1, 2, 3]" 229 | 663,"[0, 2, 3, 4, 7]" 230 | 664,"[0, 2, 3, 7]" 231 | 666,[7] 232 | 667,"[0, 7]" 233 | 675,[1] 234 | 676,[7] 235 | 678,[2] 236 | 681,"[1, 5]" 237 | 683,[0] 238 | 689,"[1, 3, 4, 5, 7]" 239 | 691,"[0, 1, 2]" 240 | 693,"[3, 4, 7]" 241 | 695,[4] 242 | 696,"[1, 4]" 243 | 701,[7] 244 | 702,"[1, 2, 3, 7]" 245 | 706,[5] 246 | 708,"[0, 1, 7]" 247 | 709,[4] 248 | 712,"[0, 5, 7]" 249 | 715,"[1, 4]" 250 | 716,"[1, 4]" 251 | 719,[7] 252 | 722,"[4, 7]" 253 | 730,[7] 254 | 731,[2] 255 | 732,"[1, 4]" 256 | 739,[7] 257 | 740,"[0, 4, 5, 7]" 258 | 743,"[0, 1, 2, 7]" 259 | 746,[5] 260 | 748,"[1, 4]" 261 | 749,"[1, 7]" 262 | 750,"[0, 6]" 263 | 753,"[1, 4]" 264 | 754,"[1, 7]" 265 | 755,"[3, 4, 6, 7]" 266 | 757,[7] 267 | 759,"[1, 7]" 268 | 761,"[1, 3, 5, 7]" 269 | 766,[7] 270 | -------------------------------------------------------------------------------- /model_aton/ATON_ablation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time, math 3 | import torch 4 | import torch.utils.data as Data 5 | import torch.optim as optim 6 | from torch.optim import lr_scheduler 7 | from tqdm import tqdm 8 | 9 | from model_aton.utils import EarlyStopping, min_max_normalize 10 | from model_aton.datasets import MyHardSingleTripletSelector 11 | from model_aton.datasets import SingleTripletDataset 12 | from model_aton.networks import ATONablanet 13 | 14 | 15 | class ATONabla: 16 | """ 17 | ablated version that removes self-attention mechanism 18 | """ 19 | def __init__(self, nbrs_num=30, rand_num=30, 20 | n_epoch=10, batch_size=64, lr=0.1, n_linear=64, margin=5., 21 | verbose=True): 22 | self.verbose = verbose 23 | 24 | self.x = None 25 | self.y = None 26 | self.ano_idx = None 27 | self.dim = None 28 | 29 | self.reason_map = {} 30 | 31 | cuda = torch.cuda.is_available() 32 | self.device = torch.device("cuda" if cuda else "cpu") 33 | if cuda: 34 | torch.cuda.set_device(0) 35 | 36 | self.nbrs_num = nbrs_num 37 | self.rand_num = rand_num 38 | 39 | self.n_epoch = n_epoch 40 | self.batch_size = batch_size 41 | self.lr = lr 42 | self.n_linear = n_linear 43 | self.margin = margin 44 | return 45 | 46 | def fit(self, x, y): 47 | device = self.device 48 | 49 | self.dim = x.shape[1] 50 | x = min_max_normalize(x) 51 | self.ano_idx 
= np.where(y == 1)[0] 52 | 53 | self.x = torch.tensor(x, dtype=torch.float32).to(device) 54 | self.y = torch.tensor(y, dtype=torch.int64).to(device) 55 | 56 | W_lst = [] 57 | if self.verbose: 58 | iterator = range(len(self.ano_idx)) 59 | else: 60 | iterator = tqdm(range(len(self.ano_idx))) 61 | for ii in iterator: 62 | idx = self.ano_idx[ii] 63 | 64 | s_t = time.time() 65 | W = self.interpret_ano(idx) 66 | W_lst.append(W) 67 | if self.verbose: 68 | print("Ano_id:[{}], ({}/{}) \t time: {:.2f}s\n".format( 69 | idx, (ii + 1), len(self.ano_idx), (time.time() - s_t))) 70 | 71 | fea_weight_lst = [] 72 | for ii, idx in enumerate(self.ano_idx): 73 | w = W_lst[ii] 74 | fea_weight = np.zeros(self.dim) 75 | for j in range(len(w)): 76 | fea_weight += abs(w[j]) 77 | fea_weight_lst.append(fea_weight) 78 | return fea_weight_lst 79 | 80 | def interpret_ano(self, idx): 81 | device = self.device 82 | dim = self.dim 83 | 84 | data_loader, test_loader = self.prepare_triplets(idx) 85 | n_linear = self.n_linear 86 | model = ATONablanet(n_feature=dim, n_linear=n_linear) 87 | model.to(device) 88 | 89 | optimizer = optim.Adam(model.parameters(), lr=self.lr, weight_decay=1e-2) 90 | criterion_tml = torch.nn.TripletMarginLoss(margin=self.margin, p=2) 91 | 92 | scheduler = lr_scheduler.StepLR(optimizer, 5, gamma=0.1) 93 | early_stp = EarlyStopping(patience=3, verbose=False) 94 | 95 | for epoch in range(self.n_epoch): 96 | model.train() 97 | total_loss = 0 98 | es_time = time.time() 99 | 100 | batch_cnt = 0 101 | for anchor, pos, neg in data_loader: 102 | anchor, pos, neg = anchor.to(device), pos.to(device), neg.to(device) 103 | embed_anchor, embed_pos, embed_neg = model(anchor, pos, neg) 104 | loss = criterion_tml(embed_anchor, embed_pos, embed_neg) 105 | 106 | total_loss += loss.item()  # accumulate a plain float rather than the autograd graph 107 | 108 | optimizer.zero_grad() 109 | loss.backward() 110 | optimizer.step() 111 | batch_cnt += 1 112 | 113 | train_loss = total_loss / batch_cnt 114 | est = time.time() - es_time 115 | if (epoch + 1) % 1 == 0 and self.verbose: 116 | message = 'Epoch: [{:02}/{:02}] loss: {:.4f} Time: {:.2f}s'.format( 117 | epoch + 1, self.n_epoch, 118 | train_loss, est) 119 | print(message) 120 | scheduler.step() 121 | 122 | early_stp(train_loss, model) 123 | if early_stp.early_stop: 124 | model.load_state_dict(torch.load(early_stp.path)) 125 | if self.verbose: 126 | print("early stopping") 127 | break 128 | 129 | W = model.linear.weight.data.cpu().numpy() 130 | return W 131 | 132 | def prepare_triplets(self, idx): 133 | x = self.x 134 | y = self.y 135 | selector = MyHardSingleTripletSelector(nbrs_num=self.nbrs_num, rand_num=self.rand_num) 136 | dataset = SingleTripletDataset(idx, x, y, triplets_selector=selector) 137 | data_loader = Data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True) 138 | test_loader = Data.DataLoader(dataset, batch_size=len(dataset)) 139 | return data_loader, test_loader 140 | 141 | -------------------------------------------------------------------------------- /main2.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script runs the outlier interpretation methods SiNNE and Anchor. 3 | These methods directly use a feature subspace as the interpretation. 4 | 5 | @ Author: Hongzuo Xu 6 | @ email: hongzuo.xu@gmail.com or leogarcia@126.com or xuhongzuo13@nudt.edu.cn 7 | """ 8 | 9 | import os 10 | import ast 11 | import time, datetime 12 | import argparse 13 | import pandas as pd 14 | import numpy as np 15 | from prettytable import PrettyTable 16 | from model_sinne.SiNNE import 
SiNNE 17 | from model_iml.Anchor import Anchor 18 | from config import root 19 | from eval.evaluation_od import evaluation_od 20 | from utils.eval_print_utils import print_eval_runs2 21 | 22 | 23 | # ------------------- parameters ----------------- # 24 | algorithm_name = "anchor" 25 | 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('--path', type=str, default="data/") 28 | parser.add_argument('--runs', type=int, default=1) 29 | parser.add_argument('--eval', type=ast.literal_eval, default=True, help='') 30 | if algorithm_name == "sinne": 31 | parser.add_argument('--max_level', default='full', help='') 32 | parser.add_argument("--width", type=int, default=10, help='') 33 | parser.add_argument("--ensemble_num", type=int, default=100, help='') 34 | parser.add_argument("--sample_num", type=int, default=8, help='') 35 | parser.add_argument("--pretrain", type=bool, default=False, help='') 36 | parser.add_argument("--verbose", type=bool, default=False, help='') 37 | elif algorithm_name == 'anchor': 38 | parser.add_argument('--kernel', default='rbf', help='') 39 | else: 40 | raise NotImplementedError("not supported algorithm") 41 | args = parser.parse_args() 42 | 43 | input_root_list = [root + args.path] 44 | od_eval_model = ["iforest", "copod", "hbos"] 45 | runs = args.runs 46 | record_name = "" 47 | 48 | # ------------------- record ----------------- # 49 | if not os.path.exists("record/" + algorithm_name): 50 | os.makedirs("record/" + algorithm_name) 51 | record_path = "record/" + algorithm_name + "/zout." + \ 52 | algorithm_name + "." + record_name + ".txt" 53 | doc = open(record_path, 'a') 54 | tab1 = PrettyTable(["parameter", "value"]) 55 | tab1.add_row(["@ data", str(input_root_list)]) 56 | tab1.add_row(["@ algorithm_name", str(algorithm_name)]) 57 | tab1.add_row(["@ runs", str(runs)]) 58 | tab1.add_row(["@ od_eval_model", str(od_eval_model)]) 59 | tab1.add_row(["@ start_time", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")]) 60 | for k in list(vars(args).keys()): 61 | tab1.add_row([k, vars(args)[k]]) 62 | print(tab1, file=doc) 63 | print(tab1) 64 | doc.close() 65 | time.sleep(0.2) 66 | 67 | 68 | def main(path, run_times): 69 | data_name = path.split("/")[-1].split(".")[0] 70 | 71 | # strip the index-number prefix of the dataset name (e.g., "01-" in "01-vertebral.csv") so that it matches the annotation files. 
72 | data_name = data_name[3:] 73 | 74 | print("# ------------------ %s ------------------ # " % data_name) 75 | 76 | df = pd.read_csv(path) 77 | X = df.values[:, :-1] 78 | y = np.array(df.values[:, -1], dtype=int) 79 | 80 | runs_metric_lst = [[] for k in range(len(od_eval_model))] 81 | for i in range(run_times): 82 | print("runs: %d" % (i + 1)) 83 | time1 = time.time() 84 | 85 | if algorithm_name == "sinne": 86 | model = SiNNE(max_level=args.max_level, width=args.width, ensemble_num=args.ensemble_num, 87 | sample_num=args.sample_num, pretrain=args.pretrain) 88 | exp_subspace_list = model.fit(X, y) 89 | elif algorithm_name == 'anchor': 90 | model = Anchor() 91 | exp_subspace_list = model.fit(X, y) 92 | else: 93 | raise NotImplementedError("not implemented the algorithm") 94 | t = time.time() - time1 95 | 96 | if args.eval: 97 | # ---------------------- evaluation -------------------------- # 98 | for mm, eval_model in enumerate(od_eval_model): 99 | precision, recall, jaccard = evaluation_od(exp_subspace_list, X, y, data_name, model_name=eval_model) 100 | metric_lst = [precision, recall, jaccard, t] 101 | runs_metric_lst[mm].append(metric_lst) 102 | print("{}, eval_model: {}, {}".format(data_name, eval_model, metric_lst)) 103 | 104 | if args.eval: 105 | for mm in range(len(od_eval_model)): 106 | name = path.split("/")[-1].split(".")[0] 107 | txt = print_eval_runs2(runs_metric_lst[mm], data_name=name, algo_name=algorithm_name) 108 | print(txt) 109 | doc = open(record_path, 'a') 110 | print(txt, file=doc) 111 | doc.close() 112 | else: 113 | txt = data_name + "," + str(round(t, 2)) + "," + algorithm_name 114 | print(txt) 115 | doc = open(record_path, 'a') 116 | print(txt, file=doc) 117 | doc.close() 118 | return 119 | 120 | 121 | if __name__ == '__main__': 122 | for input_root in input_root_list: 123 | if os.path.isdir(input_root): 124 | for file_name in sorted(os.listdir(input_root)): 125 | if file_name.endswith(".csv"): 126 | input_path = str(os.path.join(input_root, file_name)) 127 | main(input_path, runs) 128 | 129 | else: 130 | input_path = input_root 131 | main(input_path, runs) 132 | -------------------------------------------------------------------------------- /data_od_evaluation/pima_gt_iforest.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 0,"[0, 1, 3, 6, 7]" 3 | 2,"[0, 1, 3, 5]" 4 | 4,"[2, 6]" 5 | 6,"[1, 2]" 6 | 8,"[0, 1, 3, 4, 7]" 7 | 9,"[0, 5]" 8 | 11,"[0, 1, 3, 5]" 9 | 13,[4] 10 | 14,"[3, 4, 5, 7]" 11 | 15,"[0, 1, 2]" 12 | 16,"[0, 1, 2, 3, 4, 5]" 13 | 17,"[0, 3]" 14 | 19,[7] 15 | 22,"[1, 2, 3]" 16 | 23,"[0, 7]" 17 | 24,"[0, 1, 2]" 18 | 25,"[0, 4]" 19 | 26,"[0, 3, 5]" 20 | 31,[4] 21 | 37,"[0, 1, 3, 6, 7]" 22 | 38,"[1, 3]" 23 | 39,"[3, 6, 7]" 24 | 43,"[1, 2, 3, 4, 5]" 25 | 45,"[0, 1, 6]" 26 | 48,"[0, 2, 5, 7]" 27 | 53,"[0, 1, 2, 4, 7]" 28 | 56,"[0, 1, 2, 4]" 29 | 61,"[0, 1, 3]" 30 | 64,"[0, 3]" 31 | 66,"[0, 6, 7]" 32 | 70,"[4, 6]" 33 | 72,"[0, 3]" 34 | 78,"[2, 5]" 35 | 84,"[2, 3]" 36 | 88,[0] 37 | 93,"[0, 7]" 38 | 99,"[0, 1, 2, 3, 4, 5]" 39 | 100,"[1, 3, 6]" 40 | 109,"[0, 4]" 41 | 110,"[1, 6, 7]" 42 | 111,"[0, 2, 4]" 43 | 114,"[0, 1, 2, 4]" 44 | 115,"[1, 2, 7]" 45 | 116,"[3, 7]" 46 | 120,"[3, 5]" 47 | 124,"[3, 7]" 48 | 125,"[2, 5]" 49 | 128,"[0, 2, 4, 7]" 50 | 129,"[0, 7]" 51 | 130,"[1, 3]" 52 | 131,"[0, 2, 3, 6, 7]" 53 | 132,"[1, 4]" 54 | 143,"[0, 3]" 55 | 152,"[0, 1, 2, 6, 7]" 56 | 154,"[0, 1, 3, 5]" 57 | 155,"[0, 3, 5]" 58 | 159,[0] 59 | 164,"[0, 1, 2, 3, 6]" 60 | 165,"[0, 4]" 61 | 170,"[0, 1, 3]" 62 | 
171,"[0, 4]" 63 | 175,"[0, 1, 3]" 64 | 177,[5] 65 | 179,"[0, 6]" 66 | 185,"[0, 1]" 67 | 186,"[0, 1, 2, 4]" 68 | 187,"[2, 6]" 69 | 188,"[0, 3, 5, 7]" 70 | 189,"[0, 4]" 71 | 192,"[0, 1, 3]" 72 | 193,"[0, 1, 2, 5]" 73 | 195,"[0, 1, 2, 3, 4]" 74 | 197,"[3, 4]" 75 | 198,"[0, 3]" 76 | 199,[4] 77 | 206,"[0, 1, 2, 4, 5, 7]" 78 | 207,"[2, 3]" 79 | 209,"[0, 1]" 80 | 213,"[3, 5]" 81 | 214,"[0, 4]" 82 | 215,"[0, 4]" 83 | 216,"[0, 2, 3, 4, 7]" 84 | 218,"[0, 6]" 85 | 219,"[3, 5, 7]" 86 | 220,"[0, 1, 2, 4]" 87 | 221,"[0, 6, 7]" 88 | 227,"[1, 2, 3]" 89 | 230,"[0, 1, 3, 5, 7]" 90 | 231,"[0, 4, 5]" 91 | 235,"[1, 3, 5, 7]" 92 | 236,"[1, 7]" 93 | 237,"[1, 2, 3, 5, 7]" 94 | 238,"[0, 3]" 95 | 242,"[2, 3]" 96 | 243,"[0, 2, 6]" 97 | 245,"[0, 1, 2, 6, 7]" 98 | 254,"[0, 2, 3, 4]" 99 | 255,"[3, 7]" 100 | 259,"[0, 6, 7]" 101 | 261,"[2, 6]" 102 | 264,[7] 103 | 266,"[2, 6]" 104 | 269,"[1, 2]" 105 | 270,"[0, 1, 2, 5, 6]" 106 | 276,"[0, 2]" 107 | 280,"[0, 1, 3]" 108 | 283,"[0, 1, 3, 6, 7]" 109 | 284,"[0, 7]" 110 | 287,"[4, 5]" 111 | 291,"[0, 4]" 112 | 292,"[5, 6, 7]" 113 | 293,"[2, 3]" 114 | 296,[4] 115 | 298,[0] 116 | 300,"[0, 1, 2]" 117 | 301,"[1, 2]" 118 | 303,"[3, 5]" 119 | 306,"[0, 1, 3, 5, 7]" 120 | 308,"[0, 6]" 121 | 309,"[4, 6]" 122 | 312,"[0, 1, 3]" 123 | 314,"[0, 6]" 124 | 317,"[1, 3]" 125 | 319,"[1, 3, 5, 7]" 126 | 321,"[3, 6, 7]" 127 | 322,"[0, 7]" 128 | 323,"[0, 5]" 129 | 326,[4] 130 | 328,"[0, 5]" 131 | 332,"[1, 2]" 132 | 337,[7] 133 | 338,"[0, 4]" 134 | 339,"[0, 1, 3, 5]" 135 | 349,[1] 136 | 355,"[0, 1, 2, 3, 5, 7]" 137 | 356,"[2, 3]" 138 | 357,"[0, 2]" 139 | 359,"[0, 1]" 140 | 360,"[0, 1, 2, 4]" 141 | 363,"[0, 7]" 142 | 366,"[0, 3, 6, 7]" 143 | 369,[2] 144 | 370,"[0, 3, 4, 6, 7]" 145 | 375,"[0, 4]" 146 | 378,"[3, 5]" 147 | 386,"[0, 6]" 148 | 387,[2] 149 | 388,"[4, 7]" 150 | 391,"[1, 3, 5, 7]" 151 | 394,"[1, 3, 6]" 152 | 397,"[3, 7]" 153 | 399,[1] 154 | 400,"[0, 1, 3]" 155 | 402,"[0, 3]" 156 | 404,"[1, 2, 3]" 157 | 406,"[0, 7]" 158 | 408,"[1, 3, 6]" 159 | 409,"[2, 3, 4]" 160 | 414,"[0, 4]" 161 | 415,"[0, 1, 2, 4]" 162 | 417,"[0, 1, 5]" 163 | 419,"[4, 5]" 164 | 424,"[0, 4, 5]" 165 | 425,"[0, 1, 2, 4, 7]" 166 | 427,"[0, 1, 7]" 167 | 429,"[0, 1, 2, 4, 7]" 168 | 435,"[2, 5]" 169 | 440,"[0, 1, 2]" 170 | 443,"[0, 3, 6, 7]" 171 | 444,"[3, 4]" 172 | 445,"[3, 5]" 173 | 448,"[0, 4]" 174 | 451,"[3, 7]" 175 | 455,"[0, 2]" 176 | 458,"[0, 3]" 177 | 468,"[0, 1, 2]" 178 | 476,"[0, 3]" 179 | 480,[4] 180 | 484,"[2, 5]" 181 | 485,"[0, 4]" 182 | 493,"[0, 6, 7]" 183 | 498,"[0, 1, 3, 5, 7]" 184 | 502,[1] 185 | 506,"[0, 1, 7]" 186 | 510,"[0, 1]" 187 | 515,"[1, 3]" 188 | 516,"[0, 1, 2, 6, 7]" 189 | 523,"[0, 3]" 190 | 535,"[0, 1, 2]" 191 | 539,"[2, 3]" 192 | 540,"[0, 1, 4]" 193 | 541,"[0, 4]" 194 | 542,"[0, 1, 3, 6, 7]" 195 | 545,"[0, 1, 2, 4]" 196 | 546,"[1, 5, 7]" 197 | 560,"[6, 7]" 198 | 561,"[0, 1, 5]" 199 | 569,"[0, 4, 7]" 200 | 577,"[3, 5, 7]" 201 | 579,[3] 202 | 580,"[0, 1, 2, 3, 7]" 203 | 584,"[0, 4, 5, 7]" 204 | 586,"[0, 1, 3]" 205 | 588,"[0, 1, 6, 7]" 206 | 590,"[0, 5]" 207 | 592,"[0, 7]" 208 | 595,"[0, 1]" 209 | 598,"[0, 1, 3, 6, 7]" 210 | 603,"[4, 7]" 211 | 604,"[0, 1, 2]" 212 | 606,"[4, 6]" 213 | 611,"[1, 2]" 214 | 612,"[0, 1, 2, 4]" 215 | 614,"[0, 3]" 216 | 618,"[0, 6, 7]" 217 | 619,"[0, 1, 2]" 218 | 630,"[0, 3, 6, 7]" 219 | 635,"[0, 3]" 220 | 638,"[0, 5, 6, 7]" 221 | 642,"[3, 7]" 222 | 646,"[1, 5]" 223 | 647,"[0, 1, 2]" 224 | 648,"[0, 4]" 225 | 655,"[2, 4]" 226 | 659,"[1, 2, 6]" 227 | 661,"[1, 6]" 228 | 662,"[2, 3]" 229 | 663,"[0, 3]" 230 | 664,"[0, 2, 3, 7]" 231 | 666,[7] 232 | 667,"[0, 3]" 233 | 675,"[1, 3]" 234 | 
676,"[0, 1, 3, 6, 7]" 235 | 678,"[2, 3]" 236 | 681,"[1, 5]" 237 | 683,"[0, 2, 3, 6, 7]" 238 | 689,"[3, 4, 5, 7]" 239 | 691,"[0, 1, 2]" 240 | 693,"[0, 3]" 241 | 695,"[0, 2, 4]" 242 | 696,"[1, 3]" 243 | 701,[7] 244 | 702,"[0, 6, 7]" 245 | 706,"[0, 5]" 246 | 708,"[0, 1, 3]" 247 | 709,"[1, 4]" 248 | 712,"[0, 5]" 249 | 715,"[0, 1, 2, 4]" 250 | 716,"[1, 6]" 251 | 719,"[1, 7]" 252 | 722,"[0, 1, 7]" 253 | 730,[7] 254 | 731,"[0, 7]" 255 | 732,"[0, 1, 5]" 256 | 739,"[0, 1, 3, 5, 7]" 257 | 740,"[0, 3, 5, 7]" 258 | 743,"[0, 2, 3, 6, 7]" 259 | 746,[5] 260 | 748,"[0, 1, 4]" 261 | 749,"[0, 1, 2, 6, 7]" 262 | 750,"[0, 6, 7]" 263 | 753,"[0, 1, 2, 4]" 264 | 754,"[0, 1]" 265 | 755,"[0, 6, 7]" 266 | 757,"[0, 7]" 267 | 759,"[1, 2, 7]" 268 | 761,"[0, 1, 5]" 269 | 766,"[0, 2, 7]" 270 | -------------------------------------------------------------------------------- /data_od_evaluation/pima_gt_copod.csv: -------------------------------------------------------------------------------- 1 | ano_idx,exp_subspace 2 | 0,"[0, 1, 6, 7]" 3 | 2,"[0, 1, 2, 5]" 4 | 4,"[2, 6]" 5 | 6,[2] 6 | 8,"[1, 3, 4, 5, 7]" 7 | 9,[5] 8 | 11,"[0, 1, 5]" 9 | 13,[4] 10 | 14,"[0, 1, 4, 5, 7]" 11 | 15,[2] 12 | 16,"[0, 2, 3, 4, 5]" 13 | 17,"[0, 5]" 14 | 19,"[3, 4, 5, 7]" 15 | 22,"[1, 2]" 16 | 23,"[0, 2, 3, 5]" 17 | 24,"[0, 2, 3, 4, 7]" 18 | 25,"[0, 4]" 19 | 26,"[0, 1, 5, 7]" 20 | 31,"[3, 4]" 21 | 37,"[0, 2, 3, 7]" 22 | 38,"[2, 3]" 23 | 39,"[0, 3, 4, 6, 7]" 24 | 43,"[0, 2, 4, 5]" 25 | 45,"[0, 1, 2, 6]" 26 | 48,"[0, 2, 3, 5]" 27 | 53,"[0, 2, 3, 4, 7]" 28 | 56,"[0, 1, 2, 3, 4, 5]" 29 | 61,"[0, 1, 7]" 30 | 64,"[0, 2, 7]" 31 | 66,"[0, 2, 6, 7]" 32 | 70,"[2, 6]" 33 | 72,"[0, 2]" 34 | 78,[2] 35 | 84,[2] 36 | 88,"[0, 2]" 37 | 93,"[1, 5, 7]" 38 | 99,"[2, 3, 4, 5]" 39 | 100,"[1, 6, 7]" 40 | 109,"[0, 2, 4, 5]" 41 | 110,"[1, 3, 4]" 42 | 111,"[0, 2, 4]" 43 | 114,[2] 44 | 115,"[1, 2, 7]" 45 | 116,"[0, 6, 7]" 46 | 120,"[3, 5]" 47 | 124,"[0, 2, 7]" 48 | 125,[2] 49 | 128,"[2, 3, 4, 7]" 50 | 129,"[6, 7]" 51 | 130,"[0, 1, 4, 5]" 52 | 131,"[0, 2, 6]" 53 | 132,"[1, 2, 3, 4]" 54 | 143,"[0, 2, 7]" 55 | 152,"[0, 1, 2, 6, 7]" 56 | 154,"[0, 1, 5, 7]" 57 | 155,"[0, 2, 3, 5]" 58 | 159,[0] 59 | 164,"[0, 2, 6]" 60 | 165,"[0, 4, 5, 6, 7]" 61 | 170,"[0, 2, 6, 7]" 62 | 171,"[0, 4]" 63 | 175,"[0, 1, 3, 4]" 64 | 177,"[0, 2, 3, 4, 5]" 65 | 179,"[6, 7]" 66 | 185,"[0, 1, 2, 6, 7]" 67 | 186,"[0, 4, 5, 7]" 68 | 187,"[2, 6]" 69 | 188,"[0, 2, 3, 4, 5]" 70 | 189,"[0, 2, 3, 4]" 71 | 192,"[0, 1, 2, 5]" 72 | 193,[2] 73 | 195,"[0, 2, 3, 4, 5]" 74 | 197,[5] 75 | 198,"[2, 3]" 76 | 199,"[0, 2, 4]" 77 | 206,"[1, 7]" 78 | 207,"[1, 2, 7]" 79 | 209,"[0, 1, 2, 7]" 80 | 213,"[0, 2, 4, 5]" 81 | 214,"[0, 2, 3, 4]" 82 | 215,"[0, 2, 3, 4, 5]" 83 | 216,"[0, 2, 3, 4]" 84 | 218,[6] 85 | 219,"[0, 2, 7]" 86 | 220,"[0, 1, 2, 4]" 87 | 221,"[6, 7]" 88 | 227,[2] 89 | 230,"[1, 2, 5]" 90 | 231,"[3, 4, 5, 7]" 91 | 235,"[1, 5]" 92 | 236,"[0, 1, 2, 7]" 93 | 237,"[1, 2, 5]" 94 | 238,"[0, 1, 2, 6]" 95 | 242,[2] 96 | 243,"[0, 2, 6]" 97 | 245,"[0, 1, 2, 6, 7]" 98 | 254,"[0, 2, 4, 5]" 99 | 255,"[2, 3]" 100 | 259,"[0, 6, 7]" 101 | 261,[2] 102 | 264,[2] 103 | 266,[2] 104 | 269,[2] 105 | 270,"[0, 2, 5, 6]" 106 | 276,[2] 107 | 280,"[0, 1, 2, 5]" 108 | 283,"[0, 1, 2, 7]" 109 | 284,"[5, 7]" 110 | 287,"[2, 3, 4, 5]" 111 | 291,"[0, 2, 6]" 112 | 292,"[2, 3, 4, 6, 7]" 113 | 293,[2] 114 | 296,"[3, 4, 5]" 115 | 298,"[0, 4]" 116 | 300,[2] 117 | 301,[2] 118 | 303,"[2, 5]" 119 | 306,"[0, 1, 5, 7]" 120 | 308,[6] 121 | 309,"[2, 4, 6, 7]" 122 | 312,"[1, 5]" 123 | 314,"[6, 7]" 124 | 317,[1] 125 | 319,"[1, 5, 7]" 126 | 321,[3] 127 | 322,[5] 
128 | 323,"[0, 1, 2, 5]" 129 | 326,"[2, 3, 4]" 130 | 328,"[2, 3, 4, 5]" 131 | 332,[2] 132 | 337,"[0, 2, 7]" 133 | 338,"[0, 2, 3, 4, 6, 7]" 134 | 339,"[0, 1, 2, 5]" 135 | 349,"[0, 1, 2, 5]" 136 | 355,"[0, 1, 2, 7]" 137 | 356,[2] 138 | 357,"[0, 2]" 139 | 359,"[1, 3, 4, 5]" 140 | 360,"[0, 1, 2, 4]" 141 | 363,"[1, 7]" 142 | 366,"[0, 5]" 143 | 369,[2] 144 | 370,"[2, 3, 4, 6]" 145 | 375,"[0, 3, 4, 7]" 146 | 378,"[1, 5]" 147 | 386,"[0, 6, 7]" 148 | 387,"[0, 2, 3, 7]" 149 | 388,"[0, 4, 7]" 150 | 391,"[1, 5]" 151 | 394,"[0, 1, 6]" 152 | 397,"[2, 3]" 153 | 399,"[1, 2]" 154 | 400,[2] 155 | 402,"[0, 2, 3, 4]" 156 | 404,"[0, 1, 2, 6, 7]" 157 | 406,"[0, 5, 7]" 158 | 408,"[0, 1, 5, 6, 7]" 159 | 409,"[3, 4]" 160 | 414,"[0, 2, 3, 4]" 161 | 415,"[1, 2, 3, 4]" 162 | 417,"[0, 1, 5, 6, 7]" 163 | 419,"[2, 4, 5]" 164 | 424,"[0, 4, 5]" 165 | 425,"[0, 1, 3, 4, 5]" 166 | 427,"[1, 2]" 167 | 429,"[3, 4, 5, 7]" 168 | 435,[2] 169 | 440,"[1, 2]" 170 | 443,"[0, 2, 6]" 171 | 444,[2] 172 | 445,"[3, 5]" 173 | 448,"[0, 2, 3, 4]" 174 | 451,"[1, 5, 6, 7]" 175 | 455,"[0, 1, 2]" 176 | 458,"[0, 2, 3, 4, 6, 7]" 177 | 468,[2] 178 | 476,"[2, 3, 4]" 179 | 480,[4] 180 | 484,[2] 181 | 485,"[0, 2, 3, 4, 5]" 182 | 493,"[6, 7]" 183 | 498,"[0, 1, 5, 7]" 184 | 502,"[0, 1, 2, 3, 5]" 185 | 506,"[1, 2]" 186 | 510,"[0, 7]" 187 | 515,"[1, 2]" 188 | 516,"[0, 2, 3, 4, 7]" 189 | 523,"[0, 6, 7]" 190 | 535,[2] 191 | 539,"[2, 3]" 192 | 540,"[0, 3, 4, 5, 7]" 193 | 541,[4] 194 | 542,"[0, 2, 6, 7]" 195 | 545,"[0, 1, 2, 3, 4]" 196 | 546,"[0, 1, 5, 6, 7]" 197 | 560,"[6, 7]" 198 | 561,"[1, 2]" 199 | 569,"[0, 2, 3, 4]" 200 | 577,"[6, 7]" 201 | 579,[3] 202 | 580,"[2, 3]" 203 | 584,"[0, 4, 5, 7]" 204 | 586,"[0, 1, 2, 6, 7]" 205 | 588,"[1, 2, 6, 7]" 206 | 590,"[0, 5, 6, 7]" 207 | 592,"[1, 7]" 208 | 595,"[1, 2]" 209 | 598,"[1, 6, 7]" 210 | 603,"[0, 1, 6, 7]" 211 | 604,[2] 212 | 606,"[1, 2, 3, 4, 5]" 213 | 611,"[1, 2]" 214 | 612,"[0, 2, 3, 4]" 215 | 614,"[0, 4, 7]" 216 | 618,"[0, 6, 7]" 217 | 619,[2] 218 | 630,"[0, 2, 5, 6]" 219 | 635,"[0, 2]" 220 | 638,"[0, 5, 6]" 221 | 642,"[0, 1, 5, 7]" 222 | 646,"[1, 5]" 223 | 647,[2] 224 | 648,"[0, 2, 3, 4, 5]" 225 | 655,"[2, 4, 5]" 226 | 659,[6] 227 | 661,[1] 228 | 662,"[0, 2, 3, 4]" 229 | 663,"[0, 2, 3, 4]" 230 | 664,"[0, 2, 3, 7]" 231 | 666,[7] 232 | 667,"[0, 5, 6, 7]" 233 | 675,"[1, 2]" 234 | 676,"[0, 1, 5, 7]" 235 | 678,[2] 236 | 681,"[1, 5]" 237 | 683,"[0, 2, 6]" 238 | 689,"[3, 4, 5, 7]" 239 | 691,"[0, 2]" 240 | 693,"[0, 2, 3, 4]" 241 | 695,"[0, 2, 4]" 242 | 696,"[1, 5]" 243 | 701,"[0, 3, 5, 7]" 244 | 702,"[1, 2, 6, 7]" 245 | 706,[5] 246 | 708,"[0, 1, 2, 7]" 247 | 709,"[2, 3, 4, 5]" 248 | 712,"[0, 2, 3, 5]" 249 | 715,"[0, 1, 2, 4]" 250 | 716,"[1, 2, 3, 4]" 251 | 719,[7] 252 | 722,"[3, 4, 5, 7]" 253 | 730,"[3, 4, 5, 7]" 254 | 731,"[0, 1, 2, 5]" 255 | 732,"[1, 2, 3, 4, 5]" 256 | 739,"[1, 5, 7]" 257 | 740,"[0, 2, 3, 4, 5, 7]" 258 | 743,"[0, 1, 2, 6, 7]" 259 | 746,"[2, 3, 5]" 260 | 748,"[1, 4, 5]" 261 | 749,"[0, 1, 2, 5, 7]" 262 | 750,[6] 263 | 753,"[0, 1, 2, 3, 4, 5]" 264 | 754,"[0, 1, 2, 7]" 265 | 755,"[2, 3, 4, 6, 7]" 266 | 757,[7] 267 | 759,"[1, 2, 7]" 268 | 761,"[0, 1, 5, 7]" 269 | 766,[2] 270 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Outlier Interpretation 2 | 3 | This repository contains the source code for the paper **Beyond Outlier Detection: Interpreting Outliers by Attention-Guided Triplet Deviation Network** published in the Web Conference (WWW'21). 
4 | 
5 | Note that this task is also referred to as outlier explanation, outlier aspect mining/discovering, outlier property detection, and outlier description. 
6 | 
7 | 
8 | 
9 | ### Seven Outlier Interpretation Methods 
10 | 
11 | **This repository contains seven outlier interpretation methods: ATON [1], COIN [2], SiNNE [3], SHAP [4], LIME [5], Integrated Gradients [6], and Anchor [7].** 
12 | 
13 | [1] Beyond Outlier Detection: Outlier Interpretation by Attention-Guided Triplet Deviation Network. In WWW. 2021. 
14 | 
15 | [2] Contextual outlier interpretation. In IJCAI. 2018. 
16 | 
17 | [3] A new effective and efficient measure for outlying aspect mining. arXiv preprint arXiv:2004.13550. 2020. 
18 | 
19 | [4] A unified approach to interpreting model predictions. In NeurIPS. 2017. 
20 | 
21 | [5] "Why should I trust you?" Explaining the predictions of any classifier. In SIGKDD. 2016. 
22 | 
23 | [6] Axiomatic attribution for deep networks. In ICML. 2017. 
24 | 
25 | [7] Anchors: High-Precision Model-Agnostic Explanations. In AAAI. 2018. 
26 | 
27 | 
28 | 
29 | ### Structure 
30 | `data_od_evaluation`: ground-truth outlier interpretation annotations of real-world datasets 
31 | `data`: real-world datasets in csv format; the last column is a label indicating whether each line is an outlier or an inlier 
32 | `model_xx`: folders of ATON and its competitors (the competitors are introduced in Section 5.1.2) 
33 | `config.py`: configuration and default hyper-parameters 
34 | `main.py`: main script to run the experiments 
35 | 
36 | 
37 | 
38 | ### How to use? 
39 | ##### 1. For ATON and competitors COIN, SHAP, LIME, and IntGrad 
40 | 1. modify the variable `algorithm_name` in `main.py` (supported algorithms: `aton`, `coin`, `shap`, `lime`, in lowercase) 
41 | 2. run `python main.py --path data/ --runs 10` 
42 | 3. the results can be found in the `record/[algorithm_name]/` folder 
43 | 
44 | ##### 2. For ATON' and competitor COIN' 
45 | 1. modify the variable `algorithm_name` in `main.py` to `aton` or `coin` 
46 | 2. run `python main.py --path data/ --w2s_ratio auto --runs 10` for ATON' 
47 | run `python main.py --path data/ --w2s_ratio pn --runs 10` for COIN' 
48 | 
49 | ##### 3. For competitors SiNNE and Anchor 
50 | 1. modify the variable `algorithm_name` in `main2.py` to `sinne` or `anchor` 
51 | 2. run `python main2.py --path data/ --runs 10` 
52 | 
53 | 
54 | 
55 | ### args of main.py 
56 | - `--path [str]` - the path of a data folder or an individual data file (in csv format) 
57 | 
58 | - `--gpu [True/False]` - use GPU or not 
59 | 
60 | - `--runs [int]` - how many times to run a method on each dataset (we run 10 times and report the average performance in our submission) 
61 | 
62 | - `--w2s_ratio [auto/real_len/pn]` - how to transfer feature weights to a feature subspace: 'real_len' uses the same length as the ground-truth subspace, 
63 | 'auto' generates the subspace by the proposed threshold, and 'pn' keeps the features with positive weights. 
64 | (In our paper, we use 'pn' in COIN' and 'auto' in ATON'; for methods that output feature weights, we directly use 'real_len'. See the sketch after this list.) 
65 | 
66 | - `--eval [True/False]` - evaluate the interpretation results or not; use False for scalability tests 
67 | ... (other hyper-parameters of different methods; you may want to use `-h` to check the corresponding hyper-parameters after modifying the `algorithm_name`)
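**Sketch: converting feature weights to a subspace.** The 'auto' option keeps the highest-weight features until a fraction r = sqrt(2 / dim) of the total weight is covered (this mirrors `weight2subspace` in `model_aton/ATON_ablation3.py` and `model_utils.get_exp_subspace`). Below is a minimal illustrative sketch of that thresholding; `weight_to_subspace` is a made-up name for illustration, not part of this repository's API: 

``` 
import math 
import numpy as np 

def weight_to_subspace(weight, r): 
    # keep the highest-weight features until their cumulative 
    # weight reaches the fraction r of the total weight 
    threshold = r * np.sum(weight) 
    cum, subspace = 0.0, [] 
    for idx in np.argsort(weight)[::-1]:   # features sorted by descending weight 
        cum += weight[idx] 
        subspace.append(int(idx)) 
        if cum >= threshold: 
            break 
    return sorted(subspace) 

weight = np.array([0.05, 0.40, 0.10, 0.35, 0.10]) 
r = math.sqrt(2 / len(weight))           # the 'auto' ratio 
print(weight_to_subspace(weight, r))     # -> [1, 3] 
```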
68 | 
69 | 
70 | 
71 | ### Requirements 
72 | Main packages of this project: 
73 | ``` 
74 | torch==1.3.0 
75 | numpy==1.15.0 
76 | pandas==0.25.2 
77 | scikit-learn==0.23.1 
78 | pyod==0.8.2 
79 | tqdm==4.48.2 
80 | prettytable==0.7.2 
81 | shap==0.35.0 
82 | lime==0.2.0.1 
83 | alibi==0.5.5 
84 | ``` 
85 | 
86 | 
87 | 
88 | ### Ground-truth annotations 
89 | 
90 | Please also find the ground-truth outlier interpretation annotations in the folder `data_od_evaluation`. 
91 | *We expect these annotations to foster further research on this new practical problem.* 
92 | 
93 | Each dataset has three annotation files. Please refer to the detailed annotation generation process in our submission; we introduce it in detail in Section 5.1.4: 
94 | 
95 | **How to generate the ground-truth annotations:** 
96 | > We employ three different kinds of representative outlier detection methods (i.e., the ensemble-based method iForest, the probability-based method COPOD, and the distance-based method HBOS) to evaluate the outlying degree of real outliers given every possible subspace. A good explanation for an outlier should be a high-contrast subspace in which the outlier explicitly demonstrates its outlierness, and outlier detectors can easily and certainly predict it as an outlier in this subspace. Therefore, the ground-truth interpretation for each outlier is defined as the subspace in which the outlier obtains the highest outlier score among all the possible subspaces. 
97 | 
98 | 
99 | 
100 | ### A typo in the paper 
101 | 
102 | 
103 | 
104 | On the second page, "As shown in Figure 1 (a), the queried outlier is ..., and the interpretation is feature subspace **$\{f1, f2\}$**" should be **$\{f1, f3\}$**. 
105 | 
106 | We thank @Zeyi Li (NJPU) for finding this typo. 
107 | 
108 | 
109 | 
110 | ### References 
111 | - Datasets are from ODDS, an outlier detection datasets library (http://odds.cs.stonybrook.edu/), and the Kaggle platform (https://www.kaggle.com/) 
112 | - The source code of competitor COIN is publicly available on GitHub. 
113 | 
114 | 
115 | 
116 | ### Citation 
117 | 
118 | 😄 If you find this useful in your research, please consider citing: 
119 | ``` 
120 | @inproceedings{xu2021aton, 
121 | title={Beyond Outlier Detection: Interpreting Outliers by Attention-Guided Triplet Deviation Network}, 
122 | author={Xu, Hongzuo and Wang, Yijie and Jian, Songlei and Huang, Zhenyu and Wang, Yongjun and Liu, Ning and Li, Fei}, 
123 | booktitle={Proceedings of The Web Conference 2021 (WWW’21)}, 
124 | year={2021}, 
125 | publisher={ACM} 
126 | } 
127 | ``` 
128 | 
-------------------------------------------------------------------------------- 
/model_aton/networks.py: 
-------------------------------------------------------------------------------- 
1 | """ 
2 | This script implements an outlier interpretation method of the following paper: 
3 | "Beyond Outlier Detection: Outlier Interpretation by Attention-Guided Triplet Deviation Network". in WWW'21.
4 | @ Author: Hongzuo Xu 5 | @ email: hongzuo.xu@gmail.com or leogarcia@126.com or xuhongzuo13@nudt.edu.cn 6 | """ 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | 14 | class ATONnet(nn.Module): 15 | def __init__(self, attn_net, n_feature, n_linear): 16 | super(ATONnet, self).__init__() 17 | self.attn_net = attn_net 18 | self.linear = torch.nn.Linear(n_feature, n_linear, bias=False) 19 | 20 | def forward(self, anchor, positive, negative): 21 | anchor = self.linear(anchor) 22 | positive = self.linear(positive) 23 | negative = self.linear(negative) 24 | 25 | cat = torch.cat([negative, anchor, positive], dim=1) 26 | 27 | attn = self.attn_net(cat) 28 | embedded_n = negative * attn 29 | embedded_a = anchor * attn 30 | embedded_p = positive * attn 31 | 32 | embedded_n_dff = (1 - attn) * negative 33 | embedded_a_dff = (1 - attn) * anchor 34 | embedded_p_dff = (1 - attn) * positive 35 | dis1 = F.pairwise_distance(embedded_n_dff, embedded_a_dff) 36 | dis2 = F.pairwise_distance(embedded_p_dff, embedded_a_dff) 37 | dis = torch.abs(dis1 - dis2) 38 | 39 | return embedded_a, embedded_p, embedded_n, attn, dis 40 | 41 | def get_lnr(self, x): 42 | return self.linear(x) 43 | 44 | 45 | class AttentionNet(nn.Module): 46 | def __init__(self, in_feature, n_hidden, out_feature): 47 | super(AttentionNet, self).__init__() 48 | self.hidden = torch.nn.Linear(in_feature, n_hidden) 49 | self.out = torch.nn.Linear(n_hidden, out_feature) 50 | 51 | def forward(self, x): 52 | x = torch.relu(self.hidden(x)) 53 | x = self.out(x) 54 | _min = torch.unsqueeze(torch.min(x, dim=1)[0], 0).t() 55 | _max = torch.unsqueeze(torch.max(x, dim=1)[0], 0).t() 56 | x = (x - _min) / (_max - _min) 57 | return x 58 | 59 | 60 | class MyLoss(nn.Module): 61 | """ 62 | triplet deviation-based loss 63 | """ 64 | def __init__(self, alpha1, alpha2, margin): 65 | super(MyLoss, self).__init__() 66 | self.alpha1 = alpha1 67 | self.alpha2 = alpha2 68 | self.criterion_tml = torch.nn.TripletMarginLoss(margin=margin, p=2) 69 | return 70 | 71 | def forward(self, embed_anchor, embed_pos, embed_neg, dis): 72 | loss_tml = self.criterion_tml(embed_anchor, embed_pos, embed_neg) 73 | loss_dis = torch.mean(dis) 74 | loss = self.alpha1 * loss_tml + self.alpha2 * loss_dis 75 | return loss 76 | 77 | 78 | # ---------------------- ATON - ablation -------------------------- # 79 | # without attention 80 | class ATONablanet(nn.Module): 81 | def __init__(self, n_feature, n_linear): 82 | super(ATONablanet, self).__init__() 83 | self.linear = torch.nn.Linear(n_feature, n_linear, bias=False) 84 | 85 | def forward(self, anchor, positive, negative): 86 | embedded_a = self.linear(anchor) 87 | embedded_p = self.linear(positive) 88 | embedded_n = self.linear(negative) 89 | return embedded_a, embedded_p, embedded_n 90 | 91 | def get_lnr(self, x): 92 | return self.linear(x) 93 | 94 | 95 | # ---------------------- ATON - ablation -------------------------- # 96 | # without feature embedding module 97 | class ATONabla2net(nn.Module): 98 | """ 99 | without feature embedding module 100 | """ 101 | def __init__(self, attn_net): 102 | super(ATONabla2net, self).__init__() 103 | self.attn_net = attn_net 104 | 105 | def forward(self, anchor, positive, negative): 106 | cat = torch.cat([negative, anchor, positive], dim=1) 107 | attn = self.attn_net(cat) 108 | 109 | embedded_n = negative * attn 110 | embedded_a = anchor * attn 111 | embedded_p = positive * attn 112 | 113 | embedded_n_dff = (1 - attn) * negative 114 | embedded_a_dff = (1 - attn) 
* anchor 
115 |         embedded_p_dff = (1 - attn) * positive 
116 |         dis1 = F.pairwise_distance(embedded_n_dff, embedded_a_dff) 
117 |         dis2 = F.pairwise_distance(embedded_p_dff, embedded_a_dff) 
118 |         dis = torch.abs(dis1 - dis2) 
119 | 
120 |         return embedded_a, embedded_p, embedded_n, attn, dis 
121 | 
122 | 
123 | # -------------------------- ATON - ablation3 ------------------------------ # 
124 | # test the significance of triplet deviation-based loss function 
125 | 
126 | class ATONabla3net(nn.Module): 
127 |     def __init__(self, attn_net, clf_net, n_feature, n_linear): 
128 |         super(ATONabla3net, self).__init__() 
129 |         self.attn_net = attn_net 
130 |         self.clf_net = clf_net 
131 |         self.linear = torch.nn.Linear(n_feature, n_linear, bias=False) 
132 | 
133 |     def forward(self, x): 
134 |         x = self.linear(x) 
135 |         attn = self.attn_net(x) 
136 |         x = x * attn 
137 |         x = self.clf_net(x) 
138 |         return x, attn 
139 | 
140 |     def get_lnr(self, x): 
141 |         return self.linear(x) 
142 | 
143 | 
144 | class ClassificationNet(nn.Module): 
145 |     def __init__(self, n_feature): 
146 |         super(ClassificationNet, self).__init__() 
147 |         self.linear = torch.nn.Linear(n_feature, 2) 
148 | 
149 |     def forward(self, x): 
150 |         x = self.linear(x) 
151 |         return x 
152 | 
153 | 
154 | class MyLossClf(nn.Module): 
155 |     """ 
156 |     loss function for ablation3 
157 |     """ 
158 |     def __init__(self, alpha1, alpha2, alpha3, margin): 
159 |         super(MyLossClf, self).__init__() 
160 |         self.alpha1 = alpha1 
161 |         self.alpha2 = alpha2 
162 |         self.alpha3 = alpha3 
163 |         self.criterion_tml = torch.nn.TripletMarginLoss(margin=margin, p=2) 
164 |         self.criterion_cel = torch.nn.CrossEntropyLoss() 
165 |         return 
166 | 
167 |     def forward(self, embed_anchor, embed_pos, embed_neg, clf_out, batch_y, dis): 
168 |         loss_tml = self.criterion_tml(embed_anchor, embed_pos, embed_neg) 
169 |         loss_cel = self.criterion_cel(clf_out, batch_y) 
170 |         loss_dis = torch.mean(dis) 
171 |         loss = self.alpha1 * loss_tml + self.alpha2 * loss_cel + self.alpha3 * loss_dis 
172 |         return loss 
173 | 
-------------------------------------------------------------------------------- 
/model_aton/ATON_ablation2.py: 
-------------------------------------------------------------------------------- 
1 | import numpy as np 
2 | import time, math 
3 | import torch 
4 | import torch.utils.data as Data 
5 | import torch.optim as optim 
6 | import torch.nn.functional as F 
7 | 
8 | from torch.optim import lr_scheduler 
9 | from sklearn.neighbors import NearestNeighbors 
10 | from sklearn import metrics 
11 | from tqdm import tqdm 
12 | from model_aton.utils import EarlyStopping, min_max_normalize 
13 | 
14 | from model_aton.datasets import MyHardSingleTripletSelector 
15 | from model_aton.datasets import SingleTripletDataset 
16 | from model_aton.networks import ATONabla2net, AttentionNet 
17 | from model_aton.networks import MyLoss 
18 | 
19 | 
20 | class ATONabla2: 
21 |     def __init__(self, nbrs_num=30, rand_num=30, alpha1=0.8, alpha2=0.2, 
22 |                  n_epoch=10, batch_size=64, lr=0.1, margin=2., 
23 |                  verbose=True, gpu=True): 
24 |         self.verbose = verbose 
25 | 
26 |         self.x = None 
27 |         self.y = None 
28 |         self.ano_idx = None 
29 |         self.dim = None 
30 | 
31 |         # a list of normal nbr of each anomaly 
32 |         self.normal_nbr_indices = [] 
33 | 
34 |         cuda = torch.cuda.is_available() 
35 |         self.device = torch.device("cuda" if cuda and gpu else "cpu") 
36 |         if cuda: 
37 |             torch.cuda.set_device(0) 
38 |         print("device:", self.device) 
39 | 
40 |         self.nbrs_num = nbrs_num 
41 |         self.rand_num = rand_num 
42 |         self.alpha1 = alpha1 
43 |         self.alpha2 = alpha2 
44 | 
45 |         self.n_epoch = n_epoch 
46 |
self.batch_size = batch_size 47 | self.lr = lr 48 | self.margin = margin 49 | return 50 | 51 | def fit(self, x, y): 52 | device = self.device 53 | 54 | self.dim = x.shape[1] 55 | x = min_max_normalize(x) 56 | self.ano_idx = np.where(y == 1)[0] 57 | 58 | self.x = torch.tensor(x, dtype=torch.float32).to(device) 59 | self.y = torch.tensor(y, dtype=torch.int64).to(device) 60 | self.prepare_nbrs() 61 | 62 | # train model for each anomaly 63 | attn_lst = [] 64 | if self.verbose: 65 | iterator = range(len(self.ano_idx)) 66 | else: 67 | iterator = tqdm(range(len(self.ano_idx))) 68 | for ii in iterator: 69 | idx = self.ano_idx[ii] 70 | 71 | s_t = time.time() 72 | attn = self.interpret_ano(ii) 73 | attn_lst.append(attn) 74 | 75 | if self.verbose: 76 | print("Ano_id:[{}], ({}/{}) \t time: {:.2f}s\n".format( 77 | idx, (ii + 1), len(self.ano_idx), 78 | (time.time() - s_t))) 79 | 80 | # fea_weight_lst = [] 81 | # for ii, idx in enumerate(self.ano_idx): 82 | # attn = attn_lst[ii] 83 | # fea_weight = attn 84 | # fea_weight_lst.append(fea_weight) 85 | return attn_lst 86 | 87 | def interpret_ano(self, ii): 88 | idx = self.ano_idx[ii] 89 | device = self.device 90 | dim = self.dim 91 | 92 | nbr_indices = self.normal_nbr_indices[ii] 93 | data_loader, test_loader = self.prepare_triplets(idx, nbr_indices) 94 | attn_net = AttentionNet(in_feature=3 * dim, n_hidden=int(1.5 * dim), out_feature=dim) 95 | model = ATONabla2net(attn_net=attn_net) 96 | model.to(device) 97 | 98 | optimizer = optim.Adam(model.parameters(), lr=self.lr, weight_decay=1e-2) 99 | criterion = MyLoss(alpha1=self.alpha1, alpha2=self.alpha2, margin=self.margin) 100 | 101 | scheduler = lr_scheduler.StepLR(optimizer, 5, gamma=0.1) 102 | early_stp = EarlyStopping(patience=3, verbose=False) 103 | 104 | for epoch in range(self.n_epoch): 105 | model.train() 106 | total_loss = 0 107 | total_dis = 0 108 | es_time = time.time() 109 | 110 | batch_cnt = 0 111 | for anchor, pos, neg in data_loader: 112 | anchor, pos, neg = anchor.to(device), pos.to(device), neg.to(device) 113 | embed_anchor, embed_pos, embed_neg, attn, dis = model(anchor, pos, neg) 114 | 115 | loss = criterion(embed_anchor, embed_pos, embed_neg, dis) 116 | 117 | total_loss += loss 118 | total_dis += dis.mean() 119 | 120 | optimizer.zero_grad() 121 | loss.backward() 122 | optimizer.step() 123 | batch_cnt += 1 124 | 125 | train_loss = total_loss / batch_cnt 126 | # dis = total_dis / batch_cnt 127 | est = time.time() - es_time 128 | 129 | if self.verbose and (epoch + 1) % 1 == 0: 130 | message = 'Epoch: [{:02}/{:02}] loss: {:.4f} Time: {:.2f}s'.format( 131 | epoch + 1, self.n_epoch, train_loss, est) 132 | print(message) 133 | scheduler.step() 134 | 135 | early_stp(train_loss, model) 136 | if early_stp.early_stop: 137 | model.load_state_dict(torch.load(early_stp.path)) 138 | if self.verbose: 139 | print("early stopping") 140 | break 141 | 142 | for anchor, pos, neg in test_loader: 143 | model.eval() 144 | anchor, pos, neg = anchor.to(device), pos.to(device), neg.to(device) 145 | _, _, _, attn, _ = model(anchor, pos, neg) 146 | 147 | attn_avg = torch.mean(attn, dim=0) 148 | attn_avg = attn_avg.data.cpu().numpy() 149 | return attn_avg 150 | 151 | def prepare_triplets(self, idx, nbr_indices): 152 | x = self.x 153 | y = self.y 154 | selector = MyHardSingleTripletSelector(nbrs_num=self.nbrs_num, rand_num=self.rand_num, 155 | nbr_indices=nbr_indices) 156 | dataset = SingleTripletDataset(idx, x, y, triplets_selector=selector) 157 | data_loader = Data.DataLoader(dataset, batch_size=self.batch_size, 
shuffle=True) 
158 |         test_loader = Data.DataLoader(dataset, batch_size=len(dataset)) 
159 |         return data_loader, test_loader 
160 | 
161 |     def prepare_nbrs(self): 
162 |         x = self.x.cpu().data.numpy() 
163 |         y = self.y.cpu().data.numpy() 
164 | 
165 |         anom_idx = np.where(y == 1)[0] 
166 |         x_anom = x[anom_idx] 
167 |         noml_idx = np.where(y == 0)[0] 
168 |         x_noml = x[noml_idx] 
169 |         n_neighbors = self.nbrs_num 
170 | 
171 |         nbrs_local = NearestNeighbors(n_neighbors=n_neighbors).fit(x_noml) 
172 |         tmp_indices = nbrs_local.kneighbors(x_anom)[1] 
173 | 
174 |         for idx in tmp_indices: 
175 |             nbr_indices = noml_idx[idx] 
176 |             self.normal_nbr_indices.append(nbr_indices) 
177 |         return 
178 | 
-------------------------------------------------------------------------------- 
/eval/evaluation_od.py: 
-------------------------------------------------------------------------------- 
1 | from sklearn.neighbors import LocalOutlierFactor 
2 | from pyod.models.iforest import IForest 
3 | from pyod.models.hbos import HBOS 
4 | from pyod.models.loda import LODA 
5 | from pyod.models.copod import COPOD 
6 | from tqdm import tqdm 
7 | import numpy as np 
8 | import pandas as pd 
9 | import os 
10 | import ast 
11 | import eval.evaluation_utils as utils 
12 | from sklearn import metrics 
13 | from config import eva_root 
14 | 
15 | 
16 | def evaluation_od_train(x, y, data_name, model_name="iforest", chosen_subspace=None): 
17 |     """ 
18 |     use an anomaly detector to yield an anomaly score for each subspace, 
19 |     generating two files: the subspace with the highest anomaly score for each anomaly & the anomaly score of every subspace 
20 |     :param x: data matrix 
21 |     :param y: class information 
22 |     :param data_name: the data set name, used for naming the ground truth file 
23 |     :param model_name: anomaly detector name, default: iforest 
24 |     :param chosen_subspace: use this to only evaluate a subset of the power set of the full feature space 
25 |     :return: df: a ground-truth map using anomaly idx as key and ground truth feature subspace as value.
26 |     """ 
27 | 
28 | 
29 |     dim = x.shape[1] 
30 |     ano_idx = np.where(y == 1)[0] 
31 |     n_ano = len(ano_idx) 
32 | 
33 |     # get all the possible feature subsets or just use the given subset list 
34 |     f_subsets = utils.get_subset_candidate(dim, chosen_subspace) 
35 | 
36 |     # score anomalies in each subspace, generate the score matrix 
37 |     n_subsets = len(f_subsets) 
38 |     score_matrix = np.zeros([n_ano, n_subsets]) 
39 |     for i in tqdm(range(n_subsets)): 
40 |         subset = f_subsets[i] 
41 |         x_subset = x[:, subset] 
42 | 
43 | 
44 |         if model_name == "iforest": 
45 |             clf = IForest() 
46 |             clf.fit(x_subset) 
47 |             od_score = clf.decision_scores_ 
48 |         elif model_name == "copod": 
49 |             clf = COPOD() 
50 |             clf.fit(x_subset) 
51 |             od_score = clf.decision_scores_ 
52 |         elif model_name == "hbos": 
53 |             clf = HBOS() 
54 |             clf.fit(x_subset) 
55 |             od_score = clf.decision_scores_ 
56 |         else: 
57 |             raise ValueError("unsupported od model") 
58 | 
59 |         od_score = utils.min_max_norm(od_score) 
60 |         score_matrix[:, i] = od_score[ano_idx] 
61 | 
62 |     if not os.path.exists(eva_root + "data_od_evaluation/"): 
63 |         os.makedirs(eva_root + "data_od_evaluation/") 
64 | 
65 |     # score matrix to df 
66 |     anomaly_score_df = pd.DataFrame(data=score_matrix, columns=[str(s) for s in f_subsets]) 
67 |     col_name = anomaly_score_df.columns.tolist() 
68 |     col_name.insert(0, 'ano_idx') 
69 |     anomaly_score_df["ano_idx"] = ano_idx 
70 |     anomaly_score_df = anomaly_score_df.reindex(columns=col_name) 
71 |     path1 = eva_root + "data_od_evaluation/" + data_name + "_score_" + model_name + ".csv" 
72 |     anomaly_score_df.to_csv(path1, index=False) 
73 | 
74 |     # get the ground truth (the subspace in which each anomaly obtains the highest anomaly score) 
75 |     g_truth_df = pd.DataFrame(columns=["ano_idx", "exp_subspace"]) 
76 | 
77 |     exp_subspaces = [] 
78 |     for ii, ano_score in enumerate(score_matrix): 
79 |         max_score_idx = int(np.argmax(ano_score)) 
80 |         exp_subset = str(f_subsets[max_score_idx]) 
81 |         exp_subspaces.append(exp_subset) 
82 |     g_truth_df["ano_idx"] = ano_idx 
83 |     g_truth_df["exp_subspace"] = exp_subspaces 
84 | 
85 |     g_truth_df = g_truth_df.astype({"exp_subspace": "object"}) 
86 |     path2 = eva_root + "data_od_evaluation/" + data_name + "_gt_" + model_name + ".csv" 
87 |     g_truth_df.to_csv(path2, index=False) 
88 |     return anomaly_score_df, g_truth_df 
89 | 
90 | 
91 | def evaluation_od(exp_subspace_list, x, y, data_name, model_name): 
92 |     """ 
93 |     use outlier detection to evaluate the explanation subspace of each anomaly data object, 
94 |     i.e., whether this subspace is a high-contrast subspace that highlights this anomaly 
95 |     (whether the anomaly detector gets a higher score in this space) 
96 |     :param exp_subspace_list: explanation feature subspace for each anomaly, corresponding to ano_idx 
97 |     :param x: data set 
98 |     :param y: label 
99 |     :param data_name: name of dataset 
100 |     :param model_name: the name of anomaly detector to generate ground truth 
101 |     :return: average precision, recall, and jaccard 
102 |     """ 
103 |     path1 = eva_root + "data_od_evaluation/" + data_name + "_gt_" + model_name + ".csv" 
104 |     if not os.path.exists(path1): 
105 |         print("annotation file not found, labeling now...") 
106 |         _, g_truth_df = evaluation_od_train(x, y, data_name, model_name) 
107 |     else: 
108 |         g_truth_df = pd.read_csv(path1) 
109 | 
110 |     ano_idx = np.where(y == 1)[0] 
111 | 
112 |     precision_list = np.zeros(len(ano_idx)) 
113 |     jaccard_list = np.zeros(len(ano_idx)) 
114 |     recall_list = np.zeros(len(ano_idx)) 
115 | 
116 |     for ii, ano in enumerate(ano_idx): 
117 |         exp_subspace =
list(exp_subspace_list[ii]) 
118 |         gt_subspace_str = g_truth_df.loc[g_truth_df["ano_idx"] == ano]["exp_subspace"].values[0] 
119 |         gt_subspace = ast.literal_eval(gt_subspace_str) 
120 | 
121 |         overlap = list(set(gt_subspace).intersection(set(exp_subspace))) 
122 |         union = list(set(gt_subspace).union(set(exp_subspace))) 
123 | 
124 |         precision_list[ii] = len(overlap) / len(exp_subspace) 
125 |         jaccard_list[ii] = len(overlap) / len(union) 
126 |         recall_list[ii] = len(overlap) / len(gt_subspace) 
127 | 
128 |     return precision_list.mean(), recall_list.mean(), jaccard_list.mean() 
129 | 
130 | 
131 | def evaluation_od_auc(feature_weight, x, y, data_name, model_name="iforest"): 
132 |     """ 
133 |     use the ground-truth subspace to evaluate the feature weights produced for each anomaly, 
134 |     treating the interpretation as a ranking of features and the ground-truth subspace as the relevant set 
135 |     :param feature_weight: feature weight vector of each anomaly, corresponding to ano_idx 
136 |     :param x: data set 
137 |     :param y: label 
138 |     :param data_name: name of dataset 
139 |     :param model_name: the name of anomaly detector to generate ground truth 
140 |     :return: mean AUPR and mean AUROC 
141 |     """ 
142 |     path1 = eva_root + "data_od_evaluation/" + data_name + "_gt_" + model_name + ".csv" 
143 |     if not os.path.exists(path1): 
144 |         print("annotation file not found, labeling now...") 
145 |         _, g_truth_df = evaluation_od_train(x, y, data_name, model_name) 
146 |     else: 
147 |         g_truth_df = pd.read_csv(path1) 
148 | 
149 |     ano_idx = np.where(y == 1)[0] 
150 |     dim = x.shape[1] 
151 | 
152 |     auroc_list = np.zeros(len(ano_idx)) 
153 |     aupr_list = np.zeros(len(ano_idx)) 
154 |     for ii, ano in enumerate(ano_idx): 
155 |         score = feature_weight[ii] 
156 | 
157 |         # ground_truth metrics 
158 |         gt_subspace_str = g_truth_df.loc[g_truth_df["ano_idx"] == ano]["exp_subspace"].values[0] 
159 |         gt_subspace = ast.literal_eval(gt_subspace_str) 
160 |         gt = np.zeros(dim, dtype=int) 
161 |         gt[gt_subspace] = 1 
162 | 
163 |         if len(gt_subspace) == dim: 
164 |             auroc_list[ii] = 1 
165 |             aupr_list[ii] = 1 
166 |         else: 
167 |             precision, recall, _ = metrics.precision_recall_curve(gt, score) 
168 |             aupr_list[ii] = metrics.auc(recall, precision) 
169 |             auroc_list[ii] = metrics.roc_auc_score(gt, score) 
170 | 
171 |     return aupr_list.mean(), auroc_list.mean() 
172 | 
173 | 
174 | 
175 | 
176 | 
177 | 
178 | 
-------------------------------------------------------------------------------- 
/model_aton/ATON.py: 
-------------------------------------------------------------------------------- 
1 | """ 
2 | This script implements an outlier interpretation method of the following paper: 
3 | "Beyond Outlier Detection: Outlier Interpretation by Attention-Guided Triplet Deviation Network". in WWW'21.
4 | @ Author: Hongzuo Xu 5 | @ email: hongzuo.xu@gmail.com or leogarcia@126.com or xuhongzuo13@nudt.edu.cn 6 | """ 7 | 8 | from pyod.models import lscp 9 | import numpy as np 10 | import time, math 11 | import torch 12 | import torch.utils.data as Data 13 | import torch.optim as optim 14 | import torch.nn.functional as F 15 | 16 | from torch.optim import lr_scheduler 17 | from sklearn.neighbors import NearestNeighbors 18 | from sklearn import metrics 19 | from tqdm import tqdm 20 | from model_aton.utils import EarlyStopping, min_max_normalize 21 | 22 | from model_aton.datasets import MyHardSingleTripletSelector 23 | from model_aton.datasets import SingleTripletDataset 24 | from model_aton.networks import ATONnet, AttentionNet 25 | from model_aton.networks import MyLoss 26 | 27 | 28 | class ATON: 29 | def __init__(self, nbrs_num=30, rand_num=30, alpha1=0.8, alpha2=0.2, 30 | n_epoch=10, batch_size=64, lr=0.1, n_linear=64, margin=2., 31 | verbose=True, gpu=True): 32 | self.verbose = verbose 33 | 34 | self.x = None 35 | self.y = None 36 | self.ano_idx = None 37 | self.dim = None 38 | 39 | # a list of normal nbr of each anomaly 40 | self.normal_nbr_indices = [] 41 | 42 | cuda = torch.cuda.is_available() 43 | self.device = torch.device("cuda" if cuda and gpu else "cpu") 44 | if cuda: 45 | torch.cuda.set_device(0) 46 | print("device:", self.device) 47 | 48 | self.nbrs_num = nbrs_num 49 | self.rand_num = rand_num 50 | self.alpha1 = alpha1 51 | self.alpha2 = alpha2 52 | 53 | self.n_epoch = n_epoch 54 | self.batch_size = batch_size 55 | self.lr = lr 56 | self.n_linear = n_linear 57 | self.margin = margin 58 | return 59 | 60 | def fit(self, x, y): 61 | device = self.device 62 | 63 | self.dim = x.shape[1] 64 | x = min_max_normalize(x) 65 | self.ano_idx = np.where(y == 1)[0] 66 | 67 | self.x = torch.tensor(x, dtype=torch.float32).to(device) 68 | self.y = torch.tensor(y, dtype=torch.int64).to(device) 69 | self.prepare_nbrs() 70 | 71 | # train model for each anomaly 72 | attn_lst, W_lst = [], [] 73 | if self.verbose: 74 | iterator = range(len(self.ano_idx)) 75 | else: 76 | iterator = tqdm(range(len(self.ano_idx))) 77 | for ii in iterator: 78 | idx = self.ano_idx[ii] 79 | 80 | s_t = time.time() 81 | attn, W = self.interpret_ano(ii) 82 | attn_lst.append(attn) 83 | W_lst.append(W) 84 | 85 | if self.verbose: 86 | print("Ano_id:[{}], ({}/{}) \t time: {:.2f}s\n".format( 87 | idx, (ii + 1), len(self.ano_idx), 88 | (time.time() - s_t))) 89 | 90 | fea_weight_lst = [] 91 | for ii, idx in enumerate(self.ano_idx): 92 | attn, w = attn_lst[ii], W_lst[ii] 93 | fea_weight = np.zeros(self.dim) 94 | 95 | # attention (linear space) + w --> feature weight (original space) 96 | for j in range(len(attn)): 97 | fea_weight += attn[j] * abs(w[j]) 98 | fea_weight_lst.append(fea_weight) 99 | return fea_weight_lst 100 | 101 | def interpret_ano(self, ii): 102 | idx = self.ano_idx[ii] 103 | device = self.device 104 | dim = self.dim 105 | 106 | nbr_indices = self.normal_nbr_indices[ii] 107 | data_loader, test_loader = self.prepare_triplets(idx, nbr_indices) 108 | n_linear = self.n_linear 109 | attn_net = AttentionNet(in_feature=3 * n_linear, n_hidden=int(1.5 * n_linear), out_feature=n_linear) 110 | model = ATONnet(attn_net=attn_net, n_feature=dim, n_linear=n_linear) 111 | model.to(device) 112 | 113 | optimizer = optim.Adam(model.parameters(), lr=self.lr, weight_decay=1e-2) 114 | criterion = MyLoss(alpha1=self.alpha1, alpha2=self.alpha2, margin=self.margin) 115 | 116 | scheduler = lr_scheduler.StepLR(optimizer, 5, gamma=0.1) 117 | 
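# training loop below: minimize the triplet deviation loss; the scheduler decays
# the learning rate 10x every 5 epochs, and EarlyStopping (patience=3) halts
# training once the loss stops improving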
early_stp = EarlyStopping(patience=3, verbose=False) 118 | 119 | for epoch in range(self.n_epoch): 120 | model.train() 121 | total_loss = 0 122 | total_dis = 0 123 | es_time = time.time() 124 | 125 | batch_cnt = 0 126 | for anchor, pos, neg in data_loader: 127 | anchor, pos, neg = anchor.to(device), pos.to(device), neg.to(device) 128 | embed_anchor, embed_pos, embed_neg, attn, dis = model(anchor, pos, neg) 129 | loss = criterion(embed_anchor, embed_pos, embed_neg, dis) 130 | 131 | total_loss += loss 132 | total_dis += dis.mean() 133 | 134 | optimizer.zero_grad() 135 | loss.backward() 136 | optimizer.step() 137 | batch_cnt += 1 138 | 139 | train_loss = total_loss / batch_cnt 140 | est = time.time() - es_time 141 | 142 | if self.verbose and (epoch + 1) % 1 == 0: 143 | message = 'Epoch: [{:02}/{:02}] loss: {:.4f} Time: {:.2f}s'.format(epoch + 1, self.n_epoch, 144 | train_loss, est) 145 | print(message) 146 | scheduler.step() 147 | 148 | early_stp(train_loss, model) 149 | if early_stp.early_stop: 150 | model.load_state_dict(torch.load(early_stp.path)) 151 | if self.verbose: 152 | print("early stopping") 153 | break 154 | 155 | # distill W and attn from network 156 | for anchor, pos, neg in test_loader: 157 | model.eval() 158 | anchor, pos, neg = anchor.to(device), pos.to(device), neg.to(device) 159 | _, _, _, attn, _ = model(anchor, pos, neg) 160 | 161 | attn_avg = torch.mean(attn, dim=0) 162 | attn_avg = attn_avg.data.cpu().numpy() 163 | W = model.linear.weight.data.cpu().numpy() 164 | return attn_avg, W 165 | 166 | def prepare_triplets(self, idx, nbr_indices): 167 | x = self.x 168 | y = self.y 169 | selector = MyHardSingleTripletSelector(nbrs_num=self.nbrs_num, rand_num=self.rand_num, 170 | nbr_indices=nbr_indices) 171 | dataset = SingleTripletDataset(idx, x, y, triplets_selector=selector) 172 | data_loader = Data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True) 173 | test_loader = Data.DataLoader(dataset, batch_size=len(dataset)) 174 | return data_loader, test_loader 175 | 176 | def prepare_nbrs(self): 177 | x = self.x.cpu().data.numpy() 178 | y = self.y.cpu().data.numpy() 179 | 180 | anom_idx = np.where(y == 1)[0] 181 | x_anom = x[anom_idx] 182 | noml_idx = np.where(y == 0)[0] 183 | x_noml = x[noml_idx] 184 | n_neighbors = self.nbrs_num 185 | 186 | nbrs_local = NearestNeighbors(n_neighbors=n_neighbors).fit(x_noml) 187 | tmp_indices = nbrs_local.kneighbors(x_anom)[1] 188 | 189 | for idx in tmp_indices: 190 | nbr_indices = noml_idx[idx] 191 | self.normal_nbr_indices.append(nbr_indices) 192 | return 193 | -------------------------------------------------------------------------------- /model_aton/ATON_ablation3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time, math 3 | import torch 4 | import torch.utils.data as Data 5 | import torch.optim as optim 6 | import torch.nn.functional as F 7 | 8 | from torch.optim import lr_scheduler 9 | from sklearn import metrics 10 | from tqdm import tqdm 11 | from model_aton.utils import EarlyStopping, min_max_normalize 12 | 13 | from model_aton.datasets import MyHardSingleSelectorClf, SingleDataset 14 | from model_aton.networks import ATONabla3net, AttentionNet, ClassificationNet 15 | import warnings 16 | warnings.filterwarnings("ignore") 17 | 18 | 19 | class ATONabla3: 20 | def __init__(self, nbrs_num=30, rand_num=30, 21 | n_epoch=10, batch_size=64, lr=0.1, n_linear=64, margin=2., 22 | verbose=True, gpu=True): 23 | self.verbose = verbose 24 | 25 | self.x = None 26 | self.y 
= None 27 | self.ano_idx = None 28 | self.dim = None 29 | 30 | self.reason_map = {} 31 | 32 | cuda = torch.cuda.is_available() 33 | self.device = torch.device("cuda" if cuda and gpu else "cpu") 34 | if cuda: 35 | torch.cuda.set_device(0) 36 | print("device:", self.device) 37 | 38 | self.nbrs_num = nbrs_num 39 | self.rand_num = rand_num 40 | 41 | self.n_epoch = n_epoch 42 | self.batch_size = batch_size 43 | self.lr = lr 44 | self.n_linear = n_linear 45 | self.margin = margin 46 | return 47 | 48 | def fit(self, x, y): 49 | device = self.device 50 | self.dim = x.shape[1] 51 | x = min_max_normalize(x) 52 | self.ano_idx = np.where(y == 1)[0] 53 | 54 | self.x = torch.tensor(x, dtype=torch.float32).to(device) 55 | self.y = torch.tensor(y, dtype=torch.int64).to(device) 56 | 57 | # train model for each anomaly 58 | attn_lst, W_lst = [], [] 59 | if self.verbose: 60 | iterator = range(len(self.ano_idx)) 61 | else: 62 | iterator = tqdm(range(len(self.ano_idx))) 63 | for ii in iterator: 64 | idx = self.ano_idx[ii] 65 | 66 | s_t = time.time() 67 | attn, W = self.interpret_ano(idx) 68 | attn_lst.append(attn) 69 | W_lst.append(W) 70 | 71 | if self.verbose: 72 | print("ano_idx [{} ({})] attn: {}".format(ii, idx, attn)) 73 | print("Ano_id:[{}], ({}/{}) \t time: {:.2f}s\n".format( 74 | idx, (ii + 1), len(self.ano_idx), 75 | (time.time() - s_t))) 76 | 77 | fea_weight_lst = [] 78 | for ii, idx in enumerate(self.ano_idx): 79 | attn, w = attn_lst[ii], W_lst[ii] 80 | fea_weight = np.zeros(self.dim) 81 | 82 | # attention (linear space) + w --> feature weight (original space) 83 | for j in range(len(attn)): 84 | fea_weight += attn[j] * abs(w[j]) 85 | fea_weight_lst.append(fea_weight) 86 | return fea_weight_lst 87 | 88 | def interpret_ano(self, idx): 89 | device = self.device 90 | dim = self.dim 91 | 92 | data_loader, test_loader = self.prepare_triplets(idx) 93 | n_linear = self.n_linear 94 | attn_net = AttentionNet(in_feature=n_linear, n_hidden=int(1.5 * n_linear), out_feature=n_linear) 95 | clf_net = ClassificationNet(n_feature=n_linear) 96 | 97 | model = ATONabla3net(attn_net=attn_net, clf_net=clf_net, n_feature=dim, n_linear=n_linear) 98 | model.to(device) 99 | 100 | optimizer = optim.Adam(model.parameters(), lr=self.lr, weight_decay=1e-2) 101 | criterion_cel = torch.nn.CrossEntropyLoss() 102 | 103 | scheduler = lr_scheduler.StepLR(optimizer, 5, gamma=0.1) 104 | early_stp = EarlyStopping(patience=3, verbose=False) 105 | 106 | for epoch in range(self.n_epoch): 107 | model.train() 108 | total_loss = 0 109 | total_acc = 0 110 | es_time = time.time() 111 | 112 | batch_cnt = 0 113 | for batch_x, batch_y in data_loader: 114 | batch_x, batch_y = batch_x.to(device), batch_y.to(device) 115 | 116 | clf_out, attn = model(batch_x) 117 | loss = criterion_cel(clf_out, batch_y) 118 | 119 | _, y_pred = torch.max(F.softmax(clf_out, dim=1).data.cpu(), 1) 120 | clf_acc = metrics.accuracy_score(batch_y.cpu().data.numpy(), y_pred.cpu().data.numpy()) 121 | 122 | total_loss += loss 123 | total_acc += clf_acc 124 | 125 | optimizer.zero_grad() 126 | loss.backward() 127 | optimizer.step() 128 | batch_cnt += 1 129 | 130 | train_loss = total_loss / batch_cnt 131 | train_acc = total_acc / batch_cnt 132 | est = time.time() - es_time 133 | 134 | if self.verbose and (epoch + 1) % 1 == 0: 135 | print('Epoch: [{:02}/{:02}] loss: {:.4f} acc: {:.4f} Time: {:.2f}s' 136 | .format(epoch + 1, self.n_epoch, train_loss, train_acc, est)) 137 | scheduler.step() 138 | 139 | early_stp(train_loss, model) 140 | if early_stp.early_stop: 141 | 
model.load_state_dict(torch.load(early_stp.path)) 142 | if self.verbose: 143 | print("early stopping") 144 | break 145 | 146 | for x, target in test_loader: 147 | model.eval() 148 | x = x.to(device) 149 | _, attn = model(x) 150 | 151 | attn_avg = torch.mean(attn, dim=0) 152 | attn_avg = attn_avg.data.cpu().numpy() 153 | W = model.linear.weight.data.cpu().numpy() 154 | return attn_avg, W 155 | 156 | def prepare_triplets(self, idx): 157 | x = self.x 158 | y = self.y 159 | 160 | selector = MyHardSingleSelectorClf(nbrs_num=self.nbrs_num, rand_num=self.rand_num) 161 | dataset = SingleDataset(idx, x, y, data_selector=selector) 162 | 163 | data_loader = Data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True) 164 | test_loader = Data.DataLoader(dataset, batch_size=len(dataset)) 165 | return data_loader, test_loader 166 | 167 | def weight2subspace(self, weight, r=0.7, num=-1): 168 | threshold = r * np.sum(weight) 169 | tmp_s = 0 170 | exp_subspace = [] 171 | sorted_idx1 = np.argsort(weight) 172 | sorted_idx = [sorted_idx1[self.dim - i -1] for i in range(self.dim)] 173 | if num != -1: 174 | exp_subspace = sorted_idx[:num] 175 | exp_subspace = list(np.sort(exp_subspace)) 176 | return exp_subspace 177 | 178 | for idx in sorted_idx: 179 | tmp_s += weight[idx] 180 | exp_subspace.append(idx) 181 | if tmp_s >= threshold: 182 | break 183 | exp_subspace = list(np.sort(exp_subspace)) 184 | return exp_subspace 185 | 186 | def weight2subspace_pn(self, weight): 187 | exp_subspace = [] 188 | for i in range(len(weight)): 189 | if weight[i] > 0: 190 | exp_subspace.append(i) 191 | if len(exp_subspace) == 0: 192 | exp_subspace = np.arange(len(weight)) 193 | exp_subspace = list(np.sort(exp_subspace)) 194 | return exp_subspace 195 | 196 | def get_exp_subspace(self, fea_weight_lst, w2s_ratio, real_exp_len=None): 197 | exp_subspace_lst = [] 198 | for ii, idx in enumerate(self.ano_idx): 199 | fea_weight = fea_weight_lst[ii] 200 | if w2s_ratio == "real_len": 201 | exp_subspace_lst.append(self.weight2subspace(fea_weight, num=real_exp_len[ii])) 202 | elif w2s_ratio == "auto": 203 | r = math.sqrt(2 / self.dim) 204 | exp_subspace_lst.append(self.weight2subspace(fea_weight, r=r)) 205 | elif w2s_ratio == "pn": 206 | exp_subspace_lst.append(self.weight2subspace_pn(fea_weight)) 207 | else: 208 | exp_subspace_lst.append(self.weight2subspace(fea_weight, r=w2s_ratio)) 209 | return exp_subspace_lst 210 | -------------------------------------------------------------------------------- /model_aton/datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script implements an outlier interpretation method of the following paper: 3 | "Beyond Outlier Detection: Outlier Interpretation by Attention-Guided Triplet Deviation Network". in WWW'21. 
4 | @ Author: Hongzuo Xu 5 | @ email: hongzuo.xu@gmail.com or leogarcia@126.com or xuhongzuo13@nudt.edu.cn 6 | """ 7 | 8 | 9 | import numpy as np 10 | import torch 11 | from torch.utils.data import Dataset 12 | from sklearn.neighbors import NearestNeighbors 13 | 14 | 15 | class SingleTripletDataset(Dataset): 16 | def __init__(self, anom_idx, x, y, triplets_selector, transform=None): 17 | self.transform = transform 18 | self.data = x 19 | self.triplets = triplets_selector.get_triplets(anom_idx, x, y) 20 | 21 | def __getitem__(self, index): 22 | a_idx, p_idx, n_idx = self.triplets[index] 23 | anchor, positive, negative = self.data[a_idx], self.data[p_idx], self.data[n_idx] 24 | if self.transform is not None: 25 | anchor = self.transform(anchor) 26 | positive = self.transform(positive) 27 | negative = self.transform(negative) 28 | return anchor, positive, negative 29 | 30 | def __len__(self): 31 | return len(self.triplets) 32 | 33 | 34 | class SingleDataset(Dataset): 35 | def __init__(self, anom_idx, x, y, data_selector, transform=None): 36 | self.transform = transform 37 | self.selected_data = data_selector.get_data(anom_idx, x, y) 38 | 39 | def __getitem__(self, index): 40 | data = self.selected_data[0][index] 41 | target = self.selected_data[1][index] 42 | if self.transform is not None: 43 | data = self.transform(data) 44 | return data, target 45 | 46 | def __len__(self): 47 | return len(self.selected_data[0]) 48 | 49 | 50 | class SingleTripletDatasetClf(Dataset): 51 | def __init__(self, anom_idx, x, y, triplets_selector, transform=None): 52 | self.transform = transform 53 | self.data = x 54 | self.triplets, self.targets = triplets_selector.get_triplets(anom_idx, x, y) 55 | 56 | def __getitem__(self, index): 57 | a_idx, p_idx, n_idx = self.triplets[index] 58 | a_target, p_target, n_target = self.targets[index] 59 | anchor, positive, negative = self.data[a_idx], self.data[p_idx], self.data[n_idx] 60 | if self.transform is not None: 61 | anchor = self.transform(anchor) 62 | positive = self.transform(positive) 63 | negative = self.transform(negative) 64 | return anchor, positive, negative, a_target, p_target, n_target 65 | 66 | def __len__(self): 67 | return len(self.triplets) 68 | 69 | 70 | class MyHardSingleTripletSelector: 71 | def __init__(self, nbrs_num, rand_num, nbr_indices): 72 | self.x = None 73 | self.y = None 74 | self.nbrs_num = nbrs_num 75 | self.rand_num = rand_num 76 | self.nbr_indices = nbr_indices 77 | 78 | def get_triplets(self, anom_idx, x, y, normal_label=0): 79 | self.x = x.cpu().data.numpy() 80 | self.y = y.cpu().data.numpy() 81 | 82 | # anom_x = self.x[anom_idx] 83 | # x_noml = self.x[noml_idx] 84 | # n_neighbors = self.nbrs_num 85 | # nbrs_local = NearestNeighbors(n_neighbors=n_neighbors).fit(x_noml) 86 | # nbr_indices = noml_idx[nbrs_local.kneighbors([anom_x])[1].flatten()] 87 | 88 | noml_idx = np.where(self.y == normal_label)[0] 89 | nbr_indices = self.nbr_indices 90 | rand_num = self.rand_num 91 | 92 | rand_canddt = np.setdiff1d(noml_idx, nbr_indices) 93 | rand_indices = np.random.choice(rand_canddt, rand_num, replace=False) 94 | 95 | triplets = [[anchor, positive, anom_idx] 96 | for anchor in rand_indices 97 | for positive in nbr_indices] 98 | return torch.LongTensor(np.array(triplets)) 99 | 100 | 101 | class MyHardSingleSelectorClf: 102 | def __init__(self, nbrs_num, rand_num): 103 | self.nbrs_num = nbrs_num 104 | self.rand_num = rand_num 105 | 106 | def get_data(self, anom_idx, x, y, normal_label=0): 107 | x = x.cpu().data.numpy() 108 | y = y.cpu().data.numpy() 
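        # build a small binary classification set around this anomaly: the anomaly
        # plus slightly perturbed copies of it (label 1) versus its nearest normal
        # neighbours and randomly sampled normal points (label 0)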
109 | 
110 |         anom_x = x[anom_idx] 
111 |         noml_idx = np.where(y == normal_label)[0] 
112 |         x_noml = x[noml_idx] 
113 | 
114 |         nbrs_local = NearestNeighbors(n_neighbors=self.nbrs_num).fit(x_noml) 
115 |         nbr_indices = noml_idx[nbrs_local.kneighbors([anom_x])[1].flatten()] 
116 |         rand_canddt = np.setdiff1d(noml_idx, nbr_indices) 
117 |         rand_indices = np.random.choice(rand_canddt, self.rand_num, replace=False) 
118 | 
119 |         # perturbation to augment 
120 |         dim = x.shape[1] 
121 |         anom_lst = [] 
122 |         anom_lst.append(anom_x) 
123 |         for i in range(self.rand_num + self.nbrs_num - 1): 
124 |             new_anom_x = anom_x.copy() 
125 |             choose_f = np.random.choice(np.arange(dim), 3) 
126 |             for a in choose_f: 
127 |                 new_anom_x[a] = anom_x[a] * 1.01 
128 |             anom_lst.append(new_anom_x) 
129 | 
130 |         data_idx = np.hstack([rand_indices, nbr_indices]) 
131 |         norm_data = x[data_idx] 
132 |         data = np.vstack([np.array(anom_lst), norm_data]) 
133 |         target = np.hstack([np.ones(len(anom_lst), dtype=int), np.zeros(len(rand_indices), dtype=int), np.zeros(len(nbr_indices), dtype=int)])  # one label per anomaly copy, zero per normal sample 
134 | 
135 |         return torch.FloatTensor(data), torch.LongTensor(target) 
136 | 
137 | 
138 | class MyHardSingleTripletSelectorClf: 
139 |     def __init__(self, nbrs_num, rand_num): 
140 |         self.x = None 
141 |         self.y = None 
142 |         self.nbrs_num = nbrs_num 
143 |         self.rand_num = rand_num 
144 | 
145 |     def get_triplets(self, anom_idx, x, y, normal_label=0): 
146 |         self.x = x.cpu().data.numpy() 
147 |         self.y = y.cpu().data.numpy() 
148 | 
149 |         anom_x = self.x[anom_idx] 
150 |         noml_idx = np.where(self.y == normal_label)[0] 
151 |         x_noml = self.x[noml_idx] 
152 |         n_neighbors = self.nbrs_num 
153 |         rand_num = self.rand_num 
154 | 
155 |         nbrs_local = NearestNeighbors(n_neighbors=n_neighbors).fit(x_noml) 
156 | 
157 |         nbr_indices = noml_idx[nbrs_local.kneighbors([anom_x])[1].flatten()] 
158 |         # nbr_dist = nbrs_local.kneighbors([anom_x])[0].flatten() 
159 | 
160 |         rand_canddt = np.setdiff1d(noml_idx, nbr_indices) 
161 |         rand_indices = np.random.choice(rand_canddt, rand_num, replace=False) 
162 | 
163 |         triplets = [[anchor, positive, anom_idx] 
164 |                     for anchor in rand_indices 
165 |                     for positive in nbr_indices] 
166 | 
167 |         # print("Generate triplets Num: [%d]" % len(triplets)) 
168 |         target = [[0, 0, 1]] * len(triplets) 
169 | 
170 |         return torch.LongTensor(np.array(triplets)), torch.LongTensor(np.array(target)) 
171 | 
172 | 
173 | class MyHardSingleTripletSelector2: 
174 |     def __init__(self, nbrs_num, rand_num): 
175 |         self.x = None 
176 |         self.y = None 
177 |         self.nbrs_num = nbrs_num 
178 |         self.rand_num = rand_num 
179 | 
180 |     def get_triplets(self, anom_idx, x, y, normal_label=0): 
181 |         self.x = x.cpu().data.numpy() 
182 |         self.y = y.cpu().data.numpy() 
183 | 
184 |         n_neighbors = self.nbrs_num 
185 |         rand_num = self.rand_num 
186 | 
187 |         anom_x = self.x[anom_idx] 
188 | 
189 |         anom_indices = np.where(self.y != normal_label)[0] 
190 |         noml_indices = np.where(self.y == normal_label)[0] 
191 |         noml_x = self.x[noml_indices] 
192 | 
193 |         nbrs_local = NearestNeighbors(n_neighbors=n_neighbors).fit(noml_x) 
194 |         nbr_indices = noml_indices[nbrs_local.kneighbors([anom_x])[1].flatten()] 
195 |         # nbr_dist = nbrs_local.kneighbors([anom_x])[0].flatten() 
196 | 
197 |         rand_canddt_nor = np.setdiff1d(noml_indices, nbr_indices) 
198 |         rand_nor_indices = np.random.choice(rand_canddt_nor, rand_num, replace=False) 
199 | 
200 |         triplets1 = [[anchor, positive, anom_idx] 
201 |                      for anchor in rand_nor_indices 
202 |                      for positive in nbr_indices] 
203 | 
204 |         rand_canddt_ano = np.setdiff1d(anom_indices, anom_idx) 
205 |         if len(rand_canddt_ano) < rand_num: 
206 |             rand_ano_indices =
rand_canddt_ano 
207 |         else: 
208 |             rand_ano_indices = np.random.choice(rand_canddt_ano, rand_num, replace=False) 
209 | 
210 |         triplets2 = [[anchor, anom_idx, negative] 
211 |                      for anchor in rand_ano_indices 
212 |                      for negative in nbr_indices] 
213 |         triplets = triplets1 + triplets2 
214 | 
215 |         # print("Generate triplets Num: [%d]" % len(triplets)) 
216 |         target1 = [[0, 0, 1]] * len(triplets1) 
217 |         target2 = [[1, 1, 0]] * len(triplets2) 
218 |         target = target1 + target2 
219 | 
220 |         return torch.LongTensor(np.array(triplets)), torch.LongTensor(np.array(target)) 
221 | 
222 | 
-------------------------------------------------------------------------------- 
/main.py: 
-------------------------------------------------------------------------------- 
1 | """ 
2 | this script runs the outlier interpretation methods ATON, COIN, SHAP, LIME, and IntGrad; 
3 | these methods use feature weights as interpretations 
4 | 
5 | @ Author: Hongzuo Xu 
6 | @ email: hongzuo.xu@gmail.com or leogarcia@126.com or xuhongzuo13@nudt.edu.cn 
7 | """ 
8 | 
9 | import os 
10 | import ast 
11 | import glob 
12 | import time, datetime 
13 | import argparse 
14 | import pandas as pd 
15 | import numpy as np 
16 | from prettytable import PrettyTable 
17 | 
18 | from model_aton.ATON import ATON 
19 | from model_aton.ATON_ablation import ATONabla 
20 | from model_aton.ATON_ablation2 import ATONabla2 
21 | from model_aton.ATON_ablation3 import ATONabla3 
22 | from model_iml.SHAP import SHAP 
23 | from model_iml.LIME import LIME 
24 | from model_coin.COIN import COIN 
25 | # from model_iml.IntGrad import IntGrad 
26 | 
27 | from utils import model_utils 
28 | from utils.eval_print_utils import print_eval_runs 
29 | from eval.evaluation_od import evaluation_od, evaluation_od_auc 
30 | from config import root, eva_root, get_parser 
31 | import warnings 
32 | 
33 | warnings.filterwarnings("ignore") 
34 | 
35 | # ------------------- parser ----------------- # 
36 | algorithm_name = "aton" 
37 | parser = argparse.ArgumentParser() 
38 | parser.add_argument('--gpu', type=ast.literal_eval, default=True) 
39 | parser.add_argument('--eval', type=ast.literal_eval, default=True, help='Evaluate the interpretation results or not') 
40 | parser.add_argument('--path', type=str, default="data/", help='the input data path, can be a single csv ' 
41 |                                                               'or a data folder') 
42 | parser.add_argument('--w2s_ratio', type=str, default='real_len', help='\'real-len\', \'auto\', \'pn\', or a ratio.') 
43 | parser.add_argument('--runs', type=int, default=1) 
44 | parser.add_argument('--record_name', type=str, default='') 
45 | parser = get_parser(algorithm_name, parser) 
46 | args = parser.parse_args() 
47 | 
48 | input_root_list = [root + args.path] 
49 | w2s_ratio = args.w2s_ratio 
50 | od_eval_model = ["iforest", "copod", "hbos"]  # we obtain ground-truth annotations using three outlier detection methods 
51 | runs = args.runs 
52 | record_name = args.record_name 
53 | 
54 | # ------------------- record ----------------- # 
55 | if not os.path.exists("record/" + algorithm_name): 
56 |     os.makedirs("record/" + algorithm_name) 
57 | if not os.path.exists("checkpoints"): 
58 |     os.makedirs("checkpoints/") 
59 | record_path = "record/" + algorithm_name + "/zout." + \ 
60 |                algorithm_name + "."
+ record_name + ".txt" 
61 | doc = open(record_path, 'a') 
62 | tab1 = PrettyTable(["parameter", "value"]) 
63 | tab1.add_row(["@ data", str(input_root_list)]) 
64 | tab1.add_row(["@ algorithm_name", str(algorithm_name)]) 
65 | tab1.add_row(["@ w2s_ratio", str(w2s_ratio)]) 
66 | tab1.add_row(["@ runs", str(runs)]) 
67 | tab1.add_row(["@ od_eval_model", str(od_eval_model)]) 
68 | tab1.add_row(["@ start_time", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")]) 
69 | for k in list(vars(args).keys()): 
70 |     tab1.add_row([k, vars(args)[k]]) 
71 | print(tab1, file=doc) 
72 | print(tab1) 
73 | doc.close() 
74 | time.sleep(0.2) 
75 | 
76 | 
77 | def main(path, run_times): 
78 |     print("eval:", args.eval) 
79 |     print("gpu :", args.gpu) 
80 |     data_name = path.split("/")[-1].split(".")[0] 
81 | 
82 |     # remove the prefix index number of the data set name so that we can match the annotation file 
83 |     data_name = data_name[3:] 
84 | 
85 |     print("# ------------------ %s ------------------ # " % data_name) 
86 | 
87 |     df = pd.read_csv(path) 
88 |     X = df.values[:, :-1] 
89 |     y = np.array(df.values[:, -1], dtype=int) 
90 | 
91 |     # get the real length of the ground-truth interpretation when w2s_ratio is 'real_len' 
92 |     real_len_lst = [] 
93 |     runs_metric_lst = [[] for k in range(len(od_eval_model))] 
94 |     if args.eval and args.w2s_ratio == "real_len": 
95 |         gt_lst = [] 
96 |         for eval_m in od_eval_model: 
97 |             folder = eva_root + "data_od_evaluation/" 
98 |             gt_path = os.path.join(folder, data_name + "_gt_" + eval_m + ".csv") 
99 |             if len(glob.glob(gt_path)) == 0: 
100 |                 raise FileNotFoundError("no such gt file:" + gt_path) 
101 |             gt_str = pd.read_csv(gt_path)["exp_subspace"].values 
102 |             gt_lst.append([ast.literal_eval(gtt) for gtt in gt_str]) 
103 | 
104 |         for gt in gt_lst: 
105 |             real_len_lst.append([len(gtt) for gtt in gt]) 
106 | 
107 |     t = 0 
108 |     for i in range(run_times): 
109 |         print("runs: %d" % (i + 1)) 
110 |         time1 = time.time() 
111 | 
112 |         # ------------ run the chosen algorithm to get interpretation (feature weight) ------------- # 
113 |         fea_weight_lst = run_model(algorithm_name, X, y) 
114 | 
115 |         # ------------------- transfer feature weight to subspace ----------------- # 
116 |         subspace_outputs = [] 
117 |         if args.eval: 
118 |             for j in range(len(od_eval_model)): 
119 |                 if w2s_ratio == "real_len": 
120 |                     real_len = real_len_lst[j] 
121 |                     subspace = model_utils.get_exp_subspace(fea_weight_lst, w2s_ratio=w2s_ratio, real_exp_len=real_len) 
122 |                 else: 
123 |                     subspace = model_utils.get_exp_subspace(fea_weight_lst, w2s_ratio=w2s_ratio) 
124 |                 subspace_outputs.append(subspace) 
125 | 
126 |         t = time.time() - time1 
127 | 
128 |         # ---------------------- evaluation -------------------------- # 
129 |         if args.eval: 
130 |             for mm, eval_model in enumerate(od_eval_model): 
131 |                 p, r, j = evaluation_od(subspace_outputs[mm], X, y, data_name, eval_model) 
132 |                 aupr, auroc = evaluation_od_auc(fea_weight_lst, X, y, data_name, model_name=eval_model) 
133 |                 metric_lst = [p, r, j, aupr, auroc, t]  # order matches the unpacking in print_eval_runs 
134 |                 runs_metric_lst[mm].append(metric_lst) 
135 |                 print("data: {}, eval_model: {}, {}".format(path.split("/")[-1].split(".")[0], eval_model, metric_lst)) 
136 | 
137 |     if args.eval: 
138 |         name = path.split("/")[-1].split(".")[0] 
139 |         for mm in range(len(od_eval_model)): 
140 |             txt = print_eval_runs(runs_metric_lst[mm], data_name=name, algo_name=algorithm_name) 
141 |             print(txt) 
142 | 
143 |             doc = open(record_path, 'a') 
144 |             print(txt, file=doc) 
145 |             doc.close() 
146 |     else: 
147 |         txt = data_name + "," + str(round(t, 2)) + "," + algorithm_name 
148 |         print(txt) 
149 |         doc =
open(record_path, 'a') 150 | print(txt, file=doc) 151 | doc.close() 152 | return 153 | 154 | 155 | def run_model(algorithm, X, y): 156 | if algorithm == "aton": 157 | model = ATON(verbose=False, gpu=args.gpu, 158 | nbrs_num=args.nbrs_num, rand_num=args.rand_num, 159 | alpha1=args.alpha1, alpha2=args.alpha2, 160 | n_epoch=args.n_epoch, batch_size=args.batch_size, lr=args.lr, 161 | n_linear=args.n_linear, margin=args.margin) 162 | fea_weight_lst = model.fit(X, y) 163 | 164 | elif algorithm == "aton_ablation": 165 | model = ATONabla(verbose=False, 166 | nbrs_num=args.nbrs_num, rand_num=args.rand_num, n_epoch=args.n_epoch, 167 | batch_size=args.batch_size, lr=args.lr, n_linear=args.n_linear, margin=args.margin) 168 | fea_weight_lst = model.fit(X, y) 169 | 170 | elif algorithm == "aton_ablation2": 171 | model = ATONabla2(verbose=False, 172 | nbrs_num=args.nbrs_num, rand_num=args.rand_num, n_epoch=args.n_epoch, 173 | batch_size=args.batch_size, lr=args.lr, margin=args.margin) 174 | fea_weight_lst = model.fit(X, y) 175 | 176 | elif algorithm == "aton_ablation3": 177 | model = ATONabla3(verbose=False, gpu=True, 178 | nbrs_num=args.nbrs_num, rand_num=args.rand_num, n_epoch=args.n_epoch, 179 | batch_size=args.batch_size, lr=args.lr, n_linear=args.n_linear, margin=args.margin) 180 | fea_weight_lst = model.fit(X, y) 181 | 182 | elif algorithm == "shap": 183 | model = SHAP(kernel=args.kernel, n_sample=args.n_sample, threshold=args.threshold) 184 | fea_weight_lst = model.fit(X, y) 185 | 186 | elif algorithm == "lime": 187 | model = LIME(discretize_continuous=args.discretize_continuous, discretizer=args.discretizer) 188 | fea_weight_lst = model.fit(X, y) 189 | 190 | # elif algorithm == "intgrad": 191 | # model = IntGrad(n_steps=args.n_steps, method=args.method) 192 | # fea_weight_lst = model.fit(X, y) 193 | 194 | elif algorithm == "coin": 195 | sgnf_prior = 1 # a scalar prior is expanded to a uniform attribute-significance vector inside COIN 196 | model = COIN(X, y, args.ratio_nbr, AUG=args.AUG, MIN_CLUSTER_SIZE=args.MIN_CLUSTER_SIZE, 197 | MAX_NUM_CLUSTER=args.MAX_NUM_CLUSTER, VAL_TIMES=args.VAL_TIMES, 198 | C_SVM=args.C_SVM, THRE_PS=args.THRE_PS, DEFK=args.DEFK) 199 | fea_weight_lst = model.fit(sgnf_prior) 200 | else: 201 | raise NotImplementedError("algorithm not implemented: %s" % algorithm) 202 | return fea_weight_lst 203 | 204 | 205 | if __name__ == '__main__': 206 | for input_root in input_root_list: 207 | if os.path.isdir(input_root): 208 | for file_name in sorted(os.listdir(input_root)): 209 | if file_name.endswith(".csv"): 210 | input_path = str(os.path.join(input_root, file_name)) 211 | main(input_path, runs) 212 | 213 | else: 214 | input_path = input_root 215 | main(input_path, runs) 216 | -------------------------------------------------------------------------------- /data/01-vertebral.csv: -------------------------------------------------------------------------------- 1 | A0,A1,A2,A3,A4,A5,class 2 | 63.03,22.55,39.61,40.48,98.67,-0.25,0.0 3 | 39.06,10.06,25.02,29.0,114.41,4.56,0.0 4 | 68.83,22.22,50.09,46.61,105.99,-3.53,0.0 5 | 69.3,24.65,44.31,44.64,101.87,11.21,0.0 6 | 49.71,9.65,28.32,40.06,108.17,7.92,0.0 7 | 40.25,13.92,25.12,26.33,130.33,2.23,0.0 8 | 53.43,15.86,37.17,37.57,120.57,5.99,0.0 9 | 45.37,10.76,29.04,34.61,117.27,-10.68,0.0 10 | 43.79,13.53,42.69,30.26,125.0,13.29,0.0 11 | 36.69,5.01,41.95,31.68,84.24,0.66,0.0 12 | 49.71,13.04,31.33,36.67,108.65,-7.83,0.0 13 | 31.23,17.72,15.5,13.52,120.06,0.5,0.0 14 | 48.92,19.96,40.26,28.95,119.32,8.03,0.0 15 | 53.57,20.46,33.1,33.11,110.97,7.04,0.0 16 | 57.3,24.19,47.0,33.11,116.81,5.77,0.0 17 | 
44.32,12.54,36.1,31.78,124.12,5.42,0.0 18 | 63.83,20.36,54.55,43.47,112.31,-0.62,0.0 19 | 31.28,3.14,32.56,28.13,129.01,3.62,0.0 20 | 38.7,13.44,31.0,25.25,123.16,1.43,0.0 21 | 41.73,12.25,30.12,29.48,116.59,-1.24,0.0 22 | 43.92,14.18,37.83,29.74,134.46,6.45,0.0 23 | 54.92,21.06,42.2,33.86,125.21,2.43,0.0 24 | 63.07,24.41,54.0,38.66,106.42,15.78,0.0 25 | 45.54,13.07,30.3,32.47,117.98,-4.99,0.0 26 | 36.13,22.76,29.0,13.37,115.58,-3.24,0.0 27 | 54.12,26.65,35.33,27.47,121.45,1.57,0.0 28 | 26.15,10.76,14.0,15.39,125.2,-10.09,0.0 29 | 43.58,16.51,47.0,27.07,109.27,8.99,0.0 30 | 44.55,21.93,26.79,22.62,111.07,2.65,0.0 31 | 66.88,24.89,49.28,41.99,113.48,-2.01,0.0 32 | 50.82,15.4,42.53,35.42,112.19,10.87,0.0 33 | 46.39,11.08,32.14,35.31,98.77,6.39,0.0 34 | 44.94,17.44,27.78,27.49,117.98,5.57,0.0 35 | 38.66,12.99,40.0,25.68,124.91,2.7,0.0 36 | 59.6,32.0,46.56,27.6,119.33,1.47,0.0 37 | 31.48,7.83,24.28,23.66,113.83,4.39,0.0 38 | 32.09,6.99,36.0,25.1,132.26,6.41,0.0 39 | 35.7,19.44,20.7,16.26,137.54,-0.26,0.0 40 | 55.84,28.85,47.69,27.0,123.31,2.81,0.0 41 | 52.42,19.01,35.87,33.41,116.56,1.69,0.0 42 | 35.49,11.7,15.59,23.79,106.94,-3.46,0.0 43 | 46.44,8.4,29.04,38.05,115.48,2.05,0.0 44 | 53.85,19.23,32.78,34.62,121.67,5.33,0.0 45 | 66.29,26.33,47.5,39.96,121.22,-0.8,0.0 46 | 56.03,16.3,62.28,39.73,114.02,-2.33,0.0 47 | 50.91,23.02,47.0,27.9,117.42,-2.53,0.0 48 | 48.33,22.23,36.18,26.1,117.38,6.48,0.0 49 | 41.35,16.58,30.71,24.78,113.27,-4.5,0.0 50 | 40.56,17.98,34.0,22.58,121.05,-1.54,0.0 51 | 41.77,17.9,20.03,23.87,118.36,2.06,0.0 52 | 55.29,20.44,34.0,34.85,115.88,3.56,0.0 53 | 74.43,41.56,27.7,32.88,107.95,5.0,0.0 54 | 50.21,29.76,36.1,20.45,128.29,5.74,0.0 55 | 30.15,11.92,34.0,18.23,112.68,11.46,0.0 56 | 41.17,17.32,33.47,23.85,116.38,-9.57,0.0 57 | 47.66,13.28,36.68,34.38,98.25,6.27,0.0 58 | 43.35,7.47,28.07,35.88,112.78,5.75,0.0 59 | 46.86,15.35,38.0,31.5,116.25,1.66,0.0 60 | 43.2,19.66,35.0,23.54,124.85,-2.92,0.0 61 | 48.11,14.93,35.56,33.18,124.06,7.95,0.0 62 | 74.38,32.05,78.77,42.32,143.56,56.13,0.0 63 | 89.68,32.7,83.13,56.98,129.96,92.03,0.0 64 | 44.53,9.43,52.0,35.1,134.71,29.11,0.0 65 | 77.69,21.38,64.43,56.31,114.82,26.93,0.0 66 | 76.15,21.94,82.96,54.21,123.93,10.43,0.0 67 | 83.93,41.29,62.0,42.65,115.01,26.59,0.0 68 | 78.49,22.18,60.0,56.31,118.53,27.38,0.0 69 | 75.65,19.34,64.15,56.31,95.9,69.55,0.0 70 | 72.08,18.95,51.0,53.13,114.21,1.01,0.0 71 | 58.6,-0.26,51.5,58.86,102.04,28.06,0.0 72 | 72.56,17.39,52.0,55.18,119.19,32.11,0.0 73 | 86.9,32.93,47.79,53.97,135.08,101.72,0.0 74 | 84.97,33.02,60.86,51.95,125.66,74.33,0.0 75 | 55.51,20.1,44.0,35.42,122.65,34.55,0.0 76 | 72.22,23.08,91.0,49.14,137.74,56.8,0.0 77 | 70.22,39.82,68.12,30.4,148.53,145.38,0.0 78 | 86.75,36.04,69.22,50.71,139.41,110.86,0.0 79 | 58.78,7.67,53.34,51.12,98.5,51.58,0.0 80 | 67.41,17.44,60.14,49.97,111.12,33.16,0.0 81 | 47.74,12.09,39.0,35.66,117.51,21.68,0.0 82 | 77.11,30.47,69.48,46.64,112.15,70.76,0.0 83 | 74.01,21.12,57.38,52.88,120.21,74.56,0.0 84 | 88.62,29.09,47.56,59.53,121.76,51.81,0.0 85 | 81.1,24.79,77.89,56.31,151.84,65.21,0.0 86 | 76.33,42.4,57.2,33.93,124.27,50.13,0.0 87 | 45.44,9.91,45.0,35.54,163.07,20.32,0.0 88 | 59.79,17.88,59.21,41.91,119.32,22.12,0.0 89 | 44.91,10.22,44.63,34.7,130.08,37.36,0.0 90 | 56.61,16.8,42.0,39.81,127.29,24.02,0.0 91 | 71.19,23.9,43.7,47.29,119.86,27.28,0.0 92 | 81.66,28.75,58.23,52.91,114.77,30.61,0.0 93 | 70.95,20.16,62.86,50.79,116.18,32.52,0.0 94 | 85.35,15.84,71.67,69.51,124.42,76.02,0.0 95 | 58.1,14.84,79.65,43.26,113.59,50.24,0.0 96 | 
94.17,15.38,67.71,78.79,114.89,53.26,0.0 97 | 57.52,33.65,50.91,23.88,140.98,148.75,0.0 98 | 96.66,19.46,90.21,77.2,120.67,64.08,0.0 99 | 74.72,19.76,82.74,54.96,109.36,33.31,0.0 100 | 77.66,22.43,93.89,55.22,123.06,61.21,0.0 101 | 58.52,13.92,41.47,44.6,115.51,30.39,0.0 102 | 84.59,30.36,65.48,54.22,108.01,25.12,0.0 103 | 79.94,18.77,63.31,61.16,114.79,38.54,0.0 104 | 70.4,13.47,61.2,56.93,102.34,25.54,0.0 105 | 49.78,6.47,53.0,43.32,110.86,25.34,0.0 106 | 77.41,29.4,63.23,48.01,118.45,93.56,0.0 107 | 65.01,27.6,50.95,37.41,116.58,7.02,0.0 108 | 65.01,9.84,57.74,55.18,94.74,49.7,0.0 109 | 78.43,33.43,76.28,45.0,138.55,77.16,0.0 110 | 63.17,6.33,63.0,56.84,110.64,42.61,0.0 111 | 68.61,15.08,63.01,53.53,123.43,39.5,0.0 112 | 63.9,13.71,62.12,50.19,114.13,41.42,0.0 113 | 85.0,29.61,83.35,55.39,126.91,71.32,0.0 114 | 42.02,-6.55,67.9,48.58,111.59,27.34,0.0 115 | 69.76,19.28,48.5,50.48,96.49,51.17,0.0 116 | 80.99,36.84,86.96,44.14,141.09,85.87,0.0 117 | 129.83,8.4,48.38,121.43,107.69,418.54,0.0 118 | 70.48,12.49,62.42,57.99,114.19,56.9,0.0 119 | 86.04,38.75,47.87,47.29,122.09,61.99,0.0 120 | 65.54,24.16,45.78,41.38,136.44,16.38,0.0 121 | 60.75,15.75,43.2,45.0,113.05,31.69,0.0 122 | 54.74,12.1,41.0,42.65,117.64,40.38,0.0 123 | 83.88,23.08,87.14,60.8,124.65,80.56,0.0 124 | 80.07,48.07,52.4,32.01,110.71,67.73,0.0 125 | 65.67,10.54,56.49,55.12,109.16,53.93,0.0 126 | 74.72,14.32,32.5,60.4,107.18,37.02,0.0 127 | 48.06,5.69,57.06,42.37,95.44,32.84,0.0 128 | 70.68,21.7,59.18,48.97,103.01,27.81,0.0 129 | 80.43,17.0,66.54,63.43,116.44,57.78,0.0 130 | 90.51,28.27,69.81,62.24,100.89,58.82,0.0 131 | 77.24,16.74,49.78,60.5,110.69,39.79,0.0 132 | 50.07,9.12,32.17,40.95,99.71,26.77,0.0 133 | 69.78,13.78,58.0,56.0,118.93,17.91,0.0 134 | 69.63,21.12,52.77,48.5,116.8,54.82,0.0 135 | 81.75,20.12,70.56,61.63,119.43,55.51,0.0 136 | 52.2,17.21,78.09,34.99,136.97,54.94,0.0 137 | 77.12,30.35,77.48,46.77,110.61,82.09,0.0 138 | 88.02,39.84,81.77,48.18,116.6,56.77,0.0 139 | 83.4,34.31,78.42,49.09,110.47,49.67,0.0 140 | 72.05,24.7,79.87,47.35,107.17,56.43,0.0 141 | 85.1,21.07,91.73,64.03,109.06,38.03,0.0 142 | 69.56,15.4,74.44,54.16,105.07,29.7,0.0 143 | 89.5,48.9,72.0,40.6,134.63,118.35,0.0 144 | 85.29,18.28,100.74,67.01,110.66,58.88,0.0 145 | 60.63,20.6,64.54,40.03,117.23,104.86,0.0 146 | 60.04,14.31,58.04,45.73,105.13,30.41,0.0 147 | 85.64,42.69,78.75,42.95,105.14,42.89,0.0 148 | 85.58,30.46,78.23,55.12,114.87,68.38,0.0 149 | 55.08,-3.76,56.0,58.84,109.92,31.77,0.0 150 | 65.76,9.83,50.82,55.92,104.39,39.31,0.0 151 | 79.25,23.94,40.8,55.3,98.62,36.71,0.0 152 | 81.11,20.69,60.69,60.42,94.02,40.51,0.0 153 | 48.03,3.97,58.34,44.06,125.35,35.0,0.0 154 | 63.4,14.12,48.14,49.29,111.92,31.78,0.0 155 | 57.29,15.15,64.0,42.14,116.74,30.34,0.0 156 | 41.19,5.79,42.87,35.39,103.35,27.66,0.0 157 | 66.8,14.55,72.08,52.25,82.46,41.69,0.0 158 | 79.48,26.73,70.65,52.74,118.59,61.7,0.0 159 | 44.22,1.51,46.11,42.71,108.63,42.81,0.0 160 | 57.04,0.35,49.2,56.69,103.05,52.17,0.0 161 | 64.27,12.51,68.7,51.77,95.25,39.41,0.0 162 | 92.03,35.39,77.42,56.63,115.72,58.06,0.0 163 | 67.26,7.19,51.7,60.07,97.8,42.14,0.0 164 | 118.14,38.45,50.84,79.7,81.02,74.04,0.0 165 | 115.92,37.52,76.8,78.41,104.7,81.2,0.0 166 | 53.94,9.31,43.1,44.64,124.4,25.08,0.0 167 | 83.7,20.27,77.11,63.43,125.48,69.28,0.0 168 | 56.99,6.87,57.01,50.12,109.98,36.81,0.0 169 | 72.34,16.42,59.87,55.92,70.08,12.07,0.0 170 | 95.38,24.82,95.16,70.56,89.31,57.66,0.0 171 | 44.25,1.1,38.0,43.15,98.27,23.91,0.0 172 | 64.81,15.17,58.84,49.64,111.68,21.41,0.0 173 | 
78.4,14.04,79.69,64.36,104.73,12.39,0.0 174 | 56.67,13.46,43.77,43.21,93.69,21.11,0.0 175 | 50.83,9.06,56.3,41.76,79.0,23.04,0.0 176 | 61.41,25.38,39.1,36.03,103.4,21.84,0.0 177 | 56.56,8.96,52.58,47.6,98.78,50.7,0.0 178 | 67.03,13.28,66.15,53.75,100.72,33.99,0.0 179 | 80.82,19.24,61.64,61.58,89.47,44.17,0.0 180 | 80.65,26.34,60.9,54.31,120.1,52.47,0.0 181 | 68.72,49.43,68.06,19.29,125.02,54.69,0.0 182 | 37.9,4.48,24.71,33.42,157.85,33.61,0.0 183 | 64.62,15.23,67.63,49.4,90.3,31.33,0.0 184 | 75.44,31.54,89.6,43.9,106.83,54.97,0.0 185 | 71.0,37.52,84.54,33.49,125.16,67.77,0.0 186 | 81.06,20.8,91.78,60.26,125.43,38.18,0.0 187 | 91.47,24.51,84.62,66.96,117.31,52.62,0.0 188 | 81.08,21.26,78.77,59.83,90.07,49.16,0.0 189 | 60.42,5.27,59.81,55.15,109.03,30.27,0.0 190 | 85.68,38.65,82.68,47.03,120.84,61.96,0.0 191 | 82.41,29.28,77.05,53.13,117.04,62.77,0.0 192 | 43.72,9.81,52.0,33.91,88.43,40.88,0.0 193 | 86.47,40.3,61.14,46.17,97.4,55.75,0.0 194 | 74.47,33.28,66.94,41.19,146.47,124.98,0.0 195 | 70.25,10.34,76.37,59.91,119.24,32.67,0.0 196 | 72.64,18.93,68.0,53.71,116.96,25.38,0.0 197 | 71.24,5.27,86.0,65.97,110.7,38.26,0.0 198 | 63.77,12.76,65.36,51.01,89.82,56.0,0.0 199 | 58.83,37.58,125.74,21.25,135.63,117.31,0.0 200 | 74.85,13.91,62.69,60.95,115.21,33.17,0.0 201 | 75.3,16.67,61.3,58.63,118.88,31.58,0.0 202 | 63.36,20.02,67.5,43.34,131.0,37.56,0.0 203 | 67.51,33.28,96.28,34.24,145.6,88.3,0.0 204 | 76.31,41.93,93.28,34.38,132.27,101.22,0.0 205 | 73.64,9.71,63.0,63.92,98.73,26.98,0.0 206 | 56.54,14.38,44.99,42.16,101.72,25.77,0.0 207 | 80.11,33.94,85.1,46.17,125.59,100.29,0.0 208 | 95.48,46.55,59.0,48.93,96.68,77.28,0.0 209 | 74.09,18.82,76.03,55.27,128.41,73.39,0.0 210 | 87.68,20.37,93.82,67.31,120.94,76.73,0.0 211 | 48.26,16.42,36.33,31.84,94.88,28.34,0.0 212 | 61.73,17.11,46.9,44.62,120.92,3.09,1.0 213 | 64.31,26.33,50.96,37.98,106.18,3.12,1.0 214 | 61.54,19.68,52.89,41.86,118.69,4.82,1.0 215 | 54.95,5.87,53.0,49.09,126.97,-0.63,1.0 216 | 48.8,18.02,52.0,30.78,139.15,10.44,1.0 217 | 40.41,-1.33,30.98,41.74,119.34,-6.17,1.0 218 | 53.94,20.72,29.22,33.22,114.37,-0.42,1.0 219 | 82.91,29.89,58.25,53.01,110.71,6.08,1.0 220 | 56.1,13.11,62.64,43.0,116.23,31.17,1.0 221 | 45.25,8.69,41.58,36.56,118.55,0.21,1.0 222 | 39.09,5.54,26.93,33.55,131.58,-0.76,1.0 223 | 49.0,13.11,51.87,35.88,126.4,0.54,1.0 224 | 67.54,14.66,58.0,52.88,123.63,25.97,1.0 225 | 54.75,9.75,48.0,45.0,123.04,8.24,1.0 226 | 54.6,21.49,29.36,33.11,118.34,-1.47,1.0 227 | 74.57,15.72,58.62,58.84,105.42,0.6,1.0 228 | 51.53,13.52,35.0,38.01,126.72,13.93,1.0 229 | 34.76,2.63,29.5,32.12,127.14,-0.46,1.0 230 | 38.13,6.56,50.45,31.57,132.11,6.34,1.0 231 | 43.12,13.82,40.35,29.3,128.52,0.97,1.0 232 | 54.5,6.82,47.0,47.68,111.79,-4.41,1.0 233 | 49.83,16.74,28.0,33.09,121.44,1.91,1.0 234 | 50.68,6.46,35.0,44.22,116.59,-0.21,1.0 235 | 74.98,14.92,53.73,60.05,105.65,1.59,1.0 236 | 48.17,9.59,39.71,38.58,135.62,5.36,1.0 237 | 63.62,16.93,49.35,46.68,117.09,-0.36,1.0 238 | 50.75,20.24,37.0,30.52,122.34,2.29,1.0 239 | 50.16,-2.97,42.0,53.13,131.8,-8.29,1.0 240 | 46.24,10.06,37.0,36.17,128.06,-5.1,1.0 241 | 69.0,13.29,55.57,55.71,126.61,10.83,1.0 242 | -------------------------------------------------------------------------------- /model_sinne/SiNNE.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.neighbors import NearestNeighbors 3 | from model_sinne import utils 4 | from tqdm import tqdm 5 | import pandas as pd 6 | 7 | 8 | class SiNNE: 9 | def __init__(self, 
max_level="full", width=10, ensemble_num=100, sample_num=8, pretrain=False, verbose=False): 10 | self.max_level = max_level 11 | self.width = width 12 | self.t = ensemble_num 13 | self.phi = sample_num 14 | 15 | self.pretrain = pretrain 16 | self.pretrain_nn_model_map = None 17 | self.verbose = verbose 18 | 19 | return 20 | 21 | def fit(self, x, y): 22 | dim = x.shape[1] 23 | norm_idx = np.where(y == 0)[0] 24 | anom_idx = np.where(y == 1)[0] 25 | anom_x = x[anom_idx] 26 | norm_x = x[norm_idx] 27 | 28 | if self.pretrain: 29 | self.pretrain_nn_model_map = self.pretrain_nn_models_pwset(x, y) 30 | nn_model_map = self.pretrain_nn_model_map 31 | else: 32 | nn_model_map = {} 33 | 34 | if self.max_level == "full": 35 | self.max_level = dim 36 | else: 37 | self.max_level = int(self.max_level) 38 | 39 | # step 1: get the scores of each 1-d subspace 40 | dim1_scores_lst = np.zeros([dim, len(anom_idx)]) 41 | if self.verbose: 42 | print("Running level 1: ") 43 | for i in tqdm(range(dim)): 44 | if self.pretrain: 45 | [ensmb_samples_lst, ensmb_radius_lst, _] = nn_model_map[str([i])] 46 | else: 47 | norm_x_subspace = norm_x[:, [i]] 48 | [ensmb_samples_lst, ensmb_radius_lst, _] = self.training_nn_models(norm_x_subspace) 49 | 50 | subspaced_queries = anom_x[:, [i]] 51 | # dim1_scores_lst[i] = scoring(subspaced_queries, ensmb_radius_lst, ensmb_nn_lst) 52 | for jj, q in enumerate(subspaced_queries): 53 | dim1_scores_lst[i, jj] = self.single_scoring(q, ensmb_samples_lst, ensmb_radius_lst) 54 | 55 | D = np.arange(dim) 56 | exp_subspace_lst = [] 57 | for i in tqdm(range(len(anom_idx))): 58 | query = anom_x[i] 59 | 60 | init_score = dim1_scores_lst[:, i] 61 | init_subspaces = [[i] for i in range(dim)] 62 | 63 | if dim <= 50: # low-dimensional data: keep all 1-d subspaces as candidates 64 | keep_score = init_score 65 | keep_subspaces = init_subspaces 66 | else: # otherwise keep only the top-width scored subspaces 67 | start = len(init_score) - self.width 68 | keep_score = np.sort(init_score)[start:] 69 | indices = np.argsort(init_score)[start:] 70 | keep_subspaces = [init_subspaces[dd] for dd in indices] 71 | 72 | for level in range(2, self.max_level): 73 | if self.verbose: 74 | print("--------------------- level: [{}] ----------------------".format(level)) 75 | 76 | # grow each subspace kept from the previous level by one feature; already-explored duplicates are skipped 77 | root_subspaces = [s for s in keep_subspaces if len(s) == level - 1] 78 | exploring_subspaces = [] 79 | for s in root_subspaces: 80 | other_features = np.setdiff1d(D, s) 81 | for f in other_features: 82 | this_subspace = list(np.sort(s + [f])) 83 | if this_subspace not in exploring_subspaces: 84 | exploring_subspaces.append(this_subspace) 85 | # print("add to exploring set") 86 | if self.verbose: 87 | print("exploring subspaces size: ", len(exploring_subspaces)) 88 | exploring_scores = np.zeros(len(exploring_subspaces)) 89 | if self.verbose: 90 | iterator = tqdm(range(len(exploring_subspaces))) 91 | else: 92 | iterator = range(len(exploring_subspaces)) 93 | 94 | for jj in iterator: 95 | s = exploring_subspaces[jj] 96 | if self.pretrain or str(s) in nn_model_map: 97 | [ensmb_samples_lst, ensmb_radius_lst, ensmb_nn_lst] = nn_model_map[str(s)] 98 | else: 99 | norm_x_subspace = norm_x[:, s] 100 | nn_model = self.training_nn_models(norm_x_subspace) 101 | nn_model_map[str(s)] = nn_model 102 | [ensmb_samples_lst, ensmb_radius_lst, ensmb_nn_lst] = nn_model 103 | 104 | # subspaced_query = [query[s]] 105 | # query_subspace_score = scoring(subspaced_query, ensmb_radius_lst, ensmb_nn_lst) 106 | 107 | # @NOTE: use a small bias to get a larger score for a shorter subspace, 108 | # so that the model tends to use shorter
subspaces as the explanation when multiple subspaces have the same score 109 | query_subspace_score = self.single_scoring(query[s], ensmb_samples_lst, ensmb_radius_lst) + \ 110 | (dim - len(s)) * 0.001 111 | exploring_scores[jj] = query_subspace_score 112 | 113 | 114 | scores = np.append(keep_score, exploring_scores) 115 | subspaces = keep_subspaces + exploring_subspaces 116 | 117 | if self.width > len(scores): 118 | start = 0 119 | else: 120 | start = len(scores) - self.width 121 | keep_score = np.sort(scores)[start:] 122 | indices = np.argsort(scores)[start:] 123 | keep_subspaces = [subspaces[dd] for dd in indices] 124 | 125 | if self.verbose: 126 | print("--------------------- level: [{}] ----------------------".format(level)) 127 | print(keep_score) 128 | print(keep_subspaces) 129 | exp_subspace = keep_subspaces[-1] 130 | exp_subspace_lst.append(exp_subspace) 131 | return exp_subspace_lst 132 | 133 | def training_nn_models(self, data): 134 | n_x = data.shape[0] 135 | ensmb_samples_lst = [] 136 | ensmb_radius_lst = [] 137 | ensmb_nn_lst = [] 138 | for i in range(self.t): 139 | samples = data[np.random.choice(np.arange(n_x), self.phi, replace=False)] 140 | ensmb_samples_lst.append(samples) 141 | 142 | # the nearest neighbor of each sample is the sample itself, so n_neighbors is set to 2 143 | samples_nn = NearestNeighbors(n_neighbors=2).fit(samples) 144 | ensmb_nn_lst.append(samples_nn) 145 | 146 | radius = np.zeros(self.phi) 147 | for ii, xx in enumerate(samples): 148 | # nbr_idx = nbrs_local.kneighbors([xx])[1].flatten()[1] 149 | radius[ii] = samples_nn.kneighbors([xx])[0].flatten()[1] 150 | ensmb_radius_lst.append(radius) 151 | 152 | nn_model = [ensmb_samples_lst, ensmb_radius_lst, ensmb_nn_lst] 153 | return nn_model 154 | 155 | def single_scoring(self, single_x, ensmb_samples_lst, ensmb_radius_lst): 156 | outlier_score = 0 157 | 158 | for i in range(self.t): 159 | radius = ensmb_radius_lst[i] 160 | samples = ensmb_samples_lst[i] 161 | 162 | is_outlier = 1 163 | for j in range(self.phi): 164 | sample = samples[j] 165 | threshold = radius[j] 166 | dist = np.sqrt(np.sum((sample - single_x)**2)) 167 | if dist <= threshold: 168 | is_outlier = 0 169 | break 170 | outlier_score += is_outlier 171 | 172 | outlier_score = outlier_score / self.t 173 | return outlier_score 174 | 175 | # @TODO bug: it is wrong to consider only the nearest sample in each model 176 | def scoring(self, test_x, ensmb_radius_lst, ensmb_nn_lst): 177 | outlier_scores = np.zeros(len(test_x)) 178 | t = len(ensmb_radius_lst) 179 | num_x = len(test_x) 180 | 181 | for i in range(t): 182 | radius = ensmb_radius_lst[i] 183 | nn = ensmb_nn_lst[i] 184 | 185 | # choose the nearest sample in model i 186 | nbr_idx = nn.kneighbors(test_x)[1][:, 0] 187 | dists = nn.kneighbors(test_x)[0][:, 0] 188 | thresholds = radius[nbr_idx] 189 | 190 | for j in range(num_x): 191 | dist = dists[j] 192 | threshold = thresholds[j] 193 | if dist <= threshold: 194 | outlier_scores[j] += 0 195 | else: 196 | outlier_scores[j] += 1 197 | 198 | outlier_scores = outlier_scores / t 199 | return outlier_scores 200 | 201 | def pretrain_nn_models_pwset(self, x, y): 202 | dim = x.shape[1] 203 | norm_idx = np.where(y == 0)[0] 204 | x_norm = x[norm_idx] 205 | 206 | full_set = np.arange(dim) 207 | pwset = utils.powerset(full_set) 208 | pwset.remove([]) 209 | pwset_nn_model_map = {} 210 | 211 | for subspace in tqdm(pwset): 212 | norm_x_subspace = x_norm[:, subspace] 213 | nn_model = self.training_nn_models(norm_x_subspace) 214 | pwset_nn_model_map[str(subspace)] = nn_model 215 | 
return pwset_nn_model_map 216 | 217 | def fit_od(self, x): 218 | [ensmb_samples_lst, ensmb_radius_lst, _] = self.training_nn_models(x) 219 | score_lst = [] 220 | for i in tqdm(range(len(x))): 221 | xx = x[i] 222 | score = self.single_scoring(xx, ensmb_samples_lst, ensmb_radius_lst) 223 | score_lst.append(score) 224 | return score_lst 225 | 226 | 227 | # if __name__ == '__main__': 228 | # root = 'E:/1-anomaly detection/10-AnoExp/' 229 | # path = root + "data/tabular/new_pca/cardio_pca.csv" 230 | # df = pd.read_csv(path) 231 | # x = df.values[:, :-1] 232 | # y = np.array(df.values[:, -1], dtype=int) 233 | # 234 | # model = SiNNE(max_level="full", width=10, ensemble_num=100, sample_num=8, pretrain=False) 235 | # # exp_subspace_lst = model.fit(x,y) 236 | # # precision, jaccard, score = evaluation_od.evaluation_od(exp_subspace_lst, x, y, 237 | # # "thyroid", model_name="iforest") 238 | # # metric_lst = [precision, jaccard, score] 239 | # # print(metric_lst) 240 | # 241 | # # norm_idx = np.where(y == 0)[0] 242 | # # norm_x = x[norm_idx] 243 | # # [ensmb_samples_lst, ensmb_radius_lst, ensmb_nn_lst] = model.training_nn_models(norm_x) 244 | # # score_lst = [] 245 | # # for i in tqdm(range(len(x))): 246 | # # xx = x[i] 247 | # # score = model.single_scoring(xx, ensmb_samples_lst, ensmb_radius_lst) 248 | # # score_lst.append(score) 249 | # # from sklearn import metrics 250 | # # print(metrics.roc_auc_score(y, score_lst)) -------------------------------------------------------------------------------- /model_coin/COIN.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import math 4 | import time 5 | import numpy as np 6 | from sklearn.neighbors import NearestNeighbors 7 | from sklearn.cluster import KMeans 8 | from sklearn import svm 9 | from model_coin.prediction_strength import optimalK 10 | from tqdm import tqdm 11 | 12 | 13 | class COIN(object): 14 | def __init__(self, data, inds_otlr, nbrs_ratio, 15 | AUG=1.0, MIN_CLUSTER_SIZE=5, MAX_NUM_CLUSTER=4, VAL_TIMES=10, C_SVM=1., 16 | RESOLUTION=0.05, THRE_PS=0.85, DEFK=0): 17 | """ 18 | data: Data matrix, each row represents one instance 19 | inds_otlr: A vector with each entry telling whether this instance is outlier (1) or not (0) 20 | nbrs_ratio: The ratio of normal instances as the context for each outlier 21 | AUG: An additional feature attached to the input as data augmentation 22 | MIN_CLUSTER_SIZE: Minimum number of nodes in each cluster 23 | MAX_NUM_CLUSTER: Maximum number of clusters considered in prediction strength computation 24 | VAL_TIMES: Number of iterations for computing prediction strength 25 | C_SVM: A hyperparameter in SVM (optimum value would be better to be estimated through validation) 26 | DEFK: Predefined number of clusters in each context. Value 0 means using Prediction Strength to estimate it. 
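        Example (an illustrative usage sketch only; `X` is the data matrix, `y` is the 0/1 outlier indicator
        vector, and the values below are arbitrary placeholders rather than recommended settings):
            model = COIN(X, y, nbrs_ratio=0.1, DEFK=0)
            fea_weight_lst = model.fit(sgnf_prior=1)  # one attribute-importance vector per outlier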
27 | """ 28 | self.data = data 29 | self.dim = data.shape[1] 30 | 31 | self.inds_otlr = inds_otlr 32 | self.ano_idx = np.where(inds_otlr == 1)[0] 33 | 34 | self.AUG = float(AUG) 35 | 36 | self.num_inst = data.shape[0] 37 | self.num_feat = data.shape[1] 38 | self.num_nbrs = int(nbrs_ratio * self.num_inst) 39 | 40 | self.MIN_CLUSTER_SIZE = MIN_CLUSTER_SIZE 41 | self.MAX_NUM_CLUSTER = MAX_NUM_CLUSTER 42 | self.VAL_TIMES = VAL_TIMES 43 | self.C_SVM = C_SVM 44 | self.RESOLUTION = RESOLUTION 45 | self.THRE_PS = THRE_PS 46 | self.DEFK = DEFK 47 | 48 | # normal instances 49 | self.data_normal = self.data[np.where(self.inds_otlr == 0)[0]] 50 | 51 | # nearest nbrs object based on normal instances 52 | self.nbrs = NearestNeighbors(n_neighbors=self.num_nbrs, n_jobs=-1) 53 | self.nbrs.fit(self.data_normal) 54 | 55 | def interpret_outliers(self, ids_target, sgnf_vec, int_flag=0): 56 | """ 57 | ids_target: Indices of target outliers 58 | sgnf_vec: A vector indicating the importance of each attribute, as prior knowledge 59 | int_flag: Whether the attributes are discrete (synthetic samples are rounded to integers if so) 60 | :return: A 2D-array of attribute importance (one row per target outlier), and a dict mapping outlier ID to outlierness score 61 | """ 62 | 63 | # append a significance of 0 for the augmented feature 64 | if isinstance(sgnf_vec, int) or isinstance(sgnf_vec, float): 65 | sgnf_vec = np.hstack((np.ones(self.num_feat), 0)) 66 | else: 67 | sgnf_vec = np.hstack((sgnf_vec, [0])) 68 | 69 | # Interpret each target outlier 70 | oid_devt_dict = dict() # outlier_id -> outlierness score 71 | score_attr_mat = [] 72 | 73 | for ii in tqdm(range(len(ids_target))): 74 | i = ids_target[ii] 75 | 76 | # Do clustering on the context, build one classifier for each cluster 77 | nums_c, clfs, cluster_attr_scale = self.cluster_context(i, int_flag) 78 | 79 | # Calculate outlierness score 80 | devt_i = self.CalculateOutlierness(i, clfs, nums_c, sgnf_vec) 81 | oid_devt_dict[i] = devt_i 82 | 83 | # Find outlying attributes 84 | score_attr = np.zeros(self.num_feat) 85 | for num_c, clf in zip(nums_c, clfs): 86 | score_attr += num_c * np.abs(clf.coef_[0]) # weighted by the normal cluster size 87 | score_attr /= float(np.sum(nums_c)) 88 | score_attr /= np.sum(score_attr) # relative importance 89 | score_attr_mat.append(copy.copy(score_attr)) 90 | # print(score_attr) 91 | 92 | return np.array(score_attr_mat), oid_devt_dict 93 | 94 | def cluster_context(self, id_outlier, int_flag): 95 | # find the context of the outlier 96 | dist_btwn, otlr_nbrs = self.nbrs.kneighbors([self.data[id_outlier]]) 97 | dist_btwn, otlr_nbrs = dist_btwn[0], self.data_normal[otlr_nbrs[0], :] 98 | # print(self.data[id_outlier]) 99 | # print(otlr_nbrs) 100 | 101 | # choose the number of clusters in the context 102 | if self.DEFK == 0: 103 | k_best = optimalK(otlr_nbrs, self.VAL_TIMES, self.MAX_NUM_CLUSTER, self.THRE_PS) 104 | else: 105 | k_best = self.DEFK 106 | k_best = min(k_best+1, self.MAX_NUM_CLUSTER) # empirically, it is better to have a larger K 107 | # print('Best k:', k_best) 108 | 109 | # cluster the context 110 | kmeans = KMeans(n_clusters=k_best, random_state=0).fit(otlr_nbrs) 111 | label_nbrs = kmeans.labels_ 112 | 113 | clfs = [] 114 | nbrs_mean = [] 115 | nums_c = [] 116 | cluster_attr_scale = [] 117 | 118 | # build a linear classifier for each cluster of nbrs 119 | for c in range(k_best): 120 | # indices for instances in cluster c 121 | inds_c = np.where(label_nbrs == c)[0] 122 | 123 | # the cluster cannot be too small 124 | if np.size(inds_c) < self.MIN_CLUSTER_SIZE: 125 | continue 126 | nums_c.append(len(inds_c)) 127 | 128 | # instances for
cluster c 129 | otlr_nbrs_c = otlr_nbrs[inds_c, :] 130 | dist_btwn_c = dist_btwn[inds_c] 131 | 132 | # distance property of cluster c 133 | cluster_attr_scale.append(np.hstack((np.max(otlr_nbrs_c, axis=0) - np.min(otlr_nbrs_c, axis=0), 0))) # scale for each attr 134 | 135 | # synthetic sampling to build two classes 136 | insts_c0 = self.SyntheticSampling(otlr_nbrs_c, self.data[id_outlier], int_flag) 137 | insts_c1 = otlr_nbrs_c 138 | 139 | clf = self.SVCInterpreter(insts_c0, insts_c1) 140 | clfs.append(clf) 141 | nbrs_mean.append(np.average(insts_c1, axis=0)) 142 | 143 | return nums_c, clfs, cluster_attr_scale 144 | 145 | def SyntheticSampling(self, insts, otlr, int_flag): 146 | ''' 147 | Expand the outlier into a class. 148 | 149 | insts: normal instances 150 | otlr: the outlier instance 151 | int_flag: whether to round the synthetic samples to int 152 | :return: the outlier class, i.e., the outlier plus synthetic points sampled around it 153 | 154 | ''' 155 | 156 | num_c0_new = insts.shape[0] - 1 157 | coeff_c0_new = np.random.rand(num_c0_new, insts.shape[0]) # transformation matrix for synthetic sampling 158 | nbrs_local = NearestNeighbors(n_neighbors=1).fit(insts) 159 | min_dist_to_nbr = nbrs_local.kneighbors([otlr])[0][0, 0]/insts.shape[1] 160 | 161 | for r in range(coeff_c0_new.shape[0]): 162 | coeff_c0_new[r, :] /= sum(coeff_c0_new[r, :]) 163 | insts_c0_new = np.dot(coeff_c0_new, insts - np.dot(np.ones((insts.shape[0], 1)), [otlr])) 164 | for r in range(insts_c0_new.shape[0]): # shrink to prevent overlap 165 | insts_c0_new[r, :] *= (0.2 * np.random.rand(1)[0] * min_dist_to_nbr) 166 | insts_c0_new += np.dot(np.ones((num_c0_new, 1)), [otlr]) # origin + shift 167 | if int_flag: 168 | insts_c0_new = np.round(insts_c0_new) 169 | insts_c0 = np.vstack((otlr, insts_c0_new)) 170 | 171 | return insts_c0 172 | 173 | def SVCInterpreter(self, insts_c0, insts_c1): 174 | # classification between normal instances and outliers, where outliers have negative output 175 | 176 | clf = svm.LinearSVC(penalty='l1', C=self.C_SVM, dual=False, intercept_scaling=self.AUG) 177 | X_c = np.vstack((insts_c0, insts_c1)) 178 | y_c = np.hstack((np.zeros(insts_c0.shape[0]), np.ones(insts_c1.shape[0]))) 179 | clf.fit(X_c, y_c) 180 | #print(insts_c1) 181 | #print(insts_c0) 182 | 183 | return clf 184 | 185 | def CalculateOutlierness(self, id_outlier, clfs, nums_c, sgnf_vec): 186 | otlr = self.data[id_outlier] 187 | 188 | devt_overall = 0. 189 | for c in range(len(nums_c)): 190 | # distance to the boundary 191 | otlr_aug = np.hstack((otlr, self.AUG)) 192 | w = np.hstack((clfs[c].coef_[0], clfs[c].intercept_[0]/self.AUG)) 193 | w_a = np.hstack((clfs[c].coef_[0], 0)) 194 | dist = -min(0, np.inner(otlr_aug, w))/np.linalg.norm(w_a) 195 | 196 | # rescale deviation according to attributes' importance 197 | devt = np.linalg.norm(np.multiply(dist * w_a / np.linalg.norm(w_a), sgnf_vec)) 198 | if np.isnan(devt): 199 | devt = 0.
200 | 201 | # weighted by the opponent cluster size 202 | devt_overall += devt * nums_c[c] 203 | 204 | devt_overall /= sum(nums_c) 205 | 206 | return devt_overall 207 | 208 | def fit(self, sgnf_prior): 209 | importance_attr, outlierness = self.interpret_outliers(self.ano_idx, sgnf_prior) 210 | return importance_attr 211 | 212 | def weight2subspace(self, weight, r=0.7, num=-1): 213 | threshold = r * np.sum(weight) 214 | tmp_s = 0 215 | exp_subspace = [] 216 | sorted_idx1 = np.argsort(weight) 217 | sorted_idx = [sorted_idx1[self.dim - i -1] for i in range(self.dim)] 218 | if num != -1: 219 | exp_subspace = sorted_idx[:num] 220 | exp_subspace = list(np.sort(exp_subspace)) 221 | return exp_subspace 222 | 223 | for idx in sorted_idx: 224 | tmp_s += weight[idx] 225 | exp_subspace.append(idx) 226 | if tmp_s >= threshold: 227 | break 228 | exp_subspace = list(np.sort(exp_subspace)) 229 | return exp_subspace 230 | 231 | def weight2subspace_pn(self, weight): 232 | exp_subspace = [] 233 | for i in range(len(weight)): 234 | # exp_subspace.append(list(np.where(weight[i] > 0)[0])) 235 | if weight[i] > 0: 236 | exp_subspace.append(i) 237 | if len(exp_subspace) == 0: 238 | exp_subspace = np.arange(len(weight)) 239 | exp_subspace = list(np.sort(exp_subspace)) 240 | return exp_subspace 241 | 242 | def get_exp_subspace(self, fea_weight_lst, w2s_ratio, real_exp_len=None): 243 | exp_subspace_lst = [] 244 | for ii, idx in enumerate(self.ano_idx): 245 | fea_weight = fea_weight_lst[ii] 246 | if w2s_ratio == "real_len": 247 | exp_subspace_lst.append(self.weight2subspace(fea_weight, num=real_exp_len[ii])) 248 | elif w2s_ratio == "auto": 249 | r = math.sqrt(2 / self.dim) 250 | exp_subspace_lst.append(self.weight2subspace(fea_weight, r=r)) 251 | elif w2s_ratio == "pn": 252 | exp_subspace_lst.append(self.weight2subspace_pn(fea_weight)) 253 | else: 254 | exp_subspace_lst.append(self.weight2subspace(fea_weight, r=w2s_ratio)) 255 | return exp_subspace_lst -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------