├── requirements.txt
├── Mateen.py
├── MateenUtils
│   ├── AE.py
│   ├── data_processing.py
│   ├── merge_utils.py
│   ├── utils.py
│   ├── selection_utils.py
│   └── main.py
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
torch==2.0.1
numpy==1.25.0
pandas==1.5.3
scipy==1.10.1
scikit-learn==1.2.2
tqdm==4.65.0
--------------------------------------------------------------------------------

/Mateen.py:
--------------------------------------------------------------------------------
import argparse
import pandas as pd
import numpy as np
import sys
sys.path.append('MateenUtils/')

import data_processing as dp
import utils
import main as Mateen_main

parser = argparse.ArgumentParser()

parser.add_argument('--dataset_name', type=str, default="IDS2017", choices=["IDS2017", "IDS2018", "Kitsune", "mKitsune", "rKitsune"])

parser.add_argument('--window_size', type=int, default=50000, choices=[10000, 50000, 100000])

parser.add_argument('--performance_thres', type=float, default=0.99, choices=[0.99, 0.95, 0.90, 0.85, 0.8])

parser.add_argument('--max_ensemble_length', type=int, default=3, choices=[3, 5, 7])

parser.add_argument('--selection_budget', type=float, default=0.01, choices=[0.005, 0.01, 0.05, 0.1])

parser.add_argument('--mini_batch_size', type=int, default=1000, choices=[500, 1000, 1500])

parser.add_argument('--retention_rate', type=float, default=0.3, choices=[0.3, 0.5, 0.9])

parser.add_argument('--lambda_0', type=float, default=0.1, choices=[0.1, 0.5, 1.0])

parser.add_argument('--shift_threshold', type=float, default=0.05, choices=[0.05, 0.1, 0.2])

args = parser.parse_args()


def main(args):
    x_train, x_test, y_train, y_test = dp.prepare_data(scenario=args.dataset_name)
    x_slice, y_slice = dp.partition_array(x_data=x_test, y_data=y_test, slice_size=args.window_size)
    predictions, probs_list = Mateen_main.adaptive_ensemble(x_train, y_train, x_slice, y_slice, args)
    _ = utils.getResult(y_test, predictions)
    auc_rocs = utils.auc_roc_in_chunks(y_test, probs_list, chunk_size=args.window_size)
    print(f' Average AUC-ROC: {np.mean(auc_rocs)}, STD: {np.std(auc_rocs)}')
    df = pd.DataFrame({'Probabilities': probs_list, 'Predictions': predictions})
    df.to_csv(f'Results/{args.dataset_name}-{args.selection_budget}.csv', index=False)
    return


main(args)
--------------------------------------------------------------------------------
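
Example invocation (not part of the repository): this sketch assumes the datasets are laid out under Datasets/ as expected by MateenUtils/data_processing.py, that a pretrained model exists at Models/<dataset_name>.pth (loaded in MateenUtils/main.py), and that a Results/ directory exists for the output CSV.

    python Mateen.py --dataset_name IDS2017 --window_size 50000 --selection_budget 0.01 --performance_thres 0.99
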
/MateenUtils/AE.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim as optim
import random
from tqdm import tqdm


seed = 0
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
random.seed(0)

device = "cuda" if torch.cuda.is_available() else "cpu"


class autoencoder(nn.Module):
    def __init__(self, feature_size):
        super(autoencoder, self).__init__()
        self.encoder = nn.Sequential(nn.Linear(feature_size, int(feature_size*0.75)),
                                     nn.ReLU(True),
                                     nn.Linear(int(feature_size*0.75), int(feature_size*0.5)),
                                     nn.ReLU(True),
                                     nn.Linear(int(feature_size*0.5), int(feature_size*0.25)),
                                     nn.ReLU(True),
                                     nn.Linear(int(feature_size*0.25), int(feature_size*0.1)))

        self.decoder = nn.Sequential(nn.Linear(int(feature_size*0.1), int(feature_size*0.25)),
                                     nn.ReLU(True),
                                     nn.Linear(int(feature_size*0.25), int(feature_size*0.5)),
                                     nn.ReLU(True),
                                     nn.Linear(int(feature_size*0.5), int(feature_size*0.75)),
                                     nn.ReLU(True),
                                     nn.Linear(int(feature_size*0.75), int(feature_size)),
                                     )

    def forward(self, x):
        encode = self.encoder(x)
        decode = self.decoder(encode)
        return decode


def train_autoencoder(model, train_loader, num_epochs=100, learning_rate=0.0001):
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    for epoch in tqdm(range(num_epochs)):
        model.train()
        for batch_data in train_loader:
            inputs = batch_data[0].to(device).float()
            targets = batch_data[0].to(device).float()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
    model.eval()
    return model
--------------------------------------------------------------------------------
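
For reference, a minimal sketch of training this autoencoder in isolation on synthetic data. The 78-dimensional feature size, sample count and batch size are arbitrary assumptions, and the snippet is assumed to run from inside MateenUtils/ so that AE.py is importable (Mateen.py instead appends MateenUtils/ to sys.path).

    import numpy as np
    import torch
    from torch.utils.data import DataLoader, TensorDataset
    import AE

    # 1,000 synthetic samples with 78 features (placeholder values for illustration)
    x_benign = np.random.rand(1000, 78).astype(np.float32)
    loader = DataLoader(TensorDataset(torch.tensor(x_benign)), batch_size=256, shuffle=True)

    model = AE.autoencoder(feature_size=78)
    model = AE.train_autoencoder(model, loader, num_epochs=5, learning_rate=0.0001)
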
/MateenUtils/data_processing.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
import torch


def load_dataset(file_path, file_type=None):
    if file_type == 'parquet':
        return pd.read_parquet(file_path)
    else:
        return pd.read_csv(file_path)

def process_data(data):
    data = pd.DataFrame(data)
    return np.nan_to_num(data.astype(float))

def prepare_data(scenario="IDS2017"):
    if scenario == "IDS2017":
        data_2017_path = "Datasets/CICIDS2017/clean_data.csv"
        data_2017 = load_dataset(data_2017_path)
        train = data_2017[:693702]
        test = data_2017[693702:]
    elif scenario == "IDS2018":
        train_path = "Datasets/IDS2018/TrainData.csv"
        test_path = "Datasets/IDS2018/NewTestData.csv"
        train = load_dataset(train_path)
        test = load_dataset(test_path)
    elif scenario == "Kitsune":
        train_path = "Datasets/Kitsune/TrainData.csv"
        test_path = "Datasets/Kitsune/TestData.csv"
        train = load_dataset(train_path)
        print(f'Train Loaded with {len(train)} Samples')
        test = load_dataset(test_path)
        print(f'Test Loaded with {len(test)} Samples')
    elif scenario == "mKitsune":
        train_path = "Datasets/Kitsune/TrainData.csv"
        test_path = "Datasets/Kitsune/NewTestData.csv"
        train = load_dataset(train_path)
        print(f'Train Loaded with {len(train)} Samples')
        test = load_dataset(test_path)
        print(f'Test Loaded with {len(test)} Samples')
    elif scenario == "rKitsune":
        train_path = "Datasets/Kitsune/TrainData.csv"
        test_path = "Datasets/Kitsune/Recurring.csv"
        train = load_dataset(train_path)
        print(f'Train Loaded with {len(train)} Samples')
        test = load_dataset(test_path)
        print(f'Test Loaded with {len(test)} Samples')
    else:
        raise ValueError("Invalid scenario name.")

    print(f'Scenario {scenario} with: {len(train)} training samples and {len(test)} testing samples')
    y_train, x_train = train["Label"], process_data(train.drop('Label', axis=1))
    y_test, x_test = test["Label"], process_data(test.drop('Label', axis=1))
    scaler = MinMaxScaler().fit(x_train)
    x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

    return np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test)


def partition_array(x_data=None, y_data=None, slice_size=50000):
    num_samples = x_data.shape[0]
    # ceiling division avoids an empty trailing slice when num_samples is an exact multiple of slice_size
    num_slices = (num_samples + slice_size - 1) // slice_size

    x_slices = []
    y_slices = []
    for i in range(num_slices):
        start = i * slice_size
        end = min((i + 1) * slice_size, num_samples)
        x_slices.append(x_data[start:end])
        y_slices.append(y_data[start:end])
    print(f' Test data has been divided into slices of size {slice_size} and length of {len(x_slices)}')
    return x_slices, y_slices


def loading_datasets(benign_train):
    train_dataset = TensorDataset(torch.tensor(benign_train))
    train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
    return train_loader, benign_train

def prepare_datasets(x_train, y_train):
    indexes_ben_train = np.where(y_train == 0)[0]
    benign_train = x_train[indexes_ben_train]
    train_loader, benign_train = loading_datasets(benign_train)
    return train_loader, benign_train

def prepare_new_train_valid_data(x_train, new_set):
    if new_set is None:
        return x_train
    else:
        x_new_train = np.concatenate((x_train, new_set), axis=0)
        return x_new_train
--------------------------------------------------------------------------------
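
A small usage sketch for the slicing and loader helpers above, on synthetic arrays; the sample count, feature count and label balance are arbitrary placeholders, and the snippet assumes it runs from inside MateenUtils/.

    import numpy as np
    import data_processing as dp

    # synthetic stream: 120,000 samples, 78 features, roughly 10% labelled malicious
    x = np.random.rand(120000, 78)
    y = (np.random.rand(120000) < 0.1).astype(int)

    x_slices, y_slices = dp.partition_array(x_data=x, y_data=y, slice_size=50000)  # 3 slices: 50k, 50k, 20k
    train_loader, benign_only = dp.prepare_datasets(x, y)  # DataLoader over the label-0 rows only
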
/MateenUtils/merge_utils.py:
--------------------------------------------------------------------------------
import AE as model_base
from sklearn.metrics import f1_score
import numpy as np
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch
import random
import utils
import copy

seed = 0
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
random.seed(0)

device = "cuda" if torch.cuda.is_available() else "cpu"


def get_best_models(mode, models, thresholds, data, y_true):
    f1_list = []
    for i in range(len(models)):
        y_pred, probs = utils.preds_and_probs(models[i], thresholds[i], data)
        f1 = f1_score(y_true, y_pred, average='micro')
        f1_list.append(f1)
    index_of_max = f1_list.index(max(f1_list))
    if mode == "selection":
        return models[index_of_max], thresholds[index_of_max], index_of_max, max(f1_list), f1_list
    elif mode == "merge":
        return f1_list

def get_values_within_margin(lst, margin=0.10):
    max_value = max(lst)
    threshold = max_value * (1 - margin)
    indices = [i for i, value in enumerate(lst) if value >= threshold]
    return indices

def compute_val_f1(model, data, y_true, previous_data):
    thres = utils.threshold_calculation(model, previous_data)
    predictions, _ = utils.preds_and_probs(model, thres, data)
    f1 = f1_score(y_true, predictions, average='micro')
    return f1

def merge_layer_weights(layer1, layer2, alpha):
    layer2.weight.data = alpha * layer1.weight.data + (1 - alpha) * layer2.weight.data
    layer2.bias.data = alpha * layer1.bias.data + (1 - alpha) * layer2.bias.data


def print_model_layer_names(model):
    for name, _ in model.named_parameters():
        print(name)

def merge_models(model1: nn.Module, model2: nn.Module, data, y_true, previous_data):
    input_shape = data.shape[1]
    empty_model = model_base.autoencoder(input_shape)
    empty_model.to(device)
    best_alpha = 0.5
    best_f1 = -float('inf')
    original_state_dict = model2.state_dict()
    for alpha_value in [i * 0.01 for i in range(101)]:
        alpha = torch.tensor(alpha_value).to(device)
        temp_model = copy.deepcopy(empty_model)
        temp_model.load_state_dict(original_state_dict)
        for (seq_name1, seq1), (seq_name2, seq2) in zip(model1._modules.items(), temp_model._modules.items()):
            for (layer_name1, layer1), (layer_name2, layer2) in zip(seq1._modules.items(), seq2._modules.items()):
                if isinstance(layer1, nn.Linear) and isinstance(layer2, nn.Linear):
                    merge_layer_weights(layer1, layer2, alpha)

        f1 = compute_val_f1(temp_model, data, y_true, previous_data)
        if f1 > best_f1:
            best_f1 = f1
            best_alpha = alpha_value
        print(f' Alpha {alpha_value} -- F1: {f1}')

    print(f'Best Alpha is {best_alpha}')
    final_model = copy.deepcopy(empty_model)
    final_model.load_state_dict(original_state_dict)
    for (seq_name1, seq1), (seq_name2, seq2) in zip(model1._modules.items(), final_model._modules.items()):
        for (layer_name1, layer1), (layer_name2, layer2) in zip(seq1._modules.items(), seq2._modules.items()):
            if isinstance(layer1, nn.Linear) and isinstance(layer2, nn.Linear):
                merge_layer_weights(layer1, layer2, torch.tensor(best_alpha).to(device))

    return final_model

def merge_tmp_models(models, thresholds, data, y_true, previous_data):
    f1_scores = get_best_models("merge", models, thresholds, data, y_true)
    candidates_idx = get_values_within_margin(f1_scores, margin=0.10)
    if len(candidates_idx) == 1:
        return models[candidates_idx[0]]
    merged_model = models[candidates_idx[0]]
    for idx in candidates_idx[1:]:
        merged_model = merge_models(merged_model, models[idx], data, y_true, previous_data)

    print(f'Models {candidates_idx} Have Been Merged')
    return merged_model
--------------------------------------------------------------------------------
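
To make the interpolation direction in merge_layer_weights explicit, a toy sketch with two small linear layers (the layer sizes and alpha value are arbitrary, and the snippet assumes it runs from inside MateenUtils/); layer2 is overwritten in place with alpha * layer1 + (1 - alpha) * layer2.

    import torch
    import torch.nn as nn
    from merge_utils import merge_layer_weights

    layer_a = nn.Linear(4, 2)
    layer_b = nn.Linear(4, 2)

    # overwrite layer_b with 0.25 * layer_a + 0.75 * layer_b
    merge_layer_weights(layer_a, layer_b, torch.tensor(0.25))
    print(layer_b.weight.data)
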
/MateenUtils/utils.py:
--------------------------------------------------------------------------------
import torch
import numpy as np
import random
import torch.nn as nn
import torch.backends.cudnn as cudnn
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score
from collections import Counter


seed = 0
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
random.seed(0)

device = "cuda" if torch.cuda.is_available() else "cpu"


getMSEvec = nn.MSELoss(reduction='none')

def se2rmse(a):
    return torch.sqrt(sum(a.t())/a.shape[1])

def threshold_calculation(model, x_data):
    model.eval()
    output = model((torch.tensor(x_data).float()).to(device))
    # cast the target to float32 as well so it matches the model output dtype
    mse_vec = getMSEvec(output, torch.tensor(x_data).float().to(device))
    rmse_vec = se2rmse(mse_vec).cpu().data.numpy()
    rmse_vec.sort()
    pctg = 0.95
    thres = rmse_vec[int(len(rmse_vec)*pctg)]
    return thres

def preds_and_probs(model, threshold, X_test):
    X_test_tensor = torch.from_numpy(X_test).type(torch.float).to(device)
    model.eval()
    output = model(X_test_tensor)
    mse_vec = getMSEvec(output, X_test_tensor)
    rmse_vec = se2rmse(mse_vec).cpu().data.numpy()
    y_pred = np.asarray([0] * len(rmse_vec))
    idx_mal = np.where(rmse_vec > threshold)
    y_pred[idx_mal] = 1
    return y_pred, rmse_vec


def get_features_error(model, X_test):
    X_test_tensor = torch.from_numpy(X_test).type(torch.float).to(device)
    model.eval()
    output = model(X_test_tensor).cpu().data.numpy()
    errors = np.abs(X_test - output)
    return errors


def getResult(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    positive_label = 0
    print("Predicted Labels", Counter(y_pred))
    print("True Labels", Counter(y_true))
    if positive_label == 0:
        print('Positive label: 0')
        tp, fn, fp, tn = cm.ravel()
    else:
        print('Positive label: 1')
        tn, fp, fn, tp = cm.ravel()
    attacks = tp + fn
    normals = fp + tn
    accuracy = ((tp + tn) / (attacks + normals)) * 100
    precision = (tp / (tp + fp)) * 100
    recall = (tp / (tp + fn)) * 100
    f1 = (2 * (((precision / 100) * (recall / 100)) / ((precision / 100) + (recall / 100)))) * 100
    tnr = (tn / (tn + fp)) * 100
    macro_recall = recall_score(y_true, y_pred, average='macro') * 100
    macro_precision = precision_score(y_true, y_pred, average='macro') * 100
    macro_f1 = f1_score(y_true, y_pred, average='macro') * 100
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred) * 100
    tpr = (tp / (tp + fn)) * 100

    print("General Accuracy: {:.4f}".format(accuracy))
    print("Recall: {:.4f}".format(recall))
    print("Precision: {:.4f}".format(precision))
    print("F1 Score: {:.4f}".format(f1))
    print("True Negative Rate: {:.4f}".format(tnr))
    print(f"True Positive Rate: {tpr:.2f}%")
    print("Macro Recall: {:.4f}".format(macro_recall))
    print("Macro Precision: {:.4f}".format(macro_precision))
    print("Macro F1 Score: {:.4f}".format(macro_f1))
    print("Balanced Accuracy: {:.4f}".format(balanced_accuracy))
    return accuracy, recall, precision, f1, tnr, macro_recall, macro_precision, macro_f1, balanced_accuracy


def auc_roc_in_chunks(y_test, probs_list, chunk_size=50000):
    num_chunks = len(y_test) // chunk_size + (1 if len(y_test) % chunk_size != 0 else 0)
    auc_roc_scores = []
    for i in range(num_chunks):
        start = i * chunk_size
        end = start + chunk_size
        y_test_chunk = y_test[start:end]
        probs_list_chunk = probs_list[start:end]
        if set(y_test_chunk) == {0, 1}:
            auc_roc = roc_auc_score(y_test_chunk, probs_list_chunk)
            auc_roc_scores.append(auc_roc)
    return auc_roc_scores
--------------------------------------------------------------------------------
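
A short sketch of the detection path defined above: fit a threshold on benign training data (the 95th percentile of reconstruction RMSE), then flag test samples whose RMSE exceeds it. The random data, feature size and untrained model are placeholders, and the snippet assumes it runs from inside MateenUtils/.

    import numpy as np
    import AE
    import utils

    x_benign = np.random.rand(2000, 78).astype(np.float32)
    x_test = np.random.rand(500, 78).astype(np.float32)

    model = AE.autoencoder(78).to(utils.device)
    thres = utils.threshold_calculation(model, x_benign)
    y_pred, rmse = utils.preds_and_probs(model, thres, x_test)  # y_pred[i] == 1 when rmse[i] > thres
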
/MateenUtils/selection_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
import torch.backends.cudnn as cudnn
from scipy.spatial.distance import pdist, squareform
import random
import utils
from tqdm import tqdm

seed = 0
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
random.seed(0)
device = "cuda" if torch.cuda.is_available() else "cpu"


def get_unique(reps, data_idx, min_distance):
    distances = squareform(pdist(reps))
    num_samples = distances.shape[0]
    valid = np.ones(num_samples, dtype=bool)
    counts = np.zeros(num_samples, dtype=int)

    for i in range(num_samples):
        if valid[i]:
            neighbors = distances[i, :] < min_distance
            counts[i] = np.sum(neighbors) - 1
            valid[neighbors] = False
            valid[i] = True
    filtered_reps = reps[valid]
    filtered_data_idx = np.array(data_idx)[valid]
    counts = counts[valid].tolist()
    return filtered_reps, filtered_data_idx, counts

def get_informative(model, data, data_idx, budget, retention_rate, initial_min_distance=0.1):
    data_idx = np.array(data_idx)
    data = torch.from_numpy(data).float().to(device)
    errs_vector = utils.getMSEvec(model(data), data).cpu().data.numpy()
    reps = model(data).cpu().data.numpy()
    reps = np.hstack((reps, errs_vector))
    target_count = int(retention_rate * len(data))
    lower_bound = 0
    upper_bound = 1
    tolerance = 0.0000001
    max_iterations = 50
    iteration = 0

    while (upper_bound - lower_bound) > tolerance and iteration < max_iterations:
        iteration += 1
        mid_point = (upper_bound + lower_bound) / 2
        filtered_reps, filtered_data_idx, similar_samples = get_unique(reps, data_idx, mid_point)
        if len(filtered_data_idx) < target_count:
            upper_bound = mid_point
        else:
            lower_bound = mid_point
    similar_samples = min_max_scaling(similar_samples)
    return filtered_data_idx, similar_samples

def min_max_scaling(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

def get_rep(model, data, idx, budget, similar_rates, lambda_0, lambda_1=1.0):
    data = torch.from_numpy(data).float().to(device)
    reps = utils.getMSEvec(model(data), data).cpu().data.numpy()
    distances = squareform(pdist(reps))
    np.fill_diagonal(distances, 0)
    distance_sums = distances.sum(axis=1)
    distance_sums = min_max_scaling(distance_sums)
    final_score = (lambda_0 * distance_sums) + (lambda_1 * similar_rates)
    sorted_indices = np.argsort(-final_score)
    selected_data_idx_in_sorted = sorted_indices[:budget]
    selected_original_idx = idx[selected_data_idx_in_sorted]
    selected_data = data[selected_data_idx_in_sorted]
    return selected_original_idx

def data_to_bins(model, data, batch_size=1000):
    data = torch.from_numpy(data).float().to(device)
    recon_errs = utils.se2rmse(utils.getMSEvec(model(data), data)).cpu().data.numpy()
    indices = np.arange(len(recon_errs))
    sorted_indices = indices[np.argsort(recon_errs)[::-1]]
    num_batches = len(sorted_indices) // batch_size
    batches_indices = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = start_idx + batch_size
        batches_indices.append(sorted_indices[start_idx:end_idx])
    return batches_indices


def mateen_selector(model, data, labels, args):
    temp_idx = []
    batch_size = args.mini_batch_size
    if len(labels) > batch_size:
        batches_indices = data_to_bins(model, data, batch_size=batch_size)
    else:
        batches_indices = [np.arange(len(data))]

    for batch in batches_indices:
        label_budget = int(args.selection_budget * len(batch))
        informative_idx, similar_rates = get_informative(model, data[batch], batch, args.selection_budget, args.retention_rate)
        informative_idx = np.array(informative_idx)
        if len(informative_idx) > label_budget:
            selected_idx = get_rep(model, data[informative_idx], informative_idx, label_budget, similar_rates, args.lambda_0)
            temp_idx.extend(selected_idx)
        else:
            temp_idx.extend(informative_idx)
    temp_idx = np.array(temp_idx)
    selected_idx = [idx for idx in temp_idx if labels[idx] == 0]

    if len(selected_idx) == 0:
        return None, temp_idx, labels[temp_idx]
    return data[selected_idx], temp_idx, labels[temp_idx]
--------------------------------------------------------------------------------
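
mateen_selector only reads mini_batch_size, selection_budget, retention_rate and lambda_0 from args, so it can be exercised on its own with an argparse.Namespace carrying the defaults from Mateen.py. The model, data and labels below are synthetic placeholders, and the snippet assumes it runs from inside MateenUtils/.

    import numpy as np
    from argparse import Namespace
    import AE
    import selection_utils as selection

    args = Namespace(mini_batch_size=1000, selection_budget=0.01, retention_rate=0.3, lambda_0=0.1)

    x = np.random.rand(3000, 78).astype(np.float32)
    y = (np.random.rand(3000) < 0.1).astype(int)
    model = AE.autoencoder(78).to(selection.device)

    # returns the selected benign samples, the indices sent for labelling, and their labels
    x_selected, selected_idx, selected_labels = selection.mateen_selector(model, x, y, args)
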
/MateenUtils/main.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.metrics import f1_score
from collections import Counter
from datetime import datetime
import copy
import torch
import torch.nn as nn
import pandas as pd
import random
import torch.backends.cudnn as cudnn
import AE as model_base
import data_processing as dp
import utils
import merge_utils as merge
import selection_utils as selection
from scipy.stats import ks_2samp


seed = 0
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
random.seed(0)

device = "cuda" if torch.cuda.is_available() else "cpu"


def model_update(x_train, y_train=None, num_epochs=100, model=None):
    input_shape = x_train.shape[1]
    train_loader, _ = dp.loading_datasets(x_train)
    model = model_base.train_autoencoder(model, train_loader, num_epochs=num_epochs, learning_rate=0.0001)
    return model


def load_model(load_mode, input_shape, scenario, train_loader, data, num_epochs):
    if load_mode == "new":
        model = model_base.autoencoder(input_shape)
        model = model_update(data, num_epochs=num_epochs, model=model)
    else:
        model = torch.load(f'Models/{scenario}.pth').to(device)
    return model


def ensemble_training(x_train, y_train=None, num_epochs=10, mode=None, scenario=1, load_mode=None):
    input_shape = x_train.shape[1]
    if mode == "init":
        train_loader, benign_train = dp.prepare_datasets(x_train, y_train)
    elif mode == None:
        train_loader, _ = dp.loading_datasets(x_train)
    model = load_model(load_mode, input_shape, scenario, train_loader, x_train, num_epochs)
    return model


def isit_shift(recon_old, recon_new, threshold):
    recon_old_sorted = sorted(recon_old)
    recon_new_sorted = sorted(recon_new)
    ks_statistic, p_value = ks_2samp(recon_old_sorted, recon_new_sorted)
    if p_value < threshold:
        return True
    else:
        print(f' No Shift !')
        return False


def select_and_adapt(probs, probs_vector, data_slice, label_slice, models_list, threshold_list, benign_train, selected_model, y_pred, selected_threshold, x_train, y_train, args):
    print(datetime.now())
    x_selected, selected_idx, selected_true = selection.mateen_selector(selected_model, data_slice, label_slice, args)
    print(datetime.now())
    print(f'Selected Predictions {Counter(y_pred[selected_idx])}')
    print(f'Selected True labels {Counter(selected_true.flatten())}')
    print(f' Predictions {Counter(y_pred)}')
    print(f' True labels {Counter(label_slice.flatten())}')
    performance = f1_score(selected_true, y_pred[selected_idx], average='micro')
    if (performance < args.performance_thres):
        big_model = copy.deepcopy(models_list[0])
        print(f' Bad Performance: {performance}')
        if x_selected is not None:
            print(x_selected.shape)
            print(' Train Temp Model')
            benign_train = np.concatenate((benign_train, x_selected))
            new_model = model_update(x_selected, num_epochs=100, model=big_model)
            thres = utils.threshold_calculation(new_model, benign_train)
            models_list.append(new_model)
            threshold_list.append(thres)

        y_pred, _ = utils.preds_and_probs(models_list[0], threshold_list[0], data_slice[selected_idx])
        big_model_performance = f1_score(selected_true, y_pred, average='micro')
        if (big_model_performance < args.performance_thres):
            print(f'Update Large Model (Current Performance {big_model_performance})')
            updated_model = model_update(benign_train, num_epochs=10, model=models_list[0])
            updated_model_thres = utils.threshold_calculation(updated_model, benign_train)
            models_list[0] = updated_model
            threshold_list[0] = updated_model_thres
        if len(models_list) >= args.max_ensemble_length:
            print('Cleaning Ensemble')
            print(f' Ensemble Length {len(models_list)}')
            temp_models = models_list[1:-1]
            temp_thresholds = threshold_list[1:-1]
            print(f' Merged Length {len(temp_models)}')
            temp_model = merge.merge_tmp_models(temp_models, temp_thresholds, data_slice[selected_idx], label_slice[selected_idx], benign_train)
            print('Fine Tune Merged Model')
            temp_model_thres = utils.threshold_calculation(temp_model, benign_train)
            models_list = [models_list[0], temp_model, models_list[-1]]
            threshold_list = [threshold_list[0], temp_model_thres, threshold_list[-1]]
        selected_model, selected_threshold, model_idx, selected_f1, f1_list = merge.get_best_models("selection", models_list, threshold_list, data_slice[selected_idx], label_slice[selected_idx])
        print(f' Model {model_idx} Selected with F1 {selected_f1} ; other models F1s {f1_list}')
    return models_list, threshold_list, selected_model, selected_threshold, benign_train, x_train, y_train

def adaptive_ensemble(x_train, y_train, x_slice, y_slice, args):
    cade_model = None
    model = ensemble_training(x_train, y_train=y_train, num_epochs=100, mode="init", scenario=args.dataset_name)
    benign_train = x_train[y_train==0]
    selected_threshold = utils.threshold_calculation(model, benign_train)
    predictions = []
    probs_list = []
    print(f'Updating Models Process Started!')
    models_list = [model]
    threshold_list = [selected_threshold]
    selected_model = model
    for i in range(len(x_slice)):
        print(f'Step {i+1}/{len(x_slice)}')
        y_pred, probs = utils.preds_and_probs(selected_model, selected_threshold, x_slice[i])
        _, old_probs = utils.preds_and_probs(selected_model, selected_threshold, benign_train[-len(x_slice[i]):])
        predictions.extend(y_pred)
        probs_list.extend(probs)
        data_slice = x_slice[i]
        label_slice = y_slice[i]
        if i+1 == len(x_slice):
            return predictions, probs_list
        if isit_shift(old_probs, probs, args.shift_threshold) == True:
            probs_vector = utils.get_features_error(selected_model, x_slice[i])
            models_list, threshold_list, selected_model, selected_threshold, benign_train, x_train, y_train = select_and_adapt(probs, probs_vector, data_slice, label_slice, models_list, threshold_list, benign_train, selected_model, y_pred, selected_threshold, x_train, y_train, args)
    return predictions, probs_list
--------------------------------------------------------------------------------
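
isit_shift reduces to a two-sample Kolmogorov-Smirnov test on the reconstruction-error distributions. A hypothetical sketch with synthetic error samples (the normal-distribution parameters are arbitrary; run from inside MateenUtils/):

    import numpy as np
    import main as Mateen_main

    rng = np.random.default_rng(0)
    old_errs = rng.normal(0.05, 0.01, 5000)  # errors on previously seen benign traffic
    new_errs = rng.normal(0.08, 0.01, 5000)  # visibly larger errors on the new window

    print(Mateen_main.isit_shift(old_errs, new_errs, threshold=0.05))  # True: p-value < 0.05
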
/README.md:
--------------------------------------------------------------------------------