├── requirements.txt
├── Mateen.py
├── MateenUtils
│   ├── AE.py
│   ├── data_processing.py
│   ├── merge_utils.py
│   ├── utils.py
│   ├── selection_utils.py
│   └── main.py
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
torch==2.0.1
numpy==1.25.0
pandas==1.5.3
scipy==1.10.1
scikit-learn==1.2.2
tqdm==4.65.0
--------------------------------------------------------------------------------

/Mateen.py:
--------------------------------------------------------------------------------
import argparse
import pandas as pd
import numpy as np
import sys
sys.path.append('MateenUtils/')

import data_processing as dp
import utils
import main as Mateen_main

parser = argparse.ArgumentParser()

parser.add_argument('--dataset_name', type=str, default="IDS2017", choices=["IDS2017", "IDS2018", "Kitsune", "mKitsune", "rKitsune"])

parser.add_argument('--window_size', type=int, default=50000, choices=[10000, 50000, 100000])

parser.add_argument('--performance_thres', type=float, default=0.99, choices=[0.99, 0.95, 0.90, 0.85, 0.8])

parser.add_argument('--max_ensemble_length', type=int, default=3, choices=[3, 5, 7])

parser.add_argument('--selection_budget', type=float, default=0.01, choices=[0.005, 0.01, 0.05, 0.1])

parser.add_argument('--mini_batch_size', type=int, default=1000, choices=[500, 1000, 1500])

parser.add_argument('--retention_rate', type=float, default=0.3, choices=[0.3, 0.5, 0.9])

parser.add_argument('--lambda_0', type=float, default=0.1, choices=[0.1, 0.5, 1.0])

parser.add_argument('--shift_threshold', type=float, default=0.05, choices=[0.05, 0.1, 0.2])

args = parser.parse_args()


def main(args):
    x_train, x_test, y_train, y_test = dp.prepare_data(scenario=args.dataset_name)
    x_slice, y_slice = dp.partition_array(x_data=x_test, y_data=y_test, slice_size=args.window_size)
    predictions, probs_list = Mateen_main.adaptive_ensemble(x_train, y_train, x_slice, y_slice, args)
    _ = utils.getResult(y_test, predictions)
    auc_rocs = utils.auc_roc_in_chunks(y_test, probs_list, chunk_size=args.window_size)
    print(f' Average AUC-ROC: {np.mean(auc_rocs)}, STD: {np.std(auc_rocs)}')
    df = pd.DataFrame({'Probabilities': probs_list, 'Predictions': predictions})
    df.to_csv(f'Results/{args.dataset_name}-{args.selection_budget}.csv', index=False)
    return


main(args)
--------------------------------------------------------------------------------
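
Example invocation (not part of the repository): this sketch assumes the datasets are laid out under Datasets/ as expected by MateenUtils/data_processing.py, that a pretrained model exists at Models/<dataset_name>.pth (loaded in MateenUtils/main.py), and that a Results/ directory exists for the output CSV.

    python Mateen.py --dataset_name IDS2017 --window_size 50000 --selection_budget 0.01 --performance_thres 0.99
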
/MateenUtils/AE.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim as optim
import random
from tqdm import tqdm


seed = 0
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
random.seed(0)

device = "cuda" if torch.cuda.is_available() else "cpu"


class autoencoder(nn.Module):
    def __init__(self, feature_size):
        super(autoencoder, self).__init__()
        self.encoder = nn.Sequential(nn.Linear(feature_size, int(feature_size*0.75)),
                                     nn.ReLU(True),
                                     nn.Linear(int(feature_size*0.75), int(feature_size*0.5)),
                                     nn.ReLU(True),
                                     nn.Linear(int(feature_size*0.5), int(feature_size*0.25)),
                                     nn.ReLU(True),
                                     nn.Linear(int(feature_size*0.25), int(feature_size*0.1)))

        self.decoder = nn.Sequential(nn.Linear(int(feature_size*0.1), int(feature_size*0.25)),
                                     nn.ReLU(True),
                                     nn.Linear(int(feature_size*0.25), int(feature_size*0.5)),
                                     nn.ReLU(True),
                                     nn.Linear(int(feature_size*0.5), int(feature_size*0.75)),
                                     nn.ReLU(True),
                                     nn.Linear(int(feature_size*0.75), int(feature_size)),
                                     )

    def forward(self, x):
        encode = self.encoder(x)
        decode = self.decoder(encode)
        return decode


def train_autoencoder(model, train_loader, num_epochs=100, learning_rate=0.0001):
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    for epoch in tqdm(range(num_epochs)):
        model.train()
        for batch_data in train_loader:
            inputs = batch_data[0].to(device).float()
            targets = batch_data[0].to(device).float()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
    model.eval()
    return model
--------------------------------------------------------------------------------
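
For reference, a minimal sketch of training this autoencoder in isolation on synthetic data. The 78-dimensional feature size, sample count and batch size are arbitrary assumptions, and the snippet is assumed to run from inside MateenUtils/ so that AE.py is importable (Mateen.py instead appends MateenUtils/ to sys.path).

    import numpy as np
    import torch
    from torch.utils.data import DataLoader, TensorDataset
    import AE

    # 1,000 synthetic samples with 78 features (placeholder values for illustration)
    x_benign = np.random.rand(1000, 78).astype(np.float32)
    loader = DataLoader(TensorDataset(torch.tensor(x_benign)), batch_size=256, shuffle=True)

    model = AE.autoencoder(feature_size=78)
    model = AE.train_autoencoder(model, loader, num_epochs=5, learning_rate=0.0001)
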
/MateenUtils/data_processing.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
import torch


def load_dataset(file_path, file_type=None):
    if file_type == 'parquet':
        return pd.read_parquet(file_path)
    else:
        return pd.read_csv(file_path)

def process_data(data):
    data = pd.DataFrame(data)
    return np.nan_to_num(data.astype(float))

def prepare_data(scenario="IDS2017"):
    if scenario == "IDS2017":
        data_2017_path = "Datasets/CICIDS2017/clean_data.csv"
        data_2017 = load_dataset(data_2017_path)
        train = data_2017[:693702]
        test = data_2017[693702:]
    elif scenario == "IDS2018":
        train_path = "Datasets/IDS2018/TrainData.csv"
        test_path = "Datasets/IDS2018/NewTestData.csv"
        train = load_dataset(train_path)
        test = load_dataset(test_path)
    elif scenario == "Kitsune":
        train_path = "Datasets/Kitsune/TrainData.csv"
        test_path = "Datasets/Kitsune/TestData.csv"
        train = load_dataset(train_path)
        print(f'Train Loaded with {len(train)} Samples')
        test = load_dataset(test_path)
        print(f'Test Loaded with {len(test)} Samples')
    elif scenario == "mKitsune":
        train_path = "Datasets/Kitsune/TrainData.csv"
        test_path = "Datasets/Kitsune/NewTestData.csv"
        train = load_dataset(train_path)
        print(f'Train Loaded with {len(train)} Samples')
        test = load_dataset(test_path)
        print(f'Test Loaded with {len(test)} Samples')
    elif scenario == "rKitsune":
        train_path = "Datasets/Kitsune/TrainData.csv"
        test_path = "Datasets/Kitsune/Recurring.csv"
        train = load_dataset(train_path)
        print(f'Train Loaded with {len(train)} Samples')
        test = load_dataset(test_path)
        print(f'Test Loaded with {len(test)} Samples')
    else:
        raise ValueError("Invalid scenario name.")

    print(f'Scenario {scenario} with: {len(train)} training samples and {len(test)} testing samples')
    y_train, x_train = train["Label"], process_data(train.drop('Label', axis=1))
    y_test, x_test = test["Label"], process_data(test.drop('Label', axis=1))
    scaler = MinMaxScaler().fit(x_train)
    x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

    return np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test)


def partition_array(x_data=None, y_data=None, slice_size=50000):
    num_samples = x_data.shape[0]
    # ceiling division avoids an empty trailing slice when num_samples is an exact multiple of slice_size
    num_slices = (num_samples + slice_size - 1) // slice_size

    x_slices = []
    y_slices = []
    for i in range(num_slices):
        start = i * slice_size
        end = min((i + 1) * slice_size, num_samples)
        x_slices.append(x_data[start:end])
        y_slices.append(y_data[start:end])
    print(f' Test data has been divided into slices of size {slice_size} and length of {len(x_slices)}')
    return x_slices, y_slices


def loading_datasets(benign_train):
    train_dataset = TensorDataset(torch.tensor(benign_train))
    train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
    return train_loader, benign_train

def prepare_datasets(x_train, y_train):
    indexes_ben_train = np.where(y_train == 0)[0]
    benign_train = x_train[indexes_ben_train]
    train_loader, benign_train = loading_datasets(benign_train)
    return train_loader, benign_train

def prepare_new_train_valid_data(x_train, new_set):
    if new_set is None:
        return x_train
    else:
        x_new_train = np.concatenate((x_train, new_set), axis=0)
        return x_new_train
--------------------------------------------------------------------------------
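
A small usage sketch for the slicing and loader helpers above, on synthetic arrays; the sample count, feature count and label balance are arbitrary placeholders, and the snippet assumes it runs from inside MateenUtils/.

    import numpy as np
    import data_processing as dp

    # synthetic stream: 120,000 samples, 78 features, roughly 10% labelled malicious
    x = np.random.rand(120000, 78)
    y = (np.random.rand(120000) < 0.1).astype(int)

    x_slices, y_slices = dp.partition_array(x_data=x, y_data=y, slice_size=50000)  # 3 slices: 50k, 50k, 20k
    train_loader, benign_only = dp.prepare_datasets(x, y)  # DataLoader over the label-0 rows only
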
/MateenUtils/merge_utils.py:
--------------------------------------------------------------------------------
import AE as model_base
from sklearn.metrics import f1_score
import numpy as np
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch
import random
import utils
import copy

seed = 0
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
random.seed(0)

device = "cuda" if torch.cuda.is_available() else "cpu"


def get_best_models(mode, models, thresholds, data, y_true):
    f1_list = []
    for i in range(len(models)):
        y_pred, probs = utils.preds_and_probs(models[i], thresholds[i], data)
        f1 = f1_score(y_true, y_pred, average='micro')
        f1_list.append(f1)
    index_of_max = f1_list.index(max(f1_list))
    if mode == "selection":
        return models[index_of_max], thresholds[index_of_max], index_of_max, max(f1_list), f1_list
    elif mode == "merge":
        return f1_list

def get_values_within_margin(lst, margin=0.10):
    max_value = max(lst)
    threshold = max_value * (1 - margin)
    indices = [i for i, value in enumerate(lst) if value >= threshold]
    return indices

def compute_val_f1(model, data, y_true, previous_data):
    thres = utils.threshold_calculation(model, previous_data)
    predictions, _ = utils.preds_and_probs(model, thres, data)
    f1 = f1_score(y_true, predictions, average='micro')
    return f1

def merge_layer_weights(layer1, layer2, alpha):
    layer2.weight.data = alpha * layer1.weight.data + (1 - alpha) * layer2.weight.data
    layer2.bias.data = alpha * layer1.bias.data + (1 - alpha) * layer2.bias.data


def print_model_layer_names(model):
    for name, _ in model.named_parameters():
        print(name)

def merge_models(model1: nn.Module, model2: nn.Module, data, y_true, previous_data):
    input_shape = data.shape[1]
    empty_model = model_base.autoencoder(input_shape)
    empty_model.to(device)
    best_alpha = 0.5
    best_f1 = -float('inf')
    original_state_dict = model2.state_dict()
    for alpha_value in [i * 0.01 for i in range(101)]:
        alpha = torch.tensor(alpha_value).to(device)
        temp_model = copy.deepcopy(empty_model)
        temp_model.load_state_dict(original_state_dict)
        for (seq_name1, seq1), (seq_name2, seq2) in zip(model1._modules.items(), temp_model._modules.items()):
            for (layer_name1, layer1), (layer_name2, layer2) in zip(seq1._modules.items(), seq2._modules.items()):
                if isinstance(layer1, nn.Linear) and isinstance(layer2, nn.Linear):
                    merge_layer_weights(layer1, layer2, alpha)

        f1 = compute_val_f1(temp_model, data, y_true, previous_data)
        if f1 > best_f1:
            best_f1 = f1
            best_alpha = alpha_value
        print(f' Alpha {alpha_value} -- F1: {f1}')

    print(f'Best Alpha is {best_alpha}')
    final_model = copy.deepcopy(empty_model)
    final_model.load_state_dict(original_state_dict)
    for (seq_name1, seq1), (seq_name2, seq2) in zip(model1._modules.items(), final_model._modules.items()):
        for (layer_name1, layer1), (layer_name2, layer2) in zip(seq1._modules.items(), seq2._modules.items()):
            if isinstance(layer1, nn.Linear) and isinstance(layer2, nn.Linear):
                merge_layer_weights(layer1, layer2, torch.tensor(best_alpha).to(device))

    return final_model

def merge_tmp_models(models, thresholds, data, y_true, previous_data):
    f1_scores = get_best_models("merge", models, thresholds, data, y_true)
    candidates_idx = get_values_within_margin(f1_scores, margin=0.10)
    if len(candidates_idx) == 1:
        return models[candidates_idx[0]]
    merged_model = models[candidates_idx[0]]
    for idx in candidates_idx[1:]:
        merged_model = merge_models(merged_model, models[idx], data, y_true, previous_data)

    print(f'Models {candidates_idx} Have Been Merged')
    return merged_model
--------------------------------------------------------------------------------
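
To make the interpolation direction in merge_layer_weights explicit, a toy sketch with two small linear layers (the layer sizes and alpha value are arbitrary, and the snippet assumes it runs from inside MateenUtils/); layer2 is overwritten in place with alpha * layer1 + (1 - alpha) * layer2.

    import torch
    import torch.nn as nn
    from merge_utils import merge_layer_weights

    layer_a = nn.Linear(4, 2)
    layer_b = nn.Linear(4, 2)

    # overwrite layer_b with 0.25 * layer_a + 0.75 * layer_b
    merge_layer_weights(layer_a, layer_b, torch.tensor(0.25))
    print(layer_b.weight.data)
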
/MateenUtils/utils.py:
--------------------------------------------------------------------------------
import torch
import numpy as np
import random
import torch.nn as nn
import torch.backends.cudnn as cudnn
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score
from collections import Counter


seed = 0
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
random.seed(0)

device = "cuda" if torch.cuda.is_available() else "cpu"


getMSEvec = nn.MSELoss(reduction='none')

def se2rmse(a):
    return torch.sqrt(sum(a.t())/a.shape[1])

def threshold_calculation(model, x_data):
    model.eval()
    output = model((torch.tensor(x_data).float()).to(device))
    # cast the target to float32 as well so it matches the model output dtype
    mse_vec = getMSEvec(output, torch.tensor(x_data).float().to(device))
    rmse_vec = se2rmse(mse_vec).cpu().data.numpy()
    rmse_vec.sort()
    pctg = 0.95
    thres = rmse_vec[int(len(rmse_vec)*pctg)]
    return thres

def preds_and_probs(model, threshold, X_test):
    X_test_tensor = torch.from_numpy(X_test).type(torch.float).to(device)
    model.eval()
    output = model(X_test_tensor)
    mse_vec = getMSEvec(output, X_test_tensor)
    rmse_vec = se2rmse(mse_vec).cpu().data.numpy()
    y_pred = np.asarray([0] * len(rmse_vec))
    idx_mal = np.where(rmse_vec > threshold)
    y_pred[idx_mal] = 1
    return y_pred, rmse_vec


def get_features_error(model, X_test):
    X_test_tensor = torch.from_numpy(X_test).type(torch.float).to(device)
    model.eval()
    output = model(X_test_tensor).cpu().data.numpy()
    errors = np.abs(X_test - output)
    return errors


def getResult(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    positive_label = 0
    print("Predicted Labels", Counter(y_pred))
    print("True Labels", Counter(y_true))
    if positive_label == 0:
        print('Positive label: 0')
        tp, fn, fp, tn = cm.ravel()
    else:
        print('Positive label: 1')
        tn, fp, fn, tp = cm.ravel()
    attacks = tp + fn
    normals = fp + tn
    accuracy = ((tp + tn) / (attacks + normals)) * 100
    precision = (tp / (tp + fp)) * 100
    recall = (tp / (tp + fn)) * 100
    f1 = (2 * (((precision / 100) * (recall / 100)) / ((precision / 100) + (recall / 100)))) * 100
    tnr = (tn / (tn + fp)) * 100
    macro_recall = recall_score(y_true, y_pred, average='macro') * 100
    macro_precision = precision_score(y_true, y_pred, average='macro') * 100
    macro_f1 = f1_score(y_true, y_pred, average='macro') * 100
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred) * 100
    tpr = (tp / (tp + fn)) * 100

    print("General Accuracy: {:.4f}".format(accuracy))
    print("Recall: {:.4f}".format(recall))
    print("Precision: {:.4f}".format(precision))
    print("F1 Score: {:.4f}".format(f1))
    print("True Negative Rate: {:.4f}".format(tnr))
    print(f"True Positive Rate: {tpr:.2f}%")
    print("Macro Recall: {:.4f}".format(macro_recall))
    print("Macro Precision: {:.4f}".format(macro_precision))
    print("Macro F1 Score: {:.4f}".format(macro_f1))
    print("Balanced Accuracy: {:.4f}".format(balanced_accuracy))
    return accuracy, recall, precision, f1, tnr, macro_recall, macro_precision, macro_f1, balanced_accuracy


def auc_roc_in_chunks(y_test, probs_list, chunk_size=50000):
    num_chunks = len(y_test) // chunk_size + (1 if len(y_test) % chunk_size != 0 else 0)
    auc_roc_scores = []
    for i in range(num_chunks):
        start = i * chunk_size
        end = start + chunk_size
        y_test_chunk = y_test[start:end]
        probs_list_chunk = probs_list[start:end]
        if set(y_test_chunk) == {0, 1}:
            auc_roc = roc_auc_score(y_test_chunk, probs_list_chunk)
            auc_roc_scores.append(auc_roc)
    return auc_roc_scores
--------------------------------------------------------------------------------
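
A short sketch of the detection path defined above: fit a threshold on benign training data (the 95th percentile of reconstruction RMSE), then flag test samples whose RMSE exceeds it. The random data, feature size and untrained model are placeholders, and the snippet assumes it runs from inside MateenUtils/.

    import numpy as np
    import AE
    import utils

    x_benign = np.random.rand(2000, 78).astype(np.float32)
    x_test = np.random.rand(500, 78).astype(np.float32)

    model = AE.autoencoder(78).to(utils.device)
    thres = utils.threshold_calculation(model, x_benign)
    y_pred, rmse = utils.preds_and_probs(model, thres, x_test)  # y_pred[i] == 1 when rmse[i] > thres
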
/MateenUtils/selection_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
import torch.backends.cudnn as cudnn
from scipy.spatial.distance import pdist, squareform
import random
import utils
from tqdm import tqdm

seed = 0
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
random.seed(0)
device = "cuda" if torch.cuda.is_available() else "cpu"


def get_unique(reps, data_idx, min_distance):
    distances = squareform(pdist(reps))
    num_samples = distances.shape[0]
    valid = np.ones(num_samples, dtype=bool)
    counts = np.zeros(num_samples, dtype=int)

    for i in range(num_samples):
        if valid[i]:
            neighbors = distances[i, :] < min_distance
            counts[i] = np.sum(neighbors) - 1
            valid[neighbors] = False
            valid[i] = True
    filtered_reps = reps[valid]
    filtered_data_idx = np.array(data_idx)[valid]
    counts = counts[valid].tolist()
    return filtered_reps, filtered_data_idx, counts

def get_informative(model, data, data_idx, budget, retention_rate, initial_min_distance=0.1):
    data_idx = np.array(data_idx)
    data = torch.from_numpy(data).float().to(device)
    errs_vector = utils.getMSEvec(model(data), data).cpu().data.numpy()
    reps = model(data).cpu().data.numpy()
    reps = np.hstack((reps, errs_vector))
    target_count = int(retention_rate * len(data))
    lower_bound = 0
    upper_bound = 1
    tolerance = 0.0000001
    max_iterations = 50
    iteration = 0

    while (upper_bound - lower_bound) > tolerance and iteration < max_iterations:
        iteration += 1
        mid_point = (upper_bound + lower_bound) / 2
        filtered_reps, filtered_data_idx, similar_samples = get_unique(reps, data_idx, mid_point)
        if len(filtered_data_idx) < target_count:
            upper_bound = mid_point
        else:
            lower_bound = mid_point
    similar_samples = min_max_scaling(similar_samples)
    return filtered_data_idx, similar_samples

def min_max_scaling(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

def get_rep(model, data, idx, budget, similar_rates, lambda_0, lambda_1=1.0):
    data = torch.from_numpy(data).float().to(device)
    reps = utils.getMSEvec(model(data), data).cpu().data.numpy()
    distances = squareform(pdist(reps))
    np.fill_diagonal(distances, 0)
    distance_sums = distances.sum(axis=1)
    distance_sums = min_max_scaling(distance_sums)
    final_score = (lambda_0 * distance_sums) + (lambda_1 * similar_rates)
    sorted_indices = np.argsort(-final_score)
    selected_data_idx_in_sorted = sorted_indices[:budget]
    selected_original_idx = idx[selected_data_idx_in_sorted]
    selected_data = data[selected_data_idx_in_sorted]
    return selected_original_idx

def data_to_bins(model, data, batch_size=1000):
    data = torch.from_numpy(data).float().to(device)
    recon_errs = utils.se2rmse(utils.getMSEvec(model(data), data)).cpu().data.numpy()
    indices = np.arange(len(recon_errs))
    sorted_indices = indices[np.argsort(recon_errs)[::-1]]
    num_batches = len(sorted_indices) // batch_size
    batches_indices = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = start_idx + batch_size
        batches_indices.append(sorted_indices[start_idx:end_idx])
    return batches_indices


def mateen_selector(model, data, labels, args):
    temp_idx = []
    batch_size = args.mini_batch_size
    if len(labels) > batch_size:
        batches_indices = data_to_bins(model, data, batch_size=batch_size)
    else:
        batches_indices = [np.arange(len(data))]

    for batch in batches_indices:
        label_budget = int(args.selection_budget * len(batch))
        informative_idx, similar_rates = get_informative(model, data[batch], batch, args.selection_budget, args.retention_rate)
        informative_idx = np.array(informative_idx)
        if len(informative_idx) > label_budget:
            selected_idx = get_rep(model, data[informative_idx], informative_idx, label_budget, similar_rates, args.lambda_0)
            temp_idx.extend(selected_idx)
        else:
            temp_idx.extend(informative_idx)
    temp_idx = np.array(temp_idx)
    selected_idx = [idx for idx in temp_idx if labels[idx] == 0]

    if len(selected_idx) == 0:
        return None, temp_idx, labels[temp_idx]
    return data[selected_idx], temp_idx, labels[temp_idx]
--------------------------------------------------------------------------------
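
mateen_selector only reads mini_batch_size, selection_budget, retention_rate and lambda_0 from args, so it can be exercised on its own with an argparse.Namespace carrying the defaults from Mateen.py. The model, data and labels below are synthetic placeholders, and the snippet assumes it runs from inside MateenUtils/.

    import numpy as np
    from argparse import Namespace
    import AE
    import selection_utils as selection

    args = Namespace(mini_batch_size=1000, selection_budget=0.01, retention_rate=0.3, lambda_0=0.1)

    x = np.random.rand(3000, 78).astype(np.float32)
    y = (np.random.rand(3000) < 0.1).astype(int)
    model = AE.autoencoder(78).to(selection.device)

    # returns the selected benign samples, the indices sent for labelling, and their labels
    x_selected, selected_idx, selected_labels = selection.mateen_selector(model, x, y, args)
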
/MateenUtils/main.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.metrics import f1_score
from collections import Counter
from datetime import datetime
import copy
import torch
import torch.nn as nn
import pandas as pd
import random
import torch.backends.cudnn as cudnn
import AE as model_base
import data_processing as dp
import utils
import merge_utils as merge
import selection_utils as selection
from scipy.stats import ks_2samp


seed = 0
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
random.seed(0)

device = "cuda" if torch.cuda.is_available() else "cpu"


def model_update(x_train, y_train=None, num_epochs=100, model=None):
    input_shape = x_train.shape[1]
    train_loader, _ = dp.loading_datasets(x_train)
    model = model_base.train_autoencoder(model, train_loader, num_epochs=num_epochs, learning_rate=0.0001)
    return model


def load_model(load_mode, input_shape, scenario, train_loader, data, num_epochs):
    if load_mode == "new":
        model = model_base.autoencoder(input_shape)
        model = model_update(data, num_epochs=num_epochs, model=model)
    else:
        model = torch.load(f'Models/{scenario}.pth').to(device)
    return model


def ensemble_training(x_train, y_train=None, num_epochs=10, mode=None, scenario=1, load_mode=None):
    input_shape = x_train.shape[1]
    if mode == "init":
        train_loader, benign_train = dp.prepare_datasets(x_train, y_train)
    elif mode == None:
        train_loader, _ = dp.loading_datasets(x_train)
    model = load_model(load_mode, input_shape, scenario, train_loader, x_train, num_epochs)
    return model


def isit_shift(recon_old, recon_new, threshold):
    recon_old_sorted = sorted(recon_old)
    recon_new_sorted = sorted(recon_new)
    ks_statistic, p_value = ks_2samp(recon_old_sorted, recon_new_sorted)
    if p_value < threshold:
        return True
    else:
        print(f' No Shift !')
        return False


def select_and_adapt(probs, probs_vector, data_slice, label_slice, models_list, threshold_list, benign_train, selected_model, y_pred, selected_threshold, x_train, y_train, args):
    print(datetime.now())
    x_selected, selected_idx, selected_true = selection.mateen_selector(selected_model, data_slice, label_slice, args)
    print(datetime.now())
    print(f'Selected Predictions {Counter(y_pred[selected_idx])}')
    print(f'Selected True labels {Counter(selected_true.flatten())}')
    print(f' Predictions {Counter(y_pred)}')
    print(f' True labels {Counter(label_slice.flatten())}')
    performance = f1_score(selected_true, y_pred[selected_idx], average='micro')
    if (performance < args.performance_thres):
        big_model = copy.deepcopy(models_list[0])
        print(f' Bad Performance: {performance}')
        if x_selected is not None:
            print(x_selected.shape)
            print(' Train Temp Model')
            benign_train = np.concatenate((benign_train, x_selected))
            new_model = model_update(x_selected, num_epochs=100, model=big_model)
            thres = utils.threshold_calculation(new_model, benign_train)
            models_list.append(new_model)
            threshold_list.append(thres)

        y_pred, _ = utils.preds_and_probs(models_list[0], threshold_list[0], data_slice[selected_idx])
        big_model_performance = f1_score(selected_true, y_pred, average='micro')
        if (big_model_performance < args.performance_thres):
            print(f'Update Large Model (Current Performance {big_model_performance})')
            updated_model = model_update(benign_train, num_epochs=10, model=models_list[0])
            updated_model_thres = utils.threshold_calculation(updated_model, benign_train)
            models_list[0] = updated_model
            threshold_list[0] = updated_model_thres
        if len(models_list) >= args.max_ensemble_length:
            print('Cleaning Ensemble')
            print(f' Ensemble Length {len(models_list)}')
            temp_models = models_list[1:-1]
            temp_thresholds = threshold_list[1:-1]
            print(f' Merged Length {len(temp_models)}')
            temp_model = merge.merge_tmp_models(temp_models, temp_thresholds, data_slice[selected_idx], label_slice[selected_idx], benign_train)
            print('Fine Tune Merged Model')
            temp_model_thres = utils.threshold_calculation(temp_model, benign_train)
            models_list = [models_list[0], temp_model, models_list[-1]]
            threshold_list = [threshold_list[0], temp_model_thres, threshold_list[-1]]
        selected_model, selected_threshold, model_idx, selected_f1, f1_list = merge.get_best_models("selection", models_list, threshold_list, data_slice[selected_idx], label_slice[selected_idx])
        print(f' Model {model_idx} Selected with F1 {selected_f1} ; other models F1s {f1_list}')
    return models_list, threshold_list, selected_model, selected_threshold, benign_train, x_train, y_train

def adaptive_ensemble(x_train, y_train, x_slice, y_slice, args):
    cade_model = None
    model = ensemble_training(x_train, y_train=y_train, num_epochs=100, mode="init", scenario=args.dataset_name)
    benign_train = x_train[y_train==0]
    selected_threshold = utils.threshold_calculation(model, benign_train)
    predictions = []
    probs_list = []
    print(f'Updating Models Process Started!')
    models_list = [model]
    threshold_list = [selected_threshold]
    selected_model = model
    for i in range(len(x_slice)):
        print(f'Step {i+1}/{len(x_slice)}')
        y_pred, probs = utils.preds_and_probs(selected_model, selected_threshold, x_slice[i])
        _, old_probs = utils.preds_and_probs(selected_model, selected_threshold, benign_train[-len(x_slice[i]):])
        predictions.extend(y_pred)
        probs_list.extend(probs)
        data_slice = x_slice[i]
        label_slice = y_slice[i]
        if i+1 == len(x_slice):
            return predictions, probs_list
        if isit_shift(old_probs, probs, args.shift_threshold) == True:
            probs_vector = utils.get_features_error(selected_model, x_slice[i])
            models_list, threshold_list, selected_model, selected_threshold, benign_train, x_train, y_train = select_and_adapt(probs, probs_vector, data_slice, label_slice, models_list, threshold_list, benign_train, selected_model, y_pred, selected_threshold, x_train, y_train, args)
    return predictions, probs_list
--------------------------------------------------------------------------------
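
isit_shift reduces to a two-sample Kolmogorov-Smirnov test on the reconstruction-error distributions. A hypothetical sketch with synthetic error samples (the normal-distribution parameters are arbitrary; run from inside MateenUtils/):

    import numpy as np
    import main as Mateen_main

    rng = np.random.default_rng(0)
    old_errs = rng.normal(0.05, 0.01, 5000)  # errors on previously seen benign traffic
    new_errs = rng.normal(0.08, 0.01, 5000)  # visibly larger errors on the new window

    print(Mateen_main.isit_shift(old_errs, new_errs, threshold=0.05))  # True: p-value < 0.05
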
/README.md:
--------------------------------------------------------------------------------