├── .DS_Store ├── Baseline ├── .DS_Store ├── .idea │ ├── .gitignore │ ├── Baseline.iml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── modules.xml │ └── vcs.xml ├── FastPC │ ├── baseline_evaluation.py │ ├── fastPC.py │ ├── hannlstm.py │ ├── interdependent.py │ ├── lbfgsb_scipy.py │ ├── libspot.so │ ├── pyspot.py │ ├── rca.py │ ├── test_FastPC_node_metric.py │ ├── test_FastPC_pod_combine.py │ ├── test_FastPC_pod_log.py │ ├── test_FastPC_pod_metric.py │ └── trace_expm.py ├── Nezha │ ├── 20240124 │ │ ├── 20240124-fault_list.json │ │ └── root_cause_hipster.json │ ├── INSTALL.md │ ├── LICENSE │ ├── alarm.py │ ├── data_integrate.py │ ├── data_parser.py │ ├── log.py │ ├── log_parsing.py │ ├── main.py │ ├── pattern_miner.py │ ├── pattern_ranker.py │ └── requirements.txt ├── Readme.md ├── SWAT&WADI │ ├── .DS_Store │ ├── baro-evaluation.py │ ├── baro.py │ ├── baro_algorithm.py │ ├── pyrca-evaluation.py │ └── pyrca-main.py ├── log_only │ ├── RCA_methods_log.py │ ├── baro_algorithm.py │ └── baro_main_log.py ├── metric_only │ ├── RCA_methods_metric.py │ ├── baro_algorithm.py │ └── baro_main_metric.py └── multimodal │ ├── RCA_methods_combined.py │ ├── baro_algorithm.py │ └── baro_main_combined.py ├── Crossiant_Metadata ├── Crossiant_Metadata_Cloud_Computing_Original.json ├── Crossiant_Metadata_Cloud_Computing_Preprocessed.json ├── Crossiant_Metadata_Product_Review_Original.json └── Crossiant_Metadata_Product_Review_Preprocessed.json ├── IT └── data preprocessing │ ├── Drain.py │ ├── JMeter_KPI.py │ ├── README.md │ ├── drain3.yaml │ ├── drain3_parse.py │ ├── json2message.py │ ├── log_PCA_extraction.py │ ├── log_frequency_extraction.py │ ├── log_golden_frequency.py │ └── metric_json2npy.py ├── LICENSE ├── OT └── data_preprocessing │ ├── Readme.md │ ├── SWaT │ ├── data_segment.py │ ├── node_data_cut.py │ ├── node_final_process.py │ ├── pod_data_cut.py │ ├── pod_final_process.py │ └── process.sh │ └── WADI │ ├── data_segment.py │ ├── node_data_cut.py │ ├── node_final_process.py │ ├── pod_data_cut.py │ ├── pod_final_process.py │ └── process.sh ├── Other ├── bg.png └── rca_update.png └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KnowledgeDiscovery/rca_baselines/c560d8cc39c19f04c9ae74fac2404b1e5e8c3b4d/.DS_Store -------------------------------------------------------------------------------- /Baseline/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KnowledgeDiscovery/rca_baselines/c560d8cc39c19f04c9ae74fac2404b1e5e8c3b4d/Baseline/.DS_Store -------------------------------------------------------------------------------- /Baseline/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /Baseline/.idea/Baseline.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /Baseline/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 
-------------------------------------------------------------------------------- /Baseline/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /Baseline/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Baseline/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Baseline/FastPC/baseline_evaluation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | folders = ['0606','0517','0524','0901','1203'] 4 | predicts = [] 5 | for fd in folders: 6 | pods_data = pd.read_csv(fd+'/output/Pod_level_combine_ranking.csv') 7 | pods = list(pods_data['pod']) 8 | # pods = [x.split('_')[1] for x in pods] 9 | predicts.append(pods) 10 | 11 | k = [1,3,5,7,10] 12 | 13 | def precision_on_topk(predicts,reals,k): 14 | pr = 0 15 | for pred, real in zip(predicts, reals): 16 | pred = pred[:k] 17 | hit_count = len(set(pred) & set(real)) 18 | min_len = min(k,len(real)) 19 | pr += hit_count/min_len 20 | return pr/len(reals) 21 | 22 | def mean_precision_k(predicts,reals,k): 23 | pr = 0 24 | for i in range(1,k+1): 25 | pr += precision_on_topk(predicts,reals,i) 26 | return pr/k 27 | 28 | def mrr(predicts,reals): 29 | mrr_val = 0 30 | for preds,real in zip(predicts,reals): 31 | tmp = [] 32 | for real_item in real: 33 | index = preds.index(real_item) if real_item in preds else sys.maxsize-1 34 | tmp.append(index+1) 35 | mrr_val += 1/min(tmp) 36 | return mrr_val/len(reals) 37 | 38 | reals = [['productpage-v1-5f9dbcd669-z2prs'], 39 | ['catalogue-8667bb6cbc-hqzfw'], 40 | ['catalogue-85fd4965b7-q8477'], 41 | ['catalogue-6c7b9b975-xfjps'], 42 | ['mongodb-v1-64c6b69879-p4wfp']] 43 | 44 | for item in k: 45 | pr = precision_on_topk(predicts,reals,item) 46 | map_val = mean_precision_k(predicts,reals,item) 47 | mrr_val = mrr(predicts,reals) 48 | print("pr@{}:{} map@{}:{} mrr:{}".format(item,pr,item,map_val,mrr_val)) 49 | -------------------------------------------------------------------------------- /Baseline/FastPC/interdependent.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import argparse 4 | from sklearn import preprocessing 5 | 6 | 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--input', type=str, default='inrc') 11 | 12 | args = parser.parse_args() 13 | 14 | node_file = './0517/output/'+args.input + '_node_all.npy' 15 | pod_file = './0517/output/'+args.input + '_pod_all.npy' 16 | mp_file = '/nfs/users/zach/aiops_data/data/0517/p2n.npy' 17 | 18 | node_data = np.load(node_file, allow_pickle=True).item() 19 | pod_data = np.load(pod_file, allow_pickle=True).item() 20 | mp_data = np.load(mp_file, allow_pickle=True).item() 21 | 22 | pod_names = pod_data['columns'] 23 | node_names = node_data['columns'] 24 | 25 | pod_scores = pod_data['score'] 26 | node_scores = node_data['score'] 27 | 28 | p2s = dict(zip(pod_names, pod_scores)) 29 | n2s = dict(zip(node_names, node_scores)) 30 | 31 | ctotal = 0 32 | del_keys = [] 33 | for p in p2s: 34 | if p not in mp_data: 35 | del_keys.append(p) 36 | 
continue 37 | node = mp_data[p] 38 | p2s[p] = p2s[p] * n2s[node] 39 | ctotal += p2s[p] 40 | 41 | for k in del_keys: 42 | p2s.pop(k) 43 | 44 | fd = {} 45 | for p in p2s: 46 | fd[p] = [p2s[p] / ctotal] 47 | 48 | 49 | scores = pd.DataFrame.from_dict(fd, orient='index', columns=['ranking_score']) 50 | 51 | ranking_score = scores.reset_index(drop=True).to_numpy().reshape(-1) 52 | ranking_score = preprocessing.normalize([ranking_score]).ravel() 53 | #print(ranking_score) 54 | columns = list(scores.index) 55 | 56 | #scores = scores.sort_values(by='ranking_score', ascending=False) 57 | ranking = np.argsort(ranking_score)[::-1] 58 | 59 | K= len(ranking_score) 60 | #results_combine = {} 61 | 62 | results_combine = pd.DataFrame() 63 | results_combine['ranking'] = [i+1 for i in range(K)] 64 | #results_combine = pd.DataFrame(results_combine, columns = ['ranking']) 65 | results_combine ['pod'] = [columns[ranking[i]] for i in range(K)] 66 | results_combine ['score'] = [ranking_score[ranking[i]] for i in range(K)] 67 | results_combine.to_csv('./0517/output/'+ args.input + '_hierarchical_ranking_metrics.csv') 68 | print(results_combine) 69 | print('Successfully output the root cause results with considering both node level and pod level') 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /Baseline/FastPC/lbfgsb_scipy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import scipy.optimize as sopt 3 | 4 | 5 | class LBFGSBScipy(torch.optim.Optimizer): 6 | """Wrap L-BFGS-B algorithm, using scipy routines. 7 | """ 8 | 9 | def __init__(self, params): 10 | defaults = dict() 11 | super(LBFGSBScipy, self).__init__(params, defaults) 12 | 13 | if len(self.param_groups) != 1: 14 | raise ValueError("LBFGSBScipy doesn't support per-parameter options" 15 | " (parameter groups)") 16 | 17 | self._params = self.param_groups[0]['params'] 18 | self._numel = sum([p.numel() for p in self._params]) 19 | 20 | def _gather_flat_grad(self): 21 | views = [] 22 | for p in self._params: 23 | if p.grad is None: 24 | view = p.data.new(p.data.numel()).zero_() 25 | elif p.grad.data.is_sparse: 26 | view = p.grad.data.to_dense().view(-1) 27 | else: 28 | view = p.grad.data.view(-1) 29 | views.append(view) 30 | return torch.cat(views, 0) 31 | 32 | def _gather_flat_bounds(self): 33 | bounds = [] 34 | for p in self._params: 35 | if hasattr(p, 'bounds'): 36 | b = p.bounds 37 | else: 38 | b = [(None, None)] * p.numel() 39 | bounds += b 40 | return bounds 41 | 42 | def _gather_flat_params(self): 43 | views = [] 44 | for p in self._params: 45 | if p.data.is_sparse: 46 | view = p.data.to_dense().view(-1) 47 | else: 48 | view = p.data.view(-1) 49 | views.append(view) 50 | return torch.cat(views, 0) 51 | 52 | def _distribute_flat_params(self, params): 53 | offset = 0 54 | for p in self._params: 55 | numel = p.numel() 56 | # view as to avoid deprecated pointwise semantics 57 | p.data = params[offset:offset + numel].view_as(p.data) 58 | offset += numel 59 | assert offset == self._numel 60 | 61 | def step(self, closure): 62 | """Performs a single optimization step. 63 | 64 | Arguments: 65 | closure (callable): A closure that reevaluates the model 66 | and returns the loss. 
67 | """ 68 | assert len(self.param_groups) == 1 69 | 70 | def wrapped_closure(flat_params): 71 | """closure must call zero_grad() and backward()""" 72 | flat_params = torch.from_numpy(flat_params) 73 | flat_params = flat_params.to(torch.get_default_dtype()) 74 | self._distribute_flat_params(flat_params) 75 | loss = closure() 76 | loss = loss.item() 77 | flat_grad = self._gather_flat_grad().cpu().detach().numpy() 78 | return loss, flat_grad.astype('float64') 79 | 80 | initial_params = self._gather_flat_params() 81 | initial_params = initial_params.cpu().detach().numpy() 82 | 83 | bounds = self._gather_flat_bounds() 84 | 85 | # Magic 86 | sol = sopt.minimize(wrapped_closure, 87 | initial_params, 88 | method='L-BFGS-B', 89 | jac=True, 90 | bounds=bounds) 91 | 92 | final_params = torch.from_numpy(sol.x) 93 | final_params = final_params.to(torch.get_default_dtype()) 94 | self._distribute_flat_params(final_params) 95 | 96 | 97 | def main(): 98 | import torch.nn as nn 99 | # torch.set_default_dtype(torch.double) 100 | 101 | n, d, out, j = 10000, 3000, 10, 0 102 | input = torch.randn(n, d) 103 | w_true = torch.rand(d, out) 104 | w_true[j, :] = 0 105 | target = torch.matmul(input, w_true) 106 | linear = nn.Linear(d, out) 107 | linear.weight.bounds = [(0, None)] * d * out # hack 108 | for m in range(out): 109 | linear.weight.bounds[m * d + j] = (0, 0) 110 | criterion = nn.MSELoss() 111 | optimizer = LBFGSBScipy(linear.parameters()) 112 | print(list(linear.parameters())) 113 | 114 | def closure(): 115 | optimizer.zero_grad() 116 | output = linear(input) 117 | loss = criterion(output, target) 118 | print('loss:', loss.item()) 119 | loss.backward() 120 | return loss 121 | optimizer.step(closure) 122 | print(list(linear.parameters())) 123 | print(w_true.t()) 124 | 125 | 126 | if __name__ == '__main__': 127 | main() 128 | 129 | -------------------------------------------------------------------------------- /Baseline/FastPC/libspot.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KnowledgeDiscovery/rca_baselines/c560d8cc39c19f04c9ae74fac2404b1e5e8c3b4d/Baseline/FastPC/libspot.so -------------------------------------------------------------------------------- /Baseline/FastPC/rca.py: -------------------------------------------------------------------------------- 1 | # import pyspot as ps 2 | import numpy as np 3 | import pandas as pd 4 | import networkx as nx 5 | from causalnex.structure import dynotears 6 | from causalnex.structure.dynotears import from_pandas_dynamic 7 | from pyspot import DSpot, Spot 8 | from pingouin import partial_corr 9 | import torch 10 | from hannlstm import hannLSTM, train_model_pgd 11 | from fastPC import Fast_PC_Causal_Graph 12 | import scipy 13 | from numpy.linalg import norm, inv 14 | from sklearn import preprocessing 15 | 16 | 17 | def optENMFSoft( A, P, M, c, tau, max_iter=100): 18 | 19 | n = A.shape[0] 20 | B = (1-c) * inv(np.eye(n) - c * A) 21 | BB = B.transpose() @ B 22 | 23 | t = 1e-30 24 | 25 | e = np.ones((n, 1)) 26 | s = scipy.special.softmax(B @ e) 27 | obj = norm((s @ s.transpose()) * M, 'fro') ** 2 + tau * norm(e, 1) 28 | obj_old = obj 29 | err = 1 30 | iter = 0 31 | 32 | # maxIter = 1000 33 | errorV=[] 34 | 35 | while (err > t) and (iter < max_iter): 36 | s=scipy.special.softmax(B @ e) 37 | phi=np.diag(s) - s @ s.transpose() 38 | 39 | numerator = 4*(B.transpose() @ phi) @ (P*M) @ s 40 | # print(numerator) 41 | numerator[numerator<0]=0 42 | denominator = 4 * B.transpose() @ 
((phi@s@s.transpose())*M)@s+ tau * np.ones((n,1)) 43 | e=e * np.sqrt(np.sqrt(numerator/denominator)) 44 | # print(e) 45 | # %err=norm(e-e_old,'fro') 46 | obj=norm((s@s.transpose())*M - P,'fro') ** 2 + tau * norm(e,1) 47 | err=np.abs(obj-obj_old) 48 | obj_old=obj 49 | iter = iter +1 50 | errorV.append(err) 51 | return e 52 | 53 | def spot_detection(X, d: int=10, q: float=1e-4, n_init:int=100, level:float=0.98)->np.ndarray: 54 | 55 | # X_mean = np.mean(X, axis=0) 56 | # X_std = np.std(X, axis=0) 57 | # X_std[X_std < 1e-3] = 1 58 | # X = (X - X_mean) / X_std 59 | 60 | nvar = X.shape[1] 61 | T = X.shape[0] 62 | score_list = [] 63 | for i in range(nvar): 64 | S = DSpot(d=d, q=q, n_init=n_init, level=level) 65 | score = [] 66 | for t in range(T): 67 | xt = X[t, i] 68 | event = S.step(xt) 69 | st = 0 70 | if t >= n_init: 71 | # up alert 72 | if event == 1: 73 | upper_threshold = S.status().to_dict()['z_up'] 74 | assert(xt >= upper_threshold) 75 | 76 | if upper_threshold == 0: 77 | upper_threshold = 0.0001 78 | 79 | st = (xt - upper_threshold) / upper_threshold 80 | # print('z_up is event!') 81 | # down alert 82 | if event == -1: 83 | lower_threshold = S.status().to_dict()['z_down'] 84 | assert(xt <= lower_threshold) 85 | 86 | if lower_threshold == 0: 87 | lower_threshold = 0.0001 88 | 89 | st = (lower_threshold - xt) / lower_threshold 90 | # print('z_down is event!') 91 | st = np.abs(st) 92 | score.append(st) 93 | score_list.append(score) 94 | np_score = np.array(score_list).transpose() 95 | return np_score 96 | 97 | def detect_individual_causal(X: np.ndarray, 98 | method:str='SPOT', 99 | args:dict={'d': 10, 'q': 1e-4, 'n_init': 100, 'level':0.98})->np.ndarray: 100 | if method == 'SPOT': 101 | d = args['d'] 102 | q = args['q'] 103 | n_init = args['n_init'] 104 | level = args['level'] 105 | score = spot_detection(X, d, q, n_init, level) 106 | return score 107 | 108 | 109 | # LSTM based method 110 | def lstm(X: np.ndarray, hidden: int, context: int, lam: float, lam_ridge: float, lr: float, max_iter: int, check_every: int, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')): 111 | # device = torch.device('cuda:0') 112 | 113 | X_np = torch.tensor(X[np.newaxis], dtype=torch.float64, device=device) 114 | hannlstm = hannLSTM(X.shape[-1], hidden=hidden).to(device=device) 115 | X_np = X_np.float() 116 | train_loss_list = train_model_pgd(hannlstm, X_np, context=context, lam=lam, lam_ridge=lam_ridge, lr=lr, max_iter=max_iter, check_every=check_every) 117 | W_est = hannlstm.GC(False).cpu().data.numpy() 118 | 119 | return W_est 120 | 121 | 122 | def generate_causal_graph(X: np.ndarray, 123 | method: str ='dynotears', 124 | args: dict = {'lag': 10, 125 | 'lambda_w': 1e-3, 126 | 'lambda_a': 1e-3, 127 | 'max_iter':30})->np.ndarray: 128 | if method == 'lstm': 129 | torch.set_default_tensor_type(torch.FloatTensor) 130 | hidden = args['hidden'] 131 | context = args['context'] 132 | lam = args['lam'] 133 | lam_ridge = args['lam_ridge'] 134 | lr = args['lr'] 135 | max_iter = args['max_iter'] 136 | check_every = args['check_every'] 137 | device = args['device'] 138 | W_est = lstm(X, hidden, context, lam, lam_ridge, lr, max_iter, check_every, device) 139 | elif method == 'dynotears': 140 | if 'columns' not in args: 141 | columns = ['V{}'.format(i) for i in range(X.shape[1])] 142 | else: 143 | columns = args['columns'] 144 | lag = args['lag'] 145 | lambda_w = args['lambda_w'] 146 | lambda_a = args['lambda_a'] 147 | max_iter = args['max_iter'] 148 | 149 | X_lag = np.roll(X,1,axis=0) 150 | for lag_o in 
range(2,lag+1): 151 | X_lag = np.hstack((X_lag,np.roll(X,lag_o, axis=0))) 152 | W_est = dynotears.from_numpy_dynamic(X, X_lag, lambda_w, lambda_a, max_iter) 153 | elif method == 'fastpc': 154 | W_est = Fast_PC_Causal_Graph(pd.DataFrame(X),alpha=10**-6,cuda=True) 155 | return W_est 156 | 157 | # generate transition matrix from weight matrix 158 | # W: W[i,j] i->j 159 | def generate_Q(X:np.ndarray, W:np.ndarray, RI:int, rho:float, columns: list=None): 160 | n = W.shape[0] 161 | if columns is None: 162 | columns=['V{}'.format(i) for i in range(n)] 163 | df = pd.DataFrame(X, index=[i for i in range(X.shape[0])], columns=columns) 164 | 165 | # parent nodes 166 | PAak = [columns[i] for i, x in enumerate(W[:, RI]) if (x == 1) and (i != RI)] 167 | vak = columns[RI] 168 | # PA = [[columns[j] for j, x in enumerate(W[:, i]) if x == 1] for i in range(n)] 169 | # PAak_minus = [[c for c in PAak if c!=columns[i]] for i in range(n)] 170 | 171 | # partial correlation 172 | Rpc = [] 173 | for i in range(n): 174 | if i == RI: 175 | Rpc.append(0) 176 | continue 177 | vi = columns[i] 178 | PAak_minus_i = [c for c in PAak if c!=columns[i]] 179 | PAi = [columns[j] for j, x in enumerate(W[:, i]) if (x == 1) and (i != j) and (RI != j)] 180 | covar = list(set(PAak_minus_i).union(set(PAi))) 181 | rdf = partial_corr(df, x=vak, y=vi, covar=covar) 182 | Rpc.append(np.abs(rdf.values[0, 1])) 183 | 184 | Q = np.zeros((n, n)) 185 | for i in range(n): 186 | P = 0 187 | for j in range(n): 188 | if i == j: 189 | continue 190 | # from result to cause 191 | if W[j][i] == 1: 192 | Q[i][j] = Rpc[j] 193 | # from cause to result: 194 | if W[i][j] == 0: 195 | Q[j][i] = rho * Rpc[i] 196 | # stay 197 | P = max(P, Q[i][j]) 198 | Q[i][i] = max(0., Rpc[i] - P) 199 | # normalize each row 200 | rsum = np.sum(Q, axis=1).reshape(-1 , 1) 201 | rsum[rsum==0] = 1 202 | Q = Q / rsum 203 | return Q 204 | 205 | # random walk with restart 206 | def propagate_error(Q:np.ndarray, start:int, steps:int=1000, rp:float=0.05, max_self:int=10)->np.ndarray: 207 | n = Q.shape[0] 208 | count = np.zeros(n) 209 | current = start 210 | self_visit = 0 211 | for step in range(steps): 212 | # print(current) 213 | # input() 214 | if np.random.rand() > rp: 215 | prob = Q[current, :] 216 | if np.sum(prob) != 1: 217 | continue 218 | next = np.random.choice(n, 1, p=prob)[0] 219 | # check if need a restart, get stuck in one node 220 | if next == current: 221 | self_visit += 1 222 | if self_visit == max_self: 223 | current = start 224 | self_visit = 0 225 | continue 226 | current = next 227 | count[current] += 1 228 | else: 229 | current = start 230 | return count 231 | 232 | if __name__ == '__main__': 233 | data = np.load('may_pod_level_data.npy', allow_pickle=True).item() 234 | label = 'Book_Info_product' 235 | X = data[label]['Sequence'][:48000, :] 236 | X = np.sum(X.reshape((-1, 100, X.shape[1])), axis=1) 237 | columns = data[label]['Pod_Name'] + data[label]['JMeter_Feature'] 238 | std = np.std(X, axis=0) 239 | idx = [i for i, x in enumerate(std > 1e-3) if x] 240 | # idx = list(range(30)) 241 | X = X[:, idx] 242 | columns = [columns[i] for i in idx] 243 | 244 | print('X shape: ', X.shape) 245 | 246 | print('Detecting Individual Causal ...') 247 | ind_casual_score = detect_individual_causal(X, method='SPOT', args={'d':10, 'q':1e-4, 'n_init':100, 'level':0.98}) 248 | ind_casual_score = np.sum(ind_casual_score, axis=0) 249 | normalized_ind_casual_score = ind_casual_score 250 | normalized_ind_casual_score[:-1] = preprocessing.normalize([ind_casual_score[:-1]]) 251 | 
print('Detecting Individual Causal Done!') 252 | 253 | # causal graph 254 | print('Generating Causal Graph ...') 255 | cg = generate_causal_graph(X, method='gnn', args={'lag': 20, 'lambda_w': 1e-3, 'lambda_a': 1e-2}) 256 | print('Generating Causal Graph Done!') 257 | # threshold top K 258 | K = 0.3*len(cg.reshape(-1)) 259 | threshold = sorted(cg.reshape(-1), reverse=True)[K-1] 260 | W = np.where(cg>=threshold, 1, 0) 261 | # Wij : i->j 262 | W = W.transpose() 263 | # print('W:', W[:, -1]) 264 | print('Generating Q ...') 265 | Q = generate_Q(X, W, RI=W.shape[0]-1, rho=1e-2) 266 | print('Q:', Q[-1, :]) 267 | print('Q sum: ', np.sum(Q)) 268 | print('Generating Q Done!') 269 | # error propagation 270 | print('Propagaing Error ...') 271 | steps = 10000 272 | count = propagate_error(Q, start=W.shape[0]-1, steps=steps) 273 | count /= steps 274 | print('Propagating Eroor Done!') 275 | 276 | # root cause ranking 277 | print('Individual Causal Score: ', normalized_ind_casual_score) 278 | print('Topological Causal score: ', count) 279 | alpha = 0.3 280 | score = alpha * normalized_ind_casual_score[:-1] + (1 - alpha) * count[:-1] 281 | # top K 282 | K = 5 283 | ranking = np.argsort(score)[::-1] 284 | for i in range(K): 285 | print('{}: {} {}'.format(i, columns[ranking[i]], score[ranking[i]])) 286 | -------------------------------------------------------------------------------- /Baseline/FastPC/trace_expm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import scipy.linalg as slin 4 | 5 | 6 | class TraceExpm(torch.autograd.Function): 7 | @staticmethod 8 | def forward(ctx, input): 9 | # detach so we can cast to NumPy 10 | #E = slin.expm(input.detach().numpy()) 11 | E = torch.matrix_exp(input) 12 | #f = np.trace(E) 13 | f = torch.trace(E) 14 | #E = torch.from_numpy(E) 15 | ctx.save_for_backward(E) 16 | return torch.as_tensor(f, dtype=input.dtype) 17 | 18 | @staticmethod 19 | def backward(ctx, grad_output): 20 | E, = ctx.saved_tensors 21 | grad_input = grad_output * E.t() 22 | return grad_input 23 | 24 | 25 | trace_expm = TraceExpm.apply 26 | 27 | 28 | def main(): 29 | input = torch.randn(20, 20, dtype=torch.double, requires_grad=True) 30 | assert torch.autograd.gradcheck(trace_expm, input) 31 | 32 | input = torch.tensor([[1, 2], [3, 4.]], requires_grad=True) 33 | tre = trace_expm(input) 34 | f = 0.5 * tre * tre 35 | print('f\n', f.item()) 36 | f.backward() 37 | print('grad\n', input.grad) 38 | 39 | 40 | if __name__ == '__main__': 41 | main() 42 | -------------------------------------------------------------------------------- /Baseline/Nezha/20240124/20240124-fault_list.json: -------------------------------------------------------------------------------- 1 | { 2 | "2024-01-24": [ 3 | { 4 | "inject_time": "2024-01-24 03:10:51", 5 | "inject_timestamp": "1706087451", 6 | "inject_pod": "ip-10-1-100-109.ap-northeast-1.compute.internal", 7 | "inject_type": "infinite loop bug" 8 | } 9 | ] 10 | } -------------------------------------------------------------------------------- /Baseline/Nezha/20240124/root_cause_hipster.json: -------------------------------------------------------------------------------- 1 | { 2 | "ip-10-1-100-109.ap-northeast-1.compute.internal": { 3 | "return": "infinite loop bug", 4 | "exception": "infinite loop bug", 5 | "cpu_consumed": "pod_level_data_cpu_usage", 6 | "infinite loop bug": "pod_level_data_cpu_usage" 7 | } 8 | } -------------------------------------------------------------------------------- 
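The two JSON files above are the ground truth for the 20240124 Nezha case: `20240124-fault_list.json` records the injected fault (time, pod, and fault type), and `root_cause_hipster.json` maps the faulty entity to the expected root-cause indicators. A minimal sketch of loading the fault list for evaluation is shown below; the relative path and variable names are illustrative and not part of the repository code.

```python
import json

# Load the injected-fault ground truth for the 20240124 case (path is illustrative).
with open("20240124/20240124-fault_list.json") as f:
    fault_list = json.load(f)

# Each date key maps to a list of injected faults with time, pod, and fault type.
for date, faults in fault_list.items():
    for fault in faults:
        print(date, fault["inject_time"], fault["inject_pod"], fault["inject_type"])
```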
/Baseline/Nezha/INSTALL.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | ### 1.1 Requirements 4 | 5 | - Python3.6 is recommended to run the anomaly detection. Otherwise, any python3 version should be fine. 6 | - Git is also needed. 7 | 8 | ### 1.2 Setup 9 | 10 | download `Nezha` first by `git clone git@github.com:IntelligentDDS/Nezha.git` 11 | 12 | `python3.6 -m pip install -r requirements.txt` to install the dependency for Nezha 13 | 14 | ### 1.3 Running Nezha 15 | 16 | #### 1.3.1 Localize OnlineBoutique at service level 17 | 18 | 19 | ``` 20 | python3.6 ./main.py --ns hipster --level service 21 | 22 | pattern_ranker.py:622: -------- hipster Fault numbuer : 56------- 23 | pattern_ranker.py:623: --------AS@1 Result------- 24 | pattern_ranker.py:624: 92.857143 % 25 | pattern_ranker.py:625: --------AS@3 Result------- 26 | pattern_ranker.py:626: 96.428571 % 27 | pattern_ranker.py:627: --------AS@5 Result------- 28 | pattern_ranker.py:628: 96.428571 % 29 | ``` 30 | 31 | #### 1.3.2 Localize OnlineBoutique at inner service level 32 | 33 | ``` 34 | python3.6 ./main.py --ns hipster --level inner 35 | 36 | pattern_ranker.py:622: -------- hipster Fault numbuer : 56------- 37 | pattern_ranker.py:623: --------AIS@1 Result------- 38 | pattern_ranker.py:624: 92.857143 % 39 | pattern_ranker.py:625: --------AIS@3 Result------- 40 | pattern_ranker.py:626: 96.428571 % 41 | pattern_ranker.py:627: --------AIS@5 Result------- 42 | pattern_ranker.py:628: 96.428571 % 43 | ``` 44 | 45 | #### 1.3.3 Localize Trainticket at service level 46 | 47 | ``` 48 | python3.6 ./main.py --ns ts --level service 49 | 50 | pattern_ranker.py:622: -------- ts Fault numbuer : 45------- 51 | pattern_ranker.py:623: --------AS@1 Result------- 52 | pattern_ranker.py:624: 86.666667 % 53 | pattern_ranker.py:625: --------AS@3 Result------- 54 | pattern_ranker.py:626: 97.777778 % 55 | pattern_ranker.py:627: --------AS@5 Result------- 56 | pattern_ranker.py:628: 97.777778 % 57 | ``` 58 | 59 | #### 1.3.4 Localize Trainticket at inner service level 60 | 61 | ``` 62 | python3.6 ./main.py --ns ts --level inner 63 | 64 | pattern_ranker.py:622: -------- ts Fault numbuer : 45------- 65 | pattern_ranker.py:623: --------AIS@1 Result------- 66 | pattern_ranker.py:624: 86.666667 % 67 | pattern_ranker.py:625: --------AIS@3 Result------- 68 | pattern_ranker.py:626: 97.777778 % 69 | pattern_ranker.py:627: --------AIS@5 Result------- 70 | pattern_ranker.py:628: 97.777778 % 71 | ``` 72 | 73 | The details of service level results and inner-service level results will be printed and recorded in `./log` -------------------------------------------------------------------------------- /Baseline/Nezha/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 IntelligentDDS 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Baseline/Nezha/alarm.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from itertools import product 3 | import os 4 | import re 5 | import datetime 6 | from os.path import dirname 7 | from log import Logger 8 | import logging 9 | from yaml import FlowMappingEndToken 10 | import numpy as np 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | import statistics 14 | import numpy as np 15 | 16 | log_path = dirname(__file__) + '/log/' + str(datetime.datetime.now().strftime( 17 | '%Y-%m-%d')) + '_nezha.log' 18 | logger = Logger(log_path, logging.DEBUG, __name__).getlog() 19 | 20 | 21 | metric_threshold_dir = "metric_threshold" 22 | 23 | 24 | def get_svc(path): 25 | svc = path.rsplit('-', 1)[0] 26 | svc = svc.rsplit('-', 1)[0] 27 | 28 | return svc 29 | 30 | 31 | def generate_threshold(metric_dir, trace_file): 32 | """ 33 | fun generate_threshold: calculte mean and std for each metric of each servie 34 | write ruslt to metric_threshold_dir/service.csv 35 | :parameter 36 | metric_dir - metric dir in construction phase 37 | """ 38 | metric_map = {} 39 | path_list = os.listdir(metric_dir) 40 | for path in path_list: 41 | if "metric" in path: 42 | svc = path.rsplit('-', 1)[0] 43 | svc = svc.rsplit('-', 1)[0] 44 | if svc in metric_map: 45 | metric_map[svc].append(os.path.join(metric_dir, path)) 46 | else: 47 | metric_map[svc] = [os.path.join(metric_dir, path)] 48 | for svc in metric_map: 49 | frames = [] 50 | 51 | # get pod name 52 | for path in path_list: 53 | if svc in path: 54 | pod_name = path.split("_")[0] 55 | # print(pod_name) 56 | network_mean, network_std = get_netwrok_metric( 57 | trace_file=trace_file, pod_name=pod_name) 58 | break 59 | 60 | metric_threshold_file = metric_threshold_dir + "/" + svc + ".csv" 61 | for path in metric_map[svc]: 62 | frames.append(pd.read_csv(path, index_col=False, usecols=[ 63 | 'CpuUsageRate(%)', 'MemoryUsageRate(%)', 'SyscallRead', 'SyscallWrite'])) 64 | # concat pods of the same service 65 | result = pd.concat(frames) 66 | with open(metric_threshold_file, 'w', newline='') as f: 67 | writer = csv.writer(f) 68 | header = ['CpuUsageRate(%)', 'MemoryUsageRate(%)', 'SyscallRead', 69 | 'SyscallWrite', 'NetworkP90(ms)'] 70 | writer.writerow(header) 71 | mean_list = [] 72 | std_list = [] 73 | for metric in header: 74 | if metric == 'NetworkP90(ms)': 75 | continue 76 | mean_list.append(np.mean(result[metric])) 77 | std_list.append(np.std(result[metric])) 78 | mean_list.append(network_mean) 79 | std_list.append(network_std) 80 | writer.writerow(mean_list) 81 | writer.writerow(std_list) 82 | 83 | 84 | def get_netwrok_metric(trace_file, pod_name): 85 | """ 86 | func get_netwrok_metric: use trace data to get netwrok metric 87 | :parameter 88 | time - to regex timestamp e.g, "2022-04-18 13:00" 89 | data_dir 90 | pod_name 91 | :return 92 | p90 netwrok latency 93 | """ 94 | latency_list = [] 95 | 96 | if "front" in pod_name: 97 | # 
front end dose not calculate netwrok latency 98 | return 10, 10 99 | # 100 | # pod_reader = pd.read_csv( 101 | # trace_file, index_col='PodName', usecols=['TraceID', 'SpanID', 'ParentID', 'PodName', 'EndTimeUnixNano']) 102 | # parent_span_reader = pd.read_csv( 103 | # trace_file, index_col='SpanID', usecols=['TraceID', 'SpanID', 'ParentID', 'PodName', 'EndTimeUnixNano']) 104 | # 105 | # try: 106 | # pod_reader = pod_reader.reindex(columns=pod_name) 107 | # pod_spans = pod_reader.loc[[pod_name], ['SpanID', 'ParentID', 'PodName', 'EndTimeUnixNano']] 108 | # except: 109 | # service = pod_name.rsplit('-', 1)[0] 110 | # service = service.rsplit('-', 1)[0] 111 | # 112 | # csv_file = dirname(__file__) + "/metric_threshold/" + service + ".csv" 113 | # pod_reader = pd.read_csv(csv_file, usecols=['NetworkP90(ms)']) 114 | # # print("pod", pod_name, " not found in trace, return default ", 115 | # # float(pod_reader.iloc[0])) 116 | # 117 | # return float(pod_reader.iloc[0]), 0 118 | # 119 | # if len(pod_spans['SpanID']) > 0: 120 | # # process span independentlt and order by timestamp 121 | # for span_index in range(len(pod_spans['SpanID'])): 122 | # # span event 123 | # parent_id = pod_spans['ParentID'].iloc[span_index] 124 | # pod_start_time = int( 125 | # pod_spans['EndTimeUnixNano'].iloc[span_index]) 126 | # try: 127 | # parent_pod_span = parent_span_reader.loc[[ 128 | # parent_id], ['PodName', 'EndTimeUnixNano']] 129 | # if len(parent_pod_span) > 0: 130 | # for parent_span_index in range(len(parent_pod_span['PodName'])): 131 | # parent_pod_name = parent_pod_span['PodName'].iloc[parent_span_index] 132 | # parent_end_time = int( 133 | # parent_pod_span['EndTimeUnixNano'].iloc[parent_span_index]) 134 | # 135 | # if str(parent_pod_name) != str(pod_name): 136 | # latency = (parent_end_time - pod_start_time) / \ 137 | # 1000000 # convert to microsecond 138 | # # if "contacts-service" in pod_name: 139 | # # logger.info("%s, %s, %s, %s, %s" % ( 140 | # # pod_name, pod_spans['SpanID'].iloc[span_index], parent_pod_name, pod_spans['ParentID'].iloc[span_index], latency)) 141 | # latency_list.append(latency) 142 | # except: 143 | # pass 144 | # # logger.info("%s latency is %s" %(pod_name, np.percentile(latency_list, 90))) 145 | # if len(latency_list) > 2: 146 | # return np.percentile(latency_list, 90), statistics.stdev(latency_list) 147 | # else: 148 | # return 10, 10 149 | return 1, 1 150 | 151 | 152 | def determine_alarm(pod, metric_type, metric_value, std_num, ns): 153 | """ 154 | fun determine_alarm: determin whether violate 3-sgima 155 | :parameter 156 | pod - podname to find corrsponding metric threshold file 157 | metric_type - find correspding column 158 | metric_vault - compare with the history mean and std 159 | std_num - constrol std_num * std 160 | :return 161 | true - alarm 162 | false - no alarm 163 | """ 164 | 165 | path_list = os.listdir(metric_threshold_dir) 166 | 167 | if metric_type == "CpuUsageRate(%)" or metric_type == 'MemoryUsageRate(%)': 168 | if metric_value > 80: 169 | return True 170 | else: 171 | if ns == "hipster": 172 | # for hipster 173 | if metric_value > 200: 174 | return True 175 | elif ns == "ts": 176 | # for ts 177 | if metric_value > 300: 178 | return True 179 | return False 180 | # for path in path_list: 181 | # if re.search(path.split('.')[0], pod): 182 | # hisory_metric = pd.read_csv(os.path.join( 183 | # metric_threshold_dir, path), index_col=False, usecols=[metric_type]) 184 | # if metric_value > hisory_metric[metric_type][0] + std_num * hisory_metric[metric_type][1]: 
185 | # return True 186 | # # elif metric_value < hisory_metric[metric_type][0] - std_num * hisory_metric[metric_type][1]: 187 | # # return True 188 | # else: 189 | # return False 190 | 191 | 192 | def generate_alarm(metric_list, ns, std_num=6): 193 | """ 194 | func generate_alarm: generate alram of each pod at current miniute 195 | :parameter 196 | metric_list - metric list from get_metric_with_time 197 | 198 | :return 199 | alarm_list, e.g., [{'pod': 'cartservice-579f59597d-n69b4', 'alarm': [{'metric_type': 'CpuUsageRate(%)', 'alarm_flag': True}]}] 200 | [{ 201 | pod: 202 | alarm: [ 203 | { 204 | metric_type: CpuUsageRate(%) 205 | alarm_flag: True 206 | } 207 | ] 208 | }] 209 | """ 210 | alarm_list = [] 211 | for pod_metric in metric_list: 212 | alarm = {} 213 | for i in range(len(pod_metric['metrics'])): 214 | alarm_flag = determine_alarm(pod=pod_metric["pod"], metric_type=pod_metric['metrics'][i]["metric_type"], 215 | metric_value=pod_metric['metrics'][i]["metric_value"], std_num=std_num, ns=ns) 216 | if alarm_flag: 217 | # if exist alarm_flag equal to true, create map 218 | if "pod" not in alarm: 219 | alarm = {"pod": pod_metric["pod"], "alarm": []} 220 | alarm['alarm'].append( 221 | {"metric_type": pod_metric['metrics'][i]["metric_type"], "alarm_flag": alarm_flag}) 222 | 223 | if "pod" in alarm: 224 | alarm_list.append(alarm) 225 | 226 | return alarm_list 227 | 228 | 229 | def get_metric_with_time(time, base_dir): 230 | """ 231 | func get_metric_with_time: get metric list at determined miniute 232 | :parameter 233 | time - to regex timestamp e.g, "2022-04-18 13:00" 234 | product_metric_dir 235 | :return 236 | target_list - traget metrics 237 | [ 238 | { 239 | pod: 240 | metrics: [ 241 | { 242 | "metric_type": 243 | "metric_value": 244 | } 245 | ] 246 | } 247 | 248 | ] 249 | """ 250 | # date = time.split(' ')[0] 251 | # hour_min = time.split(' ')[1] 252 | # hour = hour_min.split(':')[0] 253 | # min = hour_min.split(':')[1] 254 | trace_file = base_dir + "/trace/trace.csv" 255 | 256 | metric_dir = base_dir + "/metric/" 257 | 258 | path_list = os.listdir(metric_dir) 259 | 260 | # metric_list = ['CpuUsageRate(%)', 'MemoryUsageRate(%)', 'SyscallRead', 261 | # 'SyscallWrite'] 262 | # metric_list = ['CpuUsageRate(%)', 'MemoryUsageRate(%)'] 263 | target_list = [] 264 | for path in path_list: 265 | if "metric" in path: 266 | metrics = pd.read_csv(os.path.join(metric_dir, path)) 267 | metric_list = list(metrics.columns) 268 | metric_list.remove("TimeStamp") 269 | metric_list.remove("PodName") 270 | metric_list.remove("Time") 271 | if 'Date' in metric_list: 272 | metric_list.remove("Date") 273 | # metrics = pd.read_csv(os.path.join(product_metric_dir, path), index_col=False, usecols=['TimeStamp', 'PodName', 'CpuUsageRate(%)', 'MemoryUsageRate(%)', 'SyscallRead', 'SyscallWrite', 'PodServerLatencyP90(s)', 'PodClientLatencyP90(s)']) 274 | for index in range(len(metrics['Time'])): 275 | # regex timestamp 276 | if re.search(time, metrics['Time'][index]): 277 | target_metric = { 278 | "pod": metrics['PodName'][index], "metrics": []} 279 | for metric in metric_list: 280 | target_metric["metrics"].append({ 281 | "metric_type": metric, "metric_value": metrics[metric][index]}) 282 | network_p90, _ = get_netwrok_metric( 283 | trace_file=trace_file, pod_name=metrics['PodName'][index]) 284 | target_metric["metrics"].append( 285 | {"metric_type": "NetworkP90(ms)", "metric_value": network_p90}) 286 | target_list.append(target_metric) 287 | pod_num = len(path_list) 288 | # print(target_list) 289 | return 
target_list, pod_num 290 | 291 | -------------------------------------------------------------------------------- /Baseline/Nezha/data_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | import glob 3 | import os 4 | import pandas as pd 5 | import csv 6 | import re # Import regular expressions 7 | import numpy as np 8 | 9 | 10 | def remove_timestamps(message): 11 | # Remove datetime info of different formats 12 | message = re.sub(r'\[\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\]', '', message) 13 | message = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z', '', message) 14 | message = re.sub(r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}', '', message) 15 | message = re.sub(r'\d{4}\.\d{2}\.\d{2} \d{2}:\d{2}:\d{2}', '', message) 16 | message = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}\+\d{4}', '', message) 17 | message = re.sub(r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}', '', message) 18 | message = re.sub(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', '', message) 19 | message = re.sub(r'\[\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} \w{3}\]', '', message) 20 | message = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}\+\d{2}:\d{2}', '', message) 21 | message = re.sub(r'\w{3} \d{1,2}, \d{4}', '', message) 22 | message = re.sub(r'\d{1,2} \w{3} \d{4}', '', message) 23 | message = re.sub(r'\d{2}:\d{2} [AP]M', '', message) 24 | message = re.sub(r'\[\d{2}/\w{3}/\d{4} \d{2}:\d{2}:\d{2}\]', '', message) 25 | message = re.sub(r'^I\d{4} \d{2}:\d{2}:\d{2}\.\d{6}\s+\d+\s+\w+.\w+:\d+\] ', '', message) 26 | message = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}Z', '', message) 27 | 28 | return message.strip() 29 | 30 | 31 | 32 | def extract_log_message(log): 33 | # Check if the log is in JSON format 34 | if "msg:" in log or "\"msg\":" in log: 35 | log_json = json.loads(log) 36 | log_message = log_json.get('msg', '') 37 | elif "msg=" in log or "\"msg\"=" in log: 38 | msg_index = log.find("msg=") if "msg=" in log else log.find("\"msg\"=") 39 | first_quote_index = log.find('"', msg_index) 40 | last_quote_index = log.find('"', first_quote_index + 1) 41 | if last_quote_index != -1: 42 | log_message = log[first_quote_index + 1:last_quote_index] 43 | else: 44 | log_message = log[first_quote_index + 1:] 45 | else: 46 | log_message = log 47 | 48 | return log_message.strip() 49 | 50 | def dependency(path,output_dir): 51 | extract_pod_list=[] 52 | extract_node_list=[] 53 | folder_list = os.listdir(path) 54 | 55 | for folder in folder_list: 56 | json_file = path + folder + "/" + "*.json" 57 | for readfile in glob.glob(json_file): 58 | print(readfile) 59 | with open(readfile) as f: 60 | jsn = json.load(f) 61 | for jsn_hit in jsn['hits']['hits']: 62 | all_proc = [] 63 | all_node = [] 64 | if "kubernetes" in jsn_hit['_source'] and "pod_name" in jsn_hit['_source']['kubernetes']: 65 | pod = jsn_hit['_source']['kubernetes']['pod_name'] 66 | message = jsn_hit['_source']['message'] 67 | timestamp = jsn_hit['_source']['@timestamp'] 68 | if message.startswith('"'): 69 | message = message[1:] 70 | if message.endswith('"'): 71 | message = message[:-1] 72 | if "msg" in message: 73 | # print(message) 74 | message = extract_log_message(message) 75 | # message = json.loads(message)['msg'] 76 | 77 | message = remove_timestamps(message) 78 | all_proc.append(pod) 79 | all_proc.append(timestamp) 80 | all_proc.append(message) 81 | if all_proc: 82 | extract_pod_list.append(all_proc) 83 | if "systemd" in jsn_hit['_source'] and "t" in jsn_hit['_source']['systemd']: 84 | node = 
jsn_hit['_source']['hostname'] 85 | message = jsn_hit['_source']['message'] 86 | timestamp = jsn_hit['_source']['@timestamp'] 87 | if message.startswith('"'): 88 | message = message[1:] 89 | if message.endswith('"'): 90 | message = message[:-1] 91 | if "msg:" in message or "\"msg\":" in message: 92 | message = extract_log_message(message) 93 | # message = json.loads(message)['msg'] 94 | all_node.append(node) 95 | all_node.append(timestamp) 96 | all_node.append(message) 97 | if all_node: 98 | extract_node_list.append(all_node) 99 | # output file 100 | data_list_col=['Node','Timestamp','Messages'] 101 | node_df = pd.DataFrame(extract_node_list,columns=data_list_col) 102 | node_df.dropna() 103 | filename = 'Node_messages' 104 | node_df = node_df.sort_values(by='Timestamp') 105 | node_df.to_csv(output_dir + filename, index = False) 106 | csv_file = output_dir + filename 107 | partition_csv(csv_file, output_dir3) 108 | 109 | data_list_col=['Pod','Timestamp','Messages'] 110 | pod_df = pd.DataFrame(extract_pod_list,columns=data_list_col) 111 | pod_df.dropna() 112 | filename = 'Pod_messages' 113 | pod_df = pod_df.sort_values(by='Timestamp') 114 | pod_df.to_csv(output_dir + filename, index = False) 115 | csv_file = output_dir + filename 116 | partition_csv(csv_file, output_dir2) 117 | 118 | 119 | 120 | def partition_csv(csv_file, output_dir): 121 | isExist = os.path.exists(output_dir) 122 | if not isExist: 123 | os.mkdir(output_dir) 124 | # Creates empty set - this will be used to store the values that have already been used 125 | filelist = set() 126 | # Opens the large csv file in "read" mode 127 | with open(csv_file,'r') as csvfile: 128 | read_rows = csv.reader(csvfile) 129 | # Skip the column names 130 | next(read_rows) 131 | for row in read_rows: 132 | # Store the whole row as a string (rowstring) 133 | rowstring='\t'.join(row[1:]) 134 | # Defines filename as the first entry in the row - This could be made dynamic so that the user inputs a column name to use 135 | filename = (row[0]) 136 | # This basically makes sure it is not looking at the header row. 137 | # If the filename is not in the filelist set, add it to the list and create new csv file with header row. 138 | if filename not in filelist: 139 | filelist.add(filename) 140 | temp_file = output_dir + str(filename +'_messages') 141 | if os.path.exists(temp_file): 142 | os.remove(temp_file) 143 | with open(temp_file,'a') as f: 144 | f.write(rowstring) 145 | f.write("\n") 146 | f.close() 147 | # If the filename is in the filelist set, append the current row to the existing csv file. 
148 | else: 149 | temp_file = output_dir + str(filename + '_messages') 150 | with open(temp_file,'a') as f: 151 | f.write(rowstring) 152 | f.write("\n") 153 | f.close() 154 | 155 | def data_integration(file_list, data_path, output_dir='./rca_data/'): 156 | # Create an empty dictionary to store data split by date and hour 157 | df_dict = dict() 158 | for file in file_list: 159 | file_dir = ''.join(data_path, file) 160 | df = pd.read_csv(file_dir) 161 | timestamps = df['Time'] 162 | pod_name = file[:-4] 163 | df['Time'] = pd.to_datetime(df['Time']) 164 | # Iterate over the unique dates in the 'Time' column 165 | for date in df['Time'].dt.date.unique(): 166 | # Filter data for the specific date 167 | date_data = df[df['Time'].dt.date == date] 168 | date_data['PodName'] = np.array([pod_name for _ in range(date_data.shape[0])]) 169 | date_data['Container'] = np.array(['server' for _ in range(date_data.shape[0])]) 170 | # Create a nested dictionary for each hour of this date 171 | hourly_dict = {} 172 | 173 | for hour in range(24): 174 | # Filter data for the specific hour within the date 175 | hourly_data = date_data[date_data['Time'].dt.hour == hour] 176 | hourly_data = hourly_data.rename(columns={ 177 | 'EventTemplate': 'Log', 178 | 'Time': 'Timestamp', 179 | }) 180 | # Store the split data in the nested dictionary with the hour as the key 181 | if hour not in hourly_dict: 182 | hourly_dict[hour] = hourly_data 183 | else: 184 | hourly_dict[hour] = hourly_dict[hour].append(hourly_data, ignore_index=True) 185 | 186 | # Store the hourly dictionary in the main dictionary with the date as the key 187 | df_dict[date] = hourly_dict 188 | 189 | 190 | if __name__ == "__main__": 191 | file_list =['openshift-apiserver-operator-68fd44b989-6rgcq_messages_structured.csv', 192 | 'network-operator-7c59d666f5-27lvk_messages_structured.csv', 193 | 'mongodb-v1-64c6b69879-p4wfp_messages_structured.csv', 194 | 'openshift-kube-scheduler-ocp4-control-plane-1_messages_structured.csv', 195 | 'openshift-kube-scheduler-ocp4-control-plane-2_messages_structured.csv', 196 | 'openshift-kube-scheduler-ocp4-control-plane-3_messages_structured.csv', 197 | 'ovs-ch5xp_messages_structured.csv', 198 | 'packageserver-67d8b69dc5-6rtj9_messages_structured.csv', 199 | 'prometheus-6cc8d9b85-sztrb_messages_structured.csv'] 200 | # Input log data directory 201 | # path = 'Path-to-the-dataset-directory' 202 | path = '/nfs/users/zach/aiops/data/1203/log_data/pod_removed/' 203 | # Output directories 204 | output_dir='./rca_data/' 205 | data_integration(file_list, path, output_dir) 206 | -------------------------------------------------------------------------------- /Baseline/Nezha/log.py: -------------------------------------------------------------------------------- 1 | #encoding = utf-8 2 | 3 | import logging 4 | 5 | 6 | class Logger(): 7 | def __init__(self, logname, loglevel=logging.DEBUG, loggername=None): 8 | ''' 9 | 指定保存日志的文件路径,日志级别,以及调用文件 10 | 将日志存入到指定的文件中 11 | ''' 12 | # 创建一个logger 13 | self.logger = logging.getLogger(loggername) 14 | self.logger.setLevel(loglevel) 15 | # 创建一个handler,用于写入日志文件 16 | fh = logging.FileHandler(logname) 17 | fh.setLevel(loglevel) 18 | if not self.logger.handlers: 19 | # 再创建一个handler,用于输出到控制台 20 | ch = logging.StreamHandler() 21 | ch.setLevel(loglevel) 22 | formatter = logging.Formatter( 23 | '[%(levelname)s]%(asctime)s %(filename)s:%(lineno)d: %(message)s') 24 | fh.setFormatter(formatter) 25 | ch.setFormatter(formatter) 26 | self.logger.addHandler(fh) 27 | self.logger.addHandler(ch) 28 | 29 | def 
getlog(self): 30 | return self.logger 31 | -------------------------------------------------------------------------------- /Baseline/Nezha/main.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | from pattern_ranker import * 4 | import argparse 5 | from log_parsing import * 6 | 7 | file_path = './' 8 | # print(file_path) 9 | log_path = file_path + '/log/' + str(datetime.datetime.now().strftime( 10 | '%Y-%m-%d')) + '_nezha.log' 11 | print(log_path) 12 | logger = Logger(log_path, logging.DEBUG, __name__).getlog() 13 | 14 | 15 | def get_miner(ns): 16 | template_indir = file_path + '/log_template' 17 | config = TemplateMinerConfig() 18 | config.load(file_path + "/log_template/drain3_" + ns + ".ini") 19 | config.profiling_enabled = False 20 | 21 | path = file_path + '/log_template/' + ns + ".bin" 22 | persistence = FilePersistence(path) 23 | template_miner = TemplateMiner(persistence, config=config) 24 | 25 | return template_miner 26 | 27 | # def generate_trace_id(log_dir): 28 | # trace_list = [] 29 | # for file in os.listdir(log_dir): 30 | # if file.endswith("_messages_structured.csv"): 31 | # trace_list.append(file[:-24]) 32 | # if not os.path.exists(log_dir + '../traceid/'): 33 | # os.mkdir(log_dir + '../traceid/') 34 | # pd.DataFrame(trace_list).to_csv(log_dir + '../traceid/trace_id.csv', index=False, header=False) 35 | 36 | 37 | if __name__ == '__main__': 38 | parser = argparse.ArgumentParser(description='Nezha') 39 | 40 | parser.add_argument('--ns', default="hipster", help='namespace') 41 | parser.add_argument('--level', default="service", help='service-level or inner-service level') 42 | parser.add_argument('--log_dir', default="./20240124/log/", help='the path to log data') 43 | parser.add_argument('--metric_dir', default="./20240124/Latency/", help='the path to metric data') 44 | parser.add_argument('--save_dir', default="./20240124/", help='the path to save preprocessed data') 45 | # parser.add_argument('--level', default="service", help='service-level or inner-service level') 46 | args = parser.parse_args() 47 | ns = args.ns 48 | level = args.level 49 | save_dir = args.save_dir 50 | log_dir = args.log_dir 51 | metric_dir = args.metric_dir 52 | kpi_file = save_dir + '/kpi_20240124_latency.csv' 53 | path1 = save_dir + "./20240124-fault_list.json" 54 | kpi_data = pd.read_csv(kpi_file) 55 | normal_time1 = str(pd.to_datetime(kpi_data['timeStamp'].iloc[0], unit='s')) 56 | time_index = int(kpi_data['timeStamp'].shape[0] * 0.6) 57 | preprocess(log_dir, metric_dir, save_dir) 58 | file_path = save_dir 59 | log_template_miner = get_miner(ns) 60 | inject_list = [path1] 61 | normal_time_list = [normal_time1] 62 | if level == "service": 63 | logger.info("------- Result at service level -------") 64 | evaluation_pod(normal_time_list, inject_list, ns, log_template_miner, file_path) 65 | else: 66 | logger.info("------- Result at inner service level -------") 67 | evaluation(normal_time_list, inject_list, ns, log_template_miner, file_path) 68 | -------------------------------------------------------------------------------- /Baseline/Nezha/pattern_miner.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from data_integrate import * 3 | 4 | log_path = dirname(__file__) + '/log/' + str(datetime.datetime.now().strftime( 5 | '%Y-%m-%d')) + '_nezha.log' 6 | logger = Logger(log_path, logging.DEBUG, __name__).getlog() 7 | 8 | 9 | # def frequent_pattern_miner(event_sequences): 10 | # """ 11 | 
# mining frequent pattern in event sequences (Discard) 12 | # input: 13 | # - event_sequences: event sequences belonging to the traces in time window, e.g., [[1,2,3],[2,3,4]] 14 | # output: 15 | # - pattern: frequent_pattern in the events, e.g., [['54', '29', '#SUP: 9'], ['54', '30', '#SUP: 9'], ['54', '32', '#SUP: 9']] 16 | # """ 17 | # print(datetime.datetime.now()) 18 | 19 | # spmf_path = dirname(__file__) + "/spmf" 20 | # spmf = Spmf("CM-SPAM", input_direct=event_sequences, 21 | # output_filename="./spmf/SPAM.txt", arguments=[0.01, 2, 2], spmf_bin_location_dir=spmf_path, memory=8192) 22 | # spmf.run() 23 | # pattern = spmf.parse_output() 24 | # print(pattern) 25 | # print(datetime.datetime.now()) 26 | # return pattern 27 | 28 | 29 | # def frequent_graph_miner(file_name, topk=30): 30 | # """ 31 | # mining frequent graph in event graph 32 | # input: 33 | # - file_name: input filename e.g., 34 | # output: 35 | # - pattern_list: frequent_child_graph_list [{'support': '519', 'node1': '180', 'node2': '264'}] 36 | # """ 37 | 38 | # # print(datetime.datetime.now()) 39 | 40 | # spmf_path = dirname(__file__) + "/spmf" 41 | # spmf = Spmf("TKG", input_filename=file_name, 42 | # output_filename="./spmf/tkg.txt", arguments=[topk, 2, False, False, True], spmf_bin_location_dir=spmf_path, memory=8192) 43 | # spmf.run() 44 | # pattern_result = spmf.parse_output() 45 | 46 | # # print(pattern_result) 47 | # # print(datetime.datetime.now()) 48 | 49 | # pattern_list = [] 50 | # for i in range(0, len(pattern_result), 6): 51 | # """ parse ['t # 29 * 519'], ['v 0 5'], ['v 1 265'], ['e 0 1 1'] """ 52 | # support = pattern_result[i][0].split(' ')[-1] 53 | # node1 = pattern_result[i+1][0].split(' ')[-1] 54 | # node2 = pattern_result[i+2][0].split(' ')[-1] 55 | # pattern = {"support": support, "child_graph": node1 + "_" + node2} 56 | # pattern_list.append(pattern) 57 | 58 | # pattern_list.sort(key=lambda k: k['support'], reverse=True) 59 | 60 | # return pattern_list 61 | 62 | 63 | # def generate_tkg_input(event_graphs): 64 | # """ 65 | # generate_tkg_input: 66 | # :parameter 67 | # event_graphs - graph list 68 | # :return 69 | # file_name - tkg input filename 70 | 71 | # details see at https://www.philippe-fournier-viger.com/spmf/TKG.php 72 | # t # 0 73 | # v 0 10 74 | # v 1 11 75 | # e 0 1 20 76 | # """ 77 | # file_name = dirname(__file__) + "/spmf/" + str(datetime.datetime.now().strftime( 78 | # '%Y-%m-%d')) + "_tkg_input.txt" 79 | # f = open(file_name, "w") 80 | 81 | # graph_number = 0 82 | # node_number = 0 83 | 84 | # for graph in event_graphs: 85 | # # write head 86 | # graph_head = "t # " + str(graph_number) + "\r\n" 87 | # f.write(graph_head) 88 | 89 | # node_map = {} 90 | # node_content = "" 91 | # edge_content = "" 92 | # for key in graph.adjacency_list.keys(): 93 | # if key.event not in node_map: 94 | # node_map[key.event] = node_number 95 | # node_content += "v " + \ 96 | # str(node_number) + " " + str(key.event) + "\r\n" 97 | # node_number += 1 98 | 99 | # for event in graph.adjacency_list[key]: 100 | # if event.event not in node_map: 101 | # node_map[event.event] = node_number 102 | # node_content += "v " + \ 103 | # str(node_number) + " " + str(event.event) + "\r\n" 104 | # node_number += 1 105 | 106 | # edge_content += "e " + \ 107 | # str(node_map[key.event]) + " " + \ 108 | # str(node_map[event.event]) + " 1\r\n" 109 | 110 | # f.write(node_content) 111 | # f.write(edge_content) 112 | # graph_number += 1 113 | # f.write("\r\n") 114 | # f.close() 115 | 116 | # return file_name 117 | 118 | 119 | def 
get_pattern_support(event_graphs): 120 | result_support_dict = {} 121 | total_pair = set() 122 | 123 | for event_graph in event_graphs: 124 | for key, value in event_graph.support_dict.items(): 125 | if key in total_pair: 126 | result_support_dict[key] += value 127 | else: 128 | result_support_dict[key] = value 129 | total_pair = total_pair | event_graph.pair_set 130 | 131 | result_support_dict = dict(sorted( 132 | result_support_dict.items(), key=lambda x: x[1], reverse=True)) 133 | 134 | return result_support_dict 135 | 136 | -------------------------------------------------------------------------------- /Baseline/Nezha/requirements.txt: -------------------------------------------------------------------------------- 1 | drain3==0.9.10 2 | matplotlib==3.3.4 3 | more_itertools==8.12.0 4 | numpy==2.0.0 5 | pandas==0.23.4 6 | psutil==5.9.0 7 | PyYAML==6.0.1 8 | -------------------------------------------------------------------------------- /Baseline/Readme.md: -------------------------------------------------------------------------------- 1 | # Baselines 2 | 3 | This folder contains the baseline methods for Lemma-RCA datasets evaluation with both single- and multi-modal settings. Note that SWAT and WADI datasets comply with single-modal setting. 4 | 5 | - FastPC: 6 | ``` 7 | python test_FastPC_pod_metric.py -case 20240115 ## for case 20240115 metric data only 8 | python test_FastPC_pod_log.py -case 20240115 ## for case 20240115 log data only 9 | python test_FastPC_pod_combine.py -case 20240115 ## for case 20240115 with both metric and log data 10 | ``` 11 | 12 | - Baro: 13 | ``` 14 | cd ./metric_only 15 | python baro_main_metric.py -case 20240115## for case 20240115 metric data only 16 | cd ./log_only 17 | python baro_main_log.py -case 20240115## for case 20240115 log data only 18 | cd ./multimodal 19 | python baro_main_combined.py -case 20240115 ## case 20240115 with for both metric and log data 20 | ``` 21 | 22 | 23 | - RCD: 24 | ``` 25 | cd ./metric_only 26 | python RCA_methods_metric.py -case 20240115 -model rcd ## for metric data only 27 | cd ./log_only 28 | python RCA_methods_log.py -case 20240115 -model rcd ## for log data only 29 | cd ./multimodal 30 | python RCA_methods_combined.py -case 20240115 -model rcd ## for both metric and log data 31 | ``` 32 | 33 | - CIRCA: 34 | ``` 35 | cd ./metric_only 36 | python RCA_methods_metric.py -case 20240115 -model circa ## for metric data only 37 | cd ./log_only 38 | python RCA_methods_log.py -case 20240115 -model circa ## for log data only 39 | cd ./multimodal 40 | python RCA_methods_combined.py -case 20240115 -model circa ## for both metric and log data 41 | ``` 42 | 43 | - epsilon_diagnosis: 44 | ``` 45 | cd ./metric_only 46 | python RCA_methods_metric.py -case 20240115 -model epsilon_diagnosis ## for metric data only 47 | cd ./log_only 48 | python RCA_methods_log.py -case 20240115 -model epsilon_diagnosis ## for log data only 49 | cd ./multimodal 50 | python RCA_methods_combined.py -case 20240115 -model epsilon_diagnosis ## for both metric and log data 51 | ``` 52 | 53 | - Nezha: 54 | ``` 55 | python main.py 56 | ``` 57 | For Nezha, we provide the demo code for the case 20240124. Due to inconsistant filename for each case, you may need to change the name of the folder for each case accordingly. 58 | 59 | To run the baseline methods for SWAT and WADI datasets, the only difference is the data loader and the labels for evaluation. The labels are given in the corresponding scripts. 
For the Baro method: 60 | 61 | - Baro for SWAT & WADI: 62 | ``` 63 | python baro.py 64 | python baro-evaluation.py 65 | ``` 66 | 67 | The RCD, CIRCA, and epsilon_diagnosis methods are included in the pyrca package. Simply run: 68 | 69 | - RCD, CIRCA, and epsilon_diagnosis for SWAT & WADI: 70 | ``` 71 | python pyrca-main.py 72 | python pyrca-evaluation.py 73 | ``` 74 | 75 | ##### If you encounter the error "name 'LIBSPOT' is not defined", please double-check that you are running the code from the FastPC directory. 76 | 77 | #### If you fail to install the pyrca package on Windows, please use the following command: 78 | #### "pip install sfr-pyrca --use-pep517 git+https://github.com/SchmollerLab/python-javabridge-windows" 79 | 80 | -------------------------------------------------------------------------------- /Baseline/SWAT&WADI/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KnowledgeDiscovery/rca_baselines/c560d8cc39c19f04c9ae74fac2404b1e5e8c3b4d/Baseline/SWAT&WADI/.DS_Store -------------------------------------------------------------------------------- /Baseline/SWAT&WADI/baro-evaluation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import glob 3 | import pandas as pd 4 | import numpy as np 5 | from collections import defaultdict 6 | 7 | files = glob.glob('./final*.csv') 8 | model = 'baro' 9 | 10 | model_files = defaultdict(list) 11 | 12 | for file in files: 13 | model_files[model].append(file) 14 | 15 | for key in model_files: 16 | model_files[key] = sorted(model_files[key], key=lambda x: x.split('/')[-2]) 17 | 18 | print(model_files) 19 | 20 | predicts = [] 21 | mfiles = model_files['baro'] 22 | for mf in mfiles: 23 | print(mf) 24 | mf_data = pd.read_csv(mf) 25 | root_cause_list = list(mf_data['root_cause'].values) 26 | if 'Latency' in root_cause_list: 27 | root_cause_list.remove('Latency') 28 | predicts.append(root_cause_list) 29 | 30 | reals = [ 31 | ['1_MV_001'], 32 | ['1_FIT_001'], 33 | ['2_LIT_002', '1_AIT_001'], 34 | ['2_MCV_101', '2_MCV_201', '2_MCV_301', '2_MCV_401', '2_MCV_501', '2_MCV_601'], 35 | ['2_MCV_101', '2_MCV_201'], 36 | ['1_AIT_002', '2_MV_003'], 37 | ['2_MCV_007'], 38 | ['1_P_006'], 39 | ['1_MV_001'], 40 | ['2_MCV_007'], 41 | ['2_MCV_007'], 42 | ['2_AIT_003'], 43 | ['2_MV_201', '2_P_201', '2_P_202', '2_P_203', '2_P_204', '2_P_205', '2_P_206'], 44 | ['2_LIT_002', '1_AIT_001'], 45 | ] 46 | 47 | def precision_on_topk(predicts,reals,k): 48 | pr = 0 49 | for pred, real in zip(predicts, reals): 50 | pred = pred[:k] 51 | hit_count = len(set(pred) & set(real)) 52 | min_len = min(k,len(real)) 53 | pr += hit_count/min_len 54 | return pr/len(reals) 55 | 56 | def mean_precision_k(predicts,reals,k): 57 | pr = 0 58 | for i in range(1,k+1): 59 | pr += precision_on_topk(predicts,reals,i) 60 | return pr/k 61 | 62 | def mrr(predicts,reals): 63 | mrr_val = 0 64 | for preds,real in zip(predicts,reals): 65 | tmp = [] 66 | for real_item in real: 67 | index = preds.index(real_item) if real_item in preds else sys.maxsize-1 68 | tmp.append(index+1) 69 | mrr_val += 1/min(tmp) 70 | return mrr_val/len(reals) 71 | 72 | k = [1,3,5,10] 73 | for item in k: 74 | pr_k = precision_on_topk(predicts,reals,item) 75 | map_k = mean_precision_k(predicts,reals,item) 76 | print("Precision@{}:{}".format(item,pr_k)) 77 | print('MAP@{}:{}'.format(item,map_k)) 78 | print('MRR:{}'.format(mrr(predicts,reals))) 79 | --------------------------------------------------------------------------------
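As a small, self-contained illustration of how the ranking metrics in baro-evaluation.py behave (the sensor names below are toy placeholders, not real labels), consider a single fault case whose true root cause is ranked third:
```
import sys

def precision_on_topk(predicts, reals, k):
    # Fraction of labeled root causes recovered within the top-k predictions,
    # averaged over all fault cases (same logic as in baro-evaluation.py).
    pr = 0
    for pred, real in zip(predicts, reals):
        hit_count = len(set(pred[:k]) & set(real))
        pr += hit_count / min(k, len(real))
    return pr / len(reals)

def mrr(predicts, reals):
    # Reciprocal rank of the best-ranked true root cause, averaged over cases.
    mrr_val = 0
    for preds, real in zip(predicts, reals):
        ranks = [preds.index(r) + 1 if r in preds else sys.maxsize for r in real]
        mrr_val += 1 / min(ranks)
    return mrr_val / len(reals)

# Toy fault case: the true root cause 'S2' is ranked third by the model.
predicts = [['S1', 'S3', 'S2']]
reals = [['S2']]
print(precision_on_topk(predicts, reals, 1))  # 0.0  -> missed in the top-1
print(precision_on_topk(predicts, reals, 3))  # 1.0  -> recovered within the top-3
print(mrr(predicts, reals))                   # 0.333... (reciprocal of rank 3)
```
MAP@k in the evaluation scripts is simply the average of Precision@1 through Precision@k.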
/Baseline/SWAT&WADI/baro.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pickle 4 | import time 5 | import warnings 6 | warnings.filterwarnings("ignore") 7 | from sklearn.feature_selection import VarianceThreshold 8 | from baro_algorithm import bocpd, robust_scorer 9 | 10 | def data_convert(segment): 11 | columns = np.array(segment.iloc[:, 1:].columns) 12 | selector = VarianceThreshold(threshold=0) 13 | X = segment.iloc[:, 1:].values 14 | X_var = selector.fit_transform(X) 15 | idx = selector.get_support(indices=True) 16 | columns = columns[idx] 17 | X_var = pd.DataFrame(X_var) 18 | X_var.columns = list(columns) 19 | return X_var 20 | 21 | with open('../WADI/data_segments.pkl','rb') as f: 22 | data_segments = pickle.load(f) 23 | 24 | 25 | for ind,segment in enumerate(data_segments): 26 | segment = segment.iloc[:, 1:] 27 | print('{} fault starts to detect bayesian structure'.format(ind)) 28 | segment = data_convert(segment) 29 | columns = np.array(segment.columns) 30 | #np.save('{}_var_name.npy'.format(ind), columns) 31 | X = segment.values 32 | patch = 100 33 | sample = X.shape[0]//patch 34 | X = X[:patch*sample,:] 35 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 36 | X_df = pd.DataFrame(X,columns=columns) 37 | anomalies = bocpd(X_df) 38 | print("Anomalies are detected at timestep:", anomalies[0]) 39 | results = robust_scorer(X_df,anomalies=anomalies) 40 | 41 | root_causes = [] 42 | for result in results: 43 | (root_cause, score) = result 44 | root_causes.append([root_cause, score]) 45 | root_causes = pd.DataFrame(root_causes) 46 | root_causes.columns = [['root_cause','score']] 47 | root_causes.to_csv("./final_{}_root_cause.csv".format(ind),index=False) 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /Baseline/SWAT&WADI/pyrca-evaluation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import glob 3 | import pandas as pd 4 | import numpy as np 5 | from collections import defaultdict 6 | 7 | files = glob.glob('./final*.csv') 8 | model = 'ED' 9 | 10 | model_files = defaultdict(list) 11 | 12 | for file in files: 13 | if model in file: 14 | model_files[model].append(file) 15 | 16 | for key in model_files: 17 | model_files[key] = sorted(model_files[key], key=lambda x: x.split('/')[-2]) 18 | 19 | predicts = [] 20 | mfiles = model_files[model] 21 | print(mfiles) 22 | for mf in mfiles: 23 | print(mf) 24 | mf_data = pd.read_csv(mf) 25 | root_cause_list = list(mf_data['root_cause'].values) 26 | if 'label' in root_cause_list: 27 | root_cause_list.remove('label') 28 | predicts.append(root_cause_list) 29 | 30 | reals = [ 31 | ['1_MV_001'], 32 | ['1_FIT_001'], 33 | ['2_LIT_002', '1_AIT_001'], 34 | ['2_MCV_101', '2_MCV_201', '2_MCV_301', '2_MCV_401', '2_MCV_501', '2_MCV_601'], 35 | ['2_MCV_101', '2_MCV_201'], 36 | ['1_AIT_002', '2_MV_003'], 37 | ['2_MCV_007'], 38 | ['1_P_006'], 39 | ['1_MV_001'], 40 | ['2_MCV_007'], 41 | ['2_MCV_007'], 42 | ['2_AIT_003'], 43 | ['2_MV_201', '2_P_201', '2_P_202', '2_P_203', '2_P_204', '2_P_205', '2_P_206'], 44 | ['2_LIT_002', '1_AIT_001'], 45 | ] 46 | 47 | def precision_on_topk(predicts,reals,k): 48 | pr = 0 49 | for pred, real in zip(predicts, reals): 50 | pred = pred[:k] 51 | hit_count = len(set(pred) & set(real)) 52 | min_len = min(k,len(real)) 53 | pr += hit_count/min_len 54 | return pr/len(reals) 55 | 56 | def 
mean_precision_k(predicts,reals,k): 57 | pr = 0 58 | for i in range(1,k+1): 59 | pr += precision_on_topk(predicts,reals,i) 60 | return pr/k 61 | 62 | def mrr(predicts,reals): 63 | mrr_val = 0 64 | for preds,real in zip(predicts,reals): 65 | tmp = [] 66 | for real_item in real: 67 | index = preds.index(real_item) if real_item in preds else sys.maxsize-1 68 | tmp.append(index+1) 69 | mrr_val += 1/min(tmp) 70 | return mrr_val/len(reals) 71 | 72 | k = [1,3,5,10] 73 | for item in k: 74 | pr_k = precision_on_topk(predicts,reals,item) 75 | map_k = mean_precision_k(predicts,reals,item) 76 | print("Precision@{}:{}".format(item,pr_k)) 77 | print('MAP@{}:{}'.format(item,map_k)) 78 | print('MRR:{}'.format(mrr(predicts,reals))) 79 | -------------------------------------------------------------------------------- /Baseline/SWAT&WADI/pyrca-main.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn import preprocessing 5 | import os 6 | from sklearn.feature_selection import VarianceThreshold 7 | from sklearn.model_selection import train_test_split 8 | 9 | from causalnex.structure.notears import from_pandas 10 | import networkx as nx 11 | 12 | from pyrca.analyzers.ht import HT, HTConfig 13 | from pyrca.analyzers.epsilon_diagnosis import EpsilonDiagnosis, EpsilonDiagnosisConfig 14 | from pyrca.analyzers.rcd import RCD, RCDConfig 15 | 16 | import pandas as pd 17 | import networkx as nx 18 | import pickle 19 | 20 | 21 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame: 22 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph()) 23 | while True: 24 | try: 25 | cycle = nx.find_cycle(G, orientation='original') 26 | G.remove_edge(*cycle[0][:2]) 27 | except nx.NetworkXNoCycle: 28 | break 29 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int) 30 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix) 31 | print("Now, the adjacency matrix does not have cycles.") 32 | 33 | return adj_matrix_no_cycles 34 | 35 | def data_convert(segment): 36 | columns = np.array(segment.iloc[:, 1:].columns) 37 | selector = VarianceThreshold(threshold=0) 38 | X = segment.iloc[:, 1:].values 39 | X_var = selector.fit_transform(X) 40 | idx = selector.get_support(indices=True) 41 | columns = columns[idx] 42 | X_var = pd.DataFrame(X_var) 43 | X_var.columns = list(columns) 44 | 45 | return X_var 46 | 47 | def rca(ind, segment, model_name): 48 | segment = segment.iloc[:, 1:] 49 | print('{} fault starts to detect bayesian structure'.format(ind)) 50 | segment = data_convert(segment) 51 | columns = np.array(segment.columns) 52 | #np.save('{}_var_name.npy'.format(ind), columns) 53 | X = segment.values 54 | patch = 100 55 | sample = X.shape[0]//patch 56 | X = X[:patch*sample,:] 57 | X0 = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 58 | X = pd.DataFrame(X0,columns=columns) 59 | 60 | X_train, X_test = train_test_split(X, test_size=0.6, shuffle=False) 61 | print("Start to run") 62 | if model_name == "HT": 63 | model = HT(config=HTConfig(graph=estimated_matrix,root_cause_top_k=10)) 64 | model.train(X_train) 65 | results = model.find_root_causes(X_test, "label", True).to_list() 66 | elif model_name == "RCD": 67 | model = RCD(config=RCDConfig(k=10,alpha_limit=0.5)) 68 | results = model.find_root_causes(X_train,X_test).to_list() 69 | elif model_name == "ED": 70 | model = EpsilonDiagnosis(config=EpsilonDiagnosisConfig(root_cause_top_k=10)) 71 | model.train(X) 72 | results = 
model.find_root_causes(X).to_list() 73 | 74 | print("Saving") 75 | root_causes = [] 76 | for result in results: 77 | root_causes.append([result['root_cause'],result['score']]) 78 | root_causes = pd.DataFrame(root_causes) 79 | root_causes.columns = [['root_cause','score']] 80 | root_causes.to_csv("final_{}_{}_root_cause.csv".format(model_name, ind),index=False) 81 | 82 | return 83 | 84 | 85 | 86 | with open('../WADI/data_segments.pkl','rb') as f: 87 | data_segments = pickle.load(f) 88 | 89 | models = ['ED', 'RCD', 'HT'] 90 | # Run all 91 | for model_name in models: 92 | for ind,segment in enumerate(data_segments): 93 | print("Now running {} for data {}.".format(model_name, ind)) 94 | rca(ind, segment,model_name) 95 | print("-------------------") 96 | -------------------------------------------------------------------------------- /Baseline/log_only/RCA_methods_log.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import preprocessing 4 | import os 5 | from sklearn.feature_selection import VarianceThreshold 6 | from sklearn.model_selection import train_test_split 7 | from causalnex.structure.notears import from_pandas 8 | from pyrca.analyzers.ht import HT, HTConfig 9 | from pyrca.analyzers.epsilon_diagnosis import EpsilonDiagnosis, EpsilonDiagnosisConfig 10 | from pyrca.analyzers.rcd import RCD, RCDConfig 11 | # from pyrca.analyzers 12 | import networkx as nx 13 | import argparse 14 | 15 | 16 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame: 17 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph()) 18 | while True: 19 | try: 20 | cycle = nx.find_cycle(G, orientation='original') 21 | G.remove_edge(*cycle[0][:2]) 22 | 23 | except nx.NetworkXNoCycle: 24 | break 25 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int) 26 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix) 27 | 28 | print("Now, the adjacency matrix does not have cycles.") 29 | return adj_matrix_no_cycles 30 | 31 | 32 | 33 | def main(args): 34 | model_name = args.model 35 | data_name = args.case 36 | metric_data = {} 37 | columns_common = {} 38 | metric_path = '../data/{}'.format(data_name) 39 | if data_name == '20220606': 40 | label = 'reviews-v3' 41 | elif data_name == '20210517' or data_name == '20210524': 42 | label = 'Book_Info_product' 43 | elif data_name == '20211203': 44 | label = 'ratings.book-info.svc.cluster.local:9080/*' 45 | elif data_name == '20240215': 46 | label = 'pod usage' 47 | elif data_name == '20240124': 48 | label = 'scenario8_app_request' 49 | elif data_name == '20231207': 50 | label = 'book_info' 51 | elif data_name == '20231221': 52 | label = 'book_info' 53 | elif data_name == '20240115': 54 | label = 'book_info' 55 | else: 56 | raise ValueError('Invalid data_name') 57 | 58 | if data_name in ['20220606', '20210517', '20210524', '20211203', '20231207']: 59 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1, 'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 60 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 61 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 62 | elif data_name in ['20231207']: 63 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 64 | 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 65 | 'log_frequency': 1} 66 | log_label = 'book_info' 
67 | elif data_name in ['20240124']: 68 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1, 'cpu_usage': 1, 'disk_used': 1, 'diskio_reads': 1, 'diskio_writes': 1, 'memory_used': 1, 69 | 'netstat_established': 1, 'swap_used': 1} 70 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 71 | elif data_name in ['20240215']: 72 | POD_METRIC_FILE = {'pod_cpu_limit': 1, 'pod_cpu_usage_total': 1, 'pod_cpu_utilization_over_pod_limit': 1, 'pod_memory_limit': 1, 73 | 'pod_memory_utilization_over_pod_limit': 1, 'pod_memory_working_set': 1, 'pod_network_rx_bytes': 1, 'pod_network_tx_bytes':1, 74 | 'log_golden_signal': 1, 'log_frequency': 1} 75 | log_label = 'book_info' 76 | elif data_name in ['20240115']: 77 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 78 | 'rate_storage_iops': 1, 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 79 | 'log_frequency': 1} 80 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 81 | elif data_name in ['20231221']: 82 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 83 | 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 'log_frequency': 1} 84 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 85 | else: 86 | raise ValueError('Invalid data_name') 87 | 88 | pathset = "./output/" 89 | if not (os.path.exists(pathset)): 90 | os.mkdir(pathset) 91 | 92 | for metric, weight in POD_METRIC_FILE.items(): 93 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']: 94 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric) 95 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 96 | # log_label = 'ratings.book-info.svc.cluster.local:9080/*' 97 | if len(metric_data[metric].keys()) == 1: 98 | if log_label != label: 99 | metric_data[metric][label] = metric_data[metric][log_label] 100 | del metric_data[metric][log_label] 101 | else: 102 | metric_data[metric][label] = metric_data[metric] 103 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name'] 104 | del metric_data[metric][label]['Node_Name'] 105 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T 106 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']] 107 | if columns_common: 108 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 109 | else: 110 | columns_common = list(metric_data[metric][label]['Pod_Name']) 111 | else: 112 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric) 113 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 114 | if columns_common: 115 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 116 | else: 117 | columns_common = list(metric_data[metric][label]['Pod_Name']) 118 | 119 | index_data = {} 120 | metric_names = [] 121 | metric_weight_assigned = [] 122 | for metric, weight in POD_METRIC_FILE.items(): 123 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common] 124 | metric_names = metric_names + [metric] 125 | metric_weight_assigned = metric_weight_assigned + [weight] 126 | 127 | metric_weight = np.zeros((len(POD_METRIC_FILE), 1)) 128 | metric_id = 0 129 | final_root_results = {} 130 | 131 | for metric in metric_names: 132 | print('For metric:', metric) 133 | data = 
metric_data[metric] 134 | X = data[label]['Sequence'] 135 | index = index_data[metric] 136 | # Preprocessing to reduce the redundant samples 137 | if X.shape[0] // 100 < 100: 138 | patch = 20 139 | else: 140 | patch = 100 141 | sample = X.shape[0] // patch 142 | X = X[:patch * sample, :] 143 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 144 | X_metric = X[:, index] 145 | X_metric = preprocessing.normalize(X_metric, axis=0, norm='l1') 146 | X = np.append(X_metric, X[:, -1].reshape(-1, 1), axis=1) 147 | columns = list(columns_common) + data[label]['KPI_Feature'] 148 | 149 | std = np.std(X[:, :-1], axis=0) 150 | idx_std = [i for i, x in enumerate(std > 1e-5) if x] 151 | if len(idx_std) == 0: 152 | metric_weight[metric_id] = 0 153 | metric_id = metric_id + 1 154 | print(metric, ' all pods are all constant or quasi-constant') 155 | continue 156 | 157 | selector = VarianceThreshold(threshold=0) 158 | X_var = selector.fit_transform(X[:, :-1]) 159 | idx = selector.get_support(indices=True) 160 | # print('X shape after variance: ', X_var.shape) 161 | if X_var.shape[1] < 1: 162 | metric_weight[metric_id] = 0 163 | metric_id = metric_id + 1 164 | print(metric, ' all pods are all constant or quasi-constant') 165 | continue 166 | 167 | mask = np.full(len(columns_common), False, dtype=bool) 168 | mask[idx] = True 169 | idx = list(idx) + [X.shape[1] - 1] 170 | X = X[:, idx] 171 | columns = [columns[i] for i in idx] 172 | X = pd.DataFrame(X, columns=columns) 173 | if model_name == 'circa': 174 | sm = from_pandas(X) 175 | estimated_matrix = nx.to_pandas_adjacency(sm) 176 | quantile_value = np.quantile(estimated_matrix.values.flatten(), 0.95) 177 | estimated_matrix = (estimated_matrix > quantile_value).astype(int) 178 | estimated_matrix = remove_cycles_from_adjacency_matrix(estimated_matrix) 179 | # estimated_matrix.to_csv("{}_adjacency.csv".format(metric)) 180 | 181 | X_train, X_test = train_test_split(X, test_size=0.6, shuffle=False) 182 | 183 | X.insert(0, 'time', pd.date_range(start='2024-01-01', periods=len(X), freq='D')) 184 | 185 | X['time'] = X['time'].astype('int64') // 1_000_000_000 186 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)] 187 | 188 | 189 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min()) 190 | 191 | if model_name == 'rcd': 192 | model = RCD(config=RCDConfig(k=3,alpha_limit=0.5)) 193 | results = model.find_root_causes(X_train, X_test).to_list() 194 | print(results) 195 | elif model_name == 'circa': 196 | model = HT(config=HTConfig(graph=estimated_matrix, root_cause_top_k=10)) 197 | model.train(X_train) 198 | results = model.find_root_causes(X_test, metric_data[metric][label]['KPI_Feature'][0], True).to_list() 199 | elif model_name == 'epsilon_diagnosis': 200 | model = EpsilonDiagnosis(config=EpsilonDiagnosisConfig(root_cause_top_k=10)) 201 | model.train(X) 202 | results = model.find_root_causes(X).to_list() 203 | else: 204 | raise ValueError('Invalid model_name') 205 | 206 | root_causes = [] 207 | for result in results: 208 | root_causes.append([result['root_cause'], result['score']]) 209 | if not os.path.exists('./{}_results'.format(model_name)): 210 | os.mkdir('./{}_results'.format(model_name)) 211 | if not os.path.exists('./{}_results/{}'.format(model_name, data_name)): 212 | os.mkdir('./{}_results/{}'.format(model_name, data_name)) 213 | 214 | root_causes = pd.DataFrame(root_causes) 215 | root_causes.columns = [['root_cause', 'score']] 216 | 
root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(model_name, data_name, metric, model_name, data_name), 217 | index=False) 218 | final_root_results[metric] = root_causes 219 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True) 220 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(model_name, data_name), index=False) 221 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(model_name, data_name)) 222 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index() 223 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False) 224 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(model_name, data_name, model_name, data_name), index=False) 225 | 226 | 227 | if __name__ == '__main__': 228 | parser = argparse.ArgumentParser(description='Baro') 229 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset") 230 | parser.add_argument("-model", type=str, default='rcd', help="model name, [rcd, circa, epsilon_diagnosis], default is rcd") 231 | parser.set_defaults(validation=True) 232 | args = parser.parse_args() 233 | main(args) 234 | 235 | 236 | 237 | 238 | 239 | 240 | -------------------------------------------------------------------------------- /Baseline/log_only/baro_main_log.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import preprocessing 4 | import os 5 | from sklearn.feature_selection import VarianceThreshold 6 | from baro_algorithm import bocpd, robust_scorer 7 | import networkx as nx 8 | import argparse 9 | 10 | 11 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame: 12 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph()) 13 | while True: 14 | try: 15 | cycle = nx.find_cycle(G, orientation='original') 16 | G.remove_edge(*cycle[0][:2]) 17 | 18 | except nx.NetworkXNoCycle: 19 | break 20 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int) 21 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix) 22 | 23 | print("Now, the adjacency matrix does not have cycles.") 24 | return adj_matrix_no_cycles 25 | 26 | 27 | def main(args): 28 | metric_data = {} 29 | columns_common = {} 30 | method = 'baro' 31 | data_name = args.case 32 | metric_path = '../data/{}'.format(data_name) 33 | if data_name == '20220606': 34 | label = 'reviews-v3' 35 | elif data_name == '20210517' or data_name == '20210524': 36 | label = 'Book_Info_product' 37 | elif data_name == '20211203': 38 | label = 'ratings.book-info.svc.cluster.local:9080/*' 39 | elif data_name == '20240215': 40 | label = 'pod usage' 41 | elif data_name == '20240124': 42 | label = 'scenario8_app_request' 43 | elif data_name == '20231207': 44 | label = 'book_info' 45 | elif data_name == '20231221': 46 | label = 'book_info' 47 | elif data_name == '20240115': 48 | label = 'book_info' 49 | else: 50 | raise ValueError('Invalid data_name') 51 | 52 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1} 53 | if data_name in ['20220606', '20210517', '20210524', '20211203', '20231207', '20240124', '20240115', '20231221']: 54 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 55 | elif data_name in ['20231207', '20240215']: 56 | log_label = 'book_info' 57 | else: 58 | raise ValueError('Invalid data_name') 59 | model_name = 'baro' 60 | 61 | pathset = "./output/" 62 | if not(os.path.exists(pathset)): 63 | 
os.mkdir(pathset) 64 | 65 | for metric, weight in POD_METRIC_FILE.items(): 66 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']: 67 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric) 68 | metric_data[metric] = np.load(metric_file,allow_pickle=True).item() 69 | if len(metric_data[metric].keys()) == 1: 70 | if log_label != label: 71 | metric_data[metric][label] = metric_data[metric][log_label] 72 | del metric_data[metric][log_label] 73 | else: 74 | metric_data[metric][label] = metric_data[metric] 75 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name'] 76 | del metric_data[metric][label]['Node_Name'] 77 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T 78 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']] 79 | if columns_common: 80 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 81 | else: 82 | columns_common = list(metric_data[metric][label]['Pod_Name']) 83 | else: 84 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric) 85 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 86 | if columns_common: 87 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 88 | else: 89 | columns_common = list(metric_data[metric][label]['Pod_Name']) 90 | 91 | 92 | index_data = {} 93 | metric_names = [] 94 | metric_weight_assigned = [] 95 | for metric, weight in POD_METRIC_FILE.items(): 96 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common] 97 | metric_names = metric_names + [metric] 98 | metric_weight_assigned = metric_weight_assigned + [weight] 99 | 100 | metric_weight = np.zeros((len(POD_METRIC_FILE),1)) 101 | metric_id = 0 102 | final_root_results = {} 103 | 104 | for metric in metric_names: 105 | print('For metric:', metric) 106 | data = metric_data[metric] 107 | X = data[label]['Sequence'] 108 | index = index_data[metric] 109 | 110 | # Preprocessing to reduce the redundant samples 111 | if X.shape[0] // 100 < 100: 112 | patch = 20 113 | else: 114 | patch = 100 115 | sample = X.shape[0]//patch 116 | X = X[:patch*sample,:] 117 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 118 | X_metric = X[:, index] 119 | X_metric = preprocessing.normalize(X_metric, axis=0, norm = 'l1') 120 | X = np.append(X_metric, X[:, -1].reshape(-1,1), axis=1) 121 | columns = list(columns_common) + data[label]['KPI_Feature'] 122 | 123 | 124 | std = np.std(X[:, :-1], axis=0) 125 | idx_std = [i for i, x in enumerate(std > 1e-5) if x] 126 | if len(idx_std) == 0: 127 | metric_weight[metric_id] = 0 128 | metric_id = metric_id + 1 129 | print(metric,' all pods are all constant or quasi-constant') 130 | continue 131 | 132 | selector = VarianceThreshold(threshold = 0) 133 | X_var = selector.fit_transform(X[:, :-1]) 134 | idx = selector.get_support(indices = True) 135 | #print('X shape after variance: ', X_var.shape) 136 | if X_var.shape[1] < 1: 137 | metric_weight[metric_id] = 0 138 | metric_id = metric_id + 1 139 | print(metric,' all pods are all constant or quasi-constant') 140 | continue 141 | 142 | # causal_score = np.zeros(len(columns_common)) 143 | mask = np.full(len(columns_common), False,dtype=bool) 144 | mask[idx] = True 145 | idx = list(idx) + [X.shape[1]-1] 146 | X = X[:, idx] 147 | columns = [columns[i] for i in idx] 148 | X = pd.DataFrame(X,columns=columns) 149 | 150 | X.insert(0,'time', 
pd.date_range(start='2024-01-01', periods=len(X), freq='D')) 151 | 152 | X['time'] = X['time'].astype('int64') // 1_000_000_000 153 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)] 154 | 155 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min()) 156 | 157 | anomalies = bocpd(X) 158 | print("Anomalies are detected at timestep:", anomalies[0]) 159 | results = robust_scorer(X,anomalies=anomalies) 160 | print(results) 161 | 162 | root_causes = [] 163 | for result in results: 164 | (root_cause, score) = result 165 | root_causes.append([root_cause, score]) 166 | if not os.path.exists('./{}_results'.format(method)): 167 | os.mkdir('./{}_results'.format(method)) 168 | if not os.path.exists('./{}_results/{}'.format(method, data_name)): 169 | os.mkdir('./{}_results/{}'.format(method, data_name)) 170 | 171 | root_causes = pd.DataFrame(root_causes) 172 | root_causes.columns = [['root_cause', 'score']] 173 | root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(method, data_name, metric, model_name, data_name),index=False) 174 | 175 | final_root_results[metric] = root_causes 176 | 177 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True) 178 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(method, data_name), index=False) 179 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(method, data_name)) 180 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index() 181 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False) 182 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(method, data_name, model_name, data_name), index=False) 183 | 184 | 185 | if __name__ == '__main__': 186 | parser = argparse.ArgumentParser(description='Baro') 187 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset") 188 | parser.set_defaults(validation=True) 189 | args = parser.parse_args() 190 | main(args) 191 | -------------------------------------------------------------------------------- /Baseline/metric_only/RCA_methods_metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import preprocessing 4 | import os 5 | from sklearn.feature_selection import VarianceThreshold 6 | from sklearn.model_selection import train_test_split 7 | from causalnex.structure.notears import from_pandas 8 | from pyrca.analyzers.ht import HT, HTConfig 9 | from pyrca.analyzers.epsilon_diagnosis import EpsilonDiagnosis, EpsilonDiagnosisConfig 10 | from pyrca.analyzers.rcd import RCD, RCDConfig 11 | # from pyrca.analyzers 12 | import networkx as nx 13 | import argparse 14 | 15 | 16 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame: 17 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph()) 18 | while True: 19 | try: 20 | cycle = nx.find_cycle(G, orientation='original') 21 | G.remove_edge(*cycle[0][:2]) 22 | 23 | except nx.NetworkXNoCycle: 24 | break 25 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int) 26 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix) 27 | 28 | print("Now, the adjacency matrix does not have cycles.") 29 | return adj_matrix_no_cycles 30 | 31 | 32 | 33 | def main(args): 34 | model_name = args.model 35 | data_name = args.case 36 | metric_data = {} 37 | columns_common = {} 38 | metric_path = 
'../data/{}'.format(data_name) 39 | if data_name == '20220606': 40 | label = 'reviews-v3' 41 | elif data_name == '20210517' or data_name == '20210524': 42 | label = 'Book_Info_product' 43 | elif data_name == '20211203': 44 | label = 'ratings.book-info.svc.cluster.local:9080/*' 45 | elif data_name == '20240215': 46 | label = 'pod usage' 47 | elif data_name == '20240124': 48 | label = 'scenario8_app_request' 49 | elif data_name == '20231207': 50 | label = 'book_info' 51 | elif data_name == '20231221': 52 | label = 'book_info' 53 | elif data_name == '20240115': 54 | label = 'book_info' 55 | else: 56 | raise ValueError('Invalid data_name') 57 | 58 | if data_name in ['20220606', '20210517', '20210524', '20211203', '20231207']: 59 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 60 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 61 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 62 | elif data_name in ['20231207']: 63 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 64 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 65 | log_label = 'book_info' 66 | elif data_name in ['20240124']: 67 | POD_METRIC_FILE = {'cpu_usage': 1, 'disk_used': 1, 'diskio_reads': 1, 'diskio_writes': 1, 'memory_used': 1, 68 | 'netstat_established': 1, 'swap_used': 1} 69 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 70 | elif data_name in ['20240215']: 71 | POD_METRIC_FILE = {'pod_cpu_limit': 1, 'pod_cpu_usage_total': 1, 'pod_cpu_utilization_over_pod_limit': 1, 'pod_memory_limit': 1, 72 | 'pod_memory_utilization_over_pod_limit': 1, 'pod_memory_working_set': 1, 'pod_network_rx_bytes': 1, 'pod_network_tx_bytes':1} 73 | log_label = 'book_info' 74 | elif data_name in ['20240115']: 75 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 76 | 'rate_storage_iops': 1, 'received_bandwidth': 1, 'transmit_bandwidth': 1} 77 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 78 | elif data_name in ['20231221']: 79 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 80 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 81 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 82 | else: 83 | raise ValueError('Invalid data_name') 84 | 85 | pathset = "./output/" 86 | if not (os.path.exists(pathset)): 87 | os.mkdir(pathset) 88 | 89 | for metric, weight in POD_METRIC_FILE.items(): 90 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']: 91 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric) 92 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 93 | # log_label = 'ratings.book-info.svc.cluster.local:9080/*' 94 | if len(metric_data[metric].keys()) == 1: 95 | if log_label != label: 96 | metric_data[metric][label] = metric_data[metric][log_label] 97 | del metric_data[metric][log_label] 98 | else: 99 | metric_data[metric][label] = metric_data[metric] 100 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name'] 101 | del metric_data[metric][label]['Node_Name'] 102 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T 103 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']] 104 | if columns_common: 105 | columns_common = 
list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 106 | else: 107 | columns_common = list(metric_data[metric][label]['Pod_Name']) 108 | else: 109 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric) 110 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 111 | if columns_common: 112 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 113 | else: 114 | columns_common = list(metric_data[metric][label]['Pod_Name']) 115 | 116 | index_data = {} 117 | metric_names = [] 118 | metric_weight_assigned = [] 119 | for metric, weight in POD_METRIC_FILE.items(): 120 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common] 121 | metric_names = metric_names + [metric] 122 | metric_weight_assigned = metric_weight_assigned + [weight] 123 | 124 | metric_weight = np.zeros((len(POD_METRIC_FILE), 1)) 125 | metric_id = 0 126 | final_root_results = {} 127 | 128 | for metric in metric_names: 129 | print('For metric:', metric) 130 | data = metric_data[metric] 131 | X = data[label]['Sequence'] 132 | index = index_data[metric] 133 | # Preprocessing to reduce the redundant samples 134 | if X.shape[0] // 100 < 100: 135 | patch = 20 136 | else: 137 | patch = 100 138 | sample = X.shape[0] // patch 139 | X = X[:patch * sample, :] 140 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 141 | X_metric = X[:, index] 142 | X_metric = preprocessing.normalize(X_metric, axis=0, norm='l1') 143 | X = np.append(X_metric, X[:, -1].reshape(-1, 1), axis=1) 144 | columns = list(columns_common) + data[label]['KPI_Feature'] 145 | 146 | std = np.std(X[:, :-1], axis=0) 147 | idx_std = [i for i, x in enumerate(std > 1e-5) if x] 148 | if len(idx_std) == 0: 149 | metric_weight[metric_id] = 0 150 | metric_id = metric_id + 1 151 | print(metric, ' all pods are all constant or quasi-constant') 152 | continue 153 | 154 | selector = VarianceThreshold(threshold=0) 155 | X_var = selector.fit_transform(X[:, :-1]) 156 | idx = selector.get_support(indices=True) 157 | # print('X shape after variance: ', X_var.shape) 158 | if X_var.shape[1] < 1: 159 | metric_weight[metric_id] = 0 160 | metric_id = metric_id + 1 161 | print(metric, ' all pods are all constant or quasi-constant') 162 | continue 163 | 164 | mask = np.full(len(columns_common), False, dtype=bool) 165 | mask[idx] = True 166 | idx = list(idx) + [X.shape[1] - 1] 167 | X = X[:, idx] 168 | columns = [columns[i] for i in idx] 169 | X = pd.DataFrame(X, columns=columns) 170 | if model_name == 'circa': 171 | sm = from_pandas(X) 172 | estimated_matrix = nx.to_pandas_adjacency(sm) 173 | quantile_value = np.quantile(estimated_matrix.values.flatten(), 0.95) 174 | estimated_matrix = (estimated_matrix > quantile_value).astype(int) 175 | estimated_matrix = remove_cycles_from_adjacency_matrix(estimated_matrix) 176 | # estimated_matrix.to_csv("{}_adjacency.csv".format(metric)) 177 | 178 | X_train, X_test = train_test_split(X, test_size=0.6, shuffle=False) 179 | 180 | X.insert(0, 'time', pd.date_range(start='2024-01-01', periods=len(X), freq='D')) 181 | 182 | X['time'] = X['time'].astype('int64') // 1_000_000_000 183 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)] 184 | 185 | 186 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min()) 187 | 188 | if model_name == 'rcd': 189 | model = RCD(config=RCDConfig(k=3,alpha_limit=0.5)) 190 | results = 
model.find_root_causes(X_train, X_test).to_list() 191 | print(results) 192 | elif model_name == 'circa': 193 | model = HT(config=HTConfig(graph=estimated_matrix, root_cause_top_k=10)) 194 | model.train(X_train) 195 | results = model.find_root_causes(X_test, metric_data[metric][label]['KPI_Feature'][0], True).to_list() 196 | elif model_name == 'epsilon_diagnosis': 197 | model = EpsilonDiagnosis(config=EpsilonDiagnosisConfig(root_cause_top_k=10)) 198 | model.train(X) 199 | results = model.find_root_causes(X).to_list() 200 | else: 201 | raise ValueError('Invalid model_name') 202 | 203 | root_causes = [] 204 | for result in results: 205 | root_causes.append([result['root_cause'], result['score']]) 206 | if not os.path.exists('./{}_results'.format(model_name)): 207 | os.mkdir('./{}_results'.format(model_name)) 208 | if not os.path.exists('./{}_results/{}'.format(model_name, data_name)): 209 | os.mkdir('./{}_results/{}'.format(model_name, data_name)) 210 | 211 | root_causes = pd.DataFrame(root_causes) 212 | root_causes.columns = [['root_cause', 'score']] 213 | root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(model_name, data_name, metric, model_name, data_name), 214 | index=False) 215 | final_root_results[metric] = root_causes 216 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True) 217 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(model_name, data_name), index=False) 218 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(model_name, data_name)) 219 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index() 220 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False) 221 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(model_name, data_name, model_name, data_name), index=False) 222 | 223 | 224 | if __name__ == '__main__': 225 | parser = argparse.ArgumentParser(description='Baro') 226 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset") 227 | parser.add_argument("-model", type=str, default='rcd', help="model name, [rcd, circa, epsilon_diagnosis], default is rcd") 228 | parser.set_defaults(validation=True) 229 | args = parser.parse_args() 230 | main(args) 231 | 232 | 233 | 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /Baseline/metric_only/baro_main_metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import preprocessing 4 | import os 5 | from sklearn.feature_selection import VarianceThreshold 6 | from baro_algorithm import bocpd, robust_scorer 7 | import networkx as nx 8 | import argparse 9 | 10 | 11 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame: 12 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph()) 13 | while True: 14 | try: 15 | cycle = nx.find_cycle(G, orientation='original') 16 | G.remove_edge(*cycle[0][:2]) 17 | 18 | except nx.NetworkXNoCycle: 19 | break 20 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int) 21 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix) 22 | 23 | print("Now, the adjacency matrix does not have cycles.") 24 | return adj_matrix_no_cycles 25 | 26 | 27 | def main(args): 28 | metric_data = {} 29 | columns_common = {} 30 | method = 'baro' 31 | data_name = args.case 32 | metric_path = '../data/{}'.format(data_name) 33 | if data_name == 
'20220606': 34 | label = 'reviews-v3' 35 | elif data_name == '20210517' or data_name == '20210524': 36 | label = 'Book_Info_product' 37 | elif data_name == '20211203': 38 | label = 'ratings.book-info.svc.cluster.local:9080/*' 39 | elif data_name == '20240215': 40 | label = 'pod usage' 41 | elif data_name == '20240124': 42 | label = 'scenario8_app_request' 43 | elif data_name == '20231207': 44 | label = 'book_info' 45 | elif data_name == '20231221': 46 | label = 'book_info' 47 | elif data_name == '20240115': 48 | label = 'book_info' 49 | else: 50 | raise ValueError('Invalid data_name') 51 | 52 | if data_name in ['20220606', '20210517', '20210524', '20211203', '20231207']: 53 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 54 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 55 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 56 | elif data_name in ['20231207']: 57 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 58 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 59 | log_label = 'book_info' 60 | elif data_name in ['20240124']: 61 | POD_METRIC_FILE = {'cpu_usage': 1, 'disk_used': 1, 'diskio_reads': 1, 'diskio_writes': 1, 'memory_used': 1, 62 | 'netstat_established': 1, 'swap_used': 1} 63 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 64 | elif data_name in ['20240215']: 65 | POD_METRIC_FILE = {'pod_cpu_limit': 1, 'pod_cpu_usage_total': 1, 'pod_cpu_utilization_over_pod_limit': 1, 'pod_memory_limit': 1, 66 | 'pod_memory_utilization_over_pod_limit': 1, 'pod_memory_working_set': 1, 'pod_network_rx_bytes': 1, 'pod_network_tx_bytes':1} 67 | log_label = 'book_info' 68 | elif data_name in ['20240115']: 69 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 70 | 'rate_storage_iops': 1, 'received_bandwidth': 1, 'transmit_bandwidth': 1} 71 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 72 | elif data_name in ['20231221']: 73 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 74 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 75 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 76 | else: 77 | raise ValueError('Invalid data_name') 78 | model_name = 'baro' 79 | 80 | pathset = "./output/" 81 | if not(os.path.exists(pathset)): 82 | os.mkdir(pathset) 83 | 84 | for metric, weight in POD_METRIC_FILE.items(): 85 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']: 86 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric) 87 | metric_data[metric] = np.load(metric_file,allow_pickle=True).item() 88 | if len(metric_data[metric].keys()) == 1: 89 | if log_label != label: 90 | metric_data[metric][label] = metric_data[metric][log_label] 91 | del metric_data[metric][log_label] 92 | else: 93 | metric_data[metric][label] = metric_data[metric] 94 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name'] 95 | del metric_data[metric][label]['Node_Name'] 96 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T 97 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']] 98 | if columns_common: 99 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 100 | else: 101 | columns_common = 
list(metric_data[metric][label]['Pod_Name']) 102 | else: 103 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric) 104 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 105 | if columns_common: 106 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 107 | else: 108 | columns_common = list(metric_data[metric][label]['Pod_Name']) 109 | 110 | 111 | index_data = {} 112 | metric_names = [] 113 | metric_weight_assigned = [] 114 | for metric, weight in POD_METRIC_FILE.items(): 115 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common] 116 | metric_names = metric_names + [metric] 117 | metric_weight_assigned = metric_weight_assigned + [weight] 118 | 119 | metric_weight = np.zeros((len(POD_METRIC_FILE),1)) 120 | metric_id = 0 121 | final_root_results = {} 122 | 123 | for metric in metric_names: 124 | print('For metric:', metric) 125 | data = metric_data[metric] 126 | X = data[label]['Sequence'] 127 | index = index_data[metric] 128 | 129 | # Preprocessing to reduce the redundant samples 130 | if X.shape[0] // 100 < 100: 131 | patch = 20 132 | else: 133 | patch = 100 134 | sample = X.shape[0]//patch 135 | X = X[:patch*sample,:] 136 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 137 | X_metric = X[:, index] 138 | X_metric = preprocessing.normalize(X_metric, axis=0, norm = 'l1') 139 | X = np.append(X_metric, X[:, -1].reshape(-1,1), axis=1) 140 | columns = list(columns_common) + data[label]['KPI_Feature'] 141 | 142 | 143 | std = np.std(X[:, :-1], axis=0) 144 | idx_std = [i for i, x in enumerate(std > 1e-5) if x] 145 | if len(idx_std) == 0: 146 | metric_weight[metric_id] = 0 147 | metric_id = metric_id + 1 148 | print(metric,' all pods are all constant or quasi-constant') 149 | continue 150 | 151 | selector = VarianceThreshold(threshold = 0) 152 | X_var = selector.fit_transform(X[:, :-1]) 153 | idx = selector.get_support(indices = True) 154 | #print('X shape after variance: ', X_var.shape) 155 | if X_var.shape[1] < 1: 156 | metric_weight[metric_id] = 0 157 | metric_id = metric_id + 1 158 | print(metric,' all pods are all constant or quasi-constant') 159 | continue 160 | 161 | # causal_score = np.zeros(len(columns_common)) 162 | mask = np.full(len(columns_common), False,dtype=bool) 163 | mask[idx] = True 164 | idx = list(idx) + [X.shape[1]-1] 165 | X = X[:, idx] 166 | columns = [columns[i] for i in idx] 167 | X = pd.DataFrame(X,columns=columns) 168 | 169 | X.insert(0,'time', pd.date_range(start='2024-01-01', periods=len(X), freq='D')) 170 | 171 | X['time'] = X['time'].astype('int64') // 1_000_000_000 172 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)] 173 | 174 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min()) 175 | 176 | anomalies = bocpd(X) 177 | print("Anomalies are detected at timestep:", anomalies[0]) 178 | results = robust_scorer(X,anomalies=anomalies) 179 | print(results) 180 | 181 | root_causes = [] 182 | for result in results: 183 | (root_cause, score) = result 184 | root_causes.append([root_cause, score]) 185 | if not os.path.exists('./{}_results'.format(method)): 186 | os.mkdir('./{}_results'.format(method)) 187 | if not os.path.exists('./{}_results/{}'.format(method, data_name)): 188 | os.mkdir('./{}_results/{}'.format(method, data_name)) 189 | 190 | root_causes = pd.DataFrame(root_causes) 191 | root_causes.columns = [['root_cause', 'score']] 192 | 
root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(method, data_name, metric, model_name, data_name),index=False) 193 | 194 | final_root_results[metric] = root_causes 195 | 196 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True) 197 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(method, data_name), index=False) 198 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(method, data_name)) 199 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index() 200 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False) 201 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(method, data_name, model_name, data_name), index=False) 202 | 203 | 204 | if __name__ == '__main__': 205 | parser = argparse.ArgumentParser(description='Baro') 206 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset") 207 | parser.set_defaults(validation=True) 208 | args = parser.parse_args() 209 | main(args) 210 | -------------------------------------------------------------------------------- /Baseline/multimodal/baro_main_combined.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import preprocessing 4 | import os 5 | from sklearn.feature_selection import VarianceThreshold 6 | from baro_algorithm import bocpd, robust_scorer 7 | import networkx as nx 8 | import argparse 9 | 10 | 11 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame: 12 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph()) 13 | while True: 14 | try: 15 | cycle = nx.find_cycle(G, orientation='original') 16 | G.remove_edge(*cycle[0][:2]) 17 | 18 | except nx.NetworkXNoCycle: 19 | break 20 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int) 21 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix) 22 | 23 | print("Now, the adjacency matrix does not have cycles.") 24 | return adj_matrix_no_cycles 25 | 26 | 27 | def main(args): 28 | metric_data = {} 29 | columns_common = {} 30 | method = 'baro' 31 | data_name = args.case 32 | metric_path = '../data/{}'.format(data_name) 33 | if data_name == '20220606': 34 | label = 'reviews-v3' 35 | elif data_name == '20210517' or data_name == '20210524': 36 | label = 'Book_Info_product' 37 | elif data_name == '20211203': 38 | label = 'ratings.book-info.svc.cluster.local:9080/*' 39 | elif data_name == '20240215': 40 | label = 'pod usage' 41 | elif data_name == '20240124': 42 | label = 'scenario8_app_request' 43 | elif data_name == '20231207': 44 | label = 'book_info' 45 | elif data_name == '20231221': 46 | label = 'book_info' 47 | elif data_name == '20240115': 48 | label = 'book_info' 49 | else: 50 | raise ValueError('Invalid data_name') 51 | 52 | if data_name in ['20220606', '20210517', '20210524', '20211203', '20231207']: 53 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1, 'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 54 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 55 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 56 | elif data_name in ['20231207']: 57 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 58 | 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 59 | 'log_frequency': 1} 60 | 
log_label = 'book_info' 61 | elif data_name in ['20240124']: 62 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1, 'cpu_usage': 1, 'disk_used': 1, 'diskio_reads': 1, 'diskio_writes': 1, 'memory_used': 1, 63 | 'netstat_established': 1, 'swap_used': 1} 64 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 65 | elif data_name in ['20240215']: 66 | POD_METRIC_FILE = {'pod_cpu_limit': 1, 'pod_cpu_usage_total': 1, 'pod_cpu_utilization_over_pod_limit': 1, 'pod_memory_limit': 1, 67 | 'pod_memory_utilization_over_pod_limit': 1, 'pod_memory_working_set': 1, 'pod_network_rx_bytes': 1, 'pod_network_tx_bytes':1, 68 | 'log_golden_signal': 1, 'log_frequency': 1} 69 | log_label = 'book_info' 70 | elif data_name in ['20240115']: 71 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 72 | 'rate_storage_iops': 1, 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 73 | 'log_frequency': 1} 74 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 75 | elif data_name in ['20231221']: 76 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 77 | 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 'log_frequency': 1} 78 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 79 | else: 80 | raise ValueError('Invalid data_name') 81 | model_name = 'baro' 82 | 83 | pathset = "./output/" 84 | if not(os.path.exists(pathset)): 85 | os.mkdir(pathset) 86 | 87 | for metric, weight in POD_METRIC_FILE.items(): 88 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']: 89 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric) 90 | metric_data[metric] = np.load(metric_file,allow_pickle=True).item() 91 | if len(metric_data[metric].keys()) == 1: 92 | if log_label != label: 93 | metric_data[metric][label] = metric_data[metric][log_label] 94 | del metric_data[metric][log_label] 95 | else: 96 | metric_data[metric][label] = metric_data[metric] 97 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name'] 98 | del metric_data[metric][label]['Node_Name'] 99 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T 100 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']] 101 | if columns_common: 102 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 103 | else: 104 | columns_common = list(metric_data[metric][label]['Pod_Name']) 105 | else: 106 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric) 107 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 108 | if columns_common: 109 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 110 | else: 111 | columns_common = list(metric_data[metric][label]['Pod_Name']) 112 | 113 | 114 | index_data = {} 115 | metric_names = [] 116 | metric_weight_assigned = [] 117 | for metric, weight in POD_METRIC_FILE.items(): 118 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common] 119 | metric_names = metric_names + [metric] 120 | metric_weight_assigned = metric_weight_assigned + [weight] 121 | 122 | metric_weight = np.zeros((len(POD_METRIC_FILE),1)) 123 | metric_id = 0 124 | final_root_results = {} 125 | 126 | for metric in metric_names: 127 | print('For metric:', metric) 128 | data = metric_data[metric] 
129 | X = data[label]['Sequence'] 130 | index = index_data[metric] 131 | 132 | # Preprocessing to reduce the redundant samples 133 | if X.shape[0] // 100 < 100: 134 | patch = 20 135 | else: 136 | patch = 100 137 | sample = X.shape[0]//patch 138 | X = X[:patch*sample,:] 139 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 140 | X_metric = X[:, index] 141 | X_metric = preprocessing.normalize(X_metric, axis=0, norm = 'l1') 142 | X = np.append(X_metric, X[:, -1].reshape(-1,1), axis=1) 143 | columns = list(columns_common) + data[label]['KPI_Feature'] 144 | 145 | 146 | std = np.std(X[:, :-1], axis=0) 147 | idx_std = [i for i, x in enumerate(std > 1e-5) if x] 148 | if len(idx_std) == 0: 149 | metric_weight[metric_id] = 0 150 | metric_id = metric_id + 1 151 | print(metric,' all pods are all constant or quasi-constant') 152 | continue 153 | 154 | selector = VarianceThreshold(threshold = 0) 155 | X_var = selector.fit_transform(X[:, :-1]) 156 | idx = selector.get_support(indices = True) 157 | #print('X shape after variance: ', X_var.shape) 158 | if X_var.shape[1] < 1: 159 | metric_weight[metric_id] = 0 160 | metric_id = metric_id + 1 161 | print(metric,' all pods are all constant or quasi-constant') 162 | continue 163 | 164 | # causal_score = np.zeros(len(columns_common)) 165 | mask = np.full(len(columns_common), False,dtype=bool) 166 | mask[idx] = True 167 | idx = list(idx) + [X.shape[1]-1] 168 | X = X[:, idx] 169 | columns = [columns[i] for i in idx] 170 | X = pd.DataFrame(X,columns=columns) 171 | 172 | X.insert(0,'time', pd.date_range(start='2024-01-01', periods=len(X), freq='D')) 173 | 174 | X['time'] = X['time'].astype('int64') // 1_000_000_000 175 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)] 176 | 177 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min()) 178 | 179 | anomalies = bocpd(X) 180 | print("Anomalies are detected at timestep:", anomalies[0]) 181 | results = robust_scorer(X,anomalies=anomalies) 182 | print(results) 183 | 184 | root_causes = [] 185 | for result in results: 186 | (root_cause, score) = result 187 | root_causes.append([root_cause, score]) 188 | if not os.path.exists('./{}_results'.format(method)): 189 | os.mkdir('./{}_results'.format(method)) 190 | if not os.path.exists('./{}_results/{}'.format(method, data_name)): 191 | os.mkdir('./{}_results/{}'.format(method, data_name)) 192 | 193 | root_causes = pd.DataFrame(root_causes) 194 | root_causes.columns = [['root_cause', 'score']] 195 | root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(method, data_name, metric, model_name, data_name),index=False) 196 | 197 | final_root_results[metric] = root_causes 198 | 199 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True) 200 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(method, data_name), index=False) 201 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(method, data_name)) 202 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index() 203 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False) 204 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(method, data_name, model_name, data_name), index=False) 205 | 206 | 207 | if __name__ == '__main__': 208 | parser = argparse.ArgumentParser(description='Baro') 209 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset") 210 | 
parser.set_defaults(validation=True) 211 | args = parser.parse_args() 212 | main(args) 213 | -------------------------------------------------------------------------------- /Crossiant_Metadata/Crossiant_Metadata_Cloud_Computing_Original.json: -------------------------------------------------------------------------------- 1 | { 2 | "_id": "6664dbf513d2f73a727d47ff", 3 | "id": "Lemma-RCA-NEC/Cloud_Computing_Original", 4 | "author": "Lemma-RCA-NEC", 5 | "sha": "78ec9604fd0446d875175650c99acf30c95158c2", 6 | "lastModified": "2024-06-09T03:19:45.000Z", 7 | "private": false, 8 | "gated": false, 9 | "disabled": false, 10 | "tags": [ 11 | "task_categories:time-series-forecasting", 12 | "size_categories:100M" 12 | DRAIN: 13 | # Similarity threshold 14 | sim_th: 0.4 15 | # Depth of all leaf nodes 16 | depth: 4 17 | max_children: 100 18 | max_clusters: 1024 19 | #extra_delimiters: ["_"] 20 | extra_delimiters: "[]" 21 | PROFILING: 22 | enabled: False 23 | report_sec: 30 24 | EXTRA: 25 | input_file_name: 26 | - "*messages" 27 | out_dir: "./drain32_result" 28 | log_format: "