├── .DS_Store
├── Baseline
│   ├── .DS_Store
│   ├── .idea
│   │   ├── .gitignore
│   │   ├── Baseline.iml
│   │   ├── inspectionProfiles
│   │   │   └── profiles_settings.xml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── vcs.xml
│   ├── FastPC
│   │   ├── baseline_evaluation.py
│   │   ├── fastPC.py
│   │   ├── hannlstm.py
│   │   ├── interdependent.py
│   │   ├── lbfgsb_scipy.py
│   │   ├── libspot.so
│   │   ├── pyspot.py
│   │   ├── rca.py
│   │   ├── test_FastPC_node_metric.py
│   │   ├── test_FastPC_pod_combine.py
│   │   ├── test_FastPC_pod_log.py
│   │   ├── test_FastPC_pod_metric.py
│   │   └── trace_expm.py
│   ├── Nezha
│   │   ├── 20240124
│   │   │   ├── 20240124-fault_list.json
│   │   │   └── root_cause_hipster.json
│   │   ├── INSTALL.md
│   │   ├── LICENSE
│   │   ├── alarm.py
│   │   ├── data_integrate.py
│   │   ├── data_parser.py
│   │   ├── log.py
│   │   ├── log_parsing.py
│   │   ├── main.py
│   │   ├── pattern_miner.py
│   │   ├── pattern_ranker.py
│   │   └── requirements.txt
│   ├── Readme.md
│   ├── SWAT&WADI
│   │   ├── .DS_Store
│   │   ├── baro-evaluation.py
│   │   ├── baro.py
│   │   ├── baro_algorithm.py
│   │   ├── pyrca-evaluation.py
│   │   └── pyrca-main.py
│   ├── log_only
│   │   ├── RCA_methods_log.py
│   │   ├── baro_algorithm.py
│   │   └── baro_main_log.py
│   ├── metric_only
│   │   ├── RCA_methods_metric.py
│   │   ├── baro_algorithm.py
│   │   └── baro_main_metric.py
│   └── multimodal
│       ├── RCA_methods_combined.py
│       ├── baro_algorithm.py
│       └── baro_main_combined.py
├── Crossiant_Metadata
│   ├── Crossiant_Metadata_Cloud_Computing_Original.json
│   ├── Crossiant_Metadata_Cloud_Computing_Preprocessed.json
│   ├── Crossiant_Metadata_Product_Review_Original.json
│   └── Crossiant_Metadata_Product_Review_Preprocessed.json
├── IT
│   └── data preprocessing
│       ├── Drain.py
│       ├── JMeter_KPI.py
│       ├── README.md
│       ├── drain3.yaml
│       ├── drain3_parse.py
│       ├── json2message.py
│       ├── log_PCA_extraction.py
│       ├── log_frequency_extraction.py
│       ├── log_golden_frequency.py
│       └── metric_json2npy.py
├── LICENSE
├── OT
│   └── data_preprocessing
│       ├── SWaT
│       │   ├── data_segment.py
│       │   ├── node_data_cut.py
│       │   ├── node_final_process.py
│       │   ├── pod_data_cut.py
│       │   ├── pod_final_process.py
│       │   └── process.sh
│       └── WADI
│           ├── data_segment.py
│           ├── node_data_cut.py
│           ├── node_final_process.py
│           ├── pod_data_cut.py
│           ├── pod_final_process.py
│           └── process.sh
├── Other
│   ├── bg.png
│   └── rca_update.png
└── README.md
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KnowledgeDiscovery/rca_baselines/c560d8cc39c19f04c9ae74fac2404b1e5e8c3b4d/.DS_Store
--------------------------------------------------------------------------------
/Baseline/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KnowledgeDiscovery/rca_baselines/c560d8cc39c19f04c9ae74fac2404b1e5e8c3b4d/Baseline/.DS_Store
--------------------------------------------------------------------------------
/Baseline/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/Baseline/.idea/Baseline.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/Baseline/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/Baseline/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/Baseline/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/Baseline/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/Baseline/FastPC/baseline_evaluation.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import sys
3 | folders = ['0606','0517','0524','0901','1203']
4 | predicts = []
5 | for fd in folders:
6 | pods_data = pd.read_csv(fd+'/output/Pod_level_combine_ranking.csv')
7 | pods = list(pods_data['pod'])
8 | # pods = [x.split('_')[1] for x in pods]
9 | predicts.append(pods)
10 |
11 | k = [1,3,5,7,10]
12 |
13 | def precision_on_topk(predicts,reals,k):
14 | pr = 0
15 | for pred, real in zip(predicts, reals):
16 | pred = pred[:k]
17 | hit_count = len(set(pred) & set(real))
18 | min_len = min(k,len(real))
19 | pr += hit_count/min_len
20 | return pr/len(reals)
21 |
22 | def mean_precision_k(predicts,reals,k):
23 | pr = 0
24 | for i in range(1,k+1):
25 | pr += precision_on_topk(predicts,reals,i)
26 | return pr/k
27 |
28 | def mrr(predicts,reals):
29 | mrr_val = 0
30 | for preds,real in zip(predicts,reals):
31 | tmp = []
32 | for real_item in real:
33 | index = preds.index(real_item) if real_item in preds else sys.maxsize-1
34 | tmp.append(index+1)
35 | mrr_val += 1/min(tmp)
36 | return mrr_val/len(reals)
37 |
38 | reals = [['productpage-v1-5f9dbcd669-z2prs'],
39 | ['catalogue-8667bb6cbc-hqzfw'],
40 | ['catalogue-85fd4965b7-q8477'],
41 | ['catalogue-6c7b9b975-xfjps'],
42 | ['mongodb-v1-64c6b69879-p4wfp']]
43 |
44 | for item in k:
45 | pr = precision_on_topk(predicts,reals,item)
46 | map_val = mean_precision_k(predicts,reals,item)
47 | mrr_val = mrr(predicts,reals)
48 | print("pr@{}:{} map@{}:{} mrr:{}".format(item,pr,item,map_val,mrr_val))
49 |
--------------------------------------------------------------------------------
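
Note: the snippet below is a minimal, self-contained sketch of how the ranking metrics defined in `baseline_evaluation.py` behave. The pod names and predictions are made up for illustration, and the two helpers are re-implemented locally (matching the definitions above) so the snippet runs without the per-case CSV files.

```
# Toy illustration of the ranking metrics used in baseline_evaluation.py.
import sys

def precision_on_topk(predicts, reals, k):
    pr = 0
    for pred, real in zip(predicts, reals):
        pred = pred[:k]
        hit_count = len(set(pred) & set(real))
        pr += hit_count / min(k, len(real))
    return pr / len(reals)

def mrr(predicts, reals):
    val = 0
    for preds, real in zip(predicts, reals):
        ranks = [preds.index(r) + 1 if r in preds else sys.maxsize for r in real]
        val += 1 / min(ranks)
    return val / len(reals)

predicts = [['pod-a', 'pod-b', 'pod-c']]   # ranked root-cause candidates (hypothetical)
reals = [['pod-b']]                        # ground-truth root cause (hypothetical)
print(precision_on_topk(predicts, reals, 1))  # 0.0 -> top-1 misses
print(precision_on_topk(predicts, reals, 3))  # 1.0 -> hit within top-3
print(mrr(predicts, reals))                   # 0.5 -> correct pod at rank 2
```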
/Baseline/FastPC/interdependent.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import argparse
4 | from sklearn import preprocessing
5 |
6 |
7 |
8 | if __name__ == '__main__':
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('--input', type=str, default='inrc')
11 |
12 | args = parser.parse_args()
13 |
14 | node_file = './0517/output/'+args.input + '_node_all.npy'
15 | pod_file = './0517/output/'+args.input + '_pod_all.npy'
16 | mp_file = '/nfs/users/zach/aiops_data/data/0517/p2n.npy'
17 |
18 | node_data = np.load(node_file, allow_pickle=True).item()
19 | pod_data = np.load(pod_file, allow_pickle=True).item()
20 | mp_data = np.load(mp_file, allow_pickle=True).item()
21 |
22 | pod_names = pod_data['columns']
23 | node_names = node_data['columns']
24 |
25 | pod_scores = pod_data['score']
26 | node_scores = node_data['score']
27 |
28 | p2s = dict(zip(pod_names, pod_scores))
29 | n2s = dict(zip(node_names, node_scores))
30 |
31 | ctotal = 0
32 | del_keys = []
33 | for p in p2s:
34 | if p not in mp_data:
35 | del_keys.append(p)
36 | continue
37 | node = mp_data[p]
38 | p2s[p] = p2s[p] * n2s[node]
39 | ctotal += p2s[p]
40 |
41 | for k in del_keys:
42 | p2s.pop(k)
43 |
44 | fd = {}
45 | for p in p2s:
46 | fd[p] = [p2s[p] / ctotal]
47 |
48 |
49 | scores = pd.DataFrame.from_dict(fd, orient='index', columns=['ranking_score'])
50 |
51 | ranking_score = scores.reset_index(drop=True).to_numpy().reshape(-1)
52 | ranking_score = preprocessing.normalize([ranking_score]).ravel()
53 | #print(ranking_score)
54 | columns = list(scores.index)
55 |
56 | #scores = scores.sort_values(by='ranking_score', ascending=False)
57 | ranking = np.argsort(ranking_score)[::-1]
58 |
59 | K = len(ranking_score)
60 | #results_combine = {}
61 |
62 | results_combine = pd.DataFrame()
63 | results_combine['ranking'] = [i+1 for i in range(K)]
64 | #results_combine = pd.DataFrame(results_combine, columns = ['ranking'])
65 | results_combine['pod'] = [columns[ranking[i]] for i in range(K)]
66 | results_combine['score'] = [ranking_score[ranking[i]] for i in range(K)]
67 | results_combine.to_csv('./0517/output/'+ args.input + '_hierarchical_ranking_metrics.csv')
68 | print(results_combine)
69 | print('Successfully output the root cause results with considering both node level and pod level')
70 |
71 |
72 |
73 |
74 |
75 |
76 |
--------------------------------------------------------------------------------
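
Note: a toy sketch of the hierarchical score combination performed by `interdependent.py` above (pod-level score weighted by the score of its host node, then normalized and ranked). All pod/node names, scores, and the pod-to-node mapping below are made up for illustration.

```
# Toy sketch of the hierarchical (pod x node) ranking in interdependent.py.
import numpy as np
import pandas as pd
from sklearn import preprocessing

p2s = {'pod-a': 0.8, 'pod-b': 0.3, 'pod-c': 0.5}                  # pod-level scores
n2s = {'node-1': 0.9, 'node-2': 0.1}                              # node-level scores
p2n = {'pod-a': 'node-1', 'pod-b': 'node-2', 'pod-c': 'node-1'}   # pod -> host node

# Weight each pod score by the score of the node it runs on, then normalize.
combined = {p: p2s[p] * n2s[p2n[p]] for p in p2s}
total = sum(combined.values())
scores = pd.DataFrame.from_dict({p: [v / total] for p, v in combined.items()},
                                orient='index', columns=['ranking_score'])

ranking_score = preprocessing.normalize([scores['ranking_score'].to_numpy()]).ravel()
order = np.argsort(ranking_score)[::-1]
for rank, idx in enumerate(order, start=1):
    print(rank, scores.index[idx], ranking_score[idx])
```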
/Baseline/FastPC/lbfgsb_scipy.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import scipy.optimize as sopt
3 |
4 |
5 | class LBFGSBScipy(torch.optim.Optimizer):
6 | """Wrap L-BFGS-B algorithm, using scipy routines.
7 | """
8 |
9 | def __init__(self, params):
10 | defaults = dict()
11 | super(LBFGSBScipy, self).__init__(params, defaults)
12 |
13 | if len(self.param_groups) != 1:
14 | raise ValueError("LBFGSBScipy doesn't support per-parameter options"
15 | " (parameter groups)")
16 |
17 | self._params = self.param_groups[0]['params']
18 | self._numel = sum([p.numel() for p in self._params])
19 |
20 | def _gather_flat_grad(self):
21 | views = []
22 | for p in self._params:
23 | if p.grad is None:
24 | view = p.data.new(p.data.numel()).zero_()
25 | elif p.grad.data.is_sparse:
26 | view = p.grad.data.to_dense().view(-1)
27 | else:
28 | view = p.grad.data.view(-1)
29 | views.append(view)
30 | return torch.cat(views, 0)
31 |
32 | def _gather_flat_bounds(self):
33 | bounds = []
34 | for p in self._params:
35 | if hasattr(p, 'bounds'):
36 | b = p.bounds
37 | else:
38 | b = [(None, None)] * p.numel()
39 | bounds += b
40 | return bounds
41 |
42 | def _gather_flat_params(self):
43 | views = []
44 | for p in self._params:
45 | if p.data.is_sparse:
46 | view = p.data.to_dense().view(-1)
47 | else:
48 | view = p.data.view(-1)
49 | views.append(view)
50 | return torch.cat(views, 0)
51 |
52 | def _distribute_flat_params(self, params):
53 | offset = 0
54 | for p in self._params:
55 | numel = p.numel()
56 | # view as to avoid deprecated pointwise semantics
57 | p.data = params[offset:offset + numel].view_as(p.data)
58 | offset += numel
59 | assert offset == self._numel
60 |
61 | def step(self, closure):
62 | """Performs a single optimization step.
63 |
64 | Arguments:
65 | closure (callable): A closure that reevaluates the model
66 | and returns the loss.
67 | """
68 | assert len(self.param_groups) == 1
69 |
70 | def wrapped_closure(flat_params):
71 | """closure must call zero_grad() and backward()"""
72 | flat_params = torch.from_numpy(flat_params)
73 | flat_params = flat_params.to(torch.get_default_dtype())
74 | self._distribute_flat_params(flat_params)
75 | loss = closure()
76 | loss = loss.item()
77 | flat_grad = self._gather_flat_grad().cpu().detach().numpy()
78 | return loss, flat_grad.astype('float64')
79 |
80 | initial_params = self._gather_flat_params()
81 | initial_params = initial_params.cpu().detach().numpy()
82 |
83 | bounds = self._gather_flat_bounds()
84 |
85 | # Magic
86 | sol = sopt.minimize(wrapped_closure,
87 | initial_params,
88 | method='L-BFGS-B',
89 | jac=True,
90 | bounds=bounds)
91 |
92 | final_params = torch.from_numpy(sol.x)
93 | final_params = final_params.to(torch.get_default_dtype())
94 | self._distribute_flat_params(final_params)
95 |
96 |
97 | def main():
98 | import torch.nn as nn
99 | # torch.set_default_dtype(torch.double)
100 |
101 | n, d, out, j = 10000, 3000, 10, 0
102 | input = torch.randn(n, d)
103 | w_true = torch.rand(d, out)
104 | w_true[j, :] = 0
105 | target = torch.matmul(input, w_true)
106 | linear = nn.Linear(d, out)
107 | linear.weight.bounds = [(0, None)] * d * out # hack
108 | for m in range(out):
109 | linear.weight.bounds[m * d + j] = (0, 0)
110 | criterion = nn.MSELoss()
111 | optimizer = LBFGSBScipy(linear.parameters())
112 | print(list(linear.parameters()))
113 |
114 | def closure():
115 | optimizer.zero_grad()
116 | output = linear(input)
117 | loss = criterion(output, target)
118 | print('loss:', loss.item())
119 | loss.backward()
120 | return loss
121 | optimizer.step(closure)
122 | print(list(linear.parameters()))
123 | print(w_true.t())
124 |
125 |
126 | if __name__ == '__main__':
127 | main()
128 |
129 |
--------------------------------------------------------------------------------
/Baseline/FastPC/libspot.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KnowledgeDiscovery/rca_baselines/c560d8cc39c19f04c9ae74fac2404b1e5e8c3b4d/Baseline/FastPC/libspot.so
--------------------------------------------------------------------------------
/Baseline/FastPC/rca.py:
--------------------------------------------------------------------------------
1 | # import pyspot as ps
2 | import numpy as np
3 | import pandas as pd
4 | import networkx as nx
5 | from causalnex.structure import dynotears
6 | from causalnex.structure.dynotears import from_pandas_dynamic
7 | from pyspot import DSpot, Spot
8 | from pingouin import partial_corr
9 | import torch
10 | from hannlstm import hannLSTM, train_model_pgd
11 | from fastPC import Fast_PC_Causal_Graph
12 | import scipy
13 | from numpy.linalg import norm, inv
14 | from sklearn import preprocessing
15 |
16 |
17 | def optENMFSoft( A, P, M, c, tau, max_iter=100):
18 |
19 | n = A.shape[0]
20 | B = (1-c) * inv(np.eye(n) - c * A)
21 | BB = B.transpose() @ B
22 |
23 | t = 1e-30
24 |
25 | e = np.ones((n, 1))
26 | s = scipy.special.softmax(B @ e)
27 | obj = norm((s @ s.transpose()) * M, 'fro') ** 2 + tau * norm(e, 1)
28 | obj_old = obj
29 | err = 1
30 | iter = 0
31 |
32 | # maxIter = 1000
33 | errorV=[]
34 |
35 | while (err > t) and (iter < max_iter):
36 | s=scipy.special.softmax(B @ e)
37 | phi=np.diag(s) - s @ s.transpose()
38 |
39 | numerator = 4*(B.transpose() @ phi) @ (P*M) @ s
40 | # print(numerator)
41 | numerator[numerator<0]=0
42 | denominator = 4 * B.transpose() @ ((phi@s@s.transpose())*M)@s+ tau * np.ones((n,1))
43 | e=e * np.sqrt(np.sqrt(numerator/denominator))
44 | # print(e)
45 | # %err=norm(e-e_old,'fro')
46 | obj=norm((s@s.transpose())*M - P,'fro') ** 2 + tau * norm(e,1)
47 | err=np.abs(obj-obj_old)
48 | obj_old=obj
49 | iter = iter +1
50 | errorV.append(err)
51 | return e
52 |
53 | def spot_detection(X, d: int=10, q: float=1e-4, n_init:int=100, level:float=0.98)->np.ndarray:
54 |
55 | # X_mean = np.mean(X, axis=0)
56 | # X_std = np.std(X, axis=0)
57 | # X_std[X_std < 1e-3] = 1
58 | # X = (X - X_mean) / X_std
59 |
60 | nvar = X.shape[1]
61 | T = X.shape[0]
62 | score_list = []
63 | for i in range(nvar):
64 | S = DSpot(d=d, q=q, n_init=n_init, level=level)
65 | score = []
66 | for t in range(T):
67 | xt = X[t, i]
68 | event = S.step(xt)
69 | st = 0
70 | if t >= n_init:
71 | # up alert
72 | if event == 1:
73 | upper_threshold = S.status().to_dict()['z_up']
74 | assert(xt >= upper_threshold)
75 |
76 | if upper_threshold == 0:
77 | upper_threshold = 0.0001
78 |
79 | st = (xt - upper_threshold) / upper_threshold
80 | # print('z_up is event!')
81 | # down alert
82 | if event == -1:
83 | lower_threshold = S.status().to_dict()['z_down']
84 | assert(xt <= lower_threshold)
85 |
86 | if lower_threshold == 0:
87 | lower_threshold = 0.0001
88 |
89 | st = (lower_threshold - xt) / lower_threshold
90 | # print('z_down is event!')
91 | st = np.abs(st)
92 | score.append(st)
93 | score_list.append(score)
94 | np_score = np.array(score_list).transpose()
95 | return np_score
96 |
97 | def detect_individual_causal(X: np.ndarray,
98 | method:str='SPOT',
99 | args:dict={'d': 10, 'q': 1e-4, 'n_init': 100, 'level':0.98})->np.ndarray:
100 | if method == 'SPOT':
101 | d = args['d']
102 | q = args['q']
103 | n_init = args['n_init']
104 | level = args['level']
105 | score = spot_detection(X, d, q, n_init, level)
106 | return score
107 |
108 |
109 | # LSTM based method
110 | def lstm(X: np.ndarray, hidden: int, context: int, lam: float, lam_ridge: float, lr: float, max_iter: int, check_every: int, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
111 | # device = torch.device('cuda:0')
112 |
113 | X_np = torch.tensor(X[np.newaxis], dtype=torch.float64, device=device)
114 | hannlstm = hannLSTM(X.shape[-1], hidden=hidden).to(device=device)
115 | X_np = X_np.float()
116 | train_loss_list = train_model_pgd(hannlstm, X_np, context=context, lam=lam, lam_ridge=lam_ridge, lr=lr, max_iter=max_iter, check_every=check_every)
117 | W_est = hannlstm.GC(False).cpu().data.numpy()
118 |
119 | return W_est
120 |
121 |
122 | def generate_causal_graph(X: np.ndarray,
123 | method: str ='dynotears',
124 | args: dict = {'lag': 10,
125 | 'lambda_w': 1e-3,
126 | 'lambda_a': 1e-3,
127 | 'max_iter':30})->np.ndarray:
128 | if method == 'lstm':
129 | torch.set_default_tensor_type(torch.FloatTensor)
130 | hidden = args['hidden']
131 | context = args['context']
132 | lam = args['lam']
133 | lam_ridge = args['lam_ridge']
134 | lr = args['lr']
135 | max_iter = args['max_iter']
136 | check_every = args['check_every']
137 | device = args['device']
138 | W_est = lstm(X, hidden, context, lam, lam_ridge, lr, max_iter, check_every, device)
139 | elif method == 'dynotears':
140 | if 'columns' not in args:
141 | columns = ['V{}'.format(i) for i in range(X.shape[1])]
142 | else:
143 | columns = args['columns']
144 | lag = args['lag']
145 | lambda_w = args['lambda_w']
146 | lambda_a = args['lambda_a']
147 | max_iter = args['max_iter']
148 |
149 | X_lag = np.roll(X,1,axis=0)
150 | for lag_o in range(2,lag+1):
151 | X_lag = np.hstack((X_lag,np.roll(X,lag_o, axis=0)))
152 | W_est = dynotears.from_numpy_dynamic(X, X_lag, lambda_w, lambda_a, max_iter)
153 | elif method == 'fastpc':
154 | W_est = Fast_PC_Causal_Graph(pd.DataFrame(X),alpha=10**-6,cuda=True)
155 | return W_est
156 |
157 | # generate transition matrix from weight matrix
158 | # W: W[i,j] i->j
159 | def generate_Q(X:np.ndarray, W:np.ndarray, RI:int, rho:float, columns: list=None):
160 | n = W.shape[0]
161 | if columns is None:
162 | columns=['V{}'.format(i) for i in range(n)]
163 | df = pd.DataFrame(X, index=[i for i in range(X.shape[0])], columns=columns)
164 |
165 | # parent nodes
166 | PAak = [columns[i] for i, x in enumerate(W[:, RI]) if (x == 1) and (i != RI)]
167 | vak = columns[RI]
168 | # PA = [[columns[j] for j, x in enumerate(W[:, i]) if x == 1] for i in range(n)]
169 | # PAak_minus = [[c for c in PAak if c!=columns[i]] for i in range(n)]
170 |
171 | # partial correlation
172 | Rpc = []
173 | for i in range(n):
174 | if i == RI:
175 | Rpc.append(0)
176 | continue
177 | vi = columns[i]
178 | PAak_minus_i = [c for c in PAak if c!=columns[i]]
179 | PAi = [columns[j] for j, x in enumerate(W[:, i]) if (x == 1) and (i != j) and (RI != j)]
180 | covar = list(set(PAak_minus_i).union(set(PAi)))
181 | rdf = partial_corr(df, x=vak, y=vi, covar=covar)
182 | Rpc.append(np.abs(rdf.values[0, 1]))
183 |
184 | Q = np.zeros((n, n))
185 | for i in range(n):
186 | P = 0
187 | for j in range(n):
188 | if i == j:
189 | continue
190 | # from result to cause
191 | if W[j][i] == 1:
192 | Q[i][j] = Rpc[j]
193 | # from cause to result:
194 | if W[i][j] == 0:
195 | Q[j][i] = rho * Rpc[i]
196 | # stay
197 | P = max(P, Q[i][j])
198 | Q[i][i] = max(0., Rpc[i] - P)
199 | # normalize each row
200 | rsum = np.sum(Q, axis=1).reshape(-1 , 1)
201 | rsum[rsum==0] = 1
202 | Q = Q / rsum
203 | return Q
204 |
205 | # random walk with restart
206 | def propagate_error(Q:np.ndarray, start:int, steps:int=1000, rp:float=0.05, max_self:int=10)->np.ndarray:
207 | n = Q.shape[0]
208 | count = np.zeros(n)
209 | current = start
210 | self_visit = 0
211 | for step in range(steps):
212 | # print(current)
213 | # input()
214 | if np.random.rand() > rp:
215 | prob = Q[current, :]
216 | if np.sum(prob) != 1:
217 | continue
218 | next = np.random.choice(n, 1, p=prob)[0]
219 | # check if need a restart, get stuck in one node
220 | if next == current:
221 | self_visit += 1
222 | if self_visit == max_self:
223 | current = start
224 | self_visit = 0
225 | continue
226 | current = next
227 | count[current] += 1
228 | else:
229 | current = start
230 | return count
231 |
232 | if __name__ == '__main__':
233 | data = np.load('may_pod_level_data.npy', allow_pickle=True).item()
234 | label = 'Book_Info_product'
235 | X = data[label]['Sequence'][:48000, :]
236 | X = np.sum(X.reshape((-1, 100, X.shape[1])), axis=1)
237 | columns = data[label]['Pod_Name'] + data[label]['JMeter_Feature']
238 | std = np.std(X, axis=0)
239 | idx = [i for i, x in enumerate(std > 1e-3) if x]
240 | # idx = list(range(30))
241 | X = X[:, idx]
242 | columns = [columns[i] for i in idx]
243 |
244 | print('X shape: ', X.shape)
245 |
246 | print('Detecting Individual Causal ...')
247 | ind_casual_score = detect_individual_causal(X, method='SPOT', args={'d':10, 'q':1e-4, 'n_init':100, 'level':0.98})
248 | ind_casual_score = np.sum(ind_casual_score, axis=0)
249 | normalized_ind_casual_score = ind_casual_score
250 | normalized_ind_casual_score[:-1] = preprocessing.normalize([ind_casual_score[:-1]])
251 | print('Detecting Individual Causal Done!')
252 |
253 | # causal graph
254 | print('Generating Causal Graph ...')
255 | cg = generate_causal_graph(X, method='fastpc', args={'lag': 20, 'lambda_w': 1e-3, 'lambda_a': 1e-2})
256 | print('Generating Causal Graph Done!')
257 | # threshold top K
258 | K = int(0.3*len(cg.reshape(-1)))
259 | threshold = sorted(cg.reshape(-1), reverse=True)[K-1]
260 | W = np.where(cg>=threshold, 1, 0)
261 | # Wij : i->j
262 | W = W.transpose()
263 | # print('W:', W[:, -1])
264 | print('Generating Q ...')
265 | Q = generate_Q(X, W, RI=W.shape[0]-1, rho=1e-2)
266 | print('Q:', Q[-1, :])
267 | print('Q sum: ', np.sum(Q))
268 | print('Generating Q Done!')
269 | # error propagation
270 | print('Propagating Error ...')
271 | steps = 10000
272 | count = propagate_error(Q, start=W.shape[0]-1, steps=steps)
273 | count /= steps
274 | print('Propagating Error Done!')
275 |
276 | # root cause ranking
277 | print('Individual Causal Score: ', normalized_ind_casual_score)
278 | print('Topological Causal score: ', count)
279 | alpha = 0.3
280 | score = alpha * normalized_ind_casual_score[:-1] + (1 - alpha) * count[:-1]
281 | # top K
282 | K = 5
283 | ranking = np.argsort(score)[::-1]
284 | for i in range(K):
285 | print('{}: {} {}'.format(i, columns[ranking[i]], score[ranking[i]]))
286 |
--------------------------------------------------------------------------------
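
Note: a small self-contained sketch of the random-walk-with-restart step used by `propagate_error` in `rca.py` above. The 3-node transition matrix `Q` is made up (rows sum to 1); in the real pipeline `Q` comes from `generate_Q`.

```
# Minimal sketch of the random walk with restart used by propagate_error in rca.py.
import numpy as np

def random_walk_with_restart(Q, start, steps=10000, rp=0.05):
    n = Q.shape[0]
    count = np.zeros(n)
    current = start
    for _ in range(steps):
        if np.random.rand() > rp:                  # follow the transition matrix
            current = np.random.choice(n, p=Q[current])
        else:                                      # restart from the anomalous KPI node
            current = start
        count[current] += 1
    return count / steps

Q = np.array([[0.1, 0.6, 0.3],     # made-up transition probabilities, row-stochastic
              [0.2, 0.1, 0.7],
              [0.3, 0.3, 0.4]])
visits = random_walk_with_restart(Q, start=0)
print('visit frequencies (higher = more likely root cause):', visits)
```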
/Baseline/FastPC/trace_expm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import scipy.linalg as slin
4 |
5 |
6 | class TraceExpm(torch.autograd.Function):
7 | @staticmethod
8 | def forward(ctx, input):
9 | # detach so we can cast to NumPy
10 | #E = slin.expm(input.detach().numpy())
11 | E = torch.matrix_exp(input)
12 | #f = np.trace(E)
13 | f = torch.trace(E)
14 | #E = torch.from_numpy(E)
15 | ctx.save_for_backward(E)
16 | return torch.as_tensor(f, dtype=input.dtype)
17 |
18 | @staticmethod
19 | def backward(ctx, grad_output):
20 | E, = ctx.saved_tensors
21 | grad_input = grad_output * E.t()
22 | return grad_input
23 |
24 |
25 | trace_expm = TraceExpm.apply
26 |
27 |
28 | def main():
29 | input = torch.randn(20, 20, dtype=torch.double, requires_grad=True)
30 | assert torch.autograd.gradcheck(trace_expm, input)
31 |
32 | input = torch.tensor([[1, 2], [3, 4.]], requires_grad=True)
33 | tre = trace_expm(input)
34 | f = 0.5 * tre * tre
35 | print('f\n', f.item())
36 | f.backward()
37 | print('grad\n', input.grad)
38 |
39 |
40 | if __name__ == '__main__':
41 | main()
42 |
--------------------------------------------------------------------------------
/Baseline/Nezha/20240124/20240124-fault_list.json:
--------------------------------------------------------------------------------
1 | {
2 | "2024-01-24": [
3 | {
4 | "inject_time": "2024-01-24 03:10:51",
5 | "inject_timestamp": "1706087451",
6 | "inject_pod": "ip-10-1-100-109.ap-northeast-1.compute.internal",
7 | "inject_type": "infinite loop bug"
8 | }
9 | ]
10 | }
--------------------------------------------------------------------------------
/Baseline/Nezha/20240124/root_cause_hipster.json:
--------------------------------------------------------------------------------
1 | {
2 | "ip-10-1-100-109.ap-northeast-1.compute.internal": {
3 | "return": "infinite loop bug",
4 | "exception": "infinite loop bug",
5 | "cpu_consumed": "pod_level_data_cpu_usage",
6 | "infinite loop bug": "pod_level_data_cpu_usage"
7 | }
8 | }
--------------------------------------------------------------------------------
/Baseline/Nezha/INSTALL.md:
--------------------------------------------------------------------------------
1 | # Quick Start
2 |
3 | ### 1.1 Requirements
4 |
5 | - Python 3.6 is recommended for running the anomaly detection; other Python 3 versions should also work.
6 | - Git is also needed.
7 |
8 | ### 1.2 Setup
9 |
10 | Download `Nezha` first with `git clone git@github.com:IntelligentDDS/Nezha.git`.
11 |
12 | Run `python3.6 -m pip install -r requirements.txt` to install the dependencies for Nezha.
13 |
14 | ### 1.3 Running Nezha
15 |
16 | #### 1.3.1 Localize OnlineBoutique at service level
17 |
18 |
19 | ```
20 | python3.6 ./main.py --ns hipster --level service
21 |
22 | pattern_ranker.py:622: -------- hipster Fault numbuer : 56-------
23 | pattern_ranker.py:623: --------AS@1 Result-------
24 | pattern_ranker.py:624: 92.857143 %
25 | pattern_ranker.py:625: --------AS@3 Result-------
26 | pattern_ranker.py:626: 96.428571 %
27 | pattern_ranker.py:627: --------AS@5 Result-------
28 | pattern_ranker.py:628: 96.428571 %
29 | ```
30 |
31 | #### 1.3.2 Localize OnlineBoutique at inner service level
32 |
33 | ```
34 | python3.6 ./main.py --ns hipster --level inner
35 |
36 | pattern_ranker.py:622: -------- hipster Fault numbuer : 56-------
37 | pattern_ranker.py:623: --------AIS@1 Result-------
38 | pattern_ranker.py:624: 92.857143 %
39 | pattern_ranker.py:625: --------AIS@3 Result-------
40 | pattern_ranker.py:626: 96.428571 %
41 | pattern_ranker.py:627: --------AIS@5 Result-------
42 | pattern_ranker.py:628: 96.428571 %
43 | ```
44 |
45 | #### 1.3.3 Localize Trainticket at service level
46 |
47 | ```
48 | python3.6 ./main.py --ns ts --level service
49 |
50 | pattern_ranker.py:622: -------- ts Fault numbuer : 45-------
51 | pattern_ranker.py:623: --------AS@1 Result-------
52 | pattern_ranker.py:624: 86.666667 %
53 | pattern_ranker.py:625: --------AS@3 Result-------
54 | pattern_ranker.py:626: 97.777778 %
55 | pattern_ranker.py:627: --------AS@5 Result-------
56 | pattern_ranker.py:628: 97.777778 %
57 | ```
58 |
59 | #### 1.3.4 Localize Trainticket at inner service level
60 |
61 | ```
62 | python3.6 ./main.py --ns ts --level inner
63 |
64 | pattern_ranker.py:622: -------- ts Fault numbuer : 45-------
65 | pattern_ranker.py:623: --------AIS@1 Result-------
66 | pattern_ranker.py:624: 86.666667 %
67 | pattern_ranker.py:625: --------AIS@3 Result-------
68 | pattern_ranker.py:626: 97.777778 %
69 | pattern_ranker.py:627: --------AIS@5 Result-------
70 | pattern_ranker.py:628: 97.777778 %
71 | ```
72 |
73 | The detailed service-level and inner-service-level results will be printed and recorded in `./log`.
--------------------------------------------------------------------------------
/Baseline/Nezha/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 IntelligentDDS
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/Baseline/Nezha/alarm.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from itertools import product
3 | import os
4 | import re
5 | import datetime
6 | from os.path import dirname
7 | from log import Logger
8 | import logging
9 | from yaml import FlowMappingEndToken
10 | import numpy as np
11 | import pandas as pd
12 | import matplotlib.pyplot as plt
13 | import statistics
14 | import numpy as np
15 |
16 | log_path = dirname(__file__) + '/log/' + str(datetime.datetime.now().strftime(
17 | '%Y-%m-%d')) + '_nezha.log'
18 | logger = Logger(log_path, logging.DEBUG, __name__).getlog()
19 |
20 |
21 | metric_threshold_dir = "metric_threshold"
22 |
23 |
24 | def get_svc(path):
25 | svc = path.rsplit('-', 1)[0]
26 | svc = svc.rsplit('-', 1)[0]
27 |
28 | return svc
29 |
30 |
31 | def generate_threshold(metric_dir, trace_file):
32 | """
33 | fun generate_threshold: calculate mean and std for each metric of each service
34 | write result to metric_threshold_dir/service.csv
35 | :parameter
36 | metric_dir - metric dir in construction phase
37 | """
38 | metric_map = {}
39 | path_list = os.listdir(metric_dir)
40 | for path in path_list:
41 | if "metric" in path:
42 | svc = path.rsplit('-', 1)[0]
43 | svc = svc.rsplit('-', 1)[0]
44 | if svc in metric_map:
45 | metric_map[svc].append(os.path.join(metric_dir, path))
46 | else:
47 | metric_map[svc] = [os.path.join(metric_dir, path)]
48 | for svc in metric_map:
49 | frames = []
50 |
51 | # get pod name
52 | for path in path_list:
53 | if svc in path:
54 | pod_name = path.split("_")[0]
55 | # print(pod_name)
56 | network_mean, network_std = get_netwrok_metric(
57 | trace_file=trace_file, pod_name=pod_name)
58 | break
59 |
60 | metric_threshold_file = metric_threshold_dir + "/" + svc + ".csv"
61 | for path in metric_map[svc]:
62 | frames.append(pd.read_csv(path, index_col=False, usecols=[
63 | 'CpuUsageRate(%)', 'MemoryUsageRate(%)', 'SyscallRead', 'SyscallWrite']))
64 | # concat pods of the same service
65 | result = pd.concat(frames)
66 | with open(metric_threshold_file, 'w', newline='') as f:
67 | writer = csv.writer(f)
68 | header = ['CpuUsageRate(%)', 'MemoryUsageRate(%)', 'SyscallRead',
69 | 'SyscallWrite', 'NetworkP90(ms)']
70 | writer.writerow(header)
71 | mean_list = []
72 | std_list = []
73 | for metric in header:
74 | if metric == 'NetworkP90(ms)':
75 | continue
76 | mean_list.append(np.mean(result[metric]))
77 | std_list.append(np.std(result[metric]))
78 | mean_list.append(network_mean)
79 | std_list.append(network_std)
80 | writer.writerow(mean_list)
81 | writer.writerow(std_list)
82 |
83 |
84 | def get_netwrok_metric(trace_file, pod_name):
85 | """
86 | func get_netwrok_metric: use trace data to get network metric
87 | :parameter
88 | time - to regex timestamp e.g, "2022-04-18 13:00"
89 | data_dir
90 | pod_name
91 | :return
92 | p90 network latency
93 | """
94 | latency_list = []
95 |
96 | if "front" in pod_name:
97 | # front end does not calculate network latency
98 | return 10, 10
99 | #
100 | # pod_reader = pd.read_csv(
101 | # trace_file, index_col='PodName', usecols=['TraceID', 'SpanID', 'ParentID', 'PodName', 'EndTimeUnixNano'])
102 | # parent_span_reader = pd.read_csv(
103 | # trace_file, index_col='SpanID', usecols=['TraceID', 'SpanID', 'ParentID', 'PodName', 'EndTimeUnixNano'])
104 | #
105 | # try:
106 | # pod_reader = pod_reader.reindex(columns=pod_name)
107 | # pod_spans = pod_reader.loc[[pod_name], ['SpanID', 'ParentID', 'PodName', 'EndTimeUnixNano']]
108 | # except:
109 | # service = pod_name.rsplit('-', 1)[0]
110 | # service = service.rsplit('-', 1)[0]
111 | #
112 | # csv_file = dirname(__file__) + "/metric_threshold/" + service + ".csv"
113 | # pod_reader = pd.read_csv(csv_file, usecols=['NetworkP90(ms)'])
114 | # # print("pod", pod_name, " not found in trace, return default ",
115 | # # float(pod_reader.iloc[0]))
116 | #
117 | # return float(pod_reader.iloc[0]), 0
118 | #
119 | # if len(pod_spans['SpanID']) > 0:
120 | # # process span independentlt and order by timestamp
121 | # for span_index in range(len(pod_spans['SpanID'])):
122 | # # span event
123 | # parent_id = pod_spans['ParentID'].iloc[span_index]
124 | # pod_start_time = int(
125 | # pod_spans['EndTimeUnixNano'].iloc[span_index])
126 | # try:
127 | # parent_pod_span = parent_span_reader.loc[[
128 | # parent_id], ['PodName', 'EndTimeUnixNano']]
129 | # if len(parent_pod_span) > 0:
130 | # for parent_span_index in range(len(parent_pod_span['PodName'])):
131 | # parent_pod_name = parent_pod_span['PodName'].iloc[parent_span_index]
132 | # parent_end_time = int(
133 | # parent_pod_span['EndTimeUnixNano'].iloc[parent_span_index])
134 | #
135 | # if str(parent_pod_name) != str(pod_name):
136 | # latency = (parent_end_time - pod_start_time) / \
137 | # 1000000 # convert to microsecond
138 | # # if "contacts-service" in pod_name:
139 | # # logger.info("%s, %s, %s, %s, %s" % (
140 | # # pod_name, pod_spans['SpanID'].iloc[span_index], parent_pod_name, pod_spans['ParentID'].iloc[span_index], latency))
141 | # latency_list.append(latency)
142 | # except:
143 | # pass
144 | # # logger.info("%s latency is %s" %(pod_name, np.percentile(latency_list, 90)))
145 | # if len(latency_list) > 2:
146 | # return np.percentile(latency_list, 90), statistics.stdev(latency_list)
147 | # else:
148 | # return 10, 10
149 | return 1, 1
150 |
151 |
152 | def determine_alarm(pod, metric_type, metric_value, std_num, ns):
153 | """
154 | fun determine_alarm: determine whether the metric violates the 3-sigma rule
155 | :parameter
156 | pod - pod name used to find the corresponding metric threshold file
157 | metric_type - find the corresponding column
158 | metric_value - compare with the historical mean and std
159 | std_num - controls std_num * std
160 | :return
161 | true - alarm
162 | false - no alarm
163 | """
164 |
165 | path_list = os.listdir(metric_threshold_dir)
166 |
167 | if metric_type == "CpuUsageRate(%)" or metric_type == 'MemoryUsageRate(%)':
168 | if metric_value > 80:
169 | return True
170 | else:
171 | if ns == "hipster":
172 | # for hipster
173 | if metric_value > 200:
174 | return True
175 | elif ns == "ts":
176 | # for ts
177 | if metric_value > 300:
178 | return True
179 | return False
180 | # for path in path_list:
181 | # if re.search(path.split('.')[0], pod):
182 | # hisory_metric = pd.read_csv(os.path.join(
183 | # metric_threshold_dir, path), index_col=False, usecols=[metric_type])
184 | # if metric_value > hisory_metric[metric_type][0] + std_num * hisory_metric[metric_type][1]:
185 | # return True
186 | # # elif metric_value < hisory_metric[metric_type][0] - std_num * hisory_metric[metric_type][1]:
187 | # # return True
188 | # else:
189 | # return False
190 |
191 |
192 | def generate_alarm(metric_list, ns, std_num=6):
193 | """
194 | func generate_alarm: generate alarms for each pod at the current minute
195 | :parameter
196 | metric_list - metric list from get_metric_with_time
197 |
198 | :return
199 | alarm_list, e.g., [{'pod': 'cartservice-579f59597d-n69b4', 'alarm': [{'metric_type': 'CpuUsageRate(%)', 'alarm_flag': True}]}]
200 | [{
201 | pod:
202 | alarm: [
203 | {
204 | metric_type: CpuUsageRate(%)
205 | alarm_flag: True
206 | }
207 | ]
208 | }]
209 | """
210 | alarm_list = []
211 | for pod_metric in metric_list:
212 | alarm = {}
213 | for i in range(len(pod_metric['metrics'])):
214 | alarm_flag = determine_alarm(pod=pod_metric["pod"], metric_type=pod_metric['metrics'][i]["metric_type"],
215 | metric_value=pod_metric['metrics'][i]["metric_value"], std_num=std_num, ns=ns)
216 | if alarm_flag:
217 | # if exist alarm_flag equal to true, create map
218 | if "pod" not in alarm:
219 | alarm = {"pod": pod_metric["pod"], "alarm": []}
220 | alarm['alarm'].append(
221 | {"metric_type": pod_metric['metrics'][i]["metric_type"], "alarm_flag": alarm_flag})
222 |
223 | if "pod" in alarm:
224 | alarm_list.append(alarm)
225 |
226 | return alarm_list
227 |
228 |
229 | def get_metric_with_time(time, base_dir):
230 | """
231 | func get_metric_with_time: get metric list for the specified minute
232 | :parameter
233 | time - to regex timestamp e.g, "2022-04-18 13:00"
234 | product_metric_dir
235 | :return
236 | target_list - target metrics
237 | [
238 | {
239 | pod:
240 | metrics: [
241 | {
242 | "metric_type":
243 | "metric_value":
244 | }
245 | ]
246 | }
247 |
248 | ]
249 | """
250 | # date = time.split(' ')[0]
251 | # hour_min = time.split(' ')[1]
252 | # hour = hour_min.split(':')[0]
253 | # min = hour_min.split(':')[1]
254 | trace_file = base_dir + "/trace/trace.csv"
255 |
256 | metric_dir = base_dir + "/metric/"
257 |
258 | path_list = os.listdir(metric_dir)
259 |
260 | # metric_list = ['CpuUsageRate(%)', 'MemoryUsageRate(%)', 'SyscallRead',
261 | # 'SyscallWrite']
262 | # metric_list = ['CpuUsageRate(%)', 'MemoryUsageRate(%)']
263 | target_list = []
264 | for path in path_list:
265 | if "metric" in path:
266 | metrics = pd.read_csv(os.path.join(metric_dir, path))
267 | metric_list = list(metrics.columns)
268 | metric_list.remove("TimeStamp")
269 | metric_list.remove("PodName")
270 | metric_list.remove("Time")
271 | if 'Date' in metric_list:
272 | metric_list.remove("Date")
273 | # metrics = pd.read_csv(os.path.join(product_metric_dir, path), index_col=False, usecols=['TimeStamp', 'PodName', 'CpuUsageRate(%)', 'MemoryUsageRate(%)', 'SyscallRead', 'SyscallWrite', 'PodServerLatencyP90(s)', 'PodClientLatencyP90(s)'])
274 | for index in range(len(metrics['Time'])):
275 | # regex timestamp
276 | if re.search(time, metrics['Time'][index]):
277 | target_metric = {
278 | "pod": metrics['PodName'][index], "metrics": []}
279 | for metric in metric_list:
280 | target_metric["metrics"].append({
281 | "metric_type": metric, "metric_value": metrics[metric][index]})
282 | network_p90, _ = get_netwrok_metric(
283 | trace_file=trace_file, pod_name=metrics['PodName'][index])
284 | target_metric["metrics"].append(
285 | {"metric_type": "NetworkP90(ms)", "metric_value": network_p90})
286 | target_list.append(target_metric)
287 | pod_num = len(path_list)
288 | # print(target_list)
289 | return target_list, pod_num
290 |
291 |
--------------------------------------------------------------------------------
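
Note: a toy sketch of the alarm-generation data flow described in the docstrings of `alarm.py` above (`metric_list` in, `alarm_list` out). The pod name and metric values are made up, and `toy_determine_alarm` only mirrors the fixed thresholds used in `determine_alarm` (CPU/Memory > 80%, other metrics > 200 for the "hipster" namespace); it is not the module's actual function.

```
# Toy sketch of generate_alarm's input/output structures in alarm.py.
def toy_determine_alarm(metric_type, metric_value, ns='hipster'):
    # Mirrors the fixed thresholds in determine_alarm (an assumption, not an import).
    if metric_type in ('CpuUsageRate(%)', 'MemoryUsageRate(%)'):
        return metric_value > 80
    return metric_value > (200 if ns == 'hipster' else 300)

metric_list = [{
    'pod': 'cartservice-579f59597d-n69b4',
    'metrics': [
        {'metric_type': 'CpuUsageRate(%)', 'metric_value': 92.3},
        {'metric_type': 'SyscallRead', 'metric_value': 150},
    ],
}]

alarm_list = []
for pod_metric in metric_list:
    alarms = [{'metric_type': m['metric_type'], 'alarm_flag': True}
              for m in pod_metric['metrics']
              if toy_determine_alarm(m['metric_type'], m['metric_value'])]
    if alarms:
        alarm_list.append({'pod': pod_metric['pod'], 'alarm': alarms})

print(alarm_list)
# [{'pod': 'cartservice-579f59597d-n69b4',
#   'alarm': [{'metric_type': 'CpuUsageRate(%)', 'alarm_flag': True}]}]
```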
/Baseline/Nezha/data_parser.py:
--------------------------------------------------------------------------------
1 | import json
2 | import glob
3 | import os
4 | import pandas as pd
5 | import csv
6 | import re # Import regular expressions
7 | import numpy as np
8 |
9 |
10 | def remove_timestamps(message):
11 | # Remove datetime info of different formats
12 | message = re.sub(r'\[\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\]', '', message)
13 | message = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z', '', message)
14 | message = re.sub(r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}', '', message)
15 | message = re.sub(r'\d{4}\.\d{2}\.\d{2} \d{2}:\d{2}:\d{2}', '', message)
16 | message = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}\+\d{4}', '', message)
17 | message = re.sub(r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}', '', message)
18 | message = re.sub(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', '', message)
19 | message = re.sub(r'\[\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} \w{3}\]', '', message)
20 | message = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}\+\d{2}:\d{2}', '', message)
21 | message = re.sub(r'\w{3} \d{1,2}, \d{4}', '', message)
22 | message = re.sub(r'\d{1,2} \w{3} \d{4}', '', message)
23 | message = re.sub(r'\d{2}:\d{2} [AP]M', '', message)
24 | message = re.sub(r'\[\d{2}/\w{3}/\d{4} \d{2}:\d{2}:\d{2}\]', '', message)
25 | message = re.sub(r'^I\d{4} \d{2}:\d{2}:\d{2}\.\d{6}\s+\d+\s+\w+.\w+:\d+\] ', '', message)
26 | message = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}Z', '', message)
27 |
28 | return message.strip()
29 |
30 |
31 |
32 | def extract_log_message(log):
33 | # Check if the log is in JSON format
34 | if "msg:" in log or "\"msg\":" in log:
35 | log_json = json.loads(log)
36 | log_message = log_json.get('msg', '')
37 | elif "msg=" in log or "\"msg\"=" in log:
38 | msg_index = log.find("msg=") if "msg=" in log else log.find("\"msg\"=")
39 | first_quote_index = log.find('"', msg_index)
40 | last_quote_index = log.find('"', first_quote_index + 1)
41 | if last_quote_index != -1:
42 | log_message = log[first_quote_index + 1:last_quote_index]
43 | else:
44 | log_message = log[first_quote_index + 1:]
45 | else:
46 | log_message = log
47 |
48 | return log_message.strip()
49 |
50 | def dependency(path,output_dir):
51 | extract_pod_list=[]
52 | extract_node_list=[]
53 | folder_list = os.listdir(path)
54 |
55 | for folder in folder_list:
56 | json_file = path + folder + "/" + "*.json"
57 | for readfile in glob.glob(json_file):
58 | print(readfile)
59 | with open(readfile) as f:
60 | jsn = json.load(f)
61 | for jsn_hit in jsn['hits']['hits']:
62 | all_proc = []
63 | all_node = []
64 | if "kubernetes" in jsn_hit['_source'] and "pod_name" in jsn_hit['_source']['kubernetes']:
65 | pod = jsn_hit['_source']['kubernetes']['pod_name']
66 | message = jsn_hit['_source']['message']
67 | timestamp = jsn_hit['_source']['@timestamp']
68 | if message.startswith('"'):
69 | message = message[1:]
70 | if message.endswith('"'):
71 | message = message[:-1]
72 | if "msg" in message:
73 | # print(message)
74 | message = extract_log_message(message)
75 | # message = json.loads(message)['msg']
76 |
77 | message = remove_timestamps(message)
78 | all_proc.append(pod)
79 | all_proc.append(timestamp)
80 | all_proc.append(message)
81 | if all_proc:
82 | extract_pod_list.append(all_proc)
83 | if "systemd" in jsn_hit['_source'] and "t" in jsn_hit['_source']['systemd']:
84 | node = jsn_hit['_source']['hostname']
85 | message = jsn_hit['_source']['message']
86 | timestamp = jsn_hit['_source']['@timestamp']
87 | if message.startswith('"'):
88 | message = message[1:]
89 | if message.endswith('"'):
90 | message = message[:-1]
91 | if "msg:" in message or "\"msg\":" in message:
92 | message = extract_log_message(message)
93 | # message = json.loads(message)['msg']
94 | all_node.append(node)
95 | all_node.append(timestamp)
96 | all_node.append(message)
97 | if all_node:
98 | extract_node_list.append(all_node)
99 | # output file
100 | data_list_col=['Node','Timestamp','Messages']
101 | node_df = pd.DataFrame(extract_node_list,columns=data_list_col)
102 | node_df.dropna()
103 | filename = 'Node_messages'
104 | node_df = node_df.sort_values(by='Timestamp')
105 | node_df.to_csv(output_dir + filename, index = False)
106 | csv_file = output_dir + filename
107 | partition_csv(csv_file, output_dir3)
108 |
109 | data_list_col=['Pod','Timestamp','Messages']
110 | pod_df = pd.DataFrame(extract_pod_list,columns=data_list_col)
111 | pod_df.dropna()
112 | filename = 'Pod_messages'
113 | pod_df = pod_df.sort_values(by='Timestamp')
114 | pod_df.to_csv(output_dir + filename, index = False)
115 | csv_file = output_dir + filename
116 | partition_csv(csv_file, output_dir2)
117 |
118 |
119 |
120 | def partition_csv(csv_file, output_dir):
121 | isExist = os.path.exists(output_dir)
122 | if not isExist:
123 | os.mkdir(output_dir)
124 | # Creates empty set - this will be used to store the values that have already been used
125 | filelist = set()
126 | # Opens the large csv file in "read" mode
127 | with open(csv_file,'r') as csvfile:
128 | read_rows = csv.reader(csvfile)
129 | # Skip the column names
130 | next(read_rows)
131 | for row in read_rows:
132 | # Store the whole row as a string (rowstring)
133 | rowstring='\t'.join(row[1:])
134 | # Defines filename as the first entry in the row - This could be made dynamic so that the user inputs a column name to use
135 | filename = (row[0])
136 | # This basically makes sure it is not looking at the header row.
137 | # If the filename is not in the filelist set, add it to the list and create new csv file with header row.
138 | if filename not in filelist:
139 | filelist.add(filename)
140 | temp_file = output_dir + str(filename +'_messages')
141 | if os.path.exists(temp_file):
142 | os.remove(temp_file)
143 | with open(temp_file,'a') as f:
144 | f.write(rowstring)
145 | f.write("\n")
146 | f.close()
147 | # If the filename is in the filelist set, append the current row to the existing csv file.
148 | else:
149 | temp_file = output_dir + str(filename + '_messages')
150 | with open(temp_file,'a') as f:
151 | f.write(rowstring)
152 | f.write("\n")
153 | f.close()
154 |
155 | def data_integration(file_list, data_path, output_dir='./rca_data/'):
156 | # Create an empty dictionary to store data split by date and hour
157 | df_dict = dict()
158 | for file in file_list:
159 | file_dir = os.path.join(data_path, file)
160 | df = pd.read_csv(file_dir)
161 | timestamps = df['Time']
162 | pod_name = file[:-4]
163 | df['Time'] = pd.to_datetime(df['Time'])
164 | # Iterate over the unique dates in the 'Time' column
165 | for date in df['Time'].dt.date.unique():
166 | # Filter data for the specific date
167 | date_data = df[df['Time'].dt.date == date]
168 | date_data['PodName'] = np.array([pod_name for _ in range(date_data.shape[0])])
169 | date_data['Container'] = np.array(['server' for _ in range(date_data.shape[0])])
170 | # Create a nested dictionary for each hour of this date
171 | hourly_dict = {}
172 |
173 | for hour in range(24):
174 | # Filter data for the specific hour within the date
175 | hourly_data = date_data[date_data['Time'].dt.hour == hour]
176 | hourly_data = hourly_data.rename(columns={
177 | 'EventTemplate': 'Log',
178 | 'Time': 'Timestamp',
179 | })
180 | # Store the split data in the nested dictionary with the hour as the key
181 | if hour not in hourly_dict:
182 | hourly_dict[hour] = hourly_data
183 | else:
184 | hourly_dict[hour] = hourly_dict[hour].append(hourly_data, ignore_index=True)
185 |
186 | # Store the hourly dictionary in the main dictionary with the date as the key
187 | df_dict[date] = hourly_dict
188 |
189 |
190 | if __name__ == "__main__":
191 | file_list =['openshift-apiserver-operator-68fd44b989-6rgcq_messages_structured.csv',
192 | 'network-operator-7c59d666f5-27lvk_messages_structured.csv',
193 | 'mongodb-v1-64c6b69879-p4wfp_messages_structured.csv',
194 | 'openshift-kube-scheduler-ocp4-control-plane-1_messages_structured.csv',
195 | 'openshift-kube-scheduler-ocp4-control-plane-2_messages_structured.csv',
196 | 'openshift-kube-scheduler-ocp4-control-plane-3_messages_structured.csv',
197 | 'ovs-ch5xp_messages_structured.csv',
198 | 'packageserver-67d8b69dc5-6rtj9_messages_structured.csv',
199 | 'prometheus-6cc8d9b85-sztrb_messages_structured.csv']
200 | # Input log data directory
201 | # path = 'Path-to-the-dataset-directory'
202 | path = '/nfs/users/zach/aiops/data/1203/log_data/pod_removed/'
203 | # Output directories
204 | output_dir='./rca_data/'
205 | data_integration(file_list, path, output_dir)
206 |
--------------------------------------------------------------------------------
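
Note: a small usage sketch for the log-cleaning helpers in `data_parser.py` above. The sample log line is made up, and the snippet assumes it is run from `Baseline/Nezha` so that `data_parser` is importable.

```
# Usage sketch for extract_log_message and remove_timestamps (sample log is made up).
from data_parser import extract_log_message, remove_timestamps

raw = '{"level":"info","msg":"GetCart called with userId=42","time":"2024-01-24T03:10:51Z"}'
msg = extract_log_message(raw)   # pulls the "msg" field out of the JSON-formatted log
msg = remove_timestamps(msg)     # strips any embedded datetime substrings
print(msg)                       # -> GetCart called with userId=42
```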
/Baseline/Nezha/log.py:
--------------------------------------------------------------------------------
1 | #encoding = utf-8
2 |
3 | import logging
4 |
5 |
6 | class Logger():
7 | def __init__(self, logname, loglevel=logging.DEBUG, loggername=None):
8 | '''
9 | Specify the log file path, the log level, and the calling module;
10 | write log records to the specified file.
11 | '''
12 | # Create a logger
13 | self.logger = logging.getLogger(loggername)
14 | self.logger.setLevel(loglevel)
15 | # Create a handler that writes to the log file
16 | fh = logging.FileHandler(logname)
17 | fh.setLevel(loglevel)
18 | if not self.logger.handlers:
19 | # Create another handler that outputs to the console
20 | ch = logging.StreamHandler()
21 | ch.setLevel(loglevel)
22 | formatter = logging.Formatter(
23 | '[%(levelname)s]%(asctime)s %(filename)s:%(lineno)d: %(message)s')
24 | fh.setFormatter(formatter)
25 | ch.setFormatter(formatter)
26 | self.logger.addHandler(fh)
27 | self.logger.addHandler(ch)
28 |
29 | def getlog(self):
30 | return self.logger
31 |
--------------------------------------------------------------------------------
/Baseline/Nezha/main.py:
--------------------------------------------------------------------------------
1 | import os.path
2 |
3 | from pattern_ranker import *
4 | import argparse
5 | from log_parsing import *
6 |
7 | file_path = './'
8 | # print(file_path)
9 | log_path = file_path + '/log/' + str(datetime.datetime.now().strftime(
10 | '%Y-%m-%d')) + '_nezha.log'
11 | print(log_path)
12 | logger = Logger(log_path, logging.DEBUG, __name__).getlog()
13 |
14 |
15 | def get_miner(ns):
16 | template_indir = file_path + '/log_template'
17 | config = TemplateMinerConfig()
18 | config.load(file_path + "/log_template/drain3_" + ns + ".ini")
19 | config.profiling_enabled = False
20 |
21 | path = file_path + '/log_template/' + ns + ".bin"
22 | persistence = FilePersistence(path)
23 | template_miner = TemplateMiner(persistence, config=config)
24 |
25 | return template_miner
26 |
27 | # def generate_trace_id(log_dir):
28 | # trace_list = []
29 | # for file in os.listdir(log_dir):
30 | # if file.endswith("_messages_structured.csv"):
31 | # trace_list.append(file[:-24])
32 | # if not os.path.exists(log_dir + '../traceid/'):
33 | # os.mkdir(log_dir + '../traceid/')
34 | # pd.DataFrame(trace_list).to_csv(log_dir + '../traceid/trace_id.csv', index=False, header=False)
35 |
36 |
37 | if __name__ == '__main__':
38 | parser = argparse.ArgumentParser(description='Nezha')
39 |
40 | parser.add_argument('--ns', default="hipster", help='namespace')
41 | parser.add_argument('--level', default="service", help='service-level or inner-service level')
42 | parser.add_argument('--log_dir', default="./20240124/log/", help='the path to log data')
43 | parser.add_argument('--metric_dir', default="./20240124/Latency/", help='the path to metric data')
44 | parser.add_argument('--save_dir', default="./20240124/", help='the path to save preprocessed data')
45 | # parser.add_argument('--level', default="service", help='service-level or inner-service level')
46 | args = parser.parse_args()
47 | ns = args.ns
48 | level = args.level
49 | save_dir = args.save_dir
50 | log_dir = args.log_dir
51 | metric_dir = args.metric_dir
52 | kpi_file = save_dir + '/kpi_20240124_latency.csv'
53 | path1 = save_dir + "./20240124-fault_list.json"
54 | kpi_data = pd.read_csv(kpi_file)
55 | normal_time1 = str(pd.to_datetime(kpi_data['timeStamp'].iloc[0], unit='s'))
56 | time_index = int(kpi_data['timeStamp'].shape[0] * 0.6)
57 | preprocess(log_dir, metric_dir, save_dir)
58 | file_path = save_dir
59 | log_template_miner = get_miner(ns)
60 | inject_list = [path1]
61 | normal_time_list = [normal_time1]
62 | if level == "service":
63 | logger.info("------- Result at service level -------")
64 | evaluation_pod(normal_time_list, inject_list, ns, log_template_miner, file_path)
65 | else:
66 | logger.info("------- Result at inner service level -------")
67 | evaluation(normal_time_list, inject_list, ns, log_template_miner, file_path)
68 |
--------------------------------------------------------------------------------
/Baseline/Nezha/pattern_miner.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from data_integrate import *
3 |
4 | log_path = dirname(__file__) + '/log/' + str(datetime.datetime.now().strftime(
5 | '%Y-%m-%d')) + '_nezha.log'
6 | logger = Logger(log_path, logging.DEBUG, __name__).getlog()
7 |
8 |
9 | # def frequent_pattern_miner(event_sequences):
10 | # """
11 | # mining frequent pattern in event sequences (Discard)
12 | # input:
13 | # - event_sequences: event sequences belonging to the traces in time window, e.g., [[1,2,3],[2,3,4]]
14 | # output:
15 | # - pattern: frequent_pattern in the events, e.g., [['54', '29', '#SUP: 9'], ['54', '30', '#SUP: 9'], ['54', '32', '#SUP: 9']]
16 | # """
17 | # print(datetime.datetime.now())
18 |
19 | # spmf_path = dirname(__file__) + "/spmf"
20 | # spmf = Spmf("CM-SPAM", input_direct=event_sequences,
21 | # output_filename="./spmf/SPAM.txt", arguments=[0.01, 2, 2], spmf_bin_location_dir=spmf_path, memory=8192)
22 | # spmf.run()
23 | # pattern = spmf.parse_output()
24 | # print(pattern)
25 | # print(datetime.datetime.now())
26 | # return pattern
27 |
28 |
29 | # def frequent_graph_miner(file_name, topk=30):
30 | # """
31 | # mining frequent graph in event graph
32 | # input:
33 | # - file_name: input filename e.g.,
34 | # output:
35 | # - pattern_list: frequent_child_graph_list [{'support': '519', 'node1': '180', 'node2': '264'}]
36 | # """
37 |
38 | # # print(datetime.datetime.now())
39 |
40 | # spmf_path = dirname(__file__) + "/spmf"
41 | # spmf = Spmf("TKG", input_filename=file_name,
42 | # output_filename="./spmf/tkg.txt", arguments=[topk, 2, False, False, True], spmf_bin_location_dir=spmf_path, memory=8192)
43 | # spmf.run()
44 | # pattern_result = spmf.parse_output()
45 |
46 | # # print(pattern_result)
47 | # # print(datetime.datetime.now())
48 |
49 | # pattern_list = []
50 | # for i in range(0, len(pattern_result), 6):
51 | # """ parse ['t # 29 * 519'], ['v 0 5'], ['v 1 265'], ['e 0 1 1'] """
52 | # support = pattern_result[i][0].split(' ')[-1]
53 | # node1 = pattern_result[i+1][0].split(' ')[-1]
54 | # node2 = pattern_result[i+2][0].split(' ')[-1]
55 | # pattern = {"support": support, "child_graph": node1 + "_" + node2}
56 | # pattern_list.append(pattern)
57 |
58 | # pattern_list.sort(key=lambda k: k['support'], reverse=True)
59 |
60 | # return pattern_list
61 |
62 |
63 | # def generate_tkg_input(event_graphs):
64 | # """
65 | # generate_tkg_input:
66 | # :parameter
67 | # event_graphs - graph list
68 | # :return
69 | # file_name - tkg input filename
70 |
71 | # details see at https://www.philippe-fournier-viger.com/spmf/TKG.php
72 | # t # 0
73 | # v 0 10
74 | # v 1 11
75 | # e 0 1 20
76 | # """
77 | # file_name = dirname(__file__) + "/spmf/" + str(datetime.datetime.now().strftime(
78 | # '%Y-%m-%d')) + "_tkg_input.txt"
79 | # f = open(file_name, "w")
80 |
81 | # graph_number = 0
82 | # node_number = 0
83 |
84 | # for graph in event_graphs:
85 | # # write head
86 | # graph_head = "t # " + str(graph_number) + "\r\n"
87 | # f.write(graph_head)
88 |
89 | # node_map = {}
90 | # node_content = ""
91 | # edge_content = ""
92 | # for key in graph.adjacency_list.keys():
93 | # if key.event not in node_map:
94 | # node_map[key.event] = node_number
95 | # node_content += "v " + \
96 | # str(node_number) + " " + str(key.event) + "\r\n"
97 | # node_number += 1
98 |
99 | # for event in graph.adjacency_list[key]:
100 | # if event.event not in node_map:
101 | # node_map[event.event] = node_number
102 | # node_content += "v " + \
103 | # str(node_number) + " " + str(event.event) + "\r\n"
104 | # node_number += 1
105 |
106 | # edge_content += "e " + \
107 | # str(node_map[key.event]) + " " + \
108 | # str(node_map[event.event]) + " 1\r\n"
109 |
110 | # f.write(node_content)
111 | # f.write(edge_content)
112 | # graph_number += 1
113 | # f.write("\r\n")
114 | # f.close()
115 |
116 | # return file_name
117 |
118 |
119 | def get_pattern_support(event_graphs):
120 | result_support_dict = {}
121 | total_pair = set()
122 |
123 | for event_graph in event_graphs:
124 | for key, value in event_graph.support_dict.items():
125 | if key in total_pair:
126 | result_support_dict[key] += value
127 | else:
128 | result_support_dict[key] = value
129 | total_pair = total_pair | event_graph.pair_set
130 |
131 | result_support_dict = dict(sorted(
132 | result_support_dict.items(), key=lambda x: x[1], reverse=True))
133 |
134 | return result_support_dict
135 |
136 |
--------------------------------------------------------------------------------
/Baseline/Nezha/requirements.txt:
--------------------------------------------------------------------------------
1 | drain3==0.9.10
2 | matplotlib==3.3.4
3 | more_itertools==8.12.0
4 | numpy==2.0.0
5 | pandas==0.23.4
6 | psutil==5.9.0
7 | PyYAML==6.0.1
8 |
--------------------------------------------------------------------------------
/Baseline/Readme.md:
--------------------------------------------------------------------------------
1 | # Baselines
2 |
3 | This folder contains the baseline methods for evaluating the Lemma-RCA datasets in both single- and multi-modal settings. Note that the SWAT and WADI datasets support only the single-modal setting.
4 |
5 | - FastPC:
6 | ```
7 | python test_FastPC_pod_metric.py -case 20240115 ## for case 20240115 metric data only
8 | python test_FastPC_pod_log.py -case 20240115 ## for case 20240115 log data only
9 | python test_FastPC_pod_combine.py -case 20240115 ## for case 20240115 with both metric and log data
10 | ```
11 |
12 | - Baro:
13 | ```
14 | cd ./metric_only
15 | python baro_main_metric.py -case 20240115 ## for case 20240115 metric data only
16 | cd ./log_only
17 | python baro_main_log.py -case 20240115 ## for case 20240115 log data only
18 | cd ./multimodal
19 | python baro_main_combined.py -case 20240115 ## for case 20240115 with both metric and log data
20 | ```
21 |
22 |
23 | - RCD:
24 | ```
25 | cd ./metric_only
26 | python RCA_methods_metric.py -case 20240115 -model rcd ## for metric data only
27 | cd ./log_only
28 | python RCA_methods_log.py -case 20240115 -model rcd ## for log data only
29 | cd ./multimodal
30 | python RCA_methods_combined.py -case 20240115 -model rcd ## for both metric and log data
31 | ```
32 |
33 | - CIRCA:
34 | ```
35 | cd ./metric_only
36 | python RCA_methods_metric.py -case 20240115 -model circa ## for metric data only
37 | cd ./log_only
38 | python RCA_methods_log.py -case 20240115 -model circa ## for log data only
39 | cd ./multimodal
40 | python RCA_methods_combined.py -case 20240115 -model circa ## for both metric and log data
41 | ```
42 |
43 | - epsilon_diagnosis:
44 | ```
45 | cd ./metric_only
46 | python RCA_methods_metric.py -case 20240115 -model epsilon_diagnosis ## for metric data only
47 | cd ./log_only
48 | python RCA_methods_log.py -case 20240115 -model epsilon_diagnosis ## for log data only
49 | cd ./multimodal
50 | python RCA_methods_combined.py -case 20240115 -model epsilon_diagnosis ## for both metric and log data
51 | ```
52 |
53 | - Nezha:
54 | ```
55 | python main.py
56 | ```
57 | For Nezha, we provide the demo code for case 20240124. Because the filenames are inconsistent across cases, you may need to change the folder name for each case accordingly.
58 |
59 | To run the baseline methods on the SWAT and WADI datasets, the only differences are the data loader and the evaluation labels. The labels are given in the corresponding scripts; a minimal loader sketch is shown below. For the Baro method:
60 |
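For reference, a minimal sketch of the shared SWAT/WADI loading and label convention, assuming `data_segments.pkl` stores one pandas DataFrame per fault segment (as `baro.py` and `pyrca-main.py` expect) and that the ground-truth sensors are hard-coded per segment in the `*-evaluation.py` scripts:

```
import pickle

# Each segment is a DataFrame of sensor readings covering one fault period.
with open('../WADI/data_segments.pkl', 'rb') as f:
    data_segments = pickle.load(f)
print(len(data_segments), 'fault segments loaded')

# Ground-truth root causes, one list per segment (truncated illustration).
reals = [['1_MV_001'], ['1_FIT_001']]
```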
61 | - Baro for SWAT & WADI:
62 | ```
63 | python baro.py
64 | python baro-evaluation.py
65 | ```
66 |
67 | The RCD, epsilon_diagnosis, and CIRCA methods are included in the pyrca package. Simply run:
68 | 
69 | - RCD, CIRCA, and epsilon_diagnosis for SWAT & WADI:
70 | ```
71 | python pyrca-main.py
72 | python pyrca-evaluation.py
73 | ```
74 |
75 | #### If you encounter the error "name 'LIBSPOT' is not defined", please double-check that you are running the code from the FastPC directory.
76 |
77 | #### If you fail to install the pyrca package on Windows, please use the following command:
78 | #### `pip install sfr-pyrca --use-pep517 git+https://github.com/SchmollerLab/python-javabridge-windows`
79 |
80 |
--------------------------------------------------------------------------------
/Baseline/SWAT&WADI/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KnowledgeDiscovery/rca_baselines/c560d8cc39c19f04c9ae74fac2404b1e5e8c3b4d/Baseline/SWAT&WADI/.DS_Store
--------------------------------------------------------------------------------
/Baseline/SWAT&WADI/baro-evaluation.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import glob
3 | import pandas as pd
4 | import numpy as np
5 | from collections import defaultdict
6 |
7 | files = glob.glob('./final*.csv')
8 | model = 'baro'
9 |
10 | model_files = defaultdict(list)
11 |
12 | for file in files:
13 | model_files[model].append(file)
14 |
15 | for key in model_files:
16 |     model_files[key] = sorted(model_files[key], key=lambda x: int(x.split('_')[-3]))  # sort by fault index so predictions align with reals
17 |
18 | print(model_files)
19 |
20 | predicts = []
21 | mfiles = model_files['baro']
22 | for mf in mfiles:
23 | print(mf)
24 | mf_data = pd.read_csv(mf)
25 | root_cause_list = list(mf_data['root_cause'].values)
26 | if 'Latency' in root_cause_list:
27 | root_cause_list.remove('Latency')
28 | predicts.append(root_cause_list)
29 |
30 | reals = [
31 | ['1_MV_001'],
32 | ['1_FIT_001'],
33 | ['2_LIT_002', '1_AIT_001'],
34 | ['2_MCV_101', '2_MCV_201', '2_MCV_301', '2_MCV_401', '2_MCV_501', '2_MCV_601'],
35 | ['2_MCV_101', '2_MCV_201'],
36 | ['1_AIT_002', '2_MV_003'],
37 | ['2_MCV_007'],
38 | ['1_P_006'],
39 | ['1_MV_001'],
40 | ['2_MCV_007'],
41 | ['2_MCV_007'],
42 | ['2_AIT_003'],
43 | ['2_MV_201', '2_P_201', '2_P_202', '2_P_203', '2_P_204', '2_P_205', '2_P_206'],
44 | ['2_LIT_002', '1_AIT_001'],
45 | ]
46 |
47 | def precision_on_topk(predicts,reals,k):
48 | pr = 0
49 | for pred, real in zip(predicts, reals):
50 | pred = pred[:k]
51 | hit_count = len(set(pred) & set(real))
52 | min_len = min(k,len(real))
53 | pr += hit_count/min_len
54 | return pr/len(reals)
55 |
56 | def mean_precision_k(predicts,reals,k):
57 | pr = 0
58 | for i in range(1,k+1):
59 | pr += precision_on_topk(predicts,reals,i)
60 | return pr/k
61 |
62 | def mrr(predicts,reals):
63 | mrr_val = 0
64 | for preds,real in zip(predicts,reals):
65 | tmp = []
66 | for real_item in real:
67 | index = preds.index(real_item) if real_item in preds else sys.maxsize-1
68 | tmp.append(index+1)
69 | mrr_val += 1/min(tmp)
70 | return mrr_val/len(reals)
71 |
72 | k = [1,3,5,10]
73 | for item in k:
74 | pr_k = precision_on_topk(predicts,reals,item)
75 | map_k = mean_precision_k(predicts,reals,item)
76 | print("Precision@{}:{}".format(item,pr_k))
77 | print('MAP@{}:{}'.format(item,map_k))
78 | print('MRR:{}'.format(mrr(predicts,reals)))
79 |
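As a quick sanity check of the ranking metrics defined above, here is a small worked example, assuming `precision_on_topk`, `mean_precision_k`, and `mrr` are available in the session (the candidate names are made up):

```
predicts = [['A', 'B', 'C'], ['X', 'Y', 'Z']]
reals    = [['B'],           ['Q', 'X']]

# Precision@1: case 1 misses (0/1), case 2 hits 'X' at rank 1 (1/1) -> 0.5
print(precision_on_topk(predicts, reals, 1))   # 0.5
# MAP@2 averages Precision@1 (0.5) and Precision@2 (0.75)
print(mean_precision_k(predicts, reals, 2))    # 0.625
# MRR: first hit at rank 2 for case 1 and rank 1 for case 2 -> (1/2 + 1) / 2
print(mrr(predicts, reals))                    # 0.75
```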
--------------------------------------------------------------------------------
/Baseline/SWAT&WADI/baro.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pickle
4 | import time
5 | import warnings
6 | warnings.filterwarnings("ignore")
7 | from sklearn.feature_selection import VarianceThreshold
8 | from baro_algorithm import bocpd, robust_scorer
9 |
10 | def data_convert(segment):
11 | columns = np.array(segment.iloc[:, 1:].columns)
12 | selector = VarianceThreshold(threshold=0)
13 | X = segment.iloc[:, 1:].values
14 | X_var = selector.fit_transform(X)
15 | idx = selector.get_support(indices=True)
16 | columns = columns[idx]
17 | X_var = pd.DataFrame(X_var)
18 | X_var.columns = list(columns)
19 | return X_var
20 |
21 | with open('../WADI/data_segments.pkl','rb') as f:
22 | data_segments = pickle.load(f)
23 |
24 |
25 | for ind,segment in enumerate(data_segments):
26 | segment = segment.iloc[:, 1:]
27 | print('{} fault starts to detect bayesian structure'.format(ind))
28 | segment = data_convert(segment)
29 | columns = np.array(segment.columns)
30 | #np.save('{}_var_name.npy'.format(ind), columns)
31 | X = segment.values
32 | patch = 100
33 | sample = X.shape[0]//patch
34 | X = X[:patch*sample,:]
35 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1)
36 | X_df = pd.DataFrame(X,columns=columns)
37 | anomalies = bocpd(X_df)
38 | print("Anomalies are detected at timestep:", anomalies[0])
39 | results = robust_scorer(X_df,anomalies=anomalies)
40 |
41 | root_causes = []
42 | for result in results:
43 | (root_cause, score) = result
44 | root_causes.append([root_cause, score])
45 | root_causes = pd.DataFrame(root_causes)
46 | root_causes.columns = [['root_cause','score']]
47 | root_causes.to_csv("./final_{}_root_cause.csv".format(ind),index=False)
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
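The patch-summing step above (also used throughout the other Baseline scripts) downsamples the series by summing every `patch` consecutive rows; a minimal numeric sketch of that reshape-and-sum:

```
import numpy as np

X = np.arange(12).reshape(6, 2)   # 6 timesteps, 2 sensors
patch = 3
sample = X.shape[0] // patch
X = X[:patch * sample, :]
# Sum each block of `patch` consecutive rows -> 2 coarser timesteps
X_down = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1)
print(X_down)
# [[ 6  9]
#  [24 27]]
```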
--------------------------------------------------------------------------------
/Baseline/SWAT&WADI/pyrca-evaluation.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import glob
3 | import pandas as pd
4 | import numpy as np
5 | from collections import defaultdict
6 |
7 | files = glob.glob('./final*.csv')
8 | model = 'ED'
9 |
10 | model_files = defaultdict(list)
11 |
12 | for file in files:
13 | if model in file:
14 | model_files[model].append(file)
15 |
16 | for key in model_files:
17 |     model_files[key] = sorted(model_files[key], key=lambda x: int(x.split('_')[-3]))  # sort by fault index so predictions align with reals
18 |
19 | predicts = []
20 | mfiles = model_files[model]
21 | print(mfiles)
22 | for mf in mfiles:
23 | print(mf)
24 | mf_data = pd.read_csv(mf)
25 | root_cause_list = list(mf_data['root_cause'].values)
26 | if 'label' in root_cause_list:
27 | root_cause_list.remove('label')
28 | predicts.append(root_cause_list)
29 |
30 | reals = [
31 | ['1_MV_001'],
32 | ['1_FIT_001'],
33 | ['2_LIT_002', '1_AIT_001'],
34 | ['2_MCV_101', '2_MCV_201', '2_MCV_301', '2_MCV_401', '2_MCV_501', '2_MCV_601'],
35 | ['2_MCV_101', '2_MCV_201'],
36 | ['1_AIT_002', '2_MV_003'],
37 | ['2_MCV_007'],
38 | ['1_P_006'],
39 | ['1_MV_001'],
40 | ['2_MCV_007'],
41 | ['2_MCV_007'],
42 | ['2_AIT_003'],
43 | ['2_MV_201', '2_P_201', '2_P_202', '2_P_203', '2_P_204', '2_P_205', '2_P_206'],
44 | ['2_LIT_002', '1_AIT_001'],
45 | ]
46 |
47 | def precision_on_topk(predicts,reals,k):
48 | pr = 0
49 | for pred, real in zip(predicts, reals):
50 | pred = pred[:k]
51 | hit_count = len(set(pred) & set(real))
52 | min_len = min(k,len(real))
53 | pr += hit_count/min_len
54 | return pr/len(reals)
55 |
56 | def mean_precision_k(predicts,reals,k):
57 | pr = 0
58 | for i in range(1,k+1):
59 | pr += precision_on_topk(predicts,reals,i)
60 | return pr/k
61 |
62 | def mrr(predicts,reals):
63 | mrr_val = 0
64 | for preds,real in zip(predicts,reals):
65 | tmp = []
66 | for real_item in real:
67 | index = preds.index(real_item) if real_item in preds else sys.maxsize-1
68 | tmp.append(index+1)
69 | mrr_val += 1/min(tmp)
70 | return mrr_val/len(reals)
71 |
72 | k = [1,3,5,10]
73 | for item in k:
74 | pr_k = precision_on_topk(predicts,reals,item)
75 | map_k = mean_precision_k(predicts,reals,item)
76 | print("Precision@{}:{}".format(item,pr_k))
77 | print('MAP@{}:{}'.format(item,map_k))
78 | print('MRR:{}'.format(mrr(predicts,reals)))
79 |
--------------------------------------------------------------------------------
/Baseline/SWAT&WADI/pyrca-main.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | import pandas as pd
4 | from sklearn import preprocessing
5 | import os
6 | from sklearn.feature_selection import VarianceThreshold
7 | from sklearn.model_selection import train_test_split
8 |
9 | from causalnex.structure.notears import from_pandas
10 | import networkx as nx
11 |
12 | from pyrca.analyzers.ht import HT, HTConfig
13 | from pyrca.analyzers.epsilon_diagnosis import EpsilonDiagnosis, EpsilonDiagnosisConfig
14 | from pyrca.analyzers.rcd import RCD, RCDConfig
15 |
16 | import pandas as pd
17 | import networkx as nx
18 | import pickle
19 |
20 |
21 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame:
22 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph())
23 | while True:
24 | try:
25 | cycle = nx.find_cycle(G, orientation='original')
26 | G.remove_edge(*cycle[0][:2])
27 | except nx.NetworkXNoCycle:
28 | break
29 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int)
30 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix)
31 | print("Now, the adjacency matrix does not have cycles.")
32 |
33 | return adj_matrix_no_cycles
34 |
35 | def data_convert(segment):
36 | columns = np.array(segment.iloc[:, 1:].columns)
37 | selector = VarianceThreshold(threshold=0)
38 | X = segment.iloc[:, 1:].values
39 | X_var = selector.fit_transform(X)
40 | idx = selector.get_support(indices=True)
41 | columns = columns[idx]
42 | X_var = pd.DataFrame(X_var)
43 | X_var.columns = list(columns)
44 |
45 | return X_var
46 |
47 | def rca(ind, segment, model_name):
48 | segment = segment.iloc[:, 1:]
49 | print('{} fault starts to detect bayesian structure'.format(ind))
50 | segment = data_convert(segment)
51 | columns = np.array(segment.columns)
52 | #np.save('{}_var_name.npy'.format(ind), columns)
53 | X = segment.values
54 | patch = 100
55 | sample = X.shape[0]//patch
56 | X = X[:patch*sample,:]
57 | X0 = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1)
58 | X = pd.DataFrame(X0,columns=columns)
59 |
60 | X_train, X_test = train_test_split(X, test_size=0.6, shuffle=False)
61 | print("Start to run")
62 | if model_name == "HT":
63 | model = HT(config=HTConfig(graph=estimated_matrix,root_cause_top_k=10))
64 | model.train(X_train)
65 | results = model.find_root_causes(X_test, "label", True).to_list()
66 | elif model_name == "RCD":
67 | model = RCD(config=RCDConfig(k=10,alpha_limit=0.5))
68 | results = model.find_root_causes(X_train,X_test).to_list()
69 | elif model_name == "ED":
70 | model = EpsilonDiagnosis(config=EpsilonDiagnosisConfig(root_cause_top_k=10))
71 | model.train(X)
72 | results = model.find_root_causes(X).to_list()
73 |
74 | print("Saving")
75 | root_causes = []
76 | for result in results:
77 | root_causes.append([result['root_cause'],result['score']])
78 | root_causes = pd.DataFrame(root_causes)
79 | root_causes.columns = [['root_cause','score']]
80 | root_causes.to_csv("final_{}_{}_root_cause.csv".format(model_name, ind),index=False)
81 |
82 | return
83 |
84 |
85 |
86 | with open('../WADI/data_segments.pkl','rb') as f:
87 | data_segments = pickle.load(f)
88 |
89 | models = ['ED', 'RCD', 'HT']
90 | # Run all
91 | for model_name in models:
92 | for ind,segment in enumerate(data_segments):
93 | print("Now running {} for data {}.".format(model_name, ind))
94 | rca(ind, segment,model_name)
95 | print("-------------------")
96 |
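Note that the `HT` branch in `rca()` references `estimated_matrix`, which this script never defines, so running the `'HT'` model as written would raise a `NameError`. Below is a minimal sketch of one way to construct it inside `rca()` before building the `HT` model, mirroring the NOTEARS-based estimation used in `RCA_methods_metric.py`; this is an assumption about the intended behavior, not the authors' original code:

```
# Hypothetical patch for the 'HT' branch: estimate a causal graph from the
# training split with NOTEARS, binarize it at the 95th percentile, and break
# cycles, as RCA_methods_metric.py does.
sm = from_pandas(X_train)
estimated_matrix = nx.to_pandas_adjacency(sm)
quantile_value = np.quantile(estimated_matrix.values.flatten(), 0.95)
estimated_matrix = (estimated_matrix > quantile_value).astype(int)
estimated_matrix = remove_cycles_from_adjacency_matrix(estimated_matrix)
model = HT(config=HTConfig(graph=estimated_matrix, root_cause_top_k=10))
```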
--------------------------------------------------------------------------------
/Baseline/log_only/RCA_methods_log.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn import preprocessing
4 | import os
5 | from sklearn.feature_selection import VarianceThreshold
6 | from sklearn.model_selection import train_test_split
7 | from causalnex.structure.notears import from_pandas
8 | from pyrca.analyzers.ht import HT, HTConfig
9 | from pyrca.analyzers.epsilon_diagnosis import EpsilonDiagnosis, EpsilonDiagnosisConfig
10 | from pyrca.analyzers.rcd import RCD, RCDConfig
11 | # from pyrca.analyzers
12 | import networkx as nx
13 | import argparse
14 |
15 |
16 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame:
17 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph())
18 | while True:
19 | try:
20 | cycle = nx.find_cycle(G, orientation='original')
21 | G.remove_edge(*cycle[0][:2])
22 |
23 | except nx.NetworkXNoCycle:
24 | break
25 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int)
26 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix)
27 |
28 | print("Now, the adjacency matrix does not have cycles.")
29 | return adj_matrix_no_cycles
30 |
31 |
32 |
33 | def main(args):
34 | model_name = args.model
35 | data_name = args.case
36 | metric_data = {}
37 | columns_common = {}
38 | metric_path = '../data/{}'.format(data_name)
39 | if data_name == '20220606':
40 | label = 'reviews-v3'
41 | elif data_name == '20210517' or data_name == '20210524':
42 | label = 'Book_Info_product'
43 | elif data_name == '20211203':
44 | label = 'ratings.book-info.svc.cluster.local:9080/*'
45 | elif data_name == '20240215':
46 | label = 'pod usage'
47 | elif data_name == '20240124':
48 | label = 'scenario8_app_request'
49 | elif data_name == '20231207':
50 | label = 'book_info'
51 | elif data_name == '20231221':
52 | label = 'book_info'
53 | elif data_name == '20240115':
54 | label = 'book_info'
55 | else:
56 | raise ValueError('Invalid data_name')
57 |
58 |     if data_name in ['20220606', '20210517', '20210524', '20211203']:  # 20231207 is handled by its own branch below
59 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1, 'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1,
60 | 'received_bandwidth': 1, 'transmit_bandwidth': 1}
61 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
62 | elif data_name in ['20231207']:
63 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1,
64 | 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1,
65 | 'log_frequency': 1}
66 | log_label = 'book_info'
67 | elif data_name in ['20240124']:
68 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1, 'cpu_usage': 1, 'disk_used': 1, 'diskio_reads': 1, 'diskio_writes': 1, 'memory_used': 1,
69 | 'netstat_established': 1, 'swap_used': 1}
70 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
71 | elif data_name in ['20240215']:
72 | POD_METRIC_FILE = {'pod_cpu_limit': 1, 'pod_cpu_usage_total': 1, 'pod_cpu_utilization_over_pod_limit': 1, 'pod_memory_limit': 1,
73 | 'pod_memory_utilization_over_pod_limit': 1, 'pod_memory_working_set': 1, 'pod_network_rx_bytes': 1, 'pod_network_tx_bytes':1,
74 | 'log_golden_signal': 1, 'log_frequency': 1}
75 | log_label = 'book_info'
76 | elif data_name in ['20240115']:
77 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1,
78 | 'rate_storage_iops': 1, 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1,
79 | 'log_frequency': 1}
80 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
81 | elif data_name in ['20231221']:
82 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1,
83 | 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 'log_frequency': 1}
84 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
85 | else:
86 | raise ValueError('Invalid data_name')
87 |
88 | pathset = "./output/"
89 | if not (os.path.exists(pathset)):
90 | os.mkdir(pathset)
91 |
92 | for metric, weight in POD_METRIC_FILE.items():
93 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']:
94 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric)
95 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item()
96 | # log_label = 'ratings.book-info.svc.cluster.local:9080/*'
97 | if len(metric_data[metric].keys()) == 1:
98 | if log_label != label:
99 | metric_data[metric][label] = metric_data[metric][log_label]
100 | del metric_data[metric][log_label]
101 | else:
102 | metric_data[metric][label] = metric_data[metric]
103 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name']
104 | del metric_data[metric][label]['Node_Name']
105 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T
106 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']]
107 | if columns_common:
108 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common))
109 | else:
110 | columns_common = list(metric_data[metric][label]['Pod_Name'])
111 | else:
112 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric)
113 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item()
114 | if columns_common:
115 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common))
116 | else:
117 | columns_common = list(metric_data[metric][label]['Pod_Name'])
118 |
119 | index_data = {}
120 | metric_names = []
121 | metric_weight_assigned = []
122 | for metric, weight in POD_METRIC_FILE.items():
123 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common]
124 | metric_names = metric_names + [metric]
125 | metric_weight_assigned = metric_weight_assigned + [weight]
126 |
127 | metric_weight = np.zeros((len(POD_METRIC_FILE), 1))
128 | metric_id = 0
129 | final_root_results = {}
130 |
131 | for metric in metric_names:
132 | print('For metric:', metric)
133 | data = metric_data[metric]
134 | X = data[label]['Sequence']
135 | index = index_data[metric]
136 | # Preprocessing to reduce the redundant samples
137 | if X.shape[0] // 100 < 100:
138 | patch = 20
139 | else:
140 | patch = 100
141 | sample = X.shape[0] // patch
142 | X = X[:patch * sample, :]
143 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1)
144 | X_metric = X[:, index]
145 | X_metric = preprocessing.normalize(X_metric, axis=0, norm='l1')
146 | X = np.append(X_metric, X[:, -1].reshape(-1, 1), axis=1)
147 | columns = list(columns_common) + data[label]['KPI_Feature']
148 |
149 | std = np.std(X[:, :-1], axis=0)
150 | idx_std = [i for i, x in enumerate(std > 1e-5) if x]
151 | if len(idx_std) == 0:
152 | metric_weight[metric_id] = 0
153 | metric_id = metric_id + 1
154 | print(metric, ' all pods are all constant or quasi-constant')
155 | continue
156 |
157 | selector = VarianceThreshold(threshold=0)
158 | X_var = selector.fit_transform(X[:, :-1])
159 | idx = selector.get_support(indices=True)
160 | # print('X shape after variance: ', X_var.shape)
161 | if X_var.shape[1] < 1:
162 | metric_weight[metric_id] = 0
163 | metric_id = metric_id + 1
164 | print(metric, ' all pods are all constant or quasi-constant')
165 | continue
166 |
167 | mask = np.full(len(columns_common), False, dtype=bool)
168 | mask[idx] = True
169 | idx = list(idx) + [X.shape[1] - 1]
170 | X = X[:, idx]
171 | columns = [columns[i] for i in idx]
172 | X = pd.DataFrame(X, columns=columns)
173 | if model_name == 'circa':
174 | sm = from_pandas(X)
175 | estimated_matrix = nx.to_pandas_adjacency(sm)
176 | quantile_value = np.quantile(estimated_matrix.values.flatten(), 0.95)
177 | estimated_matrix = (estimated_matrix > quantile_value).astype(int)
178 | estimated_matrix = remove_cycles_from_adjacency_matrix(estimated_matrix)
179 | # estimated_matrix.to_csv("{}_adjacency.csv".format(metric))
180 |
181 | X_train, X_test = train_test_split(X, test_size=0.6, shuffle=False)
182 |
183 | X.insert(0, 'time', pd.date_range(start='2024-01-01', periods=len(X), freq='D'))
184 |
185 | X['time'] = X['time'].astype('int64') // 1_000_000_000
186 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)]
187 |
188 |
189 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min())
190 |
191 | if model_name == 'rcd':
192 | model = RCD(config=RCDConfig(k=3,alpha_limit=0.5))
193 | results = model.find_root_causes(X_train, X_test).to_list()
194 | print(results)
195 | elif model_name == 'circa':
196 | model = HT(config=HTConfig(graph=estimated_matrix, root_cause_top_k=10))
197 | model.train(X_train)
198 | results = model.find_root_causes(X_test, metric_data[metric][label]['KPI_Feature'][0], True).to_list()
199 | elif model_name == 'epsilon_diagnosis':
200 | model = EpsilonDiagnosis(config=EpsilonDiagnosisConfig(root_cause_top_k=10))
201 | model.train(X)
202 | results = model.find_root_causes(X).to_list()
203 | else:
204 | raise ValueError('Invalid model_name')
205 |
206 | root_causes = []
207 | for result in results:
208 | root_causes.append([result['root_cause'], result['score']])
209 | if not os.path.exists('./{}_results'.format(model_name)):
210 | os.mkdir('./{}_results'.format(model_name))
211 | if not os.path.exists('./{}_results/{}'.format(model_name, data_name)):
212 | os.mkdir('./{}_results/{}'.format(model_name, data_name))
213 |
214 | root_causes = pd.DataFrame(root_causes)
215 | root_causes.columns = [['root_cause', 'score']]
216 | root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(model_name, data_name, metric, model_name, data_name),
217 | index=False)
218 | final_root_results[metric] = root_causes
219 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True)
220 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(model_name, data_name), index=False)
221 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(model_name, data_name))
222 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index()
223 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False)
224 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(model_name, data_name, model_name, data_name), index=False)
225 |
226 |
227 | if __name__ == '__main__':
228 | parser = argparse.ArgumentParser(description='Baro')
229 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset")
230 | parser.add_argument("-model", type=str, default='rcd', help="model name, [rcd, circa, epsilon_diagnosis], default is rcd")
231 | parser.set_defaults(validation=True)
232 | args = parser.parse_args()
233 | main(args)
234 |
235 |
236 |
237 |
238 |
239 |
240 |
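A small sanity check for the `remove_cycles_from_adjacency_matrix` helper defined at the top of this script (and repeated in the other baseline scripts), using a hypothetical 3-node cycle:

```
import pandas as pd

nodes = ['a', 'b', 'c']
adj = pd.DataFrame([[0, 1, 0],
                    [0, 0, 1],
                    [1, 0, 0]], index=nodes, columns=nodes)  # a -> b -> c -> a
dag = remove_cycles_from_adjacency_matrix(adj)
# One edge of the cycle is dropped, leaving an acyclic graph with 2 edges.
print(int(dag.values.sum()))  # 2
```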
--------------------------------------------------------------------------------
/Baseline/log_only/baro_main_log.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn import preprocessing
4 | import os
5 | from sklearn.feature_selection import VarianceThreshold
6 | from baro_algorithm import bocpd, robust_scorer
7 | import networkx as nx
8 | import argparse
9 |
10 |
11 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame:
12 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph())
13 | while True:
14 | try:
15 | cycle = nx.find_cycle(G, orientation='original')
16 | G.remove_edge(*cycle[0][:2])
17 |
18 | except nx.NetworkXNoCycle:
19 | break
20 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int)
21 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix)
22 |
23 | print("Now, the adjacency matrix does not have cycles.")
24 | return adj_matrix_no_cycles
25 |
26 |
27 | def main(args):
28 | metric_data = {}
29 | columns_common = {}
30 | method = 'baro'
31 | data_name = args.case
32 | metric_path = '../data/{}'.format(data_name)
33 | if data_name == '20220606':
34 | label = 'reviews-v3'
35 | elif data_name == '20210517' or data_name == '20210524':
36 | label = 'Book_Info_product'
37 | elif data_name == '20211203':
38 | label = 'ratings.book-info.svc.cluster.local:9080/*'
39 | elif data_name == '20240215':
40 | label = 'pod usage'
41 | elif data_name == '20240124':
42 | label = 'scenario8_app_request'
43 | elif data_name == '20231207':
44 | label = 'book_info'
45 | elif data_name == '20231221':
46 | label = 'book_info'
47 | elif data_name == '20240115':
48 | label = 'book_info'
49 | else:
50 | raise ValueError('Invalid data_name')
51 |
52 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1}
53 |     if data_name in ['20220606', '20210517', '20210524', '20211203', '20240124', '20240115', '20231221']:  # 20231207 uses the 'book_info' log label below
54 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
55 | elif data_name in ['20231207', '20240215']:
56 | log_label = 'book_info'
57 | else:
58 | raise ValueError('Invalid data_name')
59 | model_name = 'baro'
60 |
61 | pathset = "./output/"
62 | if not(os.path.exists(pathset)):
63 | os.mkdir(pathset)
64 |
65 | for metric, weight in POD_METRIC_FILE.items():
66 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']:
67 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric)
68 | metric_data[metric] = np.load(metric_file,allow_pickle=True).item()
69 | if len(metric_data[metric].keys()) == 1:
70 | if log_label != label:
71 | metric_data[metric][label] = metric_data[metric][log_label]
72 | del metric_data[metric][log_label]
73 | else:
74 | metric_data[metric][label] = metric_data[metric]
75 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name']
76 | del metric_data[metric][label]['Node_Name']
77 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T
78 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']]
79 | if columns_common:
80 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common))
81 | else:
82 | columns_common = list(metric_data[metric][label]['Pod_Name'])
83 | else:
84 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric)
85 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item()
86 | if columns_common:
87 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common))
88 | else:
89 | columns_common = list(metric_data[metric][label]['Pod_Name'])
90 |
91 |
92 | index_data = {}
93 | metric_names = []
94 | metric_weight_assigned = []
95 | for metric, weight in POD_METRIC_FILE.items():
96 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common]
97 | metric_names = metric_names + [metric]
98 | metric_weight_assigned = metric_weight_assigned + [weight]
99 |
100 | metric_weight = np.zeros((len(POD_METRIC_FILE),1))
101 | metric_id = 0
102 | final_root_results = {}
103 |
104 | for metric in metric_names:
105 | print('For metric:', metric)
106 | data = metric_data[metric]
107 | X = data[label]['Sequence']
108 | index = index_data[metric]
109 |
110 | # Preprocessing to reduce the redundant samples
111 | if X.shape[0] // 100 < 100:
112 | patch = 20
113 | else:
114 | patch = 100
115 | sample = X.shape[0]//patch
116 | X = X[:patch*sample,:]
117 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1)
118 | X_metric = X[:, index]
119 | X_metric = preprocessing.normalize(X_metric, axis=0, norm = 'l1')
120 | X = np.append(X_metric, X[:, -1].reshape(-1,1), axis=1)
121 | columns = list(columns_common) + data[label]['KPI_Feature']
122 |
123 |
124 | std = np.std(X[:, :-1], axis=0)
125 | idx_std = [i for i, x in enumerate(std > 1e-5) if x]
126 | if len(idx_std) == 0:
127 | metric_weight[metric_id] = 0
128 | metric_id = metric_id + 1
129 | print(metric,' all pods are all constant or quasi-constant')
130 | continue
131 |
132 | selector = VarianceThreshold(threshold = 0)
133 | X_var = selector.fit_transform(X[:, :-1])
134 | idx = selector.get_support(indices = True)
135 | #print('X shape after variance: ', X_var.shape)
136 | if X_var.shape[1] < 1:
137 | metric_weight[metric_id] = 0
138 | metric_id = metric_id + 1
139 | print(metric,' all pods are all constant or quasi-constant')
140 | continue
141 |
142 | # causal_score = np.zeros(len(columns_common))
143 | mask = np.full(len(columns_common), False,dtype=bool)
144 | mask[idx] = True
145 | idx = list(idx) + [X.shape[1]-1]
146 | X = X[:, idx]
147 | columns = [columns[i] for i in idx]
148 | X = pd.DataFrame(X,columns=columns)
149 |
150 | X.insert(0,'time', pd.date_range(start='2024-01-01', periods=len(X), freq='D'))
151 |
152 | X['time'] = X['time'].astype('int64') // 1_000_000_000
153 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)]
154 |
155 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min())
156 |
157 | anomalies = bocpd(X)
158 | print("Anomalies are detected at timestep:", anomalies[0])
159 | results = robust_scorer(X,anomalies=anomalies)
160 | print(results)
161 |
162 | root_causes = []
163 | for result in results:
164 | (root_cause, score) = result
165 | root_causes.append([root_cause, score])
166 | if not os.path.exists('./{}_results'.format(method)):
167 | os.mkdir('./{}_results'.format(method))
168 | if not os.path.exists('./{}_results/{}'.format(method, data_name)):
169 | os.mkdir('./{}_results/{}'.format(method, data_name))
170 |
171 | root_causes = pd.DataFrame(root_causes)
172 | root_causes.columns = [['root_cause', 'score']]
173 | root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(method, data_name, metric, model_name, data_name),index=False)
174 |
175 | final_root_results[metric] = root_causes
176 |
177 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True)
178 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(method, data_name), index=False)
179 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(method, data_name))
180 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index()
181 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False)
182 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(method, data_name, model_name, data_name), index=False)
183 |
184 |
185 | if __name__ == '__main__':
186 | parser = argparse.ArgumentParser(description='Baro')
187 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset")
188 | parser.set_defaults(validation=True)
189 | args = parser.parse_args()
190 | main(args)
191 |
--------------------------------------------------------------------------------
/Baseline/metric_only/RCA_methods_metric.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn import preprocessing
4 | import os
5 | from sklearn.feature_selection import VarianceThreshold
6 | from sklearn.model_selection import train_test_split
7 | from causalnex.structure.notears import from_pandas
8 | from pyrca.analyzers.ht import HT, HTConfig
9 | from pyrca.analyzers.epsilon_diagnosis import EpsilonDiagnosis, EpsilonDiagnosisConfig
10 | from pyrca.analyzers.rcd import RCD, RCDConfig
11 | # from pyrca.analyzers
12 | import networkx as nx
13 | import argparse
14 |
15 |
16 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame:
17 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph())
18 | while True:
19 | try:
20 | cycle = nx.find_cycle(G, orientation='original')
21 | G.remove_edge(*cycle[0][:2])
22 |
23 | except nx.NetworkXNoCycle:
24 | break
25 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int)
26 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix)
27 |
28 | print("Now, the adjacency matrix does not have cycles.")
29 | return adj_matrix_no_cycles
30 |
31 |
32 |
33 | def main(args):
34 | model_name = args.model
35 | data_name = args.case
36 | metric_data = {}
37 | columns_common = {}
38 | metric_path = '../data/{}'.format(data_name)
39 | if data_name == '20220606':
40 | label = 'reviews-v3'
41 | elif data_name == '20210517' or data_name == '20210524':
42 | label = 'Book_Info_product'
43 | elif data_name == '20211203':
44 | label = 'ratings.book-info.svc.cluster.local:9080/*'
45 | elif data_name == '20240215':
46 | label = 'pod usage'
47 | elif data_name == '20240124':
48 | label = 'scenario8_app_request'
49 | elif data_name == '20231207':
50 | label = 'book_info'
51 | elif data_name == '20231221':
52 | label = 'book_info'
53 | elif data_name == '20240115':
54 | label = 'book_info'
55 | else:
56 | raise ValueError('Invalid data_name')
57 |
58 |     if data_name in ['20220606', '20210517', '20210524', '20211203']:  # 20231207 is handled by its own branch below
59 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1,
60 | 'received_bandwidth': 1, 'transmit_bandwidth': 1}
61 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
62 | elif data_name in ['20231207']:
63 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1,
64 | 'received_bandwidth': 1, 'transmit_bandwidth': 1}
65 | log_label = 'book_info'
66 | elif data_name in ['20240124']:
67 | POD_METRIC_FILE = {'cpu_usage': 1, 'disk_used': 1, 'diskio_reads': 1, 'diskio_writes': 1, 'memory_used': 1,
68 | 'netstat_established': 1, 'swap_used': 1}
69 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
70 | elif data_name in ['20240215']:
71 | POD_METRIC_FILE = {'pod_cpu_limit': 1, 'pod_cpu_usage_total': 1, 'pod_cpu_utilization_over_pod_limit': 1, 'pod_memory_limit': 1,
72 | 'pod_memory_utilization_over_pod_limit': 1, 'pod_memory_working_set': 1, 'pod_network_rx_bytes': 1, 'pod_network_tx_bytes':1}
73 | log_label = 'book_info'
74 | elif data_name in ['20240115']:
75 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1,
76 | 'rate_storage_iops': 1, 'received_bandwidth': 1, 'transmit_bandwidth': 1}
77 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
78 | elif data_name in ['20231221']:
79 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1,
80 | 'received_bandwidth': 1, 'transmit_bandwidth': 1}
81 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
82 | else:
83 | raise ValueError('Invalid data_name')
84 |
85 | pathset = "./output/"
86 | if not (os.path.exists(pathset)):
87 | os.mkdir(pathset)
88 |
89 | for metric, weight in POD_METRIC_FILE.items():
90 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']:
91 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric)
92 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item()
93 | # log_label = 'ratings.book-info.svc.cluster.local:9080/*'
94 | if len(metric_data[metric].keys()) == 1:
95 | if log_label != label:
96 | metric_data[metric][label] = metric_data[metric][log_label]
97 | del metric_data[metric][log_label]
98 | else:
99 | metric_data[metric][label] = metric_data[metric]
100 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name']
101 | del metric_data[metric][label]['Node_Name']
102 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T
103 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']]
104 | if columns_common:
105 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common))
106 | else:
107 | columns_common = list(metric_data[metric][label]['Pod_Name'])
108 | else:
109 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric)
110 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item()
111 | if columns_common:
112 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common))
113 | else:
114 | columns_common = list(metric_data[metric][label]['Pod_Name'])
115 |
116 | index_data = {}
117 | metric_names = []
118 | metric_weight_assigned = []
119 | for metric, weight in POD_METRIC_FILE.items():
120 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common]
121 | metric_names = metric_names + [metric]
122 | metric_weight_assigned = metric_weight_assigned + [weight]
123 |
124 | metric_weight = np.zeros((len(POD_METRIC_FILE), 1))
125 | metric_id = 0
126 | final_root_results = {}
127 |
128 | for metric in metric_names:
129 | print('For metric:', metric)
130 | data = metric_data[metric]
131 | X = data[label]['Sequence']
132 | index = index_data[metric]
133 | # Preprocessing to reduce the redundant samples
134 | if X.shape[0] // 100 < 100:
135 | patch = 20
136 | else:
137 | patch = 100
138 | sample = X.shape[0] // patch
139 | X = X[:patch * sample, :]
140 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1)
141 | X_metric = X[:, index]
142 | X_metric = preprocessing.normalize(X_metric, axis=0, norm='l1')
143 | X = np.append(X_metric, X[:, -1].reshape(-1, 1), axis=1)
144 | columns = list(columns_common) + data[label]['KPI_Feature']
145 |
146 | std = np.std(X[:, :-1], axis=0)
147 | idx_std = [i for i, x in enumerate(std > 1e-5) if x]
148 | if len(idx_std) == 0:
149 | metric_weight[metric_id] = 0
150 | metric_id = metric_id + 1
151 | print(metric, ' all pods are all constant or quasi-constant')
152 | continue
153 |
154 | selector = VarianceThreshold(threshold=0)
155 | X_var = selector.fit_transform(X[:, :-1])
156 | idx = selector.get_support(indices=True)
157 | # print('X shape after variance: ', X_var.shape)
158 | if X_var.shape[1] < 1:
159 | metric_weight[metric_id] = 0
160 | metric_id = metric_id + 1
161 | print(metric, ' all pods are all constant or quasi-constant')
162 | continue
163 |
164 | mask = np.full(len(columns_common), False, dtype=bool)
165 | mask[idx] = True
166 | idx = list(idx) + [X.shape[1] - 1]
167 | X = X[:, idx]
168 | columns = [columns[i] for i in idx]
169 | X = pd.DataFrame(X, columns=columns)
170 | if model_name == 'circa':
171 | sm = from_pandas(X)
172 | estimated_matrix = nx.to_pandas_adjacency(sm)
173 | quantile_value = np.quantile(estimated_matrix.values.flatten(), 0.95)
174 | estimated_matrix = (estimated_matrix > quantile_value).astype(int)
175 | estimated_matrix = remove_cycles_from_adjacency_matrix(estimated_matrix)
176 | # estimated_matrix.to_csv("{}_adjacency.csv".format(metric))
177 |
178 | X_train, X_test = train_test_split(X, test_size=0.6, shuffle=False)
179 |
180 | X.insert(0, 'time', pd.date_range(start='2024-01-01', periods=len(X), freq='D'))
181 |
182 | X['time'] = X['time'].astype('int64') // 1_000_000_000
183 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)]
184 |
185 |
186 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min())
187 |
188 | if model_name == 'rcd':
189 | model = RCD(config=RCDConfig(k=3,alpha_limit=0.5))
190 | results = model.find_root_causes(X_train, X_test).to_list()
191 | print(results)
192 | elif model_name == 'circa':
193 | model = HT(config=HTConfig(graph=estimated_matrix, root_cause_top_k=10))
194 | model.train(X_train)
195 | results = model.find_root_causes(X_test, metric_data[metric][label]['KPI_Feature'][0], True).to_list()
196 | elif model_name == 'epsilon_diagnosis':
197 | model = EpsilonDiagnosis(config=EpsilonDiagnosisConfig(root_cause_top_k=10))
198 | model.train(X)
199 | results = model.find_root_causes(X).to_list()
200 | else:
201 | raise ValueError('Invalid model_name')
202 |
203 | root_causes = []
204 | for result in results:
205 | root_causes.append([result['root_cause'], result['score']])
206 | if not os.path.exists('./{}_results'.format(model_name)):
207 | os.mkdir('./{}_results'.format(model_name))
208 | if not os.path.exists('./{}_results/{}'.format(model_name, data_name)):
209 | os.mkdir('./{}_results/{}'.format(model_name, data_name))
210 |
211 | root_causes = pd.DataFrame(root_causes)
212 | root_causes.columns = [['root_cause', 'score']]
213 | root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(model_name, data_name, metric, model_name, data_name),
214 | index=False)
215 | final_root_results[metric] = root_causes
216 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True)
217 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(model_name, data_name), index=False)
218 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(model_name, data_name))
219 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index()
220 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False)
221 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(model_name, data_name, model_name, data_name), index=False)
222 |
223 |
224 | if __name__ == '__main__':
225 | parser = argparse.ArgumentParser(description='Baro')
226 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset")
227 | parser.add_argument("-model", type=str, default='rcd', help="model name, [rcd, circa, epsilon_diagnosis], default is rcd")
228 | parser.set_defaults(validation=True)
229 | args = parser.parse_args()
230 | main(args)
231 |
232 |
233 |
234 |
235 |
236 |
237 |
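The final aggregation above sums each candidate's scores across metrics and ranks them; a toy illustration of that groupby step (the pod names are made up):

```
import pandas as pd

# Per-metric root-cause scores for the same pods, concatenated as above.
concatenated_df = pd.DataFrame({
    'root_cause': ['pod-a', 'pod-b', 'pod-a', 'pod-c'],
    'score':      [0.6,      0.3,     0.2,     0.1],
})
aggregated_df = (concatenated_df.groupby('root_cause')['score'].sum()
                 .reset_index()
                 .sort_values(by='score', ascending=False))
print(aggregated_df)  # pod-a 0.8, pod-b 0.3, pod-c 0.1
```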
--------------------------------------------------------------------------------
/Baseline/metric_only/baro_main_metric.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn import preprocessing
4 | import os
5 | from sklearn.feature_selection import VarianceThreshold
6 | from baro_algorithm import bocpd, robust_scorer
7 | import networkx as nx
8 | import argparse
9 |
10 |
11 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame:
12 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph())
13 | while True:
14 | try:
15 | cycle = nx.find_cycle(G, orientation='original')
16 | G.remove_edge(*cycle[0][:2])
17 |
18 | except nx.NetworkXNoCycle:
19 | break
20 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int)
21 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix)
22 |
23 | print("Now, the adjacency matrix does not have cycles.")
24 | return adj_matrix_no_cycles
25 |
26 |
27 | def main(args):
28 | metric_data = {}
29 | columns_common = {}
30 | method = 'baro'
31 | data_name = args.case
32 | metric_path = '../data/{}'.format(data_name)
33 | if data_name == '20220606':
34 | label = 'reviews-v3'
35 | elif data_name == '20210517' or data_name == '20210524':
36 | label = 'Book_Info_product'
37 | elif data_name == '20211203':
38 | label = 'ratings.book-info.svc.cluster.local:9080/*'
39 | elif data_name == '20240215':
40 | label = 'pod usage'
41 | elif data_name == '20240124':
42 | label = 'scenario8_app_request'
43 | elif data_name == '20231207':
44 | label = 'book_info'
45 | elif data_name == '20231221':
46 | label = 'book_info'
47 | elif data_name == '20240115':
48 | label = 'book_info'
49 | else:
50 | raise ValueError('Invalid data_name')
51 |
52 |     if data_name in ['20220606', '20210517', '20210524', '20211203']:  # 20231207 is handled by its own branch below
53 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1,
54 | 'received_bandwidth': 1, 'transmit_bandwidth': 1}
55 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
56 | elif data_name in ['20231207']:
57 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1,
58 | 'received_bandwidth': 1, 'transmit_bandwidth': 1}
59 | log_label = 'book_info'
60 | elif data_name in ['20240124']:
61 | POD_METRIC_FILE = {'cpu_usage': 1, 'disk_used': 1, 'diskio_reads': 1, 'diskio_writes': 1, 'memory_used': 1,
62 | 'netstat_established': 1, 'swap_used': 1}
63 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
64 | elif data_name in ['20240215']:
65 | POD_METRIC_FILE = {'pod_cpu_limit': 1, 'pod_cpu_usage_total': 1, 'pod_cpu_utilization_over_pod_limit': 1, 'pod_memory_limit': 1,
66 | 'pod_memory_utilization_over_pod_limit': 1, 'pod_memory_working_set': 1, 'pod_network_rx_bytes': 1, 'pod_network_tx_bytes':1}
67 | log_label = 'book_info'
68 | elif data_name in ['20240115']:
69 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1,
70 | 'rate_storage_iops': 1, 'received_bandwidth': 1, 'transmit_bandwidth': 1}
71 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
72 | elif data_name in ['20231221']:
73 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1,
74 | 'received_bandwidth': 1, 'transmit_bandwidth': 1}
75 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
76 | else:
77 | raise ValueError('Invalid data_name')
78 | model_name = 'baro'
79 |
80 | pathset = "./output/"
81 | if not(os.path.exists(pathset)):
82 | os.mkdir(pathset)
83 |
84 | for metric, weight in POD_METRIC_FILE.items():
85 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']:
86 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric)
87 | metric_data[metric] = np.load(metric_file,allow_pickle=True).item()
88 | if len(metric_data[metric].keys()) == 1:
89 | if log_label != label:
90 | metric_data[metric][label] = metric_data[metric][log_label]
91 | del metric_data[metric][log_label]
92 | else:
93 | metric_data[metric][label] = metric_data[metric]
94 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name']
95 | del metric_data[metric][label]['Node_Name']
96 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T
97 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']]
98 | if columns_common:
99 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common))
100 | else:
101 | columns_common = list(metric_data[metric][label]['Pod_Name'])
102 | else:
103 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric)
104 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item()
105 | if columns_common:
106 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common))
107 | else:
108 | columns_common = list(metric_data[metric][label]['Pod_Name'])
109 |
110 |
111 | index_data = {}
112 | metric_names = []
113 | metric_weight_assigned = []
114 | for metric, weight in POD_METRIC_FILE.items():
115 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common]
116 | metric_names = metric_names + [metric]
117 | metric_weight_assigned = metric_weight_assigned + [weight]
118 |
119 | metric_weight = np.zeros((len(POD_METRIC_FILE),1))
120 | metric_id = 0
121 | final_root_results = {}
122 |
123 | for metric in metric_names:
124 | print('For metric:', metric)
125 | data = metric_data[metric]
126 | X = data[label]['Sequence']
127 | index = index_data[metric]
128 |
129 | # Preprocessing to reduce the redundant samples
130 | if X.shape[0] // 100 < 100:
131 | patch = 20
132 | else:
133 | patch = 100
134 | sample = X.shape[0]//patch
135 | X = X[:patch*sample,:]
136 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1)
137 | X_metric = X[:, index]
138 | X_metric = preprocessing.normalize(X_metric, axis=0, norm = 'l1')
139 | X = np.append(X_metric, X[:, -1].reshape(-1,1), axis=1)
140 | columns = list(columns_common) + data[label]['KPI_Feature']
141 |
142 |
143 | std = np.std(X[:, :-1], axis=0)
144 | idx_std = [i for i, x in enumerate(std > 1e-5) if x]
145 | if len(idx_std) == 0:
146 | metric_weight[metric_id] = 0
147 | metric_id = metric_id + 1
148 | print(metric,' all pods are all constant or quasi-constant')
149 | continue
150 |
151 | selector = VarianceThreshold(threshold = 0)
152 | X_var = selector.fit_transform(X[:, :-1])
153 | idx = selector.get_support(indices = True)
154 | #print('X shape after variance: ', X_var.shape)
155 | if X_var.shape[1] < 1:
156 | metric_weight[metric_id] = 0
157 | metric_id = metric_id + 1
158 | print(metric,' all pods are all constant or quasi-constant')
159 | continue
160 |
161 | # causal_score = np.zeros(len(columns_common))
162 | mask = np.full(len(columns_common), False,dtype=bool)
163 | mask[idx] = True
164 | idx = list(idx) + [X.shape[1]-1]
165 | X = X[:, idx]
166 | columns = [columns[i] for i in idx]
167 | X = pd.DataFrame(X,columns=columns)
168 |
169 | X.insert(0,'time', pd.date_range(start='2024-01-01', periods=len(X), freq='D'))
170 |
171 | X['time'] = X['time'].astype('int64') // 1_000_000_000
172 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)]
173 |
174 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min())
175 |
176 | anomalies = bocpd(X)
177 | print("Anomalies are detected at timestep:", anomalies[0])
178 | results = robust_scorer(X,anomalies=anomalies)
179 | print(results)
180 |
181 | root_causes = []
182 | for result in results:
183 | (root_cause, score) = result
184 | root_causes.append([root_cause, score])
185 | if not os.path.exists('./{}_results'.format(method)):
186 | os.mkdir('./{}_results'.format(method))
187 | if not os.path.exists('./{}_results/{}'.format(method, data_name)):
188 | os.mkdir('./{}_results/{}'.format(method, data_name))
189 |
190 | root_causes = pd.DataFrame(root_causes)
191 | root_causes.columns = [['root_cause', 'score']]
192 | root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(method, data_name, metric, model_name, data_name),index=False)
193 |
194 | final_root_results[metric] = root_causes
195 |
196 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True)
197 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(method, data_name), index=False)
198 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(method, data_name))
199 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index()
200 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False)
201 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(method, data_name, model_name, data_name), index=False)
202 |
203 |
204 | if __name__ == '__main__':
205 | parser = argparse.ArgumentParser(description='Baro')
206 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset")
207 | parser.set_defaults(validation=True)
208 | args = parser.parse_args()
209 | main(args)
210 |
--------------------------------------------------------------------------------
/Baseline/multimodal/baro_main_combined.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn import preprocessing
4 | import os
5 | from sklearn.feature_selection import VarianceThreshold
6 | from baro_algorithm import bocpd, robust_scorer
7 | import networkx as nx
8 | import argparse
9 |
10 |
11 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame:
12 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph())
13 | while True:
14 | try:
15 | cycle = nx.find_cycle(G, orientation='original')
16 | G.remove_edge(*cycle[0][:2])
17 |
18 | except nx.NetworkXNoCycle:
19 | break
20 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int)
21 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix)
22 |
23 | print("Now, the adjacency matrix does not have cycles.")
24 | return adj_matrix_no_cycles
25 |
26 |
27 | def main(args):
28 | metric_data = {}
29 | columns_common = {}
30 | method = 'baro'
31 | data_name = args.case
32 | metric_path = '../data/{}'.format(data_name)
33 | if data_name == '20220606':
34 | label = 'reviews-v3'
35 | elif data_name == '20210517' or data_name == '20210524':
36 | label = 'Book_Info_product'
37 | elif data_name == '20211203':
38 | label = 'ratings.book-info.svc.cluster.local:9080/*'
39 | elif data_name == '20240215':
40 | label = 'pod usage'
41 | elif data_name == '20240124':
42 | label = 'scenario8_app_request'
43 | elif data_name == '20231207':
44 | label = 'book_info'
45 | elif data_name == '20231221':
46 | label = 'book_info'
47 | elif data_name == '20240115':
48 | label = 'book_info'
49 | else:
50 | raise ValueError('Invalid data_name')
51 |
52 |     if data_name in ['20220606', '20210517', '20210524', '20211203']:  # 20231207 is handled by its own branch below
53 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1, 'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1,
54 | 'received_bandwidth': 1, 'transmit_bandwidth': 1}
55 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
56 | elif data_name in ['20231207']:
57 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1,
58 | 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1,
59 | 'log_frequency': 1}
60 | log_label = 'book_info'
61 | elif data_name in ['20240124']:
62 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1, 'cpu_usage': 1, 'disk_used': 1, 'diskio_reads': 1, 'diskio_writes': 1, 'memory_used': 1,
63 | 'netstat_established': 1, 'swap_used': 1}
64 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
65 | elif data_name in ['20240215']:
66 | POD_METRIC_FILE = {'pod_cpu_limit': 1, 'pod_cpu_usage_total': 1, 'pod_cpu_utilization_over_pod_limit': 1, 'pod_memory_limit': 1,
67 | 'pod_memory_utilization_over_pod_limit': 1, 'pod_memory_working_set': 1, 'pod_network_rx_bytes': 1, 'pod_network_tx_bytes':1,
68 | 'log_golden_signal': 1, 'log_frequency': 1}
69 | log_label = 'book_info'
70 | elif data_name in ['20240115']:
71 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1,
72 | 'rate_storage_iops': 1, 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1,
73 | 'log_frequency': 1}
74 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
75 | elif data_name in ['20231221']:
76 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1,
77 | 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 'log_frequency': 1}
78 | log_label = 'ratings.book-info.svc.cluster.local:9080/*'
79 | else:
80 | raise ValueError('Invalid data_name')
81 | model_name = 'baro'
82 |
83 | pathset = "./output/"
84 | if not(os.path.exists(pathset)):
85 | os.mkdir(pathset)
86 |
87 | for metric, weight in POD_METRIC_FILE.items():
88 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']:
89 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric)
90 | metric_data[metric] = np.load(metric_file,allow_pickle=True).item()
91 | if len(metric_data[metric].keys()) == 1:
92 | if log_label != label:
93 | metric_data[metric][label] = metric_data[metric][log_label]
94 | del metric_data[metric][log_label]
95 | else:
96 | metric_data[metric][label] = metric_data[metric]
97 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name']
98 | del metric_data[metric][label]['Node_Name']
99 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T
100 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']]
101 | if columns_common:
102 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common))
103 | else:
104 | columns_common = list(metric_data[metric][label]['Pod_Name'])
105 | else:
106 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric)
107 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item()
108 | if columns_common:
109 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common))
110 | else:
111 | columns_common = list(metric_data[metric][label]['Pod_Name'])
112 |
113 |
114 | index_data = {}
115 | metric_names = []
116 | metric_weight_assigned = []
117 | for metric, weight in POD_METRIC_FILE.items():
118 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common]
119 | metric_names = metric_names + [metric]
120 | metric_weight_assigned = metric_weight_assigned + [weight]
121 |
122 | metric_weight = np.zeros((len(POD_METRIC_FILE),1))
123 | metric_id = 0
124 | final_root_results = {}
125 |
126 | for metric in metric_names:
127 | print('For metric:', metric)
128 | data = metric_data[metric]
129 | X = data[label]['Sequence']
130 | index = index_data[metric]
131 |
132 |         # Downsample: sum every `patch` consecutive samples to shorten the sequence and reduce redundant samples
133 | if X.shape[0] // 100 < 100:
134 | patch = 20
135 | else:
136 | patch = 100
137 | sample = X.shape[0]//patch
138 | X = X[:patch*sample,:]
139 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1)
140 | X_metric = X[:, index]
141 | X_metric = preprocessing.normalize(X_metric, axis=0, norm = 'l1')
142 | X = np.append(X_metric, X[:, -1].reshape(-1,1), axis=1)
143 | columns = list(columns_common) + data[label]['KPI_Feature']
144 |
145 |
146 | std = np.std(X[:, :-1], axis=0)
147 | idx_std = [i for i, x in enumerate(std > 1e-5) if x]
148 | if len(idx_std) == 0:
149 | metric_weight[metric_id] = 0
150 | metric_id = metric_id + 1
151 |             print(metric, ': all pods are constant or quasi-constant')
152 | continue
153 |
154 | selector = VarianceThreshold(threshold = 0)
155 | X_var = selector.fit_transform(X[:, :-1])
156 | idx = selector.get_support(indices = True)
157 | #print('X shape after variance: ', X_var.shape)
158 | if X_var.shape[1] < 1:
159 | metric_weight[metric_id] = 0
160 | metric_id = metric_id + 1
161 |             print(metric, ': all pods are constant or quasi-constant')
162 | continue
163 |
164 | # causal_score = np.zeros(len(columns_common))
165 | mask = np.full(len(columns_common), False,dtype=bool)
166 | mask[idx] = True
167 | idx = list(idx) + [X.shape[1]-1]
168 | X = X[:, idx]
169 | columns = [columns[i] for i in idx]
170 | X = pd.DataFrame(X,columns=columns)
171 |
172 |         X.insert(0, 'time', pd.date_range(start='2024-01-01', periods=len(X), freq='D'))  # synthetic timestamps as the leading column
173 |
174 |         X['time'] = X['time'].astype('int64') // 1_000_000_000  # convert timestamps to Unix seconds
175 |         X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)]  # suffix the first 9 columns with '_cpu' and the rest with '_memory'
176 |
177 |         X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min())  # min-max scale every column except the timestamp
178 |
179 |         anomalies = bocpd(X)  # Bayesian online change point detection over the multivariate series
180 |         print("Anomaly detected at timestep:", anomalies[0])
181 |         results = robust_scorer(X, anomalies=anomalies)  # rank candidate root causes around the detected change point
182 |         print(results)
183 |
184 | root_causes = []
185 | for result in results:
186 | (root_cause, score) = result
187 | root_causes.append([root_cause, score])
188 | if not os.path.exists('./{}_results'.format(method)):
189 | os.mkdir('./{}_results'.format(method))
190 | if not os.path.exists('./{}_results/{}'.format(method, data_name)):
191 | os.mkdir('./{}_results/{}'.format(method, data_name))
192 |
193 | root_causes = pd.DataFrame(root_causes)
194 |         root_causes.columns = ['root_cause', 'score']
195 | root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(method, data_name, metric, model_name, data_name),index=False)
196 |
197 | final_root_results[metric] = root_causes
198 |
199 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True)
200 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(method, data_name), index=False)
201 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(method, data_name))
202 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index()
203 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False)
204 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(method, data_name, model_name, data_name), index=False)
205 |
206 |
207 | if __name__ == '__main__':
208 | parser = argparse.ArgumentParser(description='Baro')
209 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset")
210 | parser.set_defaults(validation=True)
211 | args = parser.parse_args()
212 | main(args)
213 |
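214 | # Example invocation (a sketch; the script name and the ../data/<case>/ layout of the
215 | # preprocessed pod-level .npy files are assumptions based on the repository structure):
216 | #   python baro_main_combined.py -case 20240115
217 | # Per-metric rankings are written to ./baro_results/<case>/ and the score-aggregated
218 | # ranking to ./baro_results/<case>/final_baro_<case>_root_cause.csv.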
--------------------------------------------------------------------------------
/Crossiant_Metadata/Crossiant_Metadata_Cloud_Computing_Original.json:
--------------------------------------------------------------------------------
1 | {
2 | "_id": "6664dbf513d2f73a727d47ff",
3 | "id": "Lemma-RCA-NEC/Cloud_Computing_Original",
4 | "author": "Lemma-RCA-NEC",
5 | "sha": "78ec9604fd0446d875175650c99acf30c95158c2",
6 | "lastModified": "2024-06-09T03:19:45.000Z",
7 | "private": false,
8 | "gated": false,
9 | "disabled": false,
10 | "tags": [
11 | "task_categories:time-series-forecasting",
12 | "size_categories:100M"
12 | DRAIN:
13 | # Similarity threshold
14 | sim_th: 0.4
15 | # Depth of all leaf nodes
16 | depth: 4
17 | max_children: 100
18 | max_clusters: 1024
19 | #extra_delimiters: ["_"]
20 | extra_delimiters: "[]"
21 | PROFILING:
22 | enabled: False
23 | report_sec: 30
24 | EXTRA:
25 | input_file_name:
26 | - "*messages"
27 | out_dir: "./drain32_result"
28 | log_format: "