├── .DS_Store ├── Baseline ├── .DS_Store ├── .idea │ ├── .gitignore │ ├── Baseline.iml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── modules.xml │ └── vcs.xml ├── FastPC │ ├── baseline_evaluation.py │ ├── fastPC.py │ ├── hannlstm.py │ ├── interdependent.py │ ├── lbfgsb_scipy.py │ ├── libspot.so │ ├── pyspot.py │ ├── rca.py │ ├── test_FastPC_node_metric.py │ ├── test_FastPC_pod_combine.py │ ├── test_FastPC_pod_log.py │ ├── test_FastPC_pod_metric.py │ └── trace_expm.py ├── Nezha │ ├── 20240124 │ │ ├── 20240124-fault_list.json │ │ └── root_cause_hipster.json │ ├── INSTALL.md │ ├── LICENSE │ ├── alarm.py │ ├── data_integrate.py │ ├── data_parser.py │ ├── log.py │ ├── log_parsing.py │ ├── main.py │ ├── pattern_miner.py │ ├── pattern_ranker.py │ └── requirements.txt ├── Readme.md ├── SWAT&WADI │ ├── .DS_Store │ ├── baro-evaluation.py │ ├── baro.py │ ├── baro_algorithm.py │ ├── pyrca-evaluation.py │ └── pyrca-main.py ├── log_only │ ├── RCA_methods_log.py │ ├── baro_algorithm.py │ └── baro_main_log.py ├── metric_only │ ├── RCA_methods_metric.py │ ├── baro_algorithm.py │ └── baro_main_metric.py └── multimodal │ ├── RCA_methods_combined.py │ ├── baro_algorithm.py │ └── baro_main_combined.py ├── Crossiant_Metadata ├── Crossiant_Metadata_Cloud_Computing_Original.json ├── Crossiant_Metadata_Cloud_Computing_Preprocessed.json ├── Crossiant_Metadata_Product_Review_Original.json └── Crossiant_Metadata_Product_Review_Preprocessed.json ├── IT └── data preprocessing │ ├── Drain.py │ ├── JMeter_KPI.py │ ├── README.md │ ├── drain3.yaml │ ├── drain3_parse.py │ ├── json2message.py │ ├── log_PCA_extraction.py │ ├── log_frequency_extraction.py │ ├── log_golden_frequency.py │ └── metric_json2npy.py ├── LICENSE ├── OT └── data_preprocessing │ ├── Readme.md │ ├── SWaT │ ├── data_segment.py │ ├── node_data_cut.py │ ├── node_final_process.py │ ├── pod_data_cut.py │ ├── pod_final_process.py │ └── process.sh │ └── WADI │ ├── data_segment.py │ ├── node_data_cut.py │ ├── node_final_process.py │ ├── pod_data_cut.py │ ├── pod_final_process.py │ └── process.sh ├── Other ├── bg.png └── rca_update.png └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KnowledgeDiscovery/rca_baselines/c560d8cc39c19f04c9ae74fac2404b1e5e8c3b4d/.DS_Store -------------------------------------------------------------------------------- /Baseline/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KnowledgeDiscovery/rca_baselines/c560d8cc39c19f04c9ae74fac2404b1e5e8c3b4d/Baseline/.DS_Store -------------------------------------------------------------------------------- /Baseline/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /Baseline/.idea/Baseline.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /Baseline/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 
-------------------------------------------------------------------------------- /Baseline/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /Baseline/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Baseline/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Baseline/FastPC/baseline_evaluation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | folders = ['0606','0517','0524','0901','1203'] 4 | predicts = [] 5 | for fd in folders: 6 | pods_data = pd.read_csv(fd+'/output/Pod_level_combine_ranking.csv') 7 | pods = list(pods_data['pod']) 8 | # pods = [x.split('_')[1] for x in pods] 9 | predicts.append(pods) 10 | 11 | k = [1,3,5,7,10] 12 | 13 | def precision_on_topk(predicts,reals,k): 14 | pr = 0 15 | for pred, real in zip(predicts, reals): 16 | pred = pred[:k] 17 | hit_count = len(set(pred) & set(real)) 18 | min_len = min(k,len(real)) 19 | pr += hit_count/min_len 20 | return pr/len(reals) 21 | 22 | def mean_precision_k(predicts,reals,k): 23 | pr = 0 24 | for i in range(1,k+1): 25 | pr += precision_on_topk(predicts,reals,i) 26 | return pr/k 27 | 28 | def mrr(predicts,reals): 29 | mrr_val = 0 30 | for preds,real in zip(predicts,reals): 31 | tmp = [] 32 | for real_item in real: 33 | index = preds.index(real_item) if real_item in preds else sys.maxsize-1 34 | tmp.append(index+1) 35 | mrr_val += 1/min(tmp) 36 | return mrr_val/len(reals) 37 | 38 | reals = [['productpage-v1-5f9dbcd669-z2prs'], 39 | ['catalogue-8667bb6cbc-hqzfw'], 40 | ['catalogue-85fd4965b7-q8477'], 41 | ['catalogue-6c7b9b975-xfjps'], 42 | ['mongodb-v1-64c6b69879-p4wfp']] 43 | 44 | for item in k: 45 | pr = precision_on_topk(predicts,reals,item) 46 | map_val = mean_precision_k(predicts,reals,item) 47 | mrr_val = mrr(predicts,reals) 48 | print("pr@{}:{} map@{}:{} mrr:{}".format(item,pr,item,map_val,mrr_val)) 49 | -------------------------------------------------------------------------------- /Baseline/FastPC/interdependent.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import argparse 4 | from sklearn import preprocessing 5 | 6 | 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--input', type=str, default='inrc') 11 | 12 | args = parser.parse_args() 13 | 14 | node_file = './0517/output/'+args.input + '_node_all.npy' 15 | pod_file = './0517/output/'+args.input + '_pod_all.npy' 16 | mp_file = '/nfs/users/zach/aiops_data/data/0517/p2n.npy' 17 | 18 | node_data = np.load(node_file, allow_pickle=True).item() 19 | pod_data = np.load(pod_file, allow_pickle=True).item() 20 | mp_data = np.load(mp_file, allow_pickle=True).item() 21 | 22 | pod_names = pod_data['columns'] 23 | node_names = node_data['columns'] 24 | 25 | pod_scores = pod_data['score'] 26 | node_scores = node_data['score'] 27 | 28 | p2s = dict(zip(pod_names, pod_scores)) 29 | n2s = dict(zip(node_names, node_scores)) 30 | 31 | ctotal = 0 32 | del_keys = [] 33 | for p in p2s: 34 | if p not in mp_data: 35 | del_keys.append(p) 36 | 
continue 37 | node = mp_data[p] 38 | p2s[p] = p2s[p] * n2s[node] 39 | ctotal += p2s[p] 40 | 41 | for k in del_keys: 42 | p2s.pop(k) 43 | 44 | fd = {} 45 | for p in p2s: 46 | fd[p] = [p2s[p] / ctotal] 47 | 48 | 49 | scores = pd.DataFrame.from_dict(fd, orient='index', columns=['ranking_score']) 50 | 51 | ranking_score = scores.reset_index(drop=True).to_numpy().reshape(-1) 52 | ranking_score = preprocessing.normalize([ranking_score]).ravel() 53 | #print(ranking_score) 54 | columns = list(scores.index) 55 | 56 | #scores = scores.sort_values(by='ranking_score', ascending=False) 57 | ranking = np.argsort(ranking_score)[::-1] 58 | 59 | K= len(ranking_score) 60 | #results_combine = {} 61 | 62 | results_combine = pd.DataFrame() 63 | results_combine['ranking'] = [i+1 for i in range(K)] 64 | #results_combine = pd.DataFrame(results_combine, columns = ['ranking']) 65 | results_combine ['pod'] = [columns[ranking[i]] for i in range(K)] 66 | results_combine ['score'] = [ranking_score[ranking[i]] for i in range(K)] 67 | results_combine.to_csv('./0517/output/'+ args.input + '_hierarchical_ranking_metrics.csv') 68 | print(results_combine) 69 | print('Successfully output the root cause results with considering both node level and pod level') 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /Baseline/FastPC/lbfgsb_scipy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import scipy.optimize as sopt 3 | 4 | 5 | class LBFGSBScipy(torch.optim.Optimizer): 6 | """Wrap L-BFGS-B algorithm, using scipy routines. 7 | """ 8 | 9 | def __init__(self, params): 10 | defaults = dict() 11 | super(LBFGSBScipy, self).__init__(params, defaults) 12 | 13 | if len(self.param_groups) != 1: 14 | raise ValueError("LBFGSBScipy doesn't support per-parameter options" 15 | " (parameter groups)") 16 | 17 | self._params = self.param_groups[0]['params'] 18 | self._numel = sum([p.numel() for p in self._params]) 19 | 20 | def _gather_flat_grad(self): 21 | views = [] 22 | for p in self._params: 23 | if p.grad is None: 24 | view = p.data.new(p.data.numel()).zero_() 25 | elif p.grad.data.is_sparse: 26 | view = p.grad.data.to_dense().view(-1) 27 | else: 28 | view = p.grad.data.view(-1) 29 | views.append(view) 30 | return torch.cat(views, 0) 31 | 32 | def _gather_flat_bounds(self): 33 | bounds = [] 34 | for p in self._params: 35 | if hasattr(p, 'bounds'): 36 | b = p.bounds 37 | else: 38 | b = [(None, None)] * p.numel() 39 | bounds += b 40 | return bounds 41 | 42 | def _gather_flat_params(self): 43 | views = [] 44 | for p in self._params: 45 | if p.data.is_sparse: 46 | view = p.data.to_dense().view(-1) 47 | else: 48 | view = p.data.view(-1) 49 | views.append(view) 50 | return torch.cat(views, 0) 51 | 52 | def _distribute_flat_params(self, params): 53 | offset = 0 54 | for p in self._params: 55 | numel = p.numel() 56 | # view as to avoid deprecated pointwise semantics 57 | p.data = params[offset:offset + numel].view_as(p.data) 58 | offset += numel 59 | assert offset == self._numel 60 | 61 | def step(self, closure): 62 | """Performs a single optimization step. 63 | 64 | Arguments: 65 | closure (callable): A closure that reevaluates the model 66 | and returns the loss. 
67 | """ 68 | assert len(self.param_groups) == 1 69 | 70 | def wrapped_closure(flat_params): 71 | """closure must call zero_grad() and backward()""" 72 | flat_params = torch.from_numpy(flat_params) 73 | flat_params = flat_params.to(torch.get_default_dtype()) 74 | self._distribute_flat_params(flat_params) 75 | loss = closure() 76 | loss = loss.item() 77 | flat_grad = self._gather_flat_grad().cpu().detach().numpy() 78 | return loss, flat_grad.astype('float64') 79 | 80 | initial_params = self._gather_flat_params() 81 | initial_params = initial_params.cpu().detach().numpy() 82 | 83 | bounds = self._gather_flat_bounds() 84 | 85 | # Magic 86 | sol = sopt.minimize(wrapped_closure, 87 | initial_params, 88 | method='L-BFGS-B', 89 | jac=True, 90 | bounds=bounds) 91 | 92 | final_params = torch.from_numpy(sol.x) 93 | final_params = final_params.to(torch.get_default_dtype()) 94 | self._distribute_flat_params(final_params) 95 | 96 | 97 | def main(): 98 | import torch.nn as nn 99 | # torch.set_default_dtype(torch.double) 100 | 101 | n, d, out, j = 10000, 3000, 10, 0 102 | input = torch.randn(n, d) 103 | w_true = torch.rand(d, out) 104 | w_true[j, :] = 0 105 | target = torch.matmul(input, w_true) 106 | linear = nn.Linear(d, out) 107 | linear.weight.bounds = [(0, None)] * d * out # hack 108 | for m in range(out): 109 | linear.weight.bounds[m * d + j] = (0, 0) 110 | criterion = nn.MSELoss() 111 | optimizer = LBFGSBScipy(linear.parameters()) 112 | print(list(linear.parameters())) 113 | 114 | def closure(): 115 | optimizer.zero_grad() 116 | output = linear(input) 117 | loss = criterion(output, target) 118 | print('loss:', loss.item()) 119 | loss.backward() 120 | return loss 121 | optimizer.step(closure) 122 | print(list(linear.parameters())) 123 | print(w_true.t()) 124 | 125 | 126 | if __name__ == '__main__': 127 | main() 128 | 129 | -------------------------------------------------------------------------------- /Baseline/FastPC/libspot.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KnowledgeDiscovery/rca_baselines/c560d8cc39c19f04c9ae74fac2404b1e5e8c3b4d/Baseline/FastPC/libspot.so -------------------------------------------------------------------------------- /Baseline/FastPC/rca.py: -------------------------------------------------------------------------------- 1 | # import pyspot as ps 2 | import numpy as np 3 | import pandas as pd 4 | import networkx as nx 5 | from causalnex.structure import dynotears 6 | from causalnex.structure.dynotears import from_pandas_dynamic 7 | from pyspot import DSpot, Spot 8 | from pingouin import partial_corr 9 | import torch 10 | from hannlstm import hannLSTM, train_model_pgd 11 | from fastPC import Fast_PC_Causal_Graph 12 | import scipy 13 | from numpy.linalg import norm, inv 14 | from sklearn import preprocessing 15 | 16 | 17 | def optENMFSoft( A, P, M, c, tau, max_iter=100): 18 | 19 | n = A.shape[0] 20 | B = (1-c) * inv(np.eye(n) - c * A) 21 | BB = B.transpose() @ B 22 | 23 | t = 1e-30 24 | 25 | e = np.ones((n, 1)) 26 | s = scipy.special.softmax(B @ e) 27 | obj = norm((s @ s.transpose()) * M, 'fro') ** 2 + tau * norm(e, 1) 28 | obj_old = obj 29 | err = 1 30 | iter = 0 31 | 32 | # maxIter = 1000 33 | errorV=[] 34 | 35 | while (err > t) and (iter < max_iter): 36 | s=scipy.special.softmax(B @ e) 37 | phi=np.diag(s) - s @ s.transpose() 38 | 39 | numerator = 4*(B.transpose() @ phi) @ (P*M) @ s 40 | # print(numerator) 41 | numerator[numerator<0]=0 42 | denominator = 4 * B.transpose() @ 
((phi@s@s.transpose())*M)@s+ tau * np.ones((n,1)) 43 | e=e * np.sqrt(np.sqrt(numerator/denominator)) 44 | # print(e) 45 | # %err=norm(e-e_old,'fro') 46 | obj=norm((s@s.transpose())*M - P,'fro') ** 2 + tau * norm(e,1) 47 | err=np.abs(obj-obj_old) 48 | obj_old=obj 49 | iter = iter +1 50 | errorV.append(err) 51 | return e 52 | 53 | def spot_detection(X, d: int=10, q: float=1e-4, n_init:int=100, level:float=0.98)->np.ndarray: 54 | 55 | # X_mean = np.mean(X, axis=0) 56 | # X_std = np.std(X, axis=0) 57 | # X_std[X_std < 1e-3] = 1 58 | # X = (X - X_mean) / X_std 59 | 60 | nvar = X.shape[1] 61 | T = X.shape[0] 62 | score_list = [] 63 | for i in range(nvar): 64 | S = DSpot(d=d, q=q, n_init=n_init, level=level) 65 | score = [] 66 | for t in range(T): 67 | xt = X[t, i] 68 | event = S.step(xt) 69 | st = 0 70 | if t >= n_init: 71 | # up alert 72 | if event == 1: 73 | upper_threshold = S.status().to_dict()['z_up'] 74 | assert(xt >= upper_threshold) 75 | 76 | if upper_threshold == 0: 77 | upper_threshold = 0.0001 78 | 79 | st = (xt - upper_threshold) / upper_threshold 80 | # print('z_up is event!') 81 | # down alert 82 | if event == -1: 83 | lower_threshold = S.status().to_dict()['z_down'] 84 | assert(xt <= lower_threshold) 85 | 86 | if lower_threshold == 0: 87 | lower_threshold = 0.0001 88 | 89 | st = (lower_threshold - xt) / lower_threshold 90 | # print('z_down is event!') 91 | st = np.abs(st) 92 | score.append(st) 93 | score_list.append(score) 94 | np_score = np.array(score_list).transpose() 95 | return np_score 96 | 97 | def detect_individual_causal(X: np.ndarray, 98 | method:str='SPOT', 99 | args:dict={'d': 10, 'q': 1e-4, 'n_init': 100, 'level':0.98})->np.ndarray: 100 | if method == 'SPOT': 101 | d = args['d'] 102 | q = args['q'] 103 | n_init = args['n_init'] 104 | level = args['level'] 105 | score = spot_detection(X, d, q, n_init, level) 106 | return score 107 | 108 | 109 | # LSTM based method 110 | def lstm(X: np.ndarray, hidden: int, context: int, lam: float, lam_ridge: float, lr: float, max_iter: int, check_every: int, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')): 111 | # device = torch.device('cuda:0') 112 | 113 | X_np = torch.tensor(X[np.newaxis], dtype=torch.float64, device=device) 114 | hannlstm = hannLSTM(X.shape[-1], hidden=hidden).to(device=device) 115 | X_np = X_np.float() 116 | train_loss_list = train_model_pgd(hannlstm, X_np, context=context, lam=lam, lam_ridge=lam_ridge, lr=lr, max_iter=max_iter, check_every=check_every) 117 | W_est = hannlstm.GC(False).cpu().data.numpy() 118 | 119 | return W_est 120 | 121 | 122 | def generate_causal_graph(X: np.ndarray, 123 | method: str ='dynotears', 124 | args: dict = {'lag': 10, 125 | 'lambda_w': 1e-3, 126 | 'lambda_a': 1e-3, 127 | 'max_iter':30})->np.ndarray: 128 | if method == 'lstm': 129 | torch.set_default_tensor_type(torch.FloatTensor) 130 | hidden = args['hidden'] 131 | context = args['context'] 132 | lam = args['lam'] 133 | lam_ridge = args['lam_ridge'] 134 | lr = args['lr'] 135 | max_iter = args['max_iter'] 136 | check_every = args['check_every'] 137 | device = args['device'] 138 | W_est = lstm(X, hidden, context, lam, lam_ridge, lr, max_iter, check_every, device) 139 | elif method == 'dynotears': 140 | if 'columns' not in args: 141 | columns = ['V{}'.format(i) for i in range(X.shape[1])] 142 | else: 143 | columns = args['columns'] 144 | lag = args['lag'] 145 | lambda_w = args['lambda_w'] 146 | lambda_a = args['lambda_a'] 147 | max_iter = args['max_iter'] 148 | 149 | X_lag = np.roll(X,1,axis=0) 150 | for lag_o in 
range(2,lag+1): 151 | X_lag = np.hstack((X_lag,np.roll(X,lag_o, axis=0))) 152 | W_est = dynotears.from_numpy_dynamic(X, X_lag, lambda_w, lambda_a, max_iter) 153 | elif method == 'fastpc': 154 | W_est = Fast_PC_Causal_Graph(pd.DataFrame(X),alpha=10**-6,cuda=True) 155 | return W_est 156 | 157 | # generate transition matrix from weight matrix 158 | # W: W[i,j] i->j 159 | def generate_Q(X:np.ndarray, W:np.ndarray, RI:int, rho:float, columns: list=None): 160 | n = W.shape[0] 161 | if columns is None: 162 | columns=['V{}'.format(i) for i in range(n)] 163 | df = pd.DataFrame(X, index=[i for i in range(X.shape[0])], columns=columns) 164 | 165 | # parent nodes 166 | PAak = [columns[i] for i, x in enumerate(W[:, RI]) if (x == 1) and (i != RI)] 167 | vak = columns[RI] 168 | # PA = [[columns[j] for j, x in enumerate(W[:, i]) if x == 1] for i in range(n)] 169 | # PAak_minus = [[c for c in PAak if c!=columns[i]] for i in range(n)] 170 | 171 | # partial correlation 172 | Rpc = [] 173 | for i in range(n): 174 | if i == RI: 175 | Rpc.append(0) 176 | continue 177 | vi = columns[i] 178 | PAak_minus_i = [c for c in PAak if c!=columns[i]] 179 | PAi = [columns[j] for j, x in enumerate(W[:, i]) if (x == 1) and (i != j) and (RI != j)] 180 | covar = list(set(PAak_minus_i).union(set(PAi))) 181 | rdf = partial_corr(df, x=vak, y=vi, covar=covar) 182 | Rpc.append(np.abs(rdf.values[0, 1])) 183 | 184 | Q = np.zeros((n, n)) 185 | for i in range(n): 186 | P = 0 187 | for j in range(n): 188 | if i == j: 189 | continue 190 | # from result to cause 191 | if W[j][i] == 1: 192 | Q[i][j] = Rpc[j] 193 | # from cause to result: 194 | if W[i][j] == 0: 195 | Q[j][i] = rho * Rpc[i] 196 | # stay 197 | P = max(P, Q[i][j]) 198 | Q[i][i] = max(0., Rpc[i] - P) 199 | # normalize each row 200 | rsum = np.sum(Q, axis=1).reshape(-1 , 1) 201 | rsum[rsum==0] = 1 202 | Q = Q / rsum 203 | return Q 204 | 205 | # random walk with restart 206 | def propagate_error(Q:np.ndarray, start:int, steps:int=1000, rp:float=0.05, max_self:int=10)->np.ndarray: 207 | n = Q.shape[0] 208 | count = np.zeros(n) 209 | current = start 210 | self_visit = 0 211 | for step in range(steps): 212 | # print(current) 213 | # input() 214 | if np.random.rand() > rp: 215 | prob = Q[current, :] 216 | if np.sum(prob) != 1: 217 | continue 218 | next = np.random.choice(n, 1, p=prob)[0] 219 | # check if need a restart, get stuck in one node 220 | if next == current: 221 | self_visit += 1 222 | if self_visit == max_self: 223 | current = start 224 | self_visit = 0 225 | continue 226 | current = next 227 | count[current] += 1 228 | else: 229 | current = start 230 | return count 231 | 232 | if __name__ == '__main__': 233 | data = np.load('may_pod_level_data.npy', allow_pickle=True).item() 234 | label = 'Book_Info_product' 235 | X = data[label]['Sequence'][:48000, :] 236 | X = np.sum(X.reshape((-1, 100, X.shape[1])), axis=1) 237 | columns = data[label]['Pod_Name'] + data[label]['JMeter_Feature'] 238 | std = np.std(X, axis=0) 239 | idx = [i for i, x in enumerate(std > 1e-3) if x] 240 | # idx = list(range(30)) 241 | X = X[:, idx] 242 | columns = [columns[i] for i in idx] 243 | 244 | print('X shape: ', X.shape) 245 | 246 | print('Detecting Individual Causal ...') 247 | ind_casual_score = detect_individual_causal(X, method='SPOT', args={'d':10, 'q':1e-4, 'n_init':100, 'level':0.98}) 248 | ind_casual_score = np.sum(ind_casual_score, axis=0) 249 | normalized_ind_casual_score = ind_casual_score 250 | normalized_ind_casual_score[:-1] = preprocessing.normalize([ind_casual_score[:-1]]) 251 | 
print('Detecting Individual Causal Done!') 252 | 253 | # causal graph 254 | print('Generating Causal Graph ...') 255 | cg = generate_causal_graph(X, method='gnn', args={'lag': 20, 'lambda_w': 1e-3, 'lambda_a': 1e-2}) 256 | print('Generating Causal Graph Done!') 257 | # threshold top K 258 | K = 0.3*len(cg.reshape(-1)) 259 | threshold = sorted(cg.reshape(-1), reverse=True)[K-1] 260 | W = np.where(cg>=threshold, 1, 0) 261 | # Wij : i->j 262 | W = W.transpose() 263 | # print('W:', W[:, -1]) 264 | print('Generating Q ...') 265 | Q = generate_Q(X, W, RI=W.shape[0]-1, rho=1e-2) 266 | print('Q:', Q[-1, :]) 267 | print('Q sum: ', np.sum(Q)) 268 | print('Generating Q Done!') 269 | # error propagation 270 | print('Propagaing Error ...') 271 | steps = 10000 272 | count = propagate_error(Q, start=W.shape[0]-1, steps=steps) 273 | count /= steps 274 | print('Propagating Eroor Done!') 275 | 276 | # root cause ranking 277 | print('Individual Causal Score: ', normalized_ind_casual_score) 278 | print('Topological Causal score: ', count) 279 | alpha = 0.3 280 | score = alpha * normalized_ind_casual_score[:-1] + (1 - alpha) * count[:-1] 281 | # top K 282 | K = 5 283 | ranking = np.argsort(score)[::-1] 284 | for i in range(K): 285 | print('{}: {} {}'.format(i, columns[ranking[i]], score[ranking[i]])) 286 | -------------------------------------------------------------------------------- /Baseline/FastPC/trace_expm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import scipy.linalg as slin 4 | 5 | 6 | class TraceExpm(torch.autograd.Function): 7 | @staticmethod 8 | def forward(ctx, input): 9 | # detach so we can cast to NumPy 10 | #E = slin.expm(input.detach().numpy()) 11 | E = torch.matrix_exp(input) 12 | #f = np.trace(E) 13 | f = torch.trace(E) 14 | #E = torch.from_numpy(E) 15 | ctx.save_for_backward(E) 16 | return torch.as_tensor(f, dtype=input.dtype) 17 | 18 | @staticmethod 19 | def backward(ctx, grad_output): 20 | E, = ctx.saved_tensors 21 | grad_input = grad_output * E.t() 22 | return grad_input 23 | 24 | 25 | trace_expm = TraceExpm.apply 26 | 27 | 28 | def main(): 29 | input = torch.randn(20, 20, dtype=torch.double, requires_grad=True) 30 | assert torch.autograd.gradcheck(trace_expm, input) 31 | 32 | input = torch.tensor([[1, 2], [3, 4.]], requires_grad=True) 33 | tre = trace_expm(input) 34 | f = 0.5 * tre * tre 35 | print('f\n', f.item()) 36 | f.backward() 37 | print('grad\n', input.grad) 38 | 39 | 40 | if __name__ == '__main__': 41 | main() 42 | -------------------------------------------------------------------------------- /Baseline/Nezha/20240124/20240124-fault_list.json: -------------------------------------------------------------------------------- 1 | { 2 | "2024-01-24": [ 3 | { 4 | "inject_time": "2024-01-24 03:10:51", 5 | "inject_timestamp": "1706087451", 6 | "inject_pod": "ip-10-1-100-109.ap-northeast-1.compute.internal", 7 | "inject_type": "infinite loop bug" 8 | } 9 | ] 10 | } -------------------------------------------------------------------------------- /Baseline/Nezha/20240124/root_cause_hipster.json: -------------------------------------------------------------------------------- 1 | { 2 | "ip-10-1-100-109.ap-northeast-1.compute.internal": { 3 | "return": "infinite loop bug", 4 | "exception": "infinite loop bug", 5 | "cpu_consumed": "pod_level_data_cpu_usage", 6 | "infinite loop bug": "pod_level_data_cpu_usage" 7 | } 8 | } -------------------------------------------------------------------------------- 
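The two JSON files above are the ground truth for the 20240124 Nezha case: `20240124-fault_list.json` records the injected fault (time, pod, and fault type), and `root_cause_hipster.json` maps the faulty entity to the expected root-cause indicators. A minimal sketch of loading the fault list for evaluation is shown below; the relative path and variable names are illustrative and not part of the repository code.

```python
import json

# Load the injected-fault ground truth for the 20240124 case (path is illustrative).
with open("20240124/20240124-fault_list.json") as f:
    fault_list = json.load(f)

# Each date key maps to a list of injected faults with time, pod, and fault type.
for date, faults in fault_list.items():
    for fault in faults:
        print(date, fault["inject_time"], fault["inject_pod"], fault["inject_type"])
```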
/Baseline/Nezha/INSTALL.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | ### 1.1 Requirements 4 | 5 | - Python3.6 is recommended to run the anomaly detection. Otherwise, any python3 version should be fine. 6 | - Git is also needed. 7 | 8 | ### 1.2 Setup 9 | 10 | download `Nezha` first by `git clone git@github.com:IntelligentDDS/Nezha.git` 11 | 12 | `python3.6 -m pip install -r requirements.txt` to install the dependency for Nezha 13 | 14 | ### 1.3 Running Nezha 15 | 16 | #### 1.3.1 Localize OnlineBoutique at service level 17 | 18 | 19 | ``` 20 | python3.6 ./main.py --ns hipster --level service 21 | 22 | pattern_ranker.py:622: -------- hipster Fault numbuer : 56------- 23 | pattern_ranker.py:623: --------AS@1 Result------- 24 | pattern_ranker.py:624: 92.857143 % 25 | pattern_ranker.py:625: --------AS@3 Result------- 26 | pattern_ranker.py:626: 96.428571 % 27 | pattern_ranker.py:627: --------AS@5 Result------- 28 | pattern_ranker.py:628: 96.428571 % 29 | ``` 30 | 31 | #### 1.3.2 Localize OnlineBoutique at inner service level 32 | 33 | ``` 34 | python3.6 ./main.py --ns hipster --level inner 35 | 36 | pattern_ranker.py:622: -------- hipster Fault numbuer : 56------- 37 | pattern_ranker.py:623: --------AIS@1 Result------- 38 | pattern_ranker.py:624: 92.857143 % 39 | pattern_ranker.py:625: --------AIS@3 Result------- 40 | pattern_ranker.py:626: 96.428571 % 41 | pattern_ranker.py:627: --------AIS@5 Result------- 42 | pattern_ranker.py:628: 96.428571 % 43 | ``` 44 | 45 | #### 1.3.3 Localize Trainticket at service level 46 | 47 | ``` 48 | python3.6 ./main.py --ns ts --level service 49 | 50 | pattern_ranker.py:622: -------- ts Fault numbuer : 45------- 51 | pattern_ranker.py:623: --------AS@1 Result------- 52 | pattern_ranker.py:624: 86.666667 % 53 | pattern_ranker.py:625: --------AS@3 Result------- 54 | pattern_ranker.py:626: 97.777778 % 55 | pattern_ranker.py:627: --------AS@5 Result------- 56 | pattern_ranker.py:628: 97.777778 % 57 | ``` 58 | 59 | #### 1.3.4 Localize Trainticket at inner service level 60 | 61 | ``` 62 | python3.6 ./main.py --ns ts --level inner 63 | 64 | pattern_ranker.py:622: -------- ts Fault numbuer : 45------- 65 | pattern_ranker.py:623: --------AIS@1 Result------- 66 | pattern_ranker.py:624: 86.666667 % 67 | pattern_ranker.py:625: --------AIS@3 Result------- 68 | pattern_ranker.py:626: 97.777778 % 69 | pattern_ranker.py:627: --------AIS@5 Result------- 70 | pattern_ranker.py:628: 97.777778 % 71 | ``` 72 | 73 | The details of service level results and inner-service level results will be printed and recorded in `./log` -------------------------------------------------------------------------------- /Baseline/Nezha/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 IntelligentDDS 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Baseline/Nezha/alarm.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from itertools import product 3 | import os 4 | import re 5 | import datetime 6 | from os.path import dirname 7 | from log import Logger 8 | import logging 9 | from yaml import FlowMappingEndToken 10 | import numpy as np 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | import statistics 14 | import numpy as np 15 | 16 | log_path = dirname(__file__) + '/log/' + str(datetime.datetime.now().strftime( 17 | '%Y-%m-%d')) + '_nezha.log' 18 | logger = Logger(log_path, logging.DEBUG, __name__).getlog() 19 | 20 | 21 | metric_threshold_dir = "metric_threshold" 22 | 23 | 24 | def get_svc(path): 25 | svc = path.rsplit('-', 1)[0] 26 | svc = svc.rsplit('-', 1)[0] 27 | 28 | return svc 29 | 30 | 31 | def generate_threshold(metric_dir, trace_file): 32 | """ 33 | fun generate_threshold: calculte mean and std for each metric of each servie 34 | write ruslt to metric_threshold_dir/service.csv 35 | :parameter 36 | metric_dir - metric dir in construction phase 37 | """ 38 | metric_map = {} 39 | path_list = os.listdir(metric_dir) 40 | for path in path_list: 41 | if "metric" in path: 42 | svc = path.rsplit('-', 1)[0] 43 | svc = svc.rsplit('-', 1)[0] 44 | if svc in metric_map: 45 | metric_map[svc].append(os.path.join(metric_dir, path)) 46 | else: 47 | metric_map[svc] = [os.path.join(metric_dir, path)] 48 | for svc in metric_map: 49 | frames = [] 50 | 51 | # get pod name 52 | for path in path_list: 53 | if svc in path: 54 | pod_name = path.split("_")[0] 55 | # print(pod_name) 56 | network_mean, network_std = get_netwrok_metric( 57 | trace_file=trace_file, pod_name=pod_name) 58 | break 59 | 60 | metric_threshold_file = metric_threshold_dir + "/" + svc + ".csv" 61 | for path in metric_map[svc]: 62 | frames.append(pd.read_csv(path, index_col=False, usecols=[ 63 | 'CpuUsageRate(%)', 'MemoryUsageRate(%)', 'SyscallRead', 'SyscallWrite'])) 64 | # concat pods of the same service 65 | result = pd.concat(frames) 66 | with open(metric_threshold_file, 'w', newline='') as f: 67 | writer = csv.writer(f) 68 | header = ['CpuUsageRate(%)', 'MemoryUsageRate(%)', 'SyscallRead', 69 | 'SyscallWrite', 'NetworkP90(ms)'] 70 | writer.writerow(header) 71 | mean_list = [] 72 | std_list = [] 73 | for metric in header: 74 | if metric == 'NetworkP90(ms)': 75 | continue 76 | mean_list.append(np.mean(result[metric])) 77 | std_list.append(np.std(result[metric])) 78 | mean_list.append(network_mean) 79 | std_list.append(network_std) 80 | writer.writerow(mean_list) 81 | writer.writerow(std_list) 82 | 83 | 84 | def get_netwrok_metric(trace_file, pod_name): 85 | """ 86 | func get_netwrok_metric: use trace data to get netwrok metric 87 | :parameter 88 | time - to regex timestamp e.g, "2022-04-18 13:00" 89 | data_dir 90 | pod_name 91 | :return 92 | p90 netwrok latency 93 | """ 94 | latency_list = [] 95 | 96 | if "front" in pod_name: 97 | # 
front end dose not calculate netwrok latency 98 | return 10, 10 99 | # 100 | # pod_reader = pd.read_csv( 101 | # trace_file, index_col='PodName', usecols=['TraceID', 'SpanID', 'ParentID', 'PodName', 'EndTimeUnixNano']) 102 | # parent_span_reader = pd.read_csv( 103 | # trace_file, index_col='SpanID', usecols=['TraceID', 'SpanID', 'ParentID', 'PodName', 'EndTimeUnixNano']) 104 | # 105 | # try: 106 | # pod_reader = pod_reader.reindex(columns=pod_name) 107 | # pod_spans = pod_reader.loc[[pod_name], ['SpanID', 'ParentID', 'PodName', 'EndTimeUnixNano']] 108 | # except: 109 | # service = pod_name.rsplit('-', 1)[0] 110 | # service = service.rsplit('-', 1)[0] 111 | # 112 | # csv_file = dirname(__file__) + "/metric_threshold/" + service + ".csv" 113 | # pod_reader = pd.read_csv(csv_file, usecols=['NetworkP90(ms)']) 114 | # # print("pod", pod_name, " not found in trace, return default ", 115 | # # float(pod_reader.iloc[0])) 116 | # 117 | # return float(pod_reader.iloc[0]), 0 118 | # 119 | # if len(pod_spans['SpanID']) > 0: 120 | # # process span independentlt and order by timestamp 121 | # for span_index in range(len(pod_spans['SpanID'])): 122 | # # span event 123 | # parent_id = pod_spans['ParentID'].iloc[span_index] 124 | # pod_start_time = int( 125 | # pod_spans['EndTimeUnixNano'].iloc[span_index]) 126 | # try: 127 | # parent_pod_span = parent_span_reader.loc[[ 128 | # parent_id], ['PodName', 'EndTimeUnixNano']] 129 | # if len(parent_pod_span) > 0: 130 | # for parent_span_index in range(len(parent_pod_span['PodName'])): 131 | # parent_pod_name = parent_pod_span['PodName'].iloc[parent_span_index] 132 | # parent_end_time = int( 133 | # parent_pod_span['EndTimeUnixNano'].iloc[parent_span_index]) 134 | # 135 | # if str(parent_pod_name) != str(pod_name): 136 | # latency = (parent_end_time - pod_start_time) / \ 137 | # 1000000 # convert to microsecond 138 | # # if "contacts-service" in pod_name: 139 | # # logger.info("%s, %s, %s, %s, %s" % ( 140 | # # pod_name, pod_spans['SpanID'].iloc[span_index], parent_pod_name, pod_spans['ParentID'].iloc[span_index], latency)) 141 | # latency_list.append(latency) 142 | # except: 143 | # pass 144 | # # logger.info("%s latency is %s" %(pod_name, np.percentile(latency_list, 90))) 145 | # if len(latency_list) > 2: 146 | # return np.percentile(latency_list, 90), statistics.stdev(latency_list) 147 | # else: 148 | # return 10, 10 149 | return 1, 1 150 | 151 | 152 | def determine_alarm(pod, metric_type, metric_value, std_num, ns): 153 | """ 154 | fun determine_alarm: determin whether violate 3-sgima 155 | :parameter 156 | pod - podname to find corrsponding metric threshold file 157 | metric_type - find correspding column 158 | metric_vault - compare with the history mean and std 159 | std_num - constrol std_num * std 160 | :return 161 | true - alarm 162 | false - no alarm 163 | """ 164 | 165 | path_list = os.listdir(metric_threshold_dir) 166 | 167 | if metric_type == "CpuUsageRate(%)" or metric_type == 'MemoryUsageRate(%)': 168 | if metric_value > 80: 169 | return True 170 | else: 171 | if ns == "hipster": 172 | # for hipster 173 | if metric_value > 200: 174 | return True 175 | elif ns == "ts": 176 | # for ts 177 | if metric_value > 300: 178 | return True 179 | return False 180 | # for path in path_list: 181 | # if re.search(path.split('.')[0], pod): 182 | # hisory_metric = pd.read_csv(os.path.join( 183 | # metric_threshold_dir, path), index_col=False, usecols=[metric_type]) 184 | # if metric_value > hisory_metric[metric_type][0] + std_num * hisory_metric[metric_type][1]: 
185 | # return True 186 | # # elif metric_value < hisory_metric[metric_type][0] - std_num * hisory_metric[metric_type][1]: 187 | # # return True 188 | # else: 189 | # return False 190 | 191 | 192 | def generate_alarm(metric_list, ns, std_num=6): 193 | """ 194 | func generate_alarm: generate alram of each pod at current miniute 195 | :parameter 196 | metric_list - metric list from get_metric_with_time 197 | 198 | :return 199 | alarm_list, e.g., [{'pod': 'cartservice-579f59597d-n69b4', 'alarm': [{'metric_type': 'CpuUsageRate(%)', 'alarm_flag': True}]}] 200 | [{ 201 | pod: 202 | alarm: [ 203 | { 204 | metric_type: CpuUsageRate(%) 205 | alarm_flag: True 206 | } 207 | ] 208 | }] 209 | """ 210 | alarm_list = [] 211 | for pod_metric in metric_list: 212 | alarm = {} 213 | for i in range(len(pod_metric['metrics'])): 214 | alarm_flag = determine_alarm(pod=pod_metric["pod"], metric_type=pod_metric['metrics'][i]["metric_type"], 215 | metric_value=pod_metric['metrics'][i]["metric_value"], std_num=std_num, ns=ns) 216 | if alarm_flag: 217 | # if exist alarm_flag equal to true, create map 218 | if "pod" not in alarm: 219 | alarm = {"pod": pod_metric["pod"], "alarm": []} 220 | alarm['alarm'].append( 221 | {"metric_type": pod_metric['metrics'][i]["metric_type"], "alarm_flag": alarm_flag}) 222 | 223 | if "pod" in alarm: 224 | alarm_list.append(alarm) 225 | 226 | return alarm_list 227 | 228 | 229 | def get_metric_with_time(time, base_dir): 230 | """ 231 | func get_metric_with_time: get metric list at determined miniute 232 | :parameter 233 | time - to regex timestamp e.g, "2022-04-18 13:00" 234 | product_metric_dir 235 | :return 236 | target_list - traget metrics 237 | [ 238 | { 239 | pod: 240 | metrics: [ 241 | { 242 | "metric_type": 243 | "metric_value": 244 | } 245 | ] 246 | } 247 | 248 | ] 249 | """ 250 | # date = time.split(' ')[0] 251 | # hour_min = time.split(' ')[1] 252 | # hour = hour_min.split(':')[0] 253 | # min = hour_min.split(':')[1] 254 | trace_file = base_dir + "/trace/trace.csv" 255 | 256 | metric_dir = base_dir + "/metric/" 257 | 258 | path_list = os.listdir(metric_dir) 259 | 260 | # metric_list = ['CpuUsageRate(%)', 'MemoryUsageRate(%)', 'SyscallRead', 261 | # 'SyscallWrite'] 262 | # metric_list = ['CpuUsageRate(%)', 'MemoryUsageRate(%)'] 263 | target_list = [] 264 | for path in path_list: 265 | if "metric" in path: 266 | metrics = pd.read_csv(os.path.join(metric_dir, path)) 267 | metric_list = list(metrics.columns) 268 | metric_list.remove("TimeStamp") 269 | metric_list.remove("PodName") 270 | metric_list.remove("Time") 271 | if 'Date' in metric_list: 272 | metric_list.remove("Date") 273 | # metrics = pd.read_csv(os.path.join(product_metric_dir, path), index_col=False, usecols=['TimeStamp', 'PodName', 'CpuUsageRate(%)', 'MemoryUsageRate(%)', 'SyscallRead', 'SyscallWrite', 'PodServerLatencyP90(s)', 'PodClientLatencyP90(s)']) 274 | for index in range(len(metrics['Time'])): 275 | # regex timestamp 276 | if re.search(time, metrics['Time'][index]): 277 | target_metric = { 278 | "pod": metrics['PodName'][index], "metrics": []} 279 | for metric in metric_list: 280 | target_metric["metrics"].append({ 281 | "metric_type": metric, "metric_value": metrics[metric][index]}) 282 | network_p90, _ = get_netwrok_metric( 283 | trace_file=trace_file, pod_name=metrics['PodName'][index]) 284 | target_metric["metrics"].append( 285 | {"metric_type": "NetworkP90(ms)", "metric_value": network_p90}) 286 | target_list.append(target_metric) 287 | pod_num = len(path_list) 288 | # print(target_list) 289 | return 
target_list, pod_num 290 | 291 | -------------------------------------------------------------------------------- /Baseline/Nezha/data_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | import glob 3 | import os 4 | import pandas as pd 5 | import csv 6 | import re # Import regular expressions 7 | import numpy as np 8 | 9 | 10 | def remove_timestamps(message): 11 | # Remove datetime info of different formats 12 | message = re.sub(r'\[\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\]', '', message) 13 | message = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z', '', message) 14 | message = re.sub(r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}', '', message) 15 | message = re.sub(r'\d{4}\.\d{2}\.\d{2} \d{2}:\d{2}:\d{2}', '', message) 16 | message = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}\+\d{4}', '', message) 17 | message = re.sub(r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}', '', message) 18 | message = re.sub(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', '', message) 19 | message = re.sub(r'\[\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} \w{3}\]', '', message) 20 | message = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}\+\d{2}:\d{2}', '', message) 21 | message = re.sub(r'\w{3} \d{1,2}, \d{4}', '', message) 22 | message = re.sub(r'\d{1,2} \w{3} \d{4}', '', message) 23 | message = re.sub(r'\d{2}:\d{2} [AP]M', '', message) 24 | message = re.sub(r'\[\d{2}/\w{3}/\d{4} \d{2}:\d{2}:\d{2}\]', '', message) 25 | message = re.sub(r'^I\d{4} \d{2}:\d{2}:\d{2}\.\d{6}\s+\d+\s+\w+.\w+:\d+\] ', '', message) 26 | message = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}Z', '', message) 27 | 28 | return message.strip() 29 | 30 | 31 | 32 | def extract_log_message(log): 33 | # Check if the log is in JSON format 34 | if "msg:" in log or "\"msg\":" in log: 35 | log_json = json.loads(log) 36 | log_message = log_json.get('msg', '') 37 | elif "msg=" in log or "\"msg\"=" in log: 38 | msg_index = log.find("msg=") if "msg=" in log else log.find("\"msg\"=") 39 | first_quote_index = log.find('"', msg_index) 40 | last_quote_index = log.find('"', first_quote_index + 1) 41 | if last_quote_index != -1: 42 | log_message = log[first_quote_index + 1:last_quote_index] 43 | else: 44 | log_message = log[first_quote_index + 1:] 45 | else: 46 | log_message = log 47 | 48 | return log_message.strip() 49 | 50 | def dependency(path,output_dir): 51 | extract_pod_list=[] 52 | extract_node_list=[] 53 | folder_list = os.listdir(path) 54 | 55 | for folder in folder_list: 56 | json_file = path + folder + "/" + "*.json" 57 | for readfile in glob.glob(json_file): 58 | print(readfile) 59 | with open(readfile) as f: 60 | jsn = json.load(f) 61 | for jsn_hit in jsn['hits']['hits']: 62 | all_proc = [] 63 | all_node = [] 64 | if "kubernetes" in jsn_hit['_source'] and "pod_name" in jsn_hit['_source']['kubernetes']: 65 | pod = jsn_hit['_source']['kubernetes']['pod_name'] 66 | message = jsn_hit['_source']['message'] 67 | timestamp = jsn_hit['_source']['@timestamp'] 68 | if message.startswith('"'): 69 | message = message[1:] 70 | if message.endswith('"'): 71 | message = message[:-1] 72 | if "msg" in message: 73 | # print(message) 74 | message = extract_log_message(message) 75 | # message = json.loads(message)['msg'] 76 | 77 | message = remove_timestamps(message) 78 | all_proc.append(pod) 79 | all_proc.append(timestamp) 80 | all_proc.append(message) 81 | if all_proc: 82 | extract_pod_list.append(all_proc) 83 | if "systemd" in jsn_hit['_source'] and "t" in jsn_hit['_source']['systemd']: 84 | node = 
jsn_hit['_source']['hostname'] 85 | message = jsn_hit['_source']['message'] 86 | timestamp = jsn_hit['_source']['@timestamp'] 87 | if message.startswith('"'): 88 | message = message[1:] 89 | if message.endswith('"'): 90 | message = message[:-1] 91 | if "msg:" in message or "\"msg\":" in message: 92 | message = extract_log_message(message) 93 | # message = json.loads(message)['msg'] 94 | all_node.append(node) 95 | all_node.append(timestamp) 96 | all_node.append(message) 97 | if all_node: 98 | extract_node_list.append(all_node) 99 | # output file 100 | data_list_col=['Node','Timestamp','Messages'] 101 | node_df = pd.DataFrame(extract_node_list,columns=data_list_col) 102 | node_df.dropna() 103 | filename = 'Node_messages' 104 | node_df = node_df.sort_values(by='Timestamp') 105 | node_df.to_csv(output_dir + filename, index = False) 106 | csv_file = output_dir + filename 107 | partition_csv(csv_file, output_dir3) 108 | 109 | data_list_col=['Pod','Timestamp','Messages'] 110 | pod_df = pd.DataFrame(extract_pod_list,columns=data_list_col) 111 | pod_df.dropna() 112 | filename = 'Pod_messages' 113 | pod_df = pod_df.sort_values(by='Timestamp') 114 | pod_df.to_csv(output_dir + filename, index = False) 115 | csv_file = output_dir + filename 116 | partition_csv(csv_file, output_dir2) 117 | 118 | 119 | 120 | def partition_csv(csv_file, output_dir): 121 | isExist = os.path.exists(output_dir) 122 | if not isExist: 123 | os.mkdir(output_dir) 124 | # Creates empty set - this will be used to store the values that have already been used 125 | filelist = set() 126 | # Opens the large csv file in "read" mode 127 | with open(csv_file,'r') as csvfile: 128 | read_rows = csv.reader(csvfile) 129 | # Skip the column names 130 | next(read_rows) 131 | for row in read_rows: 132 | # Store the whole row as a string (rowstring) 133 | rowstring='\t'.join(row[1:]) 134 | # Defines filename as the first entry in the row - This could be made dynamic so that the user inputs a column name to use 135 | filename = (row[0]) 136 | # This basically makes sure it is not looking at the header row. 137 | # If the filename is not in the filelist set, add it to the list and create new csv file with header row. 138 | if filename not in filelist: 139 | filelist.add(filename) 140 | temp_file = output_dir + str(filename +'_messages') 141 | if os.path.exists(temp_file): 142 | os.remove(temp_file) 143 | with open(temp_file,'a') as f: 144 | f.write(rowstring) 145 | f.write("\n") 146 | f.close() 147 | # If the filename is in the filelist set, append the current row to the existing csv file. 
148 | else: 149 | temp_file = output_dir + str(filename + '_messages') 150 | with open(temp_file,'a') as f: 151 | f.write(rowstring) 152 | f.write("\n") 153 | f.close() 154 | 155 | def data_integration(file_list, data_path, output_dir='./rca_data/'): 156 | # Create an empty dictionary to store data split by date and hour 157 | df_dict = dict() 158 | for file in file_list: 159 | file_dir = ''.join(data_path, file) 160 | df = pd.read_csv(file_dir) 161 | timestamps = df['Time'] 162 | pod_name = file[:-4] 163 | df['Time'] = pd.to_datetime(df['Time']) 164 | # Iterate over the unique dates in the 'Time' column 165 | for date in df['Time'].dt.date.unique(): 166 | # Filter data for the specific date 167 | date_data = df[df['Time'].dt.date == date] 168 | date_data['PodName'] = np.array([pod_name for _ in range(date_data.shape[0])]) 169 | date_data['Container'] = np.array(['server' for _ in range(date_data.shape[0])]) 170 | # Create a nested dictionary for each hour of this date 171 | hourly_dict = {} 172 | 173 | for hour in range(24): 174 | # Filter data for the specific hour within the date 175 | hourly_data = date_data[date_data['Time'].dt.hour == hour] 176 | hourly_data = hourly_data.rename(columns={ 177 | 'EventTemplate': 'Log', 178 | 'Time': 'Timestamp', 179 | }) 180 | # Store the split data in the nested dictionary with the hour as the key 181 | if hour not in hourly_dict: 182 | hourly_dict[hour] = hourly_data 183 | else: 184 | hourly_dict[hour] = hourly_dict[hour].append(hourly_data, ignore_index=True) 185 | 186 | # Store the hourly dictionary in the main dictionary with the date as the key 187 | df_dict[date] = hourly_dict 188 | 189 | 190 | if __name__ == "__main__": 191 | file_list =['openshift-apiserver-operator-68fd44b989-6rgcq_messages_structured.csv', 192 | 'network-operator-7c59d666f5-27lvk_messages_structured.csv', 193 | 'mongodb-v1-64c6b69879-p4wfp_messages_structured.csv', 194 | 'openshift-kube-scheduler-ocp4-control-plane-1_messages_structured.csv', 195 | 'openshift-kube-scheduler-ocp4-control-plane-2_messages_structured.csv', 196 | 'openshift-kube-scheduler-ocp4-control-plane-3_messages_structured.csv', 197 | 'ovs-ch5xp_messages_structured.csv', 198 | 'packageserver-67d8b69dc5-6rtj9_messages_structured.csv', 199 | 'prometheus-6cc8d9b85-sztrb_messages_structured.csv'] 200 | # Input log data directory 201 | # path = 'Path-to-the-dataset-directory' 202 | path = '/nfs/users/zach/aiops/data/1203/log_data/pod_removed/' 203 | # Output directories 204 | output_dir='./rca_data/' 205 | data_integration(file_list, path, output_dir) 206 | -------------------------------------------------------------------------------- /Baseline/Nezha/log.py: -------------------------------------------------------------------------------- 1 | #encoding = utf-8 2 | 3 | import logging 4 | 5 | 6 | class Logger(): 7 | def __init__(self, logname, loglevel=logging.DEBUG, loggername=None): 8 | ''' 9 | 指定保存日志的文件路径,日志级别,以及调用文件 10 | 将日志存入到指定的文件中 11 | ''' 12 | # 创建一个logger 13 | self.logger = logging.getLogger(loggername) 14 | self.logger.setLevel(loglevel) 15 | # 创建一个handler,用于写入日志文件 16 | fh = logging.FileHandler(logname) 17 | fh.setLevel(loglevel) 18 | if not self.logger.handlers: 19 | # 再创建一个handler,用于输出到控制台 20 | ch = logging.StreamHandler() 21 | ch.setLevel(loglevel) 22 | formatter = logging.Formatter( 23 | '[%(levelname)s]%(asctime)s %(filename)s:%(lineno)d: %(message)s') 24 | fh.setFormatter(formatter) 25 | ch.setFormatter(formatter) 26 | self.logger.addHandler(fh) 27 | self.logger.addHandler(ch) 28 | 29 | def 
getlog(self): 30 | return self.logger 31 | -------------------------------------------------------------------------------- /Baseline/Nezha/main.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | from pattern_ranker import * 4 | import argparse 5 | from log_parsing import * 6 | 7 | file_path = './' 8 | # print(file_path) 9 | log_path = file_path + '/log/' + str(datetime.datetime.now().strftime( 10 | '%Y-%m-%d')) + '_nezha.log' 11 | print(log_path) 12 | logger = Logger(log_path, logging.DEBUG, __name__).getlog() 13 | 14 | 15 | def get_miner(ns): 16 | template_indir = file_path + '/log_template' 17 | config = TemplateMinerConfig() 18 | config.load(file_path + "/log_template/drain3_" + ns + ".ini") 19 | config.profiling_enabled = False 20 | 21 | path = file_path + '/log_template/' + ns + ".bin" 22 | persistence = FilePersistence(path) 23 | template_miner = TemplateMiner(persistence, config=config) 24 | 25 | return template_miner 26 | 27 | # def generate_trace_id(log_dir): 28 | # trace_list = [] 29 | # for file in os.listdir(log_dir): 30 | # if file.endswith("_messages_structured.csv"): 31 | # trace_list.append(file[:-24]) 32 | # if not os.path.exists(log_dir + '../traceid/'): 33 | # os.mkdir(log_dir + '../traceid/') 34 | # pd.DataFrame(trace_list).to_csv(log_dir + '../traceid/trace_id.csv', index=False, header=False) 35 | 36 | 37 | if __name__ == '__main__': 38 | parser = argparse.ArgumentParser(description='Nezha') 39 | 40 | parser.add_argument('--ns', default="hipster", help='namespace') 41 | parser.add_argument('--level', default="service", help='service-level or inner-service level') 42 | parser.add_argument('--log_dir', default="./20240124/log/", help='the path to log data') 43 | parser.add_argument('--metric_dir', default="./20240124/Latency/", help='the path to metric data') 44 | parser.add_argument('--save_dir', default="./20240124/", help='the path to save preprocessed data') 45 | # parser.add_argument('--level', default="service", help='service-level or inner-service level') 46 | args = parser.parse_args() 47 | ns = args.ns 48 | level = args.level 49 | save_dir = args.save_dir 50 | log_dir = args.log_dir 51 | metric_dir = args.metric_dir 52 | kpi_file = save_dir + '/kpi_20240124_latency.csv' 53 | path1 = save_dir + "./20240124-fault_list.json" 54 | kpi_data = pd.read_csv(kpi_file) 55 | normal_time1 = str(pd.to_datetime(kpi_data['timeStamp'].iloc[0], unit='s')) 56 | time_index = int(kpi_data['timeStamp'].shape[0] * 0.6) 57 | preprocess(log_dir, metric_dir, save_dir) 58 | file_path = save_dir 59 | log_template_miner = get_miner(ns) 60 | inject_list = [path1] 61 | normal_time_list = [normal_time1] 62 | if level == "service": 63 | logger.info("------- Result at service level -------") 64 | evaluation_pod(normal_time_list, inject_list, ns, log_template_miner, file_path) 65 | else: 66 | logger.info("------- Result at inner service level -------") 67 | evaluation(normal_time_list, inject_list, ns, log_template_miner, file_path) 68 | -------------------------------------------------------------------------------- /Baseline/Nezha/pattern_miner.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from data_integrate import * 3 | 4 | log_path = dirname(__file__) + '/log/' + str(datetime.datetime.now().strftime( 5 | '%Y-%m-%d')) + '_nezha.log' 6 | logger = Logger(log_path, logging.DEBUG, __name__).getlog() 7 | 8 | 9 | # def frequent_pattern_miner(event_sequences): 10 | # """ 11 | 
# mining frequent pattern in event sequences (Discard) 12 | # input: 13 | # - event_sequences: event sequences belonging to the traces in time window, e.g., [[1,2,3],[2,3,4]] 14 | # output: 15 | # - pattern: frequent_pattern in the events, e.g., [['54', '29', '#SUP: 9'], ['54', '30', '#SUP: 9'], ['54', '32', '#SUP: 9']] 16 | # """ 17 | # print(datetime.datetime.now()) 18 | 19 | # spmf_path = dirname(__file__) + "/spmf" 20 | # spmf = Spmf("CM-SPAM", input_direct=event_sequences, 21 | # output_filename="./spmf/SPAM.txt", arguments=[0.01, 2, 2], spmf_bin_location_dir=spmf_path, memory=8192) 22 | # spmf.run() 23 | # pattern = spmf.parse_output() 24 | # print(pattern) 25 | # print(datetime.datetime.now()) 26 | # return pattern 27 | 28 | 29 | # def frequent_graph_miner(file_name, topk=30): 30 | # """ 31 | # mining frequent graph in event graph 32 | # input: 33 | # - file_name: input filename e.g., 34 | # output: 35 | # - pattern_list: frequent_child_graph_list [{'support': '519', 'node1': '180', 'node2': '264'}] 36 | # """ 37 | 38 | # # print(datetime.datetime.now()) 39 | 40 | # spmf_path = dirname(__file__) + "/spmf" 41 | # spmf = Spmf("TKG", input_filename=file_name, 42 | # output_filename="./spmf/tkg.txt", arguments=[topk, 2, False, False, True], spmf_bin_location_dir=spmf_path, memory=8192) 43 | # spmf.run() 44 | # pattern_result = spmf.parse_output() 45 | 46 | # # print(pattern_result) 47 | # # print(datetime.datetime.now()) 48 | 49 | # pattern_list = [] 50 | # for i in range(0, len(pattern_result), 6): 51 | # """ parse ['t # 29 * 519'], ['v 0 5'], ['v 1 265'], ['e 0 1 1'] """ 52 | # support = pattern_result[i][0].split(' ')[-1] 53 | # node1 = pattern_result[i+1][0].split(' ')[-1] 54 | # node2 = pattern_result[i+2][0].split(' ')[-1] 55 | # pattern = {"support": support, "child_graph": node1 + "_" + node2} 56 | # pattern_list.append(pattern) 57 | 58 | # pattern_list.sort(key=lambda k: k['support'], reverse=True) 59 | 60 | # return pattern_list 61 | 62 | 63 | # def generate_tkg_input(event_graphs): 64 | # """ 65 | # generate_tkg_input: 66 | # :parameter 67 | # event_graphs - graph list 68 | # :return 69 | # file_name - tkg input filename 70 | 71 | # details see at https://www.philippe-fournier-viger.com/spmf/TKG.php 72 | # t # 0 73 | # v 0 10 74 | # v 1 11 75 | # e 0 1 20 76 | # """ 77 | # file_name = dirname(__file__) + "/spmf/" + str(datetime.datetime.now().strftime( 78 | # '%Y-%m-%d')) + "_tkg_input.txt" 79 | # f = open(file_name, "w") 80 | 81 | # graph_number = 0 82 | # node_number = 0 83 | 84 | # for graph in event_graphs: 85 | # # write head 86 | # graph_head = "t # " + str(graph_number) + "\r\n" 87 | # f.write(graph_head) 88 | 89 | # node_map = {} 90 | # node_content = "" 91 | # edge_content = "" 92 | # for key in graph.adjacency_list.keys(): 93 | # if key.event not in node_map: 94 | # node_map[key.event] = node_number 95 | # node_content += "v " + \ 96 | # str(node_number) + " " + str(key.event) + "\r\n" 97 | # node_number += 1 98 | 99 | # for event in graph.adjacency_list[key]: 100 | # if event.event not in node_map: 101 | # node_map[event.event] = node_number 102 | # node_content += "v " + \ 103 | # str(node_number) + " " + str(event.event) + "\r\n" 104 | # node_number += 1 105 | 106 | # edge_content += "e " + \ 107 | # str(node_map[key.event]) + " " + \ 108 | # str(node_map[event.event]) + " 1\r\n" 109 | 110 | # f.write(node_content) 111 | # f.write(edge_content) 112 | # graph_number += 1 113 | # f.write("\r\n") 114 | # f.close() 115 | 116 | # return file_name 117 | 118 | 119 | def 
get_pattern_support(event_graphs): 120 | result_support_dict = {} 121 | total_pair = set() 122 | 123 | for event_graph in event_graphs: 124 | for key, value in event_graph.support_dict.items(): 125 | if key in total_pair: 126 | result_support_dict[key] += value 127 | else: 128 | result_support_dict[key] = value 129 | total_pair = total_pair | event_graph.pair_set 130 | 131 | result_support_dict = dict(sorted( 132 | result_support_dict.items(), key=lambda x: x[1], reverse=True)) 133 | 134 | return result_support_dict 135 | 136 | -------------------------------------------------------------------------------- /Baseline/Nezha/requirements.txt: -------------------------------------------------------------------------------- 1 | drain3==0.9.10 2 | matplotlib==3.3.4 3 | more_itertools==8.12.0 4 | numpy==2.0.0 5 | pandas==0.23.4 6 | psutil==5.9.0 7 | PyYAML==6.0.1 8 | -------------------------------------------------------------------------------- /Baseline/Readme.md: -------------------------------------------------------------------------------- 1 | # Baselines 2 | 3 | This folder contains the baseline methods for Lemma-RCA datasets evaluation with both single- and multi-modal settings. Note that SWAT and WADI datasets comply with single-modal setting. 4 | 5 | - FastPC: 6 | ``` 7 | python test_FastPC_pod_metric.py -case 20240115 ## for case 20240115 metric data only 8 | python test_FastPC_pod_log.py -case 20240115 ## for case 20240115 log data only 9 | python test_FastPC_pod_combine.py -case 20240115 ## for case 20240115 with both metric and log data 10 | ``` 11 | 12 | - Baro: 13 | ``` 14 | cd ./metric_only 15 | python baro_main_metric.py -case 20240115## for case 20240115 metric data only 16 | cd ./log_only 17 | python baro_main_log.py -case 20240115## for case 20240115 log data only 18 | cd ./multimodal 19 | python baro_main_combined.py -case 20240115 ## case 20240115 with for both metric and log data 20 | ``` 21 | 22 | 23 | - RCD: 24 | ``` 25 | cd ./metric_only 26 | python RCA_methods_metric.py -case 20240115 -model rcd ## for metric data only 27 | cd ./log_only 28 | python RCA_methods_log.py -case 20240115 -model rcd ## for log data only 29 | cd ./multimodal 30 | python RCA_methods_combined.py -case 20240115 -model rcd ## for both metric and log data 31 | ``` 32 | 33 | - CIRCA: 34 | ``` 35 | cd ./metric_only 36 | python RCA_methods_metric.py -case 20240115 -model circa ## for metric data only 37 | cd ./log_only 38 | python RCA_methods_log.py -case 20240115 -model circa ## for log data only 39 | cd ./multimodal 40 | python RCA_methods_combined.py -case 20240115 -model circa ## for both metric and log data 41 | ``` 42 | 43 | - epsilon_diagnosis: 44 | ``` 45 | cd ./metric_only 46 | python RCA_methods_metric.py -case 20240115 -model epsilon_diagnosis ## for metric data only 47 | cd ./log_only 48 | python RCA_methods_log.py -case 20240115 -model epsilon_diagnosis ## for log data only 49 | cd ./multimodal 50 | python RCA_methods_combined.py -case 20240115 -model epsilon_diagnosis ## for both metric and log data 51 | ``` 52 | 53 | - Nezha: 54 | ``` 55 | python main.py 56 | ``` 57 | For Nezha, we provide the demo code for the case 20240124. Due to inconsistant filename for each case, you may need to change the name of the folder for each case accordingly. 58 | 59 | To run the baseline methods for SWAT and WADI datasets, the only difference is the data loader and the labels for evaluation. The labels are given in the corresponding scripts. 
For the Baro method: 60 | 61 | - Baro for SWAT & WADI: 62 | ``` 63 | python baro.py 64 | python baro-evaluation.py 65 | ``` 66 | 67 | The RCD, CIRCA, and epsilon_diagnosis methods are included in the pyrca package. Simply run: 68 | 69 | - RCD, CIRCA, and epsilon_diagnosis for SWAT & WADI: 70 | ``` 71 | python pyrca-main.py 72 | python pyrca-evaluation.py 73 | ``` 74 | 75 | ##### If you encounter the error "name 'LIBSPOT' is not defined", please double-check that you are running the code from the FastPC directory. 76 | 77 | #### If you fail to install the pyrca package on Windows, please use the following command: 78 | #### "pip install sfr-pyrca --use-pep517 git+https://github.com/SchmollerLab/python-javabridge-windows" 79 | 80 | -------------------------------------------------------------------------------- /Baseline/SWAT&WADI/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KnowledgeDiscovery/rca_baselines/c560d8cc39c19f04c9ae74fac2404b1e5e8c3b4d/Baseline/SWAT&WADI/.DS_Store -------------------------------------------------------------------------------- /Baseline/SWAT&WADI/baro-evaluation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import glob 3 | import pandas as pd 4 | import numpy as np 5 | from collections import defaultdict 6 | 7 | files = glob.glob('./final*.csv') 8 | model = 'baro' 9 | 10 | model_files = defaultdict(list) 11 | 12 | for file in files: 13 | model_files[model].append(file) 14 | 15 | for key in model_files: 16 | model_files[key] = sorted(model_files[key], key=lambda x: x.split('/')[-2]) 17 | 18 | print(model_files) 19 | 20 | predicts = [] 21 | mfiles = model_files['baro'] 22 | for mf in mfiles: 23 | print(mf) 24 | mf_data = pd.read_csv(mf) 25 | root_cause_list = list(mf_data['root_cause'].values) 26 | if 'Latency' in root_cause_list: 27 | root_cause_list.remove('Latency') 28 | predicts.append(root_cause_list) 29 | 30 | reals = [ 31 | ['1_MV_001'], 32 | ['1_FIT_001'], 33 | ['2_LIT_002', '1_AIT_001'], 34 | ['2_MCV_101', '2_MCV_201', '2_MCV_301', '2_MCV_401', '2_MCV_501', '2_MCV_601'], 35 | ['2_MCV_101', '2_MCV_201'], 36 | ['1_AIT_002', '2_MV_003'], 37 | ['2_MCV_007'], 38 | ['1_P_006'], 39 | ['1_MV_001'], 40 | ['2_MCV_007'], 41 | ['2_MCV_007'], 42 | ['2_AIT_003'], 43 | ['2_MV_201', '2_P_201', '2_P_202', '2_P_203', '2_P_204', '2_P_205', '2_P_206'], 44 | ['2_LIT_002', '1_AIT_001'], 45 | ] 46 | 47 | def precision_on_topk(predicts,reals,k): 48 | pr = 0 49 | for pred, real in zip(predicts, reals): 50 | pred = pred[:k] 51 | hit_count = len(set(pred) & set(real)) 52 | min_len = min(k,len(real)) 53 | pr += hit_count/min_len 54 | return pr/len(reals) 55 | 56 | def mean_precision_k(predicts,reals,k): 57 | pr = 0 58 | for i in range(1,k+1): 59 | pr += precision_on_topk(predicts,reals,i) 60 | return pr/k 61 | 62 | def mrr(predicts,reals): 63 | mrr_val = 0 64 | for preds,real in zip(predicts,reals): 65 | tmp = [] 66 | for real_item in real: 67 | index = preds.index(real_item) if real_item in preds else sys.maxsize-1 68 | tmp.append(index+1) 69 | mrr_val += 1/min(tmp) 70 | return mrr_val/len(reals) 71 | 72 | k = [1,3,5,10] 73 | for item in k: 74 | pr_k = precision_on_topk(predicts,reals,item) 75 | map_k = mean_precision_k(predicts,reals,item) 76 | print("Precision@{}:{}".format(item,pr_k)) 77 | print('MAP@{}:{}'.format(item,map_k)) 78 | print('MRR:{}'.format(mrr(predicts,reals))) 79 | --------------------------------------------------------------------------------
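As a small, self-contained illustration of how the ranking metrics in baro-evaluation.py behave (the sensor names below are toy placeholders, not real labels), consider a single fault case whose true root cause is ranked third:
```
import sys

def precision_on_topk(predicts, reals, k):
    # Fraction of labeled root causes recovered within the top-k predictions,
    # averaged over all fault cases (same logic as in baro-evaluation.py).
    pr = 0
    for pred, real in zip(predicts, reals):
        hit_count = len(set(pred[:k]) & set(real))
        pr += hit_count / min(k, len(real))
    return pr / len(reals)

def mrr(predicts, reals):
    # Reciprocal rank of the best-ranked true root cause, averaged over cases.
    mrr_val = 0
    for preds, real in zip(predicts, reals):
        ranks = [preds.index(r) + 1 if r in preds else sys.maxsize for r in real]
        mrr_val += 1 / min(ranks)
    return mrr_val / len(reals)

# Toy fault case: the true root cause 'S2' is ranked third by the model.
predicts = [['S1', 'S3', 'S2']]
reals = [['S2']]
print(precision_on_topk(predicts, reals, 1))  # 0.0  -> missed in the top-1
print(precision_on_topk(predicts, reals, 3))  # 1.0  -> recovered within the top-3
print(mrr(predicts, reals))                   # 0.333... (reciprocal of rank 3)
```
MAP@k in the evaluation scripts is simply the average of Precision@1 through Precision@k.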
/Baseline/SWAT&WADI/baro.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pickle 4 | import time 5 | import warnings 6 | warnings.filterwarnings("ignore") 7 | from sklearn.feature_selection import VarianceThreshold 8 | from baro_algorithm import bocpd, robust_scorer 9 | 10 | def data_convert(segment): 11 | columns = np.array(segment.iloc[:, 1:].columns) 12 | selector = VarianceThreshold(threshold=0) 13 | X = segment.iloc[:, 1:].values 14 | X_var = selector.fit_transform(X) 15 | idx = selector.get_support(indices=True) 16 | columns = columns[idx] 17 | X_var = pd.DataFrame(X_var) 18 | X_var.columns = list(columns) 19 | return X_var 20 | 21 | with open('../WADI/data_segments.pkl','rb') as f: 22 | data_segments = pickle.load(f) 23 | 24 | 25 | for ind,segment in enumerate(data_segments): 26 | segment = segment.iloc[:, 1:] 27 | print('{} fault starts to detect bayesian structure'.format(ind)) 28 | segment = data_convert(segment) 29 | columns = np.array(segment.columns) 30 | #np.save('{}_var_name.npy'.format(ind), columns) 31 | X = segment.values 32 | patch = 100 33 | sample = X.shape[0]//patch 34 | X = X[:patch*sample,:] 35 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 36 | X_df = pd.DataFrame(X,columns=columns) 37 | anomalies = bocpd(X_df) 38 | print("Anomalies are detected at timestep:", anomalies[0]) 39 | results = robust_scorer(X_df,anomalies=anomalies) 40 | 41 | root_causes = [] 42 | for result in results: 43 | (root_cause, score) = result 44 | root_causes.append([root_cause, score]) 45 | root_causes = pd.DataFrame(root_causes) 46 | root_causes.columns = [['root_cause','score']] 47 | root_causes.to_csv("./final_{}_root_cause.csv".format(ind),index=False) 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /Baseline/SWAT&WADI/pyrca-evaluation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import glob 3 | import pandas as pd 4 | import numpy as np 5 | from collections import defaultdict 6 | 7 | files = glob.glob('./final*.csv') 8 | model = 'ED' 9 | 10 | model_files = defaultdict(list) 11 | 12 | for file in files: 13 | if model in file: 14 | model_files[model].append(file) 15 | 16 | for key in model_files: 17 | model_files[key] = sorted(model_files[key], key=lambda x: x.split('/')[-2]) 18 | 19 | predicts = [] 20 | mfiles = model_files[model] 21 | print(mfiles) 22 | for mf in mfiles: 23 | print(mf) 24 | mf_data = pd.read_csv(mf) 25 | root_cause_list = list(mf_data['root_cause'].values) 26 | if 'label' in root_cause_list: 27 | root_cause_list.remove('label') 28 | predicts.append(root_cause_list) 29 | 30 | reals = [ 31 | ['1_MV_001'], 32 | ['1_FIT_001'], 33 | ['2_LIT_002', '1_AIT_001'], 34 | ['2_MCV_101', '2_MCV_201', '2_MCV_301', '2_MCV_401', '2_MCV_501', '2_MCV_601'], 35 | ['2_MCV_101', '2_MCV_201'], 36 | ['1_AIT_002', '2_MV_003'], 37 | ['2_MCV_007'], 38 | ['1_P_006'], 39 | ['1_MV_001'], 40 | ['2_MCV_007'], 41 | ['2_MCV_007'], 42 | ['2_AIT_003'], 43 | ['2_MV_201', '2_P_201', '2_P_202', '2_P_203', '2_P_204', '2_P_205', '2_P_206'], 44 | ['2_LIT_002', '1_AIT_001'], 45 | ] 46 | 47 | def precision_on_topk(predicts,reals,k): 48 | pr = 0 49 | for pred, real in zip(predicts, reals): 50 | pred = pred[:k] 51 | hit_count = len(set(pred) & set(real)) 52 | min_len = min(k,len(real)) 53 | pr += hit_count/min_len 54 | return pr/len(reals) 55 | 56 | def 
mean_precision_k(predicts,reals,k): 57 | pr = 0 58 | for i in range(1,k+1): 59 | pr += precision_on_topk(predicts,reals,i) 60 | return pr/k 61 | 62 | def mrr(predicts,reals): 63 | mrr_val = 0 64 | for preds,real in zip(predicts,reals): 65 | tmp = [] 66 | for real_item in real: 67 | index = preds.index(real_item) if real_item in preds else sys.maxsize-1 68 | tmp.append(index+1) 69 | mrr_val += 1/min(tmp) 70 | return mrr_val/len(reals) 71 | 72 | k = [1,3,5,10] 73 | for item in k: 74 | pr_k = precision_on_topk(predicts,reals,item) 75 | map_k = mean_precision_k(predicts,reals,item) 76 | print("Precision@{}:{}".format(item,pr_k)) 77 | print('MAP@{}:{}'.format(item,map_k)) 78 | print('MRR:{}'.format(mrr(predicts,reals))) 79 | -------------------------------------------------------------------------------- /Baseline/SWAT&WADI/pyrca-main.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn import preprocessing 5 | import os 6 | from sklearn.feature_selection import VarianceThreshold 7 | from sklearn.model_selection import train_test_split 8 | 9 | from causalnex.structure.notears import from_pandas 10 | import networkx as nx 11 | 12 | from pyrca.analyzers.ht import HT, HTConfig 13 | from pyrca.analyzers.epsilon_diagnosis import EpsilonDiagnosis, EpsilonDiagnosisConfig 14 | from pyrca.analyzers.rcd import RCD, RCDConfig 15 | 16 | import pandas as pd 17 | import networkx as nx 18 | import pickle 19 | 20 | 21 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame: 22 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph()) 23 | while True: 24 | try: 25 | cycle = nx.find_cycle(G, orientation='original') 26 | G.remove_edge(*cycle[0][:2]) 27 | except nx.NetworkXNoCycle: 28 | break 29 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int) 30 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix) 31 | print("Now, the adjacency matrix does not have cycles.") 32 | 33 | return adj_matrix_no_cycles 34 | 35 | def data_convert(segment): 36 | columns = np.array(segment.iloc[:, 1:].columns) 37 | selector = VarianceThreshold(threshold=0) 38 | X = segment.iloc[:, 1:].values 39 | X_var = selector.fit_transform(X) 40 | idx = selector.get_support(indices=True) 41 | columns = columns[idx] 42 | X_var = pd.DataFrame(X_var) 43 | X_var.columns = list(columns) 44 | 45 | return X_var 46 | 47 | def rca(ind, segment, model_name): 48 | segment = segment.iloc[:, 1:] 49 | print('{} fault starts to detect bayesian structure'.format(ind)) 50 | segment = data_convert(segment) 51 | columns = np.array(segment.columns) 52 | #np.save('{}_var_name.npy'.format(ind), columns) 53 | X = segment.values 54 | patch = 100 55 | sample = X.shape[0]//patch 56 | X = X[:patch*sample,:] 57 | X0 = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 58 | X = pd.DataFrame(X0,columns=columns) 59 | 60 | X_train, X_test = train_test_split(X, test_size=0.6, shuffle=False) 61 | print("Start to run") 62 | if model_name == "HT": 63 | model = HT(config=HTConfig(graph=estimated_matrix,root_cause_top_k=10)) 64 | model.train(X_train) 65 | results = model.find_root_causes(X_test, "label", True).to_list() 66 | elif model_name == "RCD": 67 | model = RCD(config=RCDConfig(k=10,alpha_limit=0.5)) 68 | results = model.find_root_causes(X_train,X_test).to_list() 69 | elif model_name == "ED": 70 | model = EpsilonDiagnosis(config=EpsilonDiagnosisConfig(root_cause_top_k=10)) 71 | model.train(X) 72 | results = 
model.find_root_causes(X).to_list() 73 | 74 | print("Saving") 75 | root_causes = [] 76 | for result in results: 77 | root_causes.append([result['root_cause'],result['score']]) 78 | root_causes = pd.DataFrame(root_causes) 79 | root_causes.columns = [['root_cause','score']] 80 | root_causes.to_csv("final_{}_{}_root_cause.csv".format(model_name, ind),index=False) 81 | 82 | return 83 | 84 | 85 | 86 | with open('../WADI/data_segments.pkl','rb') as f: 87 | data_segments = pickle.load(f) 88 | 89 | models = ['ED', 'RCD', 'HT'] 90 | # Run all 91 | for model_name in models: 92 | for ind,segment in enumerate(data_segments): 93 | print("Now running {} for data {}.".format(model_name, ind)) 94 | rca(ind, segment,model_name) 95 | print("-------------------") 96 | -------------------------------------------------------------------------------- /Baseline/log_only/RCA_methods_log.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import preprocessing 4 | import os 5 | from sklearn.feature_selection import VarianceThreshold 6 | from sklearn.model_selection import train_test_split 7 | from causalnex.structure.notears import from_pandas 8 | from pyrca.analyzers.ht import HT, HTConfig 9 | from pyrca.analyzers.epsilon_diagnosis import EpsilonDiagnosis, EpsilonDiagnosisConfig 10 | from pyrca.analyzers.rcd import RCD, RCDConfig 11 | # from pyrca.analyzers 12 | import networkx as nx 13 | import argparse 14 | 15 | 16 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame: 17 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph()) 18 | while True: 19 | try: 20 | cycle = nx.find_cycle(G, orientation='original') 21 | G.remove_edge(*cycle[0][:2]) 22 | 23 | except nx.NetworkXNoCycle: 24 | break 25 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int) 26 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix) 27 | 28 | print("Now, the adjacency matrix does not have cycles.") 29 | return adj_matrix_no_cycles 30 | 31 | 32 | 33 | def main(args): 34 | model_name = args.model 35 | data_name = args.case 36 | metric_data = {} 37 | columns_common = {} 38 | metric_path = '../data/{}'.format(data_name) 39 | if data_name == '20220606': 40 | label = 'reviews-v3' 41 | elif data_name == '20210517' or data_name == '20210524': 42 | label = 'Book_Info_product' 43 | elif data_name == '20211203': 44 | label = 'ratings.book-info.svc.cluster.local:9080/*' 45 | elif data_name == '20240215': 46 | label = 'pod usage' 47 | elif data_name == '20240124': 48 | label = 'scenario8_app_request' 49 | elif data_name == '20231207': 50 | label = 'book_info' 51 | elif data_name == '20231221': 52 | label = 'book_info' 53 | elif data_name == '20240115': 54 | label = 'book_info' 55 | else: 56 | raise ValueError('Invalid data_name') 57 | 58 | if data_name in ['20220606', '20210517', '20210524', '20211203', '20231207']: 59 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1, 'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 60 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 61 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 62 | elif data_name in ['20231207']: 63 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 64 | 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 65 | 'log_frequency': 1} 66 | log_label = 'book_info' 
67 | elif data_name in ['20240124']: 68 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1, 'cpu_usage': 1, 'disk_used': 1, 'diskio_reads': 1, 'diskio_writes': 1, 'memory_used': 1, 69 | 'netstat_established': 1, 'swap_used': 1} 70 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 71 | elif data_name in ['20240215']: 72 | POD_METRIC_FILE = {'pod_cpu_limit': 1, 'pod_cpu_usage_total': 1, 'pod_cpu_utilization_over_pod_limit': 1, 'pod_memory_limit': 1, 73 | 'pod_memory_utilization_over_pod_limit': 1, 'pod_memory_working_set': 1, 'pod_network_rx_bytes': 1, 'pod_network_tx_bytes':1, 74 | 'log_golden_signal': 1, 'log_frequency': 1} 75 | log_label = 'book_info' 76 | elif data_name in ['20240115']: 77 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 78 | 'rate_storage_iops': 1, 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 79 | 'log_frequency': 1} 80 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 81 | elif data_name in ['20231221']: 82 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 83 | 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 'log_frequency': 1} 84 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 85 | else: 86 | raise ValueError('Invalid data_name') 87 | 88 | pathset = "./output/" 89 | if not (os.path.exists(pathset)): 90 | os.mkdir(pathset) 91 | 92 | for metric, weight in POD_METRIC_FILE.items(): 93 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']: 94 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric) 95 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 96 | # log_label = 'ratings.book-info.svc.cluster.local:9080/*' 97 | if len(metric_data[metric].keys()) == 1: 98 | if log_label != label: 99 | metric_data[metric][label] = metric_data[metric][log_label] 100 | del metric_data[metric][log_label] 101 | else: 102 | metric_data[metric][label] = metric_data[metric] 103 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name'] 104 | del metric_data[metric][label]['Node_Name'] 105 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T 106 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']] 107 | if columns_common: 108 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 109 | else: 110 | columns_common = list(metric_data[metric][label]['Pod_Name']) 111 | else: 112 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric) 113 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 114 | if columns_common: 115 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 116 | else: 117 | columns_common = list(metric_data[metric][label]['Pod_Name']) 118 | 119 | index_data = {} 120 | metric_names = [] 121 | metric_weight_assigned = [] 122 | for metric, weight in POD_METRIC_FILE.items(): 123 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common] 124 | metric_names = metric_names + [metric] 125 | metric_weight_assigned = metric_weight_assigned + [weight] 126 | 127 | metric_weight = np.zeros((len(POD_METRIC_FILE), 1)) 128 | metric_id = 0 129 | final_root_results = {} 130 | 131 | for metric in metric_names: 132 | print('For metric:', metric) 133 | data = 
metric_data[metric] 134 | X = data[label]['Sequence'] 135 | index = index_data[metric] 136 | # Preprocessing to reduce the redundant samples 137 | if X.shape[0] // 100 < 100: 138 | patch = 20 139 | else: 140 | patch = 100 141 | sample = X.shape[0] // patch 142 | X = X[:patch * sample, :] 143 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 144 | X_metric = X[:, index] 145 | X_metric = preprocessing.normalize(X_metric, axis=0, norm='l1') 146 | X = np.append(X_metric, X[:, -1].reshape(-1, 1), axis=1) 147 | columns = list(columns_common) + data[label]['KPI_Feature'] 148 | 149 | std = np.std(X[:, :-1], axis=0) 150 | idx_std = [i for i, x in enumerate(std > 1e-5) if x] 151 | if len(idx_std) == 0: 152 | metric_weight[metric_id] = 0 153 | metric_id = metric_id + 1 154 | print(metric, ' all pods are all constant or quasi-constant') 155 | continue 156 | 157 | selector = VarianceThreshold(threshold=0) 158 | X_var = selector.fit_transform(X[:, :-1]) 159 | idx = selector.get_support(indices=True) 160 | # print('X shape after variance: ', X_var.shape) 161 | if X_var.shape[1] < 1: 162 | metric_weight[metric_id] = 0 163 | metric_id = metric_id + 1 164 | print(metric, ' all pods are all constant or quasi-constant') 165 | continue 166 | 167 | mask = np.full(len(columns_common), False, dtype=bool) 168 | mask[idx] = True 169 | idx = list(idx) + [X.shape[1] - 1] 170 | X = X[:, idx] 171 | columns = [columns[i] for i in idx] 172 | X = pd.DataFrame(X, columns=columns) 173 | if model_name == 'circa': 174 | sm = from_pandas(X) 175 | estimated_matrix = nx.to_pandas_adjacency(sm) 176 | quantile_value = np.quantile(estimated_matrix.values.flatten(), 0.95) 177 | estimated_matrix = (estimated_matrix > quantile_value).astype(int) 178 | estimated_matrix = remove_cycles_from_adjacency_matrix(estimated_matrix) 179 | # estimated_matrix.to_csv("{}_adjacency.csv".format(metric)) 180 | 181 | X_train, X_test = train_test_split(X, test_size=0.6, shuffle=False) 182 | 183 | X.insert(0, 'time', pd.date_range(start='2024-01-01', periods=len(X), freq='D')) 184 | 185 | X['time'] = X['time'].astype('int64') // 1_000_000_000 186 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)] 187 | 188 | 189 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min()) 190 | 191 | if model_name == 'rcd': 192 | model = RCD(config=RCDConfig(k=3,alpha_limit=0.5)) 193 | results = model.find_root_causes(X_train, X_test).to_list() 194 | print(results) 195 | elif model_name == 'circa': 196 | model = HT(config=HTConfig(graph=estimated_matrix, root_cause_top_k=10)) 197 | model.train(X_train) 198 | results = model.find_root_causes(X_test, metric_data[metric][label]['KPI_Feature'][0], True).to_list() 199 | elif model_name == 'epsilon_diagnosis': 200 | model = EpsilonDiagnosis(config=EpsilonDiagnosisConfig(root_cause_top_k=10)) 201 | model.train(X) 202 | results = model.find_root_causes(X).to_list() 203 | else: 204 | raise ValueError('Invalid model_name') 205 | 206 | root_causes = [] 207 | for result in results: 208 | root_causes.append([result['root_cause'], result['score']]) 209 | if not os.path.exists('./{}_results'.format(model_name)): 210 | os.mkdir('./{}_results'.format(model_name)) 211 | if not os.path.exists('./{}_results/{}'.format(model_name, data_name)): 212 | os.mkdir('./{}_results/{}'.format(model_name, data_name)) 213 | 214 | root_causes = pd.DataFrame(root_causes) 215 | root_causes.columns = [['root_cause', 'score']] 216 | 
root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(model_name, data_name, metric, model_name, data_name), 217 | index=False) 218 | final_root_results[metric] = root_causes 219 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True) 220 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(model_name, data_name), index=False) 221 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(model_name, data_name)) 222 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index() 223 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False) 224 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(model_name, data_name, model_name, data_name), index=False) 225 | 226 | 227 | if __name__ == '__main__': 228 | parser = argparse.ArgumentParser(description='Baro') 229 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset") 230 | parser.add_argument("-model", type=str, default='rcd', help="model name, [rcd, circa, epsilon_diagnosis], default is rcd") 231 | parser.set_defaults(validation=True) 232 | args = parser.parse_args() 233 | main(args) 234 | 235 | 236 | 237 | 238 | 239 | 240 | -------------------------------------------------------------------------------- /Baseline/log_only/baro_main_log.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import preprocessing 4 | import os 5 | from sklearn.feature_selection import VarianceThreshold 6 | from baro_algorithm import bocpd, robust_scorer 7 | import networkx as nx 8 | import argparse 9 | 10 | 11 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame: 12 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph()) 13 | while True: 14 | try: 15 | cycle = nx.find_cycle(G, orientation='original') 16 | G.remove_edge(*cycle[0][:2]) 17 | 18 | except nx.NetworkXNoCycle: 19 | break 20 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int) 21 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix) 22 | 23 | print("Now, the adjacency matrix does not have cycles.") 24 | return adj_matrix_no_cycles 25 | 26 | 27 | def main(args): 28 | metric_data = {} 29 | columns_common = {} 30 | method = 'baro' 31 | data_name = args.case 32 | metric_path = '../data/{}'.format(data_name) 33 | if data_name == '20220606': 34 | label = 'reviews-v3' 35 | elif data_name == '20210517' or data_name == '20210524': 36 | label = 'Book_Info_product' 37 | elif data_name == '20211203': 38 | label = 'ratings.book-info.svc.cluster.local:9080/*' 39 | elif data_name == '20240215': 40 | label = 'pod usage' 41 | elif data_name == '20240124': 42 | label = 'scenario8_app_request' 43 | elif data_name == '20231207': 44 | label = 'book_info' 45 | elif data_name == '20231221': 46 | label = 'book_info' 47 | elif data_name == '20240115': 48 | label = 'book_info' 49 | else: 50 | raise ValueError('Invalid data_name') 51 | 52 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1} 53 | if data_name in ['20220606', '20210517', '20210524', '20211203', '20231207', '20240124', '20240115', '20231221']: 54 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 55 | elif data_name in ['20231207', '20240215']: 56 | log_label = 'book_info' 57 | else: 58 | raise ValueError('Invalid data_name') 59 | model_name = 'baro' 60 | 61 | pathset = "./output/" 62 | if not(os.path.exists(pathset)): 63 | 
os.mkdir(pathset) 64 | 65 | for metric, weight in POD_METRIC_FILE.items(): 66 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']: 67 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric) 68 | metric_data[metric] = np.load(metric_file,allow_pickle=True).item() 69 | if len(metric_data[metric].keys()) == 1: 70 | if log_label != label: 71 | metric_data[metric][label] = metric_data[metric][log_label] 72 | del metric_data[metric][log_label] 73 | else: 74 | metric_data[metric][label] = metric_data[metric] 75 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name'] 76 | del metric_data[metric][label]['Node_Name'] 77 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T 78 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']] 79 | if columns_common: 80 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 81 | else: 82 | columns_common = list(metric_data[metric][label]['Pod_Name']) 83 | else: 84 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric) 85 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 86 | if columns_common: 87 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 88 | else: 89 | columns_common = list(metric_data[metric][label]['Pod_Name']) 90 | 91 | 92 | index_data = {} 93 | metric_names = [] 94 | metric_weight_assigned = [] 95 | for metric, weight in POD_METRIC_FILE.items(): 96 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common] 97 | metric_names = metric_names + [metric] 98 | metric_weight_assigned = metric_weight_assigned + [weight] 99 | 100 | metric_weight = np.zeros((len(POD_METRIC_FILE),1)) 101 | metric_id = 0 102 | final_root_results = {} 103 | 104 | for metric in metric_names: 105 | print('For metric:', metric) 106 | data = metric_data[metric] 107 | X = data[label]['Sequence'] 108 | index = index_data[metric] 109 | 110 | # Preprocessing to reduce the redundant samples 111 | if X.shape[0] // 100 < 100: 112 | patch = 20 113 | else: 114 | patch = 100 115 | sample = X.shape[0]//patch 116 | X = X[:patch*sample,:] 117 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 118 | X_metric = X[:, index] 119 | X_metric = preprocessing.normalize(X_metric, axis=0, norm = 'l1') 120 | X = np.append(X_metric, X[:, -1].reshape(-1,1), axis=1) 121 | columns = list(columns_common) + data[label]['KPI_Feature'] 122 | 123 | 124 | std = np.std(X[:, :-1], axis=0) 125 | idx_std = [i for i, x in enumerate(std > 1e-5) if x] 126 | if len(idx_std) == 0: 127 | metric_weight[metric_id] = 0 128 | metric_id = metric_id + 1 129 | print(metric,' all pods are all constant or quasi-constant') 130 | continue 131 | 132 | selector = VarianceThreshold(threshold = 0) 133 | X_var = selector.fit_transform(X[:, :-1]) 134 | idx = selector.get_support(indices = True) 135 | #print('X shape after variance: ', X_var.shape) 136 | if X_var.shape[1] < 1: 137 | metric_weight[metric_id] = 0 138 | metric_id = metric_id + 1 139 | print(metric,' all pods are all constant or quasi-constant') 140 | continue 141 | 142 | # causal_score = np.zeros(len(columns_common)) 143 | mask = np.full(len(columns_common), False,dtype=bool) 144 | mask[idx] = True 145 | idx = list(idx) + [X.shape[1]-1] 146 | X = X[:, idx] 147 | columns = [columns[i] for i in idx] 148 | X = pd.DataFrame(X,columns=columns) 149 | 150 | X.insert(0,'time', 
pd.date_range(start='2024-01-01', periods=len(X), freq='D')) 151 | 152 | X['time'] = X['time'].astype('int64') // 1_000_000_000 153 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)] 154 | 155 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min()) 156 | 157 | anomalies = bocpd(X) 158 | print("Anomalies are detected at timestep:", anomalies[0]) 159 | results = robust_scorer(X,anomalies=anomalies) 160 | print(results) 161 | 162 | root_causes = [] 163 | for result in results: 164 | (root_cause, score) = result 165 | root_causes.append([root_cause, score]) 166 | if not os.path.exists('./{}_results'.format(method)): 167 | os.mkdir('./{}_results'.format(method)) 168 | if not os.path.exists('./{}_results/{}'.format(method, data_name)): 169 | os.mkdir('./{}_results/{}'.format(method, data_name)) 170 | 171 | root_causes = pd.DataFrame(root_causes) 172 | root_causes.columns = [['root_cause', 'score']] 173 | root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(method, data_name, metric, model_name, data_name),index=False) 174 | 175 | final_root_results[metric] = root_causes 176 | 177 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True) 178 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(method, data_name), index=False) 179 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(method, data_name)) 180 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index() 181 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False) 182 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(method, data_name, model_name, data_name), index=False) 183 | 184 | 185 | if __name__ == '__main__': 186 | parser = argparse.ArgumentParser(description='Baro') 187 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset") 188 | parser.set_defaults(validation=True) 189 | args = parser.parse_args() 190 | main(args) 191 | -------------------------------------------------------------------------------- /Baseline/metric_only/RCA_methods_metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import preprocessing 4 | import os 5 | from sklearn.feature_selection import VarianceThreshold 6 | from sklearn.model_selection import train_test_split 7 | from causalnex.structure.notears import from_pandas 8 | from pyrca.analyzers.ht import HT, HTConfig 9 | from pyrca.analyzers.epsilon_diagnosis import EpsilonDiagnosis, EpsilonDiagnosisConfig 10 | from pyrca.analyzers.rcd import RCD, RCDConfig 11 | # from pyrca.analyzers 12 | import networkx as nx 13 | import argparse 14 | 15 | 16 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame: 17 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph()) 18 | while True: 19 | try: 20 | cycle = nx.find_cycle(G, orientation='original') 21 | G.remove_edge(*cycle[0][:2]) 22 | 23 | except nx.NetworkXNoCycle: 24 | break 25 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int) 26 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix) 27 | 28 | print("Now, the adjacency matrix does not have cycles.") 29 | return adj_matrix_no_cycles 30 | 31 | 32 | 33 | def main(args): 34 | model_name = args.model 35 | data_name = args.case 36 | metric_data = {} 37 | columns_common = {} 38 | metric_path = 
'../data/{}'.format(data_name) 39 | if data_name == '20220606': 40 | label = 'reviews-v3' 41 | elif data_name == '20210517' or data_name == '20210524': 42 | label = 'Book_Info_product' 43 | elif data_name == '20211203': 44 | label = 'ratings.book-info.svc.cluster.local:9080/*' 45 | elif data_name == '20240215': 46 | label = 'pod usage' 47 | elif data_name == '20240124': 48 | label = 'scenario8_app_request' 49 | elif data_name == '20231207': 50 | label = 'book_info' 51 | elif data_name == '20231221': 52 | label = 'book_info' 53 | elif data_name == '20240115': 54 | label = 'book_info' 55 | else: 56 | raise ValueError('Invalid data_name') 57 | 58 | if data_name in ['20220606', '20210517', '20210524', '20211203', '20231207']: 59 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 60 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 61 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 62 | elif data_name in ['20231207']: 63 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 64 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 65 | log_label = 'book_info' 66 | elif data_name in ['20240124']: 67 | POD_METRIC_FILE = {'cpu_usage': 1, 'disk_used': 1, 'diskio_reads': 1, 'diskio_writes': 1, 'memory_used': 1, 68 | 'netstat_established': 1, 'swap_used': 1} 69 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 70 | elif data_name in ['20240215']: 71 | POD_METRIC_FILE = {'pod_cpu_limit': 1, 'pod_cpu_usage_total': 1, 'pod_cpu_utilization_over_pod_limit': 1, 'pod_memory_limit': 1, 72 | 'pod_memory_utilization_over_pod_limit': 1, 'pod_memory_working_set': 1, 'pod_network_rx_bytes': 1, 'pod_network_tx_bytes':1} 73 | log_label = 'book_info' 74 | elif data_name in ['20240115']: 75 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 76 | 'rate_storage_iops': 1, 'received_bandwidth': 1, 'transmit_bandwidth': 1} 77 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 78 | elif data_name in ['20231221']: 79 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 80 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 81 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 82 | else: 83 | raise ValueError('Invalid data_name') 84 | 85 | pathset = "./output/" 86 | if not (os.path.exists(pathset)): 87 | os.mkdir(pathset) 88 | 89 | for metric, weight in POD_METRIC_FILE.items(): 90 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']: 91 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric) 92 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 93 | # log_label = 'ratings.book-info.svc.cluster.local:9080/*' 94 | if len(metric_data[metric].keys()) == 1: 95 | if log_label != label: 96 | metric_data[metric][label] = metric_data[metric][log_label] 97 | del metric_data[metric][log_label] 98 | else: 99 | metric_data[metric][label] = metric_data[metric] 100 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name'] 101 | del metric_data[metric][label]['Node_Name'] 102 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T 103 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']] 104 | if columns_common: 105 | columns_common = 
list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 106 | else: 107 | columns_common = list(metric_data[metric][label]['Pod_Name']) 108 | else: 109 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric) 110 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 111 | if columns_common: 112 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 113 | else: 114 | columns_common = list(metric_data[metric][label]['Pod_Name']) 115 | 116 | index_data = {} 117 | metric_names = [] 118 | metric_weight_assigned = [] 119 | for metric, weight in POD_METRIC_FILE.items(): 120 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common] 121 | metric_names = metric_names + [metric] 122 | metric_weight_assigned = metric_weight_assigned + [weight] 123 | 124 | metric_weight = np.zeros((len(POD_METRIC_FILE), 1)) 125 | metric_id = 0 126 | final_root_results = {} 127 | 128 | for metric in metric_names: 129 | print('For metric:', metric) 130 | data = metric_data[metric] 131 | X = data[label]['Sequence'] 132 | index = index_data[metric] 133 | # Preprocessing to reduce the redundant samples 134 | if X.shape[0] // 100 < 100: 135 | patch = 20 136 | else: 137 | patch = 100 138 | sample = X.shape[0] // patch 139 | X = X[:patch * sample, :] 140 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 141 | X_metric = X[:, index] 142 | X_metric = preprocessing.normalize(X_metric, axis=0, norm='l1') 143 | X = np.append(X_metric, X[:, -1].reshape(-1, 1), axis=1) 144 | columns = list(columns_common) + data[label]['KPI_Feature'] 145 | 146 | std = np.std(X[:, :-1], axis=0) 147 | idx_std = [i for i, x in enumerate(std > 1e-5) if x] 148 | if len(idx_std) == 0: 149 | metric_weight[metric_id] = 0 150 | metric_id = metric_id + 1 151 | print(metric, ' all pods are all constant or quasi-constant') 152 | continue 153 | 154 | selector = VarianceThreshold(threshold=0) 155 | X_var = selector.fit_transform(X[:, :-1]) 156 | idx = selector.get_support(indices=True) 157 | # print('X shape after variance: ', X_var.shape) 158 | if X_var.shape[1] < 1: 159 | metric_weight[metric_id] = 0 160 | metric_id = metric_id + 1 161 | print(metric, ' all pods are all constant or quasi-constant') 162 | continue 163 | 164 | mask = np.full(len(columns_common), False, dtype=bool) 165 | mask[idx] = True 166 | idx = list(idx) + [X.shape[1] - 1] 167 | X = X[:, idx] 168 | columns = [columns[i] for i in idx] 169 | X = pd.DataFrame(X, columns=columns) 170 | if model_name == 'circa': 171 | sm = from_pandas(X) 172 | estimated_matrix = nx.to_pandas_adjacency(sm) 173 | quantile_value = np.quantile(estimated_matrix.values.flatten(), 0.95) 174 | estimated_matrix = (estimated_matrix > quantile_value).astype(int) 175 | estimated_matrix = remove_cycles_from_adjacency_matrix(estimated_matrix) 176 | # estimated_matrix.to_csv("{}_adjacency.csv".format(metric)) 177 | 178 | X_train, X_test = train_test_split(X, test_size=0.6, shuffle=False) 179 | 180 | X.insert(0, 'time', pd.date_range(start='2024-01-01', periods=len(X), freq='D')) 181 | 182 | X['time'] = X['time'].astype('int64') // 1_000_000_000 183 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)] 184 | 185 | 186 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min()) 187 | 188 | if model_name == 'rcd': 189 | model = RCD(config=RCDConfig(k=3,alpha_limit=0.5)) 190 | results = 
model.find_root_causes(X_train, X_test).to_list() 191 | print(results) 192 | elif model_name == 'circa': 193 | model = HT(config=HTConfig(graph=estimated_matrix, root_cause_top_k=10)) 194 | model.train(X_train) 195 | results = model.find_root_causes(X_test, metric_data[metric][label]['KPI_Feature'][0], True).to_list() 196 | elif model_name == 'epsilon_diagnosis': 197 | model = EpsilonDiagnosis(config=EpsilonDiagnosisConfig(root_cause_top_k=10)) 198 | model.train(X) 199 | results = model.find_root_causes(X).to_list() 200 | else: 201 | raise ValueError('Invalid model_name') 202 | 203 | root_causes = [] 204 | for result in results: 205 | root_causes.append([result['root_cause'], result['score']]) 206 | if not os.path.exists('./{}_results'.format(model_name)): 207 | os.mkdir('./{}_results'.format(model_name)) 208 | if not os.path.exists('./{}_results/{}'.format(model_name, data_name)): 209 | os.mkdir('./{}_results/{}'.format(model_name, data_name)) 210 | 211 | root_causes = pd.DataFrame(root_causes) 212 | root_causes.columns = [['root_cause', 'score']] 213 | root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(model_name, data_name, metric, model_name, data_name), 214 | index=False) 215 | final_root_results[metric] = root_causes 216 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True) 217 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(model_name, data_name), index=False) 218 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(model_name, data_name)) 219 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index() 220 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False) 221 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(model_name, data_name, model_name, data_name), index=False) 222 | 223 | 224 | if __name__ == '__main__': 225 | parser = argparse.ArgumentParser(description='Baro') 226 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset") 227 | parser.add_argument("-model", type=str, default='rcd', help="model name, [rcd, circa, epsilon_diagnosis], default is rcd") 228 | parser.set_defaults(validation=True) 229 | args = parser.parse_args() 230 | main(args) 231 | 232 | 233 | 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /Baseline/metric_only/baro_main_metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import preprocessing 4 | import os 5 | from sklearn.feature_selection import VarianceThreshold 6 | from baro_algorithm import bocpd, robust_scorer 7 | import networkx as nx 8 | import argparse 9 | 10 | 11 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame: 12 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph()) 13 | while True: 14 | try: 15 | cycle = nx.find_cycle(G, orientation='original') 16 | G.remove_edge(*cycle[0][:2]) 17 | 18 | except nx.NetworkXNoCycle: 19 | break 20 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int) 21 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix) 22 | 23 | print("Now, the adjacency matrix does not have cycles.") 24 | return adj_matrix_no_cycles 25 | 26 | 27 | def main(args): 28 | metric_data = {} 29 | columns_common = {} 30 | method = 'baro' 31 | data_name = args.case 32 | metric_path = '../data/{}'.format(data_name) 33 | if data_name == 
'20220606': 34 | label = 'reviews-v3' 35 | elif data_name == '20210517' or data_name == '20210524': 36 | label = 'Book_Info_product' 37 | elif data_name == '20211203': 38 | label = 'ratings.book-info.svc.cluster.local:9080/*' 39 | elif data_name == '20240215': 40 | label = 'pod usage' 41 | elif data_name == '20240124': 42 | label = 'scenario8_app_request' 43 | elif data_name == '20231207': 44 | label = 'book_info' 45 | elif data_name == '20231221': 46 | label = 'book_info' 47 | elif data_name == '20240115': 48 | label = 'book_info' 49 | else: 50 | raise ValueError('Invalid data_name') 51 | 52 | if data_name in ['20220606', '20210517', '20210524', '20211203', '20231207']: 53 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 54 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 55 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 56 | elif data_name in ['20231207']: 57 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 58 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 59 | log_label = 'book_info' 60 | elif data_name in ['20240124']: 61 | POD_METRIC_FILE = {'cpu_usage': 1, 'disk_used': 1, 'diskio_reads': 1, 'diskio_writes': 1, 'memory_used': 1, 62 | 'netstat_established': 1, 'swap_used': 1} 63 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 64 | elif data_name in ['20240215']: 65 | POD_METRIC_FILE = {'pod_cpu_limit': 1, 'pod_cpu_usage_total': 1, 'pod_cpu_utilization_over_pod_limit': 1, 'pod_memory_limit': 1, 66 | 'pod_memory_utilization_over_pod_limit': 1, 'pod_memory_working_set': 1, 'pod_network_rx_bytes': 1, 'pod_network_tx_bytes':1} 67 | log_label = 'book_info' 68 | elif data_name in ['20240115']: 69 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 70 | 'rate_storage_iops': 1, 'received_bandwidth': 1, 'transmit_bandwidth': 1} 71 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 72 | elif data_name in ['20231221']: 73 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 74 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 75 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 76 | else: 77 | raise ValueError('Invalid data_name') 78 | model_name = 'baro' 79 | 80 | pathset = "./output/" 81 | if not(os.path.exists(pathset)): 82 | os.mkdir(pathset) 83 | 84 | for metric, weight in POD_METRIC_FILE.items(): 85 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']: 86 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric) 87 | metric_data[metric] = np.load(metric_file,allow_pickle=True).item() 88 | if len(metric_data[metric].keys()) == 1: 89 | if log_label != label: 90 | metric_data[metric][label] = metric_data[metric][log_label] 91 | del metric_data[metric][log_label] 92 | else: 93 | metric_data[metric][label] = metric_data[metric] 94 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name'] 95 | del metric_data[metric][label]['Node_Name'] 96 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T 97 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']] 98 | if columns_common: 99 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 100 | else: 101 | columns_common = 
list(metric_data[metric][label]['Pod_Name']) 102 | else: 103 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric) 104 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 105 | if columns_common: 106 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 107 | else: 108 | columns_common = list(metric_data[metric][label]['Pod_Name']) 109 | 110 | 111 | index_data = {} 112 | metric_names = [] 113 | metric_weight_assigned = [] 114 | for metric, weight in POD_METRIC_FILE.items(): 115 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common] 116 | metric_names = metric_names + [metric] 117 | metric_weight_assigned = metric_weight_assigned + [weight] 118 | 119 | metric_weight = np.zeros((len(POD_METRIC_FILE),1)) 120 | metric_id = 0 121 | final_root_results = {} 122 | 123 | for metric in metric_names: 124 | print('For metric:', metric) 125 | data = metric_data[metric] 126 | X = data[label]['Sequence'] 127 | index = index_data[metric] 128 | 129 | # Preprocessing to reduce the redundant samples 130 | if X.shape[0] // 100 < 100: 131 | patch = 20 132 | else: 133 | patch = 100 134 | sample = X.shape[0]//patch 135 | X = X[:patch*sample,:] 136 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 137 | X_metric = X[:, index] 138 | X_metric = preprocessing.normalize(X_metric, axis=0, norm = 'l1') 139 | X = np.append(X_metric, X[:, -1].reshape(-1,1), axis=1) 140 | columns = list(columns_common) + data[label]['KPI_Feature'] 141 | 142 | 143 | std = np.std(X[:, :-1], axis=0) 144 | idx_std = [i for i, x in enumerate(std > 1e-5) if x] 145 | if len(idx_std) == 0: 146 | metric_weight[metric_id] = 0 147 | metric_id = metric_id + 1 148 | print(metric,' all pods are all constant or quasi-constant') 149 | continue 150 | 151 | selector = VarianceThreshold(threshold = 0) 152 | X_var = selector.fit_transform(X[:, :-1]) 153 | idx = selector.get_support(indices = True) 154 | #print('X shape after variance: ', X_var.shape) 155 | if X_var.shape[1] < 1: 156 | metric_weight[metric_id] = 0 157 | metric_id = metric_id + 1 158 | print(metric,' all pods are all constant or quasi-constant') 159 | continue 160 | 161 | # causal_score = np.zeros(len(columns_common)) 162 | mask = np.full(len(columns_common), False,dtype=bool) 163 | mask[idx] = True 164 | idx = list(idx) + [X.shape[1]-1] 165 | X = X[:, idx] 166 | columns = [columns[i] for i in idx] 167 | X = pd.DataFrame(X,columns=columns) 168 | 169 | X.insert(0,'time', pd.date_range(start='2024-01-01', periods=len(X), freq='D')) 170 | 171 | X['time'] = X['time'].astype('int64') // 1_000_000_000 172 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)] 173 | 174 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min()) 175 | 176 | anomalies = bocpd(X) 177 | print("Anomalies are detected at timestep:", anomalies[0]) 178 | results = robust_scorer(X,anomalies=anomalies) 179 | print(results) 180 | 181 | root_causes = [] 182 | for result in results: 183 | (root_cause, score) = result 184 | root_causes.append([root_cause, score]) 185 | if not os.path.exists('./{}_results'.format(method)): 186 | os.mkdir('./{}_results'.format(method)) 187 | if not os.path.exists('./{}_results/{}'.format(method, data_name)): 188 | os.mkdir('./{}_results/{}'.format(method, data_name)) 189 | 190 | root_causes = pd.DataFrame(root_causes) 191 | root_causes.columns = [['root_cause', 'score']] 192 | 
root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(method, data_name, metric, model_name, data_name),index=False) 193 | 194 | final_root_results[metric] = root_causes 195 | 196 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True) 197 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(method, data_name), index=False) 198 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(method, data_name)) 199 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index() 200 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False) 201 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(method, data_name, model_name, data_name), index=False) 202 | 203 | 204 | if __name__ == '__main__': 205 | parser = argparse.ArgumentParser(description='Baro') 206 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset") 207 | parser.set_defaults(validation=True) 208 | args = parser.parse_args() 209 | main(args) 210 | -------------------------------------------------------------------------------- /Baseline/multimodal/baro_main_combined.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import preprocessing 4 | import os 5 | from sklearn.feature_selection import VarianceThreshold 6 | from baro_algorithm import bocpd, robust_scorer 7 | import networkx as nx 8 | import argparse 9 | 10 | 11 | def remove_cycles_from_adjacency_matrix(adj_matrix: pd.DataFrame) -> pd.DataFrame: 12 | G = nx.from_pandas_adjacency(adj_matrix, create_using=nx.DiGraph()) 13 | while True: 14 | try: 15 | cycle = nx.find_cycle(G, orientation='original') 16 | G.remove_edge(*cycle[0][:2]) 17 | 18 | except nx.NetworkXNoCycle: 19 | break 20 | adj_matrix_no_cycles = nx.to_pandas_adjacency(G, dtype=int) 21 | adj_matrix_no_cycles = adj_matrix_no_cycles.reindex_like(adj_matrix) 22 | 23 | print("Now, the adjacency matrix does not have cycles.") 24 | return adj_matrix_no_cycles 25 | 26 | 27 | def main(args): 28 | metric_data = {} 29 | columns_common = {} 30 | method = 'baro' 31 | data_name = args.case 32 | metric_path = '../data/{}'.format(data_name) 33 | if data_name == '20220606': 34 | label = 'reviews-v3' 35 | elif data_name == '20210517' or data_name == '20210524': 36 | label = 'Book_Info_product' 37 | elif data_name == '20211203': 38 | label = 'ratings.book-info.svc.cluster.local:9080/*' 39 | elif data_name == '20240215': 40 | label = 'pod usage' 41 | elif data_name == '20240124': 42 | label = 'scenario8_app_request' 43 | elif data_name == '20231207': 44 | label = 'book_info' 45 | elif data_name == '20231221': 46 | label = 'book_info' 47 | elif data_name == '20240115': 48 | label = 'book_info' 49 | else: 50 | raise ValueError('Invalid data_name') 51 | 52 | if data_name in ['20220606', '20210517', '20210524', '20211203', '20231207']: 53 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1, 'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 54 | 'received_bandwidth': 1, 'transmit_bandwidth': 1} 55 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 56 | elif data_name in ['20231207']: 57 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 58 | 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 59 | 'log_frequency': 1} 60 | 
log_label = 'book_info' 61 | elif data_name in ['20240124']: 62 | POD_METRIC_FILE = {'log_golden_signal': 1, 'log_frequency': 1, 'cpu_usage': 1, 'disk_used': 1, 'diskio_reads': 1, 'diskio_writes': 1, 'memory_used': 1, 63 | 'netstat_established': 1, 'swap_used': 1} 64 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 65 | elif data_name in ['20240215']: 66 | POD_METRIC_FILE = {'pod_cpu_limit': 1, 'pod_cpu_usage_total': 1, 'pod_cpu_utilization_over_pod_limit': 1, 'pod_memory_limit': 1, 67 | 'pod_memory_utilization_over_pod_limit': 1, 'pod_memory_working_set': 1, 'pod_network_rx_bytes': 1, 'pod_network_tx_bytes':1, 68 | 'log_golden_signal': 1, 'log_frequency': 1} 69 | log_label = 'book_info' 70 | elif data_name in ['20240115']: 71 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 72 | 'rate_storage_iops': 1, 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 73 | 'log_frequency': 1} 74 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 75 | elif data_name in ['20231221']: 76 | POD_METRIC_FILE = {'cpu_usage': 1, 'memory_usage': 1, 'rate_received_packets': 1, 'rate_transmitted_packets': 1, 'rate_storage_iops': 1, 77 | 'received_bandwidth': 1, 'transmit_bandwidth': 1, 'log_golden_signal': 1, 'log_frequency': 1} 78 | log_label = 'ratings.book-info.svc.cluster.local:9080/*' 79 | else: 80 | raise ValueError('Invalid data_name') 81 | model_name = 'baro' 82 | 83 | pathset = "./output/" 84 | if not(os.path.exists(pathset)): 85 | os.mkdir(pathset) 86 | 87 | for metric, weight in POD_METRIC_FILE.items(): 88 | if metric in ['log_PCA', 'log_golden_signal', 'log_frequency']: 89 | metric_file = '{}/pod_level_{}.npy'.format(metric_path, metric) 90 | metric_data[metric] = np.load(metric_file,allow_pickle=True).item() 91 | if len(metric_data[metric].keys()) == 1: 92 | if log_label != label: 93 | metric_data[metric][label] = metric_data[metric][log_label] 94 | del metric_data[metric][log_label] 95 | else: 96 | metric_data[metric][label] = metric_data[metric] 97 | metric_data[metric][label]['Pod_Name'] = metric_data[metric][label]['Node_Name'] 98 | del metric_data[metric][label]['Node_Name'] 99 | metric_data[metric][label]['Sequence'] = metric_data[metric][label]['Sequence'].squeeze().T 100 | metric_data[metric][label]['KPI_Feature'] = [metric_data[metric][label]['KPI_Feature']] 101 | if columns_common: 102 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 103 | else: 104 | columns_common = list(metric_data[metric][label]['Pod_Name']) 105 | else: 106 | metric_file = '{}/pod_level_data_{}.npy'.format(metric_path, metric) 107 | metric_data[metric] = np.load(metric_file, allow_pickle=True).item() 108 | if columns_common: 109 | columns_common = list(set(metric_data[metric][label]['Pod_Name']).intersection(columns_common)) 110 | else: 111 | columns_common = list(metric_data[metric][label]['Pod_Name']) 112 | 113 | 114 | index_data = {} 115 | metric_names = [] 116 | metric_weight_assigned = [] 117 | for metric, weight in POD_METRIC_FILE.items(): 118 | index_data[metric] = [metric_data[metric][label]['Pod_Name'].index(x) for x in columns_common] 119 | metric_names = metric_names + [metric] 120 | metric_weight_assigned = metric_weight_assigned + [weight] 121 | 122 | metric_weight = np.zeros((len(POD_METRIC_FILE),1)) 123 | metric_id = 0 124 | final_root_results = {} 125 | 126 | for metric in metric_names: 127 | print('For metric:', metric) 128 | data = metric_data[metric] 
129 | X = data[label]['Sequence'] 130 | index = index_data[metric] 131 | 132 | # Preprocessing to reduce the redundant samples 133 | if X.shape[0] // 100 < 100: 134 | patch = 20 135 | else: 136 | patch = 100 137 | sample = X.shape[0]//patch 138 | X = X[:patch*sample,:] 139 | X = np.sum(X.reshape((-1, patch, X.shape[1])), axis=1) 140 | X_metric = X[:, index] 141 | X_metric = preprocessing.normalize(X_metric, axis=0, norm = 'l1') 142 | X = np.append(X_metric, X[:, -1].reshape(-1,1), axis=1) 143 | columns = list(columns_common) + data[label]['KPI_Feature'] 144 | 145 | 146 | std = np.std(X[:, :-1], axis=0) 147 | idx_std = [i for i, x in enumerate(std > 1e-5) if x] 148 | if len(idx_std) == 0: 149 | metric_weight[metric_id] = 0 150 | metric_id = metric_id + 1 151 | print(metric,' all pods are all constant or quasi-constant') 152 | continue 153 | 154 | selector = VarianceThreshold(threshold = 0) 155 | X_var = selector.fit_transform(X[:, :-1]) 156 | idx = selector.get_support(indices = True) 157 | #print('X shape after variance: ', X_var.shape) 158 | if X_var.shape[1] < 1: 159 | metric_weight[metric_id] = 0 160 | metric_id = metric_id + 1 161 | print(metric,' all pods are all constant or quasi-constant') 162 | continue 163 | 164 | # causal_score = np.zeros(len(columns_common)) 165 | mask = np.full(len(columns_common), False,dtype=bool) 166 | mask[idx] = True 167 | idx = list(idx) + [X.shape[1]-1] 168 | X = X[:, idx] 169 | columns = [columns[i] for i in idx] 170 | X = pd.DataFrame(X,columns=columns) 171 | 172 | X.insert(0,'time', pd.date_range(start='2024-01-01', periods=len(X), freq='D')) 173 | 174 | X['time'] = X['time'].astype('int64') // 1_000_000_000 175 | X.columns = [f"{col}_cpu" if i < 9 else f"{col}_memory" for i, col in enumerate(X.columns)] 176 | 177 | X.iloc[:, 1:] = (X.iloc[:, 1:] - X.iloc[:, 1:].min()) / (X.iloc[:, 1:].max() - X.iloc[:, 1:].min()) 178 | 179 | anomalies = bocpd(X) 180 | print("Anomalies are detected at timestep:", anomalies[0]) 181 | results = robust_scorer(X,anomalies=anomalies) 182 | print(results) 183 | 184 | root_causes = [] 185 | for result in results: 186 | (root_cause, score) = result 187 | root_causes.append([root_cause, score]) 188 | if not os.path.exists('./{}_results'.format(method)): 189 | os.mkdir('./{}_results'.format(method)) 190 | if not os.path.exists('./{}_results/{}'.format(method, data_name)): 191 | os.mkdir('./{}_results/{}'.format(method, data_name)) 192 | 193 | root_causes = pd.DataFrame(root_causes) 194 | root_causes.columns = [['root_cause', 'score']] 195 | root_causes.to_csv("./{}_results/{}/{}_{}_{}_root_cause.csv".format(method, data_name, metric, model_name, data_name),index=False) 196 | 197 | final_root_results[metric] = root_causes 198 | 199 | concatenated_df = pd.concat(final_root_results.values(), ignore_index=True) 200 | concatenated_df.to_csv("./{}_results/{}/concated_df.csv".format(method, data_name), index=False) 201 | concatenated_df = pd.read_csv("./{}_results/{}/concated_df.csv".format(method, data_name)) 202 | aggregated_df = concatenated_df.groupby('root_cause')['score'].sum().reset_index() 203 | aggregated_df = aggregated_df.sort_values(by='score', ascending=False) 204 | aggregated_df.to_csv("./{}_results/{}/final_{}_{}_root_cause.csv".format(method, data_name, model_name, data_name), index=False) 205 | 206 | 207 | if __name__ == '__main__': 208 | parser = argparse.ArgumentParser(description='Baro') 209 | parser.add_argument("-case", type=str, default='20240115', help="case of the dataset") 210 | 
parser.set_defaults(validation=True) 211 | args = parser.parse_args() 212 | main(args) 213 | -------------------------------------------------------------------------------- /Crossiant_Metadata/Crossiant_Metadata_Cloud_Computing_Original.json: -------------------------------------------------------------------------------- 1 | { 2 | "_id": "6664dbf513d2f73a727d47ff", 3 | "id": "Lemma-RCA-NEC/Cloud_Computing_Original", 4 | "author": "Lemma-RCA-NEC", 5 | "sha": "78ec9604fd0446d875175650c99acf30c95158c2", 6 | "lastModified": "2024-06-09T03:19:45.000Z", 7 | "private": false, 8 | "gated": false, 9 | "disabled": false, 10 | "tags": [ 11 | "task_categories:time-series-forecasting", 12 | "size_categories:100M" 12 | DRAIN: 13 | # Similarity threshold 14 | sim_th: 0.4 15 | # Depth of all leaf nodes 16 | depth: 4 17 | max_children: 100 18 | max_clusters: 1024 19 | #extra_delimiters: ["_"] 20 | extra_delimiters: "[]" 21 | PROFILING: 22 | enabled: False 23 | report_sec: 30 24 | EXTRA: 25 | input_file_name: 26 | - "*messages" 27 | out_dir: "./drain32_result" 28 | log_format: "