├── .gitignore
├── LICENSE
├── README.md
├── columns.json
├── requirements.txt
├── run_algorithm.py
├── run_evaluation.py
├── run_experiment.sh
├── squeeze
│   ├── __init__.py
│   ├── anomaly_amount_fileter.py
│   ├── clustering
│   │   ├── __init__.py
│   │   ├── cluster.py
│   │   └── density_cluster.py
│   ├── squeeze.py
│   └── squeeze_option.py
└── utility
    ├── __init__.py
    └── attribute_combination.py
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | output/
3 | B0/
4 | B1/
5 | B2/
6 | B3/
7 | B4/
8 | A/
9 | D/
10 | B0.json
11 | B1.json
12 | B2.json
13 | B3.json
14 | B4.json
15 | A.json
16 | D.json
17 | B0
18 | B1
19 | B2
20 | B3
21 | B4
22 | A
23 | D
24 | .DS_Store
25 | .venv/
26 | output.csv
27 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 NetManAIOps
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Squeeze
2 | Implementation and datasets for the ISSRE 2019 REG paper 'Generic and Robust Localization of Multi-Dimensional Root Causes'.
3 |
4 | ## Requirements
5 | Python `>=3.6, <3.7` is required. Although newer Python versions should be backward compatible, some pinned requirements (e.g., SciPy) do not provide prebuilt wheels for them.
6 | ``` bash
7 | pip install -r requirements.txt
8 | ```
9 |
10 | ## Datasets
11 |
12 | Datasets `A, B0, B1, B2, B3, B4, D` in Table VII are on [Zenodo](https://zenodo.org/record/8153367) (updated on 2023-07-17).
13 | The ground truth root cause sets are in `injection_info.csv` in each subfolder.
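
For reference, `injection_info.csv` provides at least the columns read by `run_evaluation.py`: `timestamp`, `set` (the ground-truth root-cause set, in the same `a=a1&b=b11;...` format as the algorithm output), and `significance`. A made-up illustration:

```
timestamp,set,significance
1450653900,b=b31&d=d2;a=a1&b=b11,0.5
1450666800,b=b21&c=c1,0.3
```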
14 |
15 | ## Usage
16 |
17 | ```
18 | $ python run_algorithm.py --help
19 | Usage: run_algorithm.py [OPTIONS]
20 |
21 | :param name: :param input_path: :param output_path: :param num_workers:
22 | :param kwargs: :return:
23 |
24 | Options:
25 | --name TEXT name of this setting
26 | --input-path TEXT will read data from {input_path}/{name}
27 | --output-path TEXT if {output_path} is a dir, save to
28 | {output_path}/{name}.json; otherwise save to
29 | {output_path}
30 | --num-workers INTEGER num of processes
31 | --derived means we should read {timestamp}.a.csv and
32 | {timestamp}.b.csv
33 | --help Show this message and exit.
34 | ```
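
Each input file `{input_path}/{name}/{timestamp}.csv` is expected to contain one column per attribute plus `real` and `predict` columns (all other columns are treated as attributes, see `Squeeze.__init__`); with `--derived`, the same layout is read from `{timestamp}.a.csv` and `{timestamp}.b.csv`. A made-up example with attributes `a, b, c, d`:

```
a,b,c,d,real,predict
a1,b11,c1,d1,120.0,100.0
a2,b21,c2,d2,98.0,100.0
```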
35 |
36 | ```
37 | $ python run_evaluation.py --help
38 | Usage: run_evaluation.py [OPTIONS]
39 |
40 | Options:
41 | -i, --injection-info TEXT injection_info.csv file
42 | -p, --predict TEXT output json file
43 | -c, --config TEXT config json file
44 | -o, --output-path TEXT output path
45 | --help Show this message and exit.
46 | ```
47 |
48 | The config json file should contain the attribute names, e.g.:
49 |
50 | ```
51 | {
52 | "columns": [
53 | "a", "b", "c", "d"
54 | ]
55 | }
56 | ```
57 |
58 |
59 |
60 | ## Example
61 |
62 | 1. Download `B3.tgz` and extract it into `B3`.
63 |
64 | 2. Run this command:
65 |
66 | ```
67 | python run_algorithm.py --name B_cuboid_layer_2_n_ele_2 --input-path B3 --output-path output/ --num-workers 10
68 | ```
69 |
70 | Then the results are summarized in `output/B_cuboid_layer_2_n_ele_2.json`:
71 |
72 | ```json
73 | [
74 | {
75 | "timestamp": 1450653900,
76 | "elapsed_time": 10.794443607330322,
77 | "root_cause": "b=b31&d=d2;a=a1&b=b11"
78 | },
79 | {
80 | "timestamp": 1450666800,
81 | "elapsed_time": 15.272005081176758,
82 | "root_cause": "b=b21&c=c1;a=a4&b=b9&c=c4"
83 | },
84 | {
85 | "timestamp": 1450667700,
86 | "elapsed_time": 15.22673487663269,
87 | "root_cause": "b=b11&c=c4;a=a2&d=d1"
88 | },
89 | ...
90 | ]
91 | ```
92 |
93 | 3. Run the evaluation script:
94 |
95 | ``` bash
96 | python run_evaluation.py -i B3/B_cuboid_layer_2_n_ele_2/injection_info.csv -p output/B_cuboid_layer_2_n_ele_2.json -c columns.json
97 | ```
98 |
99 | `columns.json` should contain all the attribute names:
100 |
101 | ```
102 | {
103 | "columns": [
104 | "a", "b", "c", "d"
105 | ]
106 | }
107 | ```
108 |
109 | Then we get the output (F1-score, precision, recall):
110 |
111 | ```
112 | ......
113 | 0.7858942065491183 0.7918781725888325 0.78
114 | ```
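
These three numbers are micro-averaged over all timestamps (see `run_evaluation.py`): with TP, FP, and FN summed across cases, `F1-score = 2*TP / (2*TP + FP + FN)`, `precision = TP / (TP + FP)`, and `recall = TP / (TP + FN)`. The same totals are also written to `output.csv` by default (`-o`).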
115 |
116 | ## Known Issues
117 | This version of the code is faithful to the published version.
118 | However, two known severe issues harm the localization performance.
119 | 1. The calculation of `_a1` and `_a2` in `squeeze/squeeze.py:184` is incorrect: it does not follow the description in the paper.
120 | It should be corrected as follows:
121 | ``` python
122 | reduced_data_p, _ = self.get_derived_dataframe(
123 | frozenset(elements[:partition]), cuboid=cuboid,
124 | reduction="sum", return_complement=True,
125 | subset_indices=np.concatenate([indices, self.normal_indices]))
126 | if len(reduced_data_p):
127 | _a1, _a2 = data_p.predict.values * (
128 | reduced_data_p.real.item() / reduced_data_p.predict.item()
129 | ), data_n.predict.values
130 | else:
131 | # print(elements[:partition], data_p, reduced_data_p)
132 | assert len(data_p) == 0
133 | _a1 = 0
134 | _a2 = data_n.predict.values
135 | ```
136 | 2. The calculation of `score_weight` in `squeeze/squeeze.py:256` may produce negative values, which will cause incorrect localization results. Unlike issue 1, the calculation here is faithful to the paper. See https://github.com/NetManAIOps/Squeeze/issues/6
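
For reference, the automatic weight computed there is roughly `score_weight = -log(C * A / N) / log(V) * V`, where `C` is the number of clusters, `A` the total number of abnormal leaf nodes, `N` the total number of leaf nodes, and `V` the total number of distinct attribute values; whenever `C * A > N`, the logarithm is positive and the weight becomes negative, which then inverts the ranking `rank = score * score_weight - n_ele * layer` used in `_locate_in_cluster`.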
137 |
138 | See also our [extended version](https://github.com/netmanaiops/psqueeze).
139 |
140 | ## Citation
141 |
142 | ```
143 | @inproceedings{squeeze,
144 | title={Generic and Robust Localization of Multi-Dimensional Root Causes},
145 | author={Li, Zeyan and Luo, Chengyang and Zhao, Yiwei and Sun, Yongqian and Sui, Kaixin and Wang, Xiping and Liu, Dapeng and Jin, Xing and Wang, Qi and Pei, Dan},
146 | booktitle={2019 IEEE 30th International Symposium on Software Reliability Engineering (ISSRE)},
147 | year={2019},
148 | organization={IEEE}
149 | }
150 | ```
151 |
--------------------------------------------------------------------------------
/columns.json:
--------------------------------------------------------------------------------
1 | {
2 | "columns": [
3 | "a", "b", "c", "d"
4 | ]
5 | }
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | -i https://pypi.org/simple
2 | bidict==0.18.0
3 | click==7.0
4 | cycler==0.10.0
5 | joblib==0.13.2
6 | kiwisolver==1.1.0
7 | kneed==0.4.1
8 | loguru==0.3.2
9 | matplotlib==3.1.1
10 | numpy==1.17.0
11 | pandas==0.25.0
12 | pyparsing==2.4.2
13 | python-dateutil==2.8.0
14 | pytz==2019.2
15 | scikit-learn==0.21.3
16 | scipy==1.3.1
17 | seaborn==0.9.0
18 | six==1.12.0
19 |
--------------------------------------------------------------------------------
/run_algorithm.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | from pathlib import Path
4 | import click
5 | from functools import reduce
6 | from typing import Dict, List
7 | import json
8 | import numpy as np
9 | # from run_apriori import run
10 | from joblib import Parallel, delayed
11 | # noinspection PyProtectedMember
12 | from loguru._defaults import LOGURU_FORMAT
13 |
14 | from utility import AC, AttributeCombination
15 | from squeeze import Squeeze, SqueezeOption
16 | import pandas as pd
17 | from loguru import logger
18 |
19 | import os
20 |
21 |
22 | @click.command('Runner')
23 | @click.option("--name", default="", help="name of this setting")
24 | @click.option("--input-path", help="will read data from {input_path}/{name}")
25 | @click.option("--output-path", help="if {output_path} is a dir, save to {output_path}/{name}.json; \
26 | otherwise save to {output_path}")
27 | @click.option("--num-workers", default=1, help="num of processes")
28 | @click.option("--derived", is_flag=True, help="means we should read {timestamp}.a.csv and {timestamp}.b.csv")
29 | def main(name, input_path, output_path, num_workers, **kwargs):
30 | """
31 | :param name:
32 | :param input_path:
33 | :param output_path:
34 | :param num_workers:
35 | :param kwargs:
36 | :return:
37 | """
38 | logger.remove()
39 | logger.add(
40 | sys.stdout, level="INFO",
41 | format="[{time}, {level}] {message}"
42 | )
43 | derived = kwargs.pop('derived')
44 |
45 | input_path = Path(input_path)
46 | assert input_path.exists(), f"{input_path} does not exist"
47 | output_path = Path(output_path)
48 | logger.info(f"read data from {input_path / name}")
49 | if output_path.is_dir():
50 | output_path = output_path / f"{name}.json"
51 | elif not output_path.exists():
52 | logger.info(f"create {output_path}")
53 | output_path.mkdir()
54 | output_path = output_path / f"{name}.json"
55 | logger.info(f"save to {output_path}")
56 | injection_info = pd.read_csv(input_path / name / 'injection_info.csv', engine='c')
57 | timestamps = sorted(injection_info['timestamp'])
58 | # results = list(
59 | # executor(file_path, output_path.parent, **kwargs)
60 | # for file_path in map(lambda x: input_path / name / f'{x}.csv', timestamps)
61 | # )
62 | if not derived:
63 | results = Parallel(n_jobs=num_workers, backend="multiprocessing", verbose=100)(
64 | delayed(executor)(file_path, output_path.parent, **kwargs)
65 | for file_path in map(lambda x: input_path / name / f'{x}.csv', timestamps))
66 | else:
67 | results = Parallel(n_jobs=num_workers, backend="multiprocessing", verbose=100)(
68 | delayed(executor_derived)(file_path_list, output_path.parent, **kwargs)
69 | for file_path_list in map(
70 | lambda x: [input_path / name / f'{x}.a.csv', input_path / name / f'{x}.b.csv'],
71 | timestamps
72 | )
73 | )
74 | with open(str(output_path.resolve()), "w+") as f:
75 | json.dump(results, f, indent=4)
76 | logger.info(results)
77 |
78 |
79 | def executor(file_path: Path, output_path: Path, **kwargs) -> Dict:
80 | debug = kwargs.pop('debug', False)
81 | logger.remove()
82 | logger.add(
83 | sys.stdout, level='DEBUG',
84 | format=f"{file_path.name} - {LOGURU_FORMAT}",
85 | backtrace=True
86 | )
87 | logger.info(f"running squeeze for {file_path}")
88 | df = pd.read_csv(file_path.resolve(), engine='python', dtype='str', delimiter=r"\s*,\s*")
89 | df['real'] = df['real'].astype(float)
90 | df['predict'] = df['predict'].astype(float)
91 | try:
92 | timestamp = int(file_path.name.rstrip('.csv'))
93 | except ValueError:
94 | timestamp = file_path.name.rstrip('.csv')
95 | logger.warning(f"Unresolved timestamp: {timestamp}")
96 | tic = time.time()
97 |
98 | model = Squeeze(
99 | data_list=[df],
100 | op=lambda x: x,
101 | option=SqueezeOption(
102 | debug=debug,
103 | fig_save_path=f"{output_path.resolve()}/{timestamp}" + "{suffix}" + ".pdf",
104 | **kwargs,
105 | )
106 | )
107 | model.run()
108 | logger.info("\n" + model.report)
109 | try:
110 | root_cause = AC.batch_to_string(
111 | frozenset(reduce(lambda x, y: x.union(y), model.root_cause, set()))) # type:
112 | except IndexError:
113 | root_cause = ""
114 |
115 | toc = time.time()
116 | elapsed_time = toc - tic
117 | return {
118 | 'timestamp': timestamp,
119 | 'elapsed_time': elapsed_time,
120 | 'root_cause': root_cause,
121 | }
122 |
123 |
124 | def executor_derived(file_path_list: List[Path], output_path: Path, **kwargs) -> Dict:
125 | debug = kwargs.pop('debug', False)
126 | logger.remove()
127 | ts = file_path_list[0].name.rstrip('.a.csv')
128 | logger.add(
129 | sys.stdout, level='DEBUG',
130 | format=f"{ts} - {LOGURU_FORMAT}",
131 | backtrace=True
132 | )
133 | logger.info(f"running squeeze for {ts}")
134 | dfa = pd.read_csv(file_path_list[0].resolve(), engine='python', dtype='str', delimiter=r"\s*,\s*")
135 | dfa['real'] = dfa['real'].astype(float)
136 | dfa['predict'] = dfa['predict'].astype(float)
137 | dfb = pd.read_csv(file_path_list[1].resolve(), engine='python', dtype='str', delimiter=r"\s*,\s*")
138 | dfb['real'] = dfb['real'].astype(float)
139 | dfb['predict'] = dfb['predict'].astype(float)
140 | zero_index = (dfa.real == 0) & (dfa.predict == 0) & (dfb.real == 0) & (dfb.predict == 0)
141 | dfa = dfa[~zero_index]
142 | dfb = dfb[~zero_index]
143 | try:
144 | timestamp = int(ts)
145 | except ValueError:
146 | timestamp = ts
147 | logger.warning(f"Unresolved timestamp: {timestamp}")
148 | tic = time.time()
149 |
150 | divide = lambda x, y: np.divide(x, y, out=np.zeros_like(x), where=y != 0)
151 | model = Squeeze(
152 | data_list=[dfa, dfb],
153 | op=divide,
154 | option=SqueezeOption(
155 | debug=debug,
156 | fig_save_path=f"{output_path.resolve()}/{timestamp}" + "{suffix}" + ".pdf",
157 | enable_filter=True,
158 | **kwargs,
159 | )
160 | )
161 | model.run()
162 | logger.info("\n" + model.report)
163 | try:
164 | root_cause = AC.batch_to_string(
165 | frozenset(reduce(lambda x, y: x.union(y), model.root_cause, set()))) # type:
166 | except IndexError:
167 | root_cause = ""
168 |
169 | toc = time.time()
170 | elapsed_time = toc - tic
171 | return {
172 | 'timestamp': timestamp,
173 | 'elapsed_time': elapsed_time,
174 | 'root_cause': root_cause,
175 | }
176 |
177 | if __name__ == '__main__':
178 | main()
179 |
--------------------------------------------------------------------------------
/run_evaluation.py:
--------------------------------------------------------------------------------
1 | import click
2 | import pandas as pd
3 | import json
4 | from utility import AttributeCombination as AC
5 | import numpy as np
6 |
7 |
8 | @click.command()
9 | @click.option("--injection-info", '-i', help='injection_info.csv file')
10 | @click.option("--predict", '-p', help='output json file')
11 | @click.option("--config", '-c', help='config json file')
12 | @click.option("--output-path", '-o', help="output path", default="./output.csv")
13 | def main(*args, **kwargs):
14 | evaluate(*args, **kwargs)
15 |
16 |
17 | def evaluate(injection_info, predict, config, output_path, verbose=True, return_detail=False):
18 | injection_info = pd.read_csv(injection_info)
19 | with open(predict, 'r') as f:
20 | predict = json.load(f)
21 | with open(config, 'r') as f:
22 | config = json.load(f)
23 | injection_info.set_index(['timestamp'], inplace=True)
24 | for idx, item in enumerate(predict):
25 | try:
26 | label = predict[idx]['label'] = AC.batch_from_string(
27 | injection_info.loc(axis=0)[int(item['timestamp']), 'set'],
28 | attribute_names=config['columns']
29 | )
30 | try:
31 | ret = AC.batch_from_string(
32 | item['root_cause'].replace('|', ';'),
33 | attribute_names=config['columns']
34 | )
35 | pred = predict[idx]['pred'] = ret
36 | except Exception as e:
37 | print(item, e)
38 | continue
39 | _fn = len(label)
40 | _tp, _fp = 0, 0
41 | for rc_item in pred:
42 | if rc_item in label:
43 | _fn -= 1
44 | _tp += 1
45 | else:
46 | _fp += 1
47 | except KeyError:
48 | continue
49 | predict[idx]['tp'] = _tp
50 | predict[idx]['fp'] = _fp
51 | predict[idx]['fn'] = _fn
52 | predict[idx]['cuboid_layer'] = len(list(label)[0].non_any_values)
53 | predict[idx]['num_elements'] = len(label)
54 | predict[idx]['significance'] = injection_info.loc(axis=0)[int(item['timestamp']), 'significance']
55 | if verbose:
56 | print("========================================")
57 | print(f"timestamp:{item['timestamp']}")
58 | print(f"label:{AC.batch_to_string(label)}")
59 | print(f"pred :{AC.batch_to_string(pred)}")
60 | print(f"tp: {_tp}, fp: {_fp}, fn: {_fn}")
61 | del predict[idx]['root_cause']
62 | df = pd.DataFrame.from_records(predict)
63 | total_fscore = 2 * np.sum(df.tp) / (2 * np.sum(df.tp) + np.sum(df.fp) + np.sum(df.fn))
64 | total_precision = np.sum(df.tp) / (np.sum(df.tp) + np.sum(df.fp))
65 | total_recall = np.sum(df.tp) / (np.sum(df.tp) + np.sum(df.fn))
66 | df_total = pd.DataFrame.from_dict(
67 | {"tp": [np.sum(df.tp)],
68 | "fp": [np.sum(df.fp)],
69 | "fn": [np.sum(df.fn)],
70 | "F1-Score": [total_fscore],
71 | "Precision": [total_precision],
72 | "Recall": [total_recall],
73 | 'Time Cost (s)': [np.mean(df['elapsed_time'])],
74 | 'time_std': [np.std(df['elapsed_time'])],
75 | 'Total Time Cost (s)': [np.sum(df['elapsed_time'])],
76 | 'length': len(predict),
77 | # 'time_list': df['elapsed_time'].values,
78 | }
79 | )
80 | if verbose:
81 | print(df_total)
82 | if output_path is not None:
83 | df_total.to_csv(output_path, index=False)
84 | if verbose:
85 | print(total_fscore, total_precision, total_recall)
86 | if return_detail:
87 | return df
88 | return df_total
89 |
90 |
91 | if __name__ == '__main__':
92 | main()
93 |
94 |
95 |
--------------------------------------------------------------------------------
/run_experiment.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
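# Usage: bash run_experiment.sh <DATASET> <SETTING>
# e.g.:  bash run_experiment.sh B3 B_cuboid_layer_2_n_ele_2
# Assumes the columns config {DATASET}.json (e.g., B3.json) exists in the working directory.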
2 | DATASET=${1}
3 | SETTING=${2}
4 | NUM_WORKER=20
5 | python run_algorithm.py --name ${SETTING} --input-path ${DATASET} --output-path output/${DATASET}/ --num-workers ${NUM_WORKER}
6 | python run_evaluation.py -i ${DATASET}/${SETTING}/injection_info.csv -p output/${DATASET}/${SETTING}.json -c ${DATASET}.json
7 |
--------------------------------------------------------------------------------
/squeeze/__init__.py:
--------------------------------------------------------------------------------
1 | from .squeeze import *
2 | from .squeeze_option import *
3 |
--------------------------------------------------------------------------------
/squeeze/anomaly_amount_fileter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from kneed import KneeLocator
3 | from loguru import logger
4 | from scipy.stats import gaussian_kde
5 |
6 |
7 | class KPIFilter:
8 | def __init__(self, real_array, predict_array):
9 | # self.select_metrics = np.log(np.abs(real_array - predict_array) + 1) / 10
10 | self.select_metrics = np.abs(real_array - predict_array)
11 | # self.select_metrics = np.abs(predict_array - real_array) / np.abs(real_array + predict_array)
12 | kernel = gaussian_kde(self.select_metrics)
13 | _x = sorted(np.linspace(np.min(self.select_metrics), np.max(self.select_metrics), 1000))
14 | _y = np.cumsum(kernel(_x))
15 | knee = KneeLocator(_x, _y, curve='concave', direction='increasing').knee
16 | logger.info(f"kneed: {knee}")
17 | if knee is None:
18 | logger.warning("no knee point found")
19 | knee = np.min(self.select_metrics)
20 | self.filtered_indices = np.where(self.select_metrics > knee)
21 |
22 | self.original_indices = np.arange(len(real_array))[self.filtered_indices]
23 |
24 | def inverse_map(self, indices):
25 | return self.original_indices[indices]
26 |
--------------------------------------------------------------------------------
/squeeze/clustering/__init__.py:
--------------------------------------------------------------------------------
1 | from .cluster import *
2 | from .density_cluster import *
3 |
4 |
5 | def cluster_factory(option: SqueezeOption):
6 | method_map = {
7 | "density": DensityBased1dCluster,
8 | }
9 | return method_map[option.cluster_method](option)
10 |
--------------------------------------------------------------------------------
/squeeze/clustering/cluster.py:
--------------------------------------------------------------------------------
1 | from ..squeeze_option import SqueezeOption
2 | from typing import List
3 | import numpy as np
4 |
5 |
6 | class Cluster:
7 | """
8 | one dim cluster, give a 1d-array, return each clusters indices
9 | """
10 |
11 | def __init__(self, option: SqueezeOption):
12 | self.option = option
13 |
14 | def __call__(self, array) -> List[np.ndarray]:
15 | raise NotImplementedError()
16 |
17 |
--------------------------------------------------------------------------------
/squeeze/clustering/density_cluster.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | import seaborn as sns
3 | import numpy as np
4 | from loguru import logger
5 | from scipy.stats import gaussian_kde
6 | from scipy.signal import argrelextrema
7 | import matplotlib.pyplot as plt
8 | from squeeze.clustering.cluster import Cluster
9 | from squeeze.squeeze_option import SqueezeOption
10 | from kneed import KneeLocator
11 |
12 |
13 | def smooth(arr, window_size):
14 | new_arr = np.convolve(arr, np.ones(window_size), mode="valid") / window_size
15 | new_arr = np.concatenate([arr[:window_size - 1], new_arr])
16 | assert np.shape(new_arr) == np.shape(arr)
17 | return new_arr
18 |
19 |
20 | class DensityBased1dCluster(Cluster):
21 | def __init__(self, option: SqueezeOption):
22 | super().__init__(option)
23 | assert option.density_estimation_method in {'kde', 'histogram'}
24 | self.density_estimation_func = {
25 | "kde": self._kde,
26 | "histogram": self._histogram,
27 | }[option.density_estimation_method]
28 |
29 | def _kde(self, array: np.ndarray):
30 | kernel = gaussian_kde(array, bw_method=self.option.kde_bw_method, weights=self.option.kde_weights)
31 | samples = np.arange(np.min(array), np.max(array), 0.01)
32 | kde_sample = kernel(points=samples)
33 | conv_kernel = self.option.density_smooth_conv_kernel
34 | kde_sample_smoothed = np.convolve(kde_sample, conv_kernel, 'full') / np.sum(conv_kernel)
35 | return kde_sample_smoothed, samples
36 |
37 | def _histogram(self, array: np.ndarray):
38 | def _get_hist(_width):
39 | if _width == 'auto':
40 | _edges = np.histogram_bin_edges(array, 'auto').tolist()
41 | _edges = [_edges[0] - 0.1 * i for i in range(-5, 0, -1)] + _edges + [_edges[-1] + 0.1 * i for i in
42 | range(1, 6)]
43 | else:
44 | _edges = np.arange(array_range[0] - _width * 6, array_range[1] + _width * 5, _width)
45 | h, edges = np.histogram(array, bins=_edges, density=True)
46 | h /= 100.
47 | # conv_kernel = self.option.density_smooth_conv_kernel
48 | # h = np.convolve(h, conv_kernel, 'full') / np.sum(conv_kernel)
49 | return h, np.convolve(edges, [1, 1], 'valid') / 2
50 |
51 | def _get_score(_clusters):
52 | if len(_clusters) <= 0:
53 | return float('-inf')
54 | _mu = np.concatenate([np.repeat(np.mean(array[idx]), np.size(idx)) for idx in _clusters])
55 | _sigma = np.concatenate([np.repeat(np.std(array[idx]), np.size(idx)) for idx in _clusters]) + 1e-8
56 | # _arrays = np.concatenate([array[idx] for idx in _clusters])
57 | # _scores = np.sum(- np.log(_sigma) - np.square((_arrays - _mu) / _sigma))
58 | _scores = np.max(_sigma)
59 | return _scores
60 |
61 | array_range = np.min(array), np.max(array)
62 | width = self.option.histogram_bar_width
63 | # if width == 'auto':
64 | # x_list = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]
65 | # hists = [_get_hist(_width) for _width in x_list]
66 | # # y_list = [len(argrelextrema(
67 | # # _get_hist(_width=_width)[0], comparator=np.greater_equal,
68 | # # axis=0, order=self.option.cluster_smooth_window_size, mode='clip')[0]) for _width in x_list]
69 | # clusters_list = [self._cluster(array, density_array, bins) for density_array, bins in hists]
70 | # y_list = [_get_score(clusters) for clusters in clusters_list]
71 | # split = KneeLocator(x_list, y_list, curve='concave', direction='increasing').knee
72 | # if split is None:
73 | # split = x_list[0]
74 | # # elbow = x_list[np.argmax(y_list)]
75 | # logger.debug(f"{x_list}, {y_list}, {split}")
76 | # width = split
77 |
78 | return _get_hist(width)
79 |
80 | def _cluster(self, array, density_array: np.ndarray, bins, plot=False):
81 | def significant_greater(a, b):
82 | return (a - b) / (a + b) > 0.1
83 |
84 | order = 1
85 | extreme_max_indices = argrelextrema(
86 | density_array, comparator=lambda x, y: x > y,
87 | axis=0, order=order, mode='wrap')[0]
88 | extreme_min_indices = argrelextrema(
89 | density_array, comparator=lambda x, y: x <= y,
90 | axis=0, order=order, mode='wrap')[0]
91 | extreme_max_indices = list(filter(lambda x: density_array[x] > 0, extreme_max_indices))
92 | if plot:
93 | for idx in extreme_max_indices:
94 | plt.axvline(bins[idx], linestyle="-", color="red", label="relmax", alpha=0.5, linewidth=0.8)
95 | for idx in extreme_min_indices:
96 | plt.axvline(bins[idx], linestyle="--", color="blue", label="relmin", alpha=0.5, linewidth=0.8)
97 |
98 | cluster_list = []
99 | boundaries = np.asarray([float('-inf')] + [bins[index] for index in extreme_min_indices] + [float('+inf')])
100 | if self.option.max_normal_deviation == 'auto':
101 | mu = np.mean(np.abs(array))
102 | max_normal = mu
103 | logger.debug(f"max normal {max_normal}")
104 | self.option.max_normal_deviation = max_normal
105 | for index in extreme_max_indices:
106 | left_boundary = boundaries[np.searchsorted(boundaries, bins[index], side='right') - 1]
107 | right_boundary = boundaries[np.searchsorted(boundaries, bins[index], side='left')]
108 | cluster_indices = np.where(
109 | np.logical_and(
110 | array <= right_boundary,
111 | array >= left_boundary,
112 | )
113 | )[0]
114 | cluster = array[cluster_indices]
115 | mu = np.mean(np.abs(cluster))
116 | logger.debug(f"({left_boundary, right_boundary}, {mu})")
117 | if np.abs(mu) < self.option.max_normal_deviation or len(cluster) <= 0:
118 | continue
119 | cluster_list.append(cluster_indices)
120 | return cluster_list
121 |
122 | def __call__(self, array):
123 | array = array.copy()
124 | density_array, bins = self.density_estimation_func(array)
125 | # normal_idxes = self._find_normal_indices(array, density_array, bins)
126 | # density_array, bins = self.density_estimation_func(array[~normal_idxes])
127 | density_array = np.copy(density_array)
128 | if self.option.cluster_smooth_window_size == "auto":
129 | # window_size = max(int(np.log(np.count_nonzero(bins[density_array > 0.])) / np.log(10)), 1)
130 | window_size = max(np.count_nonzero(density_array > 0) // 10, 1)
131 | logger.debug(f"auto window size: {window_size} {np.count_nonzero(density_array > 0)}")
132 | else:
133 | window_size = self.option.cluster_smooth_window_size
134 | smoothed_density_array = smooth(density_array, window_size)
135 | if self.option.debug:
136 | fig, ax1 = plt.subplots(figsize=(3.6, 1.8))
137 | sns.distplot(array, bins='auto', label="density", hist=True, kde=False, norm_hist=True, ax=ax1)
138 | ax1.set_ylim([0, None])
139 | # ax2 = ax1.twinx()
140 | # ax2.plot(bins, smoothed_density_array, label="smoothed", linestyle="-.")
141 | # ax2.set_ylim([0, None])
142 | clusters = self._cluster(array, smoothed_density_array, bins, plot=self.option.debug)
143 | if self.option.debug:
144 | for cluster in clusters:
145 | left_boundary, right_boundary = np.min(array[cluster]), np.max(array[cluster])
146 | # plt.axvline(left_boundary, c='C0', alpha=0.5, linestyle='--')
147 | # plt.axvline(right_boundary, c='C1', alpha=0.5, linestyle=':')
148 | logger.debug(f"cluster: [{left_boundary}, {right_boundary}]")
149 | by_label1 = dict(zip(*reversed(ax1.get_legend_handles_labels())))
150 | # by_label2 = dict(zip(*reversed(ax2.get_legend_handles_labels())))
151 | by_label2 = {}
152 | # logger.debug(f"{by_label1}, {by_label2}")
153 | plt.legend(
154 | list(by_label1.values()) + list(by_label2.values()),
155 | list(by_label1.keys()) + list(by_label2.keys()), bbox_to_anchor=(0.47, 0.5)
156 | )
157 | plt.xlim([-0.9, 1])
158 | # plt.title(self.option.density_estimation_method)
159 | plt.xlabel('deviation score')
160 | plt.ylabel('pdf')
161 | plt.tight_layout()
162 | # plt.show()
163 | plt.savefig(self.option.fig_save_path.format(suffix="_density_cluster"))
164 | plt.close()
165 | return clusters
166 |
167 |
--------------------------------------------------------------------------------
/squeeze/squeeze.py:
--------------------------------------------------------------------------------
1 | from functools import lru_cache
2 | from itertools import combinations
3 | import pandas as pd
4 | from typing import List, FrozenSet, Dict, Union
5 | from loguru import logger
6 | from scipy.stats import entropy, norm
7 | from sklearn.metrics import log_loss
8 | from typing import Tuple
9 | from utility import AttributeCombination as AC, AttributeCombination
10 | from bidict import bidict
11 | import numpy as np
12 | from squeeze.anomaly_amount_fileter import KPIFilter
13 | from squeeze.squeeze_option import SqueezeOption
14 | from squeeze.clustering import cluster_factory
15 | from scipy.spatial.distance import cityblock, euclidean
16 |
17 |
18 | class Squeeze:
19 | def __init__(self, data_list: List[pd.DataFrame], op=lambda x: x, option: SqueezeOption = SqueezeOption()):
20 | """
21 | :param data_list: dataframe without index,
22 | must have 'real' and 'predict' columns, other columns are considered as attributes
23 | all elements in this list must have exactly the same attribute combinations in the same order
24 | """
25 | self.option = option
26 |
27 | self.one_dim_cluster = cluster_factory(self.option)
28 | self.cluster_list = [] # type: List[np.ndarray]
29 |
30 | valid_idx = np.logical_and.reduce(
31 | [_.predict > 0 for _ in data_list],
32 | )
33 |
34 | self.data_list = list(_[valid_idx] for _ in data_list)
35 | self.op = op
36 | self.derived_data = self.get_derived_dataframe(None) # type: pd.DataFrame
37 | # There is an error in injection
38 | self.derived_data.real -= min(np.min(self.derived_data.real), 0)
39 |
40 | self.attribute_names = list(sorted(set(self.derived_data.columns) - {'real', 'predict'}))
41 | logger.debug(f"available attributes: {self.attribute_names}")
42 |
43 | self.derived_data.sort_values(by=self.attribute_names, inplace=True)
44 | self.data_list = list(map(lambda x: x.sort_values(by=self.attribute_names), self.data_list))
45 |
46 | self.attribute_values = list(list(set(self.derived_data.loc[:, name].values)) for name in self.attribute_names)
47 | logger.debug(f"available values: {self.attribute_values}")
48 |
49 | self.ac_array = np.asarray(
50 | [AC(**record) for record in self.derived_data[self.attribute_names].to_dict(orient='records')])
51 |
52 | self._v = self.derived_data['real'].values
53 | self._f = self.derived_data['predict'].values
54 | assert all(self._v >= 0) and all(self._f >= 0), \
55 | f"currently we assume that KPIs are non-negative, {self.derived_data[~(self._f >= 0)]}"
56 |
57 | self.__finished = False
58 | self._root_cause = []
59 |
60 | self.filtered_indices = None
61 |
62 | @property
63 | @lru_cache()
64 | def root_cause(self):
65 | return self._root_cause
66 |
67 | @property
68 | @lru_cache()
69 | def report(self) -> str:
70 | cluster_impacts = [
71 | np.sum(np.abs(self._f[idx] - self._v[idx])) for idx in self.cluster_list
72 | ]
73 | unique_root_cause, rc_indices = np.unique(self.root_cause, return_index=True)
74 | cluster_impacts = [
75 | np.sum(cluster_impacts[idx]) for idx in rc_indices
76 | ]
77 | logger.debug(f"{unique_root_cause}, {cluster_impacts}")
78 | report_df = pd.DataFrame(columns=['root_cause', 'impact'])
79 | report_df['root_cause'] = list(AC.batch_to_string(_) for _ in unique_root_cause)
80 | report_df['impact'] = cluster_impacts
81 | report_df.sort_values(by=['impact'], inplace=True, ascending=False)
82 | return report_df.to_csv(index=False)
83 |
84 | @lru_cache()
85 | def get_cuboid_ac_array(self, cuboid: Tuple[str, ...]):
86 | return np.asarray(list(map(lambda x: x.mask(cuboid), self.ac_array)))
87 |
88 | @lru_cache()
89 | def get_indexed_data(self, cuboid: Tuple[str, ...]):
90 | return self.derived_data.set_index(list(cuboid))
91 |
92 | @property
93 | @lru_cache()
94 | def normal_indices(self):
95 | abnormal = np.sort(np.concatenate(self.cluster_list))
96 | idx = np.argsort(np.abs(self.leaf_deviation_score[abnormal]))
97 | abnormal = abnormal[idx]
98 | normal = np.where(np.abs(self.leaf_deviation_score) < self.leaf_deviation_score[abnormal[0]])[0]
99 | # normal = np.setdiff1d(np.arange(len(self.derived_data)), abnormal, assume_unique=True)
100 | # return np.intersect1d(normal, self.filtered_indices, assume_unique=True)
101 | return normal
102 |
103 | def run(self):
104 | if self.__finished:
105 | logger.warning(f"try to rerun {self}")
106 | return self
107 | if self.option.enable_filter:
108 | kpi_filter = KPIFilter(self._v, self._f)
109 | self.filtered_indices = kpi_filter.filtered_indices
110 | cluster_list = self.one_dim_cluster(self.leaf_deviation_score[self.filtered_indices])
111 | cluster_list = list(
112 | [kpi_filter.inverse_map(_) for _ in cluster_list]
113 | )
114 | cluster_list = list(
115 | [list(
116 | filter(lambda x: np.min(self.leaf_deviation_score[_]) <= self.leaf_deviation_score[x] <= np.max(
117 | self.leaf_deviation_score[_]), np.arange(len(self._f)))
118 | )
119 | for _ in cluster_list]
120 | )
121 | self.cluster_list = cluster_list
122 | else:
123 | self.filtered_indices = np.ones(len(self._v), dtype=bool)
124 | self.cluster_list = self.one_dim_cluster(self.leaf_deviation_score)
125 |
126 | self.locate_root_cause()
127 | self.__finished = True
128 | self._root_cause = self._root_cause
129 | return self
130 |
131 | def _locate_in_cuboid(self, cuboid, indices, **params) -> Tuple[FrozenSet[AC], float]:
132 | """
133 | :param cuboid: try to find root cause in this cuboid
134 | :param indices: anomaly leaf nodes' indices
135 | :return: root causes and their score
136 | """
137 | # mu = params.get("mu")
138 | # sigma = params.get("sigma")
139 | data_cuboid_indexed = self.get_indexed_data(cuboid)
140 | logger.debug(f"current cuboid: {cuboid}")
141 |
142 | abnormal_cuboid_ac_arr = self.get_cuboid_ac_array(cuboid)[indices]
143 | elements, num_elements = np.unique(abnormal_cuboid_ac_arr, return_counts=True)
144 |
145 | num_ele_descents = np.asarray(list(
146 | np.count_nonzero(
147 | _.index_dataframe(data_cuboid_indexed),
148 | ) for _ in elements
149 | ))
150 | # sort reversely by descent score
151 | descent_score = num_elements / np.maximum(num_ele_descents, 1e-4)
152 | idx = np.argsort(descent_score)[::-1]
153 | elements = elements[idx]
154 | num_ele_descents = num_ele_descents[idx]
155 | num_elements = num_elements[idx]
156 |
157 | # descent_score = descent_score[idx]
158 | del descent_score
159 |
160 | logger.debug(f"elements: {';'.join(str(_) for _ in elements)}")
161 |
162 | def _root_cause_score(partition: int) -> float:
163 | dis_f = cityblock
164 | data_p, data_n = self.get_derived_dataframe(
165 | frozenset(elements[:partition]), cuboid=cuboid,
166 | reduction=lambda x: x, return_complement=True,
167 | subset_indices=np.concatenate([indices, self.normal_indices]))
168 | assert len(data_p) + len(data_n) == len(indices) + len(self.normal_indices), \
169 | f'{len(data_n)} {len(data_p)} {len(indices)} {len(self.normal_indices)}'
170 | # dp = self.__deviation_score(data_p['real'].values, data_p['predict'].values)
171 | # dn = self.__deviation_score(data_n['real'].values, data_n['predict'].values) if len(data_n) else []
172 | # log_ll = np.mean(norm.pdf(dp, loc=mu, scale=sigma)) \
173 | # + np.mean(norm.pdf(dn, loc=0, scale=self.option.normal_deviation_std))
174 | _abnormal_descent_score = np.sum(num_elements[:partition]) / np.sum(num_ele_descents[:partition])
175 | _normal_descent_score = 1 - np.sum(num_elements[partition:] / np.sum(num_ele_descents[partition:]))
176 | _ds = _normal_descent_score * _abnormal_descent_score
177 | succinct = partition + len(cuboid) * len(cuboid)
178 | _pv, _pf = np.sum(data_p.real.values), np.sum(data_p.predict.values)
179 | _lp = len(data_p)
180 | _v1, _v2 = data_p.real.values, data_n.real.values
181 | _v = np.concatenate([_v1, _v2])
182 | _f1, _f2 = data_p.predict.values, data_n.predict.values
183 | _f = np.concatenate([_f1, _f2])
184 | _a1, _a2 = data_p.predict.values * (_pv / _pf), data_n.predict.values
185 | _a = np.concatenate([_a1, _a2])
186 | divide = lambda x, y: x / y if y > 0 else (0 if x == 0 else float('inf'))
187 | _ps = 1 - (divide(dis_f(_v1, _a1), len(_v1)) + divide(dis_f(_v2, _f2), len(_v2))) \
188 | / (divide(dis_f(_v1, _f1), len(_v1)) + divide(dis_f(_v2, _f2), len(_v2)))
189 | logger.debug(
190 | f"partition:{partition} "
191 | # f"log_ll:{log_ll} "
192 | # f"impact: {impact_score} "
193 | f"succinct: {succinct} "
194 | f"ps: {_ps}"
195 | )
196 | # return _p * self.option.score_weight / (-succinct)
197 | return _ps
198 |
199 | partitions = np.arange(
200 | min(
201 | len(elements),
202 | self.option.max_num_elements_single_cluster,
203 | len(set(self.get_indexed_data(cuboid).index.values)) - 1
204 | )
205 | ) + 1
206 | if len(partitions) <= 0:
207 | return elements, float('-inf')
208 | rc_scores = np.asarray(list(map(_root_cause_score, partitions)))
209 | idx = np.argsort(rc_scores)[::-1]
210 | partitions = partitions[idx]
211 | rc_scores = rc_scores[idx]
212 |
213 | score = rc_scores[0]
214 | rc = elements[:partitions[0].item()]
215 | logger.debug(f"cuboid {cuboid} gives root cause {AC.batch_to_string(rc)} with score {score}")
216 | return rc.tolist(), score
217 |
218 | def _locate_in_cluster(self, indices: np.ndarray):
219 | """
220 | :param indices: indices of leaf nodes in this cluster
221 | :return: None
222 | """
223 | mu = np.mean(self.leaf_deviation_score[indices])
224 | sigma = np.maximum(np.std(self.leaf_deviation_score[indices]), 1e-4)
225 | logger.debug(f"locate in cluster: {mu}(+-{sigma})")
226 | max_cuboid_layer = len(self.attribute_names)
227 | ret_lists = []
228 | for cuboid_layer in np.arange(max_cuboid_layer) + 1:
229 | layer_ret_lists = list(map(
230 | lambda x, _i=indices, _mu=mu, _sigma=sigma: self._locate_in_cuboid(x, indices=_i, mu=_mu, sigma=_sigma),
231 | combinations(self.attribute_names, cuboid_layer)
232 | ))
233 | ret_lists.extend([
234 | {
235 | 'rc': x[0], 'score': x[1], 'n_ele': len(x[0]), 'layer': cuboid_layer,
236 | 'rank': x[1] * self.option.score_weight - len(x[0]) * cuboid_layer
237 | } for x in layer_ret_lists
238 | ])
239 | if len(list(filter(lambda x: x['score'] > self.option.ps_upper_bound, ret_lists))):
240 | break
241 | ret_lists = list(sorted(
242 | ret_lists,
243 | key=lambda x: x['rank'],
244 | reverse=True)
245 | )
246 | if ret_lists:
247 | ret = ret_lists[0]['rc']
248 | logger.debug(
249 | f"find root cause: {AC.batch_to_string(ret)}, rank: {ret_lists[0]['rank']}, score: {ret_lists[0]['score']}")
250 | self._root_cause.append(frozenset(ret))
251 | else:
252 | logger.info("failed to find root cause")
253 |
254 | def locate_root_cause(self):
255 | if not self.cluster_list:
256 | logger.info("We do not have abnormal points")
257 | return
258 | if self.option.score_weight == 'auto':
259 | self.option.score_weight = - np.log(
260 | len(self.cluster_list) *
261 | sum(len(_) for _ in self.cluster_list) / len(self._f)) / np.log(
262 | sum(len(_) for _ in self.attribute_values)) * sum(len(_) for _ in self.attribute_values)
263 | # self.option.score_weight = len(self.cluster_list) * \
264 | # (np.log(sum(len(_) for _ in self.cluster_list)) + np.sum([np.log(len(_)) for _ in self.attribute_values]) - np.log(len(self.cluster_list)) - np.log(len(self.leaf_deviation_score))) \
265 | # / np.log(np.mean([len(_) for _ in self.attribute_values])) * 10
266 | logger.debug(f"auto score weight: {self.option.score_weight}")
267 | for indices in self.cluster_list:
268 | self._locate_in_cluster(indices)
269 |
270 | @property
271 | @lru_cache()
272 | def leaf_deviation_score(self):
273 | with np.errstate(divide='ignore', invalid='ignore'):
274 | deviation_scores = self.__deviation_score(self._v, self._f)
275 | assert np.shape(deviation_scores) == np.shape(self._v) == np.shape(self._f)
276 | assert np.sum(np.isnan(deviation_scores)) == 0, \
277 | f"there are nan in deviation score {np.where(np.isnan(deviation_scores))}"
278 | assert np.sum(~np.isfinite(deviation_scores)) == 0, \
279 | f"there are infinity in deviation score {np.where(~np.isfinite(deviation_scores))}"
280 | logger.debug(f"anomaly ratio ranges in [{np.min(deviation_scores)}, {np.max(deviation_scores)}]")
281 | return deviation_scores
282 |
283 | def get_derived_dataframe(self, ac_set: Union[FrozenSet[AC], None], cuboid: Tuple[str] = None,
284 | reduction=None, return_complement=False, subset_indices=None):
285 | subset = np.zeros(len(self.data_list[0]), dtype=np.bool)
286 | if subset_indices is not None:
287 | subset[subset_indices] = True
288 | else:
289 | subset[:] = True
290 |
291 | if reduction == "sum":
292 | reduce = lambda x, _axis=0: np.sum(x, axis=_axis, keepdims=True)
293 | else:
294 | reduce = lambda x: x
295 |
296 | if ac_set is None:
297 | idx = np.ones(shape=(len(self.data_list[0]),), dtype=np.bool)
298 | else:
299 | idx = AC.batch_index_dataframe(ac_set, self.get_indexed_data(cuboid))
300 |
301 | def _get_ret(_data_list):
302 | if len(_data_list[0]) == 0:
303 | return pd.DataFrame(data=[], columns=['real', 'predict'])
304 | _values = self.op(*[reduce(_data[["real", "predict"]].values) for _data in _data_list])
305 | if np.size(_values) == 0:
306 | _values = []
307 | if reduction == 'sum':
308 | _ret = pd.DataFrame(data=_values, columns=['real', 'predict'])
309 | else:
310 | _ret = _data_list[0].copy(deep=True)
311 | _ret[['real', 'predict']] = _values
312 | return _ret
313 |
314 | data_list = list(_[idx & subset] for _ in self.data_list)
315 | if not return_complement:
316 | return _get_ret(data_list)
317 | complement_data_list = list(_[(~idx) & subset] for _ in self.data_list)
318 | return _get_ret(data_list), _get_ret(complement_data_list)
319 |
320 | @staticmethod
321 | def __deviation_score(v, f):
322 | n = 1
323 | with np.errstate(divide='ignore'):
324 | ret = n * (f - v) / (n * f + v)
325 | # ret = np.log(np.maximum(v, 1e-10)) - np.log(np.maximum(f, 1e-10))
326 | # ret = (2 * sigmoid(1 - v / f) - 1)
327 | # k = np.log(np.maximum(v, 1e-100)) - np.log(np.maximum(f, 1e-100))
328 | # ret = (1 - k) / (1 + k)
329 | ret[np.isnan(ret)] = 0.
330 | return ret
331 |
--------------------------------------------------------------------------------
/squeeze/squeeze_option.py:
--------------------------------------------------------------------------------
1 | class SqueezeOption:
2 | def __init__(self, **kwargs):
3 | self.debug = False
4 | self.fig_save_path = "/outputs/fig_{suffix}.pdf"
5 |
6 | # Filter
7 | self.enable_filter = True
8 |
9 | # Density Estimation
10 | self.cluster_method = "density"
11 | self.density_estimation_method = 'histogram'
12 |
13 | # KDE
14 | self.density_smooth_conv_kernel = [1.]
15 | self.kde_bw_method = None
16 | self.kde_weights = None
17 |
18 | # Histogram
19 | self.histogram_bar_width = "auto"
20 |
21 | # relative max
22 | self.max_allowed_deviation_bias = 0.10
23 | self.max_allowed_deviation_std = 0.01
24 |
25 | # Cluster
26 | self.cluster_smooth_window_size = "auto"
27 | self.max_normal_deviation = 0.20
28 |
29 | # Group
30 | # self.least_score = 2.0
31 | self.least_descent_score = 0.6
32 | self.normal_deviation_std = 0.1
33 | self.score_weight = "auto"
34 | self.max_num_elements_single_cluster = 12
35 | self.ps_upper_bound = 0.90
36 |
37 | self.__dict__.update(kwargs)
38 |
--------------------------------------------------------------------------------
/utility/__init__.py:
--------------------------------------------------------------------------------
1 | from .attribute_combination import *
2 |
--------------------------------------------------------------------------------
/utility/attribute_combination.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from functools import reduce, lru_cache
3 | import numpy as np
4 | import pandas as pd
5 | from loguru import logger
6 | from typing import List, FrozenSet, Sequence, Union, Iterable
7 |
8 |
9 | class AttributeCombination(dict):
10 | ANY = '__ANY__'
11 |
12 | def __init__(self, **kwargs):
13 | super().__init__(**{key: str(value) for key, value in kwargs.items()})
14 | self.__id = None
15 | self.non_any_keys = tuple()
16 | self.non_any_values = tuple()
17 | self.__is_terminal = False
18 | self.__update()
19 |
20 | def __update(self):
21 | self.__id = tuple((key, self[key]) for key in sorted(self.keys()))
22 | self.non_any_keys = tuple(_ for _ in sorted(self.keys()) if self[_] != self.ANY)
23 | self.non_any_values = tuple(self[_] for _ in sorted(self.keys()) if self[_] != self.ANY)
24 | self.__is_terminal = not any(self.ANY == value for value in self.values())
25 |
26 | def __eq__(self, other: 'AttributeCombination'):
27 | return self.__id == other.__id
28 |
29 | def __lt__(self, other):
30 | return self.__id < other.__id
31 |
32 | def __le__(self, other):
33 | return self.__id <= other.__id
34 |
35 | def __hash__(self):
36 | return hash(self.__id)
37 |
38 | def __setitem__(self, key, value):
39 | super().__setitem__(key, str(value))
40 | self.__update()
41 |
42 | def __str__(self):
43 | return "&".join(f"{key}={value}" for key, value in zip(self.non_any_keys, self.non_any_values))
44 |
45 | @staticmethod
46 | def from_string(string: str, attribute_names) -> 'AttributeCombination':
47 | ret = AttributeCombination.get_root_attribute_combination(attribute_names)
48 | for pair in string.split("&"):
49 | if pair == "":
50 | continue
51 | key, value = pair.split("=")
52 | ret[key] = value
53 | return ret
54 |
55 | @staticmethod
56 | def batch_from_string(string: str, attribute_names) -> 'FrozenSet[AttributeCombination]':
57 | return frozenset({AttributeCombination.from_string(_, attribute_names) for _ in string.split(";")})
58 |
59 | @staticmethod
60 | def batch_to_string(sets: Iterable['AttributeCombination']) -> str:
61 | return ";".join(str(_) for _ in sets)
62 |
63 | def copy_and_update(self, other):
64 | o = copy.copy(self)
65 | o.update(other)
66 | o.__update()
67 | return o
68 |
69 | @staticmethod
70 | def get_attribute_combination(data: pd.DataFrame):
71 | columns = list(set(data.columns) - {'real', 'predict'})
72 | _attributes = AttributeCombination()
73 | for column in columns:
74 | _attributes[column] = AttributeCombination.ANY
75 | return _attributes
76 |
77 | def index_dataframe_without_index(self, data: pd.DataFrame):
78 | # noinspection PyTypeChecker
79 | return reduce(np.logical_and,
80 | [data[key] == value for key, value in self.items() if value != self.ANY],
81 | np.ones(len(data), dtype=bool))
82 |
83 | def index_dataframe(self, data: pd.DataFrame):
84 | if len(self.non_any_values) == 0:
85 | return np.ones(len(data), dtype=np.bool)
86 | try:
87 | arr = np.zeros(shape=len(data), dtype=np.bool)
88 | if len(self.non_any_values) == 1:
89 | idx = data.index.get_loc(self.non_any_values[0])
90 | else:
91 | idx = data.index.get_loc(self.non_any_values)
92 | arr[idx] = True
93 | return arr
94 | except KeyError:
95 | return np.zeros(len(data), dtype=np.bool)
96 |
97 | def is_terminal(self):
98 | return self.__is_terminal
99 |
100 | @staticmethod
101 | def batch_index_dataframe(attribute_combinations, data: pd.DataFrame):
102 | # noinspection PyTypeChecker
103 | index = reduce(np.logical_or,
104 | (_.index_dataframe(data) for _ in attribute_combinations),
105 | np.zeros(len(data), dtype=np.bool))
106 | return index
107 |
108 | @staticmethod
109 | def batch_index_dataframe_without_index(attribute_combinations, data: pd.DataFrame):
110 | # noinspection PyTypeChecker
111 | index = reduce(np.logical_or,
112 | (_.index_dataframe_without_index(data) for _ in attribute_combinations),
113 | np.zeros(len(data), dtype=np.bool))
114 | return index
115 |
116 | @staticmethod
117 | def get_root_attribute_combination(attribute_names):
118 | return AttributeCombination(**{key: AttributeCombination.ANY for key in attribute_names})
119 |
120 | def is_descent(self, other):
121 | return all(self.__attribute_is_descent(sorted(item_a), sorted(item_b))
122 | for item_a, item_b in zip(self.items(), other.items()))
123 |
124 | @staticmethod
125 | def __attribute_is_descent(a, b):
126 | return a[0] == b[0] and (a[1] == b[1] or b[1] == AttributeCombination.ANY)
127 |
128 | def mask(self, keys):
129 | """
130 | :param keys: keep which keys
131 | :return: a new attribute combination, keep keys, the others are set ANY
132 | """
133 | to_fill_keys = set(self.keys()) - set(keys)
134 | return self.copy_and_update({key: self.ANY for key in to_fill_keys})
135 |
136 | @staticmethod
137 | def from_iops_2019_format(string: str, attribute_names=None) -> FrozenSet['AttributeCombination']:
138 | """
139 | :param attribute_names:
140 | :param string:
141 | :return:
142 | """
143 | if attribute_names is None:
144 | attribute_names = ['i', 'e', 'c', 'p', 'l']
145 | root = AttributeCombination(**{key: AttributeCombination.ANY for key in attribute_names})
146 | results = {root.copy_and_update({_[0]: _ for _ in case.split('&') if _ != ''}) for case in string.split(';')}
147 | return frozenset(results)
148 |
149 | @staticmethod
150 | def to_iops_2019_format(attribute_combinations: Iterable['AttributeCombination']):
151 | return ";".join("&".join(_.non_any_values) for _ in attribute_combinations)
152 |
153 |
154 | AC = AttributeCombination
155 |
--------------------------------------------------------------------------------