├── pip-requirement.txt ├── anomalytransfer ├── __init__.py ├── utils │ ├── __init__.py │ ├── config.py │ ├── data.py │ ├── testing.py │ └── logging.py ├── transfer │ ├── __init__.py │ ├── data.py │ ├── models.py │ └── spot.py └── clustering │ ├── __init__.py │ ├── baseline_extraction.py │ ├── average.py │ ├── preprocessing.py │ └── models.py ├── env.sh ├── environment.yml ├── sample ├── scripts │ ├── transfer │ │ ├── utils.py │ │ ├── plot_kpi.py │ │ ├── naive_bagel.py │ │ ├── transfer_learning.py │ │ ├── cluster_transfer_train.py │ │ └── cluster_transfer_test.py │ ├── transfer_entirely │ │ ├── utils.py │ │ └── finetune.py │ ├── clustering │ │ ├── step2_baseline_extraction.py │ │ ├── step1_preprocessing.py │ │ ├── step3_average.py │ │ └── step4_clustering.py │ └── test_time │ │ ├── test_adtshl.py │ │ └── test_at.py └── configs │ └── default.conf ├── LICENSE └── setup.py /pip-requirement.txt: -------------------------------------------------------------------------------- 1 | torch 2 | tqdm 3 | numpy 4 | scipy 5 | pandas 6 | matplotlib 7 | scikit-learn -------------------------------------------------------------------------------- /anomalytransfer/__init__.py: -------------------------------------------------------------------------------- 1 | import anomalytransfer.transfer as transfer 2 | import anomalytransfer.clustering as clustering 3 | import anomalytransfer.utils as utils 4 | -------------------------------------------------------------------------------- /env.sh: -------------------------------------------------------------------------------- 1 | # If the `anomalytransfer` package is not installed, run `source env.sh` from the repository root 2 | # to add the current directory to `PYTHONPATH`. 3 | 4 | export PYTHONPATH=`pwd` 5 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: at 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - scikit-learn 7 | - pandas 8 | - pytorch 9 | - matplotlib 10 | - tqdm 11 | -------------------------------------------------------------------------------- /anomalytransfer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from anomalytransfer.utils.config import * 2 | from anomalytransfer.utils.logging import * 3 | from anomalytransfer.utils.data import * 4 | from anomalytransfer.utils.testing import * 5 | -------------------------------------------------------------------------------- /anomalytransfer/transfer/__init__.py: -------------------------------------------------------------------------------- 1 | import anomalytransfer.transfer.data as data 2 | import anomalytransfer.transfer.models as models 3 | import anomalytransfer.transfer.spot as spot 4 | 5 | from anomalytransfer.transfer.spot import SPOT 6 | from anomalytransfer.transfer.models import AnomalyDetector 7 | -------------------------------------------------------------------------------- /anomalytransfer/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | import anomalytransfer.clustering.average as average 2 | import anomalytransfer.clustering.baseline_extraction as baseline_extraction 3 | import anomalytransfer.clustering.preprocessing as preprocessing 4 | import anomalytransfer.clustering.models as models 5 | 6 | from anomalytransfer.clustering.models import LatentTransformer 7 | 
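Note: the `__init__.py` files above define the package's public API (`at.transfer.AnomalyDetector`, `at.clustering.LatentTransformer`, and the `at.utils` helpers). The following is a minimal usage sketch assembled from the sample scripts later in this dump (see `sample/scripts/transfer/naive_bagel.py`); the input file name and the one-week training split are assumptions for illustration only.

import anomalytransfer as at

# Hypothetical input: a CSV with `timestamp`, `value` and an optional `label` column.
kpi = at.utils.load_kpi('some_kpi.csv')
kpi.complete_timestamp()                                   # fill timestamp gaps and mark missing points

# Use roughly one week of points for training (assumed split, following naive_bagel.py).
num_points = int(7 * 24 * 60 / (kpi.interval / 60))
train_kpi, test_kpi = kpi.split_by_idx(num_points)
train_kpi, mean, std = train_kpi.standardize()
test_kpi, _, _ = test_kpi.standardize(mean=mean, std=std)

model = at.transfer.AnomalyDetector()
model.fit(kpi=train_kpi.no_labels(), epochs=200)           # unsupervised training on the train split
scores = model.predict(test_kpi)                           # anomaly scores for the test split
results = at.utils.get_test_results(labels=test_kpi.labels, scores=scores, missing=test_kpi.missing)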
-------------------------------------------------------------------------------- /sample/scripts/transfer/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | class run_time: 4 | def __init__(self, func = None): 5 | self.func = func 6 | 7 | def __call__(self, *args, **kwargs): 8 | start = time.time() 9 | res = self.func(*args, **kwargs) 10 | end = time.time() 11 | print(f"time: {end - start}") 12 | return res 13 | 14 | def __enter__(self): 15 | self.start = time.time() 16 | return self 17 | 18 | def __exit__(self, exc_type, exc_val, exc_tb): 19 | self.end = time.time() 20 | 21 | def get_time(self): 22 | return self.end - self.start 23 | -------------------------------------------------------------------------------- /sample/scripts/transfer_entirely/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | class run_time: 4 | def __init__(self, func = None): 5 | self.func = func 6 | 7 | def __call__(self, *args, **kwargs): 8 | start = time.time() 9 | res = self.func(*args, **kwargs) 10 | end = time.time() 11 | print(f"time: {end - start}") 12 | return res 13 | 14 | def __enter__(self): 15 | self.start = time.time() 16 | return self 17 | 18 | def __exit__(self, exc_type, exc_val, exc_tb): 19 | self.end = time.time() 20 | 21 | def get_time(self): 22 | return self.end - self.start 23 | -------------------------------------------------------------------------------- /anomalytransfer/utils/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | PROJECT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import configparser 4 | 5 | DEFAULT_CONFIGS = [ 6 | os.path.join(PROJECT_PATH, "sample", "configs", "default.conf"), 7 | ] 8 | 9 | LOCAL_CONFIGS = [ 10 | os.path.join(PROJECT_PATH, "sample", "configs", "local.conf"), 11 | ] 12 | 13 | 14 | def config() -> configparser.ConfigParser: 15 | config_parser = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation()) 16 | config_list = [] 17 | config_list.extend(DEFAULT_CONFIGS) 18 | for local_config in LOCAL_CONFIGS: 19 | if os.path.exists(local_config): 20 | config_list.append(local_config) 21 | config_parser.read(config_list) 22 | return config_parser 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Zhong Zhenyu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /sample/scripts/clustering/step2_baseline_extraction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import anomalytransfer as at 4 | 5 | 6 | def main(): 7 | at.utils.mkdirs(OUTPUT) 8 | file_list = at.utils.file_list(INPUT) 9 | progbar = at.utils.ProgBar(len(file_list), interval=0.5, unit_name='file') 10 | print('Extracting baselines...') 11 | 12 | for file in file_list: 13 | filename = at.utils.filename(file) 14 | df = pd.read_csv(file) 15 | values = at.clustering.baseline_extraction.smoothing_extreme_values(df.value) 16 | standardized = at.clustering.baseline_extraction.extract_baseline(values, window_size=WINDOW_SIZE) 17 | df = pd.DataFrame({'timestamp': df.timestamp.iloc[WINDOW_SIZE - 1:], 'value': standardized[0]}) 18 | df.to_csv(os.path.join(OUTPUT, filename + '.csv'), index=False) 19 | progbar.add(1) 20 | 21 | 22 | if __name__ == '__main__': 23 | config = at.utils.config() 24 | 25 | INPUT = config.get('CLUSTERING_BASELINE_EXTRACTION', 'input') 26 | OUTPUT = config.get('CLUSTERING_BASELINE_EXTRACTION', 'output') 27 | WINDOW_SIZE = config.getint('CLUSTERING_BASELINE_EXTRACTION', 'window_size') 28 | 29 | main() 30 | -------------------------------------------------------------------------------- /anomalytransfer/clustering/baseline_extraction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import anomalytransfer as at 3 | 4 | from typing import Sequence, Tuple 5 | 6 | 7 | def smoothing_extreme_values(values: Sequence) -> np.ndarray: 8 | values = np.asarray(values, np.float32) 9 | if len(values.shape) != 1: 10 | raise ValueError('`values` must be a 1-D array') 11 | 12 | abnormal_portion = 0.05 13 | values_deviation = np.abs(values) 14 | 15 | abnormal_max = np.max(values_deviation) 16 | abnormal_index = np.argwhere(values_deviation >= abnormal_max * (1 - abnormal_portion)) 17 | abnormal = abnormal_index.reshape(len(abnormal_index)) 18 | normal_index = np.argwhere(values_deviation < abnormal_max * (1 - abnormal_portion)) 19 | normal = normal_index.reshape(len(normal_index)) 20 | normal_values = values[normal] 21 | abnormal_values = np.interp(abnormal, normal, normal_values) 22 | values[abnormal] = abnormal_values 23 | 24 | return values 25 | 26 | 27 | def extract_baseline(values: Sequence, window_size: int) -> Tuple[np.ndarray, float, float]: 28 | baseline = np.convolve(values, np.ones((window_size,)) / window_size, mode='valid') 29 | return at.clustering.preprocessing.standardize(baseline) 30 | -------------------------------------------------------------------------------- /sample/scripts/clustering/step1_preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import anomalytransfer as at 4 | 5 | 6 | def main(): 7 | at.utils.mkdirs(OUTPUT) 8 | file_list = at.utils.file_list(INPUT) 9 | progbar = at.utils.ProgBar(len(file_list), interval=0.5, unit_name='file') 10 | print('Preprocessing...') 11 | 12 | for file in file_list: 13 | filename = at.utils.filename(file) 14 | df = pd.read_csv(file) 15 | timestamps, 
_, ret_arrays = at.clustering.preprocessing.linear_interpolation(df.timestamp, [df.value]) 16 | # ! Don't downsample before training Bagel 17 | # timestamps, values = at.clustering.preprocessing.down_sampling([timestamps, ret_arrays[0]], 18 | # step=DOWN_SAMPLING_STEP) 19 | values, _, _ = at.clustering.preprocessing.standardize(ret_arrays[0]) 20 | df = pd.DataFrame({'timestamp': timestamps, 'value': values}) 21 | df.to_csv(os.path.join(OUTPUT, filename + '.csv'), index=False) 22 | progbar.add(1) 23 | 24 | 25 | if __name__ == '__main__': 26 | config = at.utils.config() 27 | 28 | INPUT = config.get('CLUSTERING_PREPROCESSING', 'input') 29 | OUTPUT = config.get('CLUSTERING_PREPROCESSING', 'output') 30 | DOWN_SAMPLING_STEP = config.getint('CLUSTERING_PREPROCESSING', 'down_sampling_step') 31 | 32 | main() 33 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md', 'r') as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name='anomalytransfer', 8 | version='0.3.0', 9 | author='AlumiK', 10 | author_email='nczzy1997@gmail.com', 11 | license='MIT', 12 | description='Implementation of AnomalyTransfer in PyTorch', 13 | long_description=long_description, 14 | long_description_content_type='text/markdown', 15 | url='https://github.com/AlumiK/anomalytransfer', 16 | packages=setuptools.find_packages(include=['anomalytransfer', 'anomalytransfer.*']), 17 | platforms='any', 18 | install_requires=[ 19 | 'pandas', 20 | 'scikit-learn', 21 | 'torch', 22 | 'tqdm' 23 | ], 24 | extras_require={ 25 | 'dev': [ 26 | 'matplotlib', 27 | ], 28 | }, 29 | dependency_links=[ 30 | 'https://download.pytorch.org/whl/torch_stable.html', 31 | ], 32 | classifiers=[ 33 | 'Development Status :: 2 - Pre-Alpha', 34 | 'Intended Audience :: Developers', 35 | 'Programming Language :: Python :: 3', 36 | 'Programming Language :: Python :: 3.8', 37 | 'License :: OSI Approved :: MIT License', 38 | 'Operating System :: OS Independent', 39 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 40 | 'Topic :: Software Development :: Libraries :: Python Modules', 41 | ], 42 | python_requires='==3.8.*', 43 | ) 44 | -------------------------------------------------------------------------------- /anomalytransfer/utils/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import anomalytransfer as at 4 | 5 | from typing import Sequence, Tuple 6 | 7 | 8 | def filename(file: str) -> str: 9 | return os.path.splitext(os.path.basename(file))[0] 10 | 11 | 12 | def mkdirs(*dir_list): 13 | for directory in dir_list: 14 | os.makedirs(directory, exist_ok=True) 15 | 16 | 17 | def file_list(path: str) -> Sequence: 18 | if os.path.isdir(path): 19 | return [os.path.join(path, file) for file in os.listdir(path) if file.endswith(".csv")] 20 | else: 21 | return [path] 22 | 23 | 24 | def load_kpi(file: str, **kwargs) -> at.transfer.data.KPI: 25 | df = pd.read_csv(file, **kwargs) 26 | df.dropna(axis=0, inplace=True) 27 | return at.transfer.data.KPI(timestamps=df.timestamp, 28 | values=df.value, 29 | labels=df.get('label', None), 30 | name=filename(file)) 31 | 32 | 33 | class KPIStats: 34 | 35 | def __init__(self, kpi: at.transfer.data.KPI): 36 | self.num_points = len(kpi.values) 37 | self.num_missing = len(kpi.missing[kpi.missing == 1]) 38 | self.num_anomaly = len(kpi.labels[kpi.labels == 1]) 39 | 
self.missing_rate = self.num_missing / self.num_points 40 | self.anomaly_rate = self.num_anomaly / self.num_points 41 | 42 | 43 | def get_kpi_stats(*kpis: at.transfer.data.KPI) -> Tuple[KPIStats, ...]: 44 | ret = [] 45 | for kpi in kpis: 46 | ret.append(KPIStats(kpi)) 47 | return tuple(ret) 48 | -------------------------------------------------------------------------------- /sample/scripts/clustering/step3_average.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import anomalytransfer as at 4 | 5 | 6 | def main(): 7 | at.utils.mkdirs(OUTPUT_DAILY, OUTPUT_WEEKLY) 8 | file_list = at.utils.file_list(INPUT) 9 | progbar = at.utils.ProgBar(len(file_list), interval=0.5, unit_name='file') 10 | print('Extracting sub-curves...') 11 | 12 | for file in file_list: 13 | filename = at.utils.filename(file) 14 | df = pd.read_csv(file) 15 | daily_average, ts_average = at.clustering.average.get_daily_average( 16 | *at.clustering.average.group_data_by_weekday(timestamps=df.timestamp, values=df.value)) 17 | for i in range(len(daily_average)): 18 | df = pd.DataFrame({ 19 | 'timestamp': ts_average[i], 20 | 'value': daily_average[i] 21 | }) 22 | 23 | df.to_csv(os.path.join(OUTPUT_DAILY, filename + f'_wd{i}.csv'), index=False) 24 | weekly_average, ts_average = at.clustering.average.get_weekly_average(daily_average, ts_average) 25 | df = pd.DataFrame({ 26 | 'timestamp': ts_average, 27 | 'value': weekly_average 28 | }) 29 | df.to_csv(os.path.join(OUTPUT_WEEKLY, filename + '.csv'), index=False) 30 | progbar.add(1) 31 | 32 | 33 | if __name__ == '__main__': 34 | config = at.utils.config() 35 | 36 | INPUT = config.get('CLUSTERING_AVERAGE', 'input') 37 | OUTPUT_DAILY = config.get('CLUSTERING_AVERAGE', 'output_daily') 38 | OUTPUT_WEEKLY = config.get('CLUSTERING_AVERAGE', 'output_weekly') 39 | 40 | main() 41 | -------------------------------------------------------------------------------- /sample/configs/default.conf: -------------------------------------------------------------------------------- 1 | [COMMON] 2 | num_threads=6 3 | project_path=/home/zhangshenglin/project/anomalytransfer/ 4 | 5 | [CLUSTERING_PREPROCESSING] 6 | # input=${COMMON:project_path}/input 7 | input=/home/zhangshenglin/data/kpi-nab 8 | output=${COMMON:project_path}/out/clustering/preprocessing 9 | down_sampling_step=10 10 | 11 | [CLUSTERING_BASELINE_EXTRACTION] 12 | input=${COMMON:project_path}/out/clustering/preprocessing 13 | output=${COMMON:project_path}/out/clustering/baseline_extraction 14 | window_size=5 15 | 16 | [CLUSTERING_AVERAGE] 17 | input=${COMMON:project_path}/out/clustering/baseline_extraction 18 | output_daily=${COMMON:project_path}/out/clustering/average/daily 19 | output_weekly=${COMMON:project_path}/out/clustering/average/weekly 20 | 21 | [CLUSTERING] 22 | input=${COMMON:project_path}/out/clustering/average/daily 23 | output=${COMMON:project_path}/out/clustering/clustering 24 | epochs=200 25 | n_clusters=10 26 | 27 | [BAGEL] 28 | epochs=200 29 | # input=${COMMON:project_path}/input 30 | input=/home/zhangshenglin/data/kpi-nab 31 | output=${COMMON:project_path}/out/bagel 32 | 33 | [PLOT_KPI] 34 | # input=${COMMON:project_path}/input 35 | input=/home/zhangshenglin/data/kpi-nab 36 | output=${COMMON:project_path}/out/plot_kpi 37 | fig_width=32 38 | fig_height=6 39 | fig_dpi=144 40 | 41 | [TRANSFER_LEARNING] 42 | base_epochs=200 43 | data_epochs=200 44 | input=${COMMON:project_path}/out/clustering/clustering/top_k_daily_cluster 45 | 
test_output=${COMMON:project_path}/out/transfer_learning/test_results 46 | output=${COMMON:project_path}/out/transfer_learning/results 47 | model_path=${COMMON:project_path}/out/transfer_learning/models 48 | ratio=0.7 49 | -------------------------------------------------------------------------------- /sample/scripts/test_time/test_adtshl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import anomalytransfer as at 4 | os.environ["CUDA_VISIBLE_DEVICES"] = '' 5 | import pandas as pd 6 | 7 | def main(): 8 | at.utils.mkdirs(OUTPUT) 9 | file_list = at.utils.file_list(INPUT)[0:1] # only one 10 | proglog = at.utils.ProgLog(len(file_list)) 11 | 12 | result = {} 13 | for file in file_list: 14 | for exp in range(10): 15 | kpi = at.utils.load_kpi(file) 16 | proglog.log(kpi=kpi.name) 17 | kpi.complete_timestamp() 18 | total_minutes = 24 * 60 19 | interval = kpi.interval / 60 20 | num_of_point = int(total_minutes / interval) 21 | train_kpi, test_kpi = kpi.split_by_idx(num_of_point) 22 | 23 | train_kpi, mean, std = train_kpi.standardize() 24 | test_kpi, _, _ = test_kpi.standardize(mean=mean, std=std) 25 | 26 | model = at.transfer.AnomalyDetector() 27 | history = model.fit(kpi=train_kpi.no_labels(), epochs=EPOCHS) 28 | result[f"ts_{exp}"] = history['ts'] 29 | result[f"loss_{exp}"] = history['loss'] 30 | dt = pd.DataFrame(result) 31 | # dt.to_csv("adtshl.csv", index=False) 32 | dt.to_csv("bagel.csv", index=False) 33 | 34 | 35 | if __name__ == '__main__': 36 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s [%(levelname)s]] %(message)s') 37 | 38 | config = at.utils.config() 39 | NUM_THREADS = config.getint('COMMON', 'num_threads') 40 | EPOCHS = config.getint('BAGEL', 'epochs') 41 | INPUT = config.get('BAGEL', 'input') 42 | OUTPUT = config.get('BAGEL', 'output') 43 | 44 | # at.utils.set_num_threads(NUM_THREADS) 45 | main() 46 | -------------------------------------------------------------------------------- /anomalytransfer/clustering/average.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import numpy as np 3 | 4 | from typing import Sequence, Tuple 5 | 6 | 7 | def _get_weekday(timestamps: Sequence) -> Sequence: 8 | return [datetime.datetime.fromtimestamp(t).weekday() for t in timestamps] 9 | 10 | 11 | def group_data_by_weekday(timestamps: Sequence, values: Sequence) -> Tuple[Sequence, Sequence]: 12 | timestamps = np.asarray(timestamps, dtype=np.int64) 13 | values = np.asarray(values, dtype=np.float32) 14 | weekday = _get_weekday(timestamps) 15 | grouped_data = [[], [], [], [], [], [], []] 16 | grouped_ts = [[], [], [], [], [], [], []] 17 | current_weekday = weekday[0] 18 | current_index = 0 19 | for i in range(len(weekday)): 20 | if weekday[i] != current_weekday: 21 | if current_index != 0: 22 | # ! Add more 119 points (Bagel ignore the first 119 points!) 
23 | grouped_data[current_weekday].append(values[(current_index-119):i]) 24 | grouped_ts[current_weekday].append(timestamps[(current_index-119):i]) 25 | current_weekday = weekday[i] 26 | current_index = i 27 | return grouped_data, grouped_ts 28 | 29 | 30 | def get_daily_average(grouped_data: Sequence, grouped_ts: Sequence) -> Tuple[Sequence[np.ndarray], Sequence[np.ndarray]]: 31 | daily_average = [] 32 | ts_average = [] 33 | for weekday, ts in zip(grouped_data, grouped_ts): 34 | daily_average.append(np.mean(weekday, axis=0)) 35 | ts_average.append(ts[0]) 36 | return daily_average, ts_average 37 | 38 | 39 | def get_weekly_average(daily_average: Sequence, ts_average: Sequence) -> Tuple[np.ndarray, np.ndarray]: 40 | return np.concatenate(daily_average), np.concatenate(ts_average) 41 | -------------------------------------------------------------------------------- /sample/scripts/transfer/plot_kpi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | import numpy as np 4 | import anomalytransfer as at 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | def _expand(a: np.ndarray) -> np.ndarray: 9 | ret = np.copy(a) 10 | for i in range(length := len(a)): 11 | if a[i] == 1: 12 | if i - 1 >= 0: 13 | ret[i - 1] = 1 14 | if i + 1 < length: 15 | ret[i + 1] = 1 16 | return ret 17 | 18 | 19 | def _plot_kpi(kpi: at.transfer.data.KPI): 20 | x = [datetime.datetime.fromtimestamp(timestamp) for timestamp in kpi.timestamps] 21 | y_anomaly, y_missing = np.copy(kpi.values), np.copy(kpi.values) 22 | y_anomaly[_expand(kpi.labels) == 0] = np.inf 23 | y_missing[_expand(kpi.missing) == 0] = np.inf 24 | plt.plot(x, kpi.values) 25 | plt.plot(x, y_anomaly, color='red') 26 | plt.plot(x, y_missing, color='orange') 27 | plt.title(kpi.name) 28 | plt.ylim(-7.5, 7.5) 29 | 30 | 31 | def main(): 32 | at.utils.mkdirs(OUTPUT) 33 | file_list = at.utils.file_list(INPUT) 34 | 35 | plt.figure(figsize=(FIG_W, FIG_H), dpi=FIG_DPI) 36 | progbar = at.utils.ProgBar(len(file_list), interval=0.5, unit_name='file') 37 | print('Plotting...') 38 | 39 | for file in file_list: 40 | kpi = at.utils.load_kpi(file) 41 | kpi, _, _ = kpi.standardize() 42 | kpi.complete_timestamp() 43 | _plot_kpi(kpi) 44 | plt.savefig(os.path.join(OUTPUT, f'{kpi.name}.png')) 45 | plt.clf() 46 | progbar.add(1) 47 | 48 | 49 | if __name__ == '__main__': 50 | config = at.utils.config() 51 | 52 | INPUT = config.get('PLOT_KPI', 'input') 53 | OUTPUT = config.get('PLOT_KPI', 'output') 54 | FIG_W = config.getfloat('PLOT_KPI', 'fig_width') 55 | FIG_H = config.getfloat('PLOT_KPI', 'fig_height') 56 | FIG_DPI = config.getint('PLOT_KPI', 'fig_dpi') 57 | 58 | main() 59 | -------------------------------------------------------------------------------- /sample/scripts/transfer/naive_bagel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import anomalytransfer as at 4 | os.environ["CUDA_VISIBLE_DEVICES"] = '1' 5 | 6 | def main(): 7 | at.utils.mkdirs(OUTPUT) 8 | file_list = at.utils.file_list(INPUT) 9 | proglog = at.utils.ProgLog(len(file_list)) 10 | 11 | for file in file_list: 12 | kpi = at.utils.load_kpi(file) 13 | proglog.log(kpi=kpi.name) 14 | kpi.complete_timestamp() 15 | total_minutes = 7 * 24 * 60 16 | interval = kpi.interval / 60 17 | num_of_point = int(total_minutes / interval) 18 | train_kpi, test_kpi = kpi.split_by_idx(num_of_point) 19 | 20 | train_kpi, mean, std = train_kpi.standardize() 21 | test_kpi, _, _ = 
test_kpi.standardize(mean=mean, std=std) 22 | 23 | model = at.transfer.AnomalyDetector() 24 | model.fit(kpi=train_kpi.no_labels(), epochs=EPOCHS) 25 | anomaly_scores = model.predict(test_kpi) 26 | 27 | results = at.utils.get_test_results(labels=test_kpi.labels, 28 | scores=anomaly_scores, 29 | missing=test_kpi.missing) 30 | stats = at.utils.get_kpi_stats(kpi, test_kpi) 31 | at.utils.log_test_results(kpi.name, results=results) 32 | 33 | with open(f'{os.path.join(OUTPUT, kpi.name)}.txt', 'w') as output: 34 | output.write(f'[result]\n' 35 | f'threshold={results.get("threshold")}\n' 36 | f'precision={results.get("precision"):.3f}\n' 37 | f'recall={results.get("recall"):.3f}\n' 38 | f'f1_score={results.get("f1score"):.3f}\n\n' 39 | 40 | '[overall]\n' 41 | f'num_points={stats[0].num_points}\n' 42 | f'num_missing_points={stats[0].num_missing}\n' 43 | f'missing_rate={stats[0].missing_rate:.6f}\n' 44 | f'num_anomaly_points={stats[0].num_anomaly}\n' 45 | f'anomaly_rate={stats[0].anomaly_rate:.6f}\n\n' 46 | 47 | '[test]\n' 48 | f'num_points={stats[1].num_points}\n' 49 | f'num_missing_points={stats[1].num_missing}\n' 50 | f'missing_rate={stats[1].missing_rate:.6f}\n' 51 | f'num_anomaly_points={stats[1].num_anomaly}\n' 52 | f'anomaly_rate={stats[1].anomaly_rate:.6f}\n') 53 | 54 | 55 | if __name__ == '__main__': 56 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s [%(levelname)s]] %(message)s') 57 | 58 | config = at.utils.config() 59 | NUM_THREADS = config.getint('COMMON', 'num_threads') 60 | EPOCHS = config.getint('BAGEL', 'epochs') 61 | INPUT = config.get('BAGEL', 'input') 62 | OUTPUT = config.get('BAGEL', 'output') 63 | 64 | # at.utils.set_num_threads(NUM_THREADS) 65 | main() 66 | -------------------------------------------------------------------------------- /anomalytransfer/clustering/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from typing import Sequence, Tuple 4 | 5 | 6 | def linear_interpolation(timestamp: Sequence, arrays: Sequence[Sequence]) \ 7 | -> Tuple[np.ndarray, np.ndarray, Sequence[np.ndarray]]: 8 | timestamp = np.asarray(timestamp, np.int64) 9 | if len(timestamp.shape) != 1: 10 | raise ValueError('`timestamp` must be a 1-D array') 11 | 12 | arrays = [np.asarray(array) for array in arrays] 13 | for i, array in enumerate(arrays): 14 | if array.shape != timestamp.shape: 15 | raise ValueError(f'The shape of ``arrays[{i}]`` does not agree with ' 16 | f'the shape of `timestamp` ({array.shape} vs {timestamp.shape})') 17 | 18 | src_index = np.argsort(timestamp) 19 | timestamp_sorted = timestamp[src_index] 20 | intervals = np.unique(np.diff(timestamp_sorted)) 21 | interval = np.min(intervals) 22 | if interval == 0: 23 | raise ValueError('Duplicated values in `timestamp`') 24 | for itv in intervals: 25 | if itv % interval != 0: 26 | raise ValueError('Not all intervals in `timestamp` are multiples of the minimum interval') 27 | 28 | length = (timestamp_sorted[-1] - timestamp_sorted[0]) // interval + 1 29 | ret_timestamp = np.arange(timestamp_sorted[0], timestamp_sorted[-1] + interval, interval, dtype=np.int64) 30 | ret_missing = np.ones([length], dtype=np.int32) 31 | ret_arrays = [np.zeros([length], dtype=array.dtype) for array in arrays] 32 | dst_index = np.asarray((timestamp_sorted - timestamp_sorted[0]) // interval, dtype=np.int64) 33 | ret_missing[dst_index] = 0 34 | miss_index = np.argwhere(ret_missing == 1) 35 | for ret_array, array in zip(ret_arrays, arrays): 36 | ret_array[dst_index] = 
array[src_index] 37 | 38 | for ret_array in ret_arrays: 39 | if len(miss_index) > 0: 40 | neg = miss_index.reshape(len(miss_index)) 41 | pos_index = np.argwhere(ret_missing == 0) 42 | pos = pos_index.reshape(len(pos_index)) 43 | pos_values = ret_array[pos] 44 | neg_values = np.interp(neg, pos, pos_values) 45 | ret_array[neg] = neg_values 46 | 47 | return ret_timestamp, ret_missing, ret_arrays 48 | 49 | 50 | def standardize(values: Sequence, mean: float = None, std: float = None) -> Tuple[np.ndarray, float, float]: 51 | values = np.asarray(values, dtype=np.float32) 52 | if len(values.shape) != 1: 53 | raise ValueError('`values` must be a 1-D array') 54 | if (mean is None) != (std is None): 55 | raise ValueError('`mean` and `std` must be both None or not None') 56 | 57 | if mean is None: 58 | val = values 59 | mean = val.mean() 60 | std = val.std() 61 | 62 | return (values - mean) / std, mean, std 63 | 64 | 65 | def down_sampling(arrays: Sequence[Sequence], step: int) -> Tuple[Sequence, ...]: 66 | ret_arrays = [] 67 | for array in arrays: 68 | array = array[::step] 69 | ret_arrays.append(array) 70 | return tuple(ret_arrays) 71 | -------------------------------------------------------------------------------- /sample/scripts/test_time/test_at.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pandas as pd 3 | from anomalytransfer.transfer.data import KPI 4 | import os 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 6 | import logging 7 | import anomalytransfer as at 8 | import numpy as np 9 | from glob import glob 10 | from typing import Sequence, Tuple, Dict, Optional, cast 11 | np.seterr(divide='ignore', invalid='ignore') 12 | import pandas as pd 13 | 14 | 15 | def main(): 16 | raw_csvs = glob(os.path.join(INPUT, "*.csv"))[0:1] 17 | assert len(raw_csvs) > 0 18 | 19 | time_map = {} 20 | result = {} 21 | for raw_csv in raw_csvs: 22 | for exp in range(10): 23 | print(f"The KPI: {raw_csv}") 24 | raw_kpi_name = os.path.splitext(os.path.basename(raw_csv))[0] 25 | time_map[raw_kpi_name] = 0 26 | raw_kpi = at.utils.load_kpi(raw_csv) 27 | raw_kpi, _, _ = raw_kpi.standardize() 28 | raw_kpi.complete_timestamp() 29 | 30 | # get daily KPI 31 | train_week_day_map, test_week_day_map, test_kpi = raw_kpi.split_days(days=7) 32 | 33 | # get cluster map 34 | cluster_map = {} # weekday -> cluster_name 35 | for cluster in os.listdir(DAILY_OUTPUT): 36 | data_path = os.path.join(DAILY_OUTPUT, cluster, "data") 37 | raw_csv_daily = glob(os.path.join( 38 | data_path, f"{raw_kpi_name}*.csv")) 39 | raw_csv_daily = [int(os.path.splitext(os.path.basename(csv))[ 40 | 0][-1]) for csv in raw_csv_daily] 41 | for daily in raw_csv_daily: 42 | assert daily not in cluster_map 43 | cluster_map[daily] = cluster 44 | 45 | # fine-tune with train_kpi 46 | for weekday, kpi_seq in train_week_day_map.items(): 47 | dst_cluster_name = cluster_map[weekday] 48 | cluster_model_path = os.path.join(MODEL_PATH, dst_cluster_name) 49 | model = at.transfer.models.AnomalyDetector() 50 | if os.path.exists(os.path.join(cluster_model_path, "finetune")): 51 | model.load(cluster_model_path, "finetune") 52 | else: 53 | model.load(cluster_model_path, "base") 54 | 55 | for kpi in kpi_seq: 56 | history = model.fit(kpi, epochs=DATA_EPOCHS, verbose=1) 57 | result[f"ts_{exp}"] = history['ts'] 58 | result[f"loss_{exp}"] = history['loss'] 59 | if len(kpi_seq) > 0: 60 | model.save(cluster_model_path, "finetune") 61 | dt = pd.DataFrame(result) 62 | dt.to_csv("at.csv", index=False) 63 | 64 | 65 | if __name__ 
== '__main__': 66 | logging.basicConfig(level=logging.INFO, 67 | format='[%(asctime)s [%(levelname)s]] %(message)s') 68 | 69 | config = at.utils.config() 70 | CLUSTER_OUTPUT = config.get("CLUSTERING", "output") 71 | DAILY_OUTPUT = os.path.join(CLUSTER_OUTPUT, "daily_cluster") 72 | 73 | INPUT = config.get('BAGEL', 'input') 74 | OUTPUT = config.get('TRANSFER_LEARNING', 'output') 75 | MODEL_PATH = config.get('TRANSFER_LEARNING', 'model_path') 76 | DATA_EPOCHS = config.getint('TRANSFER_LEARNING', 'data_epochs') 77 | 78 | main() 79 | -------------------------------------------------------------------------------- /anomalytransfer/utils/testing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import anomalytransfer as at 4 | 5 | from typing import Sequence, Dict, Tuple, Optional 6 | from sklearn.metrics import precision_recall_curve, precision_recall_fscore_support 7 | 8 | 9 | def adjust_scores(labels: np.ndarray, 10 | scores: np.ndarray, 11 | delay: Optional[int] = None, 12 | inplace: bool = False) -> np.ndarray: 13 | if np.shape(scores) != np.shape(labels): 14 | raise ValueError('`labels` and `scores` must have same shape') 15 | if delay is None: 16 | delay = len(scores) 17 | splits = np.where(labels[1:] != labels[:-1])[0] + 1 18 | is_anomaly = labels[0] == 1 19 | adjusted_scores = np.copy(scores) if not inplace else scores 20 | pos = 0 21 | for part in splits: 22 | if is_anomaly: 23 | ptr = min(pos + delay + 1, part) 24 | adjusted_scores[pos: ptr] = np.max(adjusted_scores[pos: ptr]) 25 | adjusted_scores[ptr: part] = np.maximum(adjusted_scores[ptr: part], adjusted_scores[pos]) 26 | is_anomaly = not is_anomaly 27 | pos = part 28 | part = len(labels) 29 | if is_anomaly: 30 | ptr = min(pos + delay + 1, part) 31 | adjusted_scores[pos: part] = np.max(adjusted_scores[pos: ptr]) 32 | return adjusted_scores 33 | 34 | 35 | def _ignore_missing(series_list: Sequence, missing: np.ndarray) -> Tuple[np.ndarray, ...]: 36 | ret = [] 37 | for series in series_list: 38 | series = np.copy(series) 39 | ret.append(series[missing != 1]) 40 | return tuple(ret) 41 | 42 | 43 | def _best_f1score(labels: np.ndarray, scores: np.ndarray) -> Tuple[float, float, float, float]: 44 | precision, recall, thresholds = precision_recall_curve(y_true=labels, probas_pred=scores, pos_label=1.0) 45 | f1score = 2 * precision * recall / np.clip(precision + recall, a_min=1e-8, a_max=None) 46 | 47 | best_threshold = thresholds[np.argmax(f1score)] 48 | best_precision = precision[np.argmax(f1score)] 49 | best_recall = recall[np.argmax(f1score)] 50 | 51 | return best_threshold, best_precision, best_recall, np.max(f1score) 52 | 53 | 54 | def _f1score_given_alarms(labels: Sequence, alarms: Sequence) -> Tuple[float, float, float, float]: 55 | pred = np.zeros(len(labels)) 56 | pred[alarms] = 1 57 | precision, recall, f1score, _ = precision_recall_fscore_support(y_true=labels, 58 | y_pred=pred, 59 | average='binary', 60 | pos_label=1) 61 | return np.nan, precision, recall, f1score 62 | 63 | 64 | def set_num_threads(num_threads: int): 65 | torch.set_num_threads(num_threads) 66 | 67 | 68 | def get_test_results(labels: np.ndarray, 69 | scores: np.ndarray, 70 | missing: np.ndarray, 71 | window_size: int = 120, 72 | use_spot: bool = False, 73 | **kwargs) -> Dict: 74 | labels = labels[window_size - 1:] 75 | scores = scores[window_size - 1:] 76 | missing = missing[window_size - 1:] 77 | scores = adjust_scores(labels=labels, scores=scores) 78 | adjusted_labels, 
adjusted_scores = _ignore_missing([labels, scores], missing=missing) 79 | 80 | if use_spot: 81 | n_init = 1000 82 | init_data = adjusted_scores[:n_init] 83 | data = adjusted_scores[n_init:] 84 | labels = adjusted_labels[n_init:] 85 | 86 | result = {} 87 | for risk in kwargs.get('risks', [0.0001]): 88 | risk_result = {} 89 | for level in kwargs.get('levels', [0.98]): 90 | threshold, precision, recall, f1score = -1, -1, -1, -1 91 | try: 92 | spot = at.transfer.SPOT(q=risk) 93 | spot.fit(init_data, data) 94 | spot.initialize(level=level) 95 | r = spot.run() 96 | alarms = r['alarms'] 97 | threshold, precision, recall, f1score = _f1score_given_alarms(labels=labels, alarms=alarms) 98 | except Exception: 99 | pass 100 | # import traceback 101 | # traceback.print_exc() 102 | finally: 103 | level_result = { 104 | 'threshold': threshold, 105 | 'precision': precision, 106 | 'recall': recall, 107 | 'f1score': f1score 108 | } 109 | risk_result[f'{level}'] = level_result 110 | result[f'{risk}'] = risk_result 111 | return result 112 | else: 113 | try: 114 | threshold, precision, recall, f1score = _best_f1score(labels=adjusted_labels, scores=adjusted_scores) 115 | return { 116 | 'threshold': threshold, 117 | 'precision': precision, 118 | 'recall': recall, 119 | 'f1score': f1score, 120 | "scores": adjusted_scores, 121 | "labels": adjusted_labels 122 | } 123 | except: 124 | import traceback 125 | traceback.print_exc() 126 | return { 127 | "scores": adjusted_scores, 128 | "labels": adjusted_labels 129 | } 130 | -------------------------------------------------------------------------------- /anomalytransfer/clustering/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import anomalytransfer as at 5 | 6 | from typing import Optional 7 | from torch.backends import cudnn 8 | from torch.utils.data import TensorDataset, DataLoader 9 | 10 | 11 | class Encoder(torch.nn.Module): 12 | 13 | def __init__(self, input_dim: int, hidden_dim: int, latent_dim: int): 14 | super().__init__() 15 | 16 | self._lstm = torch.nn.LSTM(input_dim, hidden_dim) 17 | self._hidden_to_mean = torch.nn.Linear(hidden_dim, latent_dim) 18 | self._hidden_to_log_std = torch.nn.Linear(hidden_dim, latent_dim) 19 | 20 | torch.nn.init.xavier_uniform_(self._hidden_to_mean.weight) 21 | torch.nn.init.xavier_uniform_(self._hidden_to_log_std.weight) 22 | 23 | def forward(self, x): 24 | _, (h_end, c_end) = self._lstm(x) 25 | hidden = h_end[-1, :, :] 26 | self.mean = self._hidden_to_mean(hidden) 27 | self.log_std = self._hidden_to_log_std(hidden) 28 | if self.training: 29 | std = torch.exp(0.5 * self.log_std) 30 | eps = torch.randn_like(std) 31 | return eps.mul_(std).add_(self.mean) 32 | else: 33 | return self.mean 34 | 35 | 36 | class Decoder(torch.nn.Module): 37 | 38 | def __init__(self, 39 | seq_length: int, 40 | latent_dim: int, 41 | hidden_dim: int, 42 | output_dim: int, 43 | batch_size: int, 44 | device: str): 45 | super(Decoder, self).__init__() 46 | 47 | self._lstm = torch.nn.LSTM(1, hidden_dim) 48 | self._latent_to_hidden = torch.nn.Linear(latent_dim, hidden_dim) 49 | self._hidden_to_output = torch.nn.Linear(hidden_dim, output_dim) 50 | 51 | self._model_input = torch.zeros(seq_length, batch_size, 1).to(device) 52 | self._c_0 = torch.zeros(1, batch_size, hidden_dim).to(device) 53 | 54 | torch.nn.init.xavier_uniform_(self._latent_to_hidden.weight) 55 | torch.nn.init.xavier_uniform_(self._hidden_to_output.weight) 56 | 57 | def forward(self, x): 58 | 
hidden = self._latent_to_hidden(x) 59 | h_0 = torch.stack([hidden]) 60 | hidden, _ = self._lstm(self._model_input, (h_0, self._c_0)) 61 | return self._hidden_to_output(hidden) 62 | 63 | 64 | class LatentTransformer(torch.nn.Module): 65 | 66 | def __init__(self, 67 | seq_length: int, 68 | input_dim: int, 69 | hidden_dim: int = 90, 70 | latent_dim: int = 20, 71 | batch_size: int = 32, 72 | max_grad_norm: int = 5, 73 | device: Optional[str] = None): 74 | super().__init__() 75 | 76 | self._batch_size = batch_size 77 | self._max_grad_norm = max_grad_norm 78 | 79 | cudnn.benchmark = True 80 | if device is None: 81 | self._device = 'cuda' if torch.cuda.is_available() else 'cpu' 82 | else: 83 | self._device = device 84 | 85 | self._encoder = Encoder(input_dim=input_dim, hidden_dim=hidden_dim, latent_dim=latent_dim) 86 | self._decoder = Decoder(seq_length=seq_length, latent_dim=latent_dim, hidden_dim=hidden_dim, 87 | output_dim=input_dim, batch_size=self._batch_size, device=self._device) 88 | self._is_fitted = False 89 | self.to(self._device) 90 | 91 | self._optimizer = torch.optim.Adam(self.parameters(), lr=0.001) 92 | self._lr_scheduler = torch.optim.lr_scheduler.StepLR(self._optimizer, step_size=20, gamma=0.9) 93 | self._loss_fn = torch.nn.MSELoss(reduction='sum') 94 | 95 | def forward(self, x): 96 | return self._decoder(self._encoder(x)) 97 | 98 | def _loss(self, x) -> torch.Tensor: 99 | x_recon = self(x) 100 | mean, log_std = self._encoder.mean, self._encoder.log_std 101 | kl_loss = -0.5 * torch.mean(1 + log_std - mean.pow(2) - log_std.exp()) 102 | reconstruction_loss = self._loss_fn(x_recon, x) 103 | return kl_loss + reconstruction_loss 104 | 105 | def fit(self, data: torch.Tensor, epochs: int, verbose=1): 106 | self.train() 107 | dataset = TensorDataset(data.to(self._device)) 108 | train_loader = DataLoader(dataset=dataset, batch_size=self._batch_size, shuffle=True, drop_last=True) 109 | print('Training Epochs') 110 | if verbose: 111 | progbar = at.utils.ProgBar(epochs, interval=0.5, stateful_metrics=['loss'], unit_name='epoch') 112 | for i in range(epochs): 113 | epoch_losses = [] 114 | for x in train_loader: 115 | x = x[0].permute(1, 0, 2) 116 | self._optimizer.zero_grad() 117 | loss = self._loss(x) 118 | loss.backward() 119 | torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=self._max_grad_norm) 120 | self._optimizer.step() 121 | epoch_losses.append(loss) 122 | self._lr_scheduler.step() 123 | epoch_loss = torch.mean(torch.as_tensor(epoch_losses)).numpy() 124 | if verbose: 125 | progbar.add(1, values=[('loss', epoch_loss)]) 126 | self._is_fitted = True 127 | 128 | def transform(self, data: torch.Tensor) -> np.ndarray: 129 | self.eval() 130 | dataset = TensorDataset(data.to(self._device)) 131 | test_loader = DataLoader(dataset=dataset, batch_size=self._batch_size) 132 | print('Transforming Steps') 133 | progbar = at.utils.ProgBar(len(test_loader), interval=0.5) 134 | if self._is_fitted: 135 | with torch.no_grad(): 136 | latent = [] 137 | for x in test_loader: 138 | x = x[0].permute(1, 0, 2) 139 | x = self._encoder(x).cpu().numpy() 140 | latent.append(x) 141 | progbar.add(1) 142 | return np.concatenate(latent, axis=0) 143 | raise RuntimeError('Model needs to be fitted') 144 | 145 | def save(self, path: str): 146 | if self._is_fitted: 147 | torch.save(self.state_dict(), path) 148 | else: 149 | raise RuntimeError('Model needs to be fitted') 150 | 151 | def load(self, path: str): 152 | self._is_fitted = True 153 | self.load_state_dict(torch.load(path)) 154 | 
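A minimal sketch of how `LatentTransformer` is driven, mirroring `sample/scripts/clustering/step4_clustering.py` later in this dump; the array shape, the dummy data, and the epoch count are assumptions for illustration only.

import numpy as np
import torch
import anomalytransfer as at

# One baseline-extracted daily curve per row: shape (num_series, seq_length).
values = np.random.rand(64, 144).astype(np.float32)       # dummy curves for illustration
x = torch.as_tensor(np.expand_dims(values, -1))           # -> (num_series, seq_length, input_dim=1)

model = at.clustering.LatentTransformer(seq_length=x.shape[1], input_dim=x.shape[2])
model.fit(x, epochs=200)                                   # VAE-style training on the curves
latent = model.transform(x)                                # latent vectors, later fed to KMeans for clustering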
-------------------------------------------------------------------------------- /sample/scripts/transfer/transfer_learning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import anomalytransfer as at 4 | 5 | from typing import Sequence, Tuple, Dict, Optional 6 | 7 | import argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--th", type=float) 10 | args = parser.parse_args() 11 | 12 | def make_base_model(kpi: at.transfer.data.KPI, model_path: str, epochs: int): 13 | kpi.complete_timestamp() 14 | kpi, _, _ = kpi.standardize() 15 | at.utils.mkdirs(os.path.join(model_path, kpi.name)) 16 | model = at.transfer.models.AnomalyDetector() 17 | model.fit(kpi=kpi.no_labels(), epochs=epochs) 18 | model.save(path=model_path, name=kpi.name) 19 | 20 | 21 | def train_test(train_kpi: at.transfer.data.KPI, 22 | test_kpi: at.transfer.data.KPI, 23 | epochs: int, 24 | mask: Optional[Sequence] = None, 25 | **kwargs) -> float: 26 | model = at.transfer.models.AnomalyDetector() 27 | if "model_path" in kwargs: 28 | model.load_partial(path=kwargs.get('model_path'), name=kwargs.get('base_kpi').name, mask=mask) 29 | 30 | if mask is not None: 31 | model.freeze(mask) 32 | model.fit(kpi=train_kpi.no_labels(), epochs=epochs) 33 | model.unfreeze(mask) 34 | else: 35 | model.fit(kpi=train_kpi.no_labels(), epochs=epochs) 36 | anomaly_scores = model.predict(test_kpi) 37 | results = at.utils.get_test_results(labels=test_kpi.labels, 38 | scores=anomaly_scores, 39 | missing=test_kpi.missing, 40 | use_spot=True) 41 | results = results['0.0001']['0.98'] 42 | at.utils.log_test_results(name=test_kpi.name, results=results) 43 | return results['f1score'] 44 | 45 | 46 | def transfer_learning(base_kpi: at.transfer.data.KPI, 47 | data_kpi: at.transfer.data.KPI, 48 | train_ratio: float, 49 | model_path: str, 50 | epochs: int) -> Optional[Dict]: 51 | result = {} 52 | progress = at.utils.ProgLog(3, indent=3) 53 | 54 | progress.log(step='Preparing KPI...') 55 | data_kpi.complete_timestamp() 56 | train_kpi, test_kpi, _ = data_kpi.split((train_ratio, 0.3, 0.7 - train_ratio)) 57 | train_kpi, mean, std = train_kpi.standardize() 58 | test_kpi, _, _ = test_kpi.standardize(mean=mean, std=std) 59 | 60 | # Ignore kpi curves that have less than 5 anomalies 61 | if len(test_kpi.values[test_kpi.labels == 1]) < 5: 62 | print('Less than 5 anomalies. 
Skipping...') 63 | return None 64 | 65 | progress.log(step='Training and testing before transfer...') 66 | result['f1score_pre_transfer'] = train_test(train_kpi=train_kpi, 67 | test_kpi=test_kpi, 68 | epochs=epochs) 69 | 70 | progress.log(step='Training and testing after transfer...') 71 | sbd = at.transfer.models.sbd_(base_kpi, data_kpi) 72 | mask = at.transfer.models.find_optimal_mask(sbd, 73 | # threshold=0.3, 74 | threshold=args.th / 10, 75 | less_mask=((1, 1, 1), (1, 1, 1)), 76 | greater_mask=((1, 1, 0), (0, 1, 1)),) 77 | result['f1score_post_transfer'] = train_test(train_kpi=train_kpi, 78 | test_kpi=test_kpi, 79 | epochs=epochs, 80 | mask=mask, 81 | model_path=model_path, 82 | base_kpi=base_kpi) 83 | 84 | return result 85 | 86 | 87 | def cluster_data(path: str) -> Tuple[str, str]: 88 | base = None 89 | data = None 90 | for item in os.listdir(path): 91 | item_path = os.path.join(path, item) 92 | if os.path.isdir(item_path): 93 | data = item_path 94 | else: 95 | base = item_path 96 | if base is None or data is None: 97 | raise ValueError('Base path or data path not found') 98 | return base, data 99 | 100 | 101 | def main(): 102 | at.utils.mkdirs(OUTPUT, MODEL_PATH) 103 | clusters = os.listdir(INPUT) 104 | 105 | cluster_prog = at.utils.ProgLog(len(clusters)) 106 | for cluster in clusters: 107 | cluster_prog.log(cluster=cluster) 108 | 109 | base, data = cluster_data(os.path.join(INPUT, cluster)) 110 | file_list = at.utils.file_list(data) 111 | step_progress = at.utils.ProgLog(2, indent=1) 112 | 113 | step_progress.log(step='Making base model...', cluster=cluster) 114 | base_kpi = at.utils.load_kpi(base) 115 | make_base_model(kpi=base_kpi, model_path=MODEL_PATH, epochs=BASE_EPOCHS) 116 | 117 | step_progress.log(step='Performing transfer learning...', cluster=cluster) 118 | output_path = os.path.join(OUTPUT, f'{cluster}.csv') 119 | with open(output_path, 'w') as output: 120 | output.write('kpi_name,f1score_pre_transfer,f1score_post_transfer\n') 121 | 122 | file_progress = at.utils.ProgLog(len(file_list), indent=2) 123 | for file in file_list: 124 | data_kpi = at.utils.load_kpi(file) 125 | file_progress.log(kpi=data_kpi.name, cluster=cluster) 126 | result = transfer_learning(base_kpi=base_kpi, 127 | data_kpi=data_kpi, 128 | train_ratio=RATIO, 129 | model_path=MODEL_PATH, 130 | epochs=DATA_EPOCHS) 131 | if result is not None: 132 | with open(output_path, 'a') as output: 133 | output.write(f'{data_kpi.name},' 134 | f'{result.get("f1score_pre_transfer"):.3f},' 135 | f'{result.get("f1score_post_transfer"):.3f}\n') 136 | 137 | 138 | if __name__ == '__main__': 139 | logging.basicConfig(level=logging.INFO, format='[%(asctime)s [%(levelname)s]] %(message)s') 140 | 141 | config = at.utils.config() 142 | NUM_THREADS = config.getint('COMMON', 'num_threads') 143 | BASE_EPOCHS = config.getint('TRANSFER_LEARNING', 'base_epochs') 144 | DATA_EPOCHS = config.getint('TRANSFER_LEARNING', 'data_epochs') 145 | INPUT = config.get('TRANSFER_LEARNING', 'input') 146 | OUTPUT = config.get('TRANSFER_LEARNING', 'output') 147 | MODEL_PATH = config.get('TRANSFER_LEARNING', 'model_path') 148 | RATIO = config.getfloat('TRANSFER_LEARNING', 'ratio') 149 | 150 | at.utils.set_num_threads(NUM_THREADS) 151 | main() 152 | -------------------------------------------------------------------------------- /sample/scripts/transfer/cluster_transfer_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | # os.environ["CUDA_VISIBLE_DEVICES"] = "0" 3 | 4 | import logging 5 | import 
anomalytransfer as at 6 | from glob import glob 7 | from utils import run_time 8 | 9 | from typing import Sequence, Tuple, Dict, Optional 10 | 11 | 12 | def make_base_model(kpi: at.transfer.data.KPI, model_path: str, epochs: int): 13 | kpi.complete_timestamp() 14 | kpi, _, _ = kpi.standardize() 15 | at.utils.mkdirs(os.path.join(model_path, kpi.name)) 16 | model = at.transfer.models.AnomalyDetector() 17 | model.fit(kpi=kpi.no_labels(), epochs=epochs, verbose=1) 18 | model.save(path=model_path, name=kpi.name) 19 | 20 | 21 | def train_test(train_kpi: at.transfer.data.KPI, 22 | epochs: int, 23 | test_kpi: at.transfer.data.KPI = None, 24 | mask: Optional[Sequence] = None, 25 | **kwargs) -> float: 26 | model = at.transfer.models.AnomalyDetector() 27 | if mask is not None: 28 | model.load_partial(path=kwargs.get('model_path'), name=kwargs.get('base_kpi').name, mask=mask) 29 | model.freeze(mask) 30 | model.fit(kpi=train_kpi.no_labels(), epochs=epochs, verbose=1) 31 | model.unfreeze(mask) 32 | model.fit(kpi=train_kpi.no_labels(), epochs=epochs, verbose=1) 33 | if test_kpi is not None and test_kpi.labels is not None: 34 | anomaly_scores = model.predict(test_kpi, verbose=1) 35 | results = at.utils.get_test_results(labels=test_kpi.labels, 36 | scores=anomaly_scores, 37 | missing=test_kpi.missing, 38 | use_spot=False) 39 | at.utils.log_test_results(name=test_kpi.name, results=results) 40 | return results['f1score'] 41 | else: 42 | return None 43 | 44 | def transfer_learning(base_kpi: at.transfer.data.KPI, 45 | data_kpi: at.transfer.data.KPI, 46 | train_ratio: float, 47 | model_path: str, 48 | epochs: int, 49 | TH: int) -> Optional[Dict]: 50 | result = {} 51 | progress = at.utils.ProgLog(3, indent=3) 52 | 53 | progress.log(step='Preparing KPI...') 54 | data_kpi.complete_timestamp() 55 | train_kpi = data_kpi 56 | train_kpi, mean, std = train_kpi.standardize() 57 | 58 | progress.log(step='Training and testing before transfer...') 59 | result['f1score_pre_transfer'] = train_test(train_kpi=train_kpi, 60 | epochs=epochs) 61 | 62 | progress.log(step='Training and testing after transfer...') 63 | sbd = at.transfer.models.sbd_(base_kpi, data_kpi) 64 | mask = at.transfer.models.find_optimal_mask(sbd, 65 | threshold=args.th / 10, 66 | # threshold=0.3, 67 | less_mask=((1, 1, 1), (1, 1, 1)), 68 | greater_mask=((1, 1, 0), (0, 1, 1))) 69 | result['f1score_post_transfer'] = train_test(train_kpi=train_kpi, 70 | epochs=epochs, 71 | mask=mask, 72 | model_path=model_path, 73 | base_kpi=base_kpi) 74 | 75 | return result 76 | 77 | 78 | def cluster_data(path: str) -> Tuple[str, str]: 79 | base = None 80 | data = None 81 | for item in os.listdir(path): 82 | item_path = os.path.join(path, item) 83 | if os.path.isdir(item_path): 84 | data = item_path 85 | else: 86 | base = item_path 87 | if base is None or data is None: 88 | raise ValueError('Base path or data path not found') 89 | return base, data 90 | 91 | 92 | def main(TH: int): 93 | at.utils.mkdirs(OUTPUT, MODEL_PATH) 94 | clusters = os.listdir(INPUT) 95 | 96 | cluster_prog = at.utils.ProgLog(len(clusters)) 97 | time_map = {} 98 | for cluster in clusters: 99 | cluster_model_path = os.path.join(MODEL_PATH, str(TH), cluster) 100 | cluster_name = os.path.basename(cluster_model_path) 101 | time_map[cluster_name] = 0 102 | cluster_prog.log(cluster=cluster) 103 | 104 | base, data = cluster_data(os.path.join(INPUT, cluster)) 105 | file_list = at.utils.file_list(data) 106 | step_progress = at.utils.ProgLog(2, indent=1) 107 | 108 | step_progress.log(step='Making base model...', 
cluster=cluster) 109 | base_kpi = at.utils.load_kpi(base) 110 | make_base_model(kpi=base_kpi, model_path=cluster_model_path, epochs=BASE_EPOCHS) 111 | 112 | # step_progress.log(step='Performing transfer learning...', cluster=cluster) 113 | # output_path = os.path.join(OUTPUT, f'{cluster}.csv') 114 | # with open(output_path, 'w') as output: 115 | # output.write('kpi_name,f1score_pre_transfer,f1score_post_transfer\n') 116 | 117 | # file_progress = at.utils.ProgLog(len(file_list), indent=2) 118 | # for file in file_list: 119 | # data_kpi = at.utils.load_kpi(file) 120 | # file_progress.log(kpi=data_kpi.name, cluster=cluster) 121 | # with run_time() as t: 122 | # result = transfer_learning(base_kpi=base_kpi, 123 | # data_kpi=data_kpi, 124 | # train_ratio=RATIO, 125 | # model_path=cluster_model_path, 126 | # epochs=DATA_EPOCHS) 127 | # time_map[cluster_name] += t.get_time() 128 | # if result is not None and \ 129 | # result.get("f1score_pre_transfer") is not None and \ 130 | # result.get("f1score_post_transfer") is not None: 131 | # with open(output_path, 'a') as output: 132 | # output.write(f'{data_kpi.name},' 133 | # f'{result.get("f1score_pre_transfer"):.3f},' 134 | # f'{result.get("f1score_post_transfer"):.3f}\n') 135 | # import json 136 | # json.dump(time_map, open("train_cluster_time.json", "w"), indent=4) 137 | 138 | if __name__ == '__main__': 139 | logging.basicConfig(level=logging.INFO, format='[%(asctime)s [%(levelname)s]] %(message)s') 140 | 141 | config = at.utils.config() 142 | CLUSTER_OUTPUT = config.get("CLUSTERING", "output") 143 | BASE_EPOCHS = config.getint('TRANSFER_LEARNING', 'base_epochs') 144 | DATA_EPOCHS = config.getint('TRANSFER_LEARNING', 'data_epochs') 145 | INPUT = config.get('TRANSFER_LEARNING', 'input') 146 | OUTPUT = config.get('TRANSFER_LEARNING', 'output') 147 | MODEL_PATH = config.get('TRANSFER_LEARNING', 'model_path') 148 | RATIO = config.getfloat('TRANSFER_LEARNING', 'ratio') 149 | 150 | for th in range(0, 21, 1): 151 | main(th) 152 | -------------------------------------------------------------------------------- /sample/scripts/transfer/cluster_transfer_test.py: -------------------------------------------------------------------------------- 1 | from sample.scripts.transfer.utils import run_time 2 | import torch 3 | import pandas as pd 4 | from anomalytransfer.transfer.data import KPI 5 | import os 6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 7 | import logging 8 | import anomalytransfer as at 9 | import numpy as np 10 | from glob import glob 11 | from typing import Sequence, Tuple, Dict, Optional, cast 12 | np.seterr(divide='ignore', invalid='ignore') 13 | 14 | 15 | def _ignore_missing(series_list: Sequence, missing: np.ndarray) -> Tuple[np.ndarray, ...]: 16 | ret = [] 17 | for series in series_list: 18 | series = np.copy(series) 19 | ret.append(series[missing != 1]) 20 | return tuple(ret) 21 | 22 | 23 | def get_test_results( 24 | timestamps: np.ndarray, 25 | labels: np.ndarray, 26 | scores: np.ndarray, 27 | missing: np.ndarray, 28 | values: np.ndarray, 29 | window_size: int = 120, 30 | **kwargs) -> Dict: 31 | timestamps = timestamps[window_size - 1:] 32 | labels = labels[window_size - 1:] 33 | scores = scores[window_size - 1:] 34 | missing = missing[window_size - 1:] 35 | values = values[window_size - 1:] 36 | adjusted_timestamps, adjusted_labels, adjusted_scores, adjusted_values = _ignore_missing( 37 | [timestamps, labels, scores, values], missing=missing) 38 | 39 | return { 40 | "timestamp": adjusted_timestamps, 41 | "scores": adjusted_scores, 42 | 
"labels": adjusted_labels, 43 | "values": adjusted_values 44 | } 45 | 46 | 47 | def main(TH: int): 48 | raw_csvs = glob(os.path.join(INPUT, "*.csv")) 49 | assert len(raw_csvs) > 0 50 | 51 | models = glob(os.path.join(MODEL_PATH, str(TH), "cluster-*")) 52 | assert len(models) > 0 53 | 54 | time_map = {} 55 | for raw_csv in raw_csvs: 56 | print(f"The KPI: {raw_csv}") 57 | raw_kpi_name = os.path.splitext(os.path.basename(raw_csv))[0] 58 | time_map[raw_kpi_name] = 0 59 | raw_kpi = at.utils.load_kpi(raw_csv) 60 | raw_kpi, _, _ = raw_kpi.standardize() 61 | raw_kpi.complete_timestamp() 62 | 63 | total_timestamps = [] 64 | total_scores = [] 65 | total_labels = [] 66 | total_values = [] 67 | 68 | # get daily KPI 69 | train_week_day_map, test_week_day_map, test_kpi = raw_kpi.split_days(days=7) 70 | 71 | # get cluster map 72 | cluster_map = {} # weekday -> cluster_name 73 | for cluster in os.listdir(DAILY_OUTPUT): 74 | data_path = os.path.join(DAILY_OUTPUT, cluster, "data") 75 | raw_csv_daily = glob(os.path.join( 76 | data_path, f"{raw_kpi_name}*.csv")) 77 | raw_csv_daily = [int(os.path.splitext(os.path.basename(csv))[ 78 | 0][-1]) for csv in raw_csv_daily] 79 | for daily in raw_csv_daily: 80 | assert daily not in cluster_map 81 | cluster_map[daily] = cluster 82 | 83 | # fine-tune with train_kpi 84 | for weekday, kpi_seq in train_week_day_map.items(): 85 | dst_cluster_name = cluster_map[weekday] 86 | cluster_model_path = os.path.join(MODEL_PATH, str(TH), dst_cluster_name) 87 | model = at.transfer.models.AnomalyDetector() 88 | if os.path.exists(os.path.join(cluster_model_path, "finetune")): 89 | model.load(cluster_model_path, "finetune") 90 | else: 91 | model.load(cluster_model_path, "base") 92 | 93 | for kpi in kpi_seq: 94 | with run_time() as t: 95 | model.fit(kpi, epochs=DATA_EPOCHS, verbose=1) 96 | time_map[raw_kpi_name] += t.get_time() 97 | if len(kpi_seq) > 0: 98 | model.save(cluster_model_path, "finetune") 99 | 100 | # test 101 | for weekday, kpi_seq in test_week_day_map.items(): 102 | dst_cluster_name = cluster_map[weekday] 103 | cluster_model_path = os.path.join(MODEL_PATH, str(TH), dst_cluster_name) 104 | assert os.path.exists(os.path.join( 105 | cluster_model_path, "finetune")), f"the train stage of {dst_cluster_name} is missed..." 
106 | 107 | model = at.transfer.models.AnomalyDetector() 108 | model.load(cluster_model_path, "finetune") 109 | for kpi in kpi_seq: 110 | kpi = cast(KPI, kpi) 111 | anomaly_scores = model.predict(kpi, verbose=1) 112 | try: 113 | results = get_test_results( 114 | timestamps=kpi.timestamps, 115 | labels=kpi.labels, 116 | scores=anomaly_scores, 117 | missing=kpi.missing, 118 | values=kpi.values 119 | ) 120 | # results = results['0.0001']['0.98'] 121 | 122 | total_timestamps.extend(results["timestamp"]) 123 | total_scores.extend(results["scores"]) 124 | total_labels.extend(results["labels"]) 125 | total_values.extend(results["values"]) 126 | except: 127 | import traceback 128 | traceback.print_exc() 129 | exit(-1) 130 | 131 | total_timestamps = np.asarray(total_timestamps) 132 | total_scores = np.asarray(total_scores) 133 | total_labels = np.asarray(total_labels) 134 | total_values = np.asarray(total_values) 135 | 136 | sort_idx = np.argsort(total_timestamps) 137 | total_timestamps = total_timestamps[sort_idx] 138 | total_scores = total_scores[sort_idx] 139 | total_values = total_values[sort_idx] 140 | total_labels = total_labels[sort_idx] 141 | 142 | # # adjust after concatenate 143 | adjusted_scores = at.utils.adjust_scores( 144 | labels=total_labels, scores=total_scores) 145 | 146 | dt = pd.DataFrame({ 147 | "ts": total_timestamps, 148 | "scores": adjusted_scores, 149 | "values": total_values, 150 | "label": total_labels, 151 | }) 152 | # if not os.path.exists(os.path.join(OUTPUT, "transfer")): 153 | # os.makedirs(os.path.join(OUTPUT, "transfer"), exist_ok=True) 154 | if not os.path.exists(os.path.join(OUTPUT, f"transfer_{TH / 10}")): 155 | os.makedirs(os.path.join(OUTPUT, f"transfer_{TH / 10}")) 156 | dt.to_csv(os.path.join(OUTPUT, f"transfer_{TH / 10}", 157 | f"{raw_kpi_name}.csv"), index=False) 158 | 159 | import json 160 | json.dump(time_map, open("test_time.json", "w"), indent=4) 161 | 162 | 163 | if __name__ == '__main__': 164 | logging.basicConfig(level=logging.INFO, 165 | format='[%(asctime)s [%(levelname)s]] %(message)s') 166 | 167 | config = at.utils.config() 168 | CLUSTER_OUTPUT = config.get("CLUSTERING", "output") 169 | DAILY_OUTPUT = os.path.join(CLUSTER_OUTPUT, "daily_cluster") 170 | 171 | INPUT = config.get('BAGEL', 'input') 172 | OUTPUT = config.get('TRANSFER_LEARNING', 'output') 173 | MODEL_PATH = config.get('TRANSFER_LEARNING', 'model_path') 174 | DATA_EPOCHS = config.getint('TRANSFER_LEARNING', 'data_epochs') 175 | 176 | for th in range(18, 21, 1): 177 | main(th) 178 | -------------------------------------------------------------------------------- /sample/scripts/clustering/step4_clustering.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import KMeans 2 | import torch 3 | import pandas as pd 4 | import numpy as np 5 | from typing import Sequence, Tuple 6 | import logging 7 | import anomalytransfer as at 8 | import os 9 | import shutil 10 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 11 | 12 | 13 | def _load_data(path: str) -> Tuple[np.ndarray, Sequence]: 14 | file_list = at.utils.file_list(path) 15 | progbar = at.utils.ProgBar(len(file_list), interval=0.5, unit_name='file') 16 | values = [] 17 | names = [] 18 | for file in file_list: 19 | filename = at.utils.filename(file) 20 | names.append(filename) 21 | values.append(at.clustering.preprocessing.down_sampling( 22 | [pd.read_csv(file).value.to_numpy()], step=DOWN_SAMPLING_STEP)[0]) 23 | progbar.add(1) 24 | values = np.expand_dims(np.asarray(values), 
-1).astype(np.float32) 25 | return values, names 26 | 27 | 28 | def _get_latent_vectors(x: np.ndarray) -> np.ndarray: 29 | x = torch.as_tensor(x) 30 | seq_length = x.shape[1] 31 | input_dim = x.shape[2] 32 | 33 | model = at.clustering.LatentTransformer( 34 | seq_length=seq_length, input_dim=input_dim) 35 | model.fit(x, epochs=EPOCHS) 36 | model.save(os.path.join(OUTPUT, 'model.pt')) 37 | return model.transform(x) 38 | 39 | 40 | def _get_clustering_result(labels: Sequence, names: Sequence) -> Tuple[Sequence, Sequence]: 41 | class_count = {} 42 | base_names = [] 43 | classes = [] 44 | 45 | for i in range(len(names)): 46 | base_name = names[i][:-4] 47 | if base_name not in class_count.keys(): 48 | class_count[base_name] = [0] * N_CLUSTERS 49 | class_count[base_name][labels[i]] += 1 50 | 51 | for k, v in class_count.items(): 52 | base_names.append(k) 53 | classes.append(np.argmax(v)) 54 | 55 | return base_names, classes 56 | 57 | 58 | def _sse_get_best_cluster_num(latent): 59 | if not os.path.exists("SSE (best cluster num).png"): 60 | distance_centroid = [] 61 | max_clusters = 50 62 | for i in range(1, max_clusters): 63 | km = KMeans(n_clusters=i) 64 | km.fit(latent) 65 | distance_centroid.append(km.inertia_) 66 | 67 | import matplotlib.pyplot as plt 68 | plt.figure() 69 | plt.plot(range(1, max_clusters), distance_centroid, marker="o") 70 | plt.xlabel("The num of clusters") 71 | plt.ylabel("SSE") 72 | plt.savefig("SSE (best cluster num).png") 73 | 74 | 75 | def _get_distance_centroid(features: np.ndarray, centroid: np.ndarray, labels: np.ndarray): 76 | """ 77 | return: (N_samples, order_in_each_cluster) 78 | """ 79 | distance_order = np.zeros([features.shape[0]]) 80 | for label in range(centroid.shape[0]): 81 | center: np.ndarray = centroid[label] 82 | feature_idx = np.where(labels == label)[0] 83 | feature_with_label: np.ndarray = features[feature_idx] 84 | distance = np.sqrt( 85 | np.power((feature_with_label-center), 2).sum(axis=1)) 86 | order_idx = np.argsort(distance) 87 | order = np.zeros_like(order_idx) 88 | order[order_idx] = range(1, order.shape[0]+1) 89 | distance_order[feature_idx] = order 90 | return distance_order 91 | 92 | 93 | def _save_top_k_daily_kpi(order: np.ndarray, labels: np.ndarray, names: Sequence): 94 | """ 95 | save the KPIs from the average stage 96 | """ 97 | # save the entrire daily-kpi cluster result 98 | output_root = os.path.join(OUTPUT, "daily_cluster") 99 | if not os.path.exists(output_root): 100 | os.makedirs(output_root, exist_ok=True) 101 | for i, (label, name) in enumerate(zip(labels, names)): 102 | save_path = os.path.join(output_root, f"cluster-{label}", "data") 103 | if not os.path.exists(save_path): 104 | os.makedirs(save_path, exist_ok=True) 105 | save_file = os.path.join(save_path, f"{name}.csv") 106 | src_file = os.path.join(AVERAGE_OUTPUT, f"{name}.csv") 107 | shutil.copyfile(src_file, save_file) 108 | 109 | top_k_idx = np.where(order <= TOP_K)[0] 110 | 111 | top_k_labels = labels[top_k_idx] 112 | top_k_name = np.asarray(names)[top_k_idx] 113 | top_k_order = order[top_k_idx] 114 | 115 | output_root = os.path.join(OUTPUT, "top_k_daily_cluster") 116 | if not os.path.exists(output_root): 117 | os.makedirs(output_root, exist_ok=True) 118 | 119 | label_based = set() 120 | for i, (label, name, order) in enumerate(zip(top_k_labels, top_k_name, top_k_order)): 121 | if order == 1 and label not in label_based: # generate base 122 | label_based.add(label) 123 | save_path = os.path.join(output_root, f"cluster-{label}") 124 | if not os.path.exists(save_path): 
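                # The daily sample ranked closest to its cluster centroid (order == 1) is
                # copied out as base.csv and becomes that cluster's base KPI for transfer.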
125 | os.makedirs(save_path, exist_ok=True) 126 | save_file = os.path.join(save_path, "base.csv") 127 | else: 128 | save_path = os.path.join(output_root, f"cluster-{label}", "data") 129 | if not os.path.exists(save_path): 130 | os.makedirs(save_path, exist_ok=True) 131 | save_file = os.path.join(save_path, f"{name}.csv") 132 | 133 | src_file = os.path.join(AVERAGE_OUTPUT, f"{name}.csv") 134 | shutil.copyfile(src_file, save_file) 135 | 136 | 137 | def _save_base_kpi(base_names: Sequence, classes: Sequence): 138 | """ 139 | save the KPIs from the preprocess stage 140 | """ 141 | output_root = os.path.join(OUTPUT, "base_cluster") 142 | if not os.path.exists(output_root): 143 | os.makedirs(output_root, exist_ok=True) 144 | 145 | tag = np.zeros([len(classes)]) 146 | for base, cls in zip(base_names, classes): 147 | if tag[cls] == 0: # generate base 148 | save_root = os.path.join(output_root, f"cluster-{cls}") 149 | save_file = os.path.join(save_root, "base.csv") 150 | tag[cls] = 1 151 | else: 152 | save_root = os.path.join(output_root, f"cluster-{cls}", "data") 153 | save_file = os.path.join(save_root, f"{base}.csv") 154 | if not os.path.exists(save_root): 155 | os.makedirs(save_root, exist_ok=True) 156 | src_file = os.path.join(RAW_INPUT, f"{base}.csv") 157 | shutil.copy(src_file, save_file) 158 | 159 | 160 | def main(): 161 | at.utils.mkdirs(OUTPUT) 162 | step_progress = at.utils.ProgLog(4) 163 | 164 | step_progress.log(step='Preparing data...') 165 | values, names = _load_data(INPUT) 166 | 167 | step_progress.log(step='Getting latent vectors...') 168 | latent = _get_latent_vectors(values) # (n_samples, 20) 169 | 170 | step_progress.log( 171 | step='Performing K-means clustering on latent vectors...') 172 | 173 | _sse_get_best_cluster_num(latent) 174 | 175 | k_means = KMeans(n_clusters=N_CLUSTERS) 176 | k_means.fit(latent) 177 | 178 | # get the distance of samples to their closest cluster center 179 | labels = k_means.labels_ # (n_samples, ) 180 | cluster_centers = k_means.cluster_centers_ # (n_clusters, n_features) 181 | order = _get_distance_centroid(latent, cluster_centers, labels) 182 | 183 | # get TOP K kpi (with the shortest distance from centroid) 184 | _save_top_k_daily_kpi(order, labels, names) 185 | 186 | step_progress.log(step='Computing clustering result...') 187 | base_names, classes = _get_clustering_result(labels=labels, names=names) 188 | _save_base_kpi(base_names, classes) 189 | df = pd.DataFrame({'name': base_names, 'cluster': classes}) 190 | df.to_csv(os.path.join(OUTPUT, 'result.csv'), index=False) 191 | 192 | 193 | if __name__ == '__main__': 194 | logging.basicConfig(level=logging.DEBUG, 195 | format='[%(asctime)s [%(levelname)s]] %(message)s') 196 | config = at.utils.config() 197 | 198 | INPUT = config.get('CLUSTERING', 'input') 199 | OUTPUT = config.get('CLUSTERING', 'output') 200 | EPOCHS = config.getint('CLUSTERING', 'epochs') 201 | AVERAGE_OUTPUT = config.get("CLUSTERING_AVERAGE", "output_daily") 202 | RAW_INPUT = config.get("CLUSTERING_PREPROCESSING", "input") 203 | DOWN_SAMPLING_STEP = config.getint( 204 | 'CLUSTERING_PREPROCESSING', 'down_sampling_step') 205 | try: 206 | N_CLUSTERS = config.getint('CLUSTERING', 'n_clusters') 207 | except: 208 | # see `"SSE (best cluster num).png"` to set best cluster number. 
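        # Fallback when CLUSTERING/n_clusters is absent from the config: run once with a
        # single cluster, pick the elbow of the SSE curve written by
        # _sse_get_best_cluster_num, then set n_clusters and rerun.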
209 | N_CLUSTERS = 1 210 | 211 | TOP_K = 50 212 | main() 213 | -------------------------------------------------------------------------------- /sample/scripts/transfer_entirely/finetune.py: -------------------------------------------------------------------------------- 1 | import os 2 | from multiprocessing import Pool 3 | 4 | from torch.cuda import is_available 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 6 | 7 | import logging 8 | import anomalytransfer as at 9 | from glob import glob 10 | from utils import run_time 11 | 12 | from typing import Sequence, Tuple, Dict, Optional 13 | import pandas as pd 14 | import numpy as np 15 | import torch 16 | from sklearn.metrics import precision_recall_curve 17 | from tqdm import tqdm 18 | 19 | logging.basicConfig(level=logging.INFO, format='[%(asctime)s [%(levelname)s]] %(message)s') 20 | 21 | config = at.utils.config() 22 | CLUSTER_OUTPUT = config.get("CLUSTERING", "output") 23 | EPOCHS = config.getint("CLUSTERING", "epochs") 24 | BASE_EPOCHS = config.getint('TRANSFER_LEARNING', 'base_epochs') 25 | DATA_EPOCHS = config.getint('TRANSFER_LEARNING', 'data_epochs') 26 | INPUT = config.get('TRANSFER_LEARNING', 'input') 27 | OUTPUT = config.get('TRANSFER_LEARNING', 'output') 28 | MODEL_PATH = config.get('TRANSFER_LEARNING', 'model_path') 29 | RATIO = config.getfloat('TRANSFER_LEARNING', 'ratio') 30 | 31 | RAW_INPUT = config.get("CLUSTERING_PREPROCESSING", "input") 32 | 33 | 34 | def _get_latent_vectors(x: np.ndarray) -> np.ndarray: 35 | x = torch.as_tensor(x) 36 | seq_length = x.shape[1] 37 | input_dim = x.shape[2] 38 | 39 | model = at.clustering.LatentTransformer( 40 | seq_length=seq_length, input_dim=input_dim) 41 | model.fit(x, epochs=EPOCHS, verbose=0) 42 | model.save(os.path.join(OUTPUT, 'model.pt')) 43 | return model.transform(x) 44 | 45 | 46 | def cluster_data(path: str) -> Tuple[str, str]: 47 | base = None 48 | data = None 49 | for item in os.listdir(path): 50 | item_path = os.path.join(path, item) 51 | if os.path.isdir(item_path): 52 | data = item_path 53 | else: 54 | base = item_path 55 | if base is None or data is None: 56 | raise ValueError('Base path or data path not found') 57 | return base, data 58 | 59 | def make_base_model(kpi: at.transfer.data.KPI, epochs: int): 60 | kpi.complete_timestamp() 61 | kpi, _, _ = kpi.standardize() 62 | model = at.transfer.models.AnomalyDetector() 63 | model.fit(kpi=kpi.no_labels(), epochs=epochs, verbose=0) 64 | return model 65 | 66 | def train_test(train_kpi: at.transfer.data.KPI, 67 | epochs: int, 68 | test_kpi: at.transfer.data.KPI = None, 69 | mask: Optional[Sequence] = None, 70 | **kwargs) -> float: 71 | model = at.transfer.models.AnomalyDetector() 72 | if mask is not None: 73 | model.load_partial(path=kwargs.get('model_path'), name=kwargs.get('base_kpi').name, mask=mask) 74 | model.freeze(mask) 75 | model.fit(kpi=train_kpi.no_labels(), epochs=epochs, verbose=0) 76 | model.unfreeze(mask) 77 | model.fit(kpi=train_kpi.no_labels(), epochs=epochs, verbose=0) 78 | if test_kpi is not None and test_kpi.labels is not None: 79 | anomaly_scores = model.predict(test_kpi, verbose=0) 80 | results = at.utils.get_test_results(labels=test_kpi.labels, 81 | scores=anomaly_scores, 82 | missing=test_kpi.missing, 83 | use_spot=False) 84 | at.utils.log_test_results(name=test_kpi.name, results=results) 85 | return results['f1score'] 86 | else: 87 | return None 88 | 89 | 90 | def _ignore_missing(series_list: Sequence, missing: np.ndarray) -> Tuple[np.ndarray, ...]: 91 | ret = [] 92 | for series in series_list: 93 | series = 
np.copy(series) 94 | ret.append(series[missing != 1]) 95 | return tuple(ret) 96 | 97 | 98 | def get_test_results( 99 | timestamps: np.ndarray, 100 | labels: np.ndarray, 101 | scores: np.ndarray, 102 | missing: np.ndarray, 103 | values: np.ndarray, 104 | window_size: int = 120, 105 | **kwargs) -> Dict: 106 | timestamps = timestamps[window_size - 1:] 107 | labels = labels[window_size - 1:] 108 | scores = scores[window_size - 1:] 109 | missing = missing[window_size - 1:] 110 | values = values[window_size - 1:] 111 | adjusted_timestamps, adjusted_labels, adjusted_scores, adjusted_values = _ignore_missing( 112 | [timestamps, labels, scores, values], missing=missing 113 | ) 114 | 115 | adjusted_scores = at.utils.adjust_scores( 116 | labels=adjusted_labels, scores=adjusted_scores) 117 | precision, recall, th = precision_recall_curve(adjusted_labels, adjusted_scores, pos_label=1) 118 | 119 | f1_score = 2 * precision * recall / (precision + recall + 1e-6) 120 | 121 | arg_max = np.argmax(f1_score) 122 | 123 | best_precision, best_recall, best_f1_score = precision[arg_max], recall[arg_max], f1_score[arg_max] 124 | threshold = th[arg_max] 125 | return best_f1_score 126 | 127 | 128 | def main(finetune_num=200): 129 | print(finetune_num) 130 | # with torch.cuda.device(torch.device(f"cuda:{finetune_num//200%2}")): 131 | clusters = os.listdir(INPUT) 132 | base_values = [] 133 | base_models = [] 134 | for cluster in tqdm(clusters, total=len(clusters)): 135 | base, data = cluster_data(os.path.join(INPUT, cluster)) 136 | base_kpi = at.utils.load_kpi(base) 137 | base_kpi.complete_timestamp() 138 | base_kpi, _, _ = base_kpi.standardize() 139 | base_model = make_base_model(base_kpi, BASE_EPOCHS) 140 | base_models.append(base_model) 141 | 142 | dt = pd.read_csv(base) 143 | base_values.append(dt["value"]) 144 | 145 | file_list = at.utils.file_list(RAW_INPUT) 146 | cluster_values = [] 147 | finetune_values = [] 148 | test_kpis = [] 149 | names = [] 150 | for file in file_list: 151 | data_kpi = at.utils.load_kpi(file) 152 | data_kpi.complete_timestamp() 153 | data_kpi, _, _ = data_kpi.standardize() 154 | filename = at.utils.filename(file) 155 | names.append(filename) 156 | 157 | # split idx 158 | ts = data_kpi.timestamps 159 | ts = ts % (60 * 60 * 24) 160 | split_idx = np.where(ts <= 60)[0] 161 | _, data_kpi = data_kpi.split_by_idx(split_idx[0], window_size=1) 162 | 163 | # split to [for cluster] and [for finetune] 164 | ts = data_kpi.timestamps 165 | ts = ts % (60 * 60 * 24) 166 | split_idx = np.where(ts <= 60)[0] 167 | cluster_value, finetune_value = data_kpi.split_by_idx(split_idx[1], window_size=1) 168 | finetune_value, test_value = finetune_value.split_by_idx(finetune_num, window_size=1) 169 | 170 | cluster_values.append(cluster_value.values) 171 | finetune_values.append(finetune_value) 172 | test_kpis.append(test_value) 173 | 174 | 175 | # get latent var 176 | base_values = np.asarray(base_values, dtype=np.float32)[..., None] 177 | base_feature = _get_latent_vectors(base_values) 178 | 179 | cluster_values = np.asarray(cluster_values, dtype=np.float32)[..., None] 180 | cluster_feature = _get_latent_vectors(cluster_values) 181 | 182 | tmp_result = {name: 0 for name in names} 183 | tmp_result["num_of_points"] = finetune_num 184 | for i, (ft, finetune, test_kpi, name) in enumerate(zip(cluster_feature, finetune_values, test_kpis, names)): 185 | cluster_idx = np.argmin(np.sum((ft - base_feature)**2, axis=1)) 186 | base_model = base_models[cluster_idx] 187 | # base_model.fit(kpi=finetune.no_labels(), 
epochs=DATA_EPOCHS, verbose=0) 188 | anomaly_scores = base_model.predict(test_kpi, verbose=1) 189 | f1_score = get_test_results( 190 | timestamps=test_kpi.timestamps, 191 | labels=test_kpi.labels, 192 | scores=anomaly_scores, 193 | missing=test_kpi.missing, 194 | values=test_kpi.values 195 | ) 196 | tmp_result[name] = f1_score 197 | print(f"{i} - {name}") 198 | 199 | return tmp_result 200 | 201 | if __name__ == '__main__': 202 | 203 | # for num in range(200, 5000, 200): 204 | # main(num) 205 | with Pool(1) as pool: 206 | results = pool.map(main, range(200, 201, 200)) 207 | # results = pool.map(main, range(200, 201, 200)) 208 | final_result = pd.DataFrame(columns=list(results[0].keys())) 209 | for res in results: 210 | final_result = final_result.append(res, ignore_index=True) 211 | 212 | final_result = final_result.sort_values("num_of_points") 213 | final_result.to_csv("result.csv", index=False) 214 | -------------------------------------------------------------------------------- /anomalytransfer/utils/logging.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import logging 5 | import numpy as np 6 | 7 | from typing import Dict 8 | 9 | 10 | class ProgLog: 11 | 12 | def __init__(self, n: int, indent: int = 0): 13 | self._n = n 14 | self._indent = indent 15 | self._current = 1 16 | 17 | def log(self, **extra): 18 | message = '' 19 | for k, v in extra.items(): 20 | message += f' {k}={v}' 21 | logging.info(f'{"-" * (2 * self._indent - 1) + " " if self._indent > 0 else ""}' 22 | f'Progress >>> {self._current}/{self._n}{message}') 23 | self._current += 1 24 | 25 | 26 | class ProgBar: 27 | """Displays a progress bar. 28 | 29 | Arguments: 30 | target: Total number of steps expected, None if unknown. 31 | width: Progress bar width on screen. 32 | verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose) 33 | stateful_metrics: Iterable of string names of metrics that should *not* be 34 | averaged over time. Metrics in this list will be displayed as-is. All 35 | others will be averaged by the progbar before display. 36 | interval: Minimum visual progress update interval (in seconds). 37 | unit_name: Display name for step counts (usually "step" or "sample"). 38 | """ 39 | 40 | def __init__(self, 41 | target, 42 | width=30, 43 | verbose=1, 44 | interval=0.05, 45 | stateful_metrics=None, 46 | unit_name='step'): 47 | self.target = target 48 | self.width = width 49 | self.verbose = verbose 50 | self.interval = interval 51 | self.unit_name = unit_name 52 | if stateful_metrics: 53 | self.stateful_metrics = set(stateful_metrics) 54 | else: 55 | self.stateful_metrics = set() 56 | 57 | self._dynamic_display = ((hasattr(sys.stdout, 'isatty') and 58 | sys.stdout.isatty()) or 59 | 'ipykernel' in sys.modules or 60 | 'posix' in sys.modules or 61 | 'PYCHARM_HOSTED' in os.environ) 62 | self._total_width = 0 63 | self._seen_so_far = 0 64 | # We use a dict + list to avoid garbage collection 65 | # issues found in OrderedDict 66 | self._values = {} 67 | self._values_order = [] 68 | self._start = time.time() 69 | self._last_update = 0 70 | 71 | def update(self, current, values=None, finalize=None): 72 | """Updates the progress bar. 73 | 74 | Arguments: 75 | current: Index of current step. 76 | values: List of tuples: `(name, value_for_last_step)`. If `name` is in 77 | `stateful_metrics`, `value_for_last_step` will be displayed as-is. 78 | Else, an average of the metric over time will be displayed. 
79 | finalize: Whether this is the last update for the progress bar. If 80 | `None`, defaults to `current >= self.target`. 81 | """ 82 | if finalize is None: 83 | if self.target is None: 84 | finalize = False 85 | else: 86 | finalize = current >= self.target 87 | 88 | values = values or [] 89 | for k, v in values: 90 | if k not in self._values_order: 91 | self._values_order.append(k) 92 | if k not in self.stateful_metrics: 93 | # In the case that progress bar doesn't have a target value in the first 94 | # epoch, both on_batch_end and on_epoch_end will be called, which will 95 | # cause 'current' and 'self._seen_so_far' to have the same value. Force 96 | # the minimal value to 1 here, otherwise stateful_metric will be 0s. 97 | value_base = max(current - self._seen_so_far, 1) 98 | if k not in self._values: 99 | self._values[k] = [v * value_base, value_base] 100 | else: 101 | self._values[k][0] += v * value_base 102 | self._values[k][1] += value_base 103 | else: 104 | # Stateful metrics output a numeric value. This representation 105 | # means "take an average from a single value" but keeps the 106 | # numeric formatting. 107 | self._values[k] = [v, 1] 108 | self._seen_so_far = current 109 | 110 | now = time.time() 111 | info = ' - %.0fs' % (now - self._start) 112 | if self.verbose == 1: 113 | if now - self._last_update < self.interval and not finalize: 114 | return 115 | 116 | prev_total_width = self._total_width 117 | if self._dynamic_display: 118 | sys.stdout.write('\b' * prev_total_width) 119 | sys.stdout.write('\r') 120 | else: 121 | sys.stdout.write('\n') 122 | 123 | if self.target is not None: 124 | numdigits = int(np.log10(self.target)) + 1 125 | bar = ('%' + str(numdigits) + 'd/%d [') % (current, self.target) 126 | prog = float(current) / self.target 127 | prog_width = int(self.width * prog) 128 | if prog_width > 0: 129 | bar += ('=' * (prog_width - 1)) 130 | if current < self.target: 131 | bar += '>' 132 | else: 133 | bar += '=' 134 | bar += ('.' 
* (self.width - prog_width)) 135 | bar += ']' 136 | else: 137 | bar = '%7d/Unknown' % current 138 | 139 | self._total_width = len(bar) 140 | sys.stdout.write(bar) 141 | 142 | if current: 143 | time_per_unit = (now - self._start) / current 144 | else: 145 | time_per_unit = 0 146 | 147 | if self.target is None or finalize: 148 | if time_per_unit >= 1 or time_per_unit == 0: 149 | info += ' %.0fs/%s' % (time_per_unit, self.unit_name) 150 | elif time_per_unit >= 1e-3: 151 | info += ' %.0fms/%s' % (time_per_unit * 1e3, self.unit_name) 152 | else: 153 | info += ' %.0fus/%s' % (time_per_unit * 1e6, self.unit_name) 154 | else: 155 | eta = time_per_unit * (self.target - current) 156 | if eta > 3600: 157 | eta_format = '%d:%02d:%02d' % (eta // 3600, 158 | (eta % 3600) // 60, eta % 60) 159 | elif eta > 60: 160 | eta_format = '%d:%02d' % (eta // 60, eta % 60) 161 | else: 162 | eta_format = '%ds' % eta 163 | 164 | info = ' - ETA: %s' % eta_format 165 | 166 | for k in self._values_order: 167 | info += ' - %s:' % k 168 | if isinstance(self._values[k], list): 169 | avg = np.mean(self._values[k][0] / max(1, self._values[k][1])) 170 | if abs(avg) > 1e-3: 171 | info += ' %.4f' % avg 172 | else: 173 | info += ' %.4e' % avg 174 | else: 175 | info += ' %s' % self._values[k] 176 | 177 | self._total_width += len(info) 178 | if prev_total_width > self._total_width: 179 | info += (' ' * (prev_total_width - self._total_width)) 180 | 181 | if finalize: 182 | info += '\n' 183 | 184 | sys.stdout.write(info) 185 | sys.stdout.flush() 186 | 187 | elif self.verbose == 2: 188 | if finalize: 189 | numdigits = int(np.log10(self.target)) + 1 190 | count = ('%' + str(numdigits) + 'd/%d') % (current, self.target) 191 | info = count + info 192 | for k in self._values_order: 193 | info += ' - %s:' % k 194 | avg = np.mean(self._values[k][0] / max(1, self._values[k][1])) 195 | if avg > 1e-3: 196 | info += ' %.4f' % avg 197 | else: 198 | info += ' %.4e' % avg 199 | info += '\n' 200 | 201 | sys.stdout.write(info) 202 | sys.stdout.flush() 203 | 204 | self._last_update = now 205 | 206 | def add(self, n, values=None): 207 | self.update(self._seen_so_far + n, values) 208 | 209 | 210 | def log_test_results(name: str, results: Dict): 211 | logging.info(f'kpi: {name}') 212 | logging.info(f'threshold: {results.get("threshold")}') 213 | logging.info(f'precision: {results.get("precision"):.3f}') 214 | logging.info(f'recall: {results.get("recall"):.3f}') 215 | logging.info(f'f1score: {results.get("f1score"):.3f}') 216 | -------------------------------------------------------------------------------- /anomalytransfer/transfer/data.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import torch 3 | import numpy as np 4 | from datetime import datetime 5 | 6 | from typing import Dict, Sequence, Tuple, Optional 7 | from torch.utils.data import TensorDataset 8 | 9 | 10 | class KPI: 11 | 12 | def __init__(self, 13 | timestamps: Sequence, 14 | values: Sequence, 15 | labels: Optional[Sequence] = None, 16 | missing: Optional[Sequence] = None, 17 | name: Optional[str] = None): 18 | self.timestamps = np.asarray(timestamps, dtype=np.int) 19 | self.values = np.asarray(values, dtype=np.float32) 20 | 21 | if labels is None: 22 | self.labels = np.zeros(np.shape(values), dtype=np.int) 23 | else: 24 | self.labels = np.asarray(labels, dtype=np.int) 25 | 26 | if missing is None: 27 | self.missing = np.zeros(np.shape(values), dtype=np.int) 28 | else: 29 | self.missing = np.asarray(missing, dtype=np.int) 30 | 31 | 
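        # A random UUID is used as the KPI name when none is given; afterwards every point
        # flagged as missing has its label cleared, since filled-in gaps are never scored
        # as anomalies.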
if name is None: 32 | self.name = str(uuid.uuid4()) 33 | else: 34 | self.name = name 35 | 36 | self.labels[self.missing == 1] = 0 37 | 38 | @property 39 | def abnormal(self) -> np.ndarray: 40 | return np.logical_or(self.missing, self.labels).astype(np.int) 41 | 42 | def complete_timestamp(self): 43 | src_idx = np.argsort(self.timestamps) 44 | timestamp_sorted = self.timestamps[src_idx] 45 | intervals = np.unique(np.diff(timestamp_sorted)) 46 | interval = np.min(intervals) 47 | self.interval = interval 48 | if interval == 0: 49 | raise ValueError('Duplicated values in `timestamp`') 50 | for itv in intervals: 51 | if itv % interval != 0: 52 | raise ValueError( 53 | 'Not all intervals in `timestamp` are multiples of the minimum interval') 54 | 55 | length = (timestamp_sorted[-1] - timestamp_sorted[0]) // interval + 1 56 | new_timestamps = np.arange( 57 | timestamp_sorted[0], timestamp_sorted[-1] + interval, interval, dtype=np.int) 58 | new_values = np.zeros([length], dtype=self.values.dtype) 59 | new_labels = np.zeros([length], dtype=self.labels.dtype) 60 | new_missing = np.ones([length], dtype=self.missing.dtype) 61 | 62 | dst_idx = np.asarray( 63 | (timestamp_sorted - timestamp_sorted[0]) // interval, dtype=np.int) 64 | new_values[dst_idx] = self.values[src_idx] 65 | new_labels[dst_idx] = self.labels[src_idx] 66 | new_missing[dst_idx] = self.missing[src_idx] 67 | 68 | self.timestamps = new_timestamps 69 | self.values = new_values 70 | self.labels = new_labels 71 | self.missing = new_missing 72 | 73 | def split(self, ratios: Sequence) -> Tuple['KPI', ...]: 74 | if abs(1.0 - sum(ratios)) > 1e-4: 75 | raise ValueError('The sum of `ratios` must be 1') 76 | partition = np.asarray(np.cumsum(np.asarray( 77 | ratios, dtype=np.float32)) * len(self.values), dtype=np.int) 78 | partition[-1] = len(self.values) 79 | partition = np.concatenate(([0], partition)) 80 | ret = [] 81 | for low, high in zip(partition[:-1], partition[1:]): 82 | ret.append(KPI(timestamps=self.timestamps[low:high], 83 | values=self.values[low:high], 84 | labels=self.labels[low:high], 85 | missing=self.missing[low:high], 86 | name=self.name)) 87 | return tuple(ret) 88 | 89 | def split_by_idx(self, idx: int, window_size: int = 120) -> Tuple['KPI', 'KPI']: 90 | assert len(self.timestamps) > idx 91 | ret = [] 92 | ret.append(KPI(timestamps=self.timestamps[:idx], 93 | values=self.values[:idx], 94 | labels=self.labels[:idx], 95 | missing=self.missing[:idx], 96 | name=self.name)) 97 | # 保留前一天的 window_size - 1个点, 从而解决时间窗口的问题 98 | ret.append(KPI(timestamps=self.timestamps[idx-(window_size-1):], 99 | values=self.values[idx-(window_size-1):], 100 | labels=self.labels[idx-(window_size-1):], 101 | missing=self.missing[idx-(window_size-1):], 102 | name=self.name)) 103 | return tuple(ret) 104 | 105 | def split_days(self, days: int, window_size: int = 120) -> Tuple[Dict, Dict, 'KPI']: 106 | """ 107 | split the KPI into train_kpi[...], test_kpi[...] 
108 | the number of train_kpi equals to `days` (depends on time interval) 109 | """ 110 | total_minutes = days * 24 * 60 111 | inteval = self.interval / 60 # interval in minute 112 | num_of_point = int(total_minutes / inteval) 113 | train_kpi, test_kpi = self.split_by_idx(num_of_point, window_size = window_size) 114 | 115 | # split by day 116 | train_ts = train_kpi.timestamps 117 | train_datetime = [datetime.fromtimestamp(ts) for ts in train_ts] 118 | 119 | test_ts = test_kpi.timestamps 120 | test_datetime = [datetime.fromtimestamp(ts) for ts in test_ts] 121 | 122 | train_week_day_map = self._get_daily_kpi(train_kpi, train_datetime, window_size) 123 | test_week_day_map = self._get_daily_kpi(test_kpi, test_datetime, window_size) 124 | return train_week_day_map, test_week_day_map, test_kpi 125 | 126 | def standardize(self, mean: Optional[float] = None, std: Optional[float] = None) -> Tuple['KPI', float, float]: 127 | if (mean is None) != (std is None): 128 | raise ValueError('`mean` and `std` must be both None or not None') 129 | if mean is None: 130 | mean = self.values.mean() 131 | std = self.values.std() 132 | values = (self.values - mean) / std 133 | kpi = KPI(timestamps=self.timestamps, values=values, 134 | labels=self.labels, missing=self.missing, name=self.name) 135 | return kpi, mean, std 136 | 137 | def use_labels(self, rate: float = 1.) -> 'KPI': 138 | if not 0. <= rate <= 1.: 139 | raise ValueError('`rate` must be in [0, 1]') 140 | if rate == 0.: 141 | return KPI(timestamps=self.timestamps, values=self.values, labels=None, missing=self.missing, 142 | name=self.name) 143 | if rate == 1.: 144 | return self 145 | labels = np.copy(self.labels) 146 | anomaly_idx = labels.nonzero()[0] 147 | drop_idx = np.random.choice(anomaly_idx, round( 148 | (1 - rate) * len(anomaly_idx)), replace=False) 149 | labels[drop_idx] = 0 150 | return KPI(timestamps=self.timestamps, values=self.values, labels=labels, missing=self.missing, name=self.name) 151 | 152 | def no_labels(self) -> 'KPI': 153 | return self.use_labels(0.) 
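    # Usage sketch (hypothetical, not called by the library itself): split a KPI into
    # per-weekday daily segments with a window_size - 1 overlap at each day boundary.
    #
    #   kpi.complete_timestamp()                      # required: sets self.interval
    #   train_map, test_map, test_kpi = kpi.split_days(days=7)
    #   monday_segments = train_map.get(0, [])        # weekday() == 0 is Monday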
154 | 155 | def _get_daily_kpi(self, kpi: 'KPI', datetime_seq: Sequence, window_size: int = 120): 156 | i = window_size 157 | week_map = {} 158 | while i < len(kpi.timestamps): 159 | if datetime_seq[i].day != datetime_seq[i-1].day: 160 | weekday = datetime_seq[i-1].weekday() 161 | datetime_seq = datetime_seq[i-(window_size-1):] 162 | dst_kpi, kpi = kpi.split_by_idx(i, window_size=window_size) 163 | i = window_size 164 | if len(dst_kpi.timestamps) > window_size: 165 | if weekday not in week_map: 166 | week_map[weekday] = [] 167 | week_map[weekday].append(dst_kpi) 168 | else: 169 | i += 1 170 | if len(kpi.timestamps) > window_size * 4: 171 | weekday = datetime_seq[0].weekday() 172 | if weekday not in week_map: 173 | week_map[weekday] = [] 174 | week_map[weekday].append(kpi) 175 | return week_map 176 | 177 | 178 | class KPIDataset: 179 | 180 | def __init__(self, kpi: KPI, window_size: int, missing_injection_rate: float = 0.): 181 | self._window_size = window_size 182 | self._missing_injection_rate = missing_injection_rate 183 | 184 | self._one_hot_minute = self._one_hot( 185 | self._ts2minute(kpi.timestamps), depth=60) 186 | self._one_hot_hour = self._one_hot( 187 | self._ts2hour(kpi.timestamps), depth=24) 188 | self._one_hot_weekday = self._one_hot( 189 | self._ts2weekday(kpi.timestamps), depth=7) 190 | 191 | self._value_windows = self._to_windows(kpi.values) 192 | self._label_windows = self._to_windows(kpi.labels) 193 | self._normal_windows = self._to_windows(1 - kpi.abnormal) 194 | 195 | self._time_code = [] 196 | self._values = [] 197 | self._normal = [] 198 | for i in range(len(self._value_windows)): 199 | values = np.copy(self._value_windows[i]).astype(np.float32) 200 | labels = np.copy(self._label_windows[i]).astype(np.int) 201 | normal = np.copy(self._normal_windows[i]).astype(np.int) 202 | 203 | injected_missing = np.random.binomial( 204 | 1, self._missing_injection_rate, np.shape(values[normal == 1])) 205 | normal[normal == 1] = 1 - injected_missing 206 | values[np.logical_and(normal == 0, labels == 0)] = 0. 
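            # The conditional code for window i is the one-hot (minute, hour, weekday)
            # encoding of the last timestamp in the window, i.e. index i + window_size - 1.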
207 | 208 | time_index = i + self._window_size - 1 209 | time_code = np.concatenate( 210 | [self._one_hot_minute[time_index], self._one_hot_hour[time_index], 211 | self._one_hot_weekday[time_index]], 212 | axis=-1 213 | ) 214 | 215 | self._time_code.append(time_code) 216 | self._values.append(values) 217 | self._normal.append(normal) 218 | 219 | def _to_windows(self, series: np.ndarray) -> np.ndarray: 220 | return np.lib.stride_tricks.as_strided( 221 | series, 222 | shape=(np.size(series, 0) - self._window_size + 1, self._window_size), 223 | strides=(series.strides[-1], series.strides[-1]) 224 | ) 225 | 226 | @staticmethod 227 | def _ts2hour(ts: np.ndarray) -> np.ndarray: 228 | return (ts % 86400) // 3600 229 | 230 | @staticmethod 231 | def _ts2minute(ts: np.ndarray) -> np.ndarray: 232 | return ((ts % 86400) % 3600) // 60 233 | 234 | @staticmethod 235 | def _ts2weekday(ts: np.ndarray) -> np.ndarray: 236 | return np.zeros_like(((ts // 86400) + 4) % 7) 237 | 238 | @staticmethod 239 | def _one_hot(indices: Sequence, depth: int) -> np.ndarray: 240 | return np.eye(depth)[indices] 241 | 242 | @property 243 | def time_code(self) -> np.ndarray: 244 | return np.asarray(self._time_code, dtype=np.float32) 245 | 246 | @property 247 | def values(self) -> np.ndarray: 248 | return np.asarray(self._values, dtype=np.float32) 249 | 250 | @property 251 | def normal(self) -> np.ndarray: 252 | return np.asarray(self._normal, dtype=np.float32) 253 | 254 | def to_torch(self, device: str) -> TensorDataset: 255 | return TensorDataset(torch.as_tensor(self.values, device=torch.device(device)), 256 | torch.as_tensor( 257 | self.time_code, device=torch.device(device)), 258 | torch.as_tensor(self.normal, device=torch.device(device))) 259 | -------------------------------------------------------------------------------- /anomalytransfer/transfer/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import anomalytransfer as at 5 | 6 | from typing import Sequence, Tuple, Dict, Optional 7 | from torch.backends import cudnn 8 | from torch.utils.data import DataLoader 9 | import time 10 | 11 | ADTSHL = True 12 | 13 | class AutoencoderLayer(torch.nn.Module): 14 | 15 | def __init__(self, input_dim: int, output_dim: int, hidden_dims: Sequence[int]): 16 | super().__init__() 17 | self._hidden = torch.nn.Sequential() 18 | last_dim = input_dim 19 | 20 | 21 | # adtshl 22 | if ADTSHL: 23 | for i, hidden_dim in enumerate(hidden_dims): 24 | self._hidden.add_module(f'hidden_{i}', torch.nn.Conv1d(last_dim, hidden_dim, kernel_size=7, stride=1, padding=3)) 25 | self._hidden.add_module(f'relu_{i}', torch.nn.ReLU()) 26 | last_dim = hidden_dim 27 | self._mean = torch.nn.Conv1d(last_dim, output_dim, kernel_size=7, stride=1, padding=3) 28 | self._std = torch.nn.Sequential( 29 | torch.nn.Conv1d(last_dim, output_dim, kernel_size=7, stride=1, padding=3), 30 | torch.nn.Softplus(), 31 | ) 32 | else: 33 | # naive bagel 34 | for i, hidden_dim in enumerate(hidden_dims): 35 | self._hidden.add_module(f'hidden_{i}', torch.nn.Linear(last_dim, hidden_dim)) 36 | self._hidden.add_module(f'relu_{i}', torch.nn.ReLU()) 37 | last_dim = hidden_dim 38 | self._mean = torch.nn.Linear(last_dim, output_dim) 39 | self._std = torch.nn.Sequential( 40 | torch.nn.Linear(last_dim, output_dim), 41 | torch.nn.Softplus(), 42 | ) 43 | 44 | def forward(self, x: torch.Tensor): 45 | if ADTSHL: 46 | shape_zero_squeeze = False 47 | x = x.unsqueeze(dim=-1) 48 | if x.shape[0] == 1: 49 | 
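                # Conv1d expects a 3-D (batch, channels, length) input; the unsqueeze above
                # added a length-1 axis, so a leading size-1 axis (e.g. a sampling dimension
                # of 1) is dropped here and restored after the mean/std heads.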
shape_zero_squeeze = True 50 | x = x.squeeze(dim=0) 51 | x = self._hidden(x) 52 | mean = self._mean(x) 53 | std = self._std(x) + 1e-6 54 | if ADTSHL: 55 | mean = mean.squeeze(dim=-1) 56 | std = std.squeeze(dim=-1) 57 | if shape_zero_squeeze: 58 | mean = mean.unsqueeze(dim=0) 59 | std = std.unsqueeze(dim=0) 60 | return mean, std 61 | 62 | def save(self, path: str, mask: Sequence, net: str): 63 | for idx in range(0, len(self._hidden), 2): 64 | if mask[idx // 2] == 1: 65 | torch.save(self._hidden[idx].state_dict(), os.path.join(path, f'{net}-hidden-{idx // 2}.pt')) 66 | if mask[-1] == 1: 67 | torch.save(self._mean.state_dict(), os.path.join(path, f'{net}-mean.pt')) 68 | torch.save(self._std.state_dict(), os.path.join(path, f'{net}-std.pt')) 69 | 70 | def load(self, path: str, mask: Sequence, net: str): 71 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 72 | for idx in range(0, len(self._hidden), 2): 73 | if mask[idx // 2] == 1: 74 | self._hidden[idx].load_state_dict(torch.load(os.path.join(path, f'{net}-hidden-{idx // 2}.pt'), map_location=device)) 75 | if mask[-1] == 1: 76 | self._mean.load_state_dict(torch.load(os.path.join(path, f'{net}-mean.pt'), map_location=device)) 77 | self._std.load_state_dict(torch.load(os.path.join(path, f'{net}-std.pt'), map_location=device)) 78 | 79 | def freeze(self, mask: Sequence): 80 | for idx in range(0, len(self._hidden), 2): 81 | if mask[idx // 2] == 1: 82 | for p in self._hidden[idx].parameters(): 83 | p.requires_grad = False 84 | if mask[-1] == 1: 85 | for p in self._mean.parameters(): 86 | p.requires_grad = False 87 | for p in self._std.parameters(): 88 | p.requires_grad = False 89 | 90 | def unfreeze(self): 91 | for p in self.parameters(): 92 | p.requires_grad = True 93 | 94 | 95 | class ConditionalVariationalAutoencoder(torch.nn.Module): 96 | 97 | def __init__(self, encoder: AutoencoderLayer, decoder: AutoencoderLayer, device: str): 98 | super().__init__() 99 | self._encoder = encoder 100 | self._decoder = decoder 101 | self._device = device 102 | 103 | def forward(self, inputs, **kwargs): 104 | x, y = tuple(inputs) 105 | n_samples = kwargs.get('n_samples', 1) 106 | concatted = torch.cat([x, y], dim=-1) 107 | z_mean, z_std = self._encoder(concatted) 108 | q_zx = torch.distributions.Normal(z_mean, z_std) 109 | p_z = torch.distributions.Normal( 110 | torch.zeros(z_mean.size()).to(self._device), 111 | torch.ones(z_std.size()).to(self._device) 112 | ) 113 | z = p_z.sample((n_samples,)) * torch.unsqueeze(z_std, 0) + torch.unsqueeze(z_mean, 0) 114 | y = y.expand(n_samples, -1, -1) 115 | concatted = torch.cat([z, y], dim=-1) 116 | x_mean, x_std = self._decoder(concatted) 117 | p_xz = torch.distributions.Normal(x_mean, x_std) 118 | return q_zx, p_xz, z 119 | 120 | def save_partial(self, path: str, name: str, mask: Sequence): 121 | path = os.path.join(path, name) 122 | if not os.path.exists(path): 123 | os.makedirs(path, exist_ok=True) 124 | self._encoder.save(path, mask=mask[0], net='encoder') 125 | self._decoder.save(path, mask=mask[1], net='decoder') 126 | 127 | def load_partial(self, path: str, name: str, mask: Sequence): 128 | path = os.path.join(path, name) 129 | if not os.path.exists(path): 130 | os.makedirs(path, exist_ok=True) 131 | self._encoder.load(path, mask=mask[0], net='encoder') 132 | self._decoder.load(path, mask=mask[1], net='decoder') 133 | 134 | def freeze(self, mask: Sequence): 135 | if 0 in mask[0] or 0 in mask[1]: 136 | self._encoder.freeze(mask[0]) 137 | self._decoder.freeze(mask[1]) 138 | 139 | def unfreeze(self, mask: 
Sequence): 140 | if 0 in mask[0] or 0 in mask[1]: 141 | self._encoder.unfreeze() 142 | self._decoder.unfreeze() 143 | 144 | 145 | class AnomalyDetector: 146 | 147 | def __init__(self, 148 | window_size: int = 120, 149 | hidden_dims: Sequence = (100, 100), 150 | latent_dim: int = 8, 151 | learning_rate: float = 1e-3, 152 | dropout_rate: float = 0.1, 153 | device: Optional[str] = None): 154 | cudnn.benchmark = True 155 | if device is None: 156 | self._device = 'cuda' if torch.cuda.is_available() else 'cpu' 157 | else: 158 | self._device = device 159 | 160 | self._window_size = window_size 161 | self._hidden_dims = hidden_dims 162 | self._dropout_rate = dropout_rate 163 | cond_size = 60 + 24 + 7 164 | self._model = ConditionalVariationalAutoencoder( 165 | encoder=AutoencoderLayer( 166 | input_dim=window_size + cond_size, 167 | output_dim=latent_dim, 168 | hidden_dims=hidden_dims, 169 | ), 170 | decoder=AutoencoderLayer( 171 | input_dim=latent_dim + cond_size, 172 | output_dim=window_size, 173 | hidden_dims=list(reversed(hidden_dims)), 174 | ), 175 | device=self._device 176 | ).to(self._device) 177 | self._p_z = torch.distributions.Normal( 178 | torch.zeros(latent_dim).to(self._device), 179 | torch.ones(latent_dim).to(self._device) 180 | ) 181 | self._optimizer = torch.optim.Adam(self._model.parameters(), lr=learning_rate, weight_decay=1e-3) 182 | self._lr_scheduler = torch.optim.lr_scheduler.StepLR(self._optimizer, step_size=10, gamma=0.75) 183 | 184 | @staticmethod 185 | def _m_elbo(x: torch.Tensor, 186 | z: torch.Tensor, 187 | normal: torch.Tensor, 188 | q_zx: torch.distributions.Normal, 189 | p_z: torch.distributions.Normal, 190 | p_xz: torch.distributions.Normal) -> torch.Tensor: 191 | x = torch.unsqueeze(x, 0) 192 | normal = torch.unsqueeze(normal, 0) 193 | log_p_xz = p_xz.log_prob(x) 194 | log_q_zx = torch.sum(q_zx.log_prob(z), -1) 195 | log_p_z = torch.sum(p_z.log_prob(z), -1) 196 | ratio = (torch.sum(normal, -1) / float(normal.size()[-1])) 197 | return torch.mean(torch.sum(log_p_xz * normal, -1) + log_p_z * ratio - log_q_zx) 198 | 199 | def _missing_imputation(self, 200 | x: torch.Tensor, 201 | y: torch.Tensor, 202 | normal: torch.Tensor, 203 | max_iter: int = 10) -> torch.Tensor: 204 | with torch.no_grad(): 205 | for _ in range(max_iter): 206 | _, p_xz, _ = self._model([x, y]) 207 | x[normal == 0.] = p_xz.sample()[0][normal == 0.] 208 | return x 209 | 210 | def _train_step(self, x: torch.Tensor, y: torch.Tensor, normal: torch.Tensor) -> torch.Tensor: 211 | self._optimizer.zero_grad() 212 | y = torch.nn.Dropout(self._dropout_rate)(y) 213 | q_zx, p_xz, z = self._model([x, y]) 214 | loss = -self._m_elbo(x, z, normal, q_zx, self._p_z, p_xz) 215 | loss.backward() 216 | torch.nn.utils.clip_grad_norm_(self._model.parameters(), max_norm=10.) 
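        # Gradients are clipped to a maximum L2 norm of 10 before the update to keep the
        # CVAE training stable.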
217 | self._optimizer.step() 218 | return loss 219 | 220 | def _validation_step(self, x: torch.Tensor, y: torch.Tensor, normal: torch.Tensor) -> torch.Tensor: 221 | q_zx, p_xz, z = self._model([x, y]) 222 | return -self._m_elbo(x, z, normal, q_zx, self._p_z, p_xz) 223 | 224 | def _test_step(self, x: torch.Tensor, y: torch.Tensor, normal: torch.Tensor) -> Tuple[torch.Tensor, np.ndarray]: 225 | x = self._missing_imputation(x, y, normal) 226 | q_zx, p_xz, z = self._model([x, y], n_samples=1 if ADTSHL else 128) 227 | test_loss = -self._m_elbo(x, z, normal, q_zx, self._p_z, p_xz) 228 | log_p_xz = p_xz.log_prob(x) 229 | return test_loss, log_p_xz 230 | 231 | def fit(self, 232 | kpi: 'at.transfer.data.KPI', 233 | epochs: int, 234 | validation_kpi: Optional['at.transfer.data.KPI'] = None, 235 | batch_size: int = 256, 236 | verbose: int = 1) -> Dict: 237 | dataset = at.transfer.data.KPIDataset(kpi, window_size=self._window_size, missing_injection_rate=0.01) 238 | dataset = DataLoader(dataset.to_torch(self._device), batch_size=batch_size, shuffle=True, drop_last=True) 239 | validation_dataset = None 240 | if validation_kpi is not None: 241 | validation_dataset = at.transfer.data.KPIDataset(validation_kpi, window_size=self._window_size) 242 | validation_dataset = DataLoader(validation_dataset.to_torch(self._device), 243 | batch_size=batch_size, 244 | shuffle=True 245 | ) 246 | 247 | start = time.time() 248 | ts = [] 249 | losses = [] 250 | val_losses = [] 251 | history = {} 252 | progbar = None 253 | if verbose == 1: 254 | print('Training Epochs') 255 | progbar = at.utils.ProgBar(epochs, interval=0.5, stateful_metrics=['loss', 'val_loss'], unit_name='epoch') 256 | 257 | for epoch in range(epochs): 258 | epoch_losses = [] 259 | epoch_val_losses = [] 260 | epoch_val_loss = np.nan 261 | 262 | if verbose == 2: 263 | print(f'Training Epoch {epoch + 1}/{epochs}') 264 | progbar = at.utils.ProgBar( 265 | target=len(dataset) + (0 if validation_kpi is None else len(validation_dataset)), 266 | interval=0.5 267 | ) 268 | self._model.train() 269 | for batch in dataset: 270 | loss = self._train_step(*batch) 271 | epoch_losses.append(loss) 272 | if verbose == 2: 273 | progbar.add(1, values=[('loss', loss.detach().cpu().numpy())]) 274 | epoch_loss = torch.mean(torch.as_tensor(epoch_losses)).numpy() 275 | ts.append(time.time()-start) 276 | losses.append(epoch_loss) 277 | 278 | if validation_kpi is not None: 279 | with torch.no_grad(): 280 | self._model.eval() 281 | for batch in validation_dataset: 282 | val_loss = self._validation_step(*batch) 283 | epoch_val_losses.append(val_loss) 284 | if verbose == 2: 285 | progbar.add(1, values=[('val_loss', val_loss.cpu().numpy())]) 286 | epoch_val_loss = torch.mean(torch.as_tensor(epoch_val_losses)).numpy() 287 | val_losses.append(epoch_val_loss) 288 | 289 | if verbose == 1: 290 | values = [] 291 | if not np.isnan(epoch_loss): 292 | values.append(('loss', epoch_loss)) 293 | if not np.isnan(epoch_val_loss): 294 | values.append(('val_loss', epoch_val_loss)) 295 | progbar.add(1, values=values) 296 | 297 | self._lr_scheduler.step() 298 | 299 | history['loss'] = losses 300 | history['ts'] = ts 301 | if len(val_losses) > 0: 302 | history['val_loss'] = val_losses 303 | return history 304 | 305 | def predict(self, kpi: 'at.transfer.data.KPI', batch_size: int = 256, verbose: int = 1) -> np.ndarray: 306 | kpi = kpi.no_labels() 307 | dataset = at.transfer.data.KPIDataset(kpi, window_size=self._window_size) 308 | dataset = DataLoader(dataset.to_torch(self._device), batch_size=batch_size) 
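        # Each sliding window yields one score: the negative mean reconstruction
        # log-likelihood of its last point. The first window_size - 1 positions of the
        # output are padded with the minimum score so the result matches the input length.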
309 | progbar = None 310 | if verbose == 1: 311 | print('Testing Epoch') 312 | progbar = at.utils.ProgBar(len(dataset), interval=0.5) 313 | anomaly_scores = [] 314 | with torch.no_grad(): 315 | self._model.eval() 316 | for batch in dataset: 317 | test_loss, log_p_xz = self._test_step(*batch) 318 | anomaly_scores.extend(-torch.mean(log_p_xz[:, :, -1], dim=0).cpu()) 319 | if verbose == 1: 320 | progbar.add(1, values=[('test_loss', test_loss.cpu().numpy())]) 321 | anomaly_scores = np.asarray(anomaly_scores, dtype=np.float32) 322 | return np.concatenate([np.ones(self._window_size - 1) * np.min(anomaly_scores), anomaly_scores]) 323 | 324 | def save(self, path: str, name: str): 325 | mask = [[1] * (len(self._hidden_dims) + 1)] * 2 326 | self.save_partial(path, name, mask) 327 | 328 | def load(self, path: str, name: str): 329 | mask = [[1] * (len(self._hidden_dims) + 1)] * 2 330 | self.load_partial(path, name, mask) 331 | 332 | def save_partial(self, path: str, name: str, mask: Sequence): 333 | self._model.save_partial(path, name, mask) 334 | 335 | def load_partial(self, path: str, name: str, mask: Sequence): 336 | self._model.load_partial(path, name, mask) 337 | 338 | def freeze(self, mask: Sequence): 339 | self._model.freeze(mask) 340 | 341 | def unfreeze(self, mask: Sequence): 342 | self._model.unfreeze(mask) 343 | 344 | 345 | def sbd_(a: 'at.transfer.data.KPI', b: 'at.transfer.data.KPI') -> float: 346 | l2_a = np.linalg.norm(a.values) 347 | l2_b = np.linalg.norm(b.values) 348 | cross_correlation = np.convolve(a.values, b.values, mode='full') 349 | return 1 - np.max(cross_correlation) / (l2_a * l2_b) 350 | 351 | 352 | def find_optimal_mask(sbd: float, less_mask: Sequence, greater_mask: Sequence, threshold: float = 0.3) -> Sequence: 353 | if sbd <= threshold: 354 | return less_mask 355 | return greater_mask 356 | -------------------------------------------------------------------------------- /anomalytransfer/transfer/spot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Dec 12 10:08:16 2016 5 | 6 | @author: Alban Siffer 7 | @company: Amossys 8 | @license: GNU GPLv3 9 | """ 10 | 11 | from scipy.optimize import minimize 12 | from math import log,floor 13 | import numpy as np 14 | import pandas as pd 15 | import matplotlib.pyplot as plt 16 | import tqdm 17 | 18 | # colors for plot 19 | deep_saffron = '#FF9933' 20 | air_force_blue = '#5D8AA8' 21 | 22 | 23 | """ 24 | ================================= MAIN CLASS ================================== 25 | """ 26 | 27 | class SPOT: 28 | """ 29 | This class allows to run SPOT algorithm on univariate dataset (upper-bound) 30 | 31 | Attributes 32 | ---------- 33 | proba : float 34 | Detection level (risk), chosen by the user 35 | 36 | extreme_quantile : float 37 | current threshold (bound between normal and abnormal events) 38 | 39 | data : numpy.array 40 | stream 41 | 42 | init_data : numpy.array 43 | initial batch of observations (for the calibration/initialization step) 44 | 45 | init_threshold : float 46 | initial threshold computed during the calibration step 47 | 48 | peaks : numpy.array 49 | array of peaks (excesses above the initial threshold) 50 | 51 | n : int 52 | number of observed values 53 | 54 | Nt : int 55 | number of observed peaks 56 | """ 57 | 58 | def __init__(self, q = 1e-4): 59 | """ 60 | Constructor 61 | 62 | Parameters 63 | ---------- 64 | q 65 | Detection level (risk) 66 | 67 | Returns 68 | ---------- 69 | SPOT object 
70 | """ 71 | self.proba = q 72 | self.extreme_quantile = None 73 | self.data = None 74 | self.init_data = None 75 | self.init_threshold = None 76 | self.peaks = None 77 | self.n = 0 78 | self.Nt = 0 79 | 80 | def __str__(self): 81 | s = '' 82 | s += 'Streaming Peaks-Over-Threshold Object\n' 83 | s += 'Detection level q = %s\n' % self.proba 84 | if self.data is not None: 85 | s += 'Data imported : Yes\n' 86 | s += '\t initialization : %s values\n' % self.init_data.size 87 | s += '\t stream : %s values\n' % self.data.size 88 | else: 89 | s += 'Data imported : No\n' 90 | return s 91 | 92 | if self.n == 0: 93 | s += 'Algorithm initialized : No\n' 94 | else: 95 | s += 'Algorithm initialized : Yes\n' 96 | s += '\t initial threshold : %s\n' % self.init_threshold 97 | 98 | r = self.n-self.init_data.size 99 | if r > 0: 100 | s += 'Algorithm run : Yes\n' 101 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 102 | else: 103 | s += '\t number of peaks : %s\n' % self.Nt 104 | s += '\t extreme quantile : %s\n' % self.extreme_quantile 105 | s += 'Algorithm run : No\n' 106 | return s 107 | 108 | 109 | def fit(self,init_data,data): 110 | """ 111 | Import data to SPOT object 112 | 113 | Parameters 114 | ---------- 115 | init_data : list, numpy.array or pandas.Series 116 | initial batch to calibrate the algorithm 117 | 118 | data : numpy.array 119 | data for the run (list, np.array or pd.series) 120 | 121 | """ 122 | if isinstance(data,list): 123 | self.data = np.array(data) 124 | elif isinstance(data,np.ndarray): 125 | self.data = data 126 | elif isinstance(data,pd.Series): 127 | self.data = data.values 128 | else: 129 | print('This data format (%s) is not supported' % type(data)) 130 | return 131 | 132 | if isinstance(init_data,list): 133 | self.init_data = np.array(init_data) 134 | elif isinstance(init_data,np.ndarray): 135 | self.init_data = init_data 136 | elif isinstance(init_data,pd.Series): 137 | self.init_data = init_data.values 138 | elif isinstance(init_data,int): 139 | self.init_data = self.data[:init_data] 140 | self.data = self.data[init_data:] 141 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 142 | r = int(init_data*data.size) 143 | self.init_data = self.data[:r] 144 | self.data = self.data[r:] 145 | else: 146 | print('The initial data cannot be set') 147 | return 148 | 149 | def add(self,data): 150 | """ 151 | This function allows to append data to the already fitted data 152 | 153 | Parameters 154 | ---------- 155 | data : list, numpy.array, pandas.Series 156 | data to append 157 | """ 158 | if isinstance(data,list): 159 | data = np.array(data) 160 | elif isinstance(data,np.ndarray): 161 | data = data 162 | elif isinstance(data,pd.Series): 163 | data = data.values 164 | else: 165 | print('This data format (%s) is not supported' % type(data)) 166 | return 167 | 168 | self.data = np.append(self.data,data) 169 | return 170 | 171 | def initialize(self, level = 0.98, verbose = True): 172 | """ 173 | Run the calibration (initialization) step 174 | 175 | Parameters 176 | ---------- 177 | level : float 178 | (default 0.98) Probability associated with the initial threshold t 179 | verbose : bool 180 | (default = True) If True, gives details about the batch initialization 181 | """ 182 | level = level-floor(level) 183 | 184 | n_init = self.init_data.size 185 | 186 | S = np.sort(self.init_data) # we sort X to get the empirical quantile 187 | self.init_threshold = S[int(level*n_init)] # t is fixed for the whole algorithm 188 | 189 | # initial peaks 190 | 
self.peaks = self.init_data[self.init_data>self.init_threshold]-self.init_threshold 191 | self.Nt = self.peaks.size 192 | self.n = n_init 193 | 194 | if verbose: 195 | print('Initial threshold : %s' % self.init_threshold) 196 | print('Number of peaks : %s' % self.Nt) 197 | print('Grimshaw maximum log-likelihood estimation ... ', end = '') 198 | 199 | g,s,l = self._grimshaw() 200 | self.extreme_quantile = self._quantile(g,s) 201 | 202 | if verbose: 203 | print('[done]') 204 | print('\t'+chr(0x03B3) + ' = ' + str(g)) 205 | print('\t'+chr(0x03C3) + ' = ' + str(s)) 206 | print('\tL = ' + str(l)) 207 | print('Extreme quantile (probability = %s): %s' % (self.proba,self.extreme_quantile)) 208 | 209 | return 210 | 211 | 212 | 213 | 214 | def _rootsFinder(fun,jac,bounds,npoints,method): 215 | """ 216 | Find possible roots of a scalar function 217 | 218 | Parameters 219 | ---------- 220 | fun : function 221 | scalar function 222 | jac : function 223 | first order derivative of the function 224 | bounds : tuple 225 | (min,max) interval for the roots search 226 | npoints : int 227 | maximum number of roots to output 228 | method : str 229 | 'regular' : regular sample of the search interval, 'random' : uniform (distribution) sample of the search interval 230 | 231 | Returns 232 | ---------- 233 | numpy.array 234 | possible roots of the function 235 | """ 236 | if method == 'regular': 237 | step = (bounds[1]-bounds[0])/(npoints+1) 238 | X0 = np.arange(bounds[0]+step,bounds[1],step) 239 | elif method == 'random': 240 | X0 = np.random.uniform(bounds[0],bounds[1],npoints) 241 | 242 | def objFun(X,f,jac): 243 | g = 0 244 | j = np.zeros(X.shape) 245 | i = 0 246 | for x in X: 247 | fx = f(x) 248 | g = g+fx**2 249 | j[i] = 2*fx*jac(x) 250 | i = i+1 251 | return g,j 252 | 253 | opt = minimize(lambda X:objFun(X,fun,jac), X0, 254 | method='L-BFGS-B', 255 | jac=True, bounds=[bounds]*len(X0)) 256 | 257 | X = opt.x 258 | np.round(X,decimals = 5) 259 | return np.unique(X) 260 | 261 | 262 | def _log_likelihood(Y,gamma,sigma): 263 | """ 264 | Compute the log-likelihood for the Generalized Pareto Distribution (μ=0) 265 | 266 | Parameters 267 | ---------- 268 | Y : numpy.array 269 | observations 270 | gamma : float 271 | GPD index parameter 272 | sigma : float 273 | GPD scale parameter (>0) 274 | 275 | Returns 276 | ---------- 277 | float 278 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 279 | """ 280 | n = Y.size 281 | if gamma != 0: 282 | tau = gamma/sigma 283 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 284 | else: 285 | L = n * ( 1 + log(Y.mean()) ) 286 | return L 287 | 288 | 289 | def _grimshaw(self,epsilon = 1e-8, n_points = 10): 290 | """ 291 | Compute the GPD parameters estimation with the Grimshaw's trick 292 | 293 | Parameters 294 | ---------- 295 | epsilon : float 296 | numerical parameter to perform (default : 1e-8) 297 | n_points : int 298 | maximum number of candidates for maximum likelihood (default : 10) 299 | 300 | Returns 301 | ---------- 302 | gamma_best,sigma_best,ll_best 303 | gamma estimates, sigma estimates and corresponding log-likelihood 304 | """ 305 | def u(s): 306 | return 1 + np.log(s).mean() 307 | 308 | def v(s): 309 | return np.mean(1/s) 310 | 311 | def w(Y,t): 312 | s = 1+t*Y 313 | us = u(s) 314 | vs = v(s) 315 | return us*vs-1 316 | 317 | def jac_w(Y,t): 318 | s = 1+t*Y 319 | us = u(s) 320 | vs = v(s) 321 | jac_us = (1/t)*(1-vs) 322 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 323 | return us*jac_vs+vs*jac_us 324 | 325 | 326 | Ym = self.peaks.min() 
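        # Grimshaw's trick reduces the two-parameter GPD likelihood to scalar root finding
        # in t; candidate roots are searched on two intervals derived from the min, max and
        # mean of the peaks, and the candidate with the best log-likelihood is kept.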
327 | YM = self.peaks.max() 328 | Ymean = self.peaks.mean() 329 | 330 | 331 | a = -1/YM 332 | if abs(a)<2*epsilon: 333 | epsilon = abs(a)/n_points 334 | 335 | a = a + epsilon 336 | b = 2*(Ymean-Ym)/(Ymean*Ym) 337 | c = 2*(Ymean-Ym)/(Ym**2) 338 | 339 | # We look for possible roots 340 | left_zeros = SPOT._rootsFinder(lambda t: w(self.peaks,t), 341 | lambda t: jac_w(self.peaks,t), 342 | (a+epsilon,-epsilon), 343 | n_points,'regular') 344 | 345 | right_zeros = SPOT._rootsFinder(lambda t: w(self.peaks,t), 346 | lambda t: jac_w(self.peaks,t), 347 | (b,c), 348 | n_points,'regular') 349 | 350 | # all the possible roots 351 | zeros = np.concatenate((left_zeros,right_zeros)) 352 | 353 | # 0 is always a solution so we initialize with it 354 | gamma_best = 0 355 | sigma_best = Ymean 356 | ll_best = SPOT._log_likelihood(self.peaks,gamma_best,sigma_best) 357 | 358 | # we look for better candidates 359 | for z in zeros: 360 | gamma = u(1+z*self.peaks)-1 361 | sigma = gamma/z 362 | ll = SPOT._log_likelihood(self.peaks,gamma,sigma) 363 | if ll>ll_best: 364 | gamma_best = gamma 365 | sigma_best = sigma 366 | ll_best = ll 367 | 368 | return gamma_best,sigma_best,ll_best 369 | 370 | 371 | 372 | def _quantile(self,gamma,sigma): 373 | """ 374 | Compute the quantile at level 1-q 375 | 376 | Parameters 377 | ---------- 378 | gamma : float 379 | GPD parameter 380 | sigma : float 381 | GPD parameter 382 | 383 | Returns 384 | ---------- 385 | float 386 | quantile at level 1-q for the GPD(γ,σ,μ=0) 387 | """ 388 | r = self.n * self.proba / self.Nt 389 | if gamma != 0: 390 | return self.init_threshold + (sigma/gamma)*(pow(r,-gamma)-1) 391 | else: 392 | return self.init_threshold - sigma*log(r) 393 | 394 | 395 | def run(self, with_alarm = True): 396 | """ 397 | Run SPOT on the stream 398 | 399 | Parameters 400 | ---------- 401 | with_alarm : bool 402 | (default = True) If False, SPOT will adapt the threshold assuming \ 403 | there is no abnormal values 404 | 405 | 406 | Returns 407 | ---------- 408 | dict 409 | keys : 'thresholds' and 'alarms' 410 | 411 | 'thresholds' contains the extreme quantiles and 'alarms' contains \ 412 | the indexes of the values which have triggered alarms 413 | 414 | """ 415 | if (self.n>self.init_data.size): 416 | print('Warning : the algorithm seems to have already been run, you \ 417 | should initialize before running again') 418 | return {} 419 | 420 | # list of the thresholds 421 | th = [] 422 | alarm = [] 423 | # Loop over the stream 424 | for i in tqdm.tqdm(range(self.data.size)): 425 | 426 | # If the observed value exceeds the current threshold (alarm case) 427 | if self.data[i]>self.extreme_quantile: 428 | # if we want to alarm, we put it in the alarm list 429 | if with_alarm: 430 | alarm.append(i) 431 | # otherwise we add it in the peaks 432 | else: 433 | self.peaks = np.append(self.peaks,self.data[i]-self.init_threshold) 434 | self.Nt += 1 435 | self.n += 1 436 | # and we update the thresholds 437 | 438 | g,s,l = self._grimshaw() 439 | self.extreme_quantile = self._quantile(g,s) 440 | 441 | # case where the value exceeds the initial threshold but not the alarm ones 442 | elif self.data[i]>self.init_threshold: 443 | # we add it in the peaks 444 | self.peaks = np.append(self.peaks,self.data[i]-self.init_threshold) 445 | self.Nt += 1 446 | self.n += 1 447 | # and we update the thresholds 448 | 449 | g,s,l = self._grimshaw() 450 | self.extreme_quantile = self._quantile(g,s) 451 | else: 452 | self.n += 1 453 | 454 | 455 | th.append(self.extreme_quantile) # thresholds record 456 | 457 | 
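# Returned record: 'thresholds' holds the extreme quantile in force at each point of the
# stream, 'alarms' the indexes of the observations that exceeded it (empty when
# with_alarm is False, since those points are absorbed into the peaks instead).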
return {'thresholds' : th, 'alarms': alarm} 458 | 459 | 460 | def plot(self,run_results,with_alarm = True): 461 | """ 462 | Plot the results of given by the run 463 | 464 | Parameters 465 | ---------- 466 | run_results : dict 467 | results given by the 'run' method 468 | with_alarm : bool 469 | (default = True) If True, alarms are plotted. 470 | 471 | 472 | Returns 473 | ---------- 474 | list 475 | list of the plots 476 | 477 | """ 478 | x = range(self.data.size) 479 | K = run_results.keys() 480 | 481 | ts_fig, = plt.plot(x,self.data,color=air_force_blue) 482 | fig = [ts_fig] 483 | 484 | if 'thresholds' in K: 485 | th = run_results['thresholds'] 486 | th_fig, = plt.plot(x,th,color=deep_saffron,lw=2,ls='dashed') 487 | fig.append(th_fig) 488 | 489 | if with_alarm and ('alarms' in K): 490 | alarm = run_results['alarms'] 491 | al_fig = plt.scatter(alarm,self.data[alarm],color='red') 492 | fig.append(al_fig) 493 | 494 | plt.xlim((0,self.data.size)) 495 | 496 | 497 | return fig 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | """ 509 | ============================ UPPER & LOWER BOUNDS ============================= 510 | """ 511 | 512 | 513 | 514 | 515 | class biSPOT: 516 | """ 517 | This class allows to run biSPOT algorithm on univariate dataset (upper and lower bounds) 518 | 519 | Attributes 520 | ---------- 521 | proba : float 522 | Detection level (risk), chosen by the user 523 | 524 | extreme_quantile : float 525 | current threshold (bound between normal and abnormal events) 526 | 527 | data : numpy.array 528 | stream 529 | 530 | init_data : numpy.array 531 | initial batch of observations (for the calibration/initialization step) 532 | 533 | init_threshold : float 534 | initial threshold computed during the calibration step 535 | 536 | peaks : numpy.array 537 | array of peaks (excesses above the initial threshold) 538 | 539 | n : int 540 | number of observed values 541 | 542 | Nt : int 543 | number of observed peaks 544 | """ 545 | def __init__(self, q = 1e-4): 546 | """ 547 | Constructor 548 | 549 | Parameters 550 | ---------- 551 | q 552 | Detection level (risk) 553 | 554 | Returns 555 | ---------- 556 | biSPOT object 557 | """ 558 | self.proba = q 559 | self.data = None 560 | self.init_data = None 561 | self.n = 0 562 | nonedict = {'up':None,'down':None} 563 | 564 | self.extreme_quantile = dict.copy(nonedict) 565 | self.init_threshold = dict.copy(nonedict) 566 | self.peaks = dict.copy(nonedict) 567 | self.gamma = dict.copy(nonedict) 568 | self.sigma = dict.copy(nonedict) 569 | self.Nt = {'up':0,'down':0} 570 | 571 | 572 | def __str__(self): 573 | s = '' 574 | s += 'Streaming Peaks-Over-Threshold Object\n' 575 | s += 'Detection level q = %s\n' % self.proba 576 | if self.data is not None: 577 | s += 'Data imported : Yes\n' 578 | s += '\t initialization : %s values\n' % self.init_data.size 579 | s += '\t stream : %s values\n' % self.data.size 580 | else: 581 | s += 'Data imported : No\n' 582 | return s 583 | 584 | if self.n == 0: 585 | s += 'Algorithm initialized : No\n' 586 | else: 587 | s += 'Algorithm initialized : Yes\n' 588 | s += '\t initial threshold : %s\n' % self.init_threshold 589 | 590 | r = self.n-self.init_data.size 591 | if r > 0: 592 | s += 'Algorithm run : Yes\n' 593 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 594 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 595 | else: 596 | s += '\t number of peaks : %s\n' % self.Nt 597 | s += '\t upper extreme quantile : %s\n' % 
self.extreme_quantile['up'] 598 | s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down'] 599 | s += 'Algorithm run : No\n' 600 | return s 601 | 602 | 603 | def fit(self,init_data,data): 604 | """ 605 | Import data to biSPOT object 606 | 607 | Parameters 608 | ---------- 609 | init_data : list, numpy.array or pandas.Series 610 | initial batch to calibrate the algorithm () 611 | 612 | data : numpy.array 613 | data for the run (list, np.array or pd.series) 614 | 615 | """ 616 | if isinstance(data,list): 617 | self.data = np.array(data) 618 | elif isinstance(data,np.ndarray): 619 | self.data = data 620 | elif isinstance(data,pd.Series): 621 | self.data = data.values 622 | else: 623 | print('This data format (%s) is not supported' % type(data)) 624 | return 625 | 626 | if isinstance(init_data,list): 627 | self.init_data = np.array(init_data) 628 | elif isinstance(init_data,np.ndarray): 629 | self.init_data = init_data 630 | elif isinstance(init_data,pd.Series): 631 | self.init_data = init_data.values 632 | elif isinstance(init_data,int): 633 | self.init_data = self.data[:init_data] 634 | self.data = self.data[init_data:] 635 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 636 | r = int(init_data*data.size) 637 | self.init_data = self.data[:r] 638 | self.data = self.data[r:] 639 | else: 640 | print('The initial data cannot be set') 641 | return 642 | 643 | def add(self,data): 644 | """ 645 | This function allows to append data to the already fitted data 646 | 647 | Parameters 648 | ---------- 649 | data : list, numpy.array, pandas.Series 650 | data to append 651 | """ 652 | if isinstance(data,list): 653 | data = np.array(data) 654 | elif isinstance(data,np.ndarray): 655 | data = data 656 | elif isinstance(data,pd.Series): 657 | data = data.values 658 | else: 659 | print('This data format (%s) is not supported' % type(data)) 660 | return 661 | 662 | self.data = np.append(self.data,data) 663 | return 664 | 665 | def initialize(self, verbose = True): 666 | """ 667 | Run the calibration (initialization) step 668 | 669 | Parameters 670 | ---------- 671 | verbose : bool 672 | (default = True) If True, gives details about the batch initialization 673 | """ 674 | n_init = self.init_data.size 675 | 676 | S = np.sort(self.init_data) # we sort X to get the empirical quantile 677 | self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm 678 | self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm 679 | 680 | # initial peaks 681 | self.peaks['up'] = self.init_data[self.init_data>self.init_threshold['up']]-self.init_threshold['up'] 682 | self.peaks['down'] = -(self.init_data[self.init_data0) 774 | 775 | Returns 776 | ---------- 777 | float 778 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 779 | """ 780 | n = Y.size 781 | if gamma != 0: 782 | tau = gamma/sigma 783 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 784 | else: 785 | L = n * ( 1 + log(Y.mean()) ) 786 | return L 787 | 788 | 789 | def _grimshaw(self,side,epsilon = 1e-8, n_points = 10): 790 | """ 791 | Compute the GPD parameters estimation with the Grimshaw's trick 792 | 793 | Parameters 794 | ---------- 795 | epsilon : float 796 | numerical parameter to perform (default : 1e-8) 797 | n_points : int 798 | maximum number of candidates for maximum likelihood (default : 10) 799 | 800 | Returns 801 | ---------- 802 | gamma_best,sigma_best,ll_best 803 | gamma estimates, sigma estimates and corresponding log-likelihood 
804 | """ 805 | def u(s): 806 | return 1 + np.log(s).mean() 807 | 808 | def v(s): 809 | return np.mean(1/s) 810 | 811 | def w(Y,t): 812 | s = 1+t*Y 813 | us = u(s) 814 | vs = v(s) 815 | return us*vs-1 816 | 817 | def jac_w(Y,t): 818 | s = 1+t*Y 819 | us = u(s) 820 | vs = v(s) 821 | jac_us = (1/t)*(1-vs) 822 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 823 | return us*jac_vs+vs*jac_us 824 | 825 | 826 | Ym = self.peaks[side].min() 827 | YM = self.peaks[side].max() 828 | Ymean = self.peaks[side].mean() 829 | 830 | 831 | a = -1/YM 832 | if abs(a)<2*epsilon: 833 | epsilon = abs(a)/n_points 834 | 835 | a = a + epsilon 836 | b = 2*(Ymean-Ym)/(Ymean*Ym) 837 | c = 2*(Ymean-Ym)/(Ym**2) 838 | 839 | # We look for possible roots 840 | left_zeros = biSPOT._rootsFinder(lambda t: w(self.peaks[side],t), 841 | lambda t: jac_w(self.peaks[side],t), 842 | (a+epsilon,-epsilon), 843 | n_points,'regular') 844 | 845 | right_zeros = biSPOT._rootsFinder(lambda t: w(self.peaks[side],t), 846 | lambda t: jac_w(self.peaks[side],t), 847 | (b,c), 848 | n_points,'regular') 849 | 850 | # all the possible roots 851 | zeros = np.concatenate((left_zeros,right_zeros)) 852 | 853 | # 0 is always a solution so we initialize with it 854 | gamma_best = 0 855 | sigma_best = Ymean 856 | ll_best = biSPOT._log_likelihood(self.peaks[side],gamma_best,sigma_best) 857 | 858 | # we look for better candidates 859 | for z in zeros: 860 | gamma = u(1+z*self.peaks[side])-1 861 | sigma = gamma/z 862 | ll = biSPOT._log_likelihood(self.peaks[side],gamma,sigma) 863 | if ll>ll_best: 864 | gamma_best = gamma 865 | sigma_best = sigma 866 | ll_best = ll 867 | 868 | return gamma_best,sigma_best,ll_best 869 | 870 | 871 | 872 | def _quantile(self,side,gamma,sigma): 873 | """ 874 | Compute the quantile at level 1-q for a given side 875 | 876 | Parameters 877 | ---------- 878 | side : str 879 | 'up' or 'down' 880 | gamma : float 881 | GPD parameter 882 | sigma : float 883 | GPD parameter 884 | 885 | Returns 886 | ---------- 887 | float 888 | quantile at level 1-q for the GPD(γ,σ,μ=0) 889 | """ 890 | if side == 'up': 891 | r = self.n * self.proba / self.Nt[side] 892 | if gamma != 0: 893 | return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1) 894 | else: 895 | return self.init_threshold['up'] - sigma*log(r) 896 | elif side == 'down': 897 | r = self.n * self.proba / self.Nt[side] 898 | if gamma != 0: 899 | return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1) 900 | else: 901 | return self.init_threshold['down'] + sigma*log(r) 902 | else: 903 | print('error : the side is not right') 904 | 905 | 906 | def run(self, with_alarm = True): 907 | """ 908 | Run biSPOT on the stream 909 | 910 | Parameters 911 | ---------- 912 | with_alarm : bool 913 | (default = True) If False, SPOT will adapt the threshold assuming \ 914 | there is no abnormal values 915 | 916 | 917 | Returns 918 | ---------- 919 | dict 920 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 921 | 922 | '***-thresholds' contains the extreme quantiles and 'alarms' contains \ 923 | the indexes of the values which have triggered alarms 924 | 925 | """ 926 | if (self.n>self.init_data.size): 927 | print('Warning : the algorithm seems to have already been run, you \ 928 | should initialize before running again') 929 | return {} 930 | 931 | # list of the thresholds 932 | thup = [] 933 | thdown = [] 934 | alarm = [] 935 | # Loop over the stream 936 | for i in tqdm.tqdm(range(self.data.size)): 937 | 938 | # If the observed value exceeds the current threshold (alarm case) 939 | if 
self.data[i]>self.extreme_quantile['up'] : 940 | # if we want to alarm, we put it in the alarm list 941 | if with_alarm: 942 | alarm.append(i) 943 | # otherwise we add it in the peaks 944 | else: 945 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 946 | self.Nt['up'] += 1 947 | self.n += 1 948 | # and we update the thresholds 949 | 950 | g,s,l = self._grimshaw('up') 951 | self.extreme_quantile['up'] = self._quantile('up',g,s) 952 | 953 | # case where the value exceeds the initial threshold but not the alarm ones 954 | elif self.data[i]>self.init_threshold['up']: 955 | # we add it in the peaks 956 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 957 | self.Nt['up'] += 1 958 | self.n += 1 959 | # and we update the thresholds 960 | 961 | g,s,l = self._grimshaw('up') 962 | self.extreme_quantile['up'] = self._quantile('up',g,s) 963 | 964 | elif self.data[i] 0: 1127 | s += 'Algorithm run : Yes\n' 1128 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 1129 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 1130 | else: 1131 | s += '\t number of peaks : %s\n' % self.Nt 1132 | s += '\t extreme quantile : %s\n' % self.extreme_quantile 1133 | s += 'Algorithm run : No\n' 1134 | return s 1135 | 1136 | 1137 | def fit(self,init_data,data): 1138 | """ 1139 | Import data to DSPOT object 1140 | 1141 | Parameters 1142 | ---------- 1143 | init_data : list, numpy.array or pandas.Series 1144 | initial batch to calibrate the algorithm 1145 | 1146 | data : numpy.array 1147 | data for the run (list, np.array or pd.series) 1148 | 1149 | """ 1150 | if isinstance(data,list): 1151 | self.data = np.array(data) 1152 | elif isinstance(data,np.ndarray): 1153 | self.data = data 1154 | elif isinstance(data,pd.Series): 1155 | self.data = data.values 1156 | else: 1157 | print('This data format (%s) is not supported' % type(data)) 1158 | return 1159 | 1160 | if isinstance(init_data,list): 1161 | self.init_data = np.array(init_data) 1162 | elif isinstance(init_data,np.ndarray): 1163 | self.init_data = init_data 1164 | elif isinstance(init_data,pd.Series): 1165 | self.init_data = init_data.values 1166 | elif isinstance(init_data,int): 1167 | self.init_data = self.data[:init_data] 1168 | self.data = self.data[init_data:] 1169 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 1170 | r = int(init_data*data.size) 1171 | self.init_data = self.data[:r] 1172 | self.data = self.data[r:] 1173 | else: 1174 | print('The initial data cannot be set') 1175 | return 1176 | 1177 | def add(self,data): 1178 | """ 1179 | This function allows to append data to the already fitted data 1180 | 1181 | Parameters 1182 | ---------- 1183 | data : list, numpy.array, pandas.Series 1184 | data to append 1185 | """ 1186 | if isinstance(data,list): 1187 | data = np.array(data) 1188 | elif isinstance(data,np.ndarray): 1189 | data = data 1190 | elif isinstance(data,pd.Series): 1191 | data = data.values 1192 | else: 1193 | print('This data format (%s) is not supported' % type(data)) 1194 | return 1195 | 1196 | self.data = np.append(self.data,data) 1197 | return 1198 | 1199 | def initialize(self, verbose = True): 1200 | """ 1201 | Run the calibration (initialization) step 1202 | 1203 | Parameters 1204 | ---------- 1205 | verbose : bool 1206 | (default = True) If True, gives details about the batch initialization 1207 | """ 1208 | n_init = self.init_data.size - self.depth 1209 | 1210 | M = 
backMean(self.init_data,self.depth) 1211 | T = self.init_data[self.depth:]-M[:-1] # new variable 1212 | 1213 | S = np.sort(T) # we sort X to get the empirical quantile 1214 | self.init_threshold = S[int(0.98*n_init)] # t is fixed for the whole algorithm 1215 | 1216 | # initial peaks 1217 | self.peaks = T[T>self.init_threshold]-self.init_threshold 1218 | self.Nt = self.peaks.size 1219 | self.n = n_init 1220 | 1221 | if verbose: 1222 | print('Initial threshold : %s' % self.init_threshold) 1223 | print('Number of peaks : %s' % self.Nt) 1224 | print('Grimshaw maximum log-likelihood estimation ... ', end = '') 1225 | 1226 | g,s,l = self._grimshaw() 1227 | self.extreme_quantile = self._quantile(g,s) 1228 | 1229 | if verbose: 1230 | print('[done]') 1231 | print('\t'+chr(0x03B3) + ' = ' + str(g)) 1232 | print('\t'+chr(0x03C3) + ' = ' + str(s)) 1233 | print('\tL = ' + str(l)) 1234 | print('Extreme quantile (probability = %s): %s' % (self.proba,self.extreme_quantile)) 1235 | 1236 | return 1237 | 1238 | 1239 | 1240 | 1241 | def _rootsFinder(fun,jac,bounds,npoints,method): 1242 | """ 1243 | Find possible roots of a scalar function 1244 | 1245 | Parameters 1246 | ---------- 1247 | fun : function 1248 | scalar function 1249 | jac : function 1250 | first order derivative of the function 1251 | bounds : tuple 1252 | (min,max) interval for the roots search 1253 | npoints : int 1254 | maximum number of roots to output 1255 | method : str 1256 | 'regular' : regular sample of the search interval, 'random' : uniform (distribution) sample of the search interval 1257 | 1258 | Returns 1259 | ---------- 1260 | numpy.array 1261 | possible roots of the function 1262 | """ 1263 | if method == 'regular': 1264 | step = (bounds[1]-bounds[0])/(npoints+1) 1265 | X0 = np.arange(bounds[0]+step,bounds[1],step) 1266 | elif method == 'random': 1267 | X0 = np.random.uniform(bounds[0],bounds[1],npoints) 1268 | 1269 | def objFun(X,f,jac): 1270 | g = 0 1271 | j = np.zeros(X.shape) 1272 | i = 0 1273 | for x in X: 1274 | fx = f(x) 1275 | g = g+fx**2 1276 | j[i] = 2*fx*jac(x) 1277 | i = i+1 1278 | return g,j 1279 | 1280 | opt = minimize(lambda X:objFun(X,fun,jac), X0, 1281 | method='L-BFGS-B', 1282 | jac=True, bounds=[bounds]*len(X0)) 1283 | 1284 | X = opt.x 1285 | np.round(X,decimals = 5) 1286 | return np.unique(X) 1287 | 1288 | 1289 | def _log_likelihood(Y,gamma,sigma): 1290 | """ 1291 | Compute the log-likelihood for the Generalized Pareto Distribution (μ=0) 1292 | 1293 | Parameters 1294 | ---------- 1295 | Y : numpy.array 1296 | observations 1297 | gamma : float 1298 | GPD index parameter 1299 | sigma : float 1300 | GPD scale parameter (>0) 1301 | 1302 | Returns 1303 | ---------- 1304 | float 1305 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 1306 | """ 1307 | n = Y.size 1308 | if gamma != 0: 1309 | tau = gamma/sigma 1310 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 1311 | else: 1312 | L = n * ( 1 + log(Y.mean()) ) 1313 | return L 1314 | 1315 | 1316 | def _grimshaw(self,epsilon = 1e-8, n_points = 10): 1317 | """ 1318 | Compute the GPD parameters estimation with the Grimshaw's trick 1319 | 1320 | Parameters 1321 | ---------- 1322 | epsilon : float 1323 | numerical parameter to perform (default : 1e-8) 1324 | n_points : int 1325 | maximum number of candidates for maximum likelihood (default : 10) 1326 | 1327 | Returns 1328 | ---------- 1329 | gamma_best,sigma_best,ll_best 1330 | gamma estimates, sigma estimates and corresponding log-likelihood 1331 | """ 1332 | def u(s): 1333 | return 1 + 
np.log(s).mean() 1334 | 1335 | def v(s): 1336 | return np.mean(1/s) 1337 | 1338 | def w(Y,t): 1339 | s = 1+t*Y 1340 | us = u(s) 1341 | vs = v(s) 1342 | return us*vs-1 1343 | 1344 | def jac_w(Y,t): 1345 | s = 1+t*Y 1346 | us = u(s) 1347 | vs = v(s) 1348 | jac_us = (1/t)*(1-vs) 1349 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 1350 | return us*jac_vs+vs*jac_us 1351 | 1352 | 1353 | Ym = self.peaks.min() 1354 | YM = self.peaks.max() 1355 | Ymean = self.peaks.mean() 1356 | 1357 | 1358 | a = -1/YM 1359 | if abs(a)<2*epsilon: 1360 | epsilon = abs(a)/n_points 1361 | 1362 | a = a + epsilon 1363 | b = 2*(Ymean-Ym)/(Ymean*Ym) 1364 | c = 2*(Ymean-Ym)/(Ym**2) 1365 | 1366 | # We look for possible roots 1367 | left_zeros = SPOT._rootsFinder(lambda t: w(self.peaks,t), 1368 | lambda t: jac_w(self.peaks,t), 1369 | (a+epsilon,-epsilon), 1370 | n_points,'regular') 1371 | 1372 | right_zeros = SPOT._rootsFinder(lambda t: w(self.peaks,t), 1373 | lambda t: jac_w(self.peaks,t), 1374 | (b,c), 1375 | n_points,'regular') 1376 | 1377 | # all the possible roots 1378 | zeros = np.concatenate((left_zeros,right_zeros)) 1379 | 1380 | # 0 is always a solution so we initialize with it 1381 | gamma_best = 0 1382 | sigma_best = Ymean 1383 | ll_best = SPOT._log_likelihood(self.peaks,gamma_best,sigma_best) 1384 | 1385 | # we look for better candidates 1386 | for z in zeros: 1387 | gamma = u(1+z*self.peaks)-1 1388 | sigma = gamma/z 1389 | ll = dSPOT._log_likelihood(self.peaks,gamma,sigma) 1390 | if ll>ll_best: 1391 | gamma_best = gamma 1392 | sigma_best = sigma 1393 | ll_best = ll 1394 | 1395 | return gamma_best,sigma_best,ll_best 1396 | 1397 | 1398 | 1399 | def _quantile(self,gamma,sigma): 1400 | """ 1401 | Compute the quantile at level 1-q 1402 | 1403 | Parameters 1404 | ---------- 1405 | gamma : float 1406 | GPD parameter 1407 | sigma : float 1408 | GPD parameter 1409 | 1410 | Returns 1411 | ---------- 1412 | float 1413 | quantile at level 1-q for the GPD(γ,σ,μ=0) 1414 | """ 1415 | r = self.n * self.proba / self.Nt 1416 | if gamma != 0: 1417 | return self.init_threshold + (sigma/gamma)*(pow(r,-gamma)-1) 1418 | else: 1419 | return self.init_threshold - sigma*log(r) 1420 | 1421 | 1422 | def run(self, with_alarm = True): 1423 | """ 1424 | Run biSPOT on the stream 1425 | 1426 | Parameters 1427 | ---------- 1428 | with_alarm : bool 1429 | (default = True) If False, SPOT will adapt the threshold assuming \ 1430 | there is no abnormal values 1431 | 1432 | 1433 | Returns 1434 | ---------- 1435 | dict 1436 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 1437 | 1438 | '***-thresholds' contains the extreme quantiles and 'alarms' contains \ 1439 | the indexes of the values which have triggered alarms 1440 | 1441 | """ 1442 | if (self.n>self.init_data.size): 1443 | print('Warning : the algorithm seems to have already been run, you \ 1444 | should initialize before running again') 1445 | return {} 1446 | 1447 | # actual normal window 1448 | W = self.init_data[-self.depth:] 1449 | 1450 | # list of the thresholds 1451 | th = [] 1452 | alarm = [] 1453 | # Loop over the stream 1454 | for i in tqdm.tqdm(range(self.data.size)): 1455 | Mi = W.mean() 1456 | # If the observed value exceeds the current threshold (alarm case) 1457 | if (self.data[i]-Mi)>self.extreme_quantile: 1458 | # if we want to alarm, we put it in the alarm list 1459 | if with_alarm: 1460 | alarm.append(i) 1461 | # otherwise we add it in the peaks 1462 | else: 1463 | self.peaks = np.append(self.peaks,self.data[i]-Mi-self.init_threshold) 1464 | self.Nt += 1 1465 | self.n += 
1 1466 | # and we update the thresholds 1467 | 1468 | g,s,l = self._grimshaw() 1469 | self.extreme_quantile = self._quantile(g,s) #+ Mi 1470 | W = np.append(W[1:],self.data[i]) 1471 | 1472 | # case where the value exceeds the initial threshold but not the alarm ones 1473 | elif (self.data[i]-Mi)>self.init_threshold: 1474 | # we add it in the peaks 1475 | self.peaks = np.append(self.peaks,self.data[i]-Mi-self.init_threshold) 1476 | self.Nt += 1 1477 | self.n += 1 1478 | # and we update the thresholds 1479 | 1480 | g,s,l = self._grimshaw() 1481 | self.extreme_quantile = self._quantile(g,s) #+ Mi 1482 | W = np.append(W[1:],self.data[i]) 1483 | else: 1484 | self.n += 1 1485 | W = np.append(W[1:],self.data[i]) 1486 | 1487 | 1488 | th.append(self.extreme_quantile+Mi) # thresholds record 1489 | 1490 | return {'thresholds' : th, 'alarms': alarm} 1491 | 1492 | 1493 | def plot(self,run_results, with_alarm = True): 1494 | """ 1495 | Plot the results given by the run 1496 | 1497 | Parameters 1498 | ---------- 1499 | run_results : dict 1500 | results given by the 'run' method 1501 | with_alarm : bool 1502 | (default = True) If True, alarms are plotted. 1503 | 1504 | 1505 | Returns 1506 | ---------- 1507 | list 1508 | list of the plots 1509 | 1510 | """ 1511 | x = range(self.data.size) 1512 | K = run_results.keys() 1513 | 1514 | ts_fig, = plt.plot(x,self.data,color=air_force_blue) 1515 | fig = [ts_fig] 1516 | 1517 | # if 'upper_thresholds' in K: 1518 | # thup = run_results['upper_thresholds'] 1519 | # uth_fig, = plt.plot(x,thup,color=deep_saffron,lw=2,ls='dashed') 1520 | # fig.append(uth_fig) 1521 | # 1522 | # if 'lower_thresholds' in K: 1523 | # thdown = run_results['lower_thresholds'] 1524 | # lth_fig, = plt.plot(x,thdown,color=deep_saffron,lw=2,ls='dashed') 1525 | # fig.append(lth_fig) 1526 | 1527 | if 'thresholds' in K: 1528 | th = run_results['thresholds'] 1529 | th_fig, = plt.plot(x,th,color=deep_saffron,lw=2,ls='dashed') 1530 | fig.append(th_fig) 1531 | 1532 | if with_alarm and ('alarms' in K): 1533 | alarm = run_results['alarms'] 1534 | if len(alarm)>0: 1535 | plt.scatter(alarm,self.data[alarm],color='red') 1536 | 1537 | plt.xlim((0,self.data.size)) 1538 | 1539 | 1540 | return fig 1541 | 1542 | 1543 | 1544 | 1545 | 1546 | 1547 | 1548 | """ 1549 | =========================== DRIFT & DOUBLE BOUNDS ============================= 1550 | """ 1551 | 1552 | 1553 | 1554 | class bidSPOT: 1555 | """ 1556 | This class allows to run DSPOT algorithm on univariate dataset (upper and lower bounds) 1557 | 1558 | Attributes 1559 | ---------- 1560 | proba : float 1561 | Detection level (risk), chosen by the user 1562 | 1563 | depth : int 1564 | Number of observations to compute the moving average 1565 | 1566 | extreme_quantile : float 1567 | current threshold (bound between normal and abnormal events) 1568 | 1569 | data : numpy.array 1570 | stream 1571 | 1572 | init_data : numpy.array 1573 | initial batch of observations (for the calibration/initialization step) 1574 | 1575 | init_threshold : float 1576 | initial threshold computed during the calibration step 1577 | 1578 | peaks : numpy.array 1579 | array of peaks (excesses above the initial threshold) 1580 | 1581 | n : int 1582 | number of observed values 1583 | 1584 | Nt : int 1585 | number of observed peaks 1586 | """ 1587 | def __init__(self, q = 1e-4, depth = 10): 1588 | self.proba = q 1589 | self.data = None 1590 | self.init_data = None 1591 | self.n = 0 1592 | self.depth = depth 1593 | 1594 | nonedict = {'up':None,'down':None} 1595 | 1596 | 
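# bidSPOT keeps one copy of every quantity per tail: the 'up' and 'down' entries below
# hold separate initial thresholds, peak sets, GPD parameters and extreme quantiles.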
self.extreme_quantile = dict.copy(nonedict) 1597 | self.init_threshold = dict.copy(nonedict) 1598 | self.peaks = dict.copy(nonedict) 1599 | self.gamma = dict.copy(nonedict) 1600 | self.sigma = dict.copy(nonedict) 1601 | self.Nt = {'up':0,'down':0} 1602 | 1603 | 1604 | def __str__(self): 1605 | s = '' 1606 | s += 'Streaming Peaks-Over-Threshold Object\n' 1607 | s += 'Detection level q = %s\n' % self.proba 1608 | if self.data is not None: 1609 | s += 'Data imported : Yes\n' 1610 | s += '\t initialization : %s values\n' % self.init_data.size 1611 | s += '\t stream : %s values\n' % self.data.size 1612 | else: 1613 | s += 'Data imported : No\n' 1614 | return s 1615 | 1616 | if self.n == 0: 1617 | s += 'Algorithm initialized : No\n' 1618 | else: 1619 | s += 'Algorithm initialized : Yes\n' 1620 | s += '\t initial threshold : %s\n' % self.init_threshold 1621 | 1622 | r = self.n-self.init_data.size 1623 | if r > 0: 1624 | s += 'Algorithm run : Yes\n' 1625 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 1626 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 1627 | else: 1628 | s += '\t number of peaks : %s\n' % self.Nt 1629 | s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up'] 1630 | s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down'] 1631 | s += 'Algorithm run : No\n' 1632 | return s 1633 | 1634 | 1635 | def fit(self,init_data,data): 1636 | """ 1637 | Import data to biDSPOT object 1638 | 1639 | Parameters 1640 | ---------- 1641 | init_data : list, numpy.array or pandas.Series 1642 | initial batch to calibrate the algorithm 1643 | 1644 | data : numpy.array 1645 | data for the run (list, np.array or pd.series) 1646 | 1647 | """ 1648 | if isinstance(data,list): 1649 | self.data = np.array(data) 1650 | elif isinstance(data,np.ndarray): 1651 | self.data = data 1652 | elif isinstance(data,pd.Series): 1653 | self.data = data.values 1654 | else: 1655 | print('This data format (%s) is not supported' % type(data)) 1656 | return 1657 | 1658 | if isinstance(init_data,list): 1659 | self.init_data = np.array(init_data) 1660 | elif isinstance(init_data,np.ndarray): 1661 | self.init_data = init_data 1662 | elif isinstance(init_data,pd.Series): 1663 | self.init_data = init_data.values 1664 | elif isinstance(init_data,int): 1665 | self.init_data = self.data[:init_data] 1666 | self.data = self.data[init_data:] 1667 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 1668 | r = int(init_data*data.size) 1669 | self.init_data = self.data[:r] 1670 | self.data = self.data[r:] 1671 | else: 1672 | print('The initial data cannot be set') 1673 | return 1674 | 1675 | def add(self,data): 1676 | """ 1677 | This function allows to append data to the already fitted data 1678 | 1679 | Parameters 1680 | ---------- 1681 | data : list, numpy.array, pandas.Series 1682 | data to append 1683 | """ 1684 | if isinstance(data,list): 1685 | data = np.array(data) 1686 | elif isinstance(data,np.ndarray): 1687 | data = data 1688 | elif isinstance(data,pd.Series): 1689 | data = data.values 1690 | else: 1691 | print('This data format (%s) is not supported' % type(data)) 1692 | return 1693 | 1694 | self.data = np.append(self.data,data) 1695 | return 1696 | 1697 | def initialize(self, verbose = True): 1698 | """ 1699 | Run the calibration (initialization) step 1700 | 1701 | Parameters 1702 | ---------- 1703 | verbose : bool 1704 | (default = True) If True, gives details about the batch initialization 1705 | """ 1706 | n_init = 
self.init_data.size - self.depth 1707 | 1708 | M = backMean(self.init_data,self.depth) 1709 | T = self.init_data[self.depth:]-M[:-1] # new variable 1710 | 1711 | S = np.sort(T) # we sort T to get the empirical quantile 1712 | self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm 1713 | self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm 1714 | 1715 | # initial peaks 1716 | self.peaks['up'] = T[T>self.init_threshold['up']]-self.init_threshold['up'] 1717 | self.peaks['down'] = -( T[ T0) 1810 | 1811 | Returns 1812 | ---------- 1813 | float 1814 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 1815 | """ 1816 | n = Y.size 1817 | if gamma != 0: 1818 | tau = gamma/sigma 1819 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 1820 | else: 1821 | L = n * ( 1 + log(Y.mean()) ) 1822 | return L 1823 | 1824 | 1825 | def _grimshaw(self,side,epsilon = 1e-8, n_points = 8): 1826 | """ 1827 | Compute the GPD parameters estimation with the Grimshaw's trick 1828 | 1829 | Parameters 1830 | ---------- 1831 | epsilon : float 1832 | numerical parameter to perform (default : 1e-8) 1833 | n_points : int 1834 | maximum number of candidates for maximum likelihood (default : 10) 1835 | 1836 | Returns 1837 | ---------- 1838 | gamma_best,sigma_best,ll_best 1839 | gamma estimates, sigma estimates and corresponding log-likelihood 1840 | """ 1841 | def u(s): 1842 | return 1 + np.log(s).mean() 1843 | 1844 | def v(s): 1845 | return np.mean(1/s) 1846 | 1847 | def w(Y,t): 1848 | s = 1+t*Y 1849 | us = u(s) 1850 | vs = v(s) 1851 | return us*vs-1 1852 | 1853 | def jac_w(Y,t): 1854 | s = 1+t*Y 1855 | us = u(s) 1856 | vs = v(s) 1857 | jac_us = (1/t)*(1-vs) 1858 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 1859 | return us*jac_vs+vs*jac_us 1860 | 1861 | 1862 | Ym = self.peaks[side].min() 1863 | YM = self.peaks[side].max() 1864 | Ymean = self.peaks[side].mean() 1865 | 1866 | 1867 | a = -1/YM 1868 | if abs(a)<2*epsilon: 1869 | epsilon = abs(a)/n_points 1870 | 1871 | a = a + epsilon 1872 | b = 2*(Ymean-Ym)/(Ymean*Ym) 1873 | c = 2*(Ymean-Ym)/(Ym**2) 1874 | 1875 | # We look for possible roots 1876 | left_zeros = bidSPOT._rootsFinder(lambda t: w(self.peaks[side],t), 1877 | lambda t: jac_w(self.peaks[side],t), 1878 | (a+epsilon,-epsilon), 1879 | n_points,'regular') 1880 | 1881 | right_zeros = bidSPOT._rootsFinder(lambda t: w(self.peaks[side],t), 1882 | lambda t: jac_w(self.peaks[side],t), 1883 | (b,c), 1884 | n_points,'regular') 1885 | 1886 | # all the possible roots 1887 | zeros = np.concatenate((left_zeros,right_zeros)) 1888 | 1889 | # 0 is always a solution so we initialize with it 1890 | gamma_best = 0 1891 | sigma_best = Ymean 1892 | ll_best = bidSPOT._log_likelihood(self.peaks[side],gamma_best,sigma_best) 1893 | 1894 | # we look for better candidates 1895 | for z in zeros: 1896 | gamma = u(1+z*self.peaks[side])-1 1897 | sigma = gamma/z 1898 | ll = bidSPOT._log_likelihood(self.peaks[side],gamma,sigma) 1899 | if ll>ll_best: 1900 | gamma_best = gamma 1901 | sigma_best = sigma 1902 | ll_best = ll 1903 | 1904 | return gamma_best,sigma_best,ll_best 1905 | 1906 | 1907 | 1908 | def _quantile(self,side,gamma,sigma): 1909 | """ 1910 | Compute the quantile at level 1-q for a given side 1911 | 1912 | Parameters 1913 | ---------- 1914 | side : str 1915 | 'up' or 'down' 1916 | gamma : float 1917 | GPD parameter 1918 | sigma : float 1919 | GPD parameter 1920 | 1921 | Returns 1922 | ---------- 1923 | float 1924 | quantile at level 1-q for the GPD(γ,σ,μ=0) 
1925 | """ 1926 | if side == 'up': 1927 | r = self.n * self.proba / self.Nt[side] 1928 | if gamma != 0: 1929 | return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1) 1930 | else: 1931 | return self.init_threshold['up'] - sigma*log(r) 1932 | elif side == 'down': 1933 | r = self.n * self.proba / self.Nt[side] 1934 | if gamma != 0: 1935 | return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1) 1936 | else: 1937 | return self.init_threshold['down'] + sigma*log(r) 1938 | else: 1939 | print('error : the side is not right') 1940 | 1941 | 1942 | def run(self, with_alarm = True, plot = True): 1943 | """ 1944 | Run biDSPOT on the stream 1945 | 1946 | Parameters 1947 | ---------- 1948 | with_alarm : bool 1949 | (default = True) If False, SPOT will adapt the threshold assuming \ 1950 | there is no abnormal values 1951 | 1952 | 1953 | Returns 1954 | ---------- 1955 | dict 1956 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 1957 | 1958 | '***-thresholds' contains the extreme quantiles and 'alarms' contains \ 1959 | the indexes of the values which have triggered alarms 1960 | 1961 | """ 1962 | if (self.n>self.init_data.size): 1963 | print('Warning : the algorithm seems to have already been run, you \ 1964 | should initialize before running again') 1965 | return {} 1966 | 1967 | # actual normal window 1968 | W = self.init_data[-self.depth:] 1969 | 1970 | # list of the thresholds 1971 | thup = [] 1972 | thdown = [] 1973 | alarm = [] 1974 | # Loop over the stream 1975 | for i in tqdm.tqdm(range(self.data.size)): 1976 | Mi = W.mean() 1977 | Ni = self.data[i]-Mi 1978 | # If the observed value exceeds the current threshold (alarm case) 1979 | if Ni>self.extreme_quantile['up'] : 1980 | # if we want to alarm, we put it in the alarm list 1981 | if with_alarm: 1982 | alarm.append(i) 1983 | # otherwise we add it in the peaks 1984 | else: 1985 | self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up']) 1986 | self.Nt['up'] += 1 1987 | self.n += 1 1988 | # and we update the thresholds 1989 | 1990 | g,s,l = self._grimshaw('up') 1991 | self.extreme_quantile['up'] = self._quantile('up',g,s) 1992 | W = np.append(W[1:],self.data[i]) 1993 | 1994 | # case where the value exceeds the initial threshold but not the alarm ones 1995 | elif Ni>self.init_threshold['up']: 1996 | # we add it in the peaks 1997 | self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up']) 1998 | self.Nt['up'] += 1 1999 | self.n += 1 2000 | # and we update the thresholds 2001 | g,s,l = self._grimshaw('up') 2002 | self.extreme_quantile['up'] = self._quantile('up',g,s) 2003 | W = np.append(W[1:],self.data[i]) 2004 | 2005 | elif Ni0: 2079 | al_fig = plt.scatter(alarm,self.data[alarm],color='red') 2080 | fig.append(al_fig) 2081 | 2082 | plt.xlim((0,self.data.size)) 2083 | 2084 | 2085 | return fig 2086 | 2087 | 2088 | 2089 | 2090 | 2091 | 2092 | 2093 | 2094 | 2095 | --------------------------------------------------------------------------------