├── pip-requirement.txt ├── anomalytransfer ├── __init__.py ├── utils │ ├── __init__.py │ ├── config.py │ ├── data.py │ ├── testing.py │ └── logging.py ├── transfer │ ├── __init__.py │ ├── data.py │ ├── models.py │ └── spot.py └── clustering │ ├── __init__.py │ ├── baseline_extraction.py │ ├── average.py │ ├── preprocessing.py │ └── models.py ├── env.sh ├── environment.yml ├── sample ├── scripts │ ├── transfer │ │ ├── utils.py │ │ ├── plot_kpi.py │ │ ├── naive_bagel.py │ │ ├── transfer_learning.py │ │ ├── cluster_transfer_train.py │ │ └── cluster_transfer_test.py │ ├── transfer_entirely │ │ ├── utils.py │ │ └── finetune.py │ ├── clustering │ │ ├── step2_baseline_extraction.py │ │ ├── step1_preprocessing.py │ │ ├── step3_average.py │ │ └── step4_clustering.py │ └── test_time │ │ ├── test_adtshl.py │ │ └── test_at.py └── configs │ └── default.conf ├── LICENSE └── setup.py /pip-requirement.txt: -------------------------------------------------------------------------------- 1 | torch 2 | tqdm 3 | numpy 4 | scipy 5 | pandas 6 | matplotlib 7 | scikit-learn -------------------------------------------------------------------------------- /anomalytransfer/__init__.py: -------------------------------------------------------------------------------- 1 | import anomalytransfer.transfer as transfer 2 | import anomalytransfer.clustering as clustering 3 | import anomalytransfer.utils as utils 4 | -------------------------------------------------------------------------------- /env.sh: -------------------------------------------------------------------------------- 1 | # If the `anomalytransfer` package is not installed, run `source env.sh` from the repository root 2 | # to add the current directory to `PYTHONPATH`. 3 | 4 | export PYTHONPATH=`pwd` 5 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: at 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - scikit-learn 7 | - pandas 8 | - pytorch 9 | - matplotlib 10 | - tqdm 11 | -------------------------------------------------------------------------------- /anomalytransfer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from anomalytransfer.utils.config import * 2 | from anomalytransfer.utils.logging import * 3 | from anomalytransfer.utils.data import * 4 | from anomalytransfer.utils.testing import * 5 | -------------------------------------------------------------------------------- /anomalytransfer/transfer/__init__.py: -------------------------------------------------------------------------------- 1 | import anomalytransfer.transfer.data as data 2 | import anomalytransfer.transfer.models as models 3 | import anomalytransfer.transfer.spot as spot 4 | 5 | from anomalytransfer.transfer.spot import SPOT 6 | from anomalytransfer.transfer.models import AnomalyDetector 7 | -------------------------------------------------------------------------------- /anomalytransfer/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | import anomalytransfer.clustering.average as average 2 | import anomalytransfer.clustering.baseline_extraction as baseline_extraction 3 | import anomalytransfer.clustering.preprocessing as preprocessing 4 | import anomalytransfer.clustering.models as models 5 | 6 | from anomalytransfer.clustering.models import LatentTransformer 7 | 
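Note: the `__init__.py` files above define the package's public API (`at.transfer.AnomalyDetector`, `at.clustering.LatentTransformer`, and the `at.utils` helpers). The following is a minimal usage sketch assembled from the sample scripts later in this dump (see `sample/scripts/transfer/naive_bagel.py`); the input file name and the one-week training split are assumptions for illustration only.

import anomalytransfer as at

# Hypothetical input: a CSV with `timestamp`, `value` and an optional `label` column.
kpi = at.utils.load_kpi('some_kpi.csv')
kpi.complete_timestamp()                                   # fill timestamp gaps and mark missing points

# Use roughly one week of points for training (assumed split, following naive_bagel.py).
num_points = int(7 * 24 * 60 / (kpi.interval / 60))
train_kpi, test_kpi = kpi.split_by_idx(num_points)
train_kpi, mean, std = train_kpi.standardize()
test_kpi, _, _ = test_kpi.standardize(mean=mean, std=std)

model = at.transfer.AnomalyDetector()
model.fit(kpi=train_kpi.no_labels(), epochs=200)           # unsupervised training on the train split
scores = model.predict(test_kpi)                           # anomaly scores for the test split
results = at.utils.get_test_results(labels=test_kpi.labels, scores=scores, missing=test_kpi.missing)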
-------------------------------------------------------------------------------- /sample/scripts/transfer/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | class run_time: 4 | def __init__(self, func = None): 5 | self.func = func 6 | 7 | def __call__(self, *args, **kwargs): 8 | start = time.time() 9 | res = self.func(*args, **kwargs) 10 | end = time.time() 11 | print(f"time: {end - start}") 12 | return res 13 | 14 | def __enter__(self): 15 | self.start = time.time() 16 | return self 17 | 18 | def __exit__(self, exc_type, exc_val, exc_tb): 19 | self.end = time.time() 20 | 21 | def get_time(self): 22 | return self.end - self.start 23 | -------------------------------------------------------------------------------- /sample/scripts/transfer_entirely/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | class run_time: 4 | def __init__(self, func = None): 5 | self.func = func 6 | 7 | def __call__(self, *args, **kwargs): 8 | start = time.time() 9 | res = self.func(*args, **kwargs) 10 | end = time.time() 11 | print(f"time: {end - start}") 12 | return res 13 | 14 | def __enter__(self): 15 | self.start = time.time() 16 | return self 17 | 18 | def __exit__(self, exc_type, exc_val, exc_tb): 19 | self.end = time.time() 20 | 21 | def get_time(self): 22 | return self.end - self.start 23 | -------------------------------------------------------------------------------- /anomalytransfer/utils/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | PROJECT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 3 | import configparser 4 | 5 | DEFAULT_CONFIGS = [ 6 | os.path.join(PROJECT_PATH, "sample", "configs", "default.conf"), 7 | ] 8 | 9 | LOCAL_CONFIGS = [ 10 | os.path.join(PROJECT_PATH, "sample", "configs", "local.conf"), 11 | ] 12 | 13 | 14 | def config() -> configparser.ConfigParser: 15 | config_parser = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation()) 16 | config_list = [] 17 | config_list.extend(DEFAULT_CONFIGS) 18 | for local_config in LOCAL_CONFIGS: 19 | if os.path.exists(local_config): 20 | config_list.append(local_config) 21 | config_parser.read(config_list) 22 | return config_parser 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Zhong Zhenyu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /sample/scripts/clustering/step2_baseline_extraction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import anomalytransfer as at 4 | 5 | 6 | def main(): 7 | at.utils.mkdirs(OUTPUT) 8 | file_list = at.utils.file_list(INPUT) 9 | progbar = at.utils.ProgBar(len(file_list), interval=0.5, unit_name='file') 10 | print('Extracting baselines...') 11 | 12 | for file in file_list: 13 | filename = at.utils.filename(file) 14 | df = pd.read_csv(file) 15 | values = at.clustering.baseline_extraction.smoothing_extreme_values(df.value) 16 | standardized = at.clustering.baseline_extraction.extract_baseline(values, window_size=WINDOW_SIZE) 17 | df = pd.DataFrame({'timestamp': df.timestamp.iloc[WINDOW_SIZE - 1:], 'value': standardized[0]}) 18 | df.to_csv(os.path.join(OUTPUT, filename + '.csv'), index=False) 19 | progbar.add(1) 20 | 21 | 22 | if __name__ == '__main__': 23 | config = at.utils.config() 24 | 25 | INPUT = config.get('CLUSTERING_BASELINE_EXTRACTION', 'input') 26 | OUTPUT = config.get('CLUSTERING_BASELINE_EXTRACTION', 'output') 27 | WINDOW_SIZE = config.getint('CLUSTERING_BASELINE_EXTRACTION', 'window_size') 28 | 29 | main() 30 | -------------------------------------------------------------------------------- /anomalytransfer/clustering/baseline_extraction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import anomalytransfer as at 3 | 4 | from typing import Sequence, Tuple 5 | 6 | 7 | def smoothing_extreme_values(values: Sequence) -> np.ndarray: 8 | values = np.asarray(values, np.float32) 9 | if len(values.shape) != 1: 10 | raise ValueError('`values` must be a 1-D array') 11 | 12 | abnormal_portion = 0.05 13 | values_deviation = np.abs(values) 14 | 15 | abnormal_max = np.max(values_deviation) 16 | abnormal_index = np.argwhere(values_deviation >= abnormal_max * (1 - abnormal_portion)) 17 | abnormal = abnormal_index.reshape(len(abnormal_index)) 18 | normal_index = np.argwhere(values_deviation < abnormal_max * (1 - abnormal_portion)) 19 | normal = normal_index.reshape(len(normal_index)) 20 | normal_values = values[normal] 21 | abnormal_values = np.interp(abnormal, normal, normal_values) 22 | values[abnormal] = abnormal_values 23 | 24 | return values 25 | 26 | 27 | def extract_baseline(values: Sequence, window_size: int) -> Tuple[np.ndarray, float, float]: 28 | baseline = np.convolve(values, np.ones((window_size,)) / window_size, mode='valid') 29 | return at.clustering.preprocessing.standardize(baseline) 30 | -------------------------------------------------------------------------------- /sample/scripts/clustering/step1_preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import anomalytransfer as at 4 | 5 | 6 | def main(): 7 | at.utils.mkdirs(OUTPUT) 8 | file_list = at.utils.file_list(INPUT) 9 | progbar = at.utils.ProgBar(len(file_list), interval=0.5, unit_name='file') 10 | print('Preprocessing...') 11 | 12 | for file in file_list: 13 | filename = at.utils.filename(file) 14 | df = pd.read_csv(file) 15 | timestamps, 
_, ret_arrays = at.clustering.preprocessing.linear_interpolation(df.timestamp, [df.value]) 16 | # ! Don't downsample before training Bagel 17 | # timestamps, values = at.clustering.preprocessing.down_sampling([timestamps, ret_arrays[0]], 18 | # step=DOWN_SAMPLING_STEP) 19 | values, _, _ = at.clustering.preprocessing.standardize(ret_arrays[0]) 20 | df = pd.DataFrame({'timestamp': timestamps, 'value': values}) 21 | df.to_csv(os.path.join(OUTPUT, filename + '.csv'), index=False) 22 | progbar.add(1) 23 | 24 | 25 | if __name__ == '__main__': 26 | config = at.utils.config() 27 | 28 | INPUT = config.get('CLUSTERING_PREPROCESSING', 'input') 29 | OUTPUT = config.get('CLUSTERING_PREPROCESSING', 'output') 30 | DOWN_SAMPLING_STEP = config.getint('CLUSTERING_PREPROCESSING', 'down_sampling_step') 31 | 32 | main() 33 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md', 'r') as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name='anomalytransfer', 8 | version='0.3.0', 9 | author='AlumiK', 10 | author_email='nczzy1997@gmail.com', 11 | license='MIT', 12 | description='Implementation of AnomalyTransfer in PyTorch', 13 | long_description=long_description, 14 | long_description_content_type='text/markdown', 15 | url='https://github.com/AlumiK/anomalytransfer', 16 | packages=setuptools.find_packages(include=['anomalytransfer', 'anomalytransfer.*']), 17 | platforms='any', 18 | install_requires=[ 19 | 'pandas', 20 | 'scikit-learn', 21 | 'torch', 22 | 'tqdm' 23 | ], 24 | extras_require={ 25 | 'dev': [ 26 | 'matplotlib', 27 | ], 28 | }, 29 | dependency_links=[ 30 | 'https://download.pytorch.org/whl/torch_stable.html', 31 | ], 32 | classifiers=[ 33 | 'Development Status :: 2 - Pre-Alpha', 34 | 'Intended Audience :: Developers', 35 | 'Programming Language :: Python :: 3', 36 | 'Programming Language :: Python :: 3.8', 37 | 'License :: OSI Approved :: MIT License', 38 | 'Operating System :: OS Independent', 39 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 40 | 'Topic :: Software Development :: Libraries :: Python Modules', 41 | ], 42 | python_requires='==3.8.*', 43 | ) 44 | -------------------------------------------------------------------------------- /anomalytransfer/utils/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import anomalytransfer as at 4 | 5 | from typing import Sequence, Tuple 6 | 7 | 8 | def filename(file: str) -> str: 9 | return os.path.splitext(os.path.basename(file))[0] 10 | 11 | 12 | def mkdirs(*dir_list): 13 | for directory in dir_list: 14 | os.makedirs(directory, exist_ok=True) 15 | 16 | 17 | def file_list(path: str) -> Sequence: 18 | if os.path.isdir(path): 19 | return [os.path.join(path, file) for file in os.listdir(path) if file.endswith(".csv")] 20 | else: 21 | return [path] 22 | 23 | 24 | def load_kpi(file: str, **kwargs) -> at.transfer.data.KPI: 25 | df = pd.read_csv(file, **kwargs) 26 | df.dropna(axis=0, inplace=True) 27 | return at.transfer.data.KPI(timestamps=df.timestamp, 28 | values=df.value, 29 | labels=df.get('label', None), 30 | name=filename(file)) 31 | 32 | 33 | class KPIStats: 34 | 35 | def __init__(self, kpi: at.transfer.data.KPI): 36 | self.num_points = len(kpi.values) 37 | self.num_missing = len(kpi.missing[kpi.missing == 1]) 38 | self.num_anomaly = len(kpi.labels[kpi.labels == 1]) 39 | 
self.missing_rate = self.num_missing / self.num_points 40 | self.anomaly_rate = self.num_anomaly / self.num_points 41 | 42 | 43 | def get_kpi_stats(*kpis: at.transfer.data.KPI) -> Tuple[KPIStats, ...]: 44 | ret = [] 45 | for kpi in kpis: 46 | ret.append(KPIStats(kpi)) 47 | return tuple(ret) 48 | -------------------------------------------------------------------------------- /sample/scripts/clustering/step3_average.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import anomalytransfer as at 4 | 5 | 6 | def main(): 7 | at.utils.mkdirs(OUTPUT_DAILY, OUTPUT_WEEKLY) 8 | file_list = at.utils.file_list(INPUT) 9 | progbar = at.utils.ProgBar(len(file_list), interval=0.5, unit_name='file') 10 | print('Extracting sub-curves...') 11 | 12 | for file in file_list: 13 | filename = at.utils.filename(file) 14 | df = pd.read_csv(file) 15 | daily_average, ts_average = at.clustering.average.get_daily_average( 16 | *at.clustering.average.group_data_by_weekday(timestamps=df.timestamp, values=df.value)) 17 | for i in range(len(daily_average)): 18 | df = pd.DataFrame({ 19 | 'timestamp': ts_average[i], 20 | 'value': daily_average[i] 21 | }) 22 | 23 | df.to_csv(os.path.join(OUTPUT_DAILY, filename + f'_wd{i}.csv'), index=False) 24 | weekly_average, ts_average = at.clustering.average.get_weekly_average(daily_average, ts_average) 25 | df = pd.DataFrame({ 26 | 'timestamp': ts_average, 27 | 'value': weekly_average 28 | }) 29 | df.to_csv(os.path.join(OUTPUT_WEEKLY, filename + '.csv'), index=False) 30 | progbar.add(1) 31 | 32 | 33 | if __name__ == '__main__': 34 | config = at.utils.config() 35 | 36 | INPUT = config.get('CLUSTERING_AVERAGE', 'input') 37 | OUTPUT_DAILY = config.get('CLUSTERING_AVERAGE', 'output_daily') 38 | OUTPUT_WEEKLY = config.get('CLUSTERING_AVERAGE', 'output_weekly') 39 | 40 | main() 41 | -------------------------------------------------------------------------------- /sample/configs/default.conf: -------------------------------------------------------------------------------- 1 | [COMMON] 2 | num_threads=6 3 | project_path=/home/zhangshenglin/project/anomalytransfer/ 4 | 5 | [CLUSTERING_PREPROCESSING] 6 | # input=${COMMON:project_path}/input 7 | input=/home/zhangshenglin/data/kpi-nab 8 | output=${COMMON:project_path}/out/clustering/preprocessing 9 | down_sampling_step=10 10 | 11 | [CLUSTERING_BASELINE_EXTRACTION] 12 | input=${COMMON:project_path}/out/clustering/preprocessing 13 | output=${COMMON:project_path}/out/clustering/baseline_extraction 14 | window_size=5 15 | 16 | [CLUSTERING_AVERAGE] 17 | input=${COMMON:project_path}/out/clustering/baseline_extraction 18 | output_daily=${COMMON:project_path}/out/clustering/average/daily 19 | output_weekly=${COMMON:project_path}/out/clustering/average/weekly 20 | 21 | [CLUSTERING] 22 | input=${COMMON:project_path}/out/clustering/average/daily 23 | output=${COMMON:project_path}/out/clustering/clustering 24 | epochs=200 25 | n_clusters=10 26 | 27 | [BAGEL] 28 | epochs=200 29 | # input=${COMMON:project_path}/input 30 | input=/home/zhangshenglin/data/kpi-nab 31 | output=${COMMON:project_path}/out/bagel 32 | 33 | [PLOT_KPI] 34 | # input=${COMMON:project_path}/input 35 | input=/home/zhangshenglin/data/kpi-nab 36 | output=${COMMON:project_path}/out/plot_kpi 37 | fig_width=32 38 | fig_height=6 39 | fig_dpi=144 40 | 41 | [TRANSFER_LEARNING] 42 | base_epochs=200 43 | data_epochs=200 44 | input=${COMMON:project_path}/out/clustering/clustering/top_k_daily_cluster 45 | 
test_output=${COMMON:project_path}/out/transfer_learning/test_results 46 | output=${COMMON:project_path}/out/transfer_learning/results 47 | model_path=${COMMON:project_path}/out/transfer_learning/models 48 | ratio=0.7 49 | -------------------------------------------------------------------------------- /sample/scripts/test_time/test_adtshl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import anomalytransfer as at 4 | os.environ["CUDA_VISIBLE_DEVICES"] = '' 5 | import pandas as pd 6 | 7 | def main(): 8 | at.utils.mkdirs(OUTPUT) 9 | file_list = at.utils.file_list(INPUT)[0:1] # only one 10 | proglog = at.utils.ProgLog(len(file_list)) 11 | 12 | result = {} 13 | for file in file_list: 14 | for exp in range(10): 15 | kpi = at.utils.load_kpi(file) 16 | proglog.log(kpi=kpi.name) 17 | kpi.complete_timestamp() 18 | total_minutes = 24 * 60 19 | interval = kpi.interval / 60 20 | num_of_point = int(total_minutes / interval) 21 | train_kpi, test_kpi = kpi.split_by_idx(num_of_point) 22 | 23 | train_kpi, mean, std = train_kpi.standardize() 24 | test_kpi, _, _ = test_kpi.standardize(mean=mean, std=std) 25 | 26 | model = at.transfer.AnomalyDetector() 27 | history = model.fit(kpi=train_kpi.no_labels(), epochs=EPOCHS) 28 | result[f"ts_{exp}"] = history['ts'] 29 | result[f"loss_{exp}"] = history['loss'] 30 | dt = pd.DataFrame(result) 31 | # dt.to_csv("adtshl.csv", index=False) 32 | dt.to_csv("bagel.csv", index=False) 33 | 34 | 35 | if __name__ == '__main__': 36 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s [%(levelname)s]] %(message)s') 37 | 38 | config = at.utils.config() 39 | NUM_THREADS = config.getint('COMMON', 'num_threads') 40 | EPOCHS = config.getint('BAGEL', 'epochs') 41 | INPUT = config.get('BAGEL', 'input') 42 | OUTPUT = config.get('BAGEL', 'output') 43 | 44 | # at.utils.set_num_threads(NUM_THREADS) 45 | main() 46 | -------------------------------------------------------------------------------- /anomalytransfer/clustering/average.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import numpy as np 3 | 4 | from typing import Sequence, Tuple 5 | 6 | 7 | def _get_weekday(timestamps: Sequence) -> Sequence: 8 | return [datetime.datetime.fromtimestamp(t).weekday() for t in timestamps] 9 | 10 | 11 | def group_data_by_weekday(timestamps: Sequence, values: Sequence) -> Tuple[Sequence, Sequence]: 12 | timestamps = np.asarray(timestamps, dtype=np.int64) 13 | values = np.asarray(values, dtype=np.float32) 14 | weekday = _get_weekday(timestamps) 15 | grouped_data = [[], [], [], [], [], [], []] 16 | grouped_ts = [[], [], [], [], [], [], []] 17 | current_weekday = weekday[0] 18 | current_index = 0 19 | for i in range(len(weekday)): 20 | if weekday[i] != current_weekday: 21 | if current_index != 0: 22 | # ! Add more 119 points (Bagel ignore the first 119 points!) 
23 | grouped_data[current_weekday].append(values[(current_index-119):i]) 24 | grouped_ts[current_weekday].append(timestamps[(current_index-119):i]) 25 | current_weekday = weekday[i] 26 | current_index = i 27 | return grouped_data, grouped_ts 28 | 29 | 30 | def get_daily_average(grouped_data: Sequence, grouped_ts: Sequence) -> Tuple[Sequence[np.ndarray], Sequence[np.ndarray]]: 31 | daily_average = [] 32 | ts_average = [] 33 | for weekday, ts in zip(grouped_data, grouped_ts): 34 | daily_average.append(np.mean(weekday, axis=0)) 35 | ts_average.append(ts[0]) 36 | return daily_average, ts_average 37 | 38 | 39 | def get_weekly_average(daily_average: Sequence, ts_average: Sequence) -> Tuple[np.ndarray, np.ndarray]: 40 | return np.concatenate(daily_average), np.concatenate(ts_average) 41 | -------------------------------------------------------------------------------- /sample/scripts/transfer/plot_kpi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | import numpy as np 4 | import anomalytransfer as at 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | def _expand(a: np.ndarray) -> np.ndarray: 9 | ret = np.copy(a) 10 | for i in range(length := len(a)): 11 | if a[i] == 1: 12 | if i - 1 >= 0: 13 | ret[i - 1] = 1 14 | if i + 1 < length: 15 | ret[i + 1] = 1 16 | return ret 17 | 18 | 19 | def _plot_kpi(kpi: at.transfer.data.KPI): 20 | x = [datetime.datetime.fromtimestamp(timestamp) for timestamp in kpi.timestamps] 21 | y_anomaly, y_missing = np.copy(kpi.values), np.copy(kpi.values) 22 | y_anomaly[_expand(kpi.labels) == 0] = np.inf 23 | y_missing[_expand(kpi.missing) == 0] = np.inf 24 | plt.plot(x, kpi.values) 25 | plt.plot(x, y_anomaly, color='red') 26 | plt.plot(x, y_missing, color='orange') 27 | plt.title(kpi.name) 28 | plt.ylim(-7.5, 7.5) 29 | 30 | 31 | def main(): 32 | at.utils.mkdirs(OUTPUT) 33 | file_list = at.utils.file_list(INPUT) 34 | 35 | plt.figure(figsize=(FIG_W, FIG_H), dpi=FIG_DPI) 36 | progbar = at.utils.ProgBar(len(file_list), interval=0.5, unit_name='file') 37 | print('Plotting...') 38 | 39 | for file in file_list: 40 | kpi = at.utils.load_kpi(file) 41 | kpi, _, _ = kpi.standardize() 42 | kpi.complete_timestamp() 43 | _plot_kpi(kpi) 44 | plt.savefig(os.path.join(OUTPUT, f'{kpi.name}.png')) 45 | plt.clf() 46 | progbar.add(1) 47 | 48 | 49 | if __name__ == '__main__': 50 | config = at.utils.config() 51 | 52 | INPUT = config.get('PLOT_KPI', 'input') 53 | OUTPUT = config.get('PLOT_KPI', 'output') 54 | FIG_W = config.getfloat('PLOT_KPI', 'fig_width') 55 | FIG_H = config.getfloat('PLOT_KPI', 'fig_height') 56 | FIG_DPI = config.getint('PLOT_KPI', 'fig_dpi') 57 | 58 | main() 59 | -------------------------------------------------------------------------------- /sample/scripts/transfer/naive_bagel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import anomalytransfer as at 4 | os.environ["CUDA_VISIBLE_DEVICES"] = '1' 5 | 6 | def main(): 7 | at.utils.mkdirs(OUTPUT) 8 | file_list = at.utils.file_list(INPUT) 9 | proglog = at.utils.ProgLog(len(file_list)) 10 | 11 | for file in file_list: 12 | kpi = at.utils.load_kpi(file) 13 | proglog.log(kpi=kpi.name) 14 | kpi.complete_timestamp() 15 | total_minutes = 7 * 24 * 60 16 | interval = kpi.interval / 60 17 | num_of_point = int(total_minutes / interval) 18 | train_kpi, test_kpi = kpi.split_by_idx(num_of_point) 19 | 20 | train_kpi, mean, std = train_kpi.standardize() 21 | test_kpi, _, _ = 
test_kpi.standardize(mean=mean, std=std) 22 | 23 | model = at.transfer.AnomalyDetector() 24 | model.fit(kpi=train_kpi.no_labels(), epochs=EPOCHS) 25 | anomaly_scores = model.predict(test_kpi) 26 | 27 | results = at.utils.get_test_results(labels=test_kpi.labels, 28 | scores=anomaly_scores, 29 | missing=test_kpi.missing) 30 | stats = at.utils.get_kpi_stats(kpi, test_kpi) 31 | at.utils.log_test_results(kpi.name, results=results) 32 | 33 | with open(f'{os.path.join(OUTPUT, kpi.name)}.txt', 'w') as output: 34 | output.write(f'[result]\n' 35 | f'threshold={results.get("threshold")}\n' 36 | f'precision={results.get("precision"):.3f}\n' 37 | f'recall={results.get("recall"):.3f}\n' 38 | f'f1_score={results.get("f1score"):.3f}\n\n' 39 | 40 | '[overall]\n' 41 | f'num_points={stats[0].num_points}\n' 42 | f'num_missing_points={stats[0].num_missing}\n' 43 | f'missing_rate={stats[0].missing_rate:.6f}\n' 44 | f'num_anomaly_points={stats[0].num_anomaly}\n' 45 | f'anomaly_rate={stats[0].anomaly_rate:.6f}\n\n' 46 | 47 | '[test]\n' 48 | f'num_points={stats[1].num_points}\n' 49 | f'num_missing_points={stats[1].num_missing}\n' 50 | f'missing_rate={stats[1].missing_rate:.6f}\n' 51 | f'num_anomaly_points={stats[1].num_anomaly}\n' 52 | f'anomaly_rate={stats[1].anomaly_rate:.6f}\n') 53 | 54 | 55 | if __name__ == '__main__': 56 | logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s [%(levelname)s]] %(message)s') 57 | 58 | config = at.utils.config() 59 | NUM_THREADS = config.getint('COMMON', 'num_threads') 60 | EPOCHS = config.getint('BAGEL', 'epochs') 61 | INPUT = config.get('BAGEL', 'input') 62 | OUTPUT = config.get('BAGEL', 'output') 63 | 64 | # at.utils.set_num_threads(NUM_THREADS) 65 | main() 66 | -------------------------------------------------------------------------------- /anomalytransfer/clustering/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from typing import Sequence, Tuple 4 | 5 | 6 | def linear_interpolation(timestamp: Sequence, arrays: Sequence[Sequence]) \ 7 | -> Tuple[np.ndarray, np.ndarray, Sequence[np.ndarray]]: 8 | timestamp = np.asarray(timestamp, np.int64) 9 | if len(timestamp.shape) != 1: 10 | raise ValueError('`timestamp` must be a 1-D array') 11 | 12 | arrays = [np.asarray(array) for array in arrays] 13 | for i, array in enumerate(arrays): 14 | if array.shape != timestamp.shape: 15 | raise ValueError(f'The shape of ``arrays[{i}]`` does not agree with ' 16 | f'the shape of `timestamp` ({array.shape} vs {timestamp.shape})') 17 | 18 | src_index = np.argsort(timestamp) 19 | timestamp_sorted = timestamp[src_index] 20 | intervals = np.unique(np.diff(timestamp_sorted)) 21 | interval = np.min(intervals) 22 | if interval == 0: 23 | raise ValueError('Duplicated values in `timestamp`') 24 | for itv in intervals: 25 | if itv % interval != 0: 26 | raise ValueError('Not all intervals in `timestamp` are multiples of the minimum interval') 27 | 28 | length = (timestamp_sorted[-1] - timestamp_sorted[0]) // interval + 1 29 | ret_timestamp = np.arange(timestamp_sorted[0], timestamp_sorted[-1] + interval, interval, dtype=np.int64) 30 | ret_missing = np.ones([length], dtype=np.int32) 31 | ret_arrays = [np.zeros([length], dtype=array.dtype) for array in arrays] 32 | dst_index = np.asarray((timestamp_sorted - timestamp_sorted[0]) // interval, dtype=np.int64) 33 | ret_missing[dst_index] = 0 34 | miss_index = np.argwhere(ret_missing == 1) 35 | for ret_array, array in zip(ret_arrays, arrays): 36 | ret_array[dst_index] = 
array[src_index] 37 | 38 | for ret_array in ret_arrays: 39 | if len(miss_index) > 0: 40 | neg = miss_index.reshape(len(miss_index)) 41 | pos_index = np.argwhere(ret_missing == 0) 42 | pos = pos_index.reshape(len(pos_index)) 43 | pos_values = ret_array[pos] 44 | neg_values = np.interp(neg, pos, pos_values) 45 | ret_array[neg] = neg_values 46 | 47 | return ret_timestamp, ret_missing, ret_arrays 48 | 49 | 50 | def standardize(values: Sequence, mean: float = None, std: float = None) -> Tuple[np.ndarray, float, float]: 51 | values = np.asarray(values, dtype=np.float32) 52 | if len(values.shape) != 1: 53 | raise ValueError('`values` must be a 1-D array') 54 | if (mean is None) != (std is None): 55 | raise ValueError('`mean` and `std` must be both None or not None') 56 | 57 | if mean is None: 58 | val = values 59 | mean = val.mean() 60 | std = val.std() 61 | 62 | return (values - mean) / std, mean, std 63 | 64 | 65 | def down_sampling(arrays: Sequence[Sequence], step: int) -> Tuple[Sequence, ...]: 66 | ret_arrays = [] 67 | for array in arrays: 68 | array = array[::step] 69 | ret_arrays.append(array) 70 | return tuple(ret_arrays) 71 | -------------------------------------------------------------------------------- /sample/scripts/test_time/test_at.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pandas as pd 3 | from anomalytransfer.transfer.data import KPI 4 | import os 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 6 | import logging 7 | import anomalytransfer as at 8 | import numpy as np 9 | from glob import glob 10 | from typing import Sequence, Tuple, Dict, Optional, cast 11 | np.seterr(divide='ignore', invalid='ignore') 12 | import pandas as pd 13 | 14 | 15 | def main(): 16 | raw_csvs = glob(os.path.join(INPUT, "*.csv"))[0:1] 17 | assert len(raw_csvs) > 0 18 | 19 | time_map = {} 20 | result = {} 21 | for raw_csv in raw_csvs: 22 | for exp in range(10): 23 | print(f"The KPI: {raw_csv}") 24 | raw_kpi_name = os.path.splitext(os.path.basename(raw_csv))[0] 25 | time_map[raw_kpi_name] = 0 26 | raw_kpi = at.utils.load_kpi(raw_csv) 27 | raw_kpi, _, _ = raw_kpi.standardize() 28 | raw_kpi.complete_timestamp() 29 | 30 | # get daily KPI 31 | train_week_day_map, test_week_day_map, test_kpi = raw_kpi.split_days(days=7) 32 | 33 | # get cluster map 34 | cluster_map = {} # weekday -> cluster_name 35 | for cluster in os.listdir(DAILY_OUTPUT): 36 | data_path = os.path.join(DAILY_OUTPUT, cluster, "data") 37 | raw_csv_daily = glob(os.path.join( 38 | data_path, f"{raw_kpi_name}*.csv")) 39 | raw_csv_daily = [int(os.path.splitext(os.path.basename(csv))[ 40 | 0][-1]) for csv in raw_csv_daily] 41 | for daily in raw_csv_daily: 42 | assert daily not in cluster_map 43 | cluster_map[daily] = cluster 44 | 45 | # fine-tune with train_kpi 46 | for weekday, kpi_seq in train_week_day_map.items(): 47 | dst_cluster_name = cluster_map[weekday] 48 | cluster_model_path = os.path.join(MODEL_PATH, dst_cluster_name) 49 | model = at.transfer.models.AnomalyDetector() 50 | if os.path.exists(os.path.join(cluster_model_path, "finetune")): 51 | model.load(cluster_model_path, "finetune") 52 | else: 53 | model.load(cluster_model_path, "base") 54 | 55 | for kpi in kpi_seq: 56 | history = model.fit(kpi, epochs=DATA_EPOCHS, verbose=1) 57 | result[f"ts_{exp}"] = history['ts'] 58 | result[f"loss_{exp}"] = history['loss'] 59 | if len(kpi_seq) > 0: 60 | model.save(cluster_model_path, "finetune") 61 | dt = pd.DataFrame(result) 62 | dt.to_csv("at.csv", index=False) 63 | 64 | 65 | if __name__ 
== '__main__': 66 | logging.basicConfig(level=logging.INFO, 67 | format='[%(asctime)s [%(levelname)s]] %(message)s') 68 | 69 | config = at.utils.config() 70 | CLUSTER_OUTPUT = config.get("CLUSTERING", "output") 71 | DAILY_OUTPUT = os.path.join(CLUSTER_OUTPUT, "daily_cluster") 72 | 73 | INPUT = config.get('BAGEL', 'input') 74 | OUTPUT = config.get('TRANSFER_LEARNING', 'output') 75 | MODEL_PATH = config.get('TRANSFER_LEARNING', 'model_path') 76 | DATA_EPOCHS = config.getint('TRANSFER_LEARNING', 'data_epochs') 77 | 78 | main() 79 | -------------------------------------------------------------------------------- /anomalytransfer/utils/testing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import anomalytransfer as at 4 | 5 | from typing import Sequence, Dict, Tuple, Optional 6 | from sklearn.metrics import precision_recall_curve, precision_recall_fscore_support 7 | 8 | 9 | def adjust_scores(labels: np.ndarray, 10 | scores: np.ndarray, 11 | delay: Optional[int] = None, 12 | inplace: bool = False) -> np.ndarray: 13 | if np.shape(scores) != np.shape(labels): 14 | raise ValueError('`labels` and `scores` must have same shape') 15 | if delay is None: 16 | delay = len(scores) 17 | splits = np.where(labels[1:] != labels[:-1])[0] + 1 18 | is_anomaly = labels[0] == 1 19 | adjusted_scores = np.copy(scores) if not inplace else scores 20 | pos = 0 21 | for part in splits: 22 | if is_anomaly: 23 | ptr = min(pos + delay + 1, part) 24 | adjusted_scores[pos: ptr] = np.max(adjusted_scores[pos: ptr]) 25 | adjusted_scores[ptr: part] = np.maximum(adjusted_scores[ptr: part], adjusted_scores[pos]) 26 | is_anomaly = not is_anomaly 27 | pos = part 28 | part = len(labels) 29 | if is_anomaly: 30 | ptr = min(pos + delay + 1, part) 31 | adjusted_scores[pos: part] = np.max(adjusted_scores[pos: ptr]) 32 | return adjusted_scores 33 | 34 | 35 | def _ignore_missing(series_list: Sequence, missing: np.ndarray) -> Tuple[np.ndarray, ...]: 36 | ret = [] 37 | for series in series_list: 38 | series = np.copy(series) 39 | ret.append(series[missing != 1]) 40 | return tuple(ret) 41 | 42 | 43 | def _best_f1score(labels: np.ndarray, scores: np.ndarray) -> Tuple[float, float, float, float]: 44 | precision, recall, thresholds = precision_recall_curve(y_true=labels, probas_pred=scores, pos_label=1.0) 45 | f1score = 2 * precision * recall / np.clip(precision + recall, a_min=1e-8, a_max=None) 46 | 47 | best_threshold = thresholds[np.argmax(f1score)] 48 | best_precision = precision[np.argmax(f1score)] 49 | best_recall = recall[np.argmax(f1score)] 50 | 51 | return best_threshold, best_precision, best_recall, np.max(f1score) 52 | 53 | 54 | def _f1score_given_alarms(labels: Sequence, alarms: Sequence) -> Tuple[float, float, float, float]: 55 | pred = np.zeros(len(labels)) 56 | pred[alarms] = 1 57 | precision, recall, f1score, _ = precision_recall_fscore_support(y_true=labels, 58 | y_pred=pred, 59 | average='binary', 60 | pos_label=1) 61 | return np.nan, precision, recall, f1score 62 | 63 | 64 | def set_num_threads(num_threads: int): 65 | torch.set_num_threads(num_threads) 66 | 67 | 68 | def get_test_results(labels: np.ndarray, 69 | scores: np.ndarray, 70 | missing: np.ndarray, 71 | window_size: int = 120, 72 | use_spot: bool = False, 73 | **kwargs) -> Dict: 74 | labels = labels[window_size - 1:] 75 | scores = scores[window_size - 1:] 76 | missing = missing[window_size - 1:] 77 | scores = adjust_scores(labels=labels, scores=scores) 78 | adjusted_labels, 
adjusted_scores = _ignore_missing([labels, scores], missing=missing) 79 | 80 | if use_spot: 81 | n_init = 1000 82 | init_data = adjusted_scores[:n_init] 83 | data = adjusted_scores[n_init:] 84 | labels = adjusted_labels[n_init:] 85 | 86 | result = {} 87 | for risk in kwargs.get('risks', [0.0001]): 88 | risk_result = {} 89 | for level in kwargs.get('levels', [0.98]): 90 | threshold, precision, recall, f1score = -1, -1, -1, -1 91 | try: 92 | spot = at.transfer.SPOT(q=risk) 93 | spot.fit(init_data, data) 94 | spot.initialize(level=level) 95 | r = spot.run() 96 | alarms = r['alarms'] 97 | threshold, precision, recall, f1score = _f1score_given_alarms(labels=labels, alarms=alarms) 98 | except Exception: 99 | pass 100 | # import traceback 101 | # traceback.print_exc() 102 | finally: 103 | level_result = { 104 | 'threshold': threshold, 105 | 'precision': precision, 106 | 'recall': recall, 107 | 'f1score': f1score 108 | } 109 | risk_result[f'{level}'] = level_result 110 | result[f'{risk}'] = risk_result 111 | return result 112 | else: 113 | try: 114 | threshold, precision, recall, f1score = _best_f1score(labels=adjusted_labels, scores=adjusted_scores) 115 | return { 116 | 'threshold': threshold, 117 | 'precision': precision, 118 | 'recall': recall, 119 | 'f1score': f1score, 120 | "scores": adjusted_scores, 121 | "labels": adjusted_labels 122 | } 123 | except: 124 | import traceback 125 | traceback.print_exc() 126 | return { 127 | "scores": adjusted_scores, 128 | "labels": adjusted_labels 129 | } 130 | -------------------------------------------------------------------------------- /anomalytransfer/clustering/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import anomalytransfer as at 5 | 6 | from typing import Optional 7 | from torch.backends import cudnn 8 | from torch.utils.data import TensorDataset, DataLoader 9 | 10 | 11 | class Encoder(torch.nn.Module): 12 | 13 | def __init__(self, input_dim: int, hidden_dim: int, latent_dim: int): 14 | super().__init__() 15 | 16 | self._lstm = torch.nn.LSTM(input_dim, hidden_dim) 17 | self._hidden_to_mean = torch.nn.Linear(hidden_dim, latent_dim) 18 | self._hidden_to_log_std = torch.nn.Linear(hidden_dim, latent_dim) 19 | 20 | torch.nn.init.xavier_uniform_(self._hidden_to_mean.weight) 21 | torch.nn.init.xavier_uniform_(self._hidden_to_log_std.weight) 22 | 23 | def forward(self, x): 24 | _, (h_end, c_end) = self._lstm(x) 25 | hidden = h_end[-1, :, :] 26 | self.mean = self._hidden_to_mean(hidden) 27 | self.log_std = self._hidden_to_log_std(hidden) 28 | if self.training: 29 | std = torch.exp(0.5 * self.log_std) 30 | eps = torch.randn_like(std) 31 | return eps.mul_(std).add_(self.mean) 32 | else: 33 | return self.mean 34 | 35 | 36 | class Decoder(torch.nn.Module): 37 | 38 | def __init__(self, 39 | seq_length: int, 40 | latent_dim: int, 41 | hidden_dim: int, 42 | output_dim: int, 43 | batch_size: int, 44 | device: str): 45 | super(Decoder, self).__init__() 46 | 47 | self._lstm = torch.nn.LSTM(1, hidden_dim) 48 | self._latent_to_hidden = torch.nn.Linear(latent_dim, hidden_dim) 49 | self._hidden_to_output = torch.nn.Linear(hidden_dim, output_dim) 50 | 51 | self._model_input = torch.zeros(seq_length, batch_size, 1).to(device) 52 | self._c_0 = torch.zeros(1, batch_size, hidden_dim).to(device) 53 | 54 | torch.nn.init.xavier_uniform_(self._latent_to_hidden.weight) 55 | torch.nn.init.xavier_uniform_(self._hidden_to_output.weight) 56 | 57 | def forward(self, x): 58 | 
hidden = self._latent_to_hidden(x) 59 | h_0 = torch.stack([hidden]) 60 | hidden, _ = self._lstm(self._model_input, (h_0, self._c_0)) 61 | return self._hidden_to_output(hidden) 62 | 63 | 64 | class LatentTransformer(torch.nn.Module): 65 | 66 | def __init__(self, 67 | seq_length: int, 68 | input_dim: int, 69 | hidden_dim: int = 90, 70 | latent_dim: int = 20, 71 | batch_size: int = 32, 72 | max_grad_norm: int = 5, 73 | device: Optional[str] = None): 74 | super().__init__() 75 | 76 | self._batch_size = batch_size 77 | self._max_grad_norm = max_grad_norm 78 | 79 | cudnn.benchmark = True 80 | if device is None: 81 | self._device = 'cuda' if torch.cuda.is_available() else 'cpu' 82 | else: 83 | self._device = device 84 | 85 | self._encoder = Encoder(input_dim=input_dim, hidden_dim=hidden_dim, latent_dim=latent_dim) 86 | self._decoder = Decoder(seq_length=seq_length, latent_dim=latent_dim, hidden_dim=hidden_dim, 87 | output_dim=input_dim, batch_size=self._batch_size, device=self._device) 88 | self._is_fitted = False 89 | self.to(self._device) 90 | 91 | self._optimizer = torch.optim.Adam(self.parameters(), lr=0.001) 92 | self._lr_scheduler = torch.optim.lr_scheduler.StepLR(self._optimizer, step_size=20, gamma=0.9) 93 | self._loss_fn = torch.nn.MSELoss(reduction='sum') 94 | 95 | def forward(self, x): 96 | return self._decoder(self._encoder(x)) 97 | 98 | def _loss(self, x) -> torch.Tensor: 99 | x_recon = self(x) 100 | mean, log_std = self._encoder.mean, self._encoder.log_std 101 | kl_loss = -0.5 * torch.mean(1 + log_std - mean.pow(2) - log_std.exp()) 102 | reconstruction_loss = self._loss_fn(x_recon, x) 103 | return kl_loss + reconstruction_loss 104 | 105 | def fit(self, data: torch.Tensor, epochs: int, verbose=1): 106 | self.train() 107 | dataset = TensorDataset(data.to(self._device)) 108 | train_loader = DataLoader(dataset=dataset, batch_size=self._batch_size, shuffle=True, drop_last=True) 109 | print('Training Epochs') 110 | if verbose: 111 | progbar = at.utils.ProgBar(epochs, interval=0.5, stateful_metrics=['loss'], unit_name='epoch') 112 | for i in range(epochs): 113 | epoch_losses = [] 114 | for x in train_loader: 115 | x = x[0].permute(1, 0, 2) 116 | self._optimizer.zero_grad() 117 | loss = self._loss(x) 118 | loss.backward() 119 | torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=self._max_grad_norm) 120 | self._optimizer.step() 121 | epoch_losses.append(loss) 122 | self._lr_scheduler.step() 123 | epoch_loss = torch.mean(torch.as_tensor(epoch_losses)).numpy() 124 | if verbose: 125 | progbar.add(1, values=[('loss', epoch_loss)]) 126 | self._is_fitted = True 127 | 128 | def transform(self, data: torch.Tensor) -> np.ndarray: 129 | self.eval() 130 | dataset = TensorDataset(data.to(self._device)) 131 | test_loader = DataLoader(dataset=dataset, batch_size=self._batch_size) 132 | print('Transforming Steps') 133 | progbar = at.utils.ProgBar(len(test_loader), interval=0.5) 134 | if self._is_fitted: 135 | with torch.no_grad(): 136 | latent = [] 137 | for x in test_loader: 138 | x = x[0].permute(1, 0, 2) 139 | x = self._encoder(x).cpu().numpy() 140 | latent.append(x) 141 | progbar.add(1) 142 | return np.concatenate(latent, axis=0) 143 | raise RuntimeError('Model needs to be fitted') 144 | 145 | def save(self, path: str): 146 | if self._is_fitted: 147 | torch.save(self.state_dict(), path) 148 | else: 149 | raise RuntimeError('Model needs to be fitted') 150 | 151 | def load(self, path: str): 152 | self._is_fitted = True 153 | self.load_state_dict(torch.load(path)) 154 | 
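A minimal sketch of how `LatentTransformer` is driven, mirroring `sample/scripts/clustering/step4_clustering.py` later in this dump; the array shape, the dummy data, and the epoch count are assumptions for illustration only.

import numpy as np
import torch
import anomalytransfer as at

# One baseline-extracted daily curve per row: shape (num_series, seq_length).
values = np.random.rand(64, 144).astype(np.float32)       # dummy curves for illustration
x = torch.as_tensor(np.expand_dims(values, -1))           # -> (num_series, seq_length, input_dim=1)

model = at.clustering.LatentTransformer(seq_length=x.shape[1], input_dim=x.shape[2])
model.fit(x, epochs=200)                                   # VAE-style training on the curves
latent = model.transform(x)                                # latent vectors, later fed to KMeans for clustering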
-------------------------------------------------------------------------------- /sample/scripts/transfer/transfer_learning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import anomalytransfer as at 4 | 5 | from typing import Sequence, Tuple, Dict, Optional 6 | 7 | import argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--th", type=float) 10 | args = parser.parse_args() 11 | 12 | def make_base_model(kpi: at.transfer.data.KPI, model_path: str, epochs: int): 13 | kpi.complete_timestamp() 14 | kpi, _, _ = kpi.standardize() 15 | at.utils.mkdirs(os.path.join(model_path, kpi.name)) 16 | model = at.transfer.models.AnomalyDetector() 17 | model.fit(kpi=kpi.no_labels(), epochs=epochs) 18 | model.save(path=model_path, name=kpi.name) 19 | 20 | 21 | def train_test(train_kpi: at.transfer.data.KPI, 22 | test_kpi: at.transfer.data.KPI, 23 | epochs: int, 24 | mask: Optional[Sequence] = None, 25 | **kwargs) -> float: 26 | model = at.transfer.models.AnomalyDetector() 27 | if "model_path" in kwargs: 28 | model.load_partial(path=kwargs.get('model_path'), name=kwargs.get('base_kpi').name, mask=mask) 29 | 30 | if mask is not None: 31 | model.freeze(mask) 32 | model.fit(kpi=train_kpi.no_labels(), epochs=epochs) 33 | model.unfreeze(mask) 34 | else: 35 | model.fit(kpi=train_kpi.no_labels(), epochs=epochs) 36 | anomaly_scores = model.predict(test_kpi) 37 | results = at.utils.get_test_results(labels=test_kpi.labels, 38 | scores=anomaly_scores, 39 | missing=test_kpi.missing, 40 | use_spot=True) 41 | results = results['0.0001']['0.98'] 42 | at.utils.log_test_results(name=test_kpi.name, results=results) 43 | return results['f1score'] 44 | 45 | 46 | def transfer_learning(base_kpi: at.transfer.data.KPI, 47 | data_kpi: at.transfer.data.KPI, 48 | train_ratio: float, 49 | model_path: str, 50 | epochs: int) -> Optional[Dict]: 51 | result = {} 52 | progress = at.utils.ProgLog(3, indent=3) 53 | 54 | progress.log(step='Preparing KPI...') 55 | data_kpi.complete_timestamp() 56 | train_kpi, test_kpi, _ = data_kpi.split((train_ratio, 0.3, 0.7 - train_ratio)) 57 | train_kpi, mean, std = train_kpi.standardize() 58 | test_kpi, _, _ = test_kpi.standardize(mean=mean, std=std) 59 | 60 | # Ignore kpi curves that have less than 5 anomalies 61 | if len(test_kpi.values[test_kpi.labels == 1]) < 5: 62 | print('Less than 5 anomalies. 
Skipping...') 63 | return None 64 | 65 | progress.log(step='Training and testing before transfer...') 66 | result['f1score_pre_transfer'] = train_test(train_kpi=train_kpi, 67 | test_kpi=test_kpi, 68 | epochs=epochs) 69 | 70 | progress.log(step='Training and testing after transfer...') 71 | sbd = at.transfer.models.sbd_(base_kpi, data_kpi) 72 | mask = at.transfer.models.find_optimal_mask(sbd, 73 | # threshold=0.3, 74 | threshold=args.th / 10, 75 | less_mask=((1, 1, 1), (1, 1, 1)), 76 | greater_mask=((1, 1, 0), (0, 1, 1)),) 77 | result['f1score_post_transfer'] = train_test(train_kpi=train_kpi, 78 | test_kpi=test_kpi, 79 | epochs=epochs, 80 | mask=mask, 81 | model_path=model_path, 82 | base_kpi=base_kpi) 83 | 84 | return result 85 | 86 | 87 | def cluster_data(path: str) -> Tuple[str, str]: 88 | base = None 89 | data = None 90 | for item in os.listdir(path): 91 | item_path = os.path.join(path, item) 92 | if os.path.isdir(item_path): 93 | data = item_path 94 | else: 95 | base = item_path 96 | if base is None or data is None: 97 | raise ValueError('Base path or data path not found') 98 | return base, data 99 | 100 | 101 | def main(): 102 | at.utils.mkdirs(OUTPUT, MODEL_PATH) 103 | clusters = os.listdir(INPUT) 104 | 105 | cluster_prog = at.utils.ProgLog(len(clusters)) 106 | for cluster in clusters: 107 | cluster_prog.log(cluster=cluster) 108 | 109 | base, data = cluster_data(os.path.join(INPUT, cluster)) 110 | file_list = at.utils.file_list(data) 111 | step_progress = at.utils.ProgLog(2, indent=1) 112 | 113 | step_progress.log(step='Making base model...', cluster=cluster) 114 | base_kpi = at.utils.load_kpi(base) 115 | make_base_model(kpi=base_kpi, model_path=MODEL_PATH, epochs=BASE_EPOCHS) 116 | 117 | step_progress.log(step='Performing transfer learning...', cluster=cluster) 118 | output_path = os.path.join(OUTPUT, f'{cluster}.csv') 119 | with open(output_path, 'w') as output: 120 | output.write('kpi_name,f1score_pre_transfer,f1score_post_transfer\n') 121 | 122 | file_progress = at.utils.ProgLog(len(file_list), indent=2) 123 | for file in file_list: 124 | data_kpi = at.utils.load_kpi(file) 125 | file_progress.log(kpi=data_kpi.name, cluster=cluster) 126 | result = transfer_learning(base_kpi=base_kpi, 127 | data_kpi=data_kpi, 128 | train_ratio=RATIO, 129 | model_path=MODEL_PATH, 130 | epochs=DATA_EPOCHS) 131 | if result is not None: 132 | with open(output_path, 'a') as output: 133 | output.write(f'{data_kpi.name},' 134 | f'{result.get("f1score_pre_transfer"):.3f},' 135 | f'{result.get("f1score_post_transfer"):.3f}\n') 136 | 137 | 138 | if __name__ == '__main__': 139 | logging.basicConfig(level=logging.INFO, format='[%(asctime)s [%(levelname)s]] %(message)s') 140 | 141 | config = at.utils.config() 142 | NUM_THREADS = config.getint('COMMON', 'num_threads') 143 | BASE_EPOCHS = config.getint('TRANSFER_LEARNING', 'base_epochs') 144 | DATA_EPOCHS = config.getint('TRANSFER_LEARNING', 'data_epochs') 145 | INPUT = config.get('TRANSFER_LEARNING', 'input') 146 | OUTPUT = config.get('TRANSFER_LEARNING', 'output') 147 | MODEL_PATH = config.get('TRANSFER_LEARNING', 'model_path') 148 | RATIO = config.getfloat('TRANSFER_LEARNING', 'ratio') 149 | 150 | at.utils.set_num_threads(NUM_THREADS) 151 | main() 152 | -------------------------------------------------------------------------------- /sample/scripts/transfer/cluster_transfer_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | # os.environ["CUDA_VISIBLE_DEVICES"] = "0" 3 | 4 | import logging 5 | import 
anomalytransfer as at 6 | from glob import glob 7 | from utils import run_time 8 | 9 | from typing import Sequence, Tuple, Dict, Optional 10 | 11 | 12 | def make_base_model(kpi: at.transfer.data.KPI, model_path: str, epochs: int): 13 | kpi.complete_timestamp() 14 | kpi, _, _ = kpi.standardize() 15 | at.utils.mkdirs(os.path.join(model_path, kpi.name)) 16 | model = at.transfer.models.AnomalyDetector() 17 | model.fit(kpi=kpi.no_labels(), epochs=epochs, verbose=1) 18 | model.save(path=model_path, name=kpi.name) 19 | 20 | 21 | def train_test(train_kpi: at.transfer.data.KPI, 22 | epochs: int, 23 | test_kpi: at.transfer.data.KPI = None, 24 | mask: Optional[Sequence] = None, 25 | **kwargs) -> float: 26 | model = at.transfer.models.AnomalyDetector() 27 | if mask is not None: 28 | model.load_partial(path=kwargs.get('model_path'), name=kwargs.get('base_kpi').name, mask=mask) 29 | model.freeze(mask) 30 | model.fit(kpi=train_kpi.no_labels(), epochs=epochs, verbose=1) 31 | model.unfreeze(mask) 32 | model.fit(kpi=train_kpi.no_labels(), epochs=epochs, verbose=1) 33 | if test_kpi is not None and test_kpi.labels is not None: 34 | anomaly_scores = model.predict(test_kpi, verbose=1) 35 | results = at.utils.get_test_results(labels=test_kpi.labels, 36 | scores=anomaly_scores, 37 | missing=test_kpi.missing, 38 | use_spot=False) 39 | at.utils.log_test_results(name=test_kpi.name, results=results) 40 | return results['f1score'] 41 | else: 42 | return None 43 | 44 | def transfer_learning(base_kpi: at.transfer.data.KPI, 45 | data_kpi: at.transfer.data.KPI, 46 | train_ratio: float, 47 | model_path: str, 48 | epochs: int, 49 | TH: int) -> Optional[Dict]: 50 | result = {} 51 | progress = at.utils.ProgLog(3, indent=3) 52 | 53 | progress.log(step='Preparing KPI...') 54 | data_kpi.complete_timestamp() 55 | train_kpi = data_kpi 56 | train_kpi, mean, std = train_kpi.standardize() 57 | 58 | progress.log(step='Training and testing before transfer...') 59 | result['f1score_pre_transfer'] = train_test(train_kpi=train_kpi, 60 | epochs=epochs) 61 | 62 | progress.log(step='Training and testing after transfer...') 63 | sbd = at.transfer.models.sbd_(base_kpi, data_kpi) 64 | mask = at.transfer.models.find_optimal_mask(sbd, 65 | threshold=args.th / 10, 66 | # threshold=0.3, 67 | less_mask=((1, 1, 1), (1, 1, 1)), 68 | greater_mask=((1, 1, 0), (0, 1, 1))) 69 | result['f1score_post_transfer'] = train_test(train_kpi=train_kpi, 70 | epochs=epochs, 71 | mask=mask, 72 | model_path=model_path, 73 | base_kpi=base_kpi) 74 | 75 | return result 76 | 77 | 78 | def cluster_data(path: str) -> Tuple[str, str]: 79 | base = None 80 | data = None 81 | for item in os.listdir(path): 82 | item_path = os.path.join(path, item) 83 | if os.path.isdir(item_path): 84 | data = item_path 85 | else: 86 | base = item_path 87 | if base is None or data is None: 88 | raise ValueError('Base path or data path not found') 89 | return base, data 90 | 91 | 92 | def main(TH: int): 93 | at.utils.mkdirs(OUTPUT, MODEL_PATH) 94 | clusters = os.listdir(INPUT) 95 | 96 | cluster_prog = at.utils.ProgLog(len(clusters)) 97 | time_map = {} 98 | for cluster in clusters: 99 | cluster_model_path = os.path.join(MODEL_PATH, str(TH), cluster) 100 | cluster_name = os.path.basename(cluster_model_path) 101 | time_map[cluster_name] = 0 102 | cluster_prog.log(cluster=cluster) 103 | 104 | base, data = cluster_data(os.path.join(INPUT, cluster)) 105 | file_list = at.utils.file_list(data) 106 | step_progress = at.utils.ProgLog(2, indent=1) 107 | 108 | step_progress.log(step='Making base model...', 
cluster=cluster) 109 | base_kpi = at.utils.load_kpi(base) 110 | make_base_model(kpi=base_kpi, model_path=cluster_model_path, epochs=BASE_EPOCHS) 111 | 112 | # step_progress.log(step='Performing transfer learning...', cluster=cluster) 113 | # output_path = os.path.join(OUTPUT, f'{cluster}.csv') 114 | # with open(output_path, 'w') as output: 115 | # output.write('kpi_name,f1score_pre_transfer,f1score_post_transfer\n') 116 | 117 | # file_progress = at.utils.ProgLog(len(file_list), indent=2) 118 | # for file in file_list: 119 | # data_kpi = at.utils.load_kpi(file) 120 | # file_progress.log(kpi=data_kpi.name, cluster=cluster) 121 | # with run_time() as t: 122 | # result = transfer_learning(base_kpi=base_kpi, 123 | # data_kpi=data_kpi, 124 | # train_ratio=RATIO, 125 | # model_path=cluster_model_path, 126 | # epochs=DATA_EPOCHS) 127 | # time_map[cluster_name] += t.get_time() 128 | # if result is not None and \ 129 | # result.get("f1score_pre_transfer") is not None and \ 130 | # result.get("f1score_post_transfer") is not None: 131 | # with open(output_path, 'a') as output: 132 | # output.write(f'{data_kpi.name},' 133 | # f'{result.get("f1score_pre_transfer"):.3f},' 134 | # f'{result.get("f1score_post_transfer"):.3f}\n') 135 | # import json 136 | # json.dump(time_map, open("train_cluster_time.json", "w"), indent=4) 137 | 138 | if __name__ == '__main__': 139 | logging.basicConfig(level=logging.INFO, format='[%(asctime)s [%(levelname)s]] %(message)s') 140 | 141 | config = at.utils.config() 142 | CLUSTER_OUTPUT = config.get("CLUSTERING", "output") 143 | BASE_EPOCHS = config.getint('TRANSFER_LEARNING', 'base_epochs') 144 | DATA_EPOCHS = config.getint('TRANSFER_LEARNING', 'data_epochs') 145 | INPUT = config.get('TRANSFER_LEARNING', 'input') 146 | OUTPUT = config.get('TRANSFER_LEARNING', 'output') 147 | MODEL_PATH = config.get('TRANSFER_LEARNING', 'model_path') 148 | RATIO = config.getfloat('TRANSFER_LEARNING', 'ratio') 149 | 150 | for th in range(0, 21, 1): 151 | main(th) 152 | -------------------------------------------------------------------------------- /sample/scripts/transfer/cluster_transfer_test.py: -------------------------------------------------------------------------------- 1 | from sample.scripts.transfer.utils import run_time 2 | import torch 3 | import pandas as pd 4 | from anomalytransfer.transfer.data import KPI 5 | import os 6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 7 | import logging 8 | import anomalytransfer as at 9 | import numpy as np 10 | from glob import glob 11 | from typing import Sequence, Tuple, Dict, Optional, cast 12 | np.seterr(divide='ignore', invalid='ignore') 13 | 14 | 15 | def _ignore_missing(series_list: Sequence, missing: np.ndarray) -> Tuple[np.ndarray, ...]: 16 | ret = [] 17 | for series in series_list: 18 | series = np.copy(series) 19 | ret.append(series[missing != 1]) 20 | return tuple(ret) 21 | 22 | 23 | def get_test_results( 24 | timestamps: np.ndarray, 25 | labels: np.ndarray, 26 | scores: np.ndarray, 27 | missing: np.ndarray, 28 | values: np.ndarray, 29 | window_size: int = 120, 30 | **kwargs) -> Dict: 31 | timestamps = timestamps[window_size - 1:] 32 | labels = labels[window_size - 1:] 33 | scores = scores[window_size - 1:] 34 | missing = missing[window_size - 1:] 35 | values = values[window_size - 1:] 36 | adjusted_timestamps, adjusted_labels, adjusted_scores, adjusted_values = _ignore_missing( 37 | [timestamps, labels, scores, values], missing=missing) 38 | 39 | return { 40 | "timestamp": adjusted_timestamps, 41 | "scores": adjusted_scores, 42 | 
"labels": adjusted_labels, 43 | "values": adjusted_values 44 | } 45 | 46 | 47 | def main(TH: int): 48 | raw_csvs = glob(os.path.join(INPUT, "*.csv")) 49 | assert len(raw_csvs) > 0 50 | 51 | models = glob(os.path.join(MODEL_PATH, str(TH), "cluster-*")) 52 | assert len(models) > 0 53 | 54 | time_map = {} 55 | for raw_csv in raw_csvs: 56 | print(f"The KPI: {raw_csv}") 57 | raw_kpi_name = os.path.splitext(os.path.basename(raw_csv))[0] 58 | time_map[raw_kpi_name] = 0 59 | raw_kpi = at.utils.load_kpi(raw_csv) 60 | raw_kpi, _, _ = raw_kpi.standardize() 61 | raw_kpi.complete_timestamp() 62 | 63 | total_timestamps = [] 64 | total_scores = [] 65 | total_labels = [] 66 | total_values = [] 67 | 68 | # get daily KPI 69 | train_week_day_map, test_week_day_map, test_kpi = raw_kpi.split_days(days=7) 70 | 71 | # get cluster map 72 | cluster_map = {} # weekday -> cluster_name 73 | for cluster in os.listdir(DAILY_OUTPUT): 74 | data_path = os.path.join(DAILY_OUTPUT, cluster, "data") 75 | raw_csv_daily = glob(os.path.join( 76 | data_path, f"{raw_kpi_name}*.csv")) 77 | raw_csv_daily = [int(os.path.splitext(os.path.basename(csv))[ 78 | 0][-1]) for csv in raw_csv_daily] 79 | for daily in raw_csv_daily: 80 | assert daily not in cluster_map 81 | cluster_map[daily] = cluster 82 | 83 | # fine-tune with train_kpi 84 | for weekday, kpi_seq in train_week_day_map.items(): 85 | dst_cluster_name = cluster_map[weekday] 86 | cluster_model_path = os.path.join(MODEL_PATH, str(TH), dst_cluster_name) 87 | model = at.transfer.models.AnomalyDetector() 88 | if os.path.exists(os.path.join(cluster_model_path, "finetune")): 89 | model.load(cluster_model_path, "finetune") 90 | else: 91 | model.load(cluster_model_path, "base") 92 | 93 | for kpi in kpi_seq: 94 | with run_time() as t: 95 | model.fit(kpi, epochs=DATA_EPOCHS, verbose=1) 96 | time_map[raw_kpi_name] += t.get_time() 97 | if len(kpi_seq) > 0: 98 | model.save(cluster_model_path, "finetune") 99 | 100 | # test 101 | for weekday, kpi_seq in test_week_day_map.items(): 102 | dst_cluster_name = cluster_map[weekday] 103 | cluster_model_path = os.path.join(MODEL_PATH, str(TH), dst_cluster_name) 104 | assert os.path.exists(os.path.join( 105 | cluster_model_path, "finetune")), f"the train stage of {dst_cluster_name} is missed..." 
106 | 107 | model = at.transfer.models.AnomalyDetector() 108 | model.load(cluster_model_path, "finetune") 109 | for kpi in kpi_seq: 110 | kpi = cast(KPI, kpi) 111 | anomaly_scores = model.predict(kpi, verbose=1) 112 | try: 113 | results = get_test_results( 114 | timestamps=kpi.timestamps, 115 | labels=kpi.labels, 116 | scores=anomaly_scores, 117 | missing=kpi.missing, 118 | values=kpi.values 119 | ) 120 | # results = results['0.0001']['0.98'] 121 | 122 | total_timestamps.extend(results["timestamp"]) 123 | total_scores.extend(results["scores"]) 124 | total_labels.extend(results["labels"]) 125 | total_values.extend(results["values"]) 126 | except: 127 | import traceback 128 | traceback.print_exc() 129 | exit(-1) 130 | 131 | total_timestamps = np.asarray(total_timestamps) 132 | total_scores = np.asarray(total_scores) 133 | total_labels = np.asarray(total_labels) 134 | total_values = np.asarray(total_values) 135 | 136 | sort_idx = np.argsort(total_timestamps) 137 | total_timestamps = total_timestamps[sort_idx] 138 | total_scores = total_scores[sort_idx] 139 | total_values = total_values[sort_idx] 140 | total_labels = total_labels[sort_idx] 141 | 142 | # # adjust after concatenate 143 | adjusted_scores = at.utils.adjust_scores( 144 | labels=total_labels, scores=total_scores) 145 | 146 | dt = pd.DataFrame({ 147 | "ts": total_timestamps, 148 | "scores": adjusted_scores, 149 | "values": total_values, 150 | "label": total_labels, 151 | }) 152 | # if not os.path.exists(os.path.join(OUTPUT, "transfer")): 153 | # os.makedirs(os.path.join(OUTPUT, "transfer"), exist_ok=True) 154 | if not os.path.exists(os.path.join(OUTPUT, f"transfer_{TH / 10}")): 155 | os.makedirs(os.path.join(OUTPUT, f"transfer_{TH / 10}")) 156 | dt.to_csv(os.path.join(OUTPUT, f"transfer_{TH / 10}", 157 | f"{raw_kpi_name}.csv"), index=False) 158 | 159 | import json 160 | json.dump(time_map, open("test_time.json", "w"), indent=4) 161 | 162 | 163 | if __name__ == '__main__': 164 | logging.basicConfig(level=logging.INFO, 165 | format='[%(asctime)s [%(levelname)s]] %(message)s') 166 | 167 | config = at.utils.config() 168 | CLUSTER_OUTPUT = config.get("CLUSTERING", "output") 169 | DAILY_OUTPUT = os.path.join(CLUSTER_OUTPUT, "daily_cluster") 170 | 171 | INPUT = config.get('BAGEL', 'input') 172 | OUTPUT = config.get('TRANSFER_LEARNING', 'output') 173 | MODEL_PATH = config.get('TRANSFER_LEARNING', 'model_path') 174 | DATA_EPOCHS = config.getint('TRANSFER_LEARNING', 'data_epochs') 175 | 176 | for th in range(18, 21, 1): 177 | main(th) 178 | -------------------------------------------------------------------------------- /sample/scripts/clustering/step4_clustering.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import KMeans 2 | import torch 3 | import pandas as pd 4 | import numpy as np 5 | from typing import Sequence, Tuple 6 | import logging 7 | import anomalytransfer as at 8 | import os 9 | import shutil 10 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 11 | 12 | 13 | def _load_data(path: str) -> Tuple[np.ndarray, Sequence]: 14 | file_list = at.utils.file_list(path) 15 | progbar = at.utils.ProgBar(len(file_list), interval=0.5, unit_name='file') 16 | values = [] 17 | names = [] 18 | for file in file_list: 19 | filename = at.utils.filename(file) 20 | names.append(filename) 21 | values.append(at.clustering.preprocessing.down_sampling( 22 | [pd.read_csv(file).value.to_numpy()], step=DOWN_SAMPLING_STEP)[0]) 23 | progbar.add(1) 24 | values = np.expand_dims(np.asarray(values), 
-1).astype(np.float32) 25 | return values, names 26 | 27 | 28 | def _get_latent_vectors(x: np.ndarray) -> np.ndarray: 29 | x = torch.as_tensor(x) 30 | seq_length = x.shape[1] 31 | input_dim = x.shape[2] 32 | 33 | model = at.clustering.LatentTransformer( 34 | seq_length=seq_length, input_dim=input_dim) 35 | model.fit(x, epochs=EPOCHS) 36 | model.save(os.path.join(OUTPUT, 'model.pt')) 37 | return model.transform(x) 38 | 39 | 40 | def _get_clustering_result(labels: Sequence, names: Sequence) -> Tuple[Sequence, Sequence]: 41 | class_count = {} 42 | base_names = [] 43 | classes = [] 44 | 45 | for i in range(len(names)): 46 | base_name = names[i][:-4] 47 | if base_name not in class_count.keys(): 48 | class_count[base_name] = [0] * N_CLUSTERS 49 | class_count[base_name][labels[i]] += 1 50 | 51 | for k, v in class_count.items(): 52 | base_names.append(k) 53 | classes.append(np.argmax(v)) 54 | 55 | return base_names, classes 56 | 57 | 58 | def _sse_get_best_cluster_num(latent): 59 | if not os.path.exists("SSE (best cluster num).png"): 60 | distance_centroid = [] 61 | max_clusters = 50 62 | for i in range(1, max_clusters): 63 | km = KMeans(n_clusters=i) 64 | km.fit(latent) 65 | distance_centroid.append(km.inertia_) 66 | 67 | import matplotlib.pyplot as plt 68 | plt.figure() 69 | plt.plot(range(1, max_clusters), distance_centroid, marker="o") 70 | plt.xlabel("The num of clusters") 71 | plt.ylabel("SSE") 72 | plt.savefig("SSE (best cluster num).png") 73 | 74 | 75 | def _get_distance_centroid(features: np.ndarray, centroid: np.ndarray, labels: np.ndarray): 76 | """ 77 | return: (N_samples, order_in_each_cluster) 78 | """ 79 | distance_order = np.zeros([features.shape[0]]) 80 | for label in range(centroid.shape[0]): 81 | center: np.ndarray = centroid[label] 82 | feature_idx = np.where(labels == label)[0] 83 | feature_with_label: np.ndarray = features[feature_idx] 84 | distance = np.sqrt( 85 | np.power((feature_with_label-center), 2).sum(axis=1)) 86 | order_idx = np.argsort(distance) 87 | order = np.zeros_like(order_idx) 88 | order[order_idx] = range(1, order.shape[0]+1) 89 | distance_order[feature_idx] = order 90 | return distance_order 91 | 92 | 93 | def _save_top_k_daily_kpi(order: np.ndarray, labels: np.ndarray, names: Sequence): 94 | """ 95 | save the KPIs from the average stage 96 | """ 97 | # save the entrire daily-kpi cluster result 98 | output_root = os.path.join(OUTPUT, "daily_cluster") 99 | if not os.path.exists(output_root): 100 | os.makedirs(output_root, exist_ok=True) 101 | for i, (label, name) in enumerate(zip(labels, names)): 102 | save_path = os.path.join(output_root, f"cluster-{label}", "data") 103 | if not os.path.exists(save_path): 104 | os.makedirs(save_path, exist_ok=True) 105 | save_file = os.path.join(save_path, f"{name}.csv") 106 | src_file = os.path.join(AVERAGE_OUTPUT, f"{name}.csv") 107 | shutil.copyfile(src_file, save_file) 108 | 109 | top_k_idx = np.where(order <= TOP_K)[0] 110 | 111 | top_k_labels = labels[top_k_idx] 112 | top_k_name = np.asarray(names)[top_k_idx] 113 | top_k_order = order[top_k_idx] 114 | 115 | output_root = os.path.join(OUTPUT, "top_k_daily_cluster") 116 | if not os.path.exists(output_root): 117 | os.makedirs(output_root, exist_ok=True) 118 | 119 | label_based = set() 120 | for i, (label, name, order) in enumerate(zip(top_k_labels, top_k_name, top_k_order)): 121 | if order == 1 and label not in label_based: # generate base 122 | label_based.add(label) 123 | save_path = os.path.join(output_root, f"cluster-{label}") 124 | if not os.path.exists(save_path): 
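                # The daily sample ranked closest to its cluster centroid (order == 1) is
                # copied out as base.csv and becomes that cluster's base KPI for transfer.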
125 | os.makedirs(save_path, exist_ok=True) 126 | save_file = os.path.join(save_path, "base.csv") 127 | else: 128 | save_path = os.path.join(output_root, f"cluster-{label}", "data") 129 | if not os.path.exists(save_path): 130 | os.makedirs(save_path, exist_ok=True) 131 | save_file = os.path.join(save_path, f"{name}.csv") 132 | 133 | src_file = os.path.join(AVERAGE_OUTPUT, f"{name}.csv") 134 | shutil.copyfile(src_file, save_file) 135 | 136 | 137 | def _save_base_kpi(base_names: Sequence, classes: Sequence): 138 | """ 139 | save the KPIs from the preprocess stage 140 | """ 141 | output_root = os.path.join(OUTPUT, "base_cluster") 142 | if not os.path.exists(output_root): 143 | os.makedirs(output_root, exist_ok=True) 144 | 145 | tag = np.zeros([len(classes)]) 146 | for base, cls in zip(base_names, classes): 147 | if tag[cls] == 0: # generate base 148 | save_root = os.path.join(output_root, f"cluster-{cls}") 149 | save_file = os.path.join(save_root, "base.csv") 150 | tag[cls] = 1 151 | else: 152 | save_root = os.path.join(output_root, f"cluster-{cls}", "data") 153 | save_file = os.path.join(save_root, f"{base}.csv") 154 | if not os.path.exists(save_root): 155 | os.makedirs(save_root, exist_ok=True) 156 | src_file = os.path.join(RAW_INPUT, f"{base}.csv") 157 | shutil.copy(src_file, save_file) 158 | 159 | 160 | def main(): 161 | at.utils.mkdirs(OUTPUT) 162 | step_progress = at.utils.ProgLog(4) 163 | 164 | step_progress.log(step='Preparing data...') 165 | values, names = _load_data(INPUT) 166 | 167 | step_progress.log(step='Getting latent vectors...') 168 | latent = _get_latent_vectors(values) # (n_samples, 20) 169 | 170 | step_progress.log( 171 | step='Performing K-means clustering on latent vectors...') 172 | 173 | _sse_get_best_cluster_num(latent) 174 | 175 | k_means = KMeans(n_clusters=N_CLUSTERS) 176 | k_means.fit(latent) 177 | 178 | # get the distance of samples to their closest cluster center 179 | labels = k_means.labels_ # (n_samples, ) 180 | cluster_centers = k_means.cluster_centers_ # (n_clusters, n_features) 181 | order = _get_distance_centroid(latent, cluster_centers, labels) 182 | 183 | # get TOP K kpi (with the shortest distance from centroid) 184 | _save_top_k_daily_kpi(order, labels, names) 185 | 186 | step_progress.log(step='Computing clustering result...') 187 | base_names, classes = _get_clustering_result(labels=labels, names=names) 188 | _save_base_kpi(base_names, classes) 189 | df = pd.DataFrame({'name': base_names, 'cluster': classes}) 190 | df.to_csv(os.path.join(OUTPUT, 'result.csv'), index=False) 191 | 192 | 193 | if __name__ == '__main__': 194 | logging.basicConfig(level=logging.DEBUG, 195 | format='[%(asctime)s [%(levelname)s]] %(message)s') 196 | config = at.utils.config() 197 | 198 | INPUT = config.get('CLUSTERING', 'input') 199 | OUTPUT = config.get('CLUSTERING', 'output') 200 | EPOCHS = config.getint('CLUSTERING', 'epochs') 201 | AVERAGE_OUTPUT = config.get("CLUSTERING_AVERAGE", "output_daily") 202 | RAW_INPUT = config.get("CLUSTERING_PREPROCESSING", "input") 203 | DOWN_SAMPLING_STEP = config.getint( 204 | 'CLUSTERING_PREPROCESSING', 'down_sampling_step') 205 | try: 206 | N_CLUSTERS = config.getint('CLUSTERING', 'n_clusters') 207 | except: 208 | # see `"SSE (best cluster num).png"` to set best cluster number. 
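        # Fallback when CLUSTERING/n_clusters is absent from the config: run once with a
        # single cluster, pick the elbow of the SSE curve written by
        # _sse_get_best_cluster_num, then set n_clusters and rerun.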
209 | N_CLUSTERS = 1 210 | 211 | TOP_K = 50 212 | main() 213 | -------------------------------------------------------------------------------- /sample/scripts/transfer_entirely/finetune.py: -------------------------------------------------------------------------------- 1 | import os 2 | from multiprocessing import Pool 3 | 4 | from torch.cuda import is_available 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 6 | 7 | import logging 8 | import anomalytransfer as at 9 | from glob import glob 10 | from utils import run_time 11 | 12 | from typing import Sequence, Tuple, Dict, Optional 13 | import pandas as pd 14 | import numpy as np 15 | import torch 16 | from sklearn.metrics import precision_recall_curve 17 | from tqdm import tqdm 18 | 19 | logging.basicConfig(level=logging.INFO, format='[%(asctime)s [%(levelname)s]] %(message)s') 20 | 21 | config = at.utils.config() 22 | CLUSTER_OUTPUT = config.get("CLUSTERING", "output") 23 | EPOCHS = config.getint("CLUSTERING", "epochs") 24 | BASE_EPOCHS = config.getint('TRANSFER_LEARNING', 'base_epochs') 25 | DATA_EPOCHS = config.getint('TRANSFER_LEARNING', 'data_epochs') 26 | INPUT = config.get('TRANSFER_LEARNING', 'input') 27 | OUTPUT = config.get('TRANSFER_LEARNING', 'output') 28 | MODEL_PATH = config.get('TRANSFER_LEARNING', 'model_path') 29 | RATIO = config.getfloat('TRANSFER_LEARNING', 'ratio') 30 | 31 | RAW_INPUT = config.get("CLUSTERING_PREPROCESSING", "input") 32 | 33 | 34 | def _get_latent_vectors(x: np.ndarray) -> np.ndarray: 35 | x = torch.as_tensor(x) 36 | seq_length = x.shape[1] 37 | input_dim = x.shape[2] 38 | 39 | model = at.clustering.LatentTransformer( 40 | seq_length=seq_length, input_dim=input_dim) 41 | model.fit(x, epochs=EPOCHS, verbose=0) 42 | model.save(os.path.join(OUTPUT, 'model.pt')) 43 | return model.transform(x) 44 | 45 | 46 | def cluster_data(path: str) -> Tuple[str, str]: 47 | base = None 48 | data = None 49 | for item in os.listdir(path): 50 | item_path = os.path.join(path, item) 51 | if os.path.isdir(item_path): 52 | data = item_path 53 | else: 54 | base = item_path 55 | if base is None or data is None: 56 | raise ValueError('Base path or data path not found') 57 | return base, data 58 | 59 | def make_base_model(kpi: at.transfer.data.KPI, epochs: int): 60 | kpi.complete_timestamp() 61 | kpi, _, _ = kpi.standardize() 62 | model = at.transfer.models.AnomalyDetector() 63 | model.fit(kpi=kpi.no_labels(), epochs=epochs, verbose=0) 64 | return model 65 | 66 | def train_test(train_kpi: at.transfer.data.KPI, 67 | epochs: int, 68 | test_kpi: at.transfer.data.KPI = None, 69 | mask: Optional[Sequence] = None, 70 | **kwargs) -> float: 71 | model = at.transfer.models.AnomalyDetector() 72 | if mask is not None: 73 | model.load_partial(path=kwargs.get('model_path'), name=kwargs.get('base_kpi').name, mask=mask) 74 | model.freeze(mask) 75 | model.fit(kpi=train_kpi.no_labels(), epochs=epochs, verbose=0) 76 | model.unfreeze(mask) 77 | model.fit(kpi=train_kpi.no_labels(), epochs=epochs, verbose=0) 78 | if test_kpi is not None and test_kpi.labels is not None: 79 | anomaly_scores = model.predict(test_kpi, verbose=0) 80 | results = at.utils.get_test_results(labels=test_kpi.labels, 81 | scores=anomaly_scores, 82 | missing=test_kpi.missing, 83 | use_spot=False) 84 | at.utils.log_test_results(name=test_kpi.name, results=results) 85 | return results['f1score'] 86 | else: 87 | return None 88 | 89 | 90 | def _ignore_missing(series_list: Sequence, missing: np.ndarray) -> Tuple[np.ndarray, ...]: 91 | ret = [] 92 | for series in series_list: 93 | series = 
np.copy(series) 94 | ret.append(series[missing != 1]) 95 | return tuple(ret) 96 | 97 | 98 | def get_test_results( 99 | timestamps: np.ndarray, 100 | labels: np.ndarray, 101 | scores: np.ndarray, 102 | missing: np.ndarray, 103 | values: np.ndarray, 104 | window_size: int = 120, 105 | **kwargs) -> Dict: 106 | timestamps = timestamps[window_size - 1:] 107 | labels = labels[window_size - 1:] 108 | scores = scores[window_size - 1:] 109 | missing = missing[window_size - 1:] 110 | values = values[window_size - 1:] 111 | adjusted_timestamps, adjusted_labels, adjusted_scores, adjusted_values = _ignore_missing( 112 | [timestamps, labels, scores, values], missing=missing 113 | ) 114 | 115 | adjusted_scores = at.utils.adjust_scores( 116 | labels=adjusted_labels, scores=adjusted_scores) 117 | precision, recall, th = precision_recall_curve(adjusted_labels, adjusted_scores, pos_label=1) 118 | 119 | f1_score = 2 * precision * recall / (precision + recall + 1e-6) 120 | 121 | arg_max = np.argmax(f1_score) 122 | 123 | best_precision, best_recall, best_f1_score = precision[arg_max], recall[arg_max], f1_score[arg_max] 124 | threshold = th[arg_max] 125 | return best_f1_score 126 | 127 | 128 | def main(finetune_num=200): 129 | print(finetune_num) 130 | # with torch.cuda.device(torch.device(f"cuda:{finetune_num//200%2}")): 131 | clusters = os.listdir(INPUT) 132 | base_values = [] 133 | base_models = [] 134 | for cluster in tqdm(clusters, total=len(clusters)): 135 | base, data = cluster_data(os.path.join(INPUT, cluster)) 136 | base_kpi = at.utils.load_kpi(base) 137 | base_kpi.complete_timestamp() 138 | base_kpi, _, _ = base_kpi.standardize() 139 | base_model = make_base_model(base_kpi, BASE_EPOCHS) 140 | base_models.append(base_model) 141 | 142 | dt = pd.read_csv(base) 143 | base_values.append(dt["value"]) 144 | 145 | file_list = at.utils.file_list(RAW_INPUT) 146 | cluster_values = [] 147 | finetune_values = [] 148 | test_kpis = [] 149 | names = [] 150 | for file in file_list: 151 | data_kpi = at.utils.load_kpi(file) 152 | data_kpi.complete_timestamp() 153 | data_kpi, _, _ = data_kpi.standardize() 154 | filename = at.utils.filename(file) 155 | names.append(filename) 156 | 157 | # split idx 158 | ts = data_kpi.timestamps 159 | ts = ts % (60 * 60 * 24) 160 | split_idx = np.where(ts <= 60)[0] 161 | _, data_kpi = data_kpi.split_by_idx(split_idx[0], window_size=1) 162 | 163 | # split to [for cluster] and [for finetune] 164 | ts = data_kpi.timestamps 165 | ts = ts % (60 * 60 * 24) 166 | split_idx = np.where(ts <= 60)[0] 167 | cluster_value, finetune_value = data_kpi.split_by_idx(split_idx[1], window_size=1) 168 | finetune_value, test_value = finetune_value.split_by_idx(finetune_num, window_size=1) 169 | 170 | cluster_values.append(cluster_value.values) 171 | finetune_values.append(finetune_value) 172 | test_kpis.append(test_value) 173 | 174 | 175 | # get latent var 176 | base_values = np.asarray(base_values, dtype=np.float32)[..., None] 177 | base_feature = _get_latent_vectors(base_values) 178 | 179 | cluster_values = np.asarray(cluster_values, dtype=np.float32)[..., None] 180 | cluster_feature = _get_latent_vectors(cluster_values) 181 | 182 | tmp_result = {name: 0 for name in names} 183 | tmp_result["num_of_points"] = finetune_num 184 | for i, (ft, finetune, test_kpi, name) in enumerate(zip(cluster_feature, finetune_values, test_kpis, names)): 185 | cluster_idx = np.argmin(np.sum((ft - base_feature)**2, axis=1)) 186 | base_model = base_models[cluster_idx] 187 | # base_model.fit(kpi=finetune.no_labels(), 
epochs=DATA_EPOCHS, verbose=0) 188 | anomaly_scores = base_model.predict(test_kpi, verbose=1) 189 | f1_score = get_test_results( 190 | timestamps=test_kpi.timestamps, 191 | labels=test_kpi.labels, 192 | scores=anomaly_scores, 193 | missing=test_kpi.missing, 194 | values=test_kpi.values 195 | ) 196 | tmp_result[name] = f1_score 197 | print(f"{i} - {name}") 198 | 199 | return tmp_result 200 | 201 | if __name__ == '__main__': 202 | 203 | # for num in range(200, 5000, 200): 204 | # main(num) 205 | with Pool(1) as pool: 206 | results = pool.map(main, range(200, 201, 200)) 207 | # results = pool.map(main, range(200, 201, 200)) 208 | final_result = pd.DataFrame(columns=list(results[0].keys())) 209 | for res in results: 210 | final_result = final_result.append(res, ignore_index=True) 211 | 212 | final_result = final_result.sort_values("num_of_points") 213 | final_result.to_csv("result.csv", index=False) 214 | -------------------------------------------------------------------------------- /anomalytransfer/utils/logging.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import logging 5 | import numpy as np 6 | 7 | from typing import Dict 8 | 9 | 10 | class ProgLog: 11 | 12 | def __init__(self, n: int, indent: int = 0): 13 | self._n = n 14 | self._indent = indent 15 | self._current = 1 16 | 17 | def log(self, **extra): 18 | message = '' 19 | for k, v in extra.items(): 20 | message += f' {k}={v}' 21 | logging.info(f'{"-" * (2 * self._indent - 1) + " " if self._indent > 0 else ""}' 22 | f'Progress >>> {self._current}/{self._n}{message}') 23 | self._current += 1 24 | 25 | 26 | class ProgBar: 27 | """Displays a progress bar. 28 | 29 | Arguments: 30 | target: Total number of steps expected, None if unknown. 31 | width: Progress bar width on screen. 32 | verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose) 33 | stateful_metrics: Iterable of string names of metrics that should *not* be 34 | averaged over time. Metrics in this list will be displayed as-is. All 35 | others will be averaged by the progbar before display. 36 | interval: Minimum visual progress update interval (in seconds). 37 | unit_name: Display name for step counts (usually "step" or "sample"). 38 | """ 39 | 40 | def __init__(self, 41 | target, 42 | width=30, 43 | verbose=1, 44 | interval=0.05, 45 | stateful_metrics=None, 46 | unit_name='step'): 47 | self.target = target 48 | self.width = width 49 | self.verbose = verbose 50 | self.interval = interval 51 | self.unit_name = unit_name 52 | if stateful_metrics: 53 | self.stateful_metrics = set(stateful_metrics) 54 | else: 55 | self.stateful_metrics = set() 56 | 57 | self._dynamic_display = ((hasattr(sys.stdout, 'isatty') and 58 | sys.stdout.isatty()) or 59 | 'ipykernel' in sys.modules or 60 | 'posix' in sys.modules or 61 | 'PYCHARM_HOSTED' in os.environ) 62 | self._total_width = 0 63 | self._seen_so_far = 0 64 | # We use a dict + list to avoid garbage collection 65 | # issues found in OrderedDict 66 | self._values = {} 67 | self._values_order = [] 68 | self._start = time.time() 69 | self._last_update = 0 70 | 71 | def update(self, current, values=None, finalize=None): 72 | """Updates the progress bar. 73 | 74 | Arguments: 75 | current: Index of current step. 76 | values: List of tuples: `(name, value_for_last_step)`. If `name` is in 77 | `stateful_metrics`, `value_for_last_step` will be displayed as-is. 78 | Else, an average of the metric over time will be displayed. 
79 | finalize: Whether this is the last update for the progress bar. If 80 | `None`, defaults to `current >= self.target`. 81 | """ 82 | if finalize is None: 83 | if self.target is None: 84 | finalize = False 85 | else: 86 | finalize = current >= self.target 87 | 88 | values = values or [] 89 | for k, v in values: 90 | if k not in self._values_order: 91 | self._values_order.append(k) 92 | if k not in self.stateful_metrics: 93 | # In the case that progress bar doesn't have a target value in the first 94 | # epoch, both on_batch_end and on_epoch_end will be called, which will 95 | # cause 'current' and 'self._seen_so_far' to have the same value. Force 96 | # the minimal value to 1 here, otherwise stateful_metric will be 0s. 97 | value_base = max(current - self._seen_so_far, 1) 98 | if k not in self._values: 99 | self._values[k] = [v * value_base, value_base] 100 | else: 101 | self._values[k][0] += v * value_base 102 | self._values[k][1] += value_base 103 | else: 104 | # Stateful metrics output a numeric value. This representation 105 | # means "take an average from a single value" but keeps the 106 | # numeric formatting. 107 | self._values[k] = [v, 1] 108 | self._seen_so_far = current 109 | 110 | now = time.time() 111 | info = ' - %.0fs' % (now - self._start) 112 | if self.verbose == 1: 113 | if now - self._last_update < self.interval and not finalize: 114 | return 115 | 116 | prev_total_width = self._total_width 117 | if self._dynamic_display: 118 | sys.stdout.write('\b' * prev_total_width) 119 | sys.stdout.write('\r') 120 | else: 121 | sys.stdout.write('\n') 122 | 123 | if self.target is not None: 124 | numdigits = int(np.log10(self.target)) + 1 125 | bar = ('%' + str(numdigits) + 'd/%d [') % (current, self.target) 126 | prog = float(current) / self.target 127 | prog_width = int(self.width * prog) 128 | if prog_width > 0: 129 | bar += ('=' * (prog_width - 1)) 130 | if current < self.target: 131 | bar += '>' 132 | else: 133 | bar += '=' 134 | bar += ('.' 
* (self.width - prog_width)) 135 | bar += ']' 136 | else: 137 | bar = '%7d/Unknown' % current 138 | 139 | self._total_width = len(bar) 140 | sys.stdout.write(bar) 141 | 142 | if current: 143 | time_per_unit = (now - self._start) / current 144 | else: 145 | time_per_unit = 0 146 | 147 | if self.target is None or finalize: 148 | if time_per_unit >= 1 or time_per_unit == 0: 149 | info += ' %.0fs/%s' % (time_per_unit, self.unit_name) 150 | elif time_per_unit >= 1e-3: 151 | info += ' %.0fms/%s' % (time_per_unit * 1e3, self.unit_name) 152 | else: 153 | info += ' %.0fus/%s' % (time_per_unit * 1e6, self.unit_name) 154 | else: 155 | eta = time_per_unit * (self.target - current) 156 | if eta > 3600: 157 | eta_format = '%d:%02d:%02d' % (eta // 3600, 158 | (eta % 3600) // 60, eta % 60) 159 | elif eta > 60: 160 | eta_format = '%d:%02d' % (eta // 60, eta % 60) 161 | else: 162 | eta_format = '%ds' % eta 163 | 164 | info = ' - ETA: %s' % eta_format 165 | 166 | for k in self._values_order: 167 | info += ' - %s:' % k 168 | if isinstance(self._values[k], list): 169 | avg = np.mean(self._values[k][0] / max(1, self._values[k][1])) 170 | if abs(avg) > 1e-3: 171 | info += ' %.4f' % avg 172 | else: 173 | info += ' %.4e' % avg 174 | else: 175 | info += ' %s' % self._values[k] 176 | 177 | self._total_width += len(info) 178 | if prev_total_width > self._total_width: 179 | info += (' ' * (prev_total_width - self._total_width)) 180 | 181 | if finalize: 182 | info += '\n' 183 | 184 | sys.stdout.write(info) 185 | sys.stdout.flush() 186 | 187 | elif self.verbose == 2: 188 | if finalize: 189 | numdigits = int(np.log10(self.target)) + 1 190 | count = ('%' + str(numdigits) + 'd/%d') % (current, self.target) 191 | info = count + info 192 | for k in self._values_order: 193 | info += ' - %s:' % k 194 | avg = np.mean(self._values[k][0] / max(1, self._values[k][1])) 195 | if avg > 1e-3: 196 | info += ' %.4f' % avg 197 | else: 198 | info += ' %.4e' % avg 199 | info += '\n' 200 | 201 | sys.stdout.write(info) 202 | sys.stdout.flush() 203 | 204 | self._last_update = now 205 | 206 | def add(self, n, values=None): 207 | self.update(self._seen_so_far + n, values) 208 | 209 | 210 | def log_test_results(name: str, results: Dict): 211 | logging.info(f'kpi: {name}') 212 | logging.info(f'threshold: {results.get("threshold")}') 213 | logging.info(f'precision: {results.get("precision"):.3f}') 214 | logging.info(f'recall: {results.get("recall"):.3f}') 215 | logging.info(f'f1score: {results.get("f1score"):.3f}') 216 | -------------------------------------------------------------------------------- /anomalytransfer/transfer/data.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import torch 3 | import numpy as np 4 | from datetime import datetime 5 | 6 | from typing import Dict, Sequence, Tuple, Optional 7 | from torch.utils.data import TensorDataset 8 | 9 | 10 | class KPI: 11 | 12 | def __init__(self, 13 | timestamps: Sequence, 14 | values: Sequence, 15 | labels: Optional[Sequence] = None, 16 | missing: Optional[Sequence] = None, 17 | name: Optional[str] = None): 18 | self.timestamps = np.asarray(timestamps, dtype=np.int) 19 | self.values = np.asarray(values, dtype=np.float32) 20 | 21 | if labels is None: 22 | self.labels = np.zeros(np.shape(values), dtype=np.int) 23 | else: 24 | self.labels = np.asarray(labels, dtype=np.int) 25 | 26 | if missing is None: 27 | self.missing = np.zeros(np.shape(values), dtype=np.int) 28 | else: 29 | self.missing = np.asarray(missing, dtype=np.int) 30 | 31 | 
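        # A random UUID is used as the KPI name when none is given; afterwards every point
        # flagged as missing has its label cleared, since filled-in gaps are never scored
        # as anomalies.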
if name is None: 32 | self.name = str(uuid.uuid4()) 33 | else: 34 | self.name = name 35 | 36 | self.labels[self.missing == 1] = 0 37 | 38 | @property 39 | def abnormal(self) -> np.ndarray: 40 | return np.logical_or(self.missing, self.labels).astype(np.int) 41 | 42 | def complete_timestamp(self): 43 | src_idx = np.argsort(self.timestamps) 44 | timestamp_sorted = self.timestamps[src_idx] 45 | intervals = np.unique(np.diff(timestamp_sorted)) 46 | interval = np.min(intervals) 47 | self.interval = interval 48 | if interval == 0: 49 | raise ValueError('Duplicated values in `timestamp`') 50 | for itv in intervals: 51 | if itv % interval != 0: 52 | raise ValueError( 53 | 'Not all intervals in `timestamp` are multiples of the minimum interval') 54 | 55 | length = (timestamp_sorted[-1] - timestamp_sorted[0]) // interval + 1 56 | new_timestamps = np.arange( 57 | timestamp_sorted[0], timestamp_sorted[-1] + interval, interval, dtype=np.int) 58 | new_values = np.zeros([length], dtype=self.values.dtype) 59 | new_labels = np.zeros([length], dtype=self.labels.dtype) 60 | new_missing = np.ones([length], dtype=self.missing.dtype) 61 | 62 | dst_idx = np.asarray( 63 | (timestamp_sorted - timestamp_sorted[0]) // interval, dtype=np.int) 64 | new_values[dst_idx] = self.values[src_idx] 65 | new_labels[dst_idx] = self.labels[src_idx] 66 | new_missing[dst_idx] = self.missing[src_idx] 67 | 68 | self.timestamps = new_timestamps 69 | self.values = new_values 70 | self.labels = new_labels 71 | self.missing = new_missing 72 | 73 | def split(self, ratios: Sequence) -> Tuple['KPI', ...]: 74 | if abs(1.0 - sum(ratios)) > 1e-4: 75 | raise ValueError('The sum of `ratios` must be 1') 76 | partition = np.asarray(np.cumsum(np.asarray( 77 | ratios, dtype=np.float32)) * len(self.values), dtype=np.int) 78 | partition[-1] = len(self.values) 79 | partition = np.concatenate(([0], partition)) 80 | ret = [] 81 | for low, high in zip(partition[:-1], partition[1:]): 82 | ret.append(KPI(timestamps=self.timestamps[low:high], 83 | values=self.values[low:high], 84 | labels=self.labels[low:high], 85 | missing=self.missing[low:high], 86 | name=self.name)) 87 | return tuple(ret) 88 | 89 | def split_by_idx(self, idx: int, window_size: int = 120) -> Tuple['KPI', 'KPI']: 90 | assert len(self.timestamps) > idx 91 | ret = [] 92 | ret.append(KPI(timestamps=self.timestamps[:idx], 93 | values=self.values[:idx], 94 | labels=self.labels[:idx], 95 | missing=self.missing[:idx], 96 | name=self.name)) 97 | # 保留前一天的 window_size - 1个点, 从而解决时间窗口的问题 98 | ret.append(KPI(timestamps=self.timestamps[idx-(window_size-1):], 99 | values=self.values[idx-(window_size-1):], 100 | labels=self.labels[idx-(window_size-1):], 101 | missing=self.missing[idx-(window_size-1):], 102 | name=self.name)) 103 | return tuple(ret) 104 | 105 | def split_days(self, days: int, window_size: int = 120) -> Tuple[Dict, Dict, 'KPI']: 106 | """ 107 | split the KPI into train_kpi[...], test_kpi[...] 
108 | the number of train_kpi equals to `days` (depends on time interval) 109 | """ 110 | total_minutes = days * 24 * 60 111 | inteval = self.interval / 60 # interval in minute 112 | num_of_point = int(total_minutes / inteval) 113 | train_kpi, test_kpi = self.split_by_idx(num_of_point, window_size = window_size) 114 | 115 | # split by day 116 | train_ts = train_kpi.timestamps 117 | train_datetime = [datetime.fromtimestamp(ts) for ts in train_ts] 118 | 119 | test_ts = test_kpi.timestamps 120 | test_datetime = [datetime.fromtimestamp(ts) for ts in test_ts] 121 | 122 | train_week_day_map = self._get_daily_kpi(train_kpi, train_datetime, window_size) 123 | test_week_day_map = self._get_daily_kpi(test_kpi, test_datetime, window_size) 124 | return train_week_day_map, test_week_day_map, test_kpi 125 | 126 | def standardize(self, mean: Optional[float] = None, std: Optional[float] = None) -> Tuple['KPI', float, float]: 127 | if (mean is None) != (std is None): 128 | raise ValueError('`mean` and `std` must be both None or not None') 129 | if mean is None: 130 | mean = self.values.mean() 131 | std = self.values.std() 132 | values = (self.values - mean) / std 133 | kpi = KPI(timestamps=self.timestamps, values=values, 134 | labels=self.labels, missing=self.missing, name=self.name) 135 | return kpi, mean, std 136 | 137 | def use_labels(self, rate: float = 1.) -> 'KPI': 138 | if not 0. <= rate <= 1.: 139 | raise ValueError('`rate` must be in [0, 1]') 140 | if rate == 0.: 141 | return KPI(timestamps=self.timestamps, values=self.values, labels=None, missing=self.missing, 142 | name=self.name) 143 | if rate == 1.: 144 | return self 145 | labels = np.copy(self.labels) 146 | anomaly_idx = labels.nonzero()[0] 147 | drop_idx = np.random.choice(anomaly_idx, round( 148 | (1 - rate) * len(anomaly_idx)), replace=False) 149 | labels[drop_idx] = 0 150 | return KPI(timestamps=self.timestamps, values=self.values, labels=labels, missing=self.missing, name=self.name) 151 | 152 | def no_labels(self) -> 'KPI': 153 | return self.use_labels(0.) 
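    # Usage sketch (hypothetical, not called by the library itself): split a KPI into
    # per-weekday daily segments with a window_size - 1 overlap at each day boundary.
    #
    #   kpi.complete_timestamp()                      # required: sets self.interval
    #   train_map, test_map, test_kpi = kpi.split_days(days=7)
    #   monday_segments = train_map.get(0, [])        # weekday() == 0 is Monday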
154 | 155 | def _get_daily_kpi(self, kpi: 'KPI', datetime_seq: Sequence, window_size: int = 120): 156 | i = window_size 157 | week_map = {} 158 | while i < len(kpi.timestamps): 159 | if datetime_seq[i].day != datetime_seq[i-1].day: 160 | weekday = datetime_seq[i-1].weekday() 161 | datetime_seq = datetime_seq[i-(window_size-1):] 162 | dst_kpi, kpi = kpi.split_by_idx(i, window_size=window_size) 163 | i = window_size 164 | if len(dst_kpi.timestamps) > window_size: 165 | if weekday not in week_map: 166 | week_map[weekday] = [] 167 | week_map[weekday].append(dst_kpi) 168 | else: 169 | i += 1 170 | if len(kpi.timestamps) > window_size * 4: 171 | weekday = datetime_seq[0].weekday() 172 | if weekday not in week_map: 173 | week_map[weekday] = [] 174 | week_map[weekday].append(kpi) 175 | return week_map 176 | 177 | 178 | class KPIDataset: 179 | 180 | def __init__(self, kpi: KPI, window_size: int, missing_injection_rate: float = 0.): 181 | self._window_size = window_size 182 | self._missing_injection_rate = missing_injection_rate 183 | 184 | self._one_hot_minute = self._one_hot( 185 | self._ts2minute(kpi.timestamps), depth=60) 186 | self._one_hot_hour = self._one_hot( 187 | self._ts2hour(kpi.timestamps), depth=24) 188 | self._one_hot_weekday = self._one_hot( 189 | self._ts2weekday(kpi.timestamps), depth=7) 190 | 191 | self._value_windows = self._to_windows(kpi.values) 192 | self._label_windows = self._to_windows(kpi.labels) 193 | self._normal_windows = self._to_windows(1 - kpi.abnormal) 194 | 195 | self._time_code = [] 196 | self._values = [] 197 | self._normal = [] 198 | for i in range(len(self._value_windows)): 199 | values = np.copy(self._value_windows[i]).astype(np.float32) 200 | labels = np.copy(self._label_windows[i]).astype(np.int) 201 | normal = np.copy(self._normal_windows[i]).astype(np.int) 202 | 203 | injected_missing = np.random.binomial( 204 | 1, self._missing_injection_rate, np.shape(values[normal == 1])) 205 | normal[normal == 1] = 1 - injected_missing 206 | values[np.logical_and(normal == 0, labels == 0)] = 0. 
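            # The conditional code for window i is the one-hot (minute, hour, weekday)
            # encoding of the last timestamp in the window, i.e. index i + window_size - 1.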
207 | 208 | time_index = i + self._window_size - 1 209 | time_code = np.concatenate( 210 | [self._one_hot_minute[time_index], self._one_hot_hour[time_index], 211 | self._one_hot_weekday[time_index]], 212 | axis=-1 213 | ) 214 | 215 | self._time_code.append(time_code) 216 | self._values.append(values) 217 | self._normal.append(normal) 218 | 219 | def _to_windows(self, series: np.ndarray) -> np.ndarray: 220 | return np.lib.stride_tricks.as_strided( 221 | series, 222 | shape=(np.size(series, 0) - self._window_size + 1, self._window_size), 223 | strides=(series.strides[-1], series.strides[-1]) 224 | ) 225 | 226 | @staticmethod 227 | def _ts2hour(ts: np.ndarray) -> np.ndarray: 228 | return (ts % 86400) // 3600 229 | 230 | @staticmethod 231 | def _ts2minute(ts: np.ndarray) -> np.ndarray: 232 | return ((ts % 86400) % 3600) // 60 233 | 234 | @staticmethod 235 | def _ts2weekday(ts: np.ndarray) -> np.ndarray: 236 | return np.zeros_like(((ts // 86400) + 4) % 7) 237 | 238 | @staticmethod 239 | def _one_hot(indices: Sequence, depth: int) -> np.ndarray: 240 | return np.eye(depth)[indices] 241 | 242 | @property 243 | def time_code(self) -> np.ndarray: 244 | return np.asarray(self._time_code, dtype=np.float32) 245 | 246 | @property 247 | def values(self) -> np.ndarray: 248 | return np.asarray(self._values, dtype=np.float32) 249 | 250 | @property 251 | def normal(self) -> np.ndarray: 252 | return np.asarray(self._normal, dtype=np.float32) 253 | 254 | def to_torch(self, device: str) -> TensorDataset: 255 | return TensorDataset(torch.as_tensor(self.values, device=torch.device(device)), 256 | torch.as_tensor( 257 | self.time_code, device=torch.device(device)), 258 | torch.as_tensor(self.normal, device=torch.device(device))) 259 | -------------------------------------------------------------------------------- /anomalytransfer/transfer/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import anomalytransfer as at 5 | 6 | from typing import Sequence, Tuple, Dict, Optional 7 | from torch.backends import cudnn 8 | from torch.utils.data import DataLoader 9 | import time 10 | 11 | ADTSHL = True 12 | 13 | class AutoencoderLayer(torch.nn.Module): 14 | 15 | def __init__(self, input_dim: int, output_dim: int, hidden_dims: Sequence[int]): 16 | super().__init__() 17 | self._hidden = torch.nn.Sequential() 18 | last_dim = input_dim 19 | 20 | 21 | # adtshl 22 | if ADTSHL: 23 | for i, hidden_dim in enumerate(hidden_dims): 24 | self._hidden.add_module(f'hidden_{i}', torch.nn.Conv1d(last_dim, hidden_dim, kernel_size=7, stride=1, padding=3)) 25 | self._hidden.add_module(f'relu_{i}', torch.nn.ReLU()) 26 | last_dim = hidden_dim 27 | self._mean = torch.nn.Conv1d(last_dim, output_dim, kernel_size=7, stride=1, padding=3) 28 | self._std = torch.nn.Sequential( 29 | torch.nn.Conv1d(last_dim, output_dim, kernel_size=7, stride=1, padding=3), 30 | torch.nn.Softplus(), 31 | ) 32 | else: 33 | # naive bagel 34 | for i, hidden_dim in enumerate(hidden_dims): 35 | self._hidden.add_module(f'hidden_{i}', torch.nn.Linear(last_dim, hidden_dim)) 36 | self._hidden.add_module(f'relu_{i}', torch.nn.ReLU()) 37 | last_dim = hidden_dim 38 | self._mean = torch.nn.Linear(last_dim, output_dim) 39 | self._std = torch.nn.Sequential( 40 | torch.nn.Linear(last_dim, output_dim), 41 | torch.nn.Softplus(), 42 | ) 43 | 44 | def forward(self, x: torch.Tensor): 45 | if ADTSHL: 46 | shape_zero_squeeze = False 47 | x = x.unsqueeze(dim=-1) 48 | if x.shape[0] == 1: 49 | 
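                # Conv1d expects a 3-D (batch, channels, length) input; the unsqueeze above
                # added a length-1 axis, so a leading size-1 axis (e.g. a sampling dimension
                # of 1) is dropped here and restored after the mean/std heads.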
shape_zero_squeeze = True 50 | x = x.squeeze(dim=0) 51 | x = self._hidden(x) 52 | mean = self._mean(x) 53 | std = self._std(x) + 1e-6 54 | if ADTSHL: 55 | mean = mean.squeeze(dim=-1) 56 | std = std.squeeze(dim=-1) 57 | if shape_zero_squeeze: 58 | mean = mean.unsqueeze(dim=0) 59 | std = std.unsqueeze(dim=0) 60 | return mean, std 61 | 62 | def save(self, path: str, mask: Sequence, net: str): 63 | for idx in range(0, len(self._hidden), 2): 64 | if mask[idx // 2] == 1: 65 | torch.save(self._hidden[idx].state_dict(), os.path.join(path, f'{net}-hidden-{idx // 2}.pt')) 66 | if mask[-1] == 1: 67 | torch.save(self._mean.state_dict(), os.path.join(path, f'{net}-mean.pt')) 68 | torch.save(self._std.state_dict(), os.path.join(path, f'{net}-std.pt')) 69 | 70 | def load(self, path: str, mask: Sequence, net: str): 71 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 72 | for idx in range(0, len(self._hidden), 2): 73 | if mask[idx // 2] == 1: 74 | self._hidden[idx].load_state_dict(torch.load(os.path.join(path, f'{net}-hidden-{idx // 2}.pt'), map_location=device)) 75 | if mask[-1] == 1: 76 | self._mean.load_state_dict(torch.load(os.path.join(path, f'{net}-mean.pt'), map_location=device)) 77 | self._std.load_state_dict(torch.load(os.path.join(path, f'{net}-std.pt'), map_location=device)) 78 | 79 | def freeze(self, mask: Sequence): 80 | for idx in range(0, len(self._hidden), 2): 81 | if mask[idx // 2] == 1: 82 | for p in self._hidden[idx].parameters(): 83 | p.requires_grad = False 84 | if mask[-1] == 1: 85 | for p in self._mean.parameters(): 86 | p.requires_grad = False 87 | for p in self._std.parameters(): 88 | p.requires_grad = False 89 | 90 | def unfreeze(self): 91 | for p in self.parameters(): 92 | p.requires_grad = True 93 | 94 | 95 | class ConditionalVariationalAutoencoder(torch.nn.Module): 96 | 97 | def __init__(self, encoder: AutoencoderLayer, decoder: AutoencoderLayer, device: str): 98 | super().__init__() 99 | self._encoder = encoder 100 | self._decoder = decoder 101 | self._device = device 102 | 103 | def forward(self, inputs, **kwargs): 104 | x, y = tuple(inputs) 105 | n_samples = kwargs.get('n_samples', 1) 106 | concatted = torch.cat([x, y], dim=-1) 107 | z_mean, z_std = self._encoder(concatted) 108 | q_zx = torch.distributions.Normal(z_mean, z_std) 109 | p_z = torch.distributions.Normal( 110 | torch.zeros(z_mean.size()).to(self._device), 111 | torch.ones(z_std.size()).to(self._device) 112 | ) 113 | z = p_z.sample((n_samples,)) * torch.unsqueeze(z_std, 0) + torch.unsqueeze(z_mean, 0) 114 | y = y.expand(n_samples, -1, -1) 115 | concatted = torch.cat([z, y], dim=-1) 116 | x_mean, x_std = self._decoder(concatted) 117 | p_xz = torch.distributions.Normal(x_mean, x_std) 118 | return q_zx, p_xz, z 119 | 120 | def save_partial(self, path: str, name: str, mask: Sequence): 121 | path = os.path.join(path, name) 122 | if not os.path.exists(path): 123 | os.makedirs(path, exist_ok=True) 124 | self._encoder.save(path, mask=mask[0], net='encoder') 125 | self._decoder.save(path, mask=mask[1], net='decoder') 126 | 127 | def load_partial(self, path: str, name: str, mask: Sequence): 128 | path = os.path.join(path, name) 129 | if not os.path.exists(path): 130 | os.makedirs(path, exist_ok=True) 131 | self._encoder.load(path, mask=mask[0], net='encoder') 132 | self._decoder.load(path, mask=mask[1], net='decoder') 133 | 134 | def freeze(self, mask: Sequence): 135 | if 0 in mask[0] or 0 in mask[1]: 136 | self._encoder.freeze(mask[0]) 137 | self._decoder.freeze(mask[1]) 138 | 139 | def unfreeze(self, mask: 
Sequence): 140 | if 0 in mask[0] or 0 in mask[1]: 141 | self._encoder.unfreeze() 142 | self._decoder.unfreeze() 143 | 144 | 145 | class AnomalyDetector: 146 | 147 | def __init__(self, 148 | window_size: int = 120, 149 | hidden_dims: Sequence = (100, 100), 150 | latent_dim: int = 8, 151 | learning_rate: float = 1e-3, 152 | dropout_rate: float = 0.1, 153 | device: Optional[str] = None): 154 | cudnn.benchmark = True 155 | if device is None: 156 | self._device = 'cuda' if torch.cuda.is_available() else 'cpu' 157 | else: 158 | self._device = device 159 | 160 | self._window_size = window_size 161 | self._hidden_dims = hidden_dims 162 | self._dropout_rate = dropout_rate 163 | cond_size = 60 + 24 + 7 164 | self._model = ConditionalVariationalAutoencoder( 165 | encoder=AutoencoderLayer( 166 | input_dim=window_size + cond_size, 167 | output_dim=latent_dim, 168 | hidden_dims=hidden_dims, 169 | ), 170 | decoder=AutoencoderLayer( 171 | input_dim=latent_dim + cond_size, 172 | output_dim=window_size, 173 | hidden_dims=list(reversed(hidden_dims)), 174 | ), 175 | device=self._device 176 | ).to(self._device) 177 | self._p_z = torch.distributions.Normal( 178 | torch.zeros(latent_dim).to(self._device), 179 | torch.ones(latent_dim).to(self._device) 180 | ) 181 | self._optimizer = torch.optim.Adam(self._model.parameters(), lr=learning_rate, weight_decay=1e-3) 182 | self._lr_scheduler = torch.optim.lr_scheduler.StepLR(self._optimizer, step_size=10, gamma=0.75) 183 | 184 | @staticmethod 185 | def _m_elbo(x: torch.Tensor, 186 | z: torch.Tensor, 187 | normal: torch.Tensor, 188 | q_zx: torch.distributions.Normal, 189 | p_z: torch.distributions.Normal, 190 | p_xz: torch.distributions.Normal) -> torch.Tensor: 191 | x = torch.unsqueeze(x, 0) 192 | normal = torch.unsqueeze(normal, 0) 193 | log_p_xz = p_xz.log_prob(x) 194 | log_q_zx = torch.sum(q_zx.log_prob(z), -1) 195 | log_p_z = torch.sum(p_z.log_prob(z), -1) 196 | ratio = (torch.sum(normal, -1) / float(normal.size()[-1])) 197 | return torch.mean(torch.sum(log_p_xz * normal, -1) + log_p_z * ratio - log_q_zx) 198 | 199 | def _missing_imputation(self, 200 | x: torch.Tensor, 201 | y: torch.Tensor, 202 | normal: torch.Tensor, 203 | max_iter: int = 10) -> torch.Tensor: 204 | with torch.no_grad(): 205 | for _ in range(max_iter): 206 | _, p_xz, _ = self._model([x, y]) 207 | x[normal == 0.] = p_xz.sample()[0][normal == 0.] 208 | return x 209 | 210 | def _train_step(self, x: torch.Tensor, y: torch.Tensor, normal: torch.Tensor) -> torch.Tensor: 211 | self._optimizer.zero_grad() 212 | y = torch.nn.Dropout(self._dropout_rate)(y) 213 | q_zx, p_xz, z = self._model([x, y]) 214 | loss = -self._m_elbo(x, z, normal, q_zx, self._p_z, p_xz) 215 | loss.backward() 216 | torch.nn.utils.clip_grad_norm_(self._model.parameters(), max_norm=10.) 
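        # Gradients are clipped to a maximum L2 norm of 10 before the update to keep the
        # CVAE training stable.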
217 | self._optimizer.step() 218 | return loss 219 | 220 | def _validation_step(self, x: torch.Tensor, y: torch.Tensor, normal: torch.Tensor) -> torch.Tensor: 221 | q_zx, p_xz, z = self._model([x, y]) 222 | return -self._m_elbo(x, z, normal, q_zx, self._p_z, p_xz) 223 | 224 | def _test_step(self, x: torch.Tensor, y: torch.Tensor, normal: torch.Tensor) -> Tuple[torch.Tensor, np.ndarray]: 225 | x = self._missing_imputation(x, y, normal) 226 | q_zx, p_xz, z = self._model([x, y], n_samples=1 if ADTSHL else 128) 227 | test_loss = -self._m_elbo(x, z, normal, q_zx, self._p_z, p_xz) 228 | log_p_xz = p_xz.log_prob(x) 229 | return test_loss, log_p_xz 230 | 231 | def fit(self, 232 | kpi: 'at.transfer.data.KPI', 233 | epochs: int, 234 | validation_kpi: Optional['at.transfer.data.KPI'] = None, 235 | batch_size: int = 256, 236 | verbose: int = 1) -> Dict: 237 | dataset = at.transfer.data.KPIDataset(kpi, window_size=self._window_size, missing_injection_rate=0.01) 238 | dataset = DataLoader(dataset.to_torch(self._device), batch_size=batch_size, shuffle=True, drop_last=True) 239 | validation_dataset = None 240 | if validation_kpi is not None: 241 | validation_dataset = at.transfer.data.KPIDataset(validation_kpi, window_size=self._window_size) 242 | validation_dataset = DataLoader(validation_dataset.to_torch(self._device), 243 | batch_size=batch_size, 244 | shuffle=True 245 | ) 246 | 247 | start = time.time() 248 | ts = [] 249 | losses = [] 250 | val_losses = [] 251 | history = {} 252 | progbar = None 253 | if verbose == 1: 254 | print('Training Epochs') 255 | progbar = at.utils.ProgBar(epochs, interval=0.5, stateful_metrics=['loss', 'val_loss'], unit_name='epoch') 256 | 257 | for epoch in range(epochs): 258 | epoch_losses = [] 259 | epoch_val_losses = [] 260 | epoch_val_loss = np.nan 261 | 262 | if verbose == 2: 263 | print(f'Training Epoch {epoch + 1}/{epochs}') 264 | progbar = at.utils.ProgBar( 265 | target=len(dataset) + (0 if validation_kpi is None else len(validation_dataset)), 266 | interval=0.5 267 | ) 268 | self._model.train() 269 | for batch in dataset: 270 | loss = self._train_step(*batch) 271 | epoch_losses.append(loss) 272 | if verbose == 2: 273 | progbar.add(1, values=[('loss', loss.detach().cpu().numpy())]) 274 | epoch_loss = torch.mean(torch.as_tensor(epoch_losses)).numpy() 275 | ts.append(time.time()-start) 276 | losses.append(epoch_loss) 277 | 278 | if validation_kpi is not None: 279 | with torch.no_grad(): 280 | self._model.eval() 281 | for batch in validation_dataset: 282 | val_loss = self._validation_step(*batch) 283 | epoch_val_losses.append(val_loss) 284 | if verbose == 2: 285 | progbar.add(1, values=[('val_loss', val_loss.cpu().numpy())]) 286 | epoch_val_loss = torch.mean(torch.as_tensor(epoch_val_losses)).numpy() 287 | val_losses.append(epoch_val_loss) 288 | 289 | if verbose == 1: 290 | values = [] 291 | if not np.isnan(epoch_loss): 292 | values.append(('loss', epoch_loss)) 293 | if not np.isnan(epoch_val_loss): 294 | values.append(('val_loss', epoch_val_loss)) 295 | progbar.add(1, values=values) 296 | 297 | self._lr_scheduler.step() 298 | 299 | history['loss'] = losses 300 | history['ts'] = ts 301 | if len(val_losses) > 0: 302 | history['val_loss'] = val_losses 303 | return history 304 | 305 | def predict(self, kpi: 'at.transfer.data.KPI', batch_size: int = 256, verbose: int = 1) -> np.ndarray: 306 | kpi = kpi.no_labels() 307 | dataset = at.transfer.data.KPIDataset(kpi, window_size=self._window_size) 308 | dataset = DataLoader(dataset.to_torch(self._device), batch_size=batch_size) 
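        # Each sliding window yields one score: the negative mean reconstruction
        # log-likelihood of its last point. The first window_size - 1 positions of the
        # output are padded with the minimum score so the result matches the input length.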
309 | progbar = None 310 | if verbose == 1: 311 | print('Testing Epoch') 312 | progbar = at.utils.ProgBar(len(dataset), interval=0.5) 313 | anomaly_scores = [] 314 | with torch.no_grad(): 315 | self._model.eval() 316 | for batch in dataset: 317 | test_loss, log_p_xz = self._test_step(*batch) 318 | anomaly_scores.extend(-torch.mean(log_p_xz[:, :, -1], dim=0).cpu()) 319 | if verbose == 1: 320 | progbar.add(1, values=[('test_loss', test_loss.cpu().numpy())]) 321 | anomaly_scores = np.asarray(anomaly_scores, dtype=np.float32) 322 | return np.concatenate([np.ones(self._window_size - 1) * np.min(anomaly_scores), anomaly_scores]) 323 | 324 | def save(self, path: str, name: str): 325 | mask = [[1] * (len(self._hidden_dims) + 1)] * 2 326 | self.save_partial(path, name, mask) 327 | 328 | def load(self, path: str, name: str): 329 | mask = [[1] * (len(self._hidden_dims) + 1)] * 2 330 | self.load_partial(path, name, mask) 331 | 332 | def save_partial(self, path: str, name: str, mask: Sequence): 333 | self._model.save_partial(path, name, mask) 334 | 335 | def load_partial(self, path: str, name: str, mask: Sequence): 336 | self._model.load_partial(path, name, mask) 337 | 338 | def freeze(self, mask: Sequence): 339 | self._model.freeze(mask) 340 | 341 | def unfreeze(self, mask: Sequence): 342 | self._model.unfreeze(mask) 343 | 344 | 345 | def sbd_(a: 'at.transfer.data.KPI', b: 'at.transfer.data.KPI') -> float: 346 | l2_a = np.linalg.norm(a.values) 347 | l2_b = np.linalg.norm(b.values) 348 | cross_correlation = np.convolve(a.values, b.values, mode='full') 349 | return 1 - np.max(cross_correlation) / (l2_a * l2_b) 350 | 351 | 352 | def find_optimal_mask(sbd: float, less_mask: Sequence, greater_mask: Sequence, threshold: float = 0.3) -> Sequence: 353 | if sbd <= threshold: 354 | return less_mask 355 | return greater_mask 356 | -------------------------------------------------------------------------------- /anomalytransfer/transfer/spot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Dec 12 10:08:16 2016 5 | 6 | @author: Alban Siffer 7 | @company: Amossys 8 | @license: GNU GPLv3 9 | """ 10 | 11 | from scipy.optimize import minimize 12 | from math import log,floor 13 | import numpy as np 14 | import pandas as pd 15 | import matplotlib.pyplot as plt 16 | import tqdm 17 | 18 | # colors for plot 19 | deep_saffron = '#FF9933' 20 | air_force_blue = '#5D8AA8' 21 | 22 | 23 | """ 24 | ================================= MAIN CLASS ================================== 25 | """ 26 | 27 | class SPOT: 28 | """ 29 | This class allows to run SPOT algorithm on univariate dataset (upper-bound) 30 | 31 | Attributes 32 | ---------- 33 | proba : float 34 | Detection level (risk), chosen by the user 35 | 36 | extreme_quantile : float 37 | current threshold (bound between normal and abnormal events) 38 | 39 | data : numpy.array 40 | stream 41 | 42 | init_data : numpy.array 43 | initial batch of observations (for the calibration/initialization step) 44 | 45 | init_threshold : float 46 | initial threshold computed during the calibration step 47 | 48 | peaks : numpy.array 49 | array of peaks (excesses above the initial threshold) 50 | 51 | n : int 52 | number of observed values 53 | 54 | Nt : int 55 | number of observed peaks 56 | """ 57 | 58 | def __init__(self, q = 1e-4): 59 | """ 60 | Constructor 61 | 62 | Parameters 63 | ---------- 64 | q 65 | Detection level (risk) 66 | 67 | Returns 68 | ---------- 69 | SPOT object 
70 | """ 71 | self.proba = q 72 | self.extreme_quantile = None 73 | self.data = None 74 | self.init_data = None 75 | self.init_threshold = None 76 | self.peaks = None 77 | self.n = 0 78 | self.Nt = 0 79 | 80 | def __str__(self): 81 | s = '' 82 | s += 'Streaming Peaks-Over-Threshold Object\n' 83 | s += 'Detection level q = %s\n' % self.proba 84 | if self.data is not None: 85 | s += 'Data imported : Yes\n' 86 | s += '\t initialization : %s values\n' % self.init_data.size 87 | s += '\t stream : %s values\n' % self.data.size 88 | else: 89 | s += 'Data imported : No\n' 90 | return s 91 | 92 | if self.n == 0: 93 | s += 'Algorithm initialized : No\n' 94 | else: 95 | s += 'Algorithm initialized : Yes\n' 96 | s += '\t initial threshold : %s\n' % self.init_threshold 97 | 98 | r = self.n-self.init_data.size 99 | if r > 0: 100 | s += 'Algorithm run : Yes\n' 101 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 102 | else: 103 | s += '\t number of peaks : %s\n' % self.Nt 104 | s += '\t extreme quantile : %s\n' % self.extreme_quantile 105 | s += 'Algorithm run : No\n' 106 | return s 107 | 108 | 109 | def fit(self,init_data,data): 110 | """ 111 | Import data to SPOT object 112 | 113 | Parameters 114 | ---------- 115 | init_data : list, numpy.array or pandas.Series 116 | initial batch to calibrate the algorithm 117 | 118 | data : numpy.array 119 | data for the run (list, np.array or pd.series) 120 | 121 | """ 122 | if isinstance(data,list): 123 | self.data = np.array(data) 124 | elif isinstance(data,np.ndarray): 125 | self.data = data 126 | elif isinstance(data,pd.Series): 127 | self.data = data.values 128 | else: 129 | print('This data format (%s) is not supported' % type(data)) 130 | return 131 | 132 | if isinstance(init_data,list): 133 | self.init_data = np.array(init_data) 134 | elif isinstance(init_data,np.ndarray): 135 | self.init_data = init_data 136 | elif isinstance(init_data,pd.Series): 137 | self.init_data = init_data.values 138 | elif isinstance(init_data,int): 139 | self.init_data = self.data[:init_data] 140 | self.data = self.data[init_data:] 141 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 142 | r = int(init_data*data.size) 143 | self.init_data = self.data[:r] 144 | self.data = self.data[r:] 145 | else: 146 | print('The initial data cannot be set') 147 | return 148 | 149 | def add(self,data): 150 | """ 151 | This function allows to append data to the already fitted data 152 | 153 | Parameters 154 | ---------- 155 | data : list, numpy.array, pandas.Series 156 | data to append 157 | """ 158 | if isinstance(data,list): 159 | data = np.array(data) 160 | elif isinstance(data,np.ndarray): 161 | data = data 162 | elif isinstance(data,pd.Series): 163 | data = data.values 164 | else: 165 | print('This data format (%s) is not supported' % type(data)) 166 | return 167 | 168 | self.data = np.append(self.data,data) 169 | return 170 | 171 | def initialize(self, level = 0.98, verbose = True): 172 | """ 173 | Run the calibration (initialization) step 174 | 175 | Parameters 176 | ---------- 177 | level : float 178 | (default 0.98) Probability associated with the initial threshold t 179 | verbose : bool 180 | (default = True) If True, gives details about the batch initialization 181 | """ 182 | level = level-floor(level) 183 | 184 | n_init = self.init_data.size 185 | 186 | S = np.sort(self.init_data) # we sort X to get the empirical quantile 187 | self.init_threshold = S[int(level*n_init)] # t is fixed for the whole algorithm 188 | 189 | # initial peaks 190 | 
self.peaks = self.init_data[self.init_data>self.init_threshold]-self.init_threshold 191 | self.Nt = self.peaks.size 192 | self.n = n_init 193 | 194 | if verbose: 195 | print('Initial threshold : %s' % self.init_threshold) 196 | print('Number of peaks : %s' % self.Nt) 197 | print('Grimshaw maximum log-likelihood estimation ... ', end = '') 198 | 199 | g,s,l = self._grimshaw() 200 | self.extreme_quantile = self._quantile(g,s) 201 | 202 | if verbose: 203 | print('[done]') 204 | print('\t'+chr(0x03B3) + ' = ' + str(g)) 205 | print('\t'+chr(0x03C3) + ' = ' + str(s)) 206 | print('\tL = ' + str(l)) 207 | print('Extreme quantile (probability = %s): %s' % (self.proba,self.extreme_quantile)) 208 | 209 | return 210 | 211 | 212 | 213 | 214 | def _rootsFinder(fun,jac,bounds,npoints,method): 215 | """ 216 | Find possible roots of a scalar function 217 | 218 | Parameters 219 | ---------- 220 | fun : function 221 | scalar function 222 | jac : function 223 | first order derivative of the function 224 | bounds : tuple 225 | (min,max) interval for the roots search 226 | npoints : int 227 | maximum number of roots to output 228 | method : str 229 | 'regular' : regular sample of the search interval, 'random' : uniform (distribution) sample of the search interval 230 | 231 | Returns 232 | ---------- 233 | numpy.array 234 | possible roots of the function 235 | """ 236 | if method == 'regular': 237 | step = (bounds[1]-bounds[0])/(npoints+1) 238 | X0 = np.arange(bounds[0]+step,bounds[1],step) 239 | elif method == 'random': 240 | X0 = np.random.uniform(bounds[0],bounds[1],npoints) 241 | 242 | def objFun(X,f,jac): 243 | g = 0 244 | j = np.zeros(X.shape) 245 | i = 0 246 | for x in X: 247 | fx = f(x) 248 | g = g+fx**2 249 | j[i] = 2*fx*jac(x) 250 | i = i+1 251 | return g,j 252 | 253 | opt = minimize(lambda X:objFun(X,fun,jac), X0, 254 | method='L-BFGS-B', 255 | jac=True, bounds=[bounds]*len(X0)) 256 | 257 | X = opt.x 258 | np.round(X,decimals = 5) 259 | return np.unique(X) 260 | 261 | 262 | def _log_likelihood(Y,gamma,sigma): 263 | """ 264 | Compute the log-likelihood for the Generalized Pareto Distribution (μ=0) 265 | 266 | Parameters 267 | ---------- 268 | Y : numpy.array 269 | observations 270 | gamma : float 271 | GPD index parameter 272 | sigma : float 273 | GPD scale parameter (>0) 274 | 275 | Returns 276 | ---------- 277 | float 278 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 279 | """ 280 | n = Y.size 281 | if gamma != 0: 282 | tau = gamma/sigma 283 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 284 | else: 285 | L = n * ( 1 + log(Y.mean()) ) 286 | return L 287 | 288 | 289 | def _grimshaw(self,epsilon = 1e-8, n_points = 10): 290 | """ 291 | Compute the GPD parameters estimation with the Grimshaw's trick 292 | 293 | Parameters 294 | ---------- 295 | epsilon : float 296 | numerical parameter to perform (default : 1e-8) 297 | n_points : int 298 | maximum number of candidates for maximum likelihood (default : 10) 299 | 300 | Returns 301 | ---------- 302 | gamma_best,sigma_best,ll_best 303 | gamma estimates, sigma estimates and corresponding log-likelihood 304 | """ 305 | def u(s): 306 | return 1 + np.log(s).mean() 307 | 308 | def v(s): 309 | return np.mean(1/s) 310 | 311 | def w(Y,t): 312 | s = 1+t*Y 313 | us = u(s) 314 | vs = v(s) 315 | return us*vs-1 316 | 317 | def jac_w(Y,t): 318 | s = 1+t*Y 319 | us = u(s) 320 | vs = v(s) 321 | jac_us = (1/t)*(1-vs) 322 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 323 | return us*jac_vs+vs*jac_us 324 | 325 | 326 | Ym = self.peaks.min() 
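        # Grimshaw's trick reduces the two-parameter GPD likelihood to scalar root finding
        # in t; candidate roots are searched on two intervals derived from the min, max and
        # mean of the peaks, and the candidate with the best log-likelihood is kept.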
327 | YM = self.peaks.max() 328 | Ymean = self.peaks.mean() 329 | 330 | 331 | a = -1/YM 332 | if abs(a)<2*epsilon: 333 | epsilon = abs(a)/n_points 334 | 335 | a = a + epsilon 336 | b = 2*(Ymean-Ym)/(Ymean*Ym) 337 | c = 2*(Ymean-Ym)/(Ym**2) 338 | 339 | # We look for possible roots 340 | left_zeros = SPOT._rootsFinder(lambda t: w(self.peaks,t), 341 | lambda t: jac_w(self.peaks,t), 342 | (a+epsilon,-epsilon), 343 | n_points,'regular') 344 | 345 | right_zeros = SPOT._rootsFinder(lambda t: w(self.peaks,t), 346 | lambda t: jac_w(self.peaks,t), 347 | (b,c), 348 | n_points,'regular') 349 | 350 | # all the possible roots 351 | zeros = np.concatenate((left_zeros,right_zeros)) 352 | 353 | # 0 is always a solution so we initialize with it 354 | gamma_best = 0 355 | sigma_best = Ymean 356 | ll_best = SPOT._log_likelihood(self.peaks,gamma_best,sigma_best) 357 | 358 | # we look for better candidates 359 | for z in zeros: 360 | gamma = u(1+z*self.peaks)-1 361 | sigma = gamma/z 362 | ll = SPOT._log_likelihood(self.peaks,gamma,sigma) 363 | if ll>ll_best: 364 | gamma_best = gamma 365 | sigma_best = sigma 366 | ll_best = ll 367 | 368 | return gamma_best,sigma_best,ll_best 369 | 370 | 371 | 372 | def _quantile(self,gamma,sigma): 373 | """ 374 | Compute the quantile at level 1-q 375 | 376 | Parameters 377 | ---------- 378 | gamma : float 379 | GPD parameter 380 | sigma : float 381 | GPD parameter 382 | 383 | Returns 384 | ---------- 385 | float 386 | quantile at level 1-q for the GPD(γ,σ,μ=0) 387 | """ 388 | r = self.n * self.proba / self.Nt 389 | if gamma != 0: 390 | return self.init_threshold + (sigma/gamma)*(pow(r,-gamma)-1) 391 | else: 392 | return self.init_threshold - sigma*log(r) 393 | 394 | 395 | def run(self, with_alarm = True): 396 | """ 397 | Run SPOT on the stream 398 | 399 | Parameters 400 | ---------- 401 | with_alarm : bool 402 | (default = True) If False, SPOT will adapt the threshold assuming \ 403 | there is no abnormal values 404 | 405 | 406 | Returns 407 | ---------- 408 | dict 409 | keys : 'thresholds' and 'alarms' 410 | 411 | 'thresholds' contains the extreme quantiles and 'alarms' contains \ 412 | the indexes of the values which have triggered alarms 413 | 414 | """ 415 | if (self.n>self.init_data.size): 416 | print('Warning : the algorithm seems to have already been run, you \ 417 | should initialize before running again') 418 | return {} 419 | 420 | # list of the thresholds 421 | th = [] 422 | alarm = [] 423 | # Loop over the stream 424 | for i in tqdm.tqdm(range(self.data.size)): 425 | 426 | # If the observed value exceeds the current threshold (alarm case) 427 | if self.data[i]>self.extreme_quantile: 428 | # if we want to alarm, we put it in the alarm list 429 | if with_alarm: 430 | alarm.append(i) 431 | # otherwise we add it in the peaks 432 | else: 433 | self.peaks = np.append(self.peaks,self.data[i]-self.init_threshold) 434 | self.Nt += 1 435 | self.n += 1 436 | # and we update the thresholds 437 | 438 | g,s,l = self._grimshaw() 439 | self.extreme_quantile = self._quantile(g,s) 440 | 441 | # case where the value exceeds the initial threshold but not the alarm ones 442 | elif self.data[i]>self.init_threshold: 443 | # we add it in the peaks 444 | self.peaks = np.append(self.peaks,self.data[i]-self.init_threshold) 445 | self.Nt += 1 446 | self.n += 1 447 | # and we update the thresholds 448 | 449 | g,s,l = self._grimshaw() 450 | self.extreme_quantile = self._quantile(g,s) 451 | else: 452 | self.n += 1 453 | 454 | 455 | th.append(self.extreme_quantile) # thresholds record 456 | 457 | 
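# Returned record: 'thresholds' holds the extreme quantile in force at each point of the
# stream, 'alarms' the indexes of the observations that exceeded it (empty when
# with_alarm is False, since those points are absorbed into the peaks instead).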
return {'thresholds' : th, 'alarms': alarm} 458 | 459 | 460 | def plot(self,run_results,with_alarm = True): 461 | """ 462 | Plot the results of given by the run 463 | 464 | Parameters 465 | ---------- 466 | run_results : dict 467 | results given by the 'run' method 468 | with_alarm : bool 469 | (default = True) If True, alarms are plotted. 470 | 471 | 472 | Returns 473 | ---------- 474 | list 475 | list of the plots 476 | 477 | """ 478 | x = range(self.data.size) 479 | K = run_results.keys() 480 | 481 | ts_fig, = plt.plot(x,self.data,color=air_force_blue) 482 | fig = [ts_fig] 483 | 484 | if 'thresholds' in K: 485 | th = run_results['thresholds'] 486 | th_fig, = plt.plot(x,th,color=deep_saffron,lw=2,ls='dashed') 487 | fig.append(th_fig) 488 | 489 | if with_alarm and ('alarms' in K): 490 | alarm = run_results['alarms'] 491 | al_fig = plt.scatter(alarm,self.data[alarm],color='red') 492 | fig.append(al_fig) 493 | 494 | plt.xlim((0,self.data.size)) 495 | 496 | 497 | return fig 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | """ 509 | ============================ UPPER & LOWER BOUNDS ============================= 510 | """ 511 | 512 | 513 | 514 | 515 | class biSPOT: 516 | """ 517 | This class allows to run biSPOT algorithm on univariate dataset (upper and lower bounds) 518 | 519 | Attributes 520 | ---------- 521 | proba : float 522 | Detection level (risk), chosen by the user 523 | 524 | extreme_quantile : float 525 | current threshold (bound between normal and abnormal events) 526 | 527 | data : numpy.array 528 | stream 529 | 530 | init_data : numpy.array 531 | initial batch of observations (for the calibration/initialization step) 532 | 533 | init_threshold : float 534 | initial threshold computed during the calibration step 535 | 536 | peaks : numpy.array 537 | array of peaks (excesses above the initial threshold) 538 | 539 | n : int 540 | number of observed values 541 | 542 | Nt : int 543 | number of observed peaks 544 | """ 545 | def __init__(self, q = 1e-4): 546 | """ 547 | Constructor 548 | 549 | Parameters 550 | ---------- 551 | q 552 | Detection level (risk) 553 | 554 | Returns 555 | ---------- 556 | biSPOT object 557 | """ 558 | self.proba = q 559 | self.data = None 560 | self.init_data = None 561 | self.n = 0 562 | nonedict = {'up':None,'down':None} 563 | 564 | self.extreme_quantile = dict.copy(nonedict) 565 | self.init_threshold = dict.copy(nonedict) 566 | self.peaks = dict.copy(nonedict) 567 | self.gamma = dict.copy(nonedict) 568 | self.sigma = dict.copy(nonedict) 569 | self.Nt = {'up':0,'down':0} 570 | 571 | 572 | def __str__(self): 573 | s = '' 574 | s += 'Streaming Peaks-Over-Threshold Object\n' 575 | s += 'Detection level q = %s\n' % self.proba 576 | if self.data is not None: 577 | s += 'Data imported : Yes\n' 578 | s += '\t initialization : %s values\n' % self.init_data.size 579 | s += '\t stream : %s values\n' % self.data.size 580 | else: 581 | s += 'Data imported : No\n' 582 | return s 583 | 584 | if self.n == 0: 585 | s += 'Algorithm initialized : No\n' 586 | else: 587 | s += 'Algorithm initialized : Yes\n' 588 | s += '\t initial threshold : %s\n' % self.init_threshold 589 | 590 | r = self.n-self.init_data.size 591 | if r > 0: 592 | s += 'Algorithm run : Yes\n' 593 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 594 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 595 | else: 596 | s += '\t number of peaks : %s\n' % self.Nt 597 | s += '\t upper extreme quantile : %s\n' % 
self.extreme_quantile['up'] 598 | s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down'] 599 | s += 'Algorithm run : No\n' 600 | return s 601 | 602 | 603 | def fit(self,init_data,data): 604 | """ 605 | Import data to biSPOT object 606 | 607 | Parameters 608 | ---------- 609 | init_data : list, numpy.array or pandas.Series 610 | initial batch to calibrate the algorithm () 611 | 612 | data : numpy.array 613 | data for the run (list, np.array or pd.series) 614 | 615 | """ 616 | if isinstance(data,list): 617 | self.data = np.array(data) 618 | elif isinstance(data,np.ndarray): 619 | self.data = data 620 | elif isinstance(data,pd.Series): 621 | self.data = data.values 622 | else: 623 | print('This data format (%s) is not supported' % type(data)) 624 | return 625 | 626 | if isinstance(init_data,list): 627 | self.init_data = np.array(init_data) 628 | elif isinstance(init_data,np.ndarray): 629 | self.init_data = init_data 630 | elif isinstance(init_data,pd.Series): 631 | self.init_data = init_data.values 632 | elif isinstance(init_data,int): 633 | self.init_data = self.data[:init_data] 634 | self.data = self.data[init_data:] 635 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 636 | r = int(init_data*data.size) 637 | self.init_data = self.data[:r] 638 | self.data = self.data[r:] 639 | else: 640 | print('The initial data cannot be set') 641 | return 642 | 643 | def add(self,data): 644 | """ 645 | This function allows to append data to the already fitted data 646 | 647 | Parameters 648 | ---------- 649 | data : list, numpy.array, pandas.Series 650 | data to append 651 | """ 652 | if isinstance(data,list): 653 | data = np.array(data) 654 | elif isinstance(data,np.ndarray): 655 | data = data 656 | elif isinstance(data,pd.Series): 657 | data = data.values 658 | else: 659 | print('This data format (%s) is not supported' % type(data)) 660 | return 661 | 662 | self.data = np.append(self.data,data) 663 | return 664 | 665 | def initialize(self, verbose = True): 666 | """ 667 | Run the calibration (initialization) step 668 | 669 | Parameters 670 | ---------- 671 | verbose : bool 672 | (default = True) If True, gives details about the batch initialization 673 | """ 674 | n_init = self.init_data.size 675 | 676 | S = np.sort(self.init_data) # we sort X to get the empirical quantile 677 | self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm 678 | self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm 679 | 680 | # initial peaks 681 | self.peaks['up'] = self.init_data[self.init_data>self.init_threshold['up']]-self.init_threshold['up'] 682 | self.peaks['down'] = -(self.init_data[self.init_data0) 774 | 775 | Returns 776 | ---------- 777 | float 778 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 779 | """ 780 | n = Y.size 781 | if gamma != 0: 782 | tau = gamma/sigma 783 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 784 | else: 785 | L = n * ( 1 + log(Y.mean()) ) 786 | return L 787 | 788 | 789 | def _grimshaw(self,side,epsilon = 1e-8, n_points = 10): 790 | """ 791 | Compute the GPD parameters estimation with the Grimshaw's trick 792 | 793 | Parameters 794 | ---------- 795 | epsilon : float 796 | numerical parameter to perform (default : 1e-8) 797 | n_points : int 798 | maximum number of candidates for maximum likelihood (default : 10) 799 | 800 | Returns 801 | ---------- 802 | gamma_best,sigma_best,ll_best 803 | gamma estimates, sigma estimates and corresponding log-likelihood 
804 | """ 805 | def u(s): 806 | return 1 + np.log(s).mean() 807 | 808 | def v(s): 809 | return np.mean(1/s) 810 | 811 | def w(Y,t): 812 | s = 1+t*Y 813 | us = u(s) 814 | vs = v(s) 815 | return us*vs-1 816 | 817 | def jac_w(Y,t): 818 | s = 1+t*Y 819 | us = u(s) 820 | vs = v(s) 821 | jac_us = (1/t)*(1-vs) 822 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 823 | return us*jac_vs+vs*jac_us 824 | 825 | 826 | Ym = self.peaks[side].min() 827 | YM = self.peaks[side].max() 828 | Ymean = self.peaks[side].mean() 829 | 830 | 831 | a = -1/YM 832 | if abs(a)<2*epsilon: 833 | epsilon = abs(a)/n_points 834 | 835 | a = a + epsilon 836 | b = 2*(Ymean-Ym)/(Ymean*Ym) 837 | c = 2*(Ymean-Ym)/(Ym**2) 838 | 839 | # We look for possible roots 840 | left_zeros = biSPOT._rootsFinder(lambda t: w(self.peaks[side],t), 841 | lambda t: jac_w(self.peaks[side],t), 842 | (a+epsilon,-epsilon), 843 | n_points,'regular') 844 | 845 | right_zeros = biSPOT._rootsFinder(lambda t: w(self.peaks[side],t), 846 | lambda t: jac_w(self.peaks[side],t), 847 | (b,c), 848 | n_points,'regular') 849 | 850 | # all the possible roots 851 | zeros = np.concatenate((left_zeros,right_zeros)) 852 | 853 | # 0 is always a solution so we initialize with it 854 | gamma_best = 0 855 | sigma_best = Ymean 856 | ll_best = biSPOT._log_likelihood(self.peaks[side],gamma_best,sigma_best) 857 | 858 | # we look for better candidates 859 | for z in zeros: 860 | gamma = u(1+z*self.peaks[side])-1 861 | sigma = gamma/z 862 | ll = biSPOT._log_likelihood(self.peaks[side],gamma,sigma) 863 | if ll>ll_best: 864 | gamma_best = gamma 865 | sigma_best = sigma 866 | ll_best = ll 867 | 868 | return gamma_best,sigma_best,ll_best 869 | 870 | 871 | 872 | def _quantile(self,side,gamma,sigma): 873 | """ 874 | Compute the quantile at level 1-q for a given side 875 | 876 | Parameters 877 | ---------- 878 | side : str 879 | 'up' or 'down' 880 | gamma : float 881 | GPD parameter 882 | sigma : float 883 | GPD parameter 884 | 885 | Returns 886 | ---------- 887 | float 888 | quantile at level 1-q for the GPD(γ,σ,μ=0) 889 | """ 890 | if side == 'up': 891 | r = self.n * self.proba / self.Nt[side] 892 | if gamma != 0: 893 | return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1) 894 | else: 895 | return self.init_threshold['up'] - sigma*log(r) 896 | elif side == 'down': 897 | r = self.n * self.proba / self.Nt[side] 898 | if gamma != 0: 899 | return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1) 900 | else: 901 | return self.init_threshold['down'] + sigma*log(r) 902 | else: 903 | print('error : the side is not right') 904 | 905 | 906 | def run(self, with_alarm = True): 907 | """ 908 | Run biSPOT on the stream 909 | 910 | Parameters 911 | ---------- 912 | with_alarm : bool 913 | (default = True) If False, SPOT will adapt the threshold assuming \ 914 | there is no abnormal values 915 | 916 | 917 | Returns 918 | ---------- 919 | dict 920 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 921 | 922 | '***-thresholds' contains the extreme quantiles and 'alarms' contains \ 923 | the indexes of the values which have triggered alarms 924 | 925 | """ 926 | if (self.n>self.init_data.size): 927 | print('Warning : the algorithm seems to have already been run, you \ 928 | should initialize before running again') 929 | return {} 930 | 931 | # list of the thresholds 932 | thup = [] 933 | thdown = [] 934 | alarm = [] 935 | # Loop over the stream 936 | for i in tqdm.tqdm(range(self.data.size)): 937 | 938 | # If the observed value exceeds the current threshold (alarm case) 939 | if 
self.data[i]>self.extreme_quantile['up'] : 940 | # if we want to alarm, we put it in the alarm list 941 | if with_alarm: 942 | alarm.append(i) 943 | # otherwise we add it in the peaks 944 | else: 945 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 946 | self.Nt['up'] += 1 947 | self.n += 1 948 | # and we update the thresholds 949 | 950 | g,s,l = self._grimshaw('up') 951 | self.extreme_quantile['up'] = self._quantile('up',g,s) 952 | 953 | # case where the value exceeds the initial threshold but not the alarm ones 954 | elif self.data[i]>self.init_threshold['up']: 955 | # we add it in the peaks 956 | self.peaks['up'] = np.append(self.peaks['up'],self.data[i]-self.init_threshold['up']) 957 | self.Nt['up'] += 1 958 | self.n += 1 959 | # and we update the thresholds 960 | 961 | g,s,l = self._grimshaw('up') 962 | self.extreme_quantile['up'] = self._quantile('up',g,s) 963 | 964 | elif self.data[i] 0: 1127 | s += 'Algorithm run : Yes\n' 1128 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 1129 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 1130 | else: 1131 | s += '\t number of peaks : %s\n' % self.Nt 1132 | s += '\t extreme quantile : %s\n' % self.extreme_quantile 1133 | s += 'Algorithm run : No\n' 1134 | return s 1135 | 1136 | 1137 | def fit(self,init_data,data): 1138 | """ 1139 | Import data to DSPOT object 1140 | 1141 | Parameters 1142 | ---------- 1143 | init_data : list, numpy.array or pandas.Series 1144 | initial batch to calibrate the algorithm 1145 | 1146 | data : numpy.array 1147 | data for the run (list, np.array or pd.series) 1148 | 1149 | """ 1150 | if isinstance(data,list): 1151 | self.data = np.array(data) 1152 | elif isinstance(data,np.ndarray): 1153 | self.data = data 1154 | elif isinstance(data,pd.Series): 1155 | self.data = data.values 1156 | else: 1157 | print('This data format (%s) is not supported' % type(data)) 1158 | return 1159 | 1160 | if isinstance(init_data,list): 1161 | self.init_data = np.array(init_data) 1162 | elif isinstance(init_data,np.ndarray): 1163 | self.init_data = init_data 1164 | elif isinstance(init_data,pd.Series): 1165 | self.init_data = init_data.values 1166 | elif isinstance(init_data,int): 1167 | self.init_data = self.data[:init_data] 1168 | self.data = self.data[init_data:] 1169 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 1170 | r = int(init_data*data.size) 1171 | self.init_data = self.data[:r] 1172 | self.data = self.data[r:] 1173 | else: 1174 | print('The initial data cannot be set') 1175 | return 1176 | 1177 | def add(self,data): 1178 | """ 1179 | This function allows to append data to the already fitted data 1180 | 1181 | Parameters 1182 | ---------- 1183 | data : list, numpy.array, pandas.Series 1184 | data to append 1185 | """ 1186 | if isinstance(data,list): 1187 | data = np.array(data) 1188 | elif isinstance(data,np.ndarray): 1189 | data = data 1190 | elif isinstance(data,pd.Series): 1191 | data = data.values 1192 | else: 1193 | print('This data format (%s) is not supported' % type(data)) 1194 | return 1195 | 1196 | self.data = np.append(self.data,data) 1197 | return 1198 | 1199 | def initialize(self, verbose = True): 1200 | """ 1201 | Run the calibration (initialization) step 1202 | 1203 | Parameters 1204 | ---------- 1205 | verbose : bool 1206 | (default = True) If True, gives details about the batch initialization 1207 | """ 1208 | n_init = self.init_data.size - self.depth 1209 | 1210 | M = 
backMean(self.init_data,self.depth) 1211 | T = self.init_data[self.depth:]-M[:-1] # new variable 1212 | 1213 | S = np.sort(T) # we sort X to get the empirical quantile 1214 | self.init_threshold = S[int(0.98*n_init)] # t is fixed for the whole algorithm 1215 | 1216 | # initial peaks 1217 | self.peaks = T[T>self.init_threshold]-self.init_threshold 1218 | self.Nt = self.peaks.size 1219 | self.n = n_init 1220 | 1221 | if verbose: 1222 | print('Initial threshold : %s' % self.init_threshold) 1223 | print('Number of peaks : %s' % self.Nt) 1224 | print('Grimshaw maximum log-likelihood estimation ... ', end = '') 1225 | 1226 | g,s,l = self._grimshaw() 1227 | self.extreme_quantile = self._quantile(g,s) 1228 | 1229 | if verbose: 1230 | print('[done]') 1231 | print('\t'+chr(0x03B3) + ' = ' + str(g)) 1232 | print('\t'+chr(0x03C3) + ' = ' + str(s)) 1233 | print('\tL = ' + str(l)) 1234 | print('Extreme quantile (probability = %s): %s' % (self.proba,self.extreme_quantile)) 1235 | 1236 | return 1237 | 1238 | 1239 | 1240 | 1241 | def _rootsFinder(fun,jac,bounds,npoints,method): 1242 | """ 1243 | Find possible roots of a scalar function 1244 | 1245 | Parameters 1246 | ---------- 1247 | fun : function 1248 | scalar function 1249 | jac : function 1250 | first order derivative of the function 1251 | bounds : tuple 1252 | (min,max) interval for the roots search 1253 | npoints : int 1254 | maximum number of roots to output 1255 | method : str 1256 | 'regular' : regular sample of the search interval, 'random' : uniform (distribution) sample of the search interval 1257 | 1258 | Returns 1259 | ---------- 1260 | numpy.array 1261 | possible roots of the function 1262 | """ 1263 | if method == 'regular': 1264 | step = (bounds[1]-bounds[0])/(npoints+1) 1265 | X0 = np.arange(bounds[0]+step,bounds[1],step) 1266 | elif method == 'random': 1267 | X0 = np.random.uniform(bounds[0],bounds[1],npoints) 1268 | 1269 | def objFun(X,f,jac): 1270 | g = 0 1271 | j = np.zeros(X.shape) 1272 | i = 0 1273 | for x in X: 1274 | fx = f(x) 1275 | g = g+fx**2 1276 | j[i] = 2*fx*jac(x) 1277 | i = i+1 1278 | return g,j 1279 | 1280 | opt = minimize(lambda X:objFun(X,fun,jac), X0, 1281 | method='L-BFGS-B', 1282 | jac=True, bounds=[bounds]*len(X0)) 1283 | 1284 | X = opt.x 1285 | np.round(X,decimals = 5) 1286 | return np.unique(X) 1287 | 1288 | 1289 | def _log_likelihood(Y,gamma,sigma): 1290 | """ 1291 | Compute the log-likelihood for the Generalized Pareto Distribution (μ=0) 1292 | 1293 | Parameters 1294 | ---------- 1295 | Y : numpy.array 1296 | observations 1297 | gamma : float 1298 | GPD index parameter 1299 | sigma : float 1300 | GPD scale parameter (>0) 1301 | 1302 | Returns 1303 | ---------- 1304 | float 1305 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 1306 | """ 1307 | n = Y.size 1308 | if gamma != 0: 1309 | tau = gamma/sigma 1310 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 1311 | else: 1312 | L = n * ( 1 + log(Y.mean()) ) 1313 | return L 1314 | 1315 | 1316 | def _grimshaw(self,epsilon = 1e-8, n_points = 10): 1317 | """ 1318 | Compute the GPD parameters estimation with the Grimshaw's trick 1319 | 1320 | Parameters 1321 | ---------- 1322 | epsilon : float 1323 | numerical parameter to perform (default : 1e-8) 1324 | n_points : int 1325 | maximum number of candidates for maximum likelihood (default : 10) 1326 | 1327 | Returns 1328 | ---------- 1329 | gamma_best,sigma_best,ll_best 1330 | gamma estimates, sigma estimates and corresponding log-likelihood 1331 | """ 1332 | def u(s): 1333 | return 1 + 
np.log(s).mean() 1334 | 1335 | def v(s): 1336 | return np.mean(1/s) 1337 | 1338 | def w(Y,t): 1339 | s = 1+t*Y 1340 | us = u(s) 1341 | vs = v(s) 1342 | return us*vs-1 1343 | 1344 | def jac_w(Y,t): 1345 | s = 1+t*Y 1346 | us = u(s) 1347 | vs = v(s) 1348 | jac_us = (1/t)*(1-vs) 1349 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 1350 | return us*jac_vs+vs*jac_us 1351 | 1352 | 1353 | Ym = self.peaks.min() 1354 | YM = self.peaks.max() 1355 | Ymean = self.peaks.mean() 1356 | 1357 | 1358 | a = -1/YM 1359 | if abs(a)<2*epsilon: 1360 | epsilon = abs(a)/n_points 1361 | 1362 | a = a + epsilon 1363 | b = 2*(Ymean-Ym)/(Ymean*Ym) 1364 | c = 2*(Ymean-Ym)/(Ym**2) 1365 | 1366 | # We look for possible roots 1367 | left_zeros = SPOT._rootsFinder(lambda t: w(self.peaks,t), 1368 | lambda t: jac_w(self.peaks,t), 1369 | (a+epsilon,-epsilon), 1370 | n_points,'regular') 1371 | 1372 | right_zeros = SPOT._rootsFinder(lambda t: w(self.peaks,t), 1373 | lambda t: jac_w(self.peaks,t), 1374 | (b,c), 1375 | n_points,'regular') 1376 | 1377 | # all the possible roots 1378 | zeros = np.concatenate((left_zeros,right_zeros)) 1379 | 1380 | # 0 is always a solution so we initialize with it 1381 | gamma_best = 0 1382 | sigma_best = Ymean 1383 | ll_best = SPOT._log_likelihood(self.peaks,gamma_best,sigma_best) 1384 | 1385 | # we look for better candidates 1386 | for z in zeros: 1387 | gamma = u(1+z*self.peaks)-1 1388 | sigma = gamma/z 1389 | ll = dSPOT._log_likelihood(self.peaks,gamma,sigma) 1390 | if ll>ll_best: 1391 | gamma_best = gamma 1392 | sigma_best = sigma 1393 | ll_best = ll 1394 | 1395 | return gamma_best,sigma_best,ll_best 1396 | 1397 | 1398 | 1399 | def _quantile(self,gamma,sigma): 1400 | """ 1401 | Compute the quantile at level 1-q 1402 | 1403 | Parameters 1404 | ---------- 1405 | gamma : float 1406 | GPD parameter 1407 | sigma : float 1408 | GPD parameter 1409 | 1410 | Returns 1411 | ---------- 1412 | float 1413 | quantile at level 1-q for the GPD(γ,σ,μ=0) 1414 | """ 1415 | r = self.n * self.proba / self.Nt 1416 | if gamma != 0: 1417 | return self.init_threshold + (sigma/gamma)*(pow(r,-gamma)-1) 1418 | else: 1419 | return self.init_threshold - sigma*log(r) 1420 | 1421 | 1422 | def run(self, with_alarm = True): 1423 | """ 1424 | Run biSPOT on the stream 1425 | 1426 | Parameters 1427 | ---------- 1428 | with_alarm : bool 1429 | (default = True) If False, SPOT will adapt the threshold assuming \ 1430 | there is no abnormal values 1431 | 1432 | 1433 | Returns 1434 | ---------- 1435 | dict 1436 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 1437 | 1438 | '***-thresholds' contains the extreme quantiles and 'alarms' contains \ 1439 | the indexes of the values which have triggered alarms 1440 | 1441 | """ 1442 | if (self.n>self.init_data.size): 1443 | print('Warning : the algorithm seems to have already been run, you \ 1444 | should initialize before running again') 1445 | return {} 1446 | 1447 | # actual normal window 1448 | W = self.init_data[-self.depth:] 1449 | 1450 | # list of the thresholds 1451 | th = [] 1452 | alarm = [] 1453 | # Loop over the stream 1454 | for i in tqdm.tqdm(range(self.data.size)): 1455 | Mi = W.mean() 1456 | # If the observed value exceeds the current threshold (alarm case) 1457 | if (self.data[i]-Mi)>self.extreme_quantile: 1458 | # if we want to alarm, we put it in the alarm list 1459 | if with_alarm: 1460 | alarm.append(i) 1461 | # otherwise we add it in the peaks 1462 | else: 1463 | self.peaks = np.append(self.peaks,self.data[i]-Mi-self.init_threshold) 1464 | self.Nt += 1 1465 | self.n += 
1 1466 | # and we update the thresholds 1467 | 1468 | g,s,l = self._grimshaw() 1469 | self.extreme_quantile = self._quantile(g,s) #+ Mi 1470 | W = np.append(W[1:],self.data[i]) 1471 | 1472 | # case where the value exceeds the initial threshold but not the alarm ones 1473 | elif (self.data[i]-Mi)>self.init_threshold: 1474 | # we add it in the peaks 1475 | self.peaks = np.append(self.peaks,self.data[i]-Mi-self.init_threshold) 1476 | self.Nt += 1 1477 | self.n += 1 1478 | # and we update the thresholds 1479 | 1480 | g,s,l = self._grimshaw() 1481 | self.extreme_quantile = self._quantile(g,s) #+ Mi 1482 | W = np.append(W[1:],self.data[i]) 1483 | else: 1484 | self.n += 1 1485 | W = np.append(W[1:],self.data[i]) 1486 | 1487 | 1488 | th.append(self.extreme_quantile+Mi) # thresholds record 1489 | 1490 | return {'thresholds' : th, 'alarms': alarm} 1491 | 1492 | 1493 | def plot(self,run_results, with_alarm = True): 1494 | """ 1495 | Plot the results given by the run 1496 | 1497 | Parameters 1498 | ---------- 1499 | run_results : dict 1500 | results given by the 'run' method 1501 | with_alarm : bool 1502 | (default = True) If True, alarms are plotted. 1503 | 1504 | 1505 | Returns 1506 | ---------- 1507 | list 1508 | list of the plots 1509 | 1510 | """ 1511 | x = range(self.data.size) 1512 | K = run_results.keys() 1513 | 1514 | ts_fig, = plt.plot(x,self.data,color=air_force_blue) 1515 | fig = [ts_fig] 1516 | 1517 | # if 'upper_thresholds' in K: 1518 | # thup = run_results['upper_thresholds'] 1519 | # uth_fig, = plt.plot(x,thup,color=deep_saffron,lw=2,ls='dashed') 1520 | # fig.append(uth_fig) 1521 | # 1522 | # if 'lower_thresholds' in K: 1523 | # thdown = run_results['lower_thresholds'] 1524 | # lth_fig, = plt.plot(x,thdown,color=deep_saffron,lw=2,ls='dashed') 1525 | # fig.append(lth_fig) 1526 | 1527 | if 'thresholds' in K: 1528 | th = run_results['thresholds'] 1529 | th_fig, = plt.plot(x,th,color=deep_saffron,lw=2,ls='dashed') 1530 | fig.append(th_fig) 1531 | 1532 | if with_alarm and ('alarms' in K): 1533 | alarm = run_results['alarms'] 1534 | if len(alarm)>0: 1535 | plt.scatter(alarm,self.data[alarm],color='red') 1536 | 1537 | plt.xlim((0,self.data.size)) 1538 | 1539 | 1540 | return fig 1541 | 1542 | 1543 | 1544 | 1545 | 1546 | 1547 | 1548 | """ 1549 | =========================== DRIFT & DOUBLE BOUNDS ============================= 1550 | """ 1551 | 1552 | 1553 | 1554 | class bidSPOT: 1555 | """ 1556 | This class allows to run DSPOT algorithm on univariate dataset (upper and lower bounds) 1557 | 1558 | Attributes 1559 | ---------- 1560 | proba : float 1561 | Detection level (risk), chosen by the user 1562 | 1563 | depth : int 1564 | Number of observations to compute the moving average 1565 | 1566 | extreme_quantile : float 1567 | current threshold (bound between normal and abnormal events) 1568 | 1569 | data : numpy.array 1570 | stream 1571 | 1572 | init_data : numpy.array 1573 | initial batch of observations (for the calibration/initialization step) 1574 | 1575 | init_threshold : float 1576 | initial threshold computed during the calibration step 1577 | 1578 | peaks : numpy.array 1579 | array of peaks (excesses above the initial threshold) 1580 | 1581 | n : int 1582 | number of observed values 1583 | 1584 | Nt : int 1585 | number of observed peaks 1586 | """ 1587 | def __init__(self, q = 1e-4, depth = 10): 1588 | self.proba = q 1589 | self.data = None 1590 | self.init_data = None 1591 | self.n = 0 1592 | self.depth = depth 1593 | 1594 | nonedict = {'up':None,'down':None} 1595 | 1596 | 
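# bidSPOT keeps one copy of every quantity per tail: the 'up' and 'down' entries below
# hold separate initial thresholds, peak sets, GPD parameters and extreme quantiles.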
self.extreme_quantile = dict.copy(nonedict) 1597 | self.init_threshold = dict.copy(nonedict) 1598 | self.peaks = dict.copy(nonedict) 1599 | self.gamma = dict.copy(nonedict) 1600 | self.sigma = dict.copy(nonedict) 1601 | self.Nt = {'up':0,'down':0} 1602 | 1603 | 1604 | def __str__(self): 1605 | s = '' 1606 | s += 'Streaming Peaks-Over-Threshold Object\n' 1607 | s += 'Detection level q = %s\n' % self.proba 1608 | if self.data is not None: 1609 | s += 'Data imported : Yes\n' 1610 | s += '\t initialization : %s values\n' % self.init_data.size 1611 | s += '\t stream : %s values\n' % self.data.size 1612 | else: 1613 | s += 'Data imported : No\n' 1614 | return s 1615 | 1616 | if self.n == 0: 1617 | s += 'Algorithm initialized : No\n' 1618 | else: 1619 | s += 'Algorithm initialized : Yes\n' 1620 | s += '\t initial threshold : %s\n' % self.init_threshold 1621 | 1622 | r = self.n-self.init_data.size 1623 | if r > 0: 1624 | s += 'Algorithm run : Yes\n' 1625 | s += '\t number of observations : %s (%.2f %%)\n' % (r,100*r/self.n) 1626 | s += '\t triggered alarms : %s (%.2f %%)\n' % (len(self.alarm),100*len(self.alarm)/self.n) 1627 | else: 1628 | s += '\t number of peaks : %s\n' % self.Nt 1629 | s += '\t upper extreme quantile : %s\n' % self.extreme_quantile['up'] 1630 | s += '\t lower extreme quantile : %s\n' % self.extreme_quantile['down'] 1631 | s += 'Algorithm run : No\n' 1632 | return s 1633 | 1634 | 1635 | def fit(self,init_data,data): 1636 | """ 1637 | Import data to biDSPOT object 1638 | 1639 | Parameters 1640 | ---------- 1641 | init_data : list, numpy.array or pandas.Series 1642 | initial batch to calibrate the algorithm 1643 | 1644 | data : numpy.array 1645 | data for the run (list, np.array or pd.series) 1646 | 1647 | """ 1648 | if isinstance(data,list): 1649 | self.data = np.array(data) 1650 | elif isinstance(data,np.ndarray): 1651 | self.data = data 1652 | elif isinstance(data,pd.Series): 1653 | self.data = data.values 1654 | else: 1655 | print('This data format (%s) is not supported' % type(data)) 1656 | return 1657 | 1658 | if isinstance(init_data,list): 1659 | self.init_data = np.array(init_data) 1660 | elif isinstance(init_data,np.ndarray): 1661 | self.init_data = init_data 1662 | elif isinstance(init_data,pd.Series): 1663 | self.init_data = init_data.values 1664 | elif isinstance(init_data,int): 1665 | self.init_data = self.data[:init_data] 1666 | self.data = self.data[init_data:] 1667 | elif isinstance(init_data,float) & (init_data<1) & (init_data>0): 1668 | r = int(init_data*data.size) 1669 | self.init_data = self.data[:r] 1670 | self.data = self.data[r:] 1671 | else: 1672 | print('The initial data cannot be set') 1673 | return 1674 | 1675 | def add(self,data): 1676 | """ 1677 | This function allows to append data to the already fitted data 1678 | 1679 | Parameters 1680 | ---------- 1681 | data : list, numpy.array, pandas.Series 1682 | data to append 1683 | """ 1684 | if isinstance(data,list): 1685 | data = np.array(data) 1686 | elif isinstance(data,np.ndarray): 1687 | data = data 1688 | elif isinstance(data,pd.Series): 1689 | data = data.values 1690 | else: 1691 | print('This data format (%s) is not supported' % type(data)) 1692 | return 1693 | 1694 | self.data = np.append(self.data,data) 1695 | return 1696 | 1697 | def initialize(self, verbose = True): 1698 | """ 1699 | Run the calibration (initialization) step 1700 | 1701 | Parameters 1702 | ---------- 1703 | verbose : bool 1704 | (default = True) If True, gives details about the batch initialization 1705 | """ 1706 | n_init = 
self.init_data.size - self.depth 1707 | 1708 | M = backMean(self.init_data,self.depth) 1709 | T = self.init_data[self.depth:]-M[:-1] # new variable 1710 | 1711 | S = np.sort(T) # we sort T to get the empirical quantile 1712 | self.init_threshold['up'] = S[int(0.98*n_init)] # t is fixed for the whole algorithm 1713 | self.init_threshold['down'] = S[int(0.02*n_init)] # t is fixed for the whole algorithm 1714 | 1715 | # initial peaks 1716 | self.peaks['up'] = T[T>self.init_threshold['up']]-self.init_threshold['up'] 1717 | self.peaks['down'] = -( T[ T0) 1810 | 1811 | Returns 1812 | ---------- 1813 | float 1814 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 1815 | """ 1816 | n = Y.size 1817 | if gamma != 0: 1818 | tau = gamma/sigma 1819 | L = -n * log(sigma) - ( 1 + (1/gamma) ) * ( np.log(1+tau*Y) ).sum() 1820 | else: 1821 | L = n * ( 1 + log(Y.mean()) ) 1822 | return L 1823 | 1824 | 1825 | def _grimshaw(self,side,epsilon = 1e-8, n_points = 8): 1826 | """ 1827 | Compute the GPD parameters estimation with the Grimshaw's trick 1828 | 1829 | Parameters 1830 | ---------- 1831 | epsilon : float 1832 | numerical parameter to perform (default : 1e-8) 1833 | n_points : int 1834 | maximum number of candidates for maximum likelihood (default : 10) 1835 | 1836 | Returns 1837 | ---------- 1838 | gamma_best,sigma_best,ll_best 1839 | gamma estimates, sigma estimates and corresponding log-likelihood 1840 | """ 1841 | def u(s): 1842 | return 1 + np.log(s).mean() 1843 | 1844 | def v(s): 1845 | return np.mean(1/s) 1846 | 1847 | def w(Y,t): 1848 | s = 1+t*Y 1849 | us = u(s) 1850 | vs = v(s) 1851 | return us*vs-1 1852 | 1853 | def jac_w(Y,t): 1854 | s = 1+t*Y 1855 | us = u(s) 1856 | vs = v(s) 1857 | jac_us = (1/t)*(1-vs) 1858 | jac_vs = (1/t)*(-vs+np.mean(1/s**2)) 1859 | return us*jac_vs+vs*jac_us 1860 | 1861 | 1862 | Ym = self.peaks[side].min() 1863 | YM = self.peaks[side].max() 1864 | Ymean = self.peaks[side].mean() 1865 | 1866 | 1867 | a = -1/YM 1868 | if abs(a)<2*epsilon: 1869 | epsilon = abs(a)/n_points 1870 | 1871 | a = a + epsilon 1872 | b = 2*(Ymean-Ym)/(Ymean*Ym) 1873 | c = 2*(Ymean-Ym)/(Ym**2) 1874 | 1875 | # We look for possible roots 1876 | left_zeros = bidSPOT._rootsFinder(lambda t: w(self.peaks[side],t), 1877 | lambda t: jac_w(self.peaks[side],t), 1878 | (a+epsilon,-epsilon), 1879 | n_points,'regular') 1880 | 1881 | right_zeros = bidSPOT._rootsFinder(lambda t: w(self.peaks[side],t), 1882 | lambda t: jac_w(self.peaks[side],t), 1883 | (b,c), 1884 | n_points,'regular') 1885 | 1886 | # all the possible roots 1887 | zeros = np.concatenate((left_zeros,right_zeros)) 1888 | 1889 | # 0 is always a solution so we initialize with it 1890 | gamma_best = 0 1891 | sigma_best = Ymean 1892 | ll_best = bidSPOT._log_likelihood(self.peaks[side],gamma_best,sigma_best) 1893 | 1894 | # we look for better candidates 1895 | for z in zeros: 1896 | gamma = u(1+z*self.peaks[side])-1 1897 | sigma = gamma/z 1898 | ll = bidSPOT._log_likelihood(self.peaks[side],gamma,sigma) 1899 | if ll>ll_best: 1900 | gamma_best = gamma 1901 | sigma_best = sigma 1902 | ll_best = ll 1903 | 1904 | return gamma_best,sigma_best,ll_best 1905 | 1906 | 1907 | 1908 | def _quantile(self,side,gamma,sigma): 1909 | """ 1910 | Compute the quantile at level 1-q for a given side 1911 | 1912 | Parameters 1913 | ---------- 1914 | side : str 1915 | 'up' or 'down' 1916 | gamma : float 1917 | GPD parameter 1918 | sigma : float 1919 | GPD parameter 1920 | 1921 | Returns 1922 | ---------- 1923 | float 1924 | quantile at level 1-q for the GPD(γ,σ,μ=0) 
1925 | """ 1926 | if side == 'up': 1927 | r = self.n * self.proba / self.Nt[side] 1928 | if gamma != 0: 1929 | return self.init_threshold['up'] + (sigma/gamma)*(pow(r,-gamma)-1) 1930 | else: 1931 | return self.init_threshold['up'] - sigma*log(r) 1932 | elif side == 'down': 1933 | r = self.n * self.proba / self.Nt[side] 1934 | if gamma != 0: 1935 | return self.init_threshold['down'] - (sigma/gamma)*(pow(r,-gamma)-1) 1936 | else: 1937 | return self.init_threshold['down'] + sigma*log(r) 1938 | else: 1939 | print('error : the side is not right') 1940 | 1941 | 1942 | def run(self, with_alarm = True, plot = True): 1943 | """ 1944 | Run biDSPOT on the stream 1945 | 1946 | Parameters 1947 | ---------- 1948 | with_alarm : bool 1949 | (default = True) If False, SPOT will adapt the threshold assuming \ 1950 | there is no abnormal values 1951 | 1952 | 1953 | Returns 1954 | ---------- 1955 | dict 1956 | keys : 'upper_thresholds', 'lower_thresholds' and 'alarms' 1957 | 1958 | '***-thresholds' contains the extreme quantiles and 'alarms' contains \ 1959 | the indexes of the values which have triggered alarms 1960 | 1961 | """ 1962 | if (self.n>self.init_data.size): 1963 | print('Warning : the algorithm seems to have already been run, you \ 1964 | should initialize before running again') 1965 | return {} 1966 | 1967 | # actual normal window 1968 | W = self.init_data[-self.depth:] 1969 | 1970 | # list of the thresholds 1971 | thup = [] 1972 | thdown = [] 1973 | alarm = [] 1974 | # Loop over the stream 1975 | for i in tqdm.tqdm(range(self.data.size)): 1976 | Mi = W.mean() 1977 | Ni = self.data[i]-Mi 1978 | # If the observed value exceeds the current threshold (alarm case) 1979 | if Ni>self.extreme_quantile['up'] : 1980 | # if we want to alarm, we put it in the alarm list 1981 | if with_alarm: 1982 | alarm.append(i) 1983 | # otherwise we add it in the peaks 1984 | else: 1985 | self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up']) 1986 | self.Nt['up'] += 1 1987 | self.n += 1 1988 | # and we update the thresholds 1989 | 1990 | g,s,l = self._grimshaw('up') 1991 | self.extreme_quantile['up'] = self._quantile('up',g,s) 1992 | W = np.append(W[1:],self.data[i]) 1993 | 1994 | # case where the value exceeds the initial threshold but not the alarm ones 1995 | elif Ni>self.init_threshold['up']: 1996 | # we add it in the peaks 1997 | self.peaks['up'] = np.append(self.peaks['up'],Ni-self.init_threshold['up']) 1998 | self.Nt['up'] += 1 1999 | self.n += 1 2000 | # and we update the thresholds 2001 | g,s,l = self._grimshaw('up') 2002 | self.extreme_quantile['up'] = self._quantile('up',g,s) 2003 | W = np.append(W[1:],self.data[i]) 2004 | 2005 | elif Ni0: 2079 | al_fig = plt.scatter(alarm,self.data[alarm],color='red') 2080 | fig.append(al_fig) 2081 | 2082 | plt.xlim((0,self.data.size)) 2083 | 2084 | 2085 | return fig 2086 | 2087 | 2088 | 2089 | 2090 | 2091 | 2092 | 2093 | 2094 | 2095 | --------------------------------------------------------------------------------