├── requirements.txt
├── README.md
├── rocka
│   ├── __init__.py
│   ├── baseline_extraction.py
│   ├── shape_based_distance.py
│   ├── density_esitmation.py
│   ├── model.py
│   └── preprocessing_rocka.py
├── LICENSE
├── .gitignore
└── setup.py
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy >= 1.12.1
scikit-learn >= 0.21.3
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Rocka
An unsupervised clustering algorithm named ROCKA.

Reference:
Li Z, Zhao Y, Liu R, et al. Robust and rapid clustering of KPIs for large-scale anomaly detection[C]//2018 IEEE/ACM 26th International Symposium on Quality of Service (IWQoS). IEEE, 2018: 1-10.
--------------------------------------------------------------------------------
/rocka/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 29 17:15:23 2019

@author: PB
"""

__version__ = '0.1'

from . import preprocessing_rocka
from . import baseline_extraction
from . import shape_based_distance
from . import density_esitmation
from . import model
from .model import Rocka

__all__ = ['Rocka']
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 polarbear1992

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""
Rocka
-----

Rocka is an unsupervised clustering algorithm (ROCKA) for grouping large
numbers of KPI time series by shape similarity.
"""
import ast
import codecs
import os
import re
import sys
from setuptools import setup, find_packages


_version_re = re.compile(r'__version__\s+=\s+(.*)')
_source_dir = os.path.split(os.path.abspath(__file__))[0]

if sys.version_info[0] == 2:
    def read_file(path):
        with open(path, 'rb') as f:
            return f.read()
else:
    def read_file(path):
        with codecs.open(path, 'rb', 'utf-8') as f:
            return f.read()

version = str(ast.literal_eval(_version_re.search(
    read_file(os.path.join(_source_dir, 'rocka/__init__.py'))).group(1)))

requirements_list = list(filter(
    lambda v: v and not v.startswith('#'),
    (s.strip() for s in read_file(
        os.path.join(_source_dir, 'requirements.txt')).split('\n'))
))
dependency_links = [s for s in requirements_list if s.startswith('git+')]
install_requires = [s for s in requirements_list if not s.startswith('git+')]


setup(
    name='Rocka',
    version=version,
    url='https://github.com/polarbear1992/Rocka/',
    license='MIT',
    author='Polar Bear',
    author_email='wqy919@yeah.net',
    description='an unsupervised clustering algorithm named ROCKA',
    long_description=__doc__,
    packages=find_packages('.', include=['rocka', 'rocka.*']),
    zip_safe=False,
    platforms='any',
    setup_requires=['setuptools'],
    install_requires=install_requires,
    dependency_links=dependency_links,
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: Software Development :: Libraries :: Python Modules'
    ]
)
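--------------------------------------------------------------------------------
How the modules below are meant to fit together (preprocess, extract baselines,
compute pairwise SBDs, estimate the density radius, then cluster with DBSCAN)
is easiest to see end to end. The sketch below is only an illustration pieced
together from the function signatures in this package: the synthetic KPIs, the
window size `w=5`, `minPts=4` and the radius-search thresholds are arbitrary
placeholder values, not recommendations from the ROCKA paper.

import numpy as np
from rocka.preprocessing_rocka import linear_interpolation, standardize_obj
from rocka.baseline_extraction import smoothing_extreme_values, extract_baseline
from rocka.shape_based_distance import SBD
from rocka.density_esitmation import density_radius
from rocka.model import Rocka

# synthetic stand-in for real monitoring data: 10 KPIs sampled every 60 s,
# five sine-shaped and five square-wave-shaped
rng = np.random.RandomState(0)
timestamp = np.arange(0, 60 * 1440, 60, dtype=np.int64)
t = np.linspace(0, 6 * np.pi, 1440)
raw_kpis = [np.sin(t) + 0.1 * rng.randn(1440) for _ in range(5)] + \
           [np.sign(np.sin(t)) + 0.1 * rng.randn(1440) for _ in range(5)]

baselines = []
for values in raw_kpis:
    _, _, (filled,) = linear_interpolation(timestamp, [values])  # fill gaps
    standardized, _, _ = standardize_obj(filled)                 # zero mean, unit variance
    smoothed = smoothing_extreme_values(standardized)            # remove extreme points
    (baseline, _, _), _ = extract_baseline(smoothed, w=5)        # moving-average baseline
    baselines.append(baseline)

sbd_matrix, knn_sbd = SBD(baselines, minPts=4)       # pairwise shape-based distances
eps = density_radius(knn_sbd, len_thresh=2, max_radius=1.0,
                     slope_thresh=0.1, slope_diff_thresh=0.1)
model = Rocka(sbd_matrix, eps, minPts=4).fit(sbd_matrix, eps, minPts=4)
print(model.labels_)   # ideally two clusters: the sine KPIs and the square-wave KPIs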
--------------------------------------------------------------------------------
/rocka/baseline_extraction.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 28 10:04:57 2019

@author: PB
"""
import numpy as np

from .preprocessing_rocka import standardize_obj

__all__ = ['smoothing_extreme_values', 'extract_baseline']


def smoothing_extreme_values(values):
    """
    In general, the ratio of anomaly points in a time series is less than 5%[1].
    As such, simply remove the top 5% of the data which deviate the most from
    the mean value, and use linear interpolation to fill them.

    Args:
        values (np.ndarray): a time series which has been preprocessed by
            linear interpolation and standardization (zero mean and unit
            variance).

    Returns:
        np.ndarray: the smoothed `values`.
    """
    values = np.asarray(values, np.float32)
    if len(values.shape) != 1:
        raise ValueError('`values` must be a 1-D array')

    # deviation of each point from the (zero) mean
    values_deviation = np.abs(values)

    # the portion of points treated as abnormal
    abnormal_portion = 0.05

    # points whose deviation falls in the top 5% are treated as abnormal and
    # replaced by linear interpolation over the remaining (normal) points
    deviation_thresh = np.percentile(values_deviation, 100 * (1 - abnormal_portion))
    abnormal_index = np.argwhere(values_deviation >= deviation_thresh)
    abnormal = abnormal_index.reshape(len(abnormal_index))
    normal_index = np.argwhere(values_deviation < deviation_thresh)
    normal = normal_index.reshape(len(normal_index))
    normal_values = values[normal]
    abnormal_values = np.interp(abnormal, normal, normal_values)
    values[abnormal] = abnormal_values

    return values


def extract_baseline(values, w):
    """
    A simple but effective method for removing noise is to apply a moving
    average with a small sliding window (`w`) on the KPI (`values`), separating
    its curve into two parts: baseline and residuals.
    For a KPI T with a sliding window of length `w` and stride 1, each point
    x(t) is mapped to a point on the baseline, denoted x(t)*, which is the
    mean of the vector (x(t-w+1), ..., x(t)).
    The difference between x(t) and x(t)* is called a residual.

    Args:
        values (np.ndarray): time series after preprocessing and smoothing.
        w (np.int32): length of the sliding window.

    Returns:
        tuple(tuple(np.ndarray, np.float32, np.float32), np.ndarray):
            tuple: the standardized baseline of the raw data, together with
                the mean and std used for standardization (the output of
                `standardize_obj`);
            np.ndarray: the residuals between the raw data and the baseline.
    """
    # moving average to get the baseline
    baseline = np.convolve(values, np.ones((w,)) / w, mode='valid')
    # residuals: the difference between the raw series and the baseline
    residuals = values[w-1:] - baseline

    return standardize_obj(baseline), residuals
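--------------------------------------------------------------------------------
A quick way to see what the two helpers above do is to run them on a synthetic
series; the sine wave, the injected spike and the window size below are
arbitrary choices made only for illustration.

import numpy as np
from rocka.baseline_extraction import smoothing_extreme_values, extract_baseline

rng = np.random.RandomState(0)
series = np.sin(np.linspace(0, 20 * np.pi, 1440)) + 0.1 * rng.randn(1440)
series[100] += 8.0                                  # inject an obvious anomaly
series = (series - series.mean()) / series.std()    # standardize first

smoothed = smoothing_extreme_values(series)
(baseline, mean, std), residuals = extract_baseline(smoothed, w=10)

print(abs(smoothed[100]) < abs(series[100]))   # True: the spike was interpolated away
print(baseline.shape, residuals.shape)         # both have len(series) - w + 1 points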
--------------------------------------------------------------------------------
/rocka/shape_based_distance.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 28 15:28:32 2019

@author: PB
"""

import numpy as np

__all__ = ['sbd_ele', 'SBD']


def sbd_ele(values1, values2):
    """
    Given two time series `values1` and `values2`, cross-correlation slides
    `values2` against `values1` and computes the inner product for each shift
    `s`, where `s` ∈ [-len(values2) + 1, len(values1) - 1].
    SBD is based on this cross-correlation. SBD ranges from 0 to 2, where 0
    means the two time series have exactly the same shape. A smaller SBD means
    higher shape similarity.

    Args:
        values1 (np.ndarray): time series 1
        values2 (np.ndarray): time series 2

    Returns:
        np.float32: the SBD between `values1` and `values2`
    """
    # the 2-norms used for normalization
    l2_values1 = np.linalg.norm(values1)
    l2_values2 = np.linalg.norm(values2)
    # cross-correlation for every shift `s` (np.correlate, unlike np.convolve,
    # does not reverse the second sequence)
    cross_corre = np.correlate(values1, values2, mode='full')

    # SBD = 1 - NCC; clip tiny negative values caused by floating-point error
    return max(0.0, 1 - np.max(cross_corre) / (l2_values1 * l2_values2))


def SBD(values_list, minPts=4):
    """
    Calculate the shape based distance (SBD) between any two time series as a
    similarity measure. SBD is used by DBSCAN for clustering.
    The main idea of DBSCAN is to find cores in dense regions, and then expand
    the cores by transitivity of similarity to form clusters.

    Args:
        values_list (List[np.ndarray]): a list of all time series; the lengths
            of different time series may differ.
        minPts (np.int32): the core `p` in DBSCAN is defined as an object that
            has at least `minPts` objects within a distance of ϵ from it
            (excluding `p`). The default value of `minPts` is 4.

    Returns:
        tuple(np.ndarray, np.ndarray):
            np.ndarray: the pairwise SBD matrix of shape
                (len(values_list), len(values_list));
            np.ndarray: for each time series, the SBD to its minPts-th nearest
                neighbor (excluding itself).
    """
    if len(values_list) <= minPts:
        raise ValueError('`values_list` must contain more than %d time series'
                         % minPts)
    if len(values_list[0].shape) != 1:
        raise ValueError('`values` must be a 1-D array')
    if (type(minPts) is not int) or (minPts < 1):
        raise ValueError('`minPts` must be a positive integer')

    # calculate the SBD between every pair of time series
    sbd_matrix = np.zeros((len(values_list), len(values_list)))
    for i in range(len(values_list)):
        for j in range(i, len(values_list)):
            sbd_matrix[i][j] = sbd_ele(values_list[i], values_list[j])
            sbd_matrix[j][i] = sbd_matrix[i][j]

    # for each time series, the minPts-th nearest SBD (excluding itself)
    ret_sbd = np.zeros(len(values_list))
    for i in range(len(values_list)):
        src_index = np.argsort(sbd_matrix[i])
        ret_sbd[i] = sbd_matrix[i][src_index][minPts]

    return sbd_matrix, ret_sbd
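--------------------------------------------------------------------------------
A small sanity check of the distance defined above: a series compared with an
amplitude-scaled copy of itself should give an SBD close to zero, while an
unrelated noise series should sit noticeably farther away. The data here is
made up for illustration only.

import numpy as np
from rocka.shape_based_distance import sbd_ele, SBD

rng = np.random.RandomState(1)
base = np.sin(np.linspace(0, 8 * np.pi, 500))
same_shape = 3.0 * base                      # amplitude scaling keeps the shape
noise = rng.randn(500)

print(round(sbd_ele(base, same_shape), 4))   # ~0.0
print(round(sbd_ele(base, noise), 4))        # noticeably larger

series_list = [base, same_shape, noise, rng.randn(500), rng.randn(500)]
sbd_matrix, knn_sbd = SBD(series_list, minPts=4)
print(sbd_matrix.shape, knn_sbd.shape)       # (5, 5) and (5,)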
--------------------------------------------------------------------------------
/rocka/density_esitmation.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 28 18:05:24 2019

@author: PB
"""

import numpy as np

__all__ = ['density_radius']


def density_radius(sbd_arr, len_thresh, max_radius, slope_thresh, slope_diff_thresh):
    """
    Given the K-Nearest-Neighbor SBD of each sample, calculate the density
    radius (the DBSCAN `eps`) for clustering.

    Args:
        sbd_arr (np.ndarray): array of the K-Nearest-Neighbor SBD of each
            sample.
        len_thresh (np.int32): the minimum length of the SBD segment searched
            for a candidate radius.
        max_radius (np.float32): candidate radii are no larger than
            `max_radius`.
        slope_thresh (np.float32): the slopes on the left and right of a
            candidate point must be no larger than `slope_thresh`.
        slope_diff_thresh (np.float32): the difference between the left slope
            and the right slope of a candidate point must be no larger than
            `slope_diff_thresh`.

    Returns:
        np.float32: the final density radius, i.e. the largest value among all
            candidate radii.
    """
    # sort the KNN-SBDs in descending order
    src_index = np.argsort(sbd_arr)
    sbd_arr_sorted = sbd_arr[src_index][::-1]
    # only search among the SBDs that are no larger than `max_radius`
    candidates_index = np.argwhere(sbd_arr_sorted <= max_radius)
    if len(candidates_index) == 0:
        raise ValueError('No SBD value is smaller than `max_radius`.')
    start = np.min(candidates_index)
    end = len(sbd_arr_sorted)

    def find_candidate_radius(sbd_arr_sorted, start, end, candidates):
        """
        Given the reverse-sorted K-Nearest-Neighbor SBDs, find candidate radii
        with a divide and conquer strategy: locate the flattest "knee" point
        in [`start`, `end`), record it, then recurse on both sides.

        Args:
            sbd_arr_sorted (np.ndarray): reverse-sorted array of the
                K-Nearest-Neighbor SBD of each sample.
            start (np.int32): the begin index of the target SBDs.
            end (np.int32): the end index (exclusive) of the target SBDs.
            candidates (list): the indexes of all candidate radii found so
                far; new candidates are appended in place.
        """
        if end - start <= len_thresh:
            return
        radius, diff = -1, 2
        for i in range(start + 1, end - 1):
            leftslope = (sbd_arr_sorted[i] - sbd_arr_sorted[start]) / (i - start)
            rightslope = (sbd_arr_sorted[end-1] - sbd_arr_sorted[i]) / (end - 1 - i)

            if leftslope > slope_thresh or rightslope > slope_thresh:
                continue
            if np.abs(leftslope - rightslope) < diff:
                diff = np.abs(leftslope - rightslope)
                radius = i
        if radius == -1:
            # no point in this segment satisfies the slope constraints
            return
        if diff < slope_diff_thresh:
            candidates.append(radius)
        find_candidate_radius(sbd_arr_sorted, start, radius, candidates)
        find_candidate_radius(sbd_arr_sorted, radius + 1, end, candidates)

    candidates = []
    find_candidate_radius(sbd_arr_sorted, start, end, candidates)
    if candidates:
        # the final density radius is the largest SBD among all candidates
        return np.max(sbd_arr_sorted[candidates])
    else:
        raise ValueError('There is no qualified density radius.')
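--------------------------------------------------------------------------------
A toy call showing the expected inputs: a KNN-SBD array (normally the second
output of `SBD`) plus the four search thresholds. Both the curve and the
threshold values below are arbitrary and would need tuning on real KPI data.

import numpy as np
from rocka.density_esitmation import density_radius

# a descending-looking KNN-SBD distribution: a few far-away series, many close ones
knn_sbd = np.concatenate([np.linspace(0.9, 0.1, 20), np.full(80, 0.05)])

eps = density_radius(knn_sbd,
                     len_thresh=5,            # stop splitting segments shorter than this
                     max_radius=0.5,          # ignore SBDs above 0.5 during the search
                     slope_thresh=0.1,        # per-side slope limit for a candidate knee
                     slope_diff_thresh=0.05)  # max allowed left/right slope difference
print(eps)   # the largest SBD among all candidate knee points becomes the DBSCAN eps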
--------------------------------------------------------------------------------
/rocka/model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 29 15:26:44 2019

@author: icebearwang
"""

from sklearn.cluster import DBSCAN


class Rocka(object):
    """
    DBSCAN-based clustering of KPI time series, using a precomputed
    shape-based-distance (SBD) matrix and an estimated density radius.
    """
    def __init__(self, sbd_matrix, density_radius, minPts, metric_rocka='precomputed',
                 metric_params_rocka=None, algorithm_rocka='auto', leaf_size_rocka=30,
                 p_rocka=2, sample_weight_rocka=None, n_jobs_rocka=None):
        self.sbd_matrix = sbd_matrix
        self.density_radius = density_radius
        self.minPts = minPts
        self.metric_rocka = metric_rocka
        self.metric_params_rocka = metric_params_rocka
        self.algorithm_rocka = algorithm_rocka
        self.leaf_size_rocka = leaf_size_rocka
        self.p_rocka = p_rocka
        self.sample_weight_rocka = sample_weight_rocka
        self.n_jobs_rocka = n_jobs_rocka

    def fit(self, sbd_matrix, density_radius, minPts, metric_rocka='precomputed',
            metric_params_rocka=None, algorithm_rocka='auto', leaf_size_rocka=30,
            p_rocka=2, sample_weight_rocka=None, n_jobs_rocka=None):
        """
        Perform DBSCAN clustering from a vector array or distance matrix.

        Parameters
        ----------
        sbd_matrix : np.ndarray or sparse (CSR) matrix of shape
            (n_samples, n_features), or array of shape (n_samples, n_samples)
            A feature array, or array of distances between samples if
            ``metric='precomputed'``.

        density_radius : np.float32
            The maximum distance between two samples for one to be considered
            as in the neighborhood of the other. This is not a maximum bound
            on the distances of points within a cluster. This is the most
            important DBSCAN parameter to choose appropriately for your data
            set and distance function.

        minPts : np.int32
            The number of samples (or total weight) in a neighborhood for a
            point to be considered as a core point. This includes the point
            itself.

        metric_rocka : string, or callable
            The metric to use when calculating distance between instances in a
            feature array. If metric is a string or callable, it must be one of
            the options allowed by :func:`sklearn.metrics.pairwise_distances`
            for its metric parameter.
            If metric is "precomputed", X is assumed to be a distance matrix
            and must be square. X may be a sparse matrix, in which case only
            "nonzero" elements may be considered neighbors for DBSCAN.

        metric_params_rocka : dict, optional
            Additional keyword arguments for the metric function.

        algorithm_rocka : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
            The algorithm to be used by the NearestNeighbors module
            to compute pointwise distances and find nearest neighbors.
            See NearestNeighbors module documentation for details.

        leaf_size_rocka : np.int32, optional (default = 30)
            Leaf size passed to BallTree or cKDTree. This can affect the speed
            of the construction and query, as well as the memory required
            to store the tree. The optimal value depends
            on the nature of the problem.

        p_rocka : np.float32, optional
            The power of the Minkowski metric to be used to calculate distance
            between points.

        sample_weight_rocka : np.ndarray, shape (n_samples,), optional
            Weight of each sample, such that a sample with a weight of at
            least ``min_samples`` is by itself a core sample; a sample with
            negative weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.

        n_jobs_rocka : np.int32 or None, optional (default=None)
            The number of parallel jobs to run for neighbors search.
            ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
            context. ``-1`` means using all processors.

        Returns
        -------
        model : sklearn.cluster.DBSCAN
            The fitted DBSCAN estimator. Its attributes include:

            core_sample_indices_ : array, shape = [n_core_samples]
                Indices of core samples.

            components_ : array, shape = [n_core_samples, n_features]
                Copy of each core sample found by training.

            labels_ : array, shape = [n_samples]
                Cluster labels for each point in the dataset given to fit().
                Noisy samples are given the label -1.

        See also
        --------
        DBSCAN
            An estimator interface for this clustering algorithm.
        OPTICS
            A similar estimator interface clustering at multiple values of
            eps. Its implementation is optimized for memory usage.
        """
        # `sample_weight` is a fit-time argument of sklearn's DBSCAN, not a
        # constructor argument
        model = DBSCAN(eps=density_radius, min_samples=minPts, metric=metric_rocka,
                       metric_params=metric_params_rocka, algorithm=algorithm_rocka,
                       leaf_size=leaf_size_rocka, p=p_rocka,
                       n_jobs=n_jobs_rocka).fit(sbd_matrix,
                                                sample_weight=sample_weight_rocka)

        return model
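--------------------------------------------------------------------------------
A minimal fit on a hand-made 5x5 distance matrix (standing in for a real SBD
matrix from `shape_based_distance.SBD`); the values are invented so that KPIs
{0, 1, 2} and {3, 4} form two obvious groups.

import numpy as np
from rocka.model import Rocka

sbd_matrix = np.array([[0.0, 0.1, 0.1, 0.9, 0.9],
                       [0.1, 0.0, 0.1, 0.9, 0.9],
                       [0.1, 0.1, 0.0, 0.9, 0.9],
                       [0.9, 0.9, 0.9, 0.0, 0.1],
                       [0.9, 0.9, 0.9, 0.1, 0.0]])

rocka = Rocka(sbd_matrix, density_radius=0.2, minPts=2)
model = rocka.fit(sbd_matrix, density_radius=0.2, minPts=2)
print(model.labels_)   # [0 0 0 1 1]; noise points would be labelled -1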
--------------------------------------------------------------------------------
/rocka/preprocessing_rocka.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 27 15:07:20 2019

@author: PB
"""

import numpy as np

__all__ = ['linear_interpolation', 'standardize_obj']


def linear_interpolation(timestamp, arrays=None):
    """
    Complete `timestamp` such that the time interval is homogeneous.

    Zeros will be inserted into each array in `arrays` at missing points.
    Linear interpolation is then applied to the missing points (0) and the
    default points (-1).
    Also, an indicator array will be returned to indicate whether each
    point is missing or default.

    Args:
        timestamp (np.ndarray): 1-D int64 array, the timestamp values.
            It can be unsorted.
        arrays (Iterable[np.ndarray]): The 1-D arrays to be filled with zeros
            according to `timestamp`.

    Returns:
        np.ndarray: A 1-D int64 array, the completed timestamp.
        np.ndarray: A 1-D int32 array, indicating whether each point is
            missing or default.
        list[np.ndarray]: The arrays, with missing and default points filled
            by linear interpolation.
            (optional, returned only if `arrays` is specified)
    """
    timestamp = np.asarray(timestamp, np.int64)
    if len(timestamp.shape) != 1:
        raise ValueError('`timestamp` must be a 1-D array')

    has_arrays = arrays is not None
    arrays = [np.asarray(array) for array in (arrays or ())]
    for i, array in enumerate(arrays):
        if array.shape != timestamp.shape:
            raise ValueError('The shape of ``arrays[{}]`` does not agree with '
                             'the shape of `timestamp` ({} vs {})'.
                             format(i, array.shape, timestamp.shape))

    # sort the timestamp, and check the intervals
    src_index = np.argsort(timestamp)
    timestamp_sorted = timestamp[src_index]
    intervals = np.unique(np.diff(timestamp_sorted))
    interval = np.min(intervals)
    if interval == 0:
        raise ValueError('Duplicated values in `timestamp`')
    for itv in intervals:
        if itv % interval != 0:
            raise ValueError('Not all intervals in `timestamp` are multiples '
                             'of the minimum interval')

    # prepare the return arrays
    length = (timestamp_sorted[-1] - timestamp_sorted[0]) // interval + 1
    ret_timestamp = np.arange(timestamp_sorted[0],
                              timestamp_sorted[-1] + interval,
                              interval,
                              dtype=np.int64)
    ret_missing = np.ones([length], dtype=np.int32)
    ret_arrays = [np.zeros([length], dtype=array.dtype) for array in arrays]

    # copy values to the return arrays
    dst_index = np.asarray((timestamp_sorted - timestamp_sorted[0]) // interval,
                           dtype=np.int64)
    ret_missing[dst_index] = 0

    for ret_array, array in zip(ret_arrays, arrays):
        ret_array[dst_index] = array[src_index]

    # linear interpolation for the missing points and the default points (-1)
    neg_indexs = [np.argwhere(array == -1) for array in ret_arrays]
    miss_index = np.argwhere(ret_missing == 1)

    for neg_index, ret_array in zip(neg_indexs, ret_arrays):
        if len(neg_index) > 0:
            # interpolate over both the default (-1) points and the missing
            # points, using all remaining points as anchors
            neg = np.concatenate((neg_index.reshape(len(neg_index)),
                                  miss_index.reshape(len(miss_index))))
            ret_missing[neg] = 1
            pos_index = np.argwhere(ret_missing == 0)
            pos = pos_index.reshape(len(pos_index))
            pos_values = ret_array[pos]
            neg_values = np.interp(neg, pos, pos_values)
            ret_array[neg] = neg_values
        else:
            if len(miss_index) > 0:
                neg = miss_index.reshape(len(miss_index))
                pos_index = np.argwhere(ret_missing == 0)
                pos = pos_index.reshape(len(pos_index))
                pos_values = ret_array[pos]
                neg_values = np.interp(neg, pos, pos_values)
                ret_array[neg] = neg_values

    if has_arrays:
        return ret_timestamp, ret_missing, ret_arrays
    else:
        return ret_timestamp, ret_missing


def standardize_obj(values, mean=None, std=None, excludes=None):
    """
    Standardize a KPI time series to have zero mean and unit variance.

    Args:
        values (np.ndarray): 1-D `float32` array, the KPI observations.
        mean (float): If not :obj:`None`, will use this `mean` to standardize
            `values`. If :obj:`None`, `mean` will be computed from `values`.
            Note `mean` and `std` must be both :obj:`None` or not :obj:`None`.
            (default :obj:`None`)
        std (float): If not :obj:`None`, will use this `std` to standardize
            `values`. If :obj:`None`, `std` will be computed from `values`.
            Note `mean` and `std` must be both :obj:`None` or not :obj:`None`.
            (default :obj:`None`)
        excludes (np.ndarray): Optional, 1-D `int32` or `bool` array, the
            indicators of whether each point should be excluded for computing
            `mean` and `std`. Ignored if `mean` and `std` are not :obj:`None`.
            (default :obj:`None`)

    Returns:
        np.ndarray: The standardized `values`.
        float: The computed `mean` or the given `mean`.
        float: The computed `std` or the given `std`.
    """
    values = np.asarray(values, dtype=np.float32)
    if len(values.shape) != 1:
        raise ValueError('`values` must be a 1-D array')
    if (mean is None) != (std is None):
        raise ValueError('`mean` and `std` must be both None or not None')
    if excludes is not None:
        excludes = np.asarray(excludes, dtype=bool)
        if excludes.shape != values.shape:
            raise ValueError('The shape of `excludes` does not agree with '
                             'the shape of `values` ({} vs {})'.
                             format(excludes.shape, values.shape))

    if mean is None:
        if excludes is not None:
            val = values[np.logical_not(excludes)]
        else:
            val = values
        mean = val.mean()
        std = val.std()

    return (values - mean) / std, mean, std
--------------------------------------------------------------------------------
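A short demonstration of the two preprocessing helpers on a made-up KPI
fragment with one missing timestamp and one default (-1) reading; the numbers
are chosen only so the interpolation is easy to follow.

import numpy as np
from rocka.preprocessing_rocka import linear_interpolation, standardize_obj

timestamp = np.array([0, 60, 180, 240, 300])     # the point at t=120 is missing
values = np.array([1.0, 2.0, -1.0, 5.0, 6.0])    # -1 marks a default reading

ts, missing, (filled,) = linear_interpolation(timestamp, [values])
print(ts)        # [  0  60 120 180 240 300]
print(missing)   # [0 0 1 1 0 0] -> both the gap and the -1 point were interpolated
print(filled)    # [1. 2. 3. 4. 5. 6.]

standardized, mean, std = standardize_obj(filled, excludes=missing)
print(round(float(mean), 2), round(float(std), 2))   # computed from the non-excluded points only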