├── requirements.txt
├── README.md
├── rocka
│   ├── __init__.py
│   ├── baseline_extraction.py
│   ├── shape_based_distance.py
│   ├── density_esitmation.py
│   ├── model.py
│   └── preprocessing_rocka.py
├── LICENSE
├── .gitignore
└── setup.py
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy >= 1.12.1
scikit-learn >= 0.21.3
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Rocka
An unsupervised clustering algorithm named ROCKA.

Reference:
Li Z, Zhao Y, Liu R, et al. Robust and rapid clustering of KPIs for large-scale anomaly detection[C]//2018 IEEE/ACM 26th International Symposium on Quality of Service (IWQoS). IEEE, 2018: 1-10.
--------------------------------------------------------------------------------
/rocka/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 29 17:15:23 2019

@author: PB
"""

__version__ = '0.1'

from . import preprocessing_rocka
from . import baseline_extraction
from . import shape_based_distance
from . import density_esitmation
from . import model
from .model import Rocka

__all__ = ['Rocka']
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 polarbear1992

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""
Rocka
-----

Rocka is an unsupervised clustering algorithm (ROCKA) for grouping large
numbers of KPI time series by shape similarity.
"""
import ast
import codecs
import os
import re
import sys
from setuptools import setup, find_packages


_version_re = re.compile(r'__version__\s+=\s+(.*)')
_source_dir = os.path.split(os.path.abspath(__file__))[0]

if sys.version_info[0] == 2:
    def read_file(path):
        with open(path, 'rb') as f:
            return f.read()
else:
    def read_file(path):
        with codecs.open(path, 'rb', 'utf-8') as f:
            return f.read()

version = str(ast.literal_eval(_version_re.search(
    read_file(os.path.join(_source_dir, 'rocka/__init__.py'))).group(1)))

requirements_list = list(filter(
    lambda v: v and not v.startswith('#'),
    (s.strip() for s in read_file(
        os.path.join(_source_dir, 'requirements.txt')).split('\n'))
))
dependency_links = [s for s in requirements_list if s.startswith('git+')]
install_requires = [s for s in requirements_list if not s.startswith('git+')]


setup(
    name='Rocka',
    version=version,
    url='https://github.com/polarbear1992/Rocka/',
    license='MIT',
    author='Polar Bear',
    author_email='wqy919@yeah.net',
    description='an unsupervised clustering algorithm named ROCKA',
    long_description=__doc__,
    packages=find_packages('.', include=['rocka', 'rocka.*']),
    zip_safe=False,
    platforms='any',
    setup_requires=['setuptools'],
    install_requires=install_requires,
    dependency_links=dependency_links,
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: Software Development :: Libraries :: Python Modules'
    ]
)
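--------------------------------------------------------------------------------
How the modules below are meant to fit together (preprocess, extract baselines,
compute pairwise SBDs, estimate the density radius, then cluster with DBSCAN)
is easiest to see end to end. The sketch below is only an illustration pieced
together from the function signatures in this package: the synthetic KPIs, the
window size `w=5`, `minPts=4` and the radius-search thresholds are arbitrary
placeholder values, not recommendations from the ROCKA paper.

import numpy as np
from rocka.preprocessing_rocka import linear_interpolation, standardize_obj
from rocka.baseline_extraction import smoothing_extreme_values, extract_baseline
from rocka.shape_based_distance import SBD
from rocka.density_esitmation import density_radius
from rocka.model import Rocka

# synthetic stand-in for real monitoring data: 10 KPIs sampled every 60 s,
# five sine-shaped and five square-wave-shaped
rng = np.random.RandomState(0)
timestamp = np.arange(0, 60 * 1440, 60, dtype=np.int64)
t = np.linspace(0, 6 * np.pi, 1440)
raw_kpis = [np.sin(t) + 0.1 * rng.randn(1440) for _ in range(5)] + \
           [np.sign(np.sin(t)) + 0.1 * rng.randn(1440) for _ in range(5)]

baselines = []
for values in raw_kpis:
    _, _, (filled,) = linear_interpolation(timestamp, [values])  # fill gaps
    standardized, _, _ = standardize_obj(filled)                 # zero mean, unit variance
    smoothed = smoothing_extreme_values(standardized)            # remove extreme points
    (baseline, _, _), _ = extract_baseline(smoothed, w=5)        # moving-average baseline
    baselines.append(baseline)

sbd_matrix, knn_sbd = SBD(baselines, minPts=4)       # pairwise shape-based distances
eps = density_radius(knn_sbd, len_thresh=2, max_radius=1.0,
                     slope_thresh=0.1, slope_diff_thresh=0.1)
model = Rocka(sbd_matrix, eps, minPts=4).fit(sbd_matrix, eps, minPts=4)
print(model.labels_)   # ideally two clusters: the sine KPIs and the square-wave KPIs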
--------------------------------------------------------------------------------
/rocka/baseline_extraction.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 28 10:04:57 2019

@author: PB
"""
import numpy as np

from .preprocessing_rocka import standardize_obj

__all__ = ['smoothing_extreme_values', 'extract_baseline']


def smoothing_extreme_values(values):
    """
    In general, the ratio of anomaly points in a time series is less than 5%[1].
    As such, simply remove the top 5% of the data which deviate the most from
    the mean value, and use linear interpolation to fill them.

    Args:
        values (np.ndarray): a time series which has been preprocessed by
            linear interpolation and standardization (zero mean and unit
            variance).

    Returns:
        np.ndarray: the smoothed `values`.
    """
    values = np.asarray(values, np.float32)
    if len(values.shape) != 1:
        raise ValueError('`values` must be a 1-D array')

    # deviation of each point from the (zero) mean
    values_deviation = np.abs(values)

    # the portion of points treated as abnormal
    abnormal_portion = 0.05

    # points whose deviation falls in the top 5% are treated as abnormal and
    # replaced by linear interpolation over the remaining (normal) points
    deviation_thresh = np.percentile(values_deviation, 100 * (1 - abnormal_portion))
    abnormal_index = np.argwhere(values_deviation >= deviation_thresh)
    abnormal = abnormal_index.reshape(len(abnormal_index))
    normal_index = np.argwhere(values_deviation < deviation_thresh)
    normal = normal_index.reshape(len(normal_index))
    normal_values = values[normal]
    abnormal_values = np.interp(abnormal, normal, normal_values)
    values[abnormal] = abnormal_values

    return values


def extract_baseline(values, w):
    """
    A simple but effective method for removing noise is to apply a moving
    average with a small sliding window (`w`) on the KPI (`values`), separating
    its curve into two parts: baseline and residuals.
    For a KPI T with a sliding window of length `w` and stride 1, each point
    x(t) is mapped to a point on the baseline, denoted x(t)*, which is the
    mean of the vector (x(t-w+1), ..., x(t)).
    The difference between x(t) and x(t)* is called a residual.

    Args:
        values (np.ndarray): time series after preprocessing and smoothing.
        w (np.int32): length of the sliding window.

    Returns:
        tuple(tuple(np.ndarray, np.float32, np.float32), np.ndarray):
            tuple: the standardized baseline of the raw data, together with
                the mean and std used for standardization (the output of
                `standardize_obj`);
            np.ndarray: the residuals between the raw data and the baseline.
    """
    # moving average to get the baseline
    baseline = np.convolve(values, np.ones((w,)) / w, mode='valid')
    # residuals: the difference between the raw series and the baseline
    residuals = values[w-1:] - baseline

    return standardize_obj(baseline), residuals
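--------------------------------------------------------------------------------
A quick way to see what the two helpers above do is to run them on a synthetic
series; the sine wave, the injected spike and the window size below are
arbitrary choices made only for illustration.

import numpy as np
from rocka.baseline_extraction import smoothing_extreme_values, extract_baseline

rng = np.random.RandomState(0)
series = np.sin(np.linspace(0, 20 * np.pi, 1440)) + 0.1 * rng.randn(1440)
series[100] += 8.0                                  # inject an obvious anomaly
series = (series - series.mean()) / series.std()    # standardize first

smoothed = smoothing_extreme_values(series)
(baseline, mean, std), residuals = extract_baseline(smoothed, w=10)

print(abs(smoothed[100]) < abs(series[100]))   # True: the spike was interpolated away
print(baseline.shape, residuals.shape)         # both have len(series) - w + 1 points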
--------------------------------------------------------------------------------
/rocka/shape_based_distance.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 28 15:28:32 2019

@author: PB
"""

import numpy as np

__all__ = ['sbd_ele', 'SBD']


def sbd_ele(values1, values2):
    """
    Given two time series `values1` and `values2`, cross-correlation slides
    `values2` against `values1` and computes the inner product for each shift
    `s`, where `s` ∈ [-len(values2) + 1, len(values1) - 1].
    SBD is based on this cross-correlation. SBD ranges from 0 to 2, where 0
    means the two time series have exactly the same shape. A smaller SBD means
    higher shape similarity.

    Args:
        values1 (np.ndarray): time series 1
        values2 (np.ndarray): time series 2

    Returns:
        np.float32: the SBD between `values1` and `values2`
    """
    # the 2-norms used for normalization
    l2_values1 = np.linalg.norm(values1)
    l2_values2 = np.linalg.norm(values2)
    # cross-correlation for every shift `s` (np.correlate, unlike np.convolve,
    # does not reverse the second sequence)
    cross_corre = np.correlate(values1, values2, mode='full')

    # SBD = 1 - NCC; clip tiny negative values caused by floating-point error
    return max(0.0, 1 - np.max(cross_corre) / (l2_values1 * l2_values2))


def SBD(values_list, minPts=4):
    """
    Calculate the shape based distance (SBD) between any two time series as a
    similarity measure. SBD is used by DBSCAN for clustering.
    The main idea of DBSCAN is to find cores in dense regions, and then expand
    the cores by transitivity of similarity to form clusters.

    Args:
        values_list (List[np.ndarray]): a list of all time series; the lengths
            of different time series may differ.
        minPts (np.int32): the core `p` in DBSCAN is defined as an object that
            has at least `minPts` objects within a distance of ϵ from it
            (excluding `p`). The default value of `minPts` is 4.

    Returns:
        tuple(np.ndarray, np.ndarray):
            np.ndarray: the pairwise SBD matrix of shape
                (len(values_list), len(values_list));
            np.ndarray: for each time series, the SBD to its minPts-th nearest
                neighbor (excluding itself).
    """
    if len(values_list) <= minPts:
        raise ValueError('`values_list` must contain more than %d time series'
                         % minPts)
    if len(values_list[0].shape) != 1:
        raise ValueError('`values` must be a 1-D array')
    if (type(minPts) is not int) or (minPts < 1):
        raise ValueError('`minPts` must be a positive integer')

    # calculate the SBD between every pair of time series
    sbd_matrix = np.zeros((len(values_list), len(values_list)))
    for i in range(len(values_list)):
        for j in range(i, len(values_list)):
            sbd_matrix[i][j] = sbd_ele(values_list[i], values_list[j])
            sbd_matrix[j][i] = sbd_matrix[i][j]

    # for each time series, the minPts-th nearest SBD (excluding itself)
    ret_sbd = np.zeros(len(values_list))
    for i in range(len(values_list)):
        src_index = np.argsort(sbd_matrix[i])
        ret_sbd[i] = sbd_matrix[i][src_index][minPts]

    return sbd_matrix, ret_sbd
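--------------------------------------------------------------------------------
A small sanity check of the distance defined above: a series compared with an
amplitude-scaled copy of itself should give an SBD close to zero, while an
unrelated noise series should sit noticeably farther away. The data here is
made up for illustration only.

import numpy as np
from rocka.shape_based_distance import sbd_ele, SBD

rng = np.random.RandomState(1)
base = np.sin(np.linspace(0, 8 * np.pi, 500))
same_shape = 3.0 * base                      # amplitude scaling keeps the shape
noise = rng.randn(500)

print(round(sbd_ele(base, same_shape), 4))   # ~0.0
print(round(sbd_ele(base, noise), 4))        # noticeably larger

series_list = [base, same_shape, noise, rng.randn(500), rng.randn(500)]
sbd_matrix, knn_sbd = SBD(series_list, minPts=4)
print(sbd_matrix.shape, knn_sbd.shape)       # (5, 5) and (5,)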
--------------------------------------------------------------------------------
/rocka/density_esitmation.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 28 18:05:24 2019

@author: PB
"""

import numpy as np

__all__ = ['density_radius']


def density_radius(sbd_arr, len_thresh, max_radius, slope_thresh, slope_diff_thresh):
    """
    Given the K-Nearest-Neighbor SBD of each sample, calculate the density
    radius (the DBSCAN `eps`) for clustering.

    Args:
        sbd_arr (np.ndarray): array of the K-Nearest-Neighbor SBD of each
            sample.
        len_thresh (np.int32): the minimum length of the SBD segment searched
            for a candidate radius.
        max_radius (np.float32): candidate radii are no larger than
            `max_radius`.
        slope_thresh (np.float32): the slopes on the left and right of a
            candidate point must be no larger than `slope_thresh`.
        slope_diff_thresh (np.float32): the difference between the left slope
            and the right slope of a candidate point must be no larger than
            `slope_diff_thresh`.

    Returns:
        np.float32: the final density radius, i.e. the largest value among all
            candidate radii.
    """
    # sort the KNN-SBDs in descending order
    src_index = np.argsort(sbd_arr)
    sbd_arr_sorted = sbd_arr[src_index][::-1]
    # only search among the SBDs that are no larger than `max_radius`
    candidates_index = np.argwhere(sbd_arr_sorted <= max_radius)
    if len(candidates_index) == 0:
        raise ValueError('No SBD value is smaller than `max_radius`.')
    start = np.min(candidates_index)
    end = len(sbd_arr_sorted)

    def find_candidate_radius(sbd_arr_sorted, start, end, candidates):
        """
        Given the reverse-sorted K-Nearest-Neighbor SBDs, find candidate radii
        with a divide and conquer strategy: locate the flattest "knee" point
        in [`start`, `end`), record it, then recurse on both sides.

        Args:
            sbd_arr_sorted (np.ndarray): reverse-sorted array of the
                K-Nearest-Neighbor SBD of each sample.
            start (np.int32): the begin index of the target SBDs.
            end (np.int32): the end index (exclusive) of the target SBDs.
            candidates (list): the indexes of all candidate radii found so
                far; new candidates are appended in place.
        """
        if end - start <= len_thresh:
            return
        radius, diff = -1, 2
        for i in range(start + 1, end - 1):
            leftslope = (sbd_arr_sorted[i] - sbd_arr_sorted[start]) / (i - start)
            rightslope = (sbd_arr_sorted[end-1] - sbd_arr_sorted[i]) / (end - 1 - i)

            if leftslope > slope_thresh or rightslope > slope_thresh:
                continue
            if np.abs(leftslope - rightslope) < diff:
                diff = np.abs(leftslope - rightslope)
                radius = i
        if radius == -1:
            # no point in this segment satisfies the slope constraints
            return
        if diff < slope_diff_thresh:
            candidates.append(radius)
        find_candidate_radius(sbd_arr_sorted, start, radius, candidates)
        find_candidate_radius(sbd_arr_sorted, radius + 1, end, candidates)

    candidates = []
    find_candidate_radius(sbd_arr_sorted, start, end, candidates)
    if candidates:
        # the final density radius is the largest SBD among all candidates
        return np.max(sbd_arr_sorted[candidates])
    else:
        raise ValueError('There is no qualified density radius.')
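--------------------------------------------------------------------------------
A toy call showing the expected inputs: a KNN-SBD array (normally the second
output of `SBD`) plus the four search thresholds. Both the curve and the
threshold values below are arbitrary and would need tuning on real KPI data.

import numpy as np
from rocka.density_esitmation import density_radius

# a descending-looking KNN-SBD distribution: a few far-away series, many close ones
knn_sbd = np.concatenate([np.linspace(0.9, 0.1, 20), np.full(80, 0.05)])

eps = density_radius(knn_sbd,
                     len_thresh=5,            # stop splitting segments shorter than this
                     max_radius=0.5,          # ignore SBDs above 0.5 during the search
                     slope_thresh=0.1,        # per-side slope limit for a candidate knee
                     slope_diff_thresh=0.05)  # max allowed left/right slope difference
print(eps)   # the largest SBD among all candidate knee points becomes the DBSCAN eps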
--------------------------------------------------------------------------------
/rocka/model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 29 15:26:44 2019

@author: icebearwang
"""

from sklearn.cluster import DBSCAN


class Rocka(object):
    """
    DBSCAN-based clustering of KPI time series, using a precomputed
    shape-based-distance (SBD) matrix and an estimated density radius.
    """
    def __init__(self, sbd_matrix, density_radius, minPts, metric_rocka='precomputed',
                 metric_params_rocka=None, algorithm_rocka='auto', leaf_size_rocka=30,
                 p_rocka=2, sample_weight_rocka=None, n_jobs_rocka=None):
        self.sbd_matrix = sbd_matrix
        self.density_radius = density_radius
        self.minPts = minPts
        self.metric_rocka = metric_rocka
        self.metric_params_rocka = metric_params_rocka
        self.algorithm_rocka = algorithm_rocka
        self.leaf_size_rocka = leaf_size_rocka
        self.p_rocka = p_rocka
        self.sample_weight_rocka = sample_weight_rocka
        self.n_jobs_rocka = n_jobs_rocka

    def fit(self, sbd_matrix, density_radius, minPts, metric_rocka='precomputed',
            metric_params_rocka=None, algorithm_rocka='auto', leaf_size_rocka=30,
            p_rocka=2, sample_weight_rocka=None, n_jobs_rocka=None):
        """
        Perform DBSCAN clustering from a vector array or distance matrix.

        Parameters
        ----------
        sbd_matrix : np.ndarray or sparse (CSR) matrix of shape
            (n_samples, n_features), or array of shape (n_samples, n_samples)
            A feature array, or array of distances between samples if
            ``metric='precomputed'``.

        density_radius : np.float32
            The maximum distance between two samples for one to be considered
            as in the neighborhood of the other. This is not a maximum bound
            on the distances of points within a cluster. This is the most
            important DBSCAN parameter to choose appropriately for your data
            set and distance function.

        minPts : np.int32
            The number of samples (or total weight) in a neighborhood for a
            point to be considered as a core point. This includes the point
            itself.

        metric_rocka : string, or callable
            The metric to use when calculating distance between instances in a
            feature array. If metric is a string or callable, it must be one of
            the options allowed by :func:`sklearn.metrics.pairwise_distances`
            for its metric parameter.
            If metric is "precomputed", X is assumed to be a distance matrix
            and must be square. X may be a sparse matrix, in which case only
            "nonzero" elements may be considered neighbors for DBSCAN.

        metric_params_rocka : dict, optional
            Additional keyword arguments for the metric function.

        algorithm_rocka : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
            The algorithm to be used by the NearestNeighbors module
            to compute pointwise distances and find nearest neighbors.
            See NearestNeighbors module documentation for details.

        leaf_size_rocka : np.int32, optional (default = 30)
            Leaf size passed to BallTree or cKDTree. This can affect the speed
            of the construction and query, as well as the memory required
            to store the tree. The optimal value depends
            on the nature of the problem.

        p_rocka : np.float32, optional
            The power of the Minkowski metric to be used to calculate distance
            between points.

        sample_weight_rocka : np.ndarray, shape (n_samples,), optional
            Weight of each sample, such that a sample with a weight of at
            least ``min_samples`` is by itself a core sample; a sample with
            negative weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.

        n_jobs_rocka : np.int32 or None, optional (default=None)
            The number of parallel jobs to run for neighbors search.
            ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
            context. ``-1`` means using all processors.

        Returns
        -------
        model : sklearn.cluster.DBSCAN
            The fitted DBSCAN estimator. Its attributes include:

            core_sample_indices_ : array, shape = [n_core_samples]
                Indices of core samples.

            components_ : array, shape = [n_core_samples, n_features]
                Copy of each core sample found by training.

            labels_ : array, shape = [n_samples]
                Cluster labels for each point in the dataset given to fit().
                Noisy samples are given the label -1.

        See also
        --------
        DBSCAN
            An estimator interface for this clustering algorithm.
        OPTICS
            A similar estimator interface clustering at multiple values of
            eps. Its implementation is optimized for memory usage.
        """
        # `sample_weight` is a fit-time argument of sklearn's DBSCAN, not a
        # constructor argument
        model = DBSCAN(eps=density_radius, min_samples=minPts, metric=metric_rocka,
                       metric_params=metric_params_rocka, algorithm=algorithm_rocka,
                       leaf_size=leaf_size_rocka, p=p_rocka,
                       n_jobs=n_jobs_rocka).fit(sbd_matrix,
                                                sample_weight=sample_weight_rocka)

        return model
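--------------------------------------------------------------------------------
A minimal fit on a hand-made 5x5 distance matrix (standing in for a real SBD
matrix from `shape_based_distance.SBD`); the values are invented so that KPIs
{0, 1, 2} and {3, 4} form two obvious groups.

import numpy as np
from rocka.model import Rocka

sbd_matrix = np.array([[0.0, 0.1, 0.1, 0.9, 0.9],
                       [0.1, 0.0, 0.1, 0.9, 0.9],
                       [0.1, 0.1, 0.0, 0.9, 0.9],
                       [0.9, 0.9, 0.9, 0.0, 0.1],
                       [0.9, 0.9, 0.9, 0.1, 0.0]])

rocka = Rocka(sbd_matrix, density_radius=0.2, minPts=2)
model = rocka.fit(sbd_matrix, density_radius=0.2, minPts=2)
print(model.labels_)   # [0 0 0 1 1]; noise points would be labelled -1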
--------------------------------------------------------------------------------
/rocka/preprocessing_rocka.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 27 15:07:20 2019

@author: PB
"""

import numpy as np

__all__ = ['linear_interpolation', 'standardize_obj']


def linear_interpolation(timestamp, arrays=None):
    """
    Complete `timestamp` such that the time interval is homogeneous.

    Zeros will be inserted into each array in `arrays` at missing points.
    Linear interpolation is then applied to the missing points (0) and the
    default points (-1).
    Also, an indicator array will be returned to indicate whether each
    point is missing or default.

    Args:
        timestamp (np.ndarray): 1-D int64 array, the timestamp values.
            It can be unsorted.
        arrays (Iterable[np.ndarray]): The 1-D arrays to be filled with zeros
            according to `timestamp`.

    Returns:
        np.ndarray: A 1-D int64 array, the completed timestamp.
        np.ndarray: A 1-D int32 array, indicating whether each point is
            missing or default.
        list[np.ndarray]: The arrays, with missing and default points filled
            by linear interpolation.
            (optional, returned only if `arrays` is specified)
    """
    timestamp = np.asarray(timestamp, np.int64)
    if len(timestamp.shape) != 1:
        raise ValueError('`timestamp` must be a 1-D array')

    has_arrays = arrays is not None
    arrays = [np.asarray(array) for array in (arrays or ())]
    for i, array in enumerate(arrays):
        if array.shape != timestamp.shape:
            raise ValueError('The shape of ``arrays[{}]`` does not agree with '
                             'the shape of `timestamp` ({} vs {})'.
                             format(i, array.shape, timestamp.shape))

    # sort the timestamp, and check the intervals
    src_index = np.argsort(timestamp)
    timestamp_sorted = timestamp[src_index]
    intervals = np.unique(np.diff(timestamp_sorted))
    interval = np.min(intervals)
    if interval == 0:
        raise ValueError('Duplicated values in `timestamp`')
    for itv in intervals:
        if itv % interval != 0:
            raise ValueError('Not all intervals in `timestamp` are multiples '
                             'of the minimum interval')

    # prepare the return arrays
    length = (timestamp_sorted[-1] - timestamp_sorted[0]) // interval + 1
    ret_timestamp = np.arange(timestamp_sorted[0],
                              timestamp_sorted[-1] + interval,
                              interval,
                              dtype=np.int64)
    ret_missing = np.ones([length], dtype=np.int32)
    ret_arrays = [np.zeros([length], dtype=array.dtype) for array in arrays]

    # copy values to the return arrays
    dst_index = np.asarray((timestamp_sorted - timestamp_sorted[0]) // interval,
                           dtype=np.int64)
    ret_missing[dst_index] = 0

    for ret_array, array in zip(ret_arrays, arrays):
        ret_array[dst_index] = array[src_index]

    # linear interpolation for the missing points and the default points (-1)
    neg_indexs = [np.argwhere(array == -1) for array in ret_arrays]
    miss_index = np.argwhere(ret_missing == 1)

    for neg_index, ret_array in zip(neg_indexs, ret_arrays):
        if len(neg_index) > 0:
            # interpolate over both the default (-1) points and the missing
            # points, using all remaining points as anchors
            neg = np.concatenate((neg_index.reshape(len(neg_index)),
                                  miss_index.reshape(len(miss_index))))
            ret_missing[neg] = 1
            pos_index = np.argwhere(ret_missing == 0)
            pos = pos_index.reshape(len(pos_index))
            pos_values = ret_array[pos]
            neg_values = np.interp(neg, pos, pos_values)
            ret_array[neg] = neg_values
        else:
            if len(miss_index) > 0:
                neg = miss_index.reshape(len(miss_index))
                pos_index = np.argwhere(ret_missing == 0)
                pos = pos_index.reshape(len(pos_index))
                pos_values = ret_array[pos]
                neg_values = np.interp(neg, pos, pos_values)
                ret_array[neg] = neg_values

    if has_arrays:
        return ret_timestamp, ret_missing, ret_arrays
    else:
        return ret_timestamp, ret_missing


def standardize_obj(values, mean=None, std=None, excludes=None):
    """
    Standardize a KPI time series to have zero mean and unit variance.

    Args:
        values (np.ndarray): 1-D `float32` array, the KPI observations.
        mean (float): If not :obj:`None`, will use this `mean` to standardize
            `values`. If :obj:`None`, `mean` will be computed from `values`.
            Note `mean` and `std` must be both :obj:`None` or not :obj:`None`.
            (default :obj:`None`)
        std (float): If not :obj:`None`, will use this `std` to standardize
            `values`. If :obj:`None`, `std` will be computed from `values`.
            Note `mean` and `std` must be both :obj:`None` or not :obj:`None`.
            (default :obj:`None`)
        excludes (np.ndarray): Optional, 1-D `int32` or `bool` array, the
            indicators of whether each point should be excluded for computing
            `mean` and `std`. Ignored if `mean` and `std` are not :obj:`None`.
            (default :obj:`None`)

    Returns:
        np.ndarray: The standardized `values`.
        float: The computed `mean` or the given `mean`.
        float: The computed `std` or the given `std`.
    """
    values = np.asarray(values, dtype=np.float32)
    if len(values.shape) != 1:
        raise ValueError('`values` must be a 1-D array')
    if (mean is None) != (std is None):
        raise ValueError('`mean` and `std` must be both None or not None')
    if excludes is not None:
        excludes = np.asarray(excludes, dtype=bool)
        if excludes.shape != values.shape:
            raise ValueError('The shape of `excludes` does not agree with '
                             'the shape of `values` ({} vs {})'.
                             format(excludes.shape, values.shape))

    if mean is None:
        if excludes is not None:
            val = values[np.logical_not(excludes)]
        else:
            val = values
        mean = val.mean()
        std = val.std()

    return (values - mean) / std, mean, std
--------------------------------------------------------------------------------
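A short demonstration of the two preprocessing helpers on a made-up KPI
fragment with one missing timestamp and one default (-1) reading; the numbers
are chosen only so the interpolation is easy to follow.

import numpy as np
from rocka.preprocessing_rocka import linear_interpolation, standardize_obj

timestamp = np.array([0, 60, 180, 240, 300])     # the point at t=120 is missing
values = np.array([1.0, 2.0, -1.0, 5.0, 6.0])    # -1 marks a default reading

ts, missing, (filled,) = linear_interpolation(timestamp, [values])
print(ts)        # [  0  60 120 180 240 300]
print(missing)   # [0 0 1 1 0 0] -> both the gap and the -1 point were interpolated
print(filled)    # [1. 2. 3. 4. 5. 6.]

standardized, mean, std = standardize_obj(filled, excludes=missing)
print(round(float(mean), 2), round(float(std), 2))   # computed from the non-excluded points only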