├── lib
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── ceph.cpython-36.pyc
│   │   ├── model.cpython-36.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   └── prometheus.cpython-36.pyc
│   ├── model.py
│   ├── ceph.py
│   └── prometheus.py
├── .gitignore
├── .zuul.yaml
├── requirements.txt
├── Dockerfile
├── Makefile
├── ceph.py
├── train-prophet-deployment-template.yaml
├── prometheus.py
├── README.md
├── app.py
└── model.py

--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.pyo
3 | *_old.py
4 | __pycache__/
5 | *.json
6 |
--------------------------------------------------------------------------------
/lib/__pycache__/ceph.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AICoE/prometheus-anomaly-detector-legacy/HEAD/lib/__pycache__/ceph.cpython-36.pyc
--------------------------------------------------------------------------------
/lib/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AICoE/prometheus-anomaly-detector-legacy/HEAD/lib/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/.zuul.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | - project:
3 |     check:
4 |       jobs:
5 |         - "thoth-coala"
6 |     gate:
7 |       jobs:
8 |         - "thoth-coala"
9 |
--------------------------------------------------------------------------------
/lib/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AICoE/prometheus-anomaly-detector-legacy/HEAD/lib/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/lib/__pycache__/prometheus.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AICoE/prometheus-anomaly-detector-legacy/HEAD/lib/__pycache__/prometheus.cpython-36.pyc
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | fbprophet
3 | pandas
4 | boto3
5 | matplotlib
6 | flask
7 | apscheduler
8 | prometheus_client
9 | sortedcontainers
10 | scipy
11 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM docker.io/centos/python-36-centos7:latest
2 |
3 |
4 | ADD requirements.txt /
5 | RUN pip install -r /requirements.txt
6 |
7 | ADD app.py /
8 | ADD prometheus.py /
9 | ADD model.py /
10 | ADD ceph.py /
11 | ADD lib /lib
12 |
13 |
14 | CMD [ "python", "/app.py"]
15 |
--------------------------------------------------------------------------------
/lib/model.py:
--------------------------------------------------------------------------------
1 | import pandas
2 | import json
3 |
4 | def get_df_from_json(metric):
5 |     # metric_dict = {}
6 |     metric_dict_pd = {}
7 |     for row in metric:
8 |         # metric_dict[str(row['metric'])] = metric_dict.get(str(row['metric']),[]) + (row['values'])
9 |         metric_metadata = str(row['metric'])
10 |         if
metric_metadata not in metric_dict_pd: 11 | metric_dict_pd[metric_metadata] = pandas.DataFrame(columns=['timestamp', 'value']) 12 | pass 13 | else: 14 | # for value in (row['values']): 15 | # print(value) 16 | temp_df = pandas.DataFrame(row['values'], columns=['timestamp', 'value']) 17 | # print(temp_df.head()) 18 | metric_dict_pd[metric_metadata] = pandas.concat([metric_dict_pd[metric_metadata], temp_df]) 19 | del temp_df 20 | pass 21 | pass 22 | metric_dict_pd[metric_metadata].set_index('timestamp') 23 | # print(metric_dict_pd[metric_metadata]) 24 | # metric_dict_pd[metric_metadata]['timestamp'] = pandas.to_datetime(metric_dict_pd[metric_metadata]['timestamp'], unit='s') 25 | return metric_dict_pd 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Required Variables 2 | bearer_token= 3 | prometheus_url= 4 | 5 | block_storage_access_key= 6 | block_storage_secret_key= 7 | block_storage_bucket_name= 8 | block_storage_endpoint_url= 9 | 10 | # Optional Variables 11 | oc_app_name=train-prom-dh-prod 12 | docker_app_name=train-prometheus 13 | 14 | docker_build: 15 | docker build -t ${docker_app_name} . 16 | 17 | docker_test: 18 | docker run ${docker_app_name} 19 | 20 | docker_run: 21 | docker run -ti --rm \ 22 | --env "BEARER_TOKEN=${bearer_token}" \ 23 | --env "URL=${prometheus_url}" \ 24 | --env BOTO_ACCESS_KEY="${block_storage_access_key}" \ 25 | --env BOTO_SECRET_KEY="${block_storage_secret_key}" \ 26 | --env BOTO_OBJECT_STORE="${block_storage_bucket_name}" \ 27 | --env BOTO_STORE_ENDPOINT="${block_storage_endpoint_url}" \ 28 | ${docker_app_name}:latest 29 | 30 | oc_deploy: 31 | oc new-app --file=./train-prophet-deployment-template.yaml --param APPLICATION_NAME="${oc_app_name}" \ 32 | --param URL="${prometheus_url}" \ 33 | --param BEARER_TOKEN="${bearer_token}" \ 34 | --param BOTO_ACCESS_KEY="${block_storage_access_key}" \ 35 | --param BOTO_SECRET_KEY="${block_storage_secret_key}" \ 36 | --param BOTO_OBJECT_STORE="${block_storage_bucket_name}" \ 37 | --param BOTO_STORE_ENDPOINT="${block_storage_endpoint_url}" 38 | 39 | oc_delete_all: 40 | oc delete all -l app=${oc_app_name} 41 | 42 | run_model: 43 | BEARER_TOKEN=${bearer_token} \ 44 | URL=${prometheus_url} \ 45 | BOTO_ACCESS_KEY=${block_storage_access_key} \ 46 | BOTO_SECRET_KEY=${block_storage_secret_key} \ 47 | BOTO_OBJECT_STORE=${block_storage_bucket_name} \ 48 | BOTO_STORE_ENDPOINT=${block_storage_endpoint_url} \ 49 | python3 ../train-prometheus-prod/train-prometheus/app.py 50 | -------------------------------------------------------------------------------- /lib/ceph.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import bz2 3 | import os 4 | import pickle 5 | import botocore 6 | 7 | class CephConnect: 8 | def __init__(self, access_key = None, secret_key = None, object_store = None, object_store_endpoint = None): 9 | self.boto_settings = { 10 | 'access_key': os.getenv('BOTO_ACCESS_KEY', access_key), 11 | 'secret_key': os.getenv('BOTO_SECRET_KEY', secret_key), 12 | 'object_store': os.getenv('BOTO_OBJECT_STORE', object_store), 13 | 'object_store_endpoint': os.getenv('BOTO_STORE_ENDPOINT', object_store_endpoint) 14 | } 15 | 16 | def store_data(self, name, values, object_path = None): 17 | ''' 18 | Function to store predictions to ceph 19 | ''' 20 | if not values: 21 | return "No values for {}".format(name) 22 | # Create a session with CEPH (or any 
black storage) storage with the stored credentials 23 | session = boto3.Session( 24 | aws_access_key_id=self.boto_settings['access_key'], 25 | aws_secret_access_key=self.boto_settings['secret_key'] 26 | ) 27 | 28 | s3 = session.resource('s3', 29 | endpoint_url=self.boto_settings['object_store_endpoint'], 30 | verify=False) 31 | # prometheus-openshift-devops-monitor.a3c1.starter-us-west-1.openshiftapps.com/container_cpu_usage_percent_by_host/201807040259.json.bz2 32 | if not object_path: 33 | object_path = str(name) 34 | pass 35 | object_path = object_path + ".bz2" 36 | try: 37 | payload = bz2.compress(values.encode('utf-8')) 38 | 39 | except AttributeError: 40 | payload = bz2.compress(values) 41 | rv = s3.meta.client.put_object(Body=payload, 42 | Bucket=self.boto_settings['object_store'], 43 | Key=object_path) 44 | if rv['ResponseMetadata']['HTTPStatusCode'] == 200: 45 | return object_path 46 | else: 47 | return str(rv) 48 | 49 | def get_model_dict(self, model_storage_path): 50 | session = boto3.Session( 51 | aws_access_key_id=self.boto_settings['access_key'], 52 | aws_secret_access_key=self.boto_settings['secret_key'] 53 | ) 54 | 55 | s3 = session.resource('s3', 56 | endpoint_url=self.boto_settings['object_store_endpoint'], 57 | verify=False) 58 | # try to get model from ceph 59 | try: 60 | model_storage_path = model_storage_path + ".bz2" 61 | print("receiveing Object from: \n {}".format(model_storage_path)) 62 | 63 | received_object = s3.Object(self.boto_settings['object_store'], model_storage_path).get()['Body'].read() 64 | # print(type(received_object)) 65 | model_dict = pickle.loads(bz2.decompress(received_object)) 66 | # print(model_dict.keys()) 67 | except botocore.exceptions.ClientError as exc: 68 | if exc.response['Error']['Code'] in ('404', 'NoSuchKey'): 69 | # if no model in ceph, return an empty model dictionary 70 | print("Stored Model not found") 71 | model_dict = {} 72 | return model_dict 73 | -------------------------------------------------------------------------------- /ceph.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import bz2 3 | import os 4 | import pickle 5 | import botocore 6 | 7 | class CephConnect: 8 | def __init__(self, access_key = None, secret_key = None, object_store = None, object_store_endpoint = None): 9 | self.boto_settings = { 10 | 'access_key': os.getenv('BOTO_ACCESS_KEY', access_key), 11 | 'secret_key': os.getenv('BOTO_SECRET_KEY', secret_key), 12 | 'object_store': os.getenv('BOTO_OBJECT_STORE', object_store), 13 | 'object_store_endpoint': os.getenv('BOTO_STORE_ENDPOINT', object_store_endpoint) 14 | } 15 | 16 | def store_data(self, name, values, object_path = None): 17 | ''' 18 | Function to store predictions to ceph 19 | ''' 20 | if not values: 21 | return "No values for {}".format(name) 22 | # Create a session with CEPH (or any black storage) storage with the stored credentials 23 | session = boto3.Session( 24 | aws_access_key_id=self.boto_settings['access_key'], 25 | aws_secret_access_key=self.boto_settings['secret_key'] 26 | ) 27 | 28 | s3 = session.resource('s3', 29 | endpoint_url=self.boto_settings['object_store_endpoint'], 30 | verify=False) 31 | # prometheus-openshift-devops-monitor.a3c1.starter-us-west-1.openshiftapps.com/container_cpu_usage_percent_by_host/201807040259.json.bz2 32 | if not object_path: 33 | object_path = str(name) 34 | pass 35 | object_path = object_path + ".bz2" 36 | try: 37 | payload = bz2.compress(values.encode('utf-8')) 38 | 39 | except AttributeError: 40 | 
payload = bz2.compress(values) 41 | rv = s3.meta.client.put_object(Body=payload, 42 | Bucket=self.boto_settings['object_store'], 43 | Key=object_path) 44 | if rv['ResponseMetadata']['HTTPStatusCode'] == 200: 45 | return object_path 46 | else: 47 | return str(rv) 48 | 49 | def get_model_dict(self, model_storage_path): 50 | session = boto3.Session( 51 | aws_access_key_id=self.boto_settings['access_key'], 52 | aws_secret_access_key=self.boto_settings['secret_key'] 53 | ) 54 | 55 | s3 = session.resource('s3', 56 | endpoint_url=self.boto_settings['object_store_endpoint'], 57 | verify=False) 58 | # try to get model from ceph 59 | try: 60 | model_storage_path = model_storage_path + ".bz2" 61 | print("receiveing Object from: \n {}".format(model_storage_path)) 62 | 63 | received_object = s3.Object(self.boto_settings['object_store'], model_storage_path).get()['Body'].read() 64 | # print(type(received_object)) 65 | model_dict = pickle.loads(bz2.decompress(received_object)) 66 | # print(model_dict.keys()) 67 | except botocore.exceptions.ClientError as exc: 68 | if exc.response['Error']['Code'] in ('404', 'NoSuchKey'): 69 | # if no model in ceph, return an empty model dictionary 70 | print("Stored Model not found") 71 | model_dict = {} 72 | return model_dict 73 | 74 | def get_latest_df_dict(self, data_path=None): 75 | session = boto3.Session( 76 | aws_access_key_id=self.boto_settings['access_key'], 77 | aws_secret_access_key=self.boto_settings['secret_key'] 78 | ) 79 | 80 | s3 = session.resource('s3', 81 | endpoint_url=self.boto_settings['object_store_endpoint'], 82 | verify=False) 83 | s3_bucket = s3.Bucket(self.boto_settings['object_store']) 84 | 85 | try: 86 | object_list = [obj for obj in s3_bucket.objects.filter(Prefix=str(data_path))] 87 | latest_object = object_list[0] 88 | for obj in object_list: 89 | if int(obj.key[-16:-4]) > int(latest_object.key[-16:-4]): 90 | latest_object = obj 91 | received_data = latest_object.get()['Body'].read() 92 | data_dict = pickle.loads(bz2.decompress(received_data)) 93 | except botocore.exceptions.ClientError as exc: 94 | if exc.response['Error']['Code'] in ('404', 'NoSuchKey'): 95 | # if no data found in ceph, return an empty model dictionary 96 | print("Stored Data not found") 97 | data_dict = {} 98 | return data_dict 99 | -------------------------------------------------------------------------------- /train-prophet-deployment-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Template 3 | 4 | labels: 5 | application: train-prometheus 6 | 7 | metadata: 8 | name: train-prometheus-deployment-template 9 | 10 | parameters: 11 | - description: The name for job 12 | from: 'train-prometheus-[a-z0-9]{4}' 13 | generate: expression 14 | name: APPLICATION_NAME 15 | required: true 16 | - name: URL 17 | description: URL of prometheus server 18 | required: true 19 | - name: BEARER_TOKEN 20 | description: Bearer Token for accessing prometheus 21 | required: true 22 | - name: BOTO_ACCESS_KEY 23 | description: Access key to connect to CEPH endpoint storage (or any similar S3 type storage) 24 | required: true 25 | - name: BOTO_SECRET_KEY 26 | description: Secret key to connect to CEPH endpoint storage (or any similar S3 type storage) 27 | required: true 28 | - name: BOTO_OBJECT_STORE 29 | description: Bucket Name on CEPH (or any similar S3 type storage) 30 | required: true 31 | - name: BOTO_STORE_ENDPOINT 32 | description: The URL to connect to the CEPH storage (or any similar S3 type storage) 33 | 
required: true 34 | - name: GIT_URI 35 | value: https://github.com/4n4nd/train-prometheus.git 36 | required: true 37 | - name: CHUNK_SIZE 38 | description: Size of chunks in which Data is scraped from Prometheus (Should be smaller than DATA_SIZE) 39 | required: false 40 | value: '1d' 41 | - name: DATA_SIZE 42 | description: Size of data scraped from Prometheus (Should be bigger than CHUNK_SIZE) 43 | required: false 44 | value: '1d' 45 | - name: TRAINING_REPEAT_HOURS 46 | description: number of hours to repeat model training 47 | required: false 48 | value: '6' 49 | - name: DATA_WINDOW_SIZE 50 | description: Sliding data window size in days (Number of days worth of past data to use as training data ) 51 | required: false 52 | value: '60' 53 | - name: STORE_INTERMEDIATE_DATA 54 | description: Store Dataframes of cumulated training dataframes to ceph 55 | required: false 56 | value: 'True' 57 | - name: GET_OLDER_DATA 58 | description: Use the previously stored dataframes in ceph to train the models 59 | required: false 60 | value: 'True' 61 | 62 | objects: 63 | - apiVersion: v1 64 | kind: ImageStream 65 | metadata: 66 | name: ${APPLICATION_NAME} 67 | labels: 68 | app: ${APPLICATION_NAME} 69 | spec: 70 | dockerImageRepository: ${APPLICATION_NAME} 71 | tags: 72 | - name: latest 73 | lookupPolicy: 74 | local: true 75 | 76 | - apiVersion: v1 77 | kind: BuildConfig 78 | metadata: 79 | name: ${APPLICATION_NAME} 80 | labels: 81 | app: ${APPLICATION_NAME} 82 | spec: 83 | resources: 84 | limits: 85 | memory: 4Gi 86 | cpu: "2" 87 | output: 88 | to: 89 | kind: ImageStreamTag 90 | name: ${APPLICATION_NAME}:latest 91 | source: 92 | git: 93 | uri: ${GIT_URI} 94 | type: Git 95 | strategy: 96 | type: Source 97 | sourceStrategy: 98 | env: 99 | - name: APP_FILE 100 | value: 'app.py' 101 | - name: GIT_SSL_NO_VERIFY 102 | value: 'true' 103 | forcePull: true 104 | from: 105 | kind: DockerImage 106 | name: 'docker.io/centos/python-36-centos7:latest' 107 | triggers: 108 | - imageChange: {} 109 | type: ImageChange 110 | - type: ConfigChange 111 | 112 | - apiVersion: v1 113 | kind: DeploymentConfig 114 | metadata: 115 | name: ${APPLICATION_NAME} 116 | labels: 117 | deploymentConfig: ${APPLICATION_NAME} 118 | app: ${APPLICATION_NAME} 119 | spec: 120 | replicas: 1 121 | selector: 122 | deploymentConfig: ${APPLICATION_NAME} 123 | strategy: 124 | type: Rolling 125 | template: 126 | metadata: 127 | labels: 128 | deploymentConfig: ${APPLICATION_NAME} 129 | app: ${APPLICATION_NAME} 130 | spec: 131 | containers: 132 | - env: 133 | - name: PROM_BACKUP_ALL 134 | value: "true" 135 | - name: BEARER_TOKEN 136 | value: "${BEARER_TOKEN}" 137 | - name: URL 138 | value: "${URL}" 139 | - name: BOTO_ACCESS_KEY 140 | value: "${BOTO_ACCESS_KEY}" 141 | - name: BOTO_SECRET_KEY 142 | value: "${BOTO_SECRET_KEY}" 143 | - name: BOTO_OBJECT_STORE 144 | value: "${BOTO_OBJECT_STORE}" 145 | - name: BOTO_STORE_ENDPOINT 146 | value: "${BOTO_STORE_ENDPOINT}" 147 | - name: CHUNK_SIZE 148 | value: "${CHUNK_SIZE}" 149 | - name: DATA_SIZE 150 | value: "${DATA_SIZE}" 151 | - name: TRAINING_REPEAT_HOURS 152 | value: "${TRAINING_REPEAT_HOURS}" 153 | - name: DATA_WINDOW_SIZE 154 | value: "${DATA_WINDOW_SIZE}" 155 | - name: STORE_INTERMEDIATE_DATA 156 | value: "${STORE_INTERMEDIATE_DATA}" 157 | - name: GET_OLDER_DATA 158 | value: "${GET_OLDER_DATA}" 159 | image: ${APPLICATION_NAME} 160 | imagePullPolicy: IfNotPresent 161 | name: ${APPLICATION_NAME} 162 | resources: 163 | requests: 164 | memory: 500Mi 165 | cpu: "4" 166 | limits: 167 | memory: 16Gi 168 | cpu: 
"4" 169 | terminationMessagePath: /dev/termination-log 170 | dnsPolicy: ClusterFirst 171 | restartPolicy: Always 172 | triggers: 173 | - imageChangeParams: 174 | automatic: true 175 | containerNames: 176 | - ${APPLICATION_NAME} 177 | from: 178 | kind: ImageStreamTag 179 | name: ${APPLICATION_NAME}:latest 180 | type: ImageChange 181 | - type: ConfigChange 182 | 183 | - apiVersion: v1 184 | kind: Service 185 | metadata: 186 | name: ${APPLICATION_NAME} 187 | labels: 188 | app: ${APPLICATION_NAME} 189 | spec: 190 | ports: 191 | - name: 8080-tcp 192 | port: 8080 193 | protocol: TCP 194 | targetPort: 8080 195 | selector: 196 | deploymentConfig: ${APPLICATION_NAME} 197 | 198 | - apiVersion: v1 199 | kind: Route 200 | metadata: 201 | labels: 202 | app: ${APPLICATION_NAME} 203 | name: ${APPLICATION_NAME} 204 | spec: 205 | port: 206 | targetPort: 8080-tcp 207 | to: 208 | kind: Service 209 | name: ${APPLICATION_NAME} 210 | -------------------------------------------------------------------------------- /prometheus.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | import requests 3 | import datetime 4 | import json 5 | 6 | # Disable SSL warnings 7 | from requests.packages.urllib3.exceptions import InsecureRequestWarning 8 | requests.packages.urllib3.disable_warnings(InsecureRequestWarning) 9 | 10 | DEBUG = False 11 | MAX_REQUEST_RETRIES = 5 12 | 13 | class Prometheus: 14 | """docstring for Prometheus.""" 15 | def __init__(self, url='', end_time=None, token=None, data_chunk='1h',stored_data='1h'): 16 | self.headers = { 'Authorization': "bearer {}".format(token) } 17 | self.url = url 18 | self.prometheus_host = urlparse(self.url).netloc 19 | self._all_metrics = None 20 | self.data_chunk_size = data_chunk 21 | self.end_time = datetime.datetime.now() 22 | self.stored_data_range = stored_data 23 | self.DATA_CHUNK_SIZE_LIST = { 24 | '1m' : 60, 25 | '3m' : 180, 26 | '5m' : 300, 27 | '30m': 1800, 28 | '1h' : 3600, 29 | '3h' : 10800, 30 | '6h' : 21600, 31 | '12h': 43200, 32 | '1d' : 86400, 33 | '2d' : 172800} 34 | 35 | def all_metrics(self): 36 | ''' 37 | Get the list of all the metrics that the prometheus host has 38 | ''' 39 | if not self._all_metrics: 40 | response = requests.get('{0}/api/v1/label/__name__/values'.format(self.url), 41 | verify=False, # Disable ssl certificate verification temporarily 42 | headers=self.headers) 43 | if DEBUG: 44 | print("Headers -> ",self.headers) 45 | print("URL => ", response.url) 46 | if response.status_code == 200: 47 | self._all_metrics = response.json()['data'] 48 | else: 49 | raise Exception("HTTP Status Code {} {} ({})".format( 50 | response.status_code, 51 | requests.status_codes._codes[response.status_code][0], 52 | response.content 53 | )) 54 | return self._all_metrics 55 | 56 | def get_metric(self, name, chunks=None, data_size=None): 57 | if chunks: 58 | if str(chunks) in self.DATA_CHUNK_SIZE_LIST: 59 | self.data_chunk_size = str(chunks) 60 | pass 61 | else: 62 | print("Invalid Chunk Size, using default value: {}".format(self.data_chunk_size)) 63 | pass 64 | if data_size: 65 | if str(data_size) in self.DATA_CHUNK_SIZE_LIST: 66 | self.stored_data_range = str(data_size) 67 | pass 68 | else: 69 | print("Invalid Data Size, using default value: {}".format(self.stored_data_range)) 70 | pass 71 | 72 | if not name in self.all_metrics(): 73 | raise Exception("{} is not a valid metric".format(name)) 74 | elif DEBUG: 75 | print("Metric is valid.") 76 | 77 | # num_chunks = 1 78 | num_chunks = 
int(self.DATA_CHUNK_SIZE_LIST[self.stored_data_range]/self.DATA_CHUNK_SIZE_LIST[self.data_chunk_size]) # Calculate the number of chunks using total data size and chunk size. 79 | metrics = self.get_metrics_from_prom(name, num_chunks) 80 | if metrics: 81 | return metrics 82 | 83 | 84 | def get_metrics_from_prom(self, name, chunks): 85 | if not name in self.all_metrics(): 86 | raise Exception("{} is not a valid metric".format(name)) 87 | 88 | # start = self.start_time.timestamp() 89 | end_timestamp = self.end_time.timestamp() 90 | chunk_size = self.DATA_CHUNK_SIZE_LIST[self.data_chunk_size] 91 | start = end_timestamp - self.DATA_CHUNK_SIZE_LIST[self.stored_data_range] + chunk_size 92 | data = [] 93 | for i in range(chunks): 94 | # gc.collect() # Garbage collect to save Memory 95 | if DEBUG: 96 | print("Getting chunk: ", i) 97 | print("Start Time: ",datetime.datetime.fromtimestamp(start)) 98 | 99 | tries = 0 100 | while tries < MAX_REQUEST_RETRIES: # Retry code in case of errors 101 | response = requests.get('{0}/api/v1/query'.format(self.url), # using the query API to get raw data 102 | params={'query': name+'['+self.data_chunk_size+']', 103 | 'time': start 104 | }, 105 | verify=False, # Disable ssl certificate verification temporarily 106 | headers=self.headers) 107 | if DEBUG: 108 | print(response.url) 109 | pass 110 | 111 | tries+=1 112 | if response.status_code == 200: 113 | data += response.json()['data']['result'] 114 | 115 | if DEBUG: 116 | # print("Size of recent chunk = ",getsizeof(data)) 117 | # print(data) 118 | print(datetime.datetime.fromtimestamp(response.json()['data']['result'][0]['values'][0][0])) 119 | print(datetime.datetime.fromtimestamp(response.json()['data']['result'][0]['values'][-1][0])) 120 | pass 121 | 122 | del response 123 | tries = MAX_REQUEST_RETRIES 124 | elif response.status_code == 504: 125 | if tries >= MAX_REQUEST_RETRIES: 126 | self.connection_errors_count+=1 127 | return False 128 | else: 129 | print("Retry Count: ",tries) 130 | sleep(CONNECTION_RETRY_WAIT_TIME) # Wait for a second before making a new request 131 | else: 132 | if tries >= MAX_REQUEST_RETRIES: 133 | self.connection_errors_count+=1 134 | raise Exception("HTTP Status Code {} {} ({})".format( 135 | response.status_code, 136 | requests.status_codes._codes[response.status_code][0], 137 | response.content 138 | )) 139 | else: 140 | print("Retry Count: ",tries) 141 | sleep(CONNECTION_RETRY_WAIT_TIME) 142 | 143 | start += chunk_size 144 | 145 | return(json.dumps(data)) 146 | 147 | def get_current_metric_value(self, metric_name, label_config = None): 148 | data = [] 149 | if label_config: 150 | label_list = [str(key+"="+ "'" + label_config[key]+ "'") for key in label_config] 151 | # print(label_list) 152 | query = metric_name + "{" + ",".join(label_list) + "}" 153 | else: 154 | query = metric_name 155 | response = requests.get('{0}/api/v1/query'.format(self.url), # using the query API to get raw data 156 | params={'query': query},#label_config}, 157 | verify=False, # Disable ssl certificate verification temporarily 158 | headers=self.headers) 159 | data += response.json()['data']['result'] 160 | return (json.dumps(data)) 161 | pass 162 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 3 | # !Newer rewritten version is available here: https://github.com/AICoE/prometheus-anomaly-detector 4 | 5 | # Train Prometheus 6 | This python application has been written to deploy a 
training pipeline on OpenShift. At regular, user-specified intervals, this pipeline collects new data directly from a Prometheus instance and retrains a model on it. This application also hosts a web page which can be scraped as a target by Prometheus. This target currently serves 6 different metrics using two different prediction models (Prophet and Fourier Extrapolation). 7 | 8 | ## Getting Started 9 | 10 | ### Installing prerequisites 11 | 12 | To run this application, you will need to install the libraries listed in requirements.txt. 13 | 14 | To install all the dependencies at once, run the following command from inside the repository directory: 15 | ``` 16 | pip install -r requirements.txt 17 | ``` 18 | After all the prerequisites have been installed, open the Makefile; the required and optional variables are listed at the top. 19 | The required variables are used to communicate with the Prometheus and storage endpoints. 20 | 21 | Populating the Makefile is the most important step, as it is used to run the application on OpenShift, in Docker, or on your local machine. 22 | 23 | ### Running on a local machine 24 | 25 | After setting up the credentials in your Makefile, run the following command to start a Flask server which will regularly train the models and serve the predicted metrics as a Prometheus target: 26 | 27 | ``` 28 | make run_model 29 | ``` 30 | ## Running on Docker 31 | After populating all the required variables, set the name for your Docker app by changing the docker_app_name variable. Then run the following command to build the Docker image: 32 | ``` 33 | make docker_build 34 | ``` 35 | This command uses the Dockerfile included in the repository to build the image, so you can edit the Dockerfile to customize how the image is built. 36 | 37 | After the image is successfully built, run the following command to start a Flask server in a Docker container. This command also passes the Prometheus and storage credentials from the Makefile into the container as environment variables, so they can be easily changed there. 38 | 39 | ``` 40 | make docker_run 41 | ``` 42 | ## Deploying on OpenShift 43 | 44 | * ### Deploying a Flask application to predict and serve the predicted metrics: 45 | In the Makefile, set up the required variables and then run the following command: 46 | ``` 47 | make oc_deploy 48 | ``` 49 | This will create a deployment on OpenShift which, after training the Prophet model, will serve the predicted metrics as a web page (using the Flask web server). These predicted metrics can later be collected by a Prometheus instance. 50 | 51 | The following is a sample of what the served metrics will look like: 52 | ``` 53 | # HELP process_virtual_memory_bytes Virtual memory size in bytes. 54 | # TYPE process_virtual_memory_bytes gauge 55 | process_virtual_memory_bytes 13.0 56 | # HELP process_resident_memory_bytes Resident memory size in bytes. 57 | # TYPE process_resident_memory_bytes gauge 58 | process_resident_memory_bytes 31.0 59 | # HELP process_start_time_seconds Start time of the process since unix epoch in seconds. 60 | # TYPE process_start_time_seconds gauge 61 | process_start_time_seconds 15.25 62 | # HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. 63 | # TYPE process_cpu_seconds_total counter 64 | process_cpu_seconds_total 69.88 65 | # HELP process_open_fds Number of open file descriptors. 66 | # TYPE process_open_fds gauge 67 | process_open_fds 60.0 68 | # HELP process_max_fds Maximum number of open file descriptors.
69 | # TYPE process_max_fds gauge 70 | process_max_fds 14.0 71 | # HELP python_info Python platform information 72 | # TYPE python_info gauge 73 | python_info{implementation="CPython",major="3",minor="6",patchlevel="5",version="3.6.5"} 1.0 74 | # HELP predicted_values_prophet Forecasted value from Prophet model 75 | # TYPE predicted_values_prophet gauge 76 | predicted_values_prophet{beta_kubernetes_io_arch="amd64",beta_kubernetes_io_os="linux",instance="cpt-0001.redhat.com",job="kubernetes-nodes",kubernetes_io_hostname="cpt-0001.redhat.com",node_role_kubernetes_io_compute="true",operation_type="create_container",provider="rhos",quantile="0.5",region="compute",size="small"} 32.99 77 | # HELP predicted_values_prophet_yhat_upper Forecasted value upper bound from Prophet model 78 | # TYPE predicted_values_prophet_yhat_upper gauge 79 | predicted_values_prophet_yhat_upper{beta_kubernetes_io_arch="amd64",beta_kubernetes_io_os="linux",instance="cpt.redhat.com",job="kubernetes-nodes",kubernetes_io_hostname="cpt-0001.redhat.com",node_role_kubernetes_io_compute="true",operation_type="create_container",provider="rhos",quantile="0.5",region="compute",size="small"} 36.728885 80 | # HELP predicted_values_prophet_yhat_lower Forecasted value lower bound from Prophet model 81 | # TYPE predicted_values_prophet_yhat_lower gauge 82 | predicted_values_prophet_yhat_lower{beta_kubernetes_io_arch="amd64",beta_kubernetes_io_os="linux",instance="cpt-0001.redhat.com",job="kubernetes-nodes",kubernetes_io_hostname="cpt-0001.redhat.com",node_role_kubernetes_io_compute="true",operation_type="create_container",provider="rhos",quantile="0.5",region="compute",size="small"} 27881.58691175386 83 | # HELP predicted_values_fourier Forecasted value from Fourier Transform model 84 | # TYPE predicted_values_fourier gauge 85 | predicted_values_fourier{beta_kubernetes_io_arch="amd64",beta_kubernetes_io_os="linux",instance="cpt-0001.redhat.com",job="kubernetes-nodes",kubernetes_io_hostname="cpt-0001.redhat.com",node_role_kubernetes_io_compute="true",operation_type="create_container",provider="rhos",quantile="0.5",region="compute",size="small"} 29838.64724605837 86 | # HELP predicted_values_fourier_yhat_upper Forecasted value upper bound from Fourier Transform model 87 | # TYPE predicted_values_fourier_yhat_upper gauge 88 | predicted_values_fourier_yhat_upper{beta_kubernetes_io_arch="amd64",beta_kubernetes_io_os="linux",instance="cpt-0001.redhat.com",job="kubernetes-nodes",kubernetes_io_hostname="cpt-0001.redhat.com",node_role_kubernetes_io_compute="true",operation_type="create_container",provider="rhos",quantile="0.5",region="compute",size="small"} 37111.31044977396 89 | # HELP predicted_values_fourier_yhat_lower Forecasted value lower bound from Fourier Transform model 90 | # TYPE predicted_values_fourier_yhat_lower gauge 91 | predicted_values_fourier_yhat_lower{beta_kubernetes_io_arch="amd64",beta_kubernetes_io_os="linux",instance="cpt-0001.redhat.com",job="kubernetes-nodes",kubernetes_io_hostname="cpt-0001.redhat.com",node_role_kubernetes_io_compute="true",operation_type="create_container",provider="rhos",quantile="0.5",region="compute",size="small"} 29739.05799347848 92 | ``` 93 | 94 | ## Built With 95 | 96 | * [fbprohphet](https://github.com/facebook/prophet) - Facebook's timeseries forecasting library 97 | * [requests](http://docs.python-requests.org/en/master/) - HTTP Library for python 98 | * [boto3](https://boto3.readthedocs.io/en/latest/reference/core/session.html) - AWS sdk for python 99 | * 
[pandas](http://pandas.pydata.org/) - High Performance Data Structure 100 | * [flask](http://flask.pocoo.org/) - A lightweight web application framework 101 | * [apscheduler](https://apscheduler.readthedocs.io/en/latest/) - Python Scheduling library 102 | * [prometheus_client](https://github.com/prometheus/client_python) - Official Python client for Prometheus 103 | * [sortedcontainers](http://www.grantjenks.com/docs/sortedcontainers/) - Pure python sorted simple data structures 104 | * [Anomaly Detection](https://github.com/nfrumkin/forecast-prometheus/blob/master/anomaly_detector.py) - Anomaly Detection Function by Natasha Frumkin 105 | * [Fourier Extrapolation Model](https://github.com/nfrumkin/forecast-prometheus/blob/master/fourier_train.py) - Fourier Extrapolation Model by Natasha Frumkin 106 | * [Serving Prometheus Metrics](https://github.com/hemajv/flask-prometheus/blob/master/servicemetrics.py) - Flask Server to host Prometheus metrics by Hema Veeradhi 107 | -------------------------------------------------------------------------------- /lib/prometheus.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | import requests 3 | import datetime 4 | import json 5 | 6 | # Disable SSL warnings 7 | from requests.packages.urllib3.exceptions import InsecureRequestWarning 8 | requests.packages.urllib3.disable_warnings(InsecureRequestWarning) 9 | 10 | DEBUG = False 11 | MAX_REQUEST_RETRIES = 5 12 | 13 | class Prometheus: 14 | """docstring for Prometheus.""" 15 | def __init__(self, url='', end_time=None, token=None, data_chunk='1h',stored_data='1h'): 16 | self.headers = { 'Authorization': "bearer {}".format(token) } 17 | self.url = url 18 | self.prometheus_host = urlparse(self.url).netloc 19 | self._all_metrics = None 20 | self.data_chunk_size = data_chunk 21 | self.end_time = datetime.datetime.now() 22 | self.stored_data_range = stored_data 23 | self.DATA_CHUNK_SIZE_LIST = { 24 | '1m' : 60, 25 | '5m' : 300, 26 | '30m': 1800, 27 | '1h' : 3600, 28 | '3h' : 10800, 29 | '6h' : 21600, 30 | '12h': 43200, 31 | '1d' : 86400, 32 | '2d' : 172800} 33 | 34 | def all_metrics(self): 35 | ''' 36 | Get the list of all the metrics that the prometheus host has 37 | ''' 38 | if not self._all_metrics: 39 | response = requests.get('{0}/api/v1/label/__name__/values'.format(self.url), 40 | verify=False, # Disable ssl certificate verification temporarily 41 | headers=self.headers) 42 | if DEBUG: 43 | print("Headers -> ",self.headers) 44 | print("URL => ", response.url) 45 | if response.status_code == 200: 46 | self._all_metrics = response.json()['data'] 47 | else: 48 | raise Exception("HTTP Status Code {} {} ({})".format( 49 | response.status_code, 50 | requests.status_codes._codes[response.status_code][0], 51 | response.content 52 | )) 53 | return self._all_metrics 54 | 55 | def get_metric(self, name, chunks=None, data_size=None): 56 | if chunks: 57 | if str(chunks) in self.DATA_CHUNK_SIZE_LIST: 58 | self.data_chunk_size = str(chunks) 59 | pass 60 | else: 61 | print("Invalid Chunk Size, using default value: {}".format(self.data_chunk_size)) 62 | pass 63 | if data_size: 64 | if str(data_size) in self.DATA_CHUNK_SIZE_LIST: 65 | self.stored_data_range = str(data_size) 66 | pass 67 | else: 68 | print("Invalid Data Size, using default value: {}".format(self.stored_data_range)) 69 | pass 70 | 71 | if not name in self.all_metrics(): 72 | raise Exception("{} is not a valid metric".format(name)) 73 | elif DEBUG: 74 | print("Metric is valid.") 75 | 76 
| # num_chunks = 1 77 | num_chunks = int(self.DATA_CHUNK_SIZE_LIST[self.stored_data_range]/self.DATA_CHUNK_SIZE_LIST[self.data_chunk_size]) # Calculate the number of chunks using total data size and chunk size. 78 | metrics = self.get_metrics_from_prom(name, num_chunks) 79 | if metrics: 80 | return metrics 81 | 82 | # def get_metrics_from_prom(self, name, chunks): 83 | # if not name in self.all_metrics(): 84 | # raise Exception("{} is not a valid metric".format(name)) 85 | # 86 | # # start = self.start_time.timestamp() 87 | # end_timestamp = self.end_time.timestamp() 88 | # chunk_size = self.DATA_CHUNK_SIZE_LIST[self.data_chunk_size] 89 | # start = end_timestamp #- self.DATA_CHUNK_SIZE_LIST[self.stored_data_range] + chunk_size 90 | # data = [] 91 | # for i in range(chunks): 92 | # # gc.collect() # Garbage collect to save Memory 93 | # if DEBUG: 94 | # print("Getting chunk: ", i) 95 | # print("Start Time: ",datetime.datetime.fromtimestamp(start)) 96 | # 97 | # tries = 0 98 | # while tries < MAX_REQUEST_RETRIES: # Retry code in case of errors 99 | # response = requests.get('{0}/api/v1/query'.format(self.url), # using the query API to get raw data 100 | # params={'query': name+'['+self.data_chunk_size+']', 101 | # 'time': start 102 | # }, 103 | # verify=False, # Disable ssl certificate verification temporarily 104 | # headers=self.headers) 105 | # if DEBUG: 106 | # print(response.url) 107 | # pass 108 | # 109 | # tries+=1 110 | # if response.status_code == 200: 111 | # data += response.json()['data']['result'] 112 | # 113 | # if DEBUG: 114 | # # print("Size of recent chunk = ",getsizeof(data)) 115 | # # print(data) 116 | # print(datetime.datetime.fromtimestamp(response.json()['data']['result'][0]['values'][0][0])) 117 | # print(datetime.datetime.fromtimestamp(response.json()['data']['result'][0]['values'][-1][0])) 118 | # pass 119 | # 120 | # del response 121 | # tries = MAX_REQUEST_RETRIES 122 | # elif response.status_code == 504: 123 | # if tries >= MAX_REQUEST_RETRIES: 124 | # self.connection_errors_count+=1 125 | # return False 126 | # else: 127 | # print("Retry Count: ",tries) 128 | # sleep(CONNECTION_RETRY_WAIT_TIME) # Wait for a second before making a new request 129 | # else: 130 | # if tries >= MAX_REQUEST_RETRIES: 131 | # self.connection_errors_count+=1 132 | # raise Exception("HTTP Status Code {} {} ({})".format( 133 | # response.status_code, 134 | # requests.status_codes._codes[response.status_code][0], 135 | # response.content 136 | # )) 137 | # else: 138 | # print("Retry Count: ",tries) 139 | # sleep(CONNECTION_RETRY_WAIT_TIME) 140 | # 141 | # start += chunk_size 142 | # 143 | # return(json.dumps(data)) #This works 144 | 145 | def get_metrics_from_prom(self, name, chunks): 146 | if not name in self.all_metrics(): 147 | raise Exception("{} is not a valid metric".format(name)) 148 | 149 | # start = self.start_time.timestamp() 150 | end_timestamp = self.end_time.timestamp() 151 | chunk_size = self.DATA_CHUNK_SIZE_LIST[self.data_chunk_size] 152 | start = end_timestamp - self.DATA_CHUNK_SIZE_LIST[self.stored_data_range] + chunk_size 153 | data = [] 154 | for i in range(chunks): 155 | # gc.collect() # Garbage collect to save Memory 156 | if DEBUG: 157 | print("Getting chunk: ", i) 158 | print("Start Time: ",datetime.datetime.fromtimestamp(start)) 159 | 160 | tries = 0 161 | while tries < MAX_REQUEST_RETRIES: # Retry code in case of errors 162 | response = requests.get('{0}/api/v1/query'.format(self.url), # using the query API to get raw data 163 | params={'query': 
name+'['+self.data_chunk_size+']', 164 | 'time': start 165 | }, 166 | verify=False, # Disable ssl certificate verification temporarily 167 | headers=self.headers) 168 | if DEBUG: 169 | print(response.url) 170 | pass 171 | 172 | tries+=1 173 | if response.status_code == 200: 174 | data += response.json()['data']['result'] 175 | 176 | if DEBUG: 177 | # print("Size of recent chunk = ",getsizeof(data)) 178 | # print(data) 179 | print(datetime.datetime.fromtimestamp(response.json()['data']['result'][0]['values'][0][0])) 180 | print(datetime.datetime.fromtimestamp(response.json()['data']['result'][0]['values'][-1][0])) 181 | pass 182 | 183 | del response 184 | tries = MAX_REQUEST_RETRIES 185 | elif response.status_code == 504: 186 | if tries >= MAX_REQUEST_RETRIES: 187 | self.connection_errors_count+=1 188 | return False 189 | else: 190 | print("Retry Count: ",tries) 191 | sleep(CONNECTION_RETRY_WAIT_TIME) # Wait for a second before making a new request 192 | else: 193 | if tries >= MAX_REQUEST_RETRIES: 194 | self.connection_errors_count+=1 195 | raise Exception("HTTP Status Code {} {} ({})".format( 196 | response.status_code, 197 | requests.status_codes._codes[response.status_code][0], 198 | response.content 199 | )) 200 | else: 201 | print("Retry Count: ",tries) 202 | sleep(CONNECTION_RETRY_WAIT_TIME) 203 | 204 | start += chunk_size 205 | 206 | return(json.dumps(data)) 207 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import os 4 | import sys 5 | import bz2 6 | import pandas 7 | import argparse 8 | import pickle 9 | from flask import Flask, render_template_string, abort, Response 10 | from datetime import datetime, timedelta 11 | from prometheus_client import CollectorRegistry, generate_latest, REGISTRY, Counter, Gauge, Histogram 12 | from prometheus import Prometheus 13 | from model import * 14 | from ceph import CephConnect as cp 15 | from ast import literal_eval 16 | # Scheduling stuff 17 | from apscheduler.schedulers.background import BackgroundScheduler 18 | from apscheduler.triggers.interval import IntervalTrigger 19 | import atexit 20 | 21 | 22 | app = Flask(__name__) 23 | 24 | data_window = int(os.getenv('DATA_WINDOW_SIZE',60)) # Number of days of past data, the model should use to train 25 | 26 | url = os.getenv('URL') 27 | token = os.getenv('BEARER_TOKEN') 28 | 29 | # Specific metric to run the model on 30 | metric_name = os.getenv('METRIC_NAME','kubelet_docker_operations_latency_microseconds') 31 | 32 | print("Using Metric {}.".format(metric_name)) 33 | 34 | # This is where the model dictionary will be stored and retrieved from 35 | data_storage_path = "Data_Frames" + "/" + url[8:] + "/"+ metric_name + "/" + "prophet_model" + ".pkl" 36 | 37 | # Chunk size, download the complete data, but in smaller chunks, should be less than or equal to DATA_SIZE 38 | chunk_size = str(os.getenv('CHUNK_SIZE','1h')) 39 | 40 | # Net data size to scrape from prometheus 41 | data_size = str(os.getenv('DATA_SIZE','1h')) 42 | 43 | train_schedule = int(os.getenv('TRAINING_REPEAT_HOURS',6)) 44 | 45 | 46 | TRUE_LIST = ["True", "true", "1", "y"] 47 | 48 | store_intermediate_data = os.getenv("STORE_INTERMEDIATE_DATA", "False") # Setting this to true will store intermediate dataframes to ceph 49 | 50 | 51 | if str(os.getenv('GET_OLDER_DATA',"False")) in TRUE_LIST: 52 | print("Collecting previously stored data from {}".format(data_storage_path)) 53 | 
data_dict = cp().get_latest_df_dict(data_storage_path) # Need error handling inside this function, in case the storage path does not exist 54 | pass 55 | else: 56 | data_dict = {} 57 | 58 | 59 | config_list = [] 60 | fixed_label_config = str(os.getenv("LABEL_CONFIG",None)) # by default it will train for all label configurations. WARNING: Tthat might take a lot of time depending on your metrics and cpu 61 | if fixed_label_config != "None": 62 | config_list = fixed_label_config.split(";") # Separate multiple label configurations using a ';' (semi-colon) 63 | fixed_label_config_dict = literal_eval(config_list[0]) # # TODO: Add more error handling here 64 | 65 | 66 | predictions_dict_prophet = {} 67 | predictions_dict_fourier = {} 68 | current_metric_metadata = "" 69 | current_metric_metadata_dict = {} 70 | 71 | # iteration = 0 72 | def job(current_time): 73 | # TODO: Replace this function with model training function and set up the correct IntervalTrigger time 74 | global data_dict, predictions_dict_prophet, predictions_dict_fourier, current_metric_metadata, current_metric_metadata_dict, data_window, url, token, chunk_size, data_size, TRUE_LIST, store_intermediate_data 75 | global data, config_list 76 | # iteration += 1 77 | start_time = time.time() 78 | prom = Prometheus(url=url, token=token, data_chunk=chunk_size, stored_data=data_size) 79 | metric = prom.get_metric(metric_name) 80 | print("metric collected.") 81 | 82 | # Convert data to json 83 | metric = json.loads(metric) 84 | 85 | # Metric Json is converted to a shaped dataframe 86 | data_dict = get_df_from_json(metric, data_dict, data_window) # This dictionary contains all the sub-labels as keys and their data as Pandas DataFrames 87 | del metric, prom 88 | 89 | if str(store_intermediate_data) in TRUE_LIST: 90 | print("DataFrame stored at: ",cp().store_data(metric_name, pickle.dumps(data_dict), (data_storage_path + str(datetime.now().strftime('%Y%m%d%H%M'))))) 91 | pass 92 | 93 | 94 | if fixed_label_config != "None": #If a label config has been specified 95 | single_label_data_dict = {} 96 | 97 | # split into multiple label configs 98 | existing_config_list = list(data_dict.keys()) 99 | for config in config_list: 100 | config_found = False 101 | for existing_config in existing_config_list: 102 | if SortedDict(literal_eval(existing_config)) == SortedDict(literal_eval(config)): 103 | single_label_data_dict[existing_config] = data_dict[existing_config] 104 | config_found = True 105 | pass 106 | if not config_found: 107 | print("Specified Label Configuration {} was not found".format(config)) 108 | raise KeyError 109 | pass 110 | # single_label_data_dict[config] = data_dict[config] 111 | pass 112 | 113 | # single_label_data_dict[fixed_label_config] = data_dict[fixed_label_config] 114 | current_metric_metadata = list(single_label_data_dict.keys())[0] 115 | current_metric_metadata_dict = literal_eval(current_metric_metadata) 116 | 117 | print(data_dict[current_metric_metadata].head(5)) 118 | print(data_dict[current_metric_metadata].tail(5)) 119 | 120 | print("Using the default label config") 121 | predictions_dict_prophet = predict_metrics(single_label_data_dict) 122 | # print(single_label_data_dict) 123 | predictions_dict_fourier = predict_metrics_fourier(single_label_data_dict) 124 | pass 125 | else: 126 | for x in data_dict: 127 | print(data_dict[x].head(5)) 128 | print(data_dict[x].tail(5)) 129 | break 130 | pass 131 | predictions_dict_prophet = predict_metrics(data_dict) 132 | predictions_dict_fourier = predict_metrics_fourier(data_dict) 
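# At this point job() has scraped DATA_SIZE worth of the configured metric from Prometheus
# in CHUNK_SIZE chunks, merged it into one pandas DataFrame per label configuration
# (trimmed to the DATA_WINDOW_SIZE sliding window), optionally stored the intermediate
# DataFrames to Ceph, and trained both the Prophet and the Fourier-extrapolation models.
# The resulting forecasts are held in predictions_dict_prophet / predictions_dict_fourier
# for the /metrics route to serve.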
133 | 134 | # TODO: Trigger Data Pruning here 135 | function_run_time = time.time() - start_time 136 | 137 | print("Total time taken to train was: {} seconds.".format(function_run_time)) 138 | pass 139 | 140 | job(datetime.now()) 141 | 142 | # Schedular schedules a background job that needs to be run regularly 143 | scheduler = BackgroundScheduler() 144 | scheduler.start() 145 | scheduler.add_job( 146 | func=lambda: job(datetime.now()), 147 | trigger=IntervalTrigger(hours=train_schedule), 148 | id='training_job', 149 | name='Train Prophet model every day regularly', 150 | replace_existing=True) 151 | 152 | # Shut down the scheduler when exiting the app 153 | atexit.register(lambda: scheduler.shutdown()) 154 | 155 | 156 | 157 | # Initialize Multiple gauge metrics for the predicted values 158 | print("current_metric_metadata_dict: ", current_metric_metadata_dict) 159 | predicted_metric_name = "predicted_" + metric_name 160 | PREDICTED_VALUES_PROPHET = Gauge(predicted_metric_name + '_prophet', 'Forecasted value from Prophet model', [label for label in current_metric_metadata_dict if label != "__name__"]) 161 | PREDICTED_VALUES_PROPHET_UPPER = Gauge(predicted_metric_name + '_prophet_yhat_upper', 'Forecasted value upper bound from Prophet model', [label for label in current_metric_metadata_dict if label != "__name__"]) 162 | PREDICTED_VALUES_PROPHET_LOWER = Gauge(predicted_metric_name + '_prophet_yhat_lower', 'Forecasted value lower bound from Prophet model', [label for label in current_metric_metadata_dict if label != "__name__"]) 163 | 164 | PREDICTED_VALUES_FOURIER = Gauge(predicted_metric_name + '_fourier', 'Forecasted value from Fourier Transform model', [label for label in current_metric_metadata_dict if label != "__name__"]) 165 | PREDICTED_VALUES_FOURIER_UPPER = Gauge(predicted_metric_name + '_fourier_yhat_upper', 'Forecasted value upper bound from Fourier Transform model', [label for label in current_metric_metadata_dict if label != "__name__"]) 166 | PREDICTED_VALUES_FOURIER_LOWER = Gauge(predicted_metric_name + '_fourier_yhat_lower', 'Forecasted value lower bound from Fourier Transform model', [label for label in current_metric_metadata_dict if label != "__name__"]) 167 | 168 | PREDICTED_ANOMALY_PROPHET = Gauge(predicted_metric_name + '_prophet_anomaly', 'Detected Anomaly using the Prophet model', [label for label in current_metric_metadata_dict if label != "__name__"]) 169 | 170 | PREDICTED_ANOMALY_FOURIER = Gauge(predicted_metric_name + '_fourier_anomaly', 'Detected Anomaly using the Fourier model', [label for label in current_metric_metadata_dict if label != "__name__"]) 171 | 172 | # Standard Flask route stuff. 173 | @app.route('/') 174 | def hello_world(): 175 | return 'This is just a test page. Please add "/metrics" to the url of this page to see the predicted metrics.' 
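# The /metrics route below looks up the forecast row closest to the current time for every
# trained label configuration, pulls the metric's current value from Prometheus for comparison,
# updates the predicted_* gauges, and sets the *_anomaly gauges using detect_anomalies() on the
# most recent live samples.
#
# A Prometheus server can scrape this app like any other target. A minimal scrape_config sketch,
# assuming the app is reachable on port 8080 (the job name and target address below are
# illustrative, not part of this repository):
#
#   scrape_configs:
#     - job_name: 'train-prometheus'
#       metrics_path: /metrics
#       static_configs:
#         - targets: ['train-prom-example.apps.example.com:8080']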
176 | 177 | live_data_dict = {} 178 | 179 | @app.route('/metrics') 180 | def metrics(): 181 | global predictions_dict_prophet, predictions_dict_fourier, current_metric_metadata, current_metric_metadata_dict, metric_name, url, token, live_data_dict 182 | 183 | 184 | for metadata in predictions_dict_prophet: 185 | 186 | #Find the index matching with the current timestamp 187 | index_prophet = predictions_dict_prophet[metadata].index.get_loc(datetime.now(), method='nearest') 188 | index_fourier = predictions_dict_fourier[metadata].index.get_loc(datetime.now(), method='nearest') 189 | current_metric_metadata = metadata 190 | 191 | print("The current time is: ",datetime.now()) 192 | print("The matching index for Prophet model found was: \n", predictions_dict_prophet[metadata].iloc[[index_prophet]]) 193 | print("The matching index for Fourier Transform found was: \n", predictions_dict_fourier[metadata].iloc[[index_fourier]]) 194 | 195 | current_metric_metadata_dict = literal_eval(metadata) 196 | 197 | temp_current_metric_metadata_dict = current_metric_metadata_dict.copy() 198 | 199 | # delete the "__name__" key from the dictionary as we don't need it in labels (it is a non-permitted label) when serving the metrics 200 | del temp_current_metric_metadata_dict["__name__"] 201 | 202 | # TODO: the following function does not have good error handling or retry code in case of get request failure, need to fix that 203 | # Get the current metric value which will be compared with the predicted value to detect an anomaly 204 | metric = (Prometheus(url=url, token=token).get_current_metric_value(metric_name, temp_current_metric_metadata_dict)) 205 | 206 | # print("metric collected.") 207 | 208 | # Convert data to json 209 | metric = json.loads(metric) 210 | 211 | # Convert the json to a dictionary of pandas dataframes 212 | live_data_dict = get_df_from_single_value_json(metric, live_data_dict) 213 | 214 | # Trim the live data dataframe to only 5 most recent values 215 | live_data_dict[metadata] = live_data_dict[metadata][-5:] 216 | # print(live_data_dict) 217 | 218 | # Update the metric values for prophet model 219 | PREDICTED_VALUES_PROPHET.labels(**temp_current_metric_metadata_dict).set(predictions_dict_prophet[metadata]['yhat'][index_prophet]) 220 | PREDICTED_VALUES_PROPHET_UPPER.labels(**temp_current_metric_metadata_dict).set(predictions_dict_prophet[metadata]['yhat_upper'][index_prophet]) 221 | PREDICTED_VALUES_PROPHET_LOWER.labels(**temp_current_metric_metadata_dict).set(predictions_dict_prophet[metadata]['yhat_lower'][index_prophet]) 222 | 223 | # Update the metric values for fourier transform model 224 | PREDICTED_VALUES_FOURIER.labels(**temp_current_metric_metadata_dict).set(predictions_dict_fourier[metadata]['yhat'][index_fourier]) 225 | PREDICTED_VALUES_FOURIER_UPPER.labels(**temp_current_metric_metadata_dict).set(predictions_dict_fourier[metadata]['yhat_upper'][index_fourier]) 226 | PREDICTED_VALUES_FOURIER_LOWER.labels(**temp_current_metric_metadata_dict).set(predictions_dict_fourier[metadata]['yhat_lower'][index_fourier]) 227 | 228 | 229 | if len(live_data_dict[metadata] >= 5): 230 | pass 231 | # Update the metric values for detected anomalies 1 in case of anomaly, 0 if not 232 | if (detect_anomalies(predictions_dict_fourier[metadata][len(predictions_dict_fourier[metadata])-(len(live_data_dict[metadata])):],live_data_dict[metadata])): 233 | PREDICTED_ANOMALY_FOURIER.labels(**temp_current_metric_metadata_dict).set(1) 234 | else: 235 | 
PREDICTED_ANOMALY_FOURIER.labels(**temp_current_metric_metadata_dict).set(0) 236 | 237 | if (detect_anomalies(predictions_dict_prophet[metadata][len(predictions_dict_prophet[metadata])-(len(live_data_dict[metadata])):],live_data_dict[metadata])): 238 | PREDICTED_ANOMALY_PROPHET.labels(**temp_current_metric_metadata_dict).set(1) 239 | else: 240 | PREDICTED_ANOMALY_PROPHET.labels(**temp_current_metric_metadata_dict).set(0) 241 | pass 242 | 243 | return Response(generate_latest(REGISTRY).decode("utf-8"), content_type='text; charset=utf-8') 244 | 245 | if __name__ == "__main__": 246 | # Running the flask web server 247 | app.run(host='0.0.0.0', port=8080) 248 | pass 249 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from prometheus import Prometheus 2 | import pandas 3 | import numpy as np 4 | from numpy import fft 5 | import json 6 | import time 7 | # from lib.model import * 8 | from ceph import CephConnect as cp 9 | from datetime import datetime, timedelta 10 | from fbprophet import Prophet 11 | from sortedcontainers import SortedDict 12 | import os 13 | import gc 14 | import pickle 15 | import collections 16 | from scipy.stats import norm 17 | 18 | # Plotting 19 | # import matplotlib.pyplot as plt 20 | 21 | 22 | def get_df_from_json(metric, metric_dict_pd={}, data_window=5): 23 | ''' 24 | Method to convert a json object of a Prometheus metric to a dictionary of shaped Pandas DataFrames 25 | 26 | The shape is dict[metric_metadata] = Pandas Object 27 | 28 | Pandas Object = timestamp, value 29 | 15737933, 1 30 | ..... 31 | 32 | This method can also be used to update an existing dictionary with new data 33 | ''' 34 | # metric_dict = {} 35 | current_time = datetime.now() 36 | earliest_data_time = current_time - timedelta(days = data_window) 37 | 38 | 39 | print("Pre-processing Data...........") 40 | # metric_dict_pd = {} 41 | # print("Length of metric: ", len(metric)) 42 | for row in metric: 43 | # metric_dict[str(row['metric'])] = metric_dict.get(str(row['metric']),[]) + (row['values']) 44 | metric_metadata = str(SortedDict(row['metric']))[11:-1] # Sort the dictionary and then convert it to string so it can be hashed 45 | # print(metric_metadata) 46 | # print("Row Values: ",row['values']) 47 | if metric_metadata not in metric_dict_pd: 48 | metric_dict_pd[metric_metadata] = pandas.DataFrame(row['values'], columns=['ds', 'y']).apply(pandas.to_numeric, args=({"errors":"coerce"})) 49 | metric_dict_pd[metric_metadata]['ds'] = pandas.to_datetime(metric_dict_pd[metric_metadata]['ds'], unit='s') 50 | pass 51 | else: 52 | temp_df = pandas.DataFrame(row['values'], columns=['ds', 'y']).apply(pandas.to_numeric, args=({"errors":"coerce"})) 53 | temp_df['ds'] = pandas.to_datetime(temp_df['ds'], unit='s') 54 | # print(temp_df.head()) 55 | # print("Row Values: ",row['values'] 56 | # print("Temp Head Before 5: \n",temp_df.head(5)) 57 | # print("Head Before 5: \n",metric_dict_pd[metric_metadata].head(5)) 58 | # print("Tail Before 5: \n",metric_dict_pd[metric_metadata].tail(5)) 59 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].append(temp_df, ignore_index=True) 60 | # print("Head 5: \n",metric_dict_pd[metric_metadata].head(5)) 61 | # print("Tail 5: \n",metric_dict_pd[metric_metadata].tail(5)) 62 | mask = (metric_dict_pd[metric_metadata]['ds'] > earliest_data_time) 63 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].loc[mask] 64 | # del temp_df 
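# The mask above keeps only rows newer than (current_time - data_window days), so each
# label's DataFrame behaves as a sliding training window instead of growing without bound.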
65 | pass 66 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].dropna() 67 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].drop_duplicates('ds').sort_values(by=['ds']).reset_index(drop = True) 68 | 69 | if len(metric_dict_pd[metric_metadata]) == 0: 70 | del metric_dict_pd[metric_metadata] 71 | pass 72 | pass 73 | 74 | # print(metric_dict_pd[metric_metadata]) 75 | # mask = (metric_dict_pd[metric_metadata]['ds'] > earliest_data_time) & (metric_dict_pd[metric_metadata]['ds'] <= current_time) 76 | # metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].loc[mask] 77 | # break 78 | return metric_dict_pd 79 | 80 | 81 | def get_df_from_single_value_json(metric, metric_dict_pd={}, data_window=5): 82 | ''' 83 | Method to convert a json object of a Prometheus metric to a dictionary of shaped Pandas DataFrames 84 | 85 | The shape is dict[metric_metadata] = Pandas Object 86 | 87 | Pandas Object = timestamp, value 88 | 15737933, 1 89 | ..... 90 | 91 | This method can also be used to update an existing dictionary with new data 92 | ''' 93 | # metric_dict = {} 94 | current_time = datetime.now() 95 | earliest_data_time = current_time - timedelta(days = data_window) 96 | 97 | 98 | print("Pre-processing Data...........") 99 | # metric_dict_pd = {} 100 | # print("Length of metric: ", len(metric)) 101 | for row in metric: 102 | # metric_dict[str(row['metric'])] = metric_dict.get(str(row['metric']),[]) + (row['values']) 103 | metric_metadata = str(SortedDict(row['metric']))[11:-1] # Sort the dictionary and then convert it to string so it can be hashed 104 | # print(metric_metadata) 105 | # print("Row Values: ",row['values']) 106 | if metric_metadata not in metric_dict_pd: 107 | metric_dict_pd[metric_metadata] = pandas.DataFrame([row['value']], columns=['ds', 'y']).apply(pandas.to_numeric, args=({"errors":"coerce"})) 108 | metric_dict_pd[metric_metadata]['ds'] = pandas.to_datetime(metric_dict_pd[metric_metadata]['ds'], unit='s') 109 | pass 110 | else: 111 | temp_df = pandas.DataFrame([row['value']], columns=['ds', 'y']).apply(pandas.to_numeric, args=({"errors":"coerce"})) 112 | temp_df['ds'] = pandas.to_datetime(temp_df['ds'], unit='s') 113 | # print(temp_df.head()) 114 | # print("Row Values: ",row['values'] 115 | # print("Temp Head Before 5: \n",temp_df.head(5)) 116 | # print("Head Before 5: \n",metric_dict_pd[metric_metadata].head(5)) 117 | # print("Tail Before 5: \n",metric_dict_pd[metric_metadata].tail(5)) 118 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].append(temp_df, ignore_index=True) 119 | # print("Head 5: \n",metric_dict_pd[metric_metadata].head(5)) 120 | # print("Tail 5: \n",metric_dict_pd[metric_metadata].tail(5)) 121 | mask = (metric_dict_pd[metric_metadata]['ds'] > earliest_data_time) 122 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].loc[mask] 123 | # del temp_df 124 | pass 125 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].dropna() 126 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].drop_duplicates('ds').sort_values(by=['ds']).reset_index(drop = True) 127 | 128 | if len(metric_dict_pd[metric_metadata]) == 0: 129 | del metric_dict_pd[metric_metadata] 130 | pass 131 | pass 132 | 133 | # print(metric_dict_pd[metric_metadata]) 134 | # mask = (metric_dict_pd[metric_metadata]['ds'] > earliest_data_time) & (metric_dict_pd[metric_metadata]['ds'] <= current_time) 135 | # metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].loc[mask] 136 | # break 137 | return 


def predict_metrics(pd_dict, prediction_range=1440):
    '''
    Train a Prophet model for each DataFrame in pd_dict and return a dictionary
    of forecasts keyed by the same metric metadata.
    '''
    total_label_num = len(pd_dict)
    PREDICT_DURATION = prediction_range

    current_label_num = 0
    limit_iterator_num = 0

    predictions_dict = {}

    for meta_data in pd_dict:
        try:
            current_label_num += 1
            limit_iterator_num += 1

            print("Training Label {}/{}".format(current_label_num, total_label_num))
            data = pd_dict[meta_data]

            print("----------------------------------\n")
            print(meta_data)
            print("Number of Data Points: {}".format(len(pd_dict[meta_data])))
            print("----------------------------------\n")

            data['ds'] = pandas.to_datetime(data['ds'], unit='s')

            train_frame = data

            # Prophet modelling begins here
            m = Prophet(daily_seasonality=True, weekly_seasonality=True)

            print("Fitting the train_frame")
            m.fit(train_frame)

            # Forecast PREDICT_DURATION minutes into the future at 1-minute resolution
            future = m.make_future_dataframe(periods=int(PREDICT_DURATION), freq="1MIN")

            forecast = m.predict(future)

            # To plot the fit, uncomment:
            # fig1 = m.plot(forecast)
            # fig2 = m.plot_components(forecast)

            # Keep only the columns the rest of the pipeline consumes
            forecast['timestamp'] = forecast['ds']
            forecast = forecast[['timestamp', 'yhat', 'yhat_lower', 'yhat_upper']]
            forecast = forecast.set_index('timestamp')

            # Store predictions in the output dictionary
            predictions_dict[meta_data] = forecast
        except ValueError as exception:
            # str(exception) does not include the "ValueError: " prefix, so match
            # on the message itself to skip labels with too little usable data.
            if "less than 2 non-NaN rows" in str(exception):
                print("Too many NaN values........Skipping this label")
                limit_iterator_num -= 1
            else:
                raise exception

    return predictions_dict


def fourierExtrapolation(x, n_predict, n_harm):
    '''
    Extrapolate the series x for n_predict additional points using its DC
    component, the n_harm lowest-frequency harmonic pairs, and a fitted
    linear trend.
    '''
    n = x.size
    t = np.arange(0, n)
    p = np.polyfit(t, x, 1)            # find linear trend in x
    x_notrend = x - p[0] * t           # detrended x
    x_freqdom = fft.fft(x_notrend)     # detrended x in frequency domain
    f = fft.fftfreq(n)                 # frequencies
    indexes = np.arange(n).tolist()
    # sort indexes by frequency, lower -> higher
    indexes.sort(key=lambda i: np.absolute(f[i]))

    t = np.arange(0, n + n_predict)
    restored_sig = np.zeros(t.size)
    for i in indexes[:1 + n_harm * 2]:
        ampli = np.absolute(x_freqdom[i]) / n   # amplitude
        phase = np.angle(x_freqdom[i])          # phase
        restored_sig += ampli * np.cos(2 * np.pi * f[i] * t + phase)
    # add the linear trend back in
    return restored_sig + p[0] * t
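

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): fourierExtrapolation()
# applied to a toy sine-plus-trend signal. The values and parameters here are
# arbitrary, chosen only to show the expected input and output shapes.
def _example_fourier_forecast():
    t = np.arange(0, 200)
    toy_signal = 0.05 * t + np.sin(2 * np.pi * t / 50.0)
    # Forecast 60 extra points, using a third of the series length as the
    # number of harmonics, mirroring the heuristic in predict_metrics_fourier() below.
    forecast = fourierExtrapolation(toy_signal, 60, int(len(toy_signal) / 3))
    # The result covers the original window plus the forecast horizon.
    assert forecast.shape == (len(toy_signal) + 60,)
    return forecast
# ---------------------------------------------------------------------------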


def predict_metrics_fourier(pd_dict, prediction_range=1440):
    '''
    Fourier-based alternative to predict_metrics(): extrapolate each metric's
    series with fourierExtrapolation() and return a dictionary of forecasts in
    the same yhat/yhat_lower/yhat_upper format.
    '''
    total_label_num = len(pd_dict)
    PREDICT_DURATION = prediction_range

    current_label_num = 0
    limit_iterator_num = 0

    predictions_dict = {}

    for meta_data in pd_dict:
        try:
            data = pd_dict[meta_data]
            data['ds'] = pandas.to_datetime(data['ds'], unit='s')
            vals = np.array(data["y"].tolist())

            # run the model; the forecast covers the training window plus the prediction horizon
            print("Training Model......")
            forecast_vals = fourierExtrapolation(vals, prediction_range, int(len(vals) / 3))
            dataframe_cols = {}
            dataframe_cols["yhat"] = np.array(forecast_vals)

            # take the earliest timestamp from the original data and extrapolate
            # new timestamps at 1-minute resolution
            print("Creating Dummy Timestamps.....")
            min_time = min(data["ds"])
            dataframe_cols["timestamp"] = pandas.date_range(min_time, periods=len(forecast_vals), freq='min')

            # create dummy upper and lower bounds (one standard deviation around the mean)
            print("Computing Bounds....")
            upper_bound = np.mean(forecast_vals) + np.std(forecast_vals)
            lower_bound = np.mean(forecast_vals) - np.std(forecast_vals)
            dataframe_cols["yhat_upper"] = np.full(len(forecast_vals), upper_bound)
            dataframe_cols["yhat_lower"] = np.full(len(forecast_vals), lower_bound)

            # assemble the forecast frame and store it in predictions_dict
            print("Formatting Forecast to Pandas....")
            forecast = pandas.DataFrame(data=dataframe_cols)
            forecast = forecast.set_index('timestamp')
            predictions_dict[meta_data] = forecast

            current_label_num += 1
            limit_iterator_num += 1
        except ValueError as exception:
            # Match on the message itself (str(exception) has no "ValueError: " prefix)
            if "less than 2 non-NaN rows" in str(exception):
                print("Too many NaN values........Skipping this label")
                limit_iterator_num -= 1
            else:
                raise exception

    return predictions_dict


class Accumulator:
    '''
    Simple leaky counter used by detect_anomalies() to track how often the
    observed values drift away from the forecast.
    '''

    def __init__(self, thresh):
        self._counter = 0
        self.thresh = thresh

    def inc(self, val):
        self._counter += val

    def count(self):
        return self._counter


def detect_anomalies(predictions, data):
    '''
    Compare a forecast frame (yhat/yhat_lower/yhat_upper) against observed data
    (column y) covering the same number of points. Returns True only when both
    the accumulator test and the tail-probability test flag an anomaly.
    '''
    if len(predictions) != len(data):
        raise IndexError("predictions and data must cover the same number of points")

    # parameters
    lower_bound_thresh = predictions["yhat_lower"].min()
    upper_bound_thresh = predictions["yhat_upper"].max()
    diff_thresh = 3 * data["y"].std()
    acc_thresh = int(0.1 * np.shape(predictions)[0])
    epsilon = .01

    diffs = []
    acc = Accumulator(acc_thresh)
    preds = np.array(predictions["yhat"])
    dat = np.array(data["y"])
    for i in range(0, np.shape(predictions)[0]):
        diff = preds[i] - dat[i]
        if abs(diff) > diff_thresh:
            # large deviation from the forecast, increment the counter
            acc.inc(1)
        elif dat[i] < lower_bound_thresh:
            # found a trough, decrement so that acc will decay to 0
            acc.inc(-3)
        elif dat[i] > upper_bound_thresh:
            # found a peak, decrement so that acc will decay to 0
            acc.inc(-3)
        else:
            # no anomaly, decrement by 2
            acc.inc(-2)

        diffs.append(max(diff, 0))

    acc_anomaly = acc.count() > acc.thresh

    # Split the forecast errors into a reference window w (first 80%) and a
    # recent window w' (last 20%), then compute the tail probability of the
    # recent mean under the reference distribution.
    w_size = int(0.8 * len(data))
    w_prime_size = len(data) - w_size

    w = diffs[0:w_size]
    w_prime = diffs[w_size:]

    w_mu = np.mean(w)
    w_std = np.std(w)
    w_prime_mu = np.mean(w_prime)

    if w_std == 0:
        L_t = 0
    else:
        L_t = 1 - norm.sf((w_prime_mu - w_mu) / w_std)

    print(L_t)
    tail_prob_anomaly = L_t >= 1 - epsilon

    return acc_anomaly and tail_prob_anomaly
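

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): wiring a forecast of
# the kind produced by predict_metrics() / predict_metrics_fourier() into
# detect_anomalies(). The toy frames below are hypothetical; in the application
# `forecast` would be one entry of the predictions dictionary and `observed`
# would hold the matching live samples.
def _example_anomaly_check():
    n = 100
    rng = np.random.RandomState(0)
    observed = pandas.DataFrame({"y": rng.normal(10.0, 1.0, n)})
    forecast = pandas.DataFrame({
        "yhat": np.full(n, 10.0),
        "yhat_lower": np.full(n, 7.0),
        "yhat_upper": np.full(n, 13.0),
    })
    # detect_anomalies() expects the forecast slice and the observed window to
    # be the same length; it returns a single True/False verdict.
    return detect_anomalies(forecast, observed)
# ---------------------------------------------------------------------------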


if __name__ == "__main__":

    url = os.getenv('URL')
    token = os.getenv('BEARER_TOKEN')

    # Specific metric to run the model on
    metric_name = os.getenv('METRIC_NAME', 'kubelet_docker_operations_latency_microseconds')

    print("Using Metric {}.".format(metric_name))

    # This is where the model dictionary will be stored and retrieved from.
    # url[8:] strips the "https://" scheme so the host becomes part of the path.
    model_storage_path = "Models" + "/" + url[8:] + "/" + metric_name + "/" + "prophet_model" + ".pkl"

    # Chunk size: the complete data set is downloaded in smaller chunks of this
    # size, so it should be less than or equal to DATA_SIZE
    chunk_size = str(os.getenv('CHUNK_SIZE', '1d'))

    # Net data size to scrape from Prometheus
    data_size = str(os.getenv('DATA_SIZE', '1d'))

    # Number of minutes the model should predict values for
    # PREDICT_DURATION = 1440  # minutes, 1440 = 24 hours

    # Limit to the first few labels of the metric
    # LABEL_LIMIT = None

    # Prepare a connection to the Prometheus host
    prom = Prometheus(url=url, token=token, data_chunk=chunk_size, stored_data=data_size)

    # Get metric data from Prometheus
    metric = prom.get_metric(metric_name)
    print("metric collected.")
    del prom

    # Parse the Prometheus response from json
    metric = json.loads(metric)

    # The metric json is converted to a dictionary with the sub-labels as keys
    # and their data as shaped Pandas DataFrames
    pd_dict = get_df_from_json(metric)
    del metric

    predictions = predict_metrics(pd_dict)
    for x in predictions:
        print(predictions[x].head())
--------------------------------------------------------------------------------