├── lib
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── ceph.cpython-36.pyc
│   │   ├── model.cpython-36.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   └── prometheus.cpython-36.pyc
│   ├── model.py
│   ├── ceph.py
│   └── prometheus.py
├── .gitignore
├── .zuul.yaml
├── requirements.txt
├── Dockerfile
├── Makefile
├── ceph.py
├── train-prophet-deployment-template.yaml
├── prometheus.py
├── README.md
├── app.py
└── model.py

--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.pyo
3 | *_old.py
4 | __pycache__/
5 | *.json
6 |
--------------------------------------------------------------------------------
/lib/__pycache__/ceph.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AICoE/prometheus-anomaly-detector-legacy/HEAD/lib/__pycache__/ceph.cpython-36.pyc
--------------------------------------------------------------------------------
/lib/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AICoE/prometheus-anomaly-detector-legacy/HEAD/lib/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/.zuul.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | - project:
3 |     check:
4 |       jobs:
5 |         - "thoth-coala"
6 |     gate:
7 |       jobs:
8 |         - "thoth-coala"
9 |
--------------------------------------------------------------------------------
/lib/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AICoE/prometheus-anomaly-detector-legacy/HEAD/lib/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/lib/__pycache__/prometheus.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AICoE/prometheus-anomaly-detector-legacy/HEAD/lib/__pycache__/prometheus.cpython-36.pyc
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | fbprophet
3 | pandas
4 | boto3
5 | matplotlib
6 | flask
7 | apscheduler
8 | prometheus_client
9 | sortedcontainers
10 | scipy
11 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM docker.io/centos/python-36-centos7:latest
2 |
3 |
4 | ADD requirements.txt /
5 | RUN pip install -r /requirements.txt
6 |
7 | ADD app.py /
8 | ADD prometheus.py /
9 | ADD model.py /
10 | ADD ceph.py /
11 | ADD lib /lib
12 |
13 |
14 | CMD [ "python", "/app.py"]
15 |
--------------------------------------------------------------------------------
/lib/model.py:
--------------------------------------------------------------------------------
1 | import pandas
2 | import json
3 |
4 | def get_df_from_json(metric):
5 |     # metric_dict = {}
6 |     metric_dict_pd = {}
7 |     for row in metric:
8 |         # metric_dict[str(row['metric'])] = metric_dict.get(str(row['metric']),[]) + (row['values'])
9 |         metric_metadata = str(row['metric'])
10 |         if
metric_metadata not in metric_dict_pd: 11 | metric_dict_pd[metric_metadata] = pandas.DataFrame(columns=['timestamp', 'value']) 12 | pass 13 | else: 14 | # for value in (row['values']): 15 | # print(value) 16 | temp_df = pandas.DataFrame(row['values'], columns=['timestamp', 'value']) 17 | # print(temp_df.head()) 18 | metric_dict_pd[metric_metadata] = pandas.concat([metric_dict_pd[metric_metadata], temp_df]) 19 | del temp_df 20 | pass 21 | pass 22 | metric_dict_pd[metric_metadata].set_index('timestamp') 23 | # print(metric_dict_pd[metric_metadata]) 24 | # metric_dict_pd[metric_metadata]['timestamp'] = pandas.to_datetime(metric_dict_pd[metric_metadata]['timestamp'], unit='s') 25 | return metric_dict_pd 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Required Variables 2 | bearer_token= 3 | prometheus_url= 4 | 5 | block_storage_access_key= 6 | block_storage_secret_key= 7 | block_storage_bucket_name= 8 | block_storage_endpoint_url= 9 | 10 | # Optional Variables 11 | oc_app_name=train-prom-dh-prod 12 | docker_app_name=train-prometheus 13 | 14 | docker_build: 15 | docker build -t ${docker_app_name} . 16 | 17 | docker_test: 18 | docker run ${docker_app_name} 19 | 20 | docker_run: 21 | docker run -ti --rm \ 22 | --env "BEARER_TOKEN=${bearer_token}" \ 23 | --env "URL=${prometheus_url}" \ 24 | --env BOTO_ACCESS_KEY="${block_storage_access_key}" \ 25 | --env BOTO_SECRET_KEY="${block_storage_secret_key}" \ 26 | --env BOTO_OBJECT_STORE="${block_storage_bucket_name}" \ 27 | --env BOTO_STORE_ENDPOINT="${block_storage_endpoint_url}" \ 28 | ${docker_app_name}:latest 29 | 30 | oc_deploy: 31 | oc new-app --file=./train-prophet-deployment-template.yaml --param APPLICATION_NAME="${oc_app_name}" \ 32 | --param URL="${prometheus_url}" \ 33 | --param BEARER_TOKEN="${bearer_token}" \ 34 | --param BOTO_ACCESS_KEY="${block_storage_access_key}" \ 35 | --param BOTO_SECRET_KEY="${block_storage_secret_key}" \ 36 | --param BOTO_OBJECT_STORE="${block_storage_bucket_name}" \ 37 | --param BOTO_STORE_ENDPOINT="${block_storage_endpoint_url}" 38 | 39 | oc_delete_all: 40 | oc delete all -l app=${oc_app_name} 41 | 42 | run_model: 43 | BEARER_TOKEN=${bearer_token} \ 44 | URL=${prometheus_url} \ 45 | BOTO_ACCESS_KEY=${block_storage_access_key} \ 46 | BOTO_SECRET_KEY=${block_storage_secret_key} \ 47 | BOTO_OBJECT_STORE=${block_storage_bucket_name} \ 48 | BOTO_STORE_ENDPOINT=${block_storage_endpoint_url} \ 49 | python3 ../train-prometheus-prod/train-prometheus/app.py 50 | -------------------------------------------------------------------------------- /lib/ceph.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import bz2 3 | import os 4 | import pickle 5 | import botocore 6 | 7 | class CephConnect: 8 | def __init__(self, access_key = None, secret_key = None, object_store = None, object_store_endpoint = None): 9 | self.boto_settings = { 10 | 'access_key': os.getenv('BOTO_ACCESS_KEY', access_key), 11 | 'secret_key': os.getenv('BOTO_SECRET_KEY', secret_key), 12 | 'object_store': os.getenv('BOTO_OBJECT_STORE', object_store), 13 | 'object_store_endpoint': os.getenv('BOTO_STORE_ENDPOINT', object_store_endpoint) 14 | } 15 | 16 | def store_data(self, name, values, object_path = None): 17 | ''' 18 | Function to store predictions to ceph 19 | ''' 20 | if not values: 21 | return "No values for {}".format(name) 22 | # Create a session with CEPH (or any 
black storage) storage with the stored credentials 23 | session = boto3.Session( 24 | aws_access_key_id=self.boto_settings['access_key'], 25 | aws_secret_access_key=self.boto_settings['secret_key'] 26 | ) 27 | 28 | s3 = session.resource('s3', 29 | endpoint_url=self.boto_settings['object_store_endpoint'], 30 | verify=False) 31 | # prometheus-openshift-devops-monitor.a3c1.starter-us-west-1.openshiftapps.com/container_cpu_usage_percent_by_host/201807040259.json.bz2 32 | if not object_path: 33 | object_path = str(name) 34 | pass 35 | object_path = object_path + ".bz2" 36 | try: 37 | payload = bz2.compress(values.encode('utf-8')) 38 | 39 | except AttributeError: 40 | payload = bz2.compress(values) 41 | rv = s3.meta.client.put_object(Body=payload, 42 | Bucket=self.boto_settings['object_store'], 43 | Key=object_path) 44 | if rv['ResponseMetadata']['HTTPStatusCode'] == 200: 45 | return object_path 46 | else: 47 | return str(rv) 48 | 49 | def get_model_dict(self, model_storage_path): 50 | session = boto3.Session( 51 | aws_access_key_id=self.boto_settings['access_key'], 52 | aws_secret_access_key=self.boto_settings['secret_key'] 53 | ) 54 | 55 | s3 = session.resource('s3', 56 | endpoint_url=self.boto_settings['object_store_endpoint'], 57 | verify=False) 58 | # try to get model from ceph 59 | try: 60 | model_storage_path = model_storage_path + ".bz2" 61 | print("receiveing Object from: \n {}".format(model_storage_path)) 62 | 63 | received_object = s3.Object(self.boto_settings['object_store'], model_storage_path).get()['Body'].read() 64 | # print(type(received_object)) 65 | model_dict = pickle.loads(bz2.decompress(received_object)) 66 | # print(model_dict.keys()) 67 | except botocore.exceptions.ClientError as exc: 68 | if exc.response['Error']['Code'] in ('404', 'NoSuchKey'): 69 | # if no model in ceph, return an empty model dictionary 70 | print("Stored Model not found") 71 | model_dict = {} 72 | return model_dict 73 | -------------------------------------------------------------------------------- /ceph.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import bz2 3 | import os 4 | import pickle 5 | import botocore 6 | 7 | class CephConnect: 8 | def __init__(self, access_key = None, secret_key = None, object_store = None, object_store_endpoint = None): 9 | self.boto_settings = { 10 | 'access_key': os.getenv('BOTO_ACCESS_KEY', access_key), 11 | 'secret_key': os.getenv('BOTO_SECRET_KEY', secret_key), 12 | 'object_store': os.getenv('BOTO_OBJECT_STORE', object_store), 13 | 'object_store_endpoint': os.getenv('BOTO_STORE_ENDPOINT', object_store_endpoint) 14 | } 15 | 16 | def store_data(self, name, values, object_path = None): 17 | ''' 18 | Function to store predictions to ceph 19 | ''' 20 | if not values: 21 | return "No values for {}".format(name) 22 | # Create a session with CEPH (or any black storage) storage with the stored credentials 23 | session = boto3.Session( 24 | aws_access_key_id=self.boto_settings['access_key'], 25 | aws_secret_access_key=self.boto_settings['secret_key'] 26 | ) 27 | 28 | s3 = session.resource('s3', 29 | endpoint_url=self.boto_settings['object_store_endpoint'], 30 | verify=False) 31 | # prometheus-openshift-devops-monitor.a3c1.starter-us-west-1.openshiftapps.com/container_cpu_usage_percent_by_host/201807040259.json.bz2 32 | if not object_path: 33 | object_path = str(name) 34 | pass 35 | object_path = object_path + ".bz2" 36 | try: 37 | payload = bz2.compress(values.encode('utf-8')) 38 | 39 | except AttributeError: 40 | 
payload = bz2.compress(values) 41 | rv = s3.meta.client.put_object(Body=payload, 42 | Bucket=self.boto_settings['object_store'], 43 | Key=object_path) 44 | if rv['ResponseMetadata']['HTTPStatusCode'] == 200: 45 | return object_path 46 | else: 47 | return str(rv) 48 | 49 | def get_model_dict(self, model_storage_path): 50 | session = boto3.Session( 51 | aws_access_key_id=self.boto_settings['access_key'], 52 | aws_secret_access_key=self.boto_settings['secret_key'] 53 | ) 54 | 55 | s3 = session.resource('s3', 56 | endpoint_url=self.boto_settings['object_store_endpoint'], 57 | verify=False) 58 | # try to get model from ceph 59 | try: 60 | model_storage_path = model_storage_path + ".bz2" 61 | print("receiveing Object from: \n {}".format(model_storage_path)) 62 | 63 | received_object = s3.Object(self.boto_settings['object_store'], model_storage_path).get()['Body'].read() 64 | # print(type(received_object)) 65 | model_dict = pickle.loads(bz2.decompress(received_object)) 66 | # print(model_dict.keys()) 67 | except botocore.exceptions.ClientError as exc: 68 | if exc.response['Error']['Code'] in ('404', 'NoSuchKey'): 69 | # if no model in ceph, return an empty model dictionary 70 | print("Stored Model not found") 71 | model_dict = {} 72 | return model_dict 73 | 74 | def get_latest_df_dict(self, data_path=None): 75 | session = boto3.Session( 76 | aws_access_key_id=self.boto_settings['access_key'], 77 | aws_secret_access_key=self.boto_settings['secret_key'] 78 | ) 79 | 80 | s3 = session.resource('s3', 81 | endpoint_url=self.boto_settings['object_store_endpoint'], 82 | verify=False) 83 | s3_bucket = s3.Bucket(self.boto_settings['object_store']) 84 | 85 | try: 86 | object_list = [obj for obj in s3_bucket.objects.filter(Prefix=str(data_path))] 87 | latest_object = object_list[0] 88 | for obj in object_list: 89 | if int(obj.key[-16:-4]) > int(latest_object.key[-16:-4]): 90 | latest_object = obj 91 | received_data = latest_object.get()['Body'].read() 92 | data_dict = pickle.loads(bz2.decompress(received_data)) 93 | except botocore.exceptions.ClientError as exc: 94 | if exc.response['Error']['Code'] in ('404', 'NoSuchKey'): 95 | # if no data found in ceph, return an empty model dictionary 96 | print("Stored Data not found") 97 | data_dict = {} 98 | return data_dict 99 | -------------------------------------------------------------------------------- /train-prophet-deployment-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Template 3 | 4 | labels: 5 | application: train-prometheus 6 | 7 | metadata: 8 | name: train-prometheus-deployment-template 9 | 10 | parameters: 11 | - description: The name for job 12 | from: 'train-prometheus-[a-z0-9]{4}' 13 | generate: expression 14 | name: APPLICATION_NAME 15 | required: true 16 | - name: URL 17 | description: URL of prometheus server 18 | required: true 19 | - name: BEARER_TOKEN 20 | description: Bearer Token for accessing prometheus 21 | required: true 22 | - name: BOTO_ACCESS_KEY 23 | description: Access key to connect to CEPH endpoint storage (or any similar S3 type storage) 24 | required: true 25 | - name: BOTO_SECRET_KEY 26 | description: Secret key to connect to CEPH endpoint storage (or any similar S3 type storage) 27 | required: true 28 | - name: BOTO_OBJECT_STORE 29 | description: Bucket Name on CEPH (or any similar S3 type storage) 30 | required: true 31 | - name: BOTO_STORE_ENDPOINT 32 | description: The URL to connect to the CEPH storage (or any similar S3 type storage) 33 | 
required: true 34 | - name: GIT_URI 35 | value: https://github.com/4n4nd/train-prometheus.git 36 | required: true 37 | - name: CHUNK_SIZE 38 | description: Size of chunks in which Data is scraped from Prometheus (Should be smaller than DATA_SIZE) 39 | required: false 40 | value: '1d' 41 | - name: DATA_SIZE 42 | description: Size of data scraped from Prometheus (Should be bigger than CHUNK_SIZE) 43 | required: false 44 | value: '1d' 45 | - name: TRAINING_REPEAT_HOURS 46 | description: number of hours to repeat model training 47 | required: false 48 | value: '6' 49 | - name: DATA_WINDOW_SIZE 50 | description: Sliding data window size in days (Number of days worth of past data to use as training data ) 51 | required: false 52 | value: '60' 53 | - name: STORE_INTERMEDIATE_DATA 54 | description: Store Dataframes of cumulated training dataframes to ceph 55 | required: false 56 | value: 'True' 57 | - name: GET_OLDER_DATA 58 | description: Use the previously stored dataframes in ceph to train the models 59 | required: false 60 | value: 'True' 61 | 62 | objects: 63 | - apiVersion: v1 64 | kind: ImageStream 65 | metadata: 66 | name: ${APPLICATION_NAME} 67 | labels: 68 | app: ${APPLICATION_NAME} 69 | spec: 70 | dockerImageRepository: ${APPLICATION_NAME} 71 | tags: 72 | - name: latest 73 | lookupPolicy: 74 | local: true 75 | 76 | - apiVersion: v1 77 | kind: BuildConfig 78 | metadata: 79 | name: ${APPLICATION_NAME} 80 | labels: 81 | app: ${APPLICATION_NAME} 82 | spec: 83 | resources: 84 | limits: 85 | memory: 4Gi 86 | cpu: "2" 87 | output: 88 | to: 89 | kind: ImageStreamTag 90 | name: ${APPLICATION_NAME}:latest 91 | source: 92 | git: 93 | uri: ${GIT_URI} 94 | type: Git 95 | strategy: 96 | type: Source 97 | sourceStrategy: 98 | env: 99 | - name: APP_FILE 100 | value: 'app.py' 101 | - name: GIT_SSL_NO_VERIFY 102 | value: 'true' 103 | forcePull: true 104 | from: 105 | kind: DockerImage 106 | name: 'docker.io/centos/python-36-centos7:latest' 107 | triggers: 108 | - imageChange: {} 109 | type: ImageChange 110 | - type: ConfigChange 111 | 112 | - apiVersion: v1 113 | kind: DeploymentConfig 114 | metadata: 115 | name: ${APPLICATION_NAME} 116 | labels: 117 | deploymentConfig: ${APPLICATION_NAME} 118 | app: ${APPLICATION_NAME} 119 | spec: 120 | replicas: 1 121 | selector: 122 | deploymentConfig: ${APPLICATION_NAME} 123 | strategy: 124 | type: Rolling 125 | template: 126 | metadata: 127 | labels: 128 | deploymentConfig: ${APPLICATION_NAME} 129 | app: ${APPLICATION_NAME} 130 | spec: 131 | containers: 132 | - env: 133 | - name: PROM_BACKUP_ALL 134 | value: "true" 135 | - name: BEARER_TOKEN 136 | value: "${BEARER_TOKEN}" 137 | - name: URL 138 | value: "${URL}" 139 | - name: BOTO_ACCESS_KEY 140 | value: "${BOTO_ACCESS_KEY}" 141 | - name: BOTO_SECRET_KEY 142 | value: "${BOTO_SECRET_KEY}" 143 | - name: BOTO_OBJECT_STORE 144 | value: "${BOTO_OBJECT_STORE}" 145 | - name: BOTO_STORE_ENDPOINT 146 | value: "${BOTO_STORE_ENDPOINT}" 147 | - name: CHUNK_SIZE 148 | value: "${CHUNK_SIZE}" 149 | - name: DATA_SIZE 150 | value: "${DATA_SIZE}" 151 | - name: TRAINING_REPEAT_HOURS 152 | value: "${TRAINING_REPEAT_HOURS}" 153 | - name: DATA_WINDOW_SIZE 154 | value: "${DATA_WINDOW_SIZE}" 155 | - name: STORE_INTERMEDIATE_DATA 156 | value: "${STORE_INTERMEDIATE_DATA}" 157 | - name: GET_OLDER_DATA 158 | value: "${GET_OLDER_DATA}" 159 | image: ${APPLICATION_NAME} 160 | imagePullPolicy: IfNotPresent 161 | name: ${APPLICATION_NAME} 162 | resources: 163 | requests: 164 | memory: 500Mi 165 | cpu: "4" 166 | limits: 167 | memory: 16Gi 168 | cpu: 
"4" 169 | terminationMessagePath: /dev/termination-log 170 | dnsPolicy: ClusterFirst 171 | restartPolicy: Always 172 | triggers: 173 | - imageChangeParams: 174 | automatic: true 175 | containerNames: 176 | - ${APPLICATION_NAME} 177 | from: 178 | kind: ImageStreamTag 179 | name: ${APPLICATION_NAME}:latest 180 | type: ImageChange 181 | - type: ConfigChange 182 | 183 | - apiVersion: v1 184 | kind: Service 185 | metadata: 186 | name: ${APPLICATION_NAME} 187 | labels: 188 | app: ${APPLICATION_NAME} 189 | spec: 190 | ports: 191 | - name: 8080-tcp 192 | port: 8080 193 | protocol: TCP 194 | targetPort: 8080 195 | selector: 196 | deploymentConfig: ${APPLICATION_NAME} 197 | 198 | - apiVersion: v1 199 | kind: Route 200 | metadata: 201 | labels: 202 | app: ${APPLICATION_NAME} 203 | name: ${APPLICATION_NAME} 204 | spec: 205 | port: 206 | targetPort: 8080-tcp 207 | to: 208 | kind: Service 209 | name: ${APPLICATION_NAME} 210 | -------------------------------------------------------------------------------- /prometheus.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | import requests 3 | import datetime 4 | import json 5 | 6 | # Disable SSL warnings 7 | from requests.packages.urllib3.exceptions import InsecureRequestWarning 8 | requests.packages.urllib3.disable_warnings(InsecureRequestWarning) 9 | 10 | DEBUG = False 11 | MAX_REQUEST_RETRIES = 5 12 | 13 | class Prometheus: 14 | """docstring for Prometheus.""" 15 | def __init__(self, url='', end_time=None, token=None, data_chunk='1h',stored_data='1h'): 16 | self.headers = { 'Authorization': "bearer {}".format(token) } 17 | self.url = url 18 | self.prometheus_host = urlparse(self.url).netloc 19 | self._all_metrics = None 20 | self.data_chunk_size = data_chunk 21 | self.end_time = datetime.datetime.now() 22 | self.stored_data_range = stored_data 23 | self.DATA_CHUNK_SIZE_LIST = { 24 | '1m' : 60, 25 | '3m' : 180, 26 | '5m' : 300, 27 | '30m': 1800, 28 | '1h' : 3600, 29 | '3h' : 10800, 30 | '6h' : 21600, 31 | '12h': 43200, 32 | '1d' : 86400, 33 | '2d' : 172800} 34 | 35 | def all_metrics(self): 36 | ''' 37 | Get the list of all the metrics that the prometheus host has 38 | ''' 39 | if not self._all_metrics: 40 | response = requests.get('{0}/api/v1/label/__name__/values'.format(self.url), 41 | verify=False, # Disable ssl certificate verification temporarily 42 | headers=self.headers) 43 | if DEBUG: 44 | print("Headers -> ",self.headers) 45 | print("URL => ", response.url) 46 | if response.status_code == 200: 47 | self._all_metrics = response.json()['data'] 48 | else: 49 | raise Exception("HTTP Status Code {} {} ({})".format( 50 | response.status_code, 51 | requests.status_codes._codes[response.status_code][0], 52 | response.content 53 | )) 54 | return self._all_metrics 55 | 56 | def get_metric(self, name, chunks=None, data_size=None): 57 | if chunks: 58 | if str(chunks) in self.DATA_CHUNK_SIZE_LIST: 59 | self.data_chunk_size = str(chunks) 60 | pass 61 | else: 62 | print("Invalid Chunk Size, using default value: {}".format(self.data_chunk_size)) 63 | pass 64 | if data_size: 65 | if str(data_size) in self.DATA_CHUNK_SIZE_LIST: 66 | self.stored_data_range = str(data_size) 67 | pass 68 | else: 69 | print("Invalid Data Size, using default value: {}".format(self.stored_data_range)) 70 | pass 71 | 72 | if not name in self.all_metrics(): 73 | raise Exception("{} is not a valid metric".format(name)) 74 | elif DEBUG: 75 | print("Metric is valid.") 76 | 77 | # num_chunks = 1 78 | num_chunks = 
int(self.DATA_CHUNK_SIZE_LIST[self.stored_data_range]/self.DATA_CHUNK_SIZE_LIST[self.data_chunk_size]) # Calculate the number of chunks using total data size and chunk size. 79 | metrics = self.get_metrics_from_prom(name, num_chunks) 80 | if metrics: 81 | return metrics 82 | 83 | 84 | def get_metrics_from_prom(self, name, chunks): 85 | if not name in self.all_metrics(): 86 | raise Exception("{} is not a valid metric".format(name)) 87 | 88 | # start = self.start_time.timestamp() 89 | end_timestamp = self.end_time.timestamp() 90 | chunk_size = self.DATA_CHUNK_SIZE_LIST[self.data_chunk_size] 91 | start = end_timestamp - self.DATA_CHUNK_SIZE_LIST[self.stored_data_range] + chunk_size 92 | data = [] 93 | for i in range(chunks): 94 | # gc.collect() # Garbage collect to save Memory 95 | if DEBUG: 96 | print("Getting chunk: ", i) 97 | print("Start Time: ",datetime.datetime.fromtimestamp(start)) 98 | 99 | tries = 0 100 | while tries < MAX_REQUEST_RETRIES: # Retry code in case of errors 101 | response = requests.get('{0}/api/v1/query'.format(self.url), # using the query API to get raw data 102 | params={'query': name+'['+self.data_chunk_size+']', 103 | 'time': start 104 | }, 105 | verify=False, # Disable ssl certificate verification temporarily 106 | headers=self.headers) 107 | if DEBUG: 108 | print(response.url) 109 | pass 110 | 111 | tries+=1 112 | if response.status_code == 200: 113 | data += response.json()['data']['result'] 114 | 115 | if DEBUG: 116 | # print("Size of recent chunk = ",getsizeof(data)) 117 | # print(data) 118 | print(datetime.datetime.fromtimestamp(response.json()['data']['result'][0]['values'][0][0])) 119 | print(datetime.datetime.fromtimestamp(response.json()['data']['result'][0]['values'][-1][0])) 120 | pass 121 | 122 | del response 123 | tries = MAX_REQUEST_RETRIES 124 | elif response.status_code == 504: 125 | if tries >= MAX_REQUEST_RETRIES: 126 | self.connection_errors_count+=1 127 | return False 128 | else: 129 | print("Retry Count: ",tries) 130 | sleep(CONNECTION_RETRY_WAIT_TIME) # Wait for a second before making a new request 131 | else: 132 | if tries >= MAX_REQUEST_RETRIES: 133 | self.connection_errors_count+=1 134 | raise Exception("HTTP Status Code {} {} ({})".format( 135 | response.status_code, 136 | requests.status_codes._codes[response.status_code][0], 137 | response.content 138 | )) 139 | else: 140 | print("Retry Count: ",tries) 141 | sleep(CONNECTION_RETRY_WAIT_TIME) 142 | 143 | start += chunk_size 144 | 145 | return(json.dumps(data)) 146 | 147 | def get_current_metric_value(self, metric_name, label_config = None): 148 | data = [] 149 | if label_config: 150 | label_list = [str(key+"="+ "'" + label_config[key]+ "'") for key in label_config] 151 | # print(label_list) 152 | query = metric_name + "{" + ",".join(label_list) + "}" 153 | else: 154 | query = metric_name 155 | response = requests.get('{0}/api/v1/query'.format(self.url), # using the query API to get raw data 156 | params={'query': query},#label_config}, 157 | verify=False, # Disable ssl certificate verification temporarily 158 | headers=self.headers) 159 | data += response.json()['data']['result'] 160 | return (json.dumps(data)) 161 | pass 162 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 3 | # !Newer rewritten version is available here: https://github.com/AICoE/prometheus-anomaly-detector 4 | 5 | # Train Prometheus 6 | This python application has been written to deploy a 
training pipeline on OpenShift. At regular, user-specified intervals, this pipeline collects new data directly from a Prometheus instance and retrains a model on it. This application also hosts a web page which can be scraped as a target by Prometheus. This target currently serves 6 different metrics using two different prediction models (Prophet and Fourier Extrapolation). 7 | 8 | ## Getting Started 9 | 10 | ### Installing prerequisites 11 | 12 | To run this application, you will need to install the libraries listed in requirements.txt. 13 | 14 | To install all the dependencies at once, run the following command from inside the repository directory: 15 | ``` 16 | pip install -r requirements.txt 17 | ``` 18 | After all the prerequisites have been installed, open the Makefile; the required and optional variables are listed at the top. 19 | The required variables are used to communicate with the Prometheus and storage endpoints. 20 | 21 | Populating the Makefile is the most important step, as it is used to run the application on OpenShift, in Docker, or on your local machine. 22 | 23 | ### Running on a local machine 24 | 25 | After setting up the credentials in your Makefile, run the following command to start a Flask server which will regularly train the models and serve the predicted metrics as a Prometheus target: 26 | 27 | ``` 28 | make run_model 29 | ``` 30 | ## Running on Docker 31 | After populating all the required variables, set the name for your Docker app by changing the docker_app_name variable. Then run the following command to build the Docker image: 32 | ``` 33 | make docker_build 34 | ``` 35 | This command uses the Dockerfile included in the repository to build the image, so you can edit the Dockerfile to customize how the image is built. 36 | 37 | After the image is successfully built, run the following command to start a Flask server in a Docker container. This command also passes the Prometheus and storage credentials from the Makefile into the container as environment variables, so they can be easily changed there. 38 | 39 | ``` 40 | make docker_run 41 | ``` 42 | ## Deploying on OpenShift 43 | 44 | * ### Deploying a Flask application to predict and serve the predicted metrics: 45 | In the Makefile, set up the required variables and then run the following command: 46 | ``` 47 | make oc_deploy 48 | ``` 49 | This will create a deployment on OpenShift which, after training the Prophet model, will serve the predicted metrics as a web page (using the Flask web server). These predicted metrics can later be collected by a Prometheus instance. 50 | 51 | The following is a sample of what the served metrics will look like: 52 | ``` 53 | # HELP process_virtual_memory_bytes Virtual memory size in bytes. 54 | # TYPE process_virtual_memory_bytes gauge 55 | process_virtual_memory_bytes 13.0 56 | # HELP process_resident_memory_bytes Resident memory size in bytes. 57 | # TYPE process_resident_memory_bytes gauge 58 | process_resident_memory_bytes 31.0 59 | # HELP process_start_time_seconds Start time of the process since unix epoch in seconds. 60 | # TYPE process_start_time_seconds gauge 61 | process_start_time_seconds 15.25 62 | # HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. 63 | # TYPE process_cpu_seconds_total counter 64 | process_cpu_seconds_total 69.88 65 | # HELP process_open_fds Number of open file descriptors. 66 | # TYPE process_open_fds gauge 67 | process_open_fds 60.0 68 | # HELP process_max_fds Maximum number of open file descriptors.
69 | # TYPE process_max_fds gauge 70 | process_max_fds 14.0 71 | # HELP python_info Python platform information 72 | # TYPE python_info gauge 73 | python_info{implementation="CPython",major="3",minor="6",patchlevel="5",version="3.6.5"} 1.0 74 | # HELP predicted_values_prophet Forecasted value from Prophet model 75 | # TYPE predicted_values_prophet gauge 76 | predicted_values_prophet{beta_kubernetes_io_arch="amd64",beta_kubernetes_io_os="linux",instance="cpt-0001.redhat.com",job="kubernetes-nodes",kubernetes_io_hostname="cpt-0001.redhat.com",node_role_kubernetes_io_compute="true",operation_type="create_container",provider="rhos",quantile="0.5",region="compute",size="small"} 32.99 77 | # HELP predicted_values_prophet_yhat_upper Forecasted value upper bound from Prophet model 78 | # TYPE predicted_values_prophet_yhat_upper gauge 79 | predicted_values_prophet_yhat_upper{beta_kubernetes_io_arch="amd64",beta_kubernetes_io_os="linux",instance="cpt.redhat.com",job="kubernetes-nodes",kubernetes_io_hostname="cpt-0001.redhat.com",node_role_kubernetes_io_compute="true",operation_type="create_container",provider="rhos",quantile="0.5",region="compute",size="small"} 36.728885 80 | # HELP predicted_values_prophet_yhat_lower Forecasted value lower bound from Prophet model 81 | # TYPE predicted_values_prophet_yhat_lower gauge 82 | predicted_values_prophet_yhat_lower{beta_kubernetes_io_arch="amd64",beta_kubernetes_io_os="linux",instance="cpt-0001.redhat.com",job="kubernetes-nodes",kubernetes_io_hostname="cpt-0001.redhat.com",node_role_kubernetes_io_compute="true",operation_type="create_container",provider="rhos",quantile="0.5",region="compute",size="small"} 27881.58691175386 83 | # HELP predicted_values_fourier Forecasted value from Fourier Transform model 84 | # TYPE predicted_values_fourier gauge 85 | predicted_values_fourier{beta_kubernetes_io_arch="amd64",beta_kubernetes_io_os="linux",instance="cpt-0001.redhat.com",job="kubernetes-nodes",kubernetes_io_hostname="cpt-0001.redhat.com",node_role_kubernetes_io_compute="true",operation_type="create_container",provider="rhos",quantile="0.5",region="compute",size="small"} 29838.64724605837 86 | # HELP predicted_values_fourier_yhat_upper Forecasted value upper bound from Fourier Transform model 87 | # TYPE predicted_values_fourier_yhat_upper gauge 88 | predicted_values_fourier_yhat_upper{beta_kubernetes_io_arch="amd64",beta_kubernetes_io_os="linux",instance="cpt-0001.redhat.com",job="kubernetes-nodes",kubernetes_io_hostname="cpt-0001.redhat.com",node_role_kubernetes_io_compute="true",operation_type="create_container",provider="rhos",quantile="0.5",region="compute",size="small"} 37111.31044977396 89 | # HELP predicted_values_fourier_yhat_lower Forecasted value lower bound from Fourier Transform model 90 | # TYPE predicted_values_fourier_yhat_lower gauge 91 | predicted_values_fourier_yhat_lower{beta_kubernetes_io_arch="amd64",beta_kubernetes_io_os="linux",instance="cpt-0001.redhat.com",job="kubernetes-nodes",kubernetes_io_hostname="cpt-0001.redhat.com",node_role_kubernetes_io_compute="true",operation_type="create_container",provider="rhos",quantile="0.5",region="compute",size="small"} 29739.05799347848 92 | ``` 93 | 94 | ## Built With 95 | 96 | * [fbprohphet](https://github.com/facebook/prophet) - Facebook's timeseries forecasting library 97 | * [requests](http://docs.python-requests.org/en/master/) - HTTP Library for python 98 | * [boto3](https://boto3.readthedocs.io/en/latest/reference/core/session.html) - AWS sdk for python 99 | * 
[pandas](http://pandas.pydata.org/) - High Performance Data Structure 100 | * [flask](http://flask.pocoo.org/) - A lightweight web application framework 101 | * [apscheduler](https://apscheduler.readthedocs.io/en/latest/) - Python Scheduling library 102 | * [prometheus_client](https://github.com/prometheus/client_python) - Official Python client for Prometheus 103 | * [sortedcontainers](http://www.grantjenks.com/docs/sortedcontainers/) - Pure python sorted simple data structures 104 | * [Anomaly Detection](https://github.com/nfrumkin/forecast-prometheus/blob/master/anomaly_detector.py) - Anomaly Detection Function by Natasha Frumkin 105 | * [Fourier Extrapolation Model](https://github.com/nfrumkin/forecast-prometheus/blob/master/fourier_train.py) - Fourier Extrapolation Model by Natasha Frumkin 106 | * [Serving Prometheus Metrics](https://github.com/hemajv/flask-prometheus/blob/master/servicemetrics.py) - Flask Server to host Prometheus metrics by Hema Veeradhi 107 | -------------------------------------------------------------------------------- /lib/prometheus.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | import requests 3 | import datetime 4 | import json 5 | 6 | # Disable SSL warnings 7 | from requests.packages.urllib3.exceptions import InsecureRequestWarning 8 | requests.packages.urllib3.disable_warnings(InsecureRequestWarning) 9 | 10 | DEBUG = False 11 | MAX_REQUEST_RETRIES = 5 12 | 13 | class Prometheus: 14 | """docstring for Prometheus.""" 15 | def __init__(self, url='', end_time=None, token=None, data_chunk='1h',stored_data='1h'): 16 | self.headers = { 'Authorization': "bearer {}".format(token) } 17 | self.url = url 18 | self.prometheus_host = urlparse(self.url).netloc 19 | self._all_metrics = None 20 | self.data_chunk_size = data_chunk 21 | self.end_time = datetime.datetime.now() 22 | self.stored_data_range = stored_data 23 | self.DATA_CHUNK_SIZE_LIST = { 24 | '1m' : 60, 25 | '5m' : 300, 26 | '30m': 1800, 27 | '1h' : 3600, 28 | '3h' : 10800, 29 | '6h' : 21600, 30 | '12h': 43200, 31 | '1d' : 86400, 32 | '2d' : 172800} 33 | 34 | def all_metrics(self): 35 | ''' 36 | Get the list of all the metrics that the prometheus host has 37 | ''' 38 | if not self._all_metrics: 39 | response = requests.get('{0}/api/v1/label/__name__/values'.format(self.url), 40 | verify=False, # Disable ssl certificate verification temporarily 41 | headers=self.headers) 42 | if DEBUG: 43 | print("Headers -> ",self.headers) 44 | print("URL => ", response.url) 45 | if response.status_code == 200: 46 | self._all_metrics = response.json()['data'] 47 | else: 48 | raise Exception("HTTP Status Code {} {} ({})".format( 49 | response.status_code, 50 | requests.status_codes._codes[response.status_code][0], 51 | response.content 52 | )) 53 | return self._all_metrics 54 | 55 | def get_metric(self, name, chunks=None, data_size=None): 56 | if chunks: 57 | if str(chunks) in self.DATA_CHUNK_SIZE_LIST: 58 | self.data_chunk_size = str(chunks) 59 | pass 60 | else: 61 | print("Invalid Chunk Size, using default value: {}".format(self.data_chunk_size)) 62 | pass 63 | if data_size: 64 | if str(data_size) in self.DATA_CHUNK_SIZE_LIST: 65 | self.stored_data_range = str(data_size) 66 | pass 67 | else: 68 | print("Invalid Data Size, using default value: {}".format(self.stored_data_range)) 69 | pass 70 | 71 | if not name in self.all_metrics(): 72 | raise Exception("{} is not a valid metric".format(name)) 73 | elif DEBUG: 74 | print("Metric is valid.") 75 | 76 
| # num_chunks = 1 77 | num_chunks = int(self.DATA_CHUNK_SIZE_LIST[self.stored_data_range]/self.DATA_CHUNK_SIZE_LIST[self.data_chunk_size]) # Calculate the number of chunks using total data size and chunk size. 78 | metrics = self.get_metrics_from_prom(name, num_chunks) 79 | if metrics: 80 | return metrics 81 | 82 | # def get_metrics_from_prom(self, name, chunks): 83 | # if not name in self.all_metrics(): 84 | # raise Exception("{} is not a valid metric".format(name)) 85 | # 86 | # # start = self.start_time.timestamp() 87 | # end_timestamp = self.end_time.timestamp() 88 | # chunk_size = self.DATA_CHUNK_SIZE_LIST[self.data_chunk_size] 89 | # start = end_timestamp #- self.DATA_CHUNK_SIZE_LIST[self.stored_data_range] + chunk_size 90 | # data = [] 91 | # for i in range(chunks): 92 | # # gc.collect() # Garbage collect to save Memory 93 | # if DEBUG: 94 | # print("Getting chunk: ", i) 95 | # print("Start Time: ",datetime.datetime.fromtimestamp(start)) 96 | # 97 | # tries = 0 98 | # while tries < MAX_REQUEST_RETRIES: # Retry code in case of errors 99 | # response = requests.get('{0}/api/v1/query'.format(self.url), # using the query API to get raw data 100 | # params={'query': name+'['+self.data_chunk_size+']', 101 | # 'time': start 102 | # }, 103 | # verify=False, # Disable ssl certificate verification temporarily 104 | # headers=self.headers) 105 | # if DEBUG: 106 | # print(response.url) 107 | # pass 108 | # 109 | # tries+=1 110 | # if response.status_code == 200: 111 | # data += response.json()['data']['result'] 112 | # 113 | # if DEBUG: 114 | # # print("Size of recent chunk = ",getsizeof(data)) 115 | # # print(data) 116 | # print(datetime.datetime.fromtimestamp(response.json()['data']['result'][0]['values'][0][0])) 117 | # print(datetime.datetime.fromtimestamp(response.json()['data']['result'][0]['values'][-1][0])) 118 | # pass 119 | # 120 | # del response 121 | # tries = MAX_REQUEST_RETRIES 122 | # elif response.status_code == 504: 123 | # if tries >= MAX_REQUEST_RETRIES: 124 | # self.connection_errors_count+=1 125 | # return False 126 | # else: 127 | # print("Retry Count: ",tries) 128 | # sleep(CONNECTION_RETRY_WAIT_TIME) # Wait for a second before making a new request 129 | # else: 130 | # if tries >= MAX_REQUEST_RETRIES: 131 | # self.connection_errors_count+=1 132 | # raise Exception("HTTP Status Code {} {} ({})".format( 133 | # response.status_code, 134 | # requests.status_codes._codes[response.status_code][0], 135 | # response.content 136 | # )) 137 | # else: 138 | # print("Retry Count: ",tries) 139 | # sleep(CONNECTION_RETRY_WAIT_TIME) 140 | # 141 | # start += chunk_size 142 | # 143 | # return(json.dumps(data)) #This works 144 | 145 | def get_metrics_from_prom(self, name, chunks): 146 | if not name in self.all_metrics(): 147 | raise Exception("{} is not a valid metric".format(name)) 148 | 149 | # start = self.start_time.timestamp() 150 | end_timestamp = self.end_time.timestamp() 151 | chunk_size = self.DATA_CHUNK_SIZE_LIST[self.data_chunk_size] 152 | start = end_timestamp - self.DATA_CHUNK_SIZE_LIST[self.stored_data_range] + chunk_size 153 | data = [] 154 | for i in range(chunks): 155 | # gc.collect() # Garbage collect to save Memory 156 | if DEBUG: 157 | print("Getting chunk: ", i) 158 | print("Start Time: ",datetime.datetime.fromtimestamp(start)) 159 | 160 | tries = 0 161 | while tries < MAX_REQUEST_RETRIES: # Retry code in case of errors 162 | response = requests.get('{0}/api/v1/query'.format(self.url), # using the query API to get raw data 163 | params={'query': 
name+'['+self.data_chunk_size+']', 164 | 'time': start 165 | }, 166 | verify=False, # Disable ssl certificate verification temporarily 167 | headers=self.headers) 168 | if DEBUG: 169 | print(response.url) 170 | pass 171 | 172 | tries+=1 173 | if response.status_code == 200: 174 | data += response.json()['data']['result'] 175 | 176 | if DEBUG: 177 | # print("Size of recent chunk = ",getsizeof(data)) 178 | # print(data) 179 | print(datetime.datetime.fromtimestamp(response.json()['data']['result'][0]['values'][0][0])) 180 | print(datetime.datetime.fromtimestamp(response.json()['data']['result'][0]['values'][-1][0])) 181 | pass 182 | 183 | del response 184 | tries = MAX_REQUEST_RETRIES 185 | elif response.status_code == 504: 186 | if tries >= MAX_REQUEST_RETRIES: 187 | self.connection_errors_count+=1 188 | return False 189 | else: 190 | print("Retry Count: ",tries) 191 | sleep(CONNECTION_RETRY_WAIT_TIME) # Wait for a second before making a new request 192 | else: 193 | if tries >= MAX_REQUEST_RETRIES: 194 | self.connection_errors_count+=1 195 | raise Exception("HTTP Status Code {} {} ({})".format( 196 | response.status_code, 197 | requests.status_codes._codes[response.status_code][0], 198 | response.content 199 | )) 200 | else: 201 | print("Retry Count: ",tries) 202 | sleep(CONNECTION_RETRY_WAIT_TIME) 203 | 204 | start += chunk_size 205 | 206 | return(json.dumps(data)) 207 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import os 4 | import sys 5 | import bz2 6 | import pandas 7 | import argparse 8 | import pickle 9 | from flask import Flask, render_template_string, abort, Response 10 | from datetime import datetime, timedelta 11 | from prometheus_client import CollectorRegistry, generate_latest, REGISTRY, Counter, Gauge, Histogram 12 | from prometheus import Prometheus 13 | from model import * 14 | from ceph import CephConnect as cp 15 | from ast import literal_eval 16 | # Scheduling stuff 17 | from apscheduler.schedulers.background import BackgroundScheduler 18 | from apscheduler.triggers.interval import IntervalTrigger 19 | import atexit 20 | 21 | 22 | app = Flask(__name__) 23 | 24 | data_window = int(os.getenv('DATA_WINDOW_SIZE',60)) # Number of days of past data, the model should use to train 25 | 26 | url = os.getenv('URL') 27 | token = os.getenv('BEARER_TOKEN') 28 | 29 | # Specific metric to run the model on 30 | metric_name = os.getenv('METRIC_NAME','kubelet_docker_operations_latency_microseconds') 31 | 32 | print("Using Metric {}.".format(metric_name)) 33 | 34 | # This is where the model dictionary will be stored and retrieved from 35 | data_storage_path = "Data_Frames" + "/" + url[8:] + "/"+ metric_name + "/" + "prophet_model" + ".pkl" 36 | 37 | # Chunk size, download the complete data, but in smaller chunks, should be less than or equal to DATA_SIZE 38 | chunk_size = str(os.getenv('CHUNK_SIZE','1h')) 39 | 40 | # Net data size to scrape from prometheus 41 | data_size = str(os.getenv('DATA_SIZE','1h')) 42 | 43 | train_schedule = int(os.getenv('TRAINING_REPEAT_HOURS',6)) 44 | 45 | 46 | TRUE_LIST = ["True", "true", "1", "y"] 47 | 48 | store_intermediate_data = os.getenv("STORE_INTERMEDIATE_DATA", "False") # Setting this to true will store intermediate dataframes to ceph 49 | 50 | 51 | if str(os.getenv('GET_OLDER_DATA',"False")) in TRUE_LIST: 52 | print("Collecting previously stored data from {}".format(data_storage_path)) 53 | 
data_dict = cp().get_latest_df_dict(data_storage_path) # Need error handling inside this function, in case the storage path does not exist 54 | pass 55 | else: 56 | data_dict = {} 57 | 58 | 59 | config_list = [] 60 | fixed_label_config = str(os.getenv("LABEL_CONFIG",None)) # by default it will train for all label configurations. WARNING: Tthat might take a lot of time depending on your metrics and cpu 61 | if fixed_label_config != "None": 62 | config_list = fixed_label_config.split(";") # Separate multiple label configurations using a ';' (semi-colon) 63 | fixed_label_config_dict = literal_eval(config_list[0]) # # TODO: Add more error handling here 64 | 65 | 66 | predictions_dict_prophet = {} 67 | predictions_dict_fourier = {} 68 | current_metric_metadata = "" 69 | current_metric_metadata_dict = {} 70 | 71 | # iteration = 0 72 | def job(current_time): 73 | # TODO: Replace this function with model training function and set up the correct IntervalTrigger time 74 | global data_dict, predictions_dict_prophet, predictions_dict_fourier, current_metric_metadata, current_metric_metadata_dict, data_window, url, token, chunk_size, data_size, TRUE_LIST, store_intermediate_data 75 | global data, config_list 76 | # iteration += 1 77 | start_time = time.time() 78 | prom = Prometheus(url=url, token=token, data_chunk=chunk_size, stored_data=data_size) 79 | metric = prom.get_metric(metric_name) 80 | print("metric collected.") 81 | 82 | # Convert data to json 83 | metric = json.loads(metric) 84 | 85 | # Metric Json is converted to a shaped dataframe 86 | data_dict = get_df_from_json(metric, data_dict, data_window) # This dictionary contains all the sub-labels as keys and their data as Pandas DataFrames 87 | del metric, prom 88 | 89 | if str(store_intermediate_data) in TRUE_LIST: 90 | print("DataFrame stored at: ",cp().store_data(metric_name, pickle.dumps(data_dict), (data_storage_path + str(datetime.now().strftime('%Y%m%d%H%M'))))) 91 | pass 92 | 93 | 94 | if fixed_label_config != "None": #If a label config has been specified 95 | single_label_data_dict = {} 96 | 97 | # split into multiple label configs 98 | existing_config_list = list(data_dict.keys()) 99 | for config in config_list: 100 | config_found = False 101 | for existing_config in existing_config_list: 102 | if SortedDict(literal_eval(existing_config)) == SortedDict(literal_eval(config)): 103 | single_label_data_dict[existing_config] = data_dict[existing_config] 104 | config_found = True 105 | pass 106 | if not config_found: 107 | print("Specified Label Configuration {} was not found".format(config)) 108 | raise KeyError 109 | pass 110 | # single_label_data_dict[config] = data_dict[config] 111 | pass 112 | 113 | # single_label_data_dict[fixed_label_config] = data_dict[fixed_label_config] 114 | current_metric_metadata = list(single_label_data_dict.keys())[0] 115 | current_metric_metadata_dict = literal_eval(current_metric_metadata) 116 | 117 | print(data_dict[current_metric_metadata].head(5)) 118 | print(data_dict[current_metric_metadata].tail(5)) 119 | 120 | print("Using the default label config") 121 | predictions_dict_prophet = predict_metrics(single_label_data_dict) 122 | # print(single_label_data_dict) 123 | predictions_dict_fourier = predict_metrics_fourier(single_label_data_dict) 124 | pass 125 | else: 126 | for x in data_dict: 127 | print(data_dict[x].head(5)) 128 | print(data_dict[x].tail(5)) 129 | break 130 | pass 131 | predictions_dict_prophet = predict_metrics(data_dict) 132 | predictions_dict_fourier = predict_metrics_fourier(data_dict) 
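# At this point job() has scraped DATA_SIZE worth of the configured metric from Prometheus
# in CHUNK_SIZE chunks, merged it into one pandas DataFrame per label configuration
# (trimmed to the DATA_WINDOW_SIZE sliding window), optionally stored the intermediate
# DataFrames to Ceph, and trained both the Prophet and the Fourier-extrapolation models.
# The resulting forecasts are held in predictions_dict_prophet / predictions_dict_fourier
# for the /metrics route to serve.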
133 | 134 | # TODO: Trigger Data Pruning here 135 | function_run_time = time.time() - start_time 136 | 137 | print("Total time taken to train was: {} seconds.".format(function_run_time)) 138 | pass 139 | 140 | job(datetime.now()) 141 | 142 | # Schedular schedules a background job that needs to be run regularly 143 | scheduler = BackgroundScheduler() 144 | scheduler.start() 145 | scheduler.add_job( 146 | func=lambda: job(datetime.now()), 147 | trigger=IntervalTrigger(hours=train_schedule), 148 | id='training_job', 149 | name='Train Prophet model every day regularly', 150 | replace_existing=True) 151 | 152 | # Shut down the scheduler when exiting the app 153 | atexit.register(lambda: scheduler.shutdown()) 154 | 155 | 156 | 157 | # Initialize Multiple gauge metrics for the predicted values 158 | print("current_metric_metadata_dict: ", current_metric_metadata_dict) 159 | predicted_metric_name = "predicted_" + metric_name 160 | PREDICTED_VALUES_PROPHET = Gauge(predicted_metric_name + '_prophet', 'Forecasted value from Prophet model', [label for label in current_metric_metadata_dict if label != "__name__"]) 161 | PREDICTED_VALUES_PROPHET_UPPER = Gauge(predicted_metric_name + '_prophet_yhat_upper', 'Forecasted value upper bound from Prophet model', [label for label in current_metric_metadata_dict if label != "__name__"]) 162 | PREDICTED_VALUES_PROPHET_LOWER = Gauge(predicted_metric_name + '_prophet_yhat_lower', 'Forecasted value lower bound from Prophet model', [label for label in current_metric_metadata_dict if label != "__name__"]) 163 | 164 | PREDICTED_VALUES_FOURIER = Gauge(predicted_metric_name + '_fourier', 'Forecasted value from Fourier Transform model', [label for label in current_metric_metadata_dict if label != "__name__"]) 165 | PREDICTED_VALUES_FOURIER_UPPER = Gauge(predicted_metric_name + '_fourier_yhat_upper', 'Forecasted value upper bound from Fourier Transform model', [label for label in current_metric_metadata_dict if label != "__name__"]) 166 | PREDICTED_VALUES_FOURIER_LOWER = Gauge(predicted_metric_name + '_fourier_yhat_lower', 'Forecasted value lower bound from Fourier Transform model', [label for label in current_metric_metadata_dict if label != "__name__"]) 167 | 168 | PREDICTED_ANOMALY_PROPHET = Gauge(predicted_metric_name + '_prophet_anomaly', 'Detected Anomaly using the Prophet model', [label for label in current_metric_metadata_dict if label != "__name__"]) 169 | 170 | PREDICTED_ANOMALY_FOURIER = Gauge(predicted_metric_name + '_fourier_anomaly', 'Detected Anomaly using the Fourier model', [label for label in current_metric_metadata_dict if label != "__name__"]) 171 | 172 | # Standard Flask route stuff. 173 | @app.route('/') 174 | def hello_world(): 175 | return 'This is just a test page. Please add "/metrics" to the url of this page to see the predicted metrics.' 
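# The /metrics route below looks up the forecast row closest to the current time for every
# trained label configuration, pulls the metric's current value from Prometheus for comparison,
# updates the predicted_* gauges, and sets the *_anomaly gauges using detect_anomalies() on the
# most recent live samples.
#
# A Prometheus server can scrape this app like any other target. A minimal scrape_config sketch,
# assuming the app is reachable on port 8080 (the job name and target address below are
# illustrative, not part of this repository):
#
#   scrape_configs:
#     - job_name: 'train-prometheus'
#       metrics_path: /metrics
#       static_configs:
#         - targets: ['train-prom-example.apps.example.com:8080']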
176 | 177 | live_data_dict = {} 178 | 179 | @app.route('/metrics') 180 | def metrics(): 181 | global predictions_dict_prophet, predictions_dict_fourier, current_metric_metadata, current_metric_metadata_dict, metric_name, url, token, live_data_dict 182 | 183 | 184 | for metadata in predictions_dict_prophet: 185 | 186 | #Find the index matching with the current timestamp 187 | index_prophet = predictions_dict_prophet[metadata].index.get_loc(datetime.now(), method='nearest') 188 | index_fourier = predictions_dict_fourier[metadata].index.get_loc(datetime.now(), method='nearest') 189 | current_metric_metadata = metadata 190 | 191 | print("The current time is: ",datetime.now()) 192 | print("The matching index for Prophet model found was: \n", predictions_dict_prophet[metadata].iloc[[index_prophet]]) 193 | print("The matching index for Fourier Transform found was: \n", predictions_dict_fourier[metadata].iloc[[index_fourier]]) 194 | 195 | current_metric_metadata_dict = literal_eval(metadata) 196 | 197 | temp_current_metric_metadata_dict = current_metric_metadata_dict.copy() 198 | 199 | # delete the "__name__" key from the dictionary as we don't need it in labels (it is a non-permitted label) when serving the metrics 200 | del temp_current_metric_metadata_dict["__name__"] 201 | 202 | # TODO: the following function does not have good error handling or retry code in case of get request failure, need to fix that 203 | # Get the current metric value which will be compared with the predicted value to detect an anomaly 204 | metric = (Prometheus(url=url, token=token).get_current_metric_value(metric_name, temp_current_metric_metadata_dict)) 205 | 206 | # print("metric collected.") 207 | 208 | # Convert data to json 209 | metric = json.loads(metric) 210 | 211 | # Convert the json to a dictionary of pandas dataframes 212 | live_data_dict = get_df_from_single_value_json(metric, live_data_dict) 213 | 214 | # Trim the live data dataframe to only 5 most recent values 215 | live_data_dict[metadata] = live_data_dict[metadata][-5:] 216 | # print(live_data_dict) 217 | 218 | # Update the metric values for prophet model 219 | PREDICTED_VALUES_PROPHET.labels(**temp_current_metric_metadata_dict).set(predictions_dict_prophet[metadata]['yhat'][index_prophet]) 220 | PREDICTED_VALUES_PROPHET_UPPER.labels(**temp_current_metric_metadata_dict).set(predictions_dict_prophet[metadata]['yhat_upper'][index_prophet]) 221 | PREDICTED_VALUES_PROPHET_LOWER.labels(**temp_current_metric_metadata_dict).set(predictions_dict_prophet[metadata]['yhat_lower'][index_prophet]) 222 | 223 | # Update the metric values for fourier transform model 224 | PREDICTED_VALUES_FOURIER.labels(**temp_current_metric_metadata_dict).set(predictions_dict_fourier[metadata]['yhat'][index_fourier]) 225 | PREDICTED_VALUES_FOURIER_UPPER.labels(**temp_current_metric_metadata_dict).set(predictions_dict_fourier[metadata]['yhat_upper'][index_fourier]) 226 | PREDICTED_VALUES_FOURIER_LOWER.labels(**temp_current_metric_metadata_dict).set(predictions_dict_fourier[metadata]['yhat_lower'][index_fourier]) 227 | 228 | 229 | if len(live_data_dict[metadata] >= 5): 230 | pass 231 | # Update the metric values for detected anomalies 1 in case of anomaly, 0 if not 232 | if (detect_anomalies(predictions_dict_fourier[metadata][len(predictions_dict_fourier[metadata])-(len(live_data_dict[metadata])):],live_data_dict[metadata])): 233 | PREDICTED_ANOMALY_FOURIER.labels(**temp_current_metric_metadata_dict).set(1) 234 | else: 235 | 
PREDICTED_ANOMALY_FOURIER.labels(**temp_current_metric_metadata_dict).set(0) 236 | 237 | if (detect_anomalies(predictions_dict_prophet[metadata][len(predictions_dict_prophet[metadata])-(len(live_data_dict[metadata])):],live_data_dict[metadata])): 238 | PREDICTED_ANOMALY_PROPHET.labels(**temp_current_metric_metadata_dict).set(1) 239 | else: 240 | PREDICTED_ANOMALY_PROPHET.labels(**temp_current_metric_metadata_dict).set(0) 241 | pass 242 | 243 | return Response(generate_latest(REGISTRY).decode("utf-8"), content_type='text; charset=utf-8') 244 | 245 | if __name__ == "__main__": 246 | # Running the flask web server 247 | app.run(host='0.0.0.0', port=8080) 248 | pass 249 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from prometheus import Prometheus 2 | import pandas 3 | import numpy as np 4 | from numpy import fft 5 | import json 6 | import time 7 | # from lib.model import * 8 | from ceph import CephConnect as cp 9 | from datetime import datetime, timedelta 10 | from fbprophet import Prophet 11 | from sortedcontainers import SortedDict 12 | import os 13 | import gc 14 | import pickle 15 | import collections 16 | from scipy.stats import norm 17 | 18 | # Plotting 19 | # import matplotlib.pyplot as plt 20 | 21 | 22 | def get_df_from_json(metric, metric_dict_pd={}, data_window=5): 23 | ''' 24 | Method to convert a json object of a Prometheus metric to a dictionary of shaped Pandas DataFrames 25 | 26 | The shape is dict[metric_metadata] = Pandas Object 27 | 28 | Pandas Object = timestamp, value 29 | 15737933, 1 30 | ..... 31 | 32 | This method can also be used to update an existing dictionary with new data 33 | ''' 34 | # metric_dict = {} 35 | current_time = datetime.now() 36 | earliest_data_time = current_time - timedelta(days = data_window) 37 | 38 | 39 | print("Pre-processing Data...........") 40 | # metric_dict_pd = {} 41 | # print("Length of metric: ", len(metric)) 42 | for row in metric: 43 | # metric_dict[str(row['metric'])] = metric_dict.get(str(row['metric']),[]) + (row['values']) 44 | metric_metadata = str(SortedDict(row['metric']))[11:-1] # Sort the dictionary and then convert it to string so it can be hashed 45 | # print(metric_metadata) 46 | # print("Row Values: ",row['values']) 47 | if metric_metadata not in metric_dict_pd: 48 | metric_dict_pd[metric_metadata] = pandas.DataFrame(row['values'], columns=['ds', 'y']).apply(pandas.to_numeric, args=({"errors":"coerce"})) 49 | metric_dict_pd[metric_metadata]['ds'] = pandas.to_datetime(metric_dict_pd[metric_metadata]['ds'], unit='s') 50 | pass 51 | else: 52 | temp_df = pandas.DataFrame(row['values'], columns=['ds', 'y']).apply(pandas.to_numeric, args=({"errors":"coerce"})) 53 | temp_df['ds'] = pandas.to_datetime(temp_df['ds'], unit='s') 54 | # print(temp_df.head()) 55 | # print("Row Values: ",row['values'] 56 | # print("Temp Head Before 5: \n",temp_df.head(5)) 57 | # print("Head Before 5: \n",metric_dict_pd[metric_metadata].head(5)) 58 | # print("Tail Before 5: \n",metric_dict_pd[metric_metadata].tail(5)) 59 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].append(temp_df, ignore_index=True) 60 | # print("Head 5: \n",metric_dict_pd[metric_metadata].head(5)) 61 | # print("Tail 5: \n",metric_dict_pd[metric_metadata].tail(5)) 62 | mask = (metric_dict_pd[metric_metadata]['ds'] > earliest_data_time) 63 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].loc[mask] 64 | # del temp_df 
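# The mask above keeps only rows newer than (current_time - data_window days), so each
# label's DataFrame behaves as a sliding training window instead of growing without bound.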
65 | pass 66 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].dropna() 67 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].drop_duplicates('ds').sort_values(by=['ds']).reset_index(drop = True) 68 | 69 | if len(metric_dict_pd[metric_metadata]) == 0: 70 | del metric_dict_pd[metric_metadata] 71 | pass 72 | pass 73 | 74 | # print(metric_dict_pd[metric_metadata]) 75 | # mask = (metric_dict_pd[metric_metadata]['ds'] > earliest_data_time) & (metric_dict_pd[metric_metadata]['ds'] <= current_time) 76 | # metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].loc[mask] 77 | # break 78 | return metric_dict_pd 79 | 80 | 81 | def get_df_from_single_value_json(metric, metric_dict_pd={}, data_window=5): 82 | ''' 83 | Method to convert a json object of a Prometheus metric to a dictionary of shaped Pandas DataFrames 84 | 85 | The shape is dict[metric_metadata] = Pandas Object 86 | 87 | Pandas Object = timestamp, value 88 | 15737933, 1 89 | ..... 90 | 91 | This method can also be used to update an existing dictionary with new data 92 | ''' 93 | # metric_dict = {} 94 | current_time = datetime.now() 95 | earliest_data_time = current_time - timedelta(days = data_window) 96 | 97 | 98 | print("Pre-processing Data...........") 99 | # metric_dict_pd = {} 100 | # print("Length of metric: ", len(metric)) 101 | for row in metric: 102 | # metric_dict[str(row['metric'])] = metric_dict.get(str(row['metric']),[]) + (row['values']) 103 | metric_metadata = str(SortedDict(row['metric']))[11:-1] # Sort the dictionary and then convert it to string so it can be hashed 104 | # print(metric_metadata) 105 | # print("Row Values: ",row['values']) 106 | if metric_metadata not in metric_dict_pd: 107 | metric_dict_pd[metric_metadata] = pandas.DataFrame([row['value']], columns=['ds', 'y']).apply(pandas.to_numeric, args=({"errors":"coerce"})) 108 | metric_dict_pd[metric_metadata]['ds'] = pandas.to_datetime(metric_dict_pd[metric_metadata]['ds'], unit='s') 109 | pass 110 | else: 111 | temp_df = pandas.DataFrame([row['value']], columns=['ds', 'y']).apply(pandas.to_numeric, args=({"errors":"coerce"})) 112 | temp_df['ds'] = pandas.to_datetime(temp_df['ds'], unit='s') 113 | # print(temp_df.head()) 114 | # print("Row Values: ",row['values'] 115 | # print("Temp Head Before 5: \n",temp_df.head(5)) 116 | # print("Head Before 5: \n",metric_dict_pd[metric_metadata].head(5)) 117 | # print("Tail Before 5: \n",metric_dict_pd[metric_metadata].tail(5)) 118 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].append(temp_df, ignore_index=True) 119 | # print("Head 5: \n",metric_dict_pd[metric_metadata].head(5)) 120 | # print("Tail 5: \n",metric_dict_pd[metric_metadata].tail(5)) 121 | mask = (metric_dict_pd[metric_metadata]['ds'] > earliest_data_time) 122 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].loc[mask] 123 | # del temp_df 124 | pass 125 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].dropna() 126 | metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].drop_duplicates('ds').sort_values(by=['ds']).reset_index(drop = True) 127 | 128 | if len(metric_dict_pd[metric_metadata]) == 0: 129 | del metric_dict_pd[metric_metadata] 130 | pass 131 | pass 132 | 133 | # print(metric_dict_pd[metric_metadata]) 134 | # mask = (metric_dict_pd[metric_metadata]['ds'] > earliest_data_time) & (metric_dict_pd[metric_metadata]['ds'] <= current_time) 135 | # metric_dict_pd[metric_metadata] = metric_dict_pd[metric_metadata].loc[mask] 136 | # break 137 | return 


def predict_metrics(pd_dict, prediction_range=1440):
    '''
    Train a Prophet model for each DataFrame in pd_dict and return a dictionary
    of forecasts keyed by the same metric metadata.
    '''
    total_label_num = len(pd_dict)
    PREDICT_DURATION = prediction_range

    current_label_num = 0
    limit_iterator_num = 0

    predictions_dict = {}

    for meta_data in pd_dict:
        try:
            current_label_num += 1
            limit_iterator_num += 1

            print("Training Label {}/{}".format(current_label_num, total_label_num))
            data = pd_dict[meta_data]

            print("----------------------------------\n")
            print(meta_data)
            print("Number of Data Points: {}".format(len(pd_dict[meta_data])))
            print("----------------------------------\n")

            data['ds'] = pandas.to_datetime(data['ds'], unit='s')

            train_frame = data

            # Prophet modelling begins here
            m = Prophet(daily_seasonality=True, weekly_seasonality=True)

            print("Fitting the train_frame")
            m.fit(train_frame)

            # Forecast PREDICT_DURATION minutes into the future at 1-minute resolution
            future = m.make_future_dataframe(periods=int(PREDICT_DURATION), freq="1MIN")

            forecast = m.predict(future)

            # To plot the fit, uncomment:
            # fig1 = m.plot(forecast)
            # fig2 = m.plot_components(forecast)

            # Keep only the columns the rest of the pipeline consumes
            forecast['timestamp'] = forecast['ds']
            forecast = forecast[['timestamp', 'yhat', 'yhat_lower', 'yhat_upper']]
            forecast = forecast.set_index('timestamp')

            # Store predictions in the output dictionary
            predictions_dict[meta_data] = forecast
        except ValueError as exception:
            # str(exception) does not include the "ValueError: " prefix, so match
            # on the message itself to skip labels with too little usable data.
            if "less than 2 non-NaN rows" in str(exception):
                print("Too many NaN values........Skipping this label")
                limit_iterator_num -= 1
            else:
                raise exception

    return predictions_dict


def fourierExtrapolation(x, n_predict, n_harm):
    '''
    Extrapolate the series x for n_predict additional points using its DC
    component, the n_harm lowest-frequency harmonic pairs, and a fitted
    linear trend.
    '''
    n = x.size
    t = np.arange(0, n)
    p = np.polyfit(t, x, 1)            # find linear trend in x
    x_notrend = x - p[0] * t           # detrended x
    x_freqdom = fft.fft(x_notrend)     # detrended x in frequency domain
    f = fft.fftfreq(n)                 # frequencies
    indexes = np.arange(n).tolist()
    # sort indexes by frequency, lower -> higher
    indexes.sort(key=lambda i: np.absolute(f[i]))

    t = np.arange(0, n + n_predict)
    restored_sig = np.zeros(t.size)
    for i in indexes[:1 + n_harm * 2]:
        ampli = np.absolute(x_freqdom[i]) / n   # amplitude
        phase = np.angle(x_freqdom[i])          # phase
        restored_sig += ampli * np.cos(2 * np.pi * f[i] * t + phase)
    # add the linear trend back in
    return restored_sig + p[0] * t
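

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): fourierExtrapolation()
# applied to a toy sine-plus-trend signal. The values and parameters here are
# arbitrary, chosen only to show the expected input and output shapes.
def _example_fourier_forecast():
    t = np.arange(0, 200)
    toy_signal = 0.05 * t + np.sin(2 * np.pi * t / 50.0)
    # Forecast 60 extra points, using a third of the series length as the
    # number of harmonics, mirroring the heuristic in predict_metrics_fourier() below.
    forecast = fourierExtrapolation(toy_signal, 60, int(len(toy_signal) / 3))
    # The result covers the original window plus the forecast horizon.
    assert forecast.shape == (len(toy_signal) + 60,)
    return forecast
# ---------------------------------------------------------------------------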


def predict_metrics_fourier(pd_dict, prediction_range=1440):
    '''
    Fourier-based alternative to predict_metrics(): extrapolate each metric's
    series with fourierExtrapolation() and return a dictionary of forecasts in
    the same yhat/yhat_lower/yhat_upper format.
    '''
    total_label_num = len(pd_dict)
    PREDICT_DURATION = prediction_range

    current_label_num = 0
    limit_iterator_num = 0

    predictions_dict = {}

    for meta_data in pd_dict:
        try:
            data = pd_dict[meta_data]
            data['ds'] = pandas.to_datetime(data['ds'], unit='s')
            vals = np.array(data["y"].tolist())

            # run the model; the forecast covers the training window plus the prediction horizon
            print("Training Model......")
            forecast_vals = fourierExtrapolation(vals, prediction_range, int(len(vals) / 3))
            dataframe_cols = {}
            dataframe_cols["yhat"] = np.array(forecast_vals)

            # take the earliest timestamp from the original data and extrapolate
            # new timestamps at 1-minute resolution
            print("Creating Dummy Timestamps.....")
            min_time = min(data["ds"])
            dataframe_cols["timestamp"] = pandas.date_range(min_time, periods=len(forecast_vals), freq='min')

            # create dummy upper and lower bounds (one standard deviation around the mean)
            print("Computing Bounds....")
            upper_bound = np.mean(forecast_vals) + np.std(forecast_vals)
            lower_bound = np.mean(forecast_vals) - np.std(forecast_vals)
            dataframe_cols["yhat_upper"] = np.full(len(forecast_vals), upper_bound)
            dataframe_cols["yhat_lower"] = np.full(len(forecast_vals), lower_bound)

            # assemble the forecast frame and store it in predictions_dict
            print("Formatting Forecast to Pandas....")
            forecast = pandas.DataFrame(data=dataframe_cols)
            forecast = forecast.set_index('timestamp')
            predictions_dict[meta_data] = forecast

            current_label_num += 1
            limit_iterator_num += 1
        except ValueError as exception:
            # Match on the message itself (str(exception) has no "ValueError: " prefix)
            if "less than 2 non-NaN rows" in str(exception):
                print("Too many NaN values........Skipping this label")
                limit_iterator_num -= 1
            else:
                raise exception

    return predictions_dict


class Accumulator:
    '''
    Simple leaky counter used by detect_anomalies() to track how often the
    observed values drift away from the forecast.
    '''

    def __init__(self, thresh):
        self._counter = 0
        self.thresh = thresh

    def inc(self, val):
        self._counter += val

    def count(self):
        return self._counter


def detect_anomalies(predictions, data):
    '''
    Compare a forecast frame (yhat/yhat_lower/yhat_upper) against observed data
    (column y) covering the same number of points. Returns True only when both
    the accumulator test and the tail-probability test flag an anomaly.
    '''
    if len(predictions) != len(data):
        raise IndexError("predictions and data must cover the same number of points")

    # parameters
    lower_bound_thresh = predictions["yhat_lower"].min()
    upper_bound_thresh = predictions["yhat_upper"].max()
    diff_thresh = 3 * data["y"].std()
    acc_thresh = int(0.1 * np.shape(predictions)[0])
    epsilon = .01

    diffs = []
    acc = Accumulator(acc_thresh)
    preds = np.array(predictions["yhat"])
    dat = np.array(data["y"])
    for i in range(0, np.shape(predictions)[0]):
        diff = preds[i] - dat[i]
        if abs(diff) > diff_thresh:
            # large deviation from the forecast, increment the counter
            acc.inc(1)
        elif dat[i] < lower_bound_thresh:
            # found a trough, decrement so that acc will decay to 0
            acc.inc(-3)
        elif dat[i] > upper_bound_thresh:
            # found a peak, decrement so that acc will decay to 0
            acc.inc(-3)
        else:
            # no anomaly, decrement by 2
            acc.inc(-2)

        diffs.append(max(diff, 0))

    acc_anomaly = acc.count() > acc.thresh

    # Split the forecast errors into a reference window w (first 80%) and a
    # recent window w' (last 20%), then compute the tail probability of the
    # recent mean under the reference distribution.
    w_size = int(0.8 * len(data))
    w_prime_size = len(data) - w_size

    w = diffs[0:w_size]
    w_prime = diffs[w_size:]

    w_mu = np.mean(w)
    w_std = np.std(w)
    w_prime_mu = np.mean(w_prime)

    if w_std == 0:
        L_t = 0
    else:
        L_t = 1 - norm.sf((w_prime_mu - w_mu) / w_std)

    print(L_t)
    tail_prob_anomaly = L_t >= 1 - epsilon

    return acc_anomaly and tail_prob_anomaly
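

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): wiring a forecast of
# the kind produced by predict_metrics() / predict_metrics_fourier() into
# detect_anomalies(). The toy frames below are hypothetical; in the application
# `forecast` would be one entry of the predictions dictionary and `observed`
# would hold the matching live samples.
def _example_anomaly_check():
    n = 100
    rng = np.random.RandomState(0)
    observed = pandas.DataFrame({"y": rng.normal(10.0, 1.0, n)})
    forecast = pandas.DataFrame({
        "yhat": np.full(n, 10.0),
        "yhat_lower": np.full(n, 7.0),
        "yhat_upper": np.full(n, 13.0),
    })
    # detect_anomalies() expects the forecast slice and the observed window to
    # be the same length; it returns a single True/False verdict.
    return detect_anomalies(forecast, observed)
# ---------------------------------------------------------------------------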


if __name__ == "__main__":

    url = os.getenv('URL')
    token = os.getenv('BEARER_TOKEN')

    # Specific metric to run the model on
    metric_name = os.getenv('METRIC_NAME', 'kubelet_docker_operations_latency_microseconds')

    print("Using Metric {}.".format(metric_name))

    # This is where the model dictionary will be stored and retrieved from.
    # url[8:] strips the "https://" scheme so the host becomes part of the path.
    model_storage_path = "Models" + "/" + url[8:] + "/" + metric_name + "/" + "prophet_model" + ".pkl"

    # Chunk size: the complete data set is downloaded in smaller chunks of this
    # size, so it should be less than or equal to DATA_SIZE
    chunk_size = str(os.getenv('CHUNK_SIZE', '1d'))

    # Net data size to scrape from Prometheus
    data_size = str(os.getenv('DATA_SIZE', '1d'))

    # Number of minutes the model should predict values for
    # PREDICT_DURATION = 1440  # minutes, 1440 = 24 hours

    # Limit to the first few labels of the metric
    # LABEL_LIMIT = None

    # Prepare a connection to the Prometheus host
    prom = Prometheus(url=url, token=token, data_chunk=chunk_size, stored_data=data_size)

    # Get metric data from Prometheus
    metric = prom.get_metric(metric_name)
    print("metric collected.")
    del prom

    # Parse the Prometheus response from json
    metric = json.loads(metric)

    # The metric json is converted to a dictionary with the sub-labels as keys
    # and their data as shaped Pandas DataFrames
    pd_dict = get_df_from_json(metric)
    del metric

    predictions = predict_metrics(pd_dict)
    for x in predictions:
        print(predictions[x].head())
--------------------------------------------------------------------------------