├── prelert ├── __init__.py └── engineApiClient │ ├── __init__.py │ └── EngineApiClient.py ├── .gitignore ├── elk_connector ├── configs │ └── apache-access.json ├── README.md ├── elk_connector.py └── elk_connector_realtime.py ├── setup.py ├── csv ├── influencersToCsv.py └── recordsToCsv.py ├── deleteAllJobs.py ├── cloudwatch ├── listMetrics.py ├── README.md └── cloudWatchMetrics.py ├── pollAlerts.py ├── printJobRecords.py ├── simpleEngineApiExample.py ├── printJobBuckets.py ├── README.md ├── streamingApm.py └── LICENSE.md /prelert/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | Thumbs.db 3 | .DS_Store 4 | MANIFEST 5 | build/ 6 | dist/ 7 | -------------------------------------------------------------------------------- /prelert/engineApiClient/__init__.py: -------------------------------------------------------------------------------- 1 | from .EngineApiClient import EngineApiClient -------------------------------------------------------------------------------- /elk_connector/configs/apache-access.json: -------------------------------------------------------------------------------- 1 | { 2 | "type" : "apache-access", 3 | "job_config" : { 4 | "analysisConfig" : { 5 | "bucketSpan":600, 6 | "detectors":[{"function":"count","byFieldName":"response"}] 7 | }, 8 | "dataDescription" : { 9 | "format":"json", 10 | "timeField":"@timestamp", 11 | "timeFormat":"yyyy-MM-dd'T'HH:mm:ss.SSSX" 12 | } 13 | }, 14 | "search" : { 15 | "filter" : { "match_all" : {}}, 16 | "sort":[{"@timestamp" : {"order":"asc"} }], 17 | "_source" : ["response", "@timestamp"] 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################ 3 | # # 4 | # Copyright 2014-2016 Prelert Ltd # 5 | # # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); # 7 | # you may not use this file except in compliance with the License. # 8 | # You may obtain a copy of the License at # 9 | # # 10 | # http://www.apache.org/licenses/LICENSE-2.0 # 11 | # # 12 | # Unless required by applicable law or agreed to in writing, software # 13 | # distributed under the License is distributed on an "AS IS" BASIS, # 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 15 | # See the License for the specific language governing permissions and # 16 | # limitations under the License. 
# 17 | # # 18 | ############################################################################ 19 | """ 20 | Prelert Python packages 21 | """ 22 | 23 | from distutils.core import setup 24 | setup(name='Prelert', 25 | description='Python packages for Prelert', 26 | version='2.0.0', 27 | license='Apache License, Version 2.0', 28 | url='https://github.com/prelert/engine-python', 29 | packages=['prelert', 'prelert.engineApiClient'], 30 | ) 31 | -------------------------------------------------------------------------------- /csv/influencersToCsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################ 3 | # # 4 | # Copyright 2014-2015 Prelert Ltd # 5 | # # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); # 7 | # you may not use this file except in compliance with the License. # 8 | # You may obtain a copy of the License at # 9 | # # 10 | # http://www.apache.org/licenses/LICENSE-2.0 # 11 | # # 12 | # Unless required by applicable law or agreed to in writing, software # 13 | # distributed under the License is distributed on an "AS IS" BASIS, # 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 15 | # See the License for the specific language governing permissions and # 16 | # limitations under the License. # 17 | # # 18 | ############################################################################ 19 | """ 20 | Example of how influencers can be queried from the Engine API in CSV format 21 | using Python 2.6+ (including Python 3.x). No extra modules are required beyond 22 | those that come with a base Python install. 23 | 24 | Usage: 25 | 26 | python influencersToCsv.py [ [ ] ] 27 | 28 | The job ID and server hostname must be specified. The port defaults to 8080 29 | if not specified and the number of maximum number of results to 10000. 30 | 31 | Influencers are returned in descending order of influencer anomaly score; the 32 | most unusual will be at the top of the list. 
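For example, to write the top 500 influencers for a job called 'farequote' on a
local server to a file (the job ID and output file name here are illustrative):

    python influencersToCsv.py farequote localhost 8080 500 > influencers.csv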
33 | """ 34 | 35 | import csv 36 | import json 37 | import sys 38 | 39 | try: 40 | # For Python 3.x 41 | from urllib.request import urlopen 42 | except ImportError: 43 | # For Python 2.x 44 | from urllib2 import urlopen 45 | 46 | if len(sys.argv) < 3: 47 | sys.stderr.write('Usage: %s [ [ ] ]\n' % sys.argv[0]) 48 | sys.exit(1) 49 | 50 | job = sys.argv[1] 51 | server = sys.argv[2] 52 | port = 8080 53 | if len(sys.argv) >= 4: 54 | port = sys.argv[3] 55 | limit = 10000 56 | if len(sys.argv) >= 5: 57 | limit = sys.argv[4] 58 | 59 | url = 'http://%s:%s/engine/v2/results/%s/influencers?take=%s' % (server, port, job, limit) 60 | response = urlopen(url).read() 61 | json = json.loads(response.decode('utf-8')) 62 | writtenHeader = False 63 | csvWriter = csv.writer(sys.stdout) 64 | for document in json['documents']: 65 | if not writtenHeader: 66 | csvWriter.writerow([ key for key in sorted(document) ]) 67 | writtenHeader = True 68 | csvWriter.writerow([ str(document[key]) for key in sorted(document) ]) 69 | 70 | -------------------------------------------------------------------------------- /csv/recordsToCsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################ 3 | # # 4 | # Copyright 2014-2015 Prelert Ltd # 5 | # # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); # 7 | # you may not use this file except in compliance with the License. # 8 | # You may obtain a copy of the License at # 9 | # # 10 | # http://www.apache.org/licenses/LICENSE-2.0 # 11 | # # 12 | # Unless required by applicable law or agreed to in writing, software # 13 | # distributed under the License is distributed on an "AS IS" BASIS, # 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 15 | # See the License for the specific language governing permissions and # 16 | # limitations under the License. # 17 | # # 18 | ############################################################################ 19 | """ 20 | Example of how records can be queried from the Engine API in CSV format 21 | using Python 2.6+ (including Python 3.x). No extra modules are required beyond 22 | those that come with a base Python install. 23 | 24 | Usage: 25 | 26 | python recordsToCsv.py [ [ ] ] 27 | 28 | The job ID and server hostname must be specified. The port defaults to 8080 29 | if not specified and the number of maximum number of results to 10000. 30 | 31 | Records are returned in descending order of normalized probability; the most 32 | unusual will be at the top of the list. 
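Under the covers the script issues a single GET request of the form

    http://<server>:<port>/engine/v2/results/<jobid>/records?take=<limit>&sort=normalizedProbability

and writes one CSV row per returned result document, with a header row built
from the sorted field names of the first document.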
33 | """ 34 | 35 | import csv 36 | import json 37 | import sys 38 | 39 | try: 40 | # For Python 3.x 41 | from urllib.request import urlopen 42 | except ImportError: 43 | # For Python 2.x 44 | from urllib2 import urlopen 45 | 46 | if len(sys.argv) < 3: 47 | sys.stderr.write('Usage: %s [ [ ] ]\n' % sys.argv[0]) 48 | sys.exit(1) 49 | 50 | job = sys.argv[1] 51 | server = sys.argv[2] 52 | port = 8080 53 | if len(sys.argv) >= 4: 54 | port = sys.argv[3] 55 | limit = 10000 56 | if len(sys.argv) >= 5: 57 | limit = sys.argv[4] 58 | 59 | url = 'http://%s:%s/engine/v2/results/%s/records?take=%s&sort=normalizedProbability' % (server, port, job, limit) 60 | response = urlopen(url).read() 61 | json = json.loads(response.decode('utf-8')) 62 | writtenHeader = False 63 | csvWriter = csv.writer(sys.stdout) 64 | for document in json['documents']: 65 | if not writtenHeader: 66 | csvWriter.writerow([ key for key in sorted(document) ]) 67 | writtenHeader = True 68 | csvWriter.writerow([ str(document[key]) for key in sorted(document) ]) 69 | 70 | -------------------------------------------------------------------------------- /deleteAllJobs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################ 3 | # # 4 | # Copyright 2014 Prelert Ltd # 5 | # # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); # 7 | # you may not use this file except in compliance with the License. # 8 | # You may obtain a copy of the License at # 9 | # # 10 | # http://www.apache.org/licenses/LICENSE-2.0 # 11 | # # 12 | # Unless required by applicable law or agreed to in writing, software # 13 | # distributed under the License is distributed on an "AS IS" BASIS, # 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 15 | # See the License for the specific language governing permissions and # 16 | # limitations under the License. # 17 | # # 18 | ############################################################################ 19 | ''' 20 | Delete all the jobs in the Engine API. 21 | Request a list of jobs configured in the API then 22 | delete them one at a time using the job id. 23 | 24 | Be careful with this one you can't change your mind afterwards. 
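Example (deletes every job on an Engine API server running on localhost,
which is also the default if no arguments are given):

    python deleteAllJobs.py --host=localhost --port=8080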
25 | ''' 26 | 27 | import argparse 28 | import sys 29 | import json 30 | import logging 31 | import time 32 | 33 | from prelert.engineApiClient import EngineApiClient 34 | 35 | # defaults 36 | HOST = 'localhost' 37 | PORT = 8080 38 | BASE_URL = 'engine/v2' 39 | 40 | def parseArguments(): 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--host", help="The Prelert Engine API host defaults to " 43 | + HOST, default=HOST) 44 | parser.add_argument("--port", help="The Prelert Engine API port defaults to " 45 | + str(PORT), default=PORT) 46 | 47 | return parser.parse_args() 48 | 49 | 50 | def main(): 51 | args = parseArguments() 52 | host = args.host 53 | port = args.port 54 | 55 | 56 | # Create the REST API client 57 | engine_client = EngineApiClient(host, BASE_URL, port) 58 | 59 | while True: 60 | (http_status_code, response) = engine_client.getJobs() 61 | if http_status_code != 200: 62 | print (http_status_code, json.dumps(response)) 63 | break 64 | 65 | jobs = response['documents'] 66 | if (len(jobs) == 0): 67 | print "Deleted all jobs" 68 | break 69 | 70 | 71 | print "Deleting %d jobs" % (len(jobs)), 72 | 73 | for job in jobs: 74 | (http_status_code, response) = engine_client.delete(job['id']) 75 | if http_status_code != 200: 76 | print (http_status_code, json.dumps(response)) 77 | else: 78 | sys.stdout.write('.') 79 | sys.stdout.flush() 80 | print 81 | 82 | 83 | if __name__ == "__main__": 84 | main() 85 | 86 | -------------------------------------------------------------------------------- /cloudwatch/listMetrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################ 3 | # # 4 | # Copyright 2014 Prelert Ltd # 5 | # # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); # 7 | # you may not use this file except in compliance with the License. # 8 | # You may obtain a copy of the License at # 9 | # # 10 | # http://www.apache.org/licenses/LICENSE-2.0 # 11 | # # 12 | # Unless required by applicable law or agreed to in writing, software # 13 | # distributed under the License is distributed on an "AS IS" BASIS, # 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 15 | # See the License for the specific language governing permissions and # 16 | # limitations under the License. # 17 | # # 18 | ############################################################################ 19 | 20 | ''' 21 | Connects the Amazon CloudWatch and prints a list of all the available 22 | metrics. Useful for testing connection settings. 23 | 24 | The script has one mandatory argument - the path to a config filec 25 | containing the AWS connection settings. The file should have the following 26 | propteries: 27 | 28 | region=REGION 29 | aws_access_key_id=YOUR_ACCESS_ID 30 | aws_secret_access_key=YOUR_SECRET_KEY 31 | 32 | Where REGION is one of us-east-1, eu-west-1, etc 33 | 34 | ''' 35 | 36 | import argparse 37 | import ConfigParser 38 | import StringIO 39 | 40 | import boto.ec2 41 | import boto.ec2.cloudwatch 42 | 43 | 44 | def parseArguments(): 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("config", help="The AWS connection parameters.") 47 | parser.add_argument("--service", choices=["EC2", "EBS"], 48 | default="EC2", dest="service", 49 | help="The AWS service for which metrics will be listed. 
By default it is EC2.") 50 | 51 | return parser.parse_args() 52 | 53 | 54 | def main(): 55 | 56 | args = parseArguments() 57 | 58 | # read the config file 59 | config = ConfigParser.RawConfigParser() 60 | try: 61 | # insert a section header into the config so 62 | # ConfigParser will read it without complaint 63 | with open(args.config, "r") as config_file: 64 | ini_str = '[root]\n' + config_file.read() 65 | ini_fp = StringIO.StringIO(ini_str) 66 | config.readfp(ini_fp) 67 | except IOError: 68 | print "Error opening file " + args.config 69 | return 70 | 71 | 72 | try: 73 | region = config.get('root', 'region') 74 | access_id = config.get('root', 'aws_access_key_id') 75 | secret_key = config.get('root', 'aws_secret_access_key') 76 | except ConfigParser.NoOptionError as e: 77 | print e 78 | return 79 | 80 | # AWS CloudWatch connection 81 | cloudwatch_conn = boto.ec2.cloudwatch.connect_to_region(region, 82 | aws_access_key_id=access_id, 83 | aws_secret_access_key=secret_key) 84 | 85 | 86 | #metrics = cloudwatch_conn.list_metrics() 87 | metrics = cloudwatch_conn.list_metrics(namespace='AWS/' + args.service) 88 | for m in metrics: 89 | print m.name, m.namespace, m.dimensions 90 | 91 | 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /pollAlerts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################ 3 | # # 4 | # Copyright 2014 Prelert Ltd # 5 | # # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); # 7 | # you may not use this file except in compliance with the License. # 8 | # You may obtain a copy of the License at # 9 | # # 10 | # http://www.apache.org/licenses/LICENSE-2.0 # 11 | # # 12 | # Unless required by applicable law or agreed to in writing, software # 13 | # distributed under the License is distributed on an "AS IS" BASIS, # 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 15 | # See the License for the specific language governing permissions and # 16 | # limitations under the License. # 17 | # # 18 | ############################################################################ 19 | ''' 20 | Subscribe to the Prelert Engine API Alerts long poll end point for 21 | alerts. 22 | 23 | The script is invoked with one positional argument - the id of the 24 | job to alert on. Optional arguments set the thresholds 25 | at which to alert. One of --anomalyScore or --normalizedProbability 26 | should be set. 27 | 28 | The script runs in an infinite loop, re-subscribing to new alerts after 29 | the request either times out or an alert is returned. 30 | 31 | Run the script with '--help' to see the options.
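Example (alert when a bucket's anomaly score reaches 75, re-subscribing with a
90 second long poll timeout; the job ID is illustrative):

    python pollAlerts.py --anomalyScore=75 --timeout=90 myjob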
32 | 33 | ''' 34 | 35 | import argparse 36 | import sys 37 | import json 38 | import logging 39 | import time 40 | 41 | from prelert.engineApiClient import EngineApiClient 42 | 43 | # defaults 44 | HOST = 'localhost' 45 | PORT = 8080 46 | BASE_URL = 'engine/v2' 47 | 48 | 49 | def setupLogging(): 50 | ''' 51 | Log to console 52 | ''' 53 | logging.basicConfig(level=logging.INFO,format='%(asctime)s %(levelname)s %(message)s') 54 | 55 | 56 | def parseArguments(): 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument("--host", help="The Prelert Engine API host, defaults to " 59 | + HOST, default=HOST) 60 | parser.add_argument("--port", help="The Prelert Engine API port, defaults to " 61 | + str(PORT), default=PORT) 62 | parser.add_argument("--anomalyScore", help="Alert on buckets with anomaly score >= " 63 | + "this value", type=float, default=None) 64 | parser.add_argument("--normalizedProbability", help="Alert on records with a " 65 | + "normalized probablilty >= this", type=float, default=None) 66 | parser.add_argument("--timeout", help="The long poll timeout period", type=int, default=None) 67 | parser.add_argument("jobid", help="The job to alert on") 68 | return parser.parse_args() 69 | 70 | 71 | def printHeader(): 72 | print "Timestamp, Anomaly Score, Normalized Probablilty, URI, Results" 73 | 74 | def printAlert(alert): 75 | 76 | if 'bucket' in alert: 77 | data = alert['bucket'] 78 | else: 79 | data = alert['records'] 80 | 81 | line = "{0}, {1}, {2}, {3}. {4}".format(alert['timestamp'], 82 | alert['anomalyScore'], alert['maxNormalizedProbability'], 83 | alert['uri'], data) 84 | 85 | print line 86 | 87 | 88 | def main(): 89 | 90 | setupLogging() 91 | 92 | args = parseArguments() 93 | job_id = args.jobid 94 | 95 | # Create the REST API client 96 | engine_client = EngineApiClient(args.host, BASE_URL, args.port) 97 | 98 | logging.info("Subscribing to job '" + job_id + "' for alerts") 99 | 100 | printHeader() 101 | 102 | while True: 103 | 104 | try: 105 | (http_status_code, response) = engine_client.alerts_longpoll(job_id, 106 | normalized_probability_threshold=args.normalizedProbability, 107 | anomaly_score_threshold=args.anomalyScore, timeout=args.timeout) 108 | if http_status_code != 200: 109 | print (http_status_code, json.dumps(response)) 110 | break 111 | 112 | if response['timeout'] == False: 113 | printAlert(response) 114 | 115 | except KeyboardInterrupt: 116 | print "Exiting script..." 117 | 118 | if __name__ == "__main__": 119 | main() 120 | 121 | -------------------------------------------------------------------------------- /printJobRecords.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################ 3 | # # 4 | # Copyright 2014 Prelert Ltd # 5 | # # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); # 7 | # you may not use this file except in compliance with the License. # 8 | # You may obtain a copy of the License at # 9 | # # 10 | # http://www.apache.org/licenses/LICENSE-2.0 # 11 | # # 12 | # Unless required by applicable law or agreed to in writing, software # 13 | # distributed under the License is distributed on an "AS IS" BASIS, # 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 15 | # See the License for the specific language governing permissions and # 16 | # limitations under the License. 
# 17 | # # 18 | ############################################################################ 19 | ''' 20 | Pull all the anomaly records for the provided job id and print 21 | the timestamp, anomaly score and unusual score 22 | 23 | The script is invoked with 1 positional argument -the id of the 24 | job to query the results of. Additional optional arguments 25 | to specify the location of the Engine API. Run the script with 26 | '--help' to see the options. 27 | 28 | ''' 29 | 30 | import argparse 31 | import sys 32 | import json 33 | import logging 34 | import time 35 | 36 | from prelert.engineApiClient import EngineApiClient 37 | 38 | # defaults 39 | HOST = 'localhost' 40 | PORT = 8080 41 | BASE_URL = 'engine/v2' 42 | 43 | 44 | def setupLogging(): 45 | ''' 46 | Log to console 47 | ''' 48 | logging.basicConfig(level=logging.INFO,format='%(asctime)s %(levelname)s %(message)s') 49 | 50 | 51 | def parseArguments(): 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--host", help="The Prelert Engine API host, defaults to " 54 | + HOST, default=HOST) 55 | parser.add_argument("--port", help="The Prelert Engine API port, defaults to " 56 | + str(PORT), default=PORT) 57 | parser.add_argument("--anomalyScore", help="Filter out buckets with an anomalyScore " 58 | + "less than this", type=float, default=0.0) 59 | parser.add_argument("--normalizedProbability", help="Filter out buckets with an " 60 | + "max normalized probablilty less than this", type=float, default=0.0) 61 | parser.add_argument("jobid", help="The jobId to request results from", default="0") 62 | return parser.parse_args() 63 | 64 | 65 | def printHeader(): 66 | print "Date,Anomaly Score,Normalized Probability" 67 | 68 | def printRecords(records): 69 | for record in records: 70 | print "{0},{1},{2}".format(record['timestamp'], record['anomalyScore'], 71 | record['normalizedProbability']) 72 | 73 | 74 | def main(): 75 | 76 | setupLogging() 77 | 78 | args = parseArguments() 79 | job_id = args.jobid 80 | 81 | # Create the REST API client 82 | engine_client = EngineApiClient(args.host, BASE_URL, args.port) 83 | 84 | # Get all the records up to now 85 | logging.info("Get records for job " + job_id) 86 | 87 | skip = 0 88 | take = 200 89 | (http_status_code, response) = engine_client.getRecords(job_id, skip, take, 90 | normalized_probability_filter_value=args.normalizedProbability, 91 | anomaly_score_filter_value=args.anomalyScore) 92 | if http_status_code != 200: 93 | print (http_status_code, json.dumps(response)) 94 | return 95 | 96 | hit_count = int(response['hitCount']) 97 | 98 | printHeader() 99 | printRecords(response['documents']) 100 | 101 | while (skip + take) < hit_count: 102 | skip += take 103 | 104 | (http_status_code, response) = engine_client.getRecords(job_id, skip, take, 105 | normalized_probability_filter_value=args.normalizedProbability, 106 | anomaly_score_filter_value=args.anomalyScore) 107 | 108 | if http_status_code != 200: 109 | print (http_status_code, json.dumps(response)) 110 | return 111 | 112 | printRecords(response['documents']) 113 | 114 | 115 | if __name__ == "__main__": 116 | main() 117 | 118 | -------------------------------------------------------------------------------- /elk_connector/README.md: -------------------------------------------------------------------------------- 1 | Prelert Engine ELK Integration 2 | ============================== 3 | These scripts serve as a connector between ELK (Elasticsearch-logstash-Kibana) 4 | and the Prelert Engine API. 
You can analyze your historical log data or query 5 | new log records as they are added to logstash and forward them to the Engine for 6 | analysis in real time. 7 | 8 | Pre-requisites 9 | -------------- 10 | * ELK is installed 11 | * Engine API is installed 12 | * The Engine Python client is installed 13 | * `python setup.py install` 14 | 15 | 16 | Apache Web Server Access Logs Example 17 | -------------------------------------- 18 | In this example we create an Engine API job to analyze Apache Web Server Access logs stored 19 | in logstash. Here's an example of the access log format; in this case the website visitor is Microsoft's Bingbot: 20 | 21 | 157.55.33.115 - - [30/May/2014:01:02:39 +0000] "GET / HTTP/1.1" 200 36633 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" 22 | 23 | logstash has a standard *grok* pattern for access logs, so the files can be ingested 24 | with this configuration and stored in Elasticsearch: 25 | 26 | input { 27 | file { 28 | path => "/path/to/access.log" 29 | type => "apache-access" 30 | } 31 | } 32 | 33 | filter { 34 | grok { 35 | match => [ "message", "%{COMBINEDAPACHELOG}" ] 36 | } 37 | date { 38 | match => [ "timestamp" , "dd/MMM/yyyy:HH:mm:ss Z" ] 39 | } 40 | } 41 | 42 | output { 43 | elasticsearch {} 44 | } 45 | 46 | ### Job Configuration 47 | The fields under analysis for this job are the HTTP status code and the log's 48 | timestamp. We are interested in cases where the total count of a particular 49 | HTTP status code in a bucket is much higher or lower than usual. 50 | 51 | The analysis is configured with a 300 second (5 minute) *bucketSpan* and one 52 | detector (count by response). 53 | 54 | "analysisConfig" : { 55 | "bucketSpan":300, 56 | "detectors":[{"function":"count","byFieldName":"response"}] 57 | } 58 | 59 | The data is in JSON format and the field containing the timestamp is called '@timestamp': 60 | 61 | "dataDescription" : { 62 | "format":"json", 63 | "timeField":"@timestamp", 64 | "timeFormat":"yyyy-MM-dd'T'HH:mm:ss.SSSX" 65 | } 66 | 67 | ### Elasticsearch Query 68 | Our configuration file also contains the Elasticsearch query used to extract the log records. 69 | The results must be ordered by timestamp, earliest first, as the Engine API expects 70 | records to be presented in that order. As we are only using the 'response' and '@timestamp' 71 | fields, the query returns only those: 72 | 73 | { 74 | "filter" : { "match_all" : {}}, 75 | "sort":[{"@timestamp" : {"order":"asc"} }], 76 | "_source" : ["response", "@timestamp"] 77 | } 78 | 79 | ### Connector Configuration 80 | The config file must define a type; this is the same as the logstash type and is 81 | used in Elasticsearch queries: 82 | 83 | { 84 | "type" : "apache-access", 85 | ... 86 | } 87 | 88 | 89 | Analyzing Stored Data 90 | --------------------- 91 | Logstash puts each day's data into a separate index and names that index following 92 | the pattern 'logstash-YYYY.MM.DD'. When querying Elasticsearch for logstash records 93 | the most efficient strategy is to search one index at a time, and this is the approach 94 | taken by the [elk_connector.py](elk_connector.py) script using the predictable logstash 95 | index names. Start and end dates can be supplied as optional arguments on the command 96 | line; otherwise the script finds the oldest index containing the configured data type 97 | and starts from there.
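Because the index names are predictable, the per-day iteration used by the script
is easy to reproduce; a minimal sketch (the date range shown is illustrative):

    from datetime import date, timedelta

    def logstash_indexes(start, end):
        # yield 'logstash-YYYY.MM.DD' index names from start to end inclusive
        day = start
        while day <= end:
            yield 'logstash-' + day.strftime('%Y.%m.%d')
            day += timedelta(days=1)

    for name in logstash_indexes(date(2014, 4, 1), date(2014, 4, 3)):
        print name    # logstash-2014.04.01, logstash-2014.04.02, logstash-2014.04.03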
98 | 99 | ####For help see 100 | python elk_connector.py --help 101 | 102 | ####Example 103 | Using the configuration in 'configs/apache-access.json' analyze all data after April 1st 2014 104 | 105 | python elk_connector.py --start_date=2014-01-04 configs/apache-access.json 106 | 107 | 108 | Analyzing Real Time Data 109 | ------------------------ 110 | The script [elk_connector_realtime.py](elk_connector_realtime.py) reads log records 111 | from logstash indexes in Elasticsearch and uploads them to the Prelert Engine in 112 | real time. By default the last 60 seconds of logs are read every 60 seconds this 113 | can be changed by setting the '--update-interval' argument. 114 | 115 | ####For help see 116 | python elk_connector_realtime.py --help 117 | 118 | ####Example 119 | Connect to the Elasticsearch cluster on host 'elasticsearch-server' and the Prelert 120 | Engine API on 'prelert-server' sending the data to job 'XXXX' 121 | 122 | python elk_connector_realtime.py --es-host=elasticsearch-server 123 | --api-host=prelert-server --job-id=XXXX configs/syslog.json 124 | -------------------------------------------------------------------------------- /simpleEngineApiExample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################ 3 | # # 4 | # Copyright 2014 Prelert Ltd # 5 | # # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); # 7 | # you may not use this file except in compliance with the License. # 8 | # You may obtain a copy of the License at # 9 | # # 10 | # http://www.apache.org/licenses/LICENSE-2.0 # 11 | # # 12 | # Unless required by applicable law or agreed to in writing, software # 13 | # distributed under the License is distributed on an "AS IS" BASIS, # 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 15 | # See the License for the specific language governing permissions and # 16 | # limitations under the License. # 17 | # # 18 | ############################################################################ 19 | ''' 20 | Creates a new job and uploads farequote.csv to it. The job is 21 | then closed and the result buckets queried, 22 | 23 | The example file used can be downloaded from 24 | http://s3.amazonaws.com/prelert_demo/farequote.csv and looks like this: 25 | 26 | time,airline,responsetime,sourcetype 27 | 2014-06-23 00:00:00Z,AAL,132.2046,farequote 28 | 2014-06-23 00:00:00Z,JZA,990.4628,farequote 29 | 2014-06-23 00:00:00Z,JBU,877.5927,farequote 30 | 31 | The script is invoked with 1 positional argument the farequote.csv 32 | file and has optional arguments to specify the location of the 33 | Engine API. Run the script with '--help' to see the options. 34 | 35 | The output is CSV print out of date and anomaly score. 36 | If a bucket timestamp is specified only the anomaly records for that bucket 37 | are returned. 
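To run against an Engine API server on another machine (the host name here is
illustrative):

    python simpleEngineApiExample.py --host=prelert-server --port=8080 farequote.csv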
38 | ''' 39 | 40 | import argparse 41 | import sys 42 | import json 43 | import logging 44 | 45 | 46 | from prelert.engineApiClient import EngineApiClient 47 | 48 | 49 | # Prelert Engine API connection prarams 50 | HOST = 'localhost' 51 | PORT = 8080 52 | BASE_URL = 'engine/v2' 53 | 54 | 55 | def setupLogging(): 56 | ''' 57 | Log to console 58 | ''' 59 | logging.basicConfig(level=logging.INFO,format='%(asctime)s %(levelname)s %(message)s') 60 | 61 | def parseArguments(): 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument("--host", help="The Prelert Engine API host, defaults to " 64 | + HOST, default=HOST) 65 | parser.add_argument("--port", help="The Prelert Engine API port, defaults to " 66 | + str(PORT), default=PORT) 67 | parser.add_argument("file", help="Path to farequote.csv") 68 | 69 | return parser.parse_args() 70 | 71 | 72 | def main(): 73 | 74 | setupLogging() 75 | 76 | args = parseArguments() 77 | 78 | # Create the REST API client 79 | engine_client = EngineApiClient(args.host, BASE_URL, args.port) 80 | 81 | job_config = '{"analysisConfig" : {\ 82 | "bucketSpan":3600,\ 83 | "detectors" :[{"function":"metric","fieldName":"responsetime","byFieldName":"airline"}] },\ 84 | "dataDescription" : {"fieldDelimiter":",", "timeField":"time", "timeFormat":"yyyy-MM-dd HH:mm:ssX"} }' 85 | 86 | logging.info("Creating job") 87 | (http_status_code, response) = engine_client.createJob(job_config) 88 | if http_status_code != 201: 89 | print (http_status_code, json.dumps(response)) 90 | return 91 | 92 | job_id = response['id'] 93 | 94 | logging.info("Uploading data to " + job_id) 95 | file = open(args.file, 'rb') 96 | (http_status_code, response) = engine_client.upload(job_id, file) 97 | if http_status_code != 202: 98 | print (http_status_code, json.dumps(response)) 99 | return 100 | 101 | 102 | logging.info("Closing job " + job_id) 103 | (http_status_code, response) = engine_client.close(job_id) 104 | if http_status_code != 202: 105 | print (http_status_code, json.dumps(response)) 106 | return 107 | 108 | logging.info("Get result buckets for job " + job_id) 109 | (http_status_code, response) = engine_client.getAllBuckets(job_id) 110 | if http_status_code != 200: 111 | print (http_status_code, json.dumps(response)) 112 | else: 113 | print "Date,Anomaly Score,Max Normalized Probablility" 114 | for bucket in response: 115 | print "{0},{1},{2}".format(bucket['timestamp'], bucket['anomalyScore'], 116 | bucket['maxNormalizedProbability']) 117 | 118 | 119 | if __name__ == "__main__": 120 | main() 121 | 122 | -------------------------------------------------------------------------------- /printJobBuckets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################ 3 | # # 4 | # Copyright 2014 Prelert Ltd # 5 | # # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); # 7 | # you may not use this file except in compliance with the License. # 8 | # You may obtain a copy of the License at # 9 | # # 10 | # http://www.apache.org/licenses/LICENSE-2.0 # 11 | # # 12 | # Unless required by applicable law or agreed to in writing, software # 13 | # distributed under the License is distributed on an "AS IS" BASIS, # 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 15 | # See the License for the specific language governing permissions and # 16 | # limitations under the License. 
# 17 | # # 18 | ############################################################################ 19 | ''' 20 | Pull the latest results for the provided job id and print 21 | the bucket timestamp and anomaly score. 22 | 23 | The script is invoked with 1 positional argument -the id of the 24 | job to query the results of. Additional optional arguments 25 | to specify the location of the Engine API. Run the script with 26 | '--help' to see the options. 27 | 28 | If the --continue-poll flag is set then loop 29 | indefinitley polling every 10 seconds for any new results. 30 | ''' 31 | 32 | import argparse 33 | import sys 34 | import json 35 | import logging 36 | import time 37 | 38 | from prelert.engineApiClient import EngineApiClient 39 | 40 | # defaults 41 | HOST = 'localhost' 42 | PORT = 8080 43 | BASE_URL = 'engine/v2' 44 | 45 | # time between polling for new results 46 | POLL_INTERVAL_SECS = 10 47 | 48 | 49 | def setupLogging(): 50 | ''' 51 | Log to console 52 | ''' 53 | logging.basicConfig(level=logging.INFO,format='%(asctime)s %(levelname)s %(message)s') 54 | 55 | 56 | def parseArguments(): 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument("--host", help="The Prelert Engine API host, defaults to " 59 | + HOST, default=HOST) 60 | parser.add_argument("--port", help="The Prelert Engine API port, defaults to " 61 | + str(PORT), default=PORT) 62 | parser.add_argument("--continue-poll", action='store_true', help="If set then " 63 | "continue polling in real time for new results", dest="continue_poll") 64 | parser.add_argument("--anomalyScore", help="Filter out buckets with an anomalyScore " 65 | + "less than this", type=float, default=0.0) 66 | parser.add_argument("--normalizedProbability", help="Filter out buckets with an " 67 | + "max normalized probablilty less than this", type=float, default=0.0) 68 | parser.add_argument("jobid", help="The jobId to request results from", default="0") 69 | return parser.parse_args() 70 | 71 | 72 | def printHeader(): 73 | print "Date,Anomaly Score,Max Normalized Probablility" 74 | 75 | def printBuckets(buckets): 76 | for bucket in buckets: 77 | print "{0},{1},{2}".format(bucket['timestamp'], bucket['anomalyScore'], 78 | bucket['maxNormalizedProbability']) 79 | 80 | def main(): 81 | 82 | setupLogging() 83 | 84 | args = parseArguments() 85 | job_id = args.jobid 86 | 87 | # Create the REST API client 88 | engine_client = EngineApiClient(args.host, BASE_URL, args.port) 89 | 90 | # Get all the buckets up to now 91 | logging.info("Get result buckets for job " + job_id) 92 | (http_status_code, response) = engine_client.getAllBuckets(job_id, 93 | include_records=False, 94 | anomaly_score_filter_value=args.anomalyScore, 95 | normalized_probability_filter_value=args.normalizedProbability) 96 | 97 | 98 | if http_status_code != 200: 99 | print (http_status_code, json.dumps(response)) 100 | return 101 | 102 | 103 | printHeader() 104 | printBuckets(response) 105 | 106 | if args.continue_poll: 107 | 108 | if len(response) > 0: 109 | next_bucket_id = int(response[-1]['id']) + 1 110 | else: 111 | next_bucket_id = None 112 | 113 | while True: 114 | # Wait POLL_INTERVAL_SECS then query for any new buckets 115 | time.sleep(POLL_INTERVAL_SECS) 116 | 117 | (http_status_code, response) = engine_client.getBucketsByDate(job_id=job_id, 118 | start_date=str(next_bucket_id), end_date=None, 119 | include_records=False, 120 | anomaly_score_filter_value=args.anomalyScore, 121 | normalized_probability_filter_value=args.normalizedProbability) 122 | 123 | if http_status_code != 
200: 124 | print (http_status_code, json.dumps(response)) 125 | break 126 | 127 | printBuckets(response) 128 | 129 | if len(response) > 0: 130 | next_bucket_id = int(response[-1]['id']) + 1 131 | 132 | 133 | if __name__ == "__main__": 134 | main() 135 | 136 | -------------------------------------------------------------------------------- /cloudwatch/README.md: -------------------------------------------------------------------------------- 1 | Prelert - Data Analytics for AWS CloudWatch 2 | ============================== 3 | 4 | Prelert's analytics provides fast insights into your machine data. This simple example 5 | shows how to use Anomaly Detection for your CloudWatch monitoring data. 6 | 7 | 8 | Pre-requisites and Installation 9 | -------------- 10 | * The Prelert Engine API, download a free trial from [here](http://www.prelert.com/reg/beta-signup.html) 11 | * The Prelert Engine API [Python client](https://github.com/prelert/engine-python) 12 | * [Boto](https://github.com/boto/boto) the Python interface to Amazon Web Services 13 | * An Amazon Web Services account and your access details 14 | 15 | Install the Prelert Python client from GitHub 16 | 17 | git clone https://github.com/prelert/engine-python.git 18 | cd engine-python 19 | python setup.py install 20 | 21 | Boto can either be installed using `pip` 22 | 23 | pip install boto 24 | 25 | or cloned from GitHub 26 | 27 | git clone git://github.com/boto/boto.git 28 | cd boto 29 | python setup.py install 30 | 31 | 32 | Connecting to Amazon CloudWatch 33 | ---------- 34 | First create a configuration file containing your Amazon access ID and key e.g: 35 | 36 | region=REGION 37 | aws_access_key_id=YOUR_ACCESS_ID 38 | aws_secret_access_key=YOUR_SECRET_KEY 39 | 40 | Where REGION is one of us-east-1, eu-west-1, etc 41 | 42 | Save the configuration as aws_access.conf and test the connection parameters using 43 | the [listMetrics.py](listMetrics.py) script 44 | 45 | python listMetrics.py aws_access.conf 46 | 47 | If the script reports an error instead of a list of metrics, like the example below, check your connection settings. 48 | 49 | DiskReadBytes AWS/EC2 {u'InstanceId': [u'i-d9789exx']} 50 | DiskWriteBytes AWS/EC2 {u'InstanceId': [u'i-4b8c47xx']} 51 | NetworkIn AWS/EC2 {u'InstanceId': [u'i-4b8c47xx']} 52 | ... 53 | 54 | Metric Data 55 | ------------ 56 | CloudWatch metrics have a name, namespace and a list of dimensions. In this case we 57 | are only interested in metrics from the 'AWS/EC2' namespace 58 | 59 | Metric Name, Namespace, Dimensions 60 | NetworkIn, AWS/EC2, {'InstanceId': [u'i-baaa95xx']} 61 | CPUUtilization, AWS/EC2, {'InstanceId': [u'i-140862xx']} 62 | 63 | For a particular instance/metric combination Amazon provide an API call to get the 64 | metrics statistics, in this case the average value was requested for the *CPUUtilization* metric 65 | returning a list of datapoints: 66 | 67 | {'Timestamp': datetime.datetime(2014, 9, 10, 10, 26), 'Average': 1.8, 'Unit': 'Percent'} 68 | {'Timestamp': datetime.datetime(2014, 9, 10, 10, 31), 'Average': 1.0, 'Unit': 'Percent'} 69 | ... 70 | 71 | The data is then formatted in a manner suitable for uploading to the Prelert Engine. 
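A minimal sketch of that reformatting step, assuming the datapoint dictionaries
returned by boto shown above (cloudWatchMetrics.py does the real work; the helper
name is illustrative and timezone handling is omitted):

    import json

    def to_engine_record(instance_id, metric_name, datapoint):
        # one JSON document per CloudWatch datapoint, using the field names
        # expected by the job configuration described below
        return json.dumps({
            "timestamp": datapoint['Timestamp'].isoformat(),
            "instance": instance_id,
            "metric_name": metric_name,
            "Average": datapoint['Average']
        })

The formatted records then look like this: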
72 | 73 | {"timestamp":"2014-09-10T11:05:00+00:00", "instance":"i-1a1743xx", "metric_name":"CPUUtilization", "Average":80.01} 74 | {"timestamp":"2014-09-10T11:05:00+00:00", "instance":"i-140862xx", "metric_name":"NetworkIn", "Average":8722.6} 75 | {"timestamp":"2014-09-10T11:05:00+00:00", "instance":"i-1a1743xx", "metric_name":"StatusCheckFailed", "Average":0.0} 76 | {"timestamp":"2014-09-10T11:05:00+00:00", "instance":"i-1a1743xx", "metric_name":"DiskWriteOps", "Average":0.0} 77 | {"timestamp":"2014-09-10T11:05:00+00:00", "instance":"i-1a1743xx", "metric_name":"DiskReadOps", "Average":1.0} 78 | 79 | 80 | Job Configuration 81 | ------------------ 82 | 83 | The Prelert Engine job is defined as having one detector configured to analyze the mean of the field named 84 | 'Average' by the field 'metric_name', where the value of 'metric_name' is one of the 85 | AWS metrics, i.e. CPUUtilization, DiskWriteOps, etc. The analysis is partitioned by AWS instance ID. 86 | `bucketSpan` is set to 300 seconds, which should match the CloudWatch reporting interval. 87 | 88 | "analysisConfig" : { 89 | "bucketSpan": 300, 90 | "detectors" : [{"function":"mean","fieldName":"Average","byFieldName":"metric_name","partitionFieldName":"instance"}] 91 | } 92 | 93 | The job's dataDescription instructs the Engine that the data is in JSON format and how to parse the timestamp: 94 | 95 | "dataDescription" : {"format":"JSON","timeField":"timestamp","timeFormat":"yyyy-MM-dd'T'HH:mm:ssX"} 96 | 97 | 98 | With the metric data in a suitable format and the job configured, we are now ready to monitor CloudWatch data. 99 | 100 | Analyzing Real Time Data 101 | ------------------------- 102 | 103 | [cloudWatchMetrics.py](cloudWatchMetrics.py) requires one argument - the AWS connection file created previously. 104 | 105 | python cloudWatchMetrics.py aws_access.conf 106 | 107 | In this mode the script will create a new job then run in an infinite loop; every 5 minutes 108 | it will extract the previous 5 minutes of metric values from CloudWatch and upload this data to the Prelert Engine API. 109 | To send the data to a previously defined job use the *--job-id* option: 110 | 111 | python cloudWatchMetrics.py --job-id=cloudwatch aws_access.conf 112 | 113 | To stop the process press Ctrl-C; the script will catch the interrupt and exit gracefully after closing the running job. 114 | 115 | 116 | Analyzing Stored Data 117 | ---------------------- 118 | If you wish to analyse historical data stored in CloudWatch, the script accepts *--start-date* and *--end-date* 119 | with the dates in YYYY-MM-DD format: 120 | 121 | python cloudWatchMetrics.py --start-date=2014-09-01 --end-date=2014-09-08 aws_access.conf 122 | 123 | The script will exit once it has queried all the data for that time period and analysed it. 124 | 125 | Note that the script assumes a default host and port for the Engine API; you can specify different 126 | settings using the *--api-host* and *--api-port* options: 127 | 128 | python cloudWatchMetrics.py --api-host=my.server --api-port=8000 aws_access.conf 129 | 130 | 131 | Analytic Results 132 | ----------------- 133 | Whether running in real time or historical mode, the results of Prelert's analysis are made available once the 134 | data has been processed. Review the results in the Jobs Dashboard (typically hosted at http://localhost:8080/dashboard/index.html#/dashboard/file/prelert_api_jobs.json) or directly through the API.
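The bucket results can also be pulled programmatically with the same Python client
used by the connector scripts (the job ID 'cloudwatch' below assumes a job created
with the *--job-id* option shown above):

    from prelert.engineApiClient import EngineApiClient

    engine_client = EngineApiClient('localhost', 'engine/v2', 8080)
    (http_status_code, response) = engine_client.getAllBuckets('cloudwatch')
    if http_status_code == 200:
        for bucket in response:
            print bucket['timestamp'], bucket['anomalyScore']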
For more information about the API results format see the [API reference](http://www.prelert.com/docs/engine_api/1.0/results.html) 135 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | engine-python 2 | ============= 3 | 4 | A Python HTTP client to the Prelert Anomaly Detective Engine REST API. The client creates analysis jobs, streams data to them and queries the results. 5 | 6 | Prior to using the client, the Engine API needs to be installed and setup. Please follow these steps: 7 | 8 | - Have a read of our documentation: http://www.prelert.com/docs/engine_api/latest 9 | - Download and install the Anomaly Detective Engine API from here: http://www.prelert.com/reg/anomaly-detective-engine-api.html 10 | - We recommend you try our quick start example: http://www.prelert.com/docs/engine_api/latest/quick-start.html 11 | 12 | If you are already familiar with connecting to the Engine API, then please check out: 13 | 14 | - [Elasticsearch ELK Connector](elk_connector) 15 | - [AWS CloudWatch Connector](cloudwatch) 16 | 17 | 18 | Running the Examples 19 | --------------------- 20 | First install the client using the standard setup script: 21 | 22 | python setup.py install 23 | 24 | The client uses the standard json, httplib and logging packages no additional 25 | dependencies are required. The example scripts use the `argparse` module that 26 | was added to Python in version 2.7. 27 | 28 | Using the Client 29 | ----------------- 30 | The easiest way is to walk you through this annotated example. 31 | See the file [simpleEngineApiExample.py](simpleEngineApiExample.py) and download the example data 32 | from [http://s3.amazonaws.com/prelert_demo/farequote.csv](http://s3.amazonaws.com/prelert_demo/farequote.csv). To run the full example invoke: 33 | 34 | python simpleEngineApiExample.py farequote.csv 35 | 36 | Your first act is to create the client and make a HTTP connection to the API server 37 | 38 | from prelert.engineApiClient import EngineApiClient 39 | engine_client = EngineApiClient(host='localhost', base_url='/engine/v0.3', port=8080) 40 | 41 | Before you can create a job the configuration must be defined. 42 | 43 | job_config = '{ \ 44 | "analysisConfig" : {\ 45 | "bucketSpan":3600,\ 46 | "detectors" :[{"fieldName":"responsetime","byFieldName":"airline"}] },\ 47 | "dataDescription" : {"fieldDelimiter":",", "timeField":"time", "timeFormat":"yyyy-MM-dd HH:mm:ssX"}\ 48 | }' 49 | (http_status_code, response) = engine_client.createJob(job_config) 50 | 51 | import json 52 | print (http_status_code, json.dumps(response)) 53 | (201, '{"id": "20140515150739-00002"}') 54 | 55 | *createJob* returns a document with a single `id` field this is the `job id` of the new job. 56 | 57 | Every client call returns a tuple *(http_status_code, response)* use the *http_status_code* 58 | to determine the sucess of the operation, if the code is not one of the 2XX Success codes 59 | response will be an object containing an error message. 60 | 61 | As an example try creating a new job with an invalid configuration - this one does not define 62 | any detectors. 
63 | 64 | bad_job_config = '{\ 65 | "analysisConfig" : {"bucketSpan":3600 },\ 66 | "dataDescription" : {"fieldDelimiter":",", "timeField":"time", "timeFormat":"yyyy-MM-dd HH:mm:ssX"}\ 67 | }' 68 | 69 | engine_client = EngineApiClient(host='localhost', base_url='/engine/v0.3', port=8080) 70 | (http_status_code, response) = engine_client.createJob(bad_job_config) 71 | 72 | if http_status_code != 201: 73 | print (http_status_code, json.dumps(response)) 74 | 75 | (400, '{"errorCode": 10107, "message": "No detectors configured"}') 76 | 77 | and an informative error message reminds us to configure some detectors next time. 78 | For more information on the possible error codes see the Engine API documentation. 79 | 80 | Once we have a properly configured job we can upload data to it first let's revisit part of 81 | the configuration. 82 | 83 | "dataDescription" : {"fieldDelimiter":",", "timeField":"time", "timeFormat"="yyyy-MM-dd HH:mm:ssX"} 84 | 85 | This line specifies that our data is in a delimited format (the default), the fields are 86 | separated by ',' and there is a field 'time' containing a timestamp in the Java SimpleDateFormat 87 | 'yyyy-MM-dd HH:mm:ssX'. 88 | 89 | Here's an example of the data: 90 | > time,airline,responsetime,sourcetype 91 | > 2013-01-28 00:00:00Z,AAL,132.2046,farequote 92 | > 2013-01-28 00:00:00Z,JZA,990.4628,farequote 93 | > 2013-01-28 00:00:00Z,JBU,877.5927,farequote 94 | > 2013-01-28 00:00:00Z,KLM,1355.4812,farequote 95 | 96 | Create a job with our previously defined configuration 97 | 98 | engine_client = EngineApiClient(host='localhost', base_url='/engine/v0.3', port=8080) 99 | (http_status_code, response) = engine_client.createJob(job_config) 100 | 101 | if http_status_code == 201: 102 | job_id = response['id'] 103 | 104 | 105 | The *job_id* will be used in all future method calls 106 | 107 | Open the csv file and upload it to the Engine 108 | 109 | csv_data_file = open('data/farequote.csv', 'rb') 110 | (http_status_code, response) = engine_client.upload(job_id, csv_data_file) 111 | if http_status_code != 202: 112 | print (http_status_code, json.dumps(response)) # !error 113 | 114 | The *upload* function accepts either an open file object or a string. 
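For example, when streaming data you can send batches of records as a string
instead; note that the CSV header must be included with every upload, as the
streaming examples in this repository do (the data row below is illustrative):

    csv_batch = 'time,airline,responsetime,sourcetype\n' \
                '2013-01-28 00:05:00Z,AAL,132.2046,farequote\n'
    (http_status_code, response) = engine_client.upload(job_id, csv_batch)
    if http_status_code != 202:
        print (http_status_code, json.dumps(response)) # !error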
115 | 116 | Close the job to indicate that there is no more data to upload 117 | 118 | (http_status_code, response) = engine_client.close(job_id) 119 | if http_status_code != 202: 120 | print (http_status_code, json.dumps(response)) # !error 121 | 122 | Now get all of the result buckets using one of the clients _getBuckets_ functions and 123 | print the anomaly scores 124 | 125 | (http_status_code, response) = engine_client.getAllBuckets(job_id) 126 | if http_status_code != 200: 127 | print (http_status_code, json.dumps(response)) 128 | else: 129 | print "Date,AnomalyScore" 130 | for bucket in response: 131 | print "{0},{1}".format(bucket['timestamp'], bucket['anomalyScore']) 132 | 133 | You can also request buckets by time 134 | 135 | (http_status_code, response) = engine_client.getBucketsByDate(job_id=job_id, 136 | start_date='2012-10-22T07:00:00Z', end_date='2012-10-22T09:00:00Z') 137 | if http_status_code != 200: 138 | print (http_status_code, json.dumps(response)) 139 | else: 140 | print "Date,AnomalyScore" 141 | for bucket in response: 142 | print "{0},{1}".format(bucket['timestamp'], bucket['anomalyScore']) 143 | 144 | -------------------------------------------------------------------------------- /streamingApm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################ 3 | # # 4 | # Copyright 2014 Prelert Ltd # 5 | # # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); # 7 | # you may not use this file except in compliance with the License. # 8 | # You may obtain a copy of the License at # 9 | # # 10 | # http://www.apache.org/licenses/LICENSE-2.0 # 11 | # # 12 | # Unless required by applicable law or agreed to in writing, software # 13 | # distributed under the License is distributed on an "AS IS" BASIS, # 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 15 | # See the License for the specific language governing permissions and # 16 | # limitations under the License. # 17 | # # 18 | ############################################################################ 19 | ''' 20 | This script creates a new job and uploads to it APM data records 21 | generated from existing data in a CSV file. New records will created 22 | indefinitely or until the 'duration' argument expires. Each record has 23 | a new timestamp so this script can be used to repeatedly replay the 24 | historical data. After each upload of data the script requests any new 25 | bucket results and prints them. 26 | 27 | The script is invoked with 1 positional argument -the CSV file containing 28 | APM to use a the source of the generated data- and optional arguments 29 | to specify the location of the Engine API. Run the script with '--help' 30 | to see the options. 
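Example (replay 24 hours of generated data from network.csv to an Engine API
server on localhost):

    python streamingApm.py --duration=24 network.csv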
31 | 32 | The file used in the online example can be downloaded from 33 | http://s3.amazonaws.com/prelert_demo/network.csv 34 | 35 | If no 'duration' is set the script will run indefinitely cse Ctrl-C to 36 | stop the script - the interrupt is caught and the job closed gracefully 37 | ''' 38 | 39 | import argparse 40 | import csv 41 | import json 42 | import logging 43 | import sys 44 | import time 45 | from datetime import datetime, timedelta, tzinfo 46 | 47 | from prelert.engineApiClient import EngineApiClient 48 | 49 | # Default connection prarams 50 | HOST = 'localhost' 51 | PORT = 8080 52 | BASE_URL = 'engine/v2' 53 | 54 | ZERO_OFFSET = timedelta(0) 55 | 56 | class UtcOffset(tzinfo): 57 | ''' 58 | Timezone object at 0 (UTC) offset 59 | ''' 60 | 61 | def utcoffset(self, dt): 62 | return ZERO_OFFSET 63 | 64 | def tzname(self, dt): 65 | return "UTC" 66 | 67 | def dst(self, dt): 68 | return ZERO_OFFSET 69 | 70 | 71 | def parseArguments(): 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument("--host", help="The Prelert Engine API host, defaults to " 74 | + HOST, default=HOST) 75 | parser.add_argument("--port", help="The Prelert Engine API port, defaults to " 76 | + str(PORT), default=PORT) 77 | parser.add_argument("--duration", help="The number of hours to generate \ 78 | data for. If not set script will produce records from the historical \ 79 | start date until the time now", type=int, default=0) 80 | parser.add_argument("file", help="Path to APM data") 81 | 82 | return parser.parse_args() 83 | 84 | 85 | def generateRecords(csv_filename, start_date, interval, end_date): 86 | ''' 87 | Generator function reads csv data file and returns records 88 | with an updated timestamp on demand. 89 | 90 | Records are read from a file and stored in a local array, once 91 | all the records have been read the function does not loop 92 | round to the beginning again instead it flips and outputs 93 | the records in reverse order and so on. 
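For example, rows A, B, C read from the file are replayed as A, B, C, then
C, B, A, then A, B, C and so on, each pass generating fresh timestamps.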
94 | 95 | The csv file must contain a field with the name 'time' 96 | ''' 97 | 98 | csv_data = [] 99 | csv_file = open(csv_filename, 'rb') 100 | reader = csv.reader(csv_file) 101 | header = reader.next() 102 | 103 | time_field_idx = -1 104 | for i in range(len(header)): 105 | if header[i] == 'time': 106 | time_field_idx = i 107 | break 108 | 109 | if time_field_idx == -1: 110 | logging.error("Cannot find 'time' field in csv header") 111 | return 112 | 113 | reverse = False 114 | while start_date < end_date: 115 | try: 116 | yield header 117 | 118 | if len(csv_data) == 0: 119 | # populate csv_data record 120 | for row in reader: 121 | row[time_field_idx] = start_date.isoformat() 122 | start_date += interval 123 | 124 | csv_data.append(row) 125 | yield row 126 | 127 | if start_date > end_date: 128 | break 129 | 130 | csv_file.close() 131 | 132 | else: 133 | if reverse: 134 | for row in reversed(csv_data): 135 | row[time_field_idx] = start_date.isoformat() 136 | start_date += interval 137 | yield row 138 | 139 | if start_date > end_date: 140 | break 141 | else: 142 | for row in csv_data: 143 | row[time_field_idx] = start_date.isoformat() 144 | start_date += interval 145 | yield row 146 | 147 | if start_date > end_date: 148 | break 149 | 150 | reverse = not reverse 151 | 152 | except KeyboardInterrupt: 153 | raise StopIteration 154 | 155 | 156 | 157 | def main(): 158 | args = parseArguments() 159 | 160 | 161 | start_date = datetime(2014, 05, 18, 0, 0, 0, 0, UtcOffset()) 162 | # interval between the generated timestamps for the records 163 | interval = timedelta(seconds=300) 164 | 165 | 166 | if args.duration <= 0: 167 | end_date = datetime.now(UtcOffset()) 168 | else: 169 | duration = timedelta(hours=args.duration) 170 | end_date = start_date + duration 171 | 172 | 173 | job_config = '{\ 174 | "analysisConfig" : {\ 175 | "bucketSpan":3600,\ 176 | "detectors" :[\ 177 | {"fieldName":"In Discards","byFieldName":"host"},\ 178 | {"fieldName":"In Octets","byFieldName":"host"},\ 179 | {"fieldName":"Out Discards","byFieldName":"host"},\ 180 | {"fieldName":"Out Octets","byFieldName":"host"} \ 181 | ]\ 182 | },\ 183 | "dataDescription" : {\ 184 | "fieldDelimiter":",",\ 185 | "timeField":"time",\ 186 | "timeFormat":"yyyy-MM-dd\'T\'HH:mm:ssXXX"\ 187 | }\ 188 | }' 189 | 190 | 191 | engine_client = EngineApiClient(args.host, BASE_URL, args.port) 192 | (http_status_code, response) = engine_client.createJob(job_config) 193 | if http_status_code != 201: 194 | print (http_status_code, json.dumps(response)) 195 | return 196 | 197 | job_id = response['id'] 198 | print 'Job created with Id = ' + job_id 199 | 200 | # get the csv header (the first record generated) 201 | record_generator = generateRecords(args.file, start_date, interval, end_date) 202 | header = ','.join(next(record_generator)) 203 | header += '\n' 204 | 205 | count = 0 206 | try: 207 | # for the results 208 | next_bucket_id = 1 209 | print 210 | print "Date,Anomaly Score,Max Normalized Probablility" 211 | 212 | data = header 213 | for record in record_generator: 214 | # format as csv and append new line 215 | csv = ','.join(record) + '\n' 216 | data += csv 217 | # print data 218 | 219 | count += 1 220 | if count == 100: 221 | (http_status_code, response) = engine_client.upload(job_id, data) 222 | if http_status_code != 202: 223 | print (http_status_code, json.dumps(response)) 224 | break 225 | 226 | # get the latest results... 
227 | (http_status_code, response) = engine_client.getBucketsByDate(job_id=job_id, 228 | start_date=str(next_bucket_id), end_date=None) 229 | if http_status_code != 200: 230 | print (http_status_code, json.dumps(response)) 231 | break 232 | 233 | # and print them 234 | for bucket in response: 235 | print "{0},{1},{2}".format(bucket['timestamp'], 236 | bucket['anomalyScore'], bucket['maxNormalizedProbability']) 237 | 238 | if len(response) > 0: 239 | next_bucket_id = int(response[-1]['id']) + 1 240 | 241 | # must send the header every time 242 | data = header 243 | count = 0 244 | 245 | # sleep a little while (optional, this can be removed) 246 | #time.sleep(0.1) 247 | 248 | except KeyboardInterrupt: 249 | print "Keyboard interrupt, closing job..." 250 | 251 | (http_status_code, response) = engine_client.close(job_id) 252 | if http_status_code != 202: 253 | print (http_status_code, json.dumps(response)) 254 | 255 | 256 | if __name__ == "__main__": 257 | main() 258 | 259 | -------------------------------------------------------------------------------- /elk_connector/elk_connector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################ 3 | # # 4 | # Copyright 2014 Prelert Ltd # 5 | # # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); # 7 | # you may not use this file except in compliance with the License. # 8 | # You may obtain a copy of the License at # 9 | # # 10 | # http://www.apache.org/licenses/LICENSE-2.0 # 11 | # # 12 | # Unless required by applicable law or agreed to in writing, software # 13 | # distributed under the License is distributed on an "AS IS" BASIS, # 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 15 | # See the License for the specific language governing permissions and # 16 | # limitations under the License. # 17 | # # 18 | ############################################################################ 19 | """ 20 | This script will extract historical logstash log records from Elasticsearch 21 | and upload them to the Prelert Engine API. The program takes 22 | a number of arguments for the Engine API and Elasticsearch connection 23 | settings and optional start and end dates to limit the period being 24 | analysed. The only required argument is the path to a config file 25 | containing the Engine Job configuration and the Elasticsearch query.
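The config file has three top level objects: 'type' (the logstash document
type), 'job_config' (the Engine API job configuration) and 'search' (the
Elasticsearch query). See configs/apache-access.json for a complete example.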
26 |
27 | See:
28 |     python elk_connector.py --help
29 |
30 | Example:
31 |     Read all the data from the beginning of January 2014 and
32 |     upload it to the API server running on host 'api.server'
33 |
34 |     python elk_connector.py --start-date=2014-01-01 --api-host=api.server configs/apache-access.json
35 | """
36 |
37 | import argparse
38 | from datetime import datetime, timedelta
39 | import json
40 | import logging
41 | import os
42 | import sys
43 |
44 | import elasticsearch.exceptions
45 | from elasticsearch import Elasticsearch
46 | from prelert.engineApiClient import EngineApiClient
47 |
48 |
49 | # Elasticsearch connection settings
50 | ES_HOST = 'localhost'
51 | ES_PORT = 9200
52 |
53 | # Prelert Engine API connection params
54 | API_HOST = 'localhost'
55 | API_PORT = 8080
56 | API_BASE_URL = 'engine/v2'
57 |
58 |
59 | # The maximum number of documents to request from
60 | # Elasticsearch in each query
61 | MAX_DOC_TAKE = 5000
62 |
63 |
64 | def setupLogging():
65 |     """
66 |     Log to console
67 |     """
68 |     logging.basicConfig(level=logging.WARN,format='%(asctime)s %(levelname)s %(message)s')
69 |
70 | def parseArguments():
71 |     parser = argparse.ArgumentParser()
72 |
73 |     parser.add_argument("file",
74 |         help="Read the configuration from the specified file")
75 |     parser.add_argument("--es-host", help="The host machine Elasticsearch is \
76 |         running on, defaults to '" + ES_HOST + "'", default=ES_HOST, dest="es_host")
77 |     parser.add_argument("--es-port", help="The Elasticsearch HTTP port, defaults to "
78 |         + str(ES_PORT), default=ES_PORT, dest="es_port")
79 |     parser.add_argument("--api-host", help="The Prelert Engine API host, defaults to "
80 |         + API_HOST, default=API_HOST, dest="api_host")
81 |     parser.add_argument("--api-port", help="The Prelert Engine API port, defaults to "
82 |         + str(API_PORT), default=API_PORT, dest="api_port")
83 |     parser.add_argument("--start-date", help="Pull data from this date, if not \
84 |         set the search starts with the oldest Logstash index. Dates must be in \
85 |         YYYY-MM-DD format", default=None, dest="start_date")
86 |     parser.add_argument("--end-date", help="Pull data up to this date, if not \
87 |         set all indexes from --start-date are searched. Dates must be in \
88 |         YYYY-MM-DD format", default=None, dest="end_date")
89 |
90 |
91 |     return parser.parse_args()
92 |
93 | def elasticSearchDocsToDicts(hits):
94 |     """
95 |     Convert the Elasticsearch hits into a list of dict objects.
96 |     In this case we use the '_source' object as the desired fields
97 |     were set in the query.
98 |     """
99 |
100 |     objs = []
101 |     for hit in hits:
102 |         objs.append(hit['_source'])
103 |
104 |     return objs
105 |
106 | def nextLogStashIndex(start_date, end_date):
107 |     """
108 |     Generator method for listing all the Logstash index names
109 |     between 2 dates. The method returns when the index for
110 |     end_date is generated.
111 |
112 |     Logstash index names are in this format: 'logstash-YYYY.MM.DD'
113 |     """
114 |
115 |     yield "logstash-" + start_date.strftime("%Y.%m.%d")
116 |
117 |     one_day = timedelta(days=1)
118 |     while True:
119 |         start_date = start_date + one_day
120 |         if start_date > end_date:
121 |             break
122 |
123 |         yield "logstash-" + start_date.strftime("%Y.%m.%d")
124 |
125 |
126 | def findDateOfFirstIndex(es_client, type, query):
127 |     """
128 |     Query for 1 document from all indices (the query should be sorted
129 |     in time order); the index the document belongs to is the start index.
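
    For example (illustrative, not from the original code): if the oldest
    matching document lives in the index 'logstash-2014.01.05' then this
    function returns datetime(2014, 1, 5).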
130 | 131 | Returns the date of the first index or None if no documents are found 132 | """ 133 | 134 | hits = es_client.search(index="_all", doc_type=type, 135 | body=query, from_=0, size=1) 136 | 137 | if len(hits['hits']['hits']) > 0: 138 | date_str = hits['hits']['hits'][0]['_index'].lstrip("logstash-") 139 | 140 | return datetime.strptime(date_str, "%Y.%m.%d") 141 | else: 142 | return None 143 | 144 | 145 | 146 | def main(): 147 | 148 | setupLogging() 149 | args = parseArguments() 150 | 151 | # read the config file 152 | try: 153 | with open(args.file, "r") as config_file: 154 | config = json.load(config_file) 155 | except IOError: 156 | print "Error opening file " + args.file 157 | return 158 | 159 | 160 | # default start date is None meaning 'all time' 161 | start_date = None 162 | if args.start_date != None: 163 | start_date = datetime.strptime(args.start_date, "%Y-%m-%d") 164 | 165 | # default end date is today 166 | end_date = datetime.today() 167 | if args.end_date != None: 168 | end_date = datetime.strptime(args.end_date, "%Y-%m-%d") 169 | 170 | 171 | # The ElasticSearch client 172 | es_client = Elasticsearch(args.es_host + ":" + str(args.es_port)) 173 | 174 | data_type = config['type'] 175 | search_body = json.dumps(config['search']) 176 | 177 | # If no start date find the first logstash index containing our docs 178 | if start_date == None: 179 | start_date = findDateOfFirstIndex(es_client, data_type, search_body) 180 | if start_date == None: 181 | print "No documents found with the query " + search_body 182 | return 183 | 184 | # The REST API client 185 | engine_client = EngineApiClient(args.api_host, API_BASE_URL, args.api_port) 186 | (http_status, response) = engine_client.createJob(json.dumps(config['job_config'])) 187 | if http_status != 201: 188 | print "Error creatting job" 189 | print http_status, json.dumps(response) 190 | return 191 | 192 | 193 | job_id = response['id'] 194 | print "Created job with id " + str(job_id) 195 | 196 | doc_count = 0 197 | for index_name in nextLogStashIndex(start_date, end_date): 198 | 199 | print "Reading from index " + index_name 200 | 201 | skip = 0 202 | try: 203 | # Query the documents from ElasticSearch and write to the Engine 204 | hits = es_client.search(index=index_name, doc_type=data_type, 205 | body=search_body, from_=skip, size=MAX_DOC_TAKE) 206 | except elasticsearch.exceptions.NotFoundError: 207 | # Index not found try the next one 208 | continue 209 | 210 | # upload to the API 211 | content = json.dumps(elasticSearchDocsToDicts(hits['hits']['hits'])) 212 | (http_status, response) = engine_client.upload(job_id, content) 213 | if http_status != 202: 214 | print "Error uploading log content to the Engine" 215 | print http_status, json.dumps(response) 216 | continue 217 | 218 | doc_count += len(hits['hits']['hits']) 219 | 220 | # get any other docs 221 | hitcount = int(hits['hits']['total']) 222 | while hitcount > (skip + MAX_DOC_TAKE): 223 | skip += MAX_DOC_TAKE 224 | hits = es_client.search(index=index_name, doc_type=data_type, 225 | body=search_body, from_=skip, size=MAX_DOC_TAKE) 226 | 227 | content = json.dumps(elasticSearchDocsToDicts(hits['hits']['hits'])) 228 | (http_status, response) = engine_client.upload(job_id, content) 229 | if http_status != 202: 230 | print json.dumps(response) 231 | continue 232 | 233 | doc_count += len(hits['hits']['hits']) 234 | 235 | 236 | print "Uploaded {0} records".format(str(doc_count)) 237 | 238 | (http_status, response) = engine_client.close(job_id) 239 | if http_status != 202: 240 | 
print "Error closing job" 241 | print http_status, json.dumps(response) 242 | return 243 | print "{0} records successfully written to job {1}".format(str(doc_count), job_id) 244 | 245 | 246 | if __name__ == "__main__": 247 | main() 248 | 249 | -------------------------------------------------------------------------------- /elk_connector/elk_connector_realtime.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################ 3 | # # 4 | # Copyright 2014 Prelert Ltd # 5 | # # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); # 7 | # you may not use this file except in compliance with the License. # 8 | # You may obtain a copy of the License at # 9 | # # 10 | # http://www.apache.org/licenses/LICENSE-2.0 # 11 | # # 12 | # Unless required by applicable law or agreed to in writing, software # 13 | # distributed under the License is distributed on an "AS IS" BASIS, # 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 15 | # See the License for the specific language governing permissions and # 16 | # limitations under the License. # 17 | # # 18 | ############################################################################ 19 | """ 20 | This script reads log records from logstash indexes in elasticsearch 21 | and uploads them to the Prelert Engine. Logs are read in real-time 22 | by default the last 60 seconds of logs are read every 60 seconds, this 23 | can be changed by setting the '--update-interval' argument. 24 | 25 | The program takes a number of optional arguments for the Engine 26 | API and elasticsearch connection settings the only required argument 27 | is the path to a config file containing the Engine Job configuration 28 | and the elasticsearch query. If a job id is provided then the logs 29 | are sent to that job else a new job is created. 30 | 31 | The script attempts to add a date range filter for the real-time date 32 | arguments to the elasticsearch query defined in the config file, if it 33 | cannot because 'filter' and 'post_filter' are already defined then 34 | it raises an error. 35 | 36 | The program will indefinitely, interrupt it with Ctrl C and the 37 | script will close the API analytics Job and exit gracefully. 
38 |
39 | See:
40 |     python elk_connector_realtime.py --help
41 |
42 | Example:
43 |     python elk_connector_realtime.py --es-host=elasticsearchserver \
44 |         --api-host=prelertserver --job-id=jobid configs/syslog.json
45 | """
46 |
47 | import argparse
48 | from datetime import datetime, timedelta, tzinfo
49 | import json
50 | import logging
51 | import os
52 | import sys
53 | import time
54 |
55 | import elasticsearch.exceptions
56 | from elasticsearch import Elasticsearch
57 | from prelert.engineApiClient import EngineApiClient
58 |
59 |
60 | # Elasticsearch connection settings
61 | ES_HOST = 'localhost'
62 | ES_PORT = 9200
63 |
64 | # Prelert Engine API connection params
65 | API_HOST = 'localhost'
66 | API_PORT = 8080
67 | API_BASE_URL = 'engine/v2'
68 |
69 | # The maximum number of documents to request from
70 | # Elasticsearch in each query
71 | MAX_DOC_TAKE = 5000
72 |
73 | # The update interval in seconds;
74 | # elasticsearch is queried with this periodicity
75 | UPDATE_INTERVAL = 60
76 |
77 |
78 | class UTC(tzinfo):
79 |     """
80 |     UTC timezone class
81 |     """
82 |
83 |     def utcoffset(self, dt):
84 |         return timedelta(0)
85 |
86 |     def tzname(self, dt):
87 |         return "UTC"
88 |
89 |     def dst(self, dt):
90 |         return timedelta(0)
91 |
92 |
93 | def setupLogging():
94 |     """
95 |     Log to console
96 |     """
97 |     logging.basicConfig(level=logging.WARN,format='%(asctime)s %(levelname)s %(message)s')
98 |
99 | def parseArguments():
100 |     parser = argparse.ArgumentParser()
101 |
102 |     parser.add_argument("file",
103 |         help="Read the configuration from the specified file")
104 |     parser.add_argument("--es-host", help="The host machine Elasticsearch is \
105 |         running on, defaults to '" + ES_HOST + "'", default=ES_HOST, dest="es_host")
106 |     parser.add_argument("--es-port", help="The Elasticsearch HTTP port, defaults to "
107 |         + str(ES_PORT), default=ES_PORT, dest="es_port")
108 |     parser.add_argument("--api-host", help="The Prelert Engine API host, defaults to "
109 |         + API_HOST, default=API_HOST, dest="api_host")
110 |     parser.add_argument("--api-port", help="The Prelert Engine API port, defaults to "
111 |         + str(API_PORT), default=API_PORT, dest="api_port")
112 |     parser.add_argument("--job-id", help="Send data to this job. If not set a \
113 |         new job will be created.", default=None, dest="job_id")
114 |     parser.add_argument("--update-interval", help="The period between \
115 |         each cycle of querying and uploading data", type=int,
116 |         default=UPDATE_INTERVAL, dest="update_interval")
117 |
118 |
119 |     return parser.parse_args()
120 |
121 | def elasticSearchDocsToDicts(hits):
122 |     """
123 |     Convert the Elasticsearch hits into a list of dict objects.
124 |     In this case we use the '_source' object as the desired fields
125 |     were set in the query.
126 |     """
127 |
128 |     objs = []
129 |     for hit in hits:
130 |         objs.append(hit['_source'])
131 |
132 |     return objs
133 |
134 | def logstashIndex(date):
135 |     """
136 |     Return the logstash index name for the given date
137 |
138 |     Logstash index names are in the format: 'logstash-YYYY.MM.DD'
139 |     """
140 |
141 |     return "logstash-" + date.strftime("%Y.%m.%d")
142 |
143 |
144 | def insertDateRangeFilter(query):
145 |     """
146 |     Add a date range filter on the '@timestamp' field either as
147 |     a 'filter' or 'post_filter'. If both 'filter' and 'post_filter'
148 |     are already defined then a RuntimeError is raised because the
149 |     date filter cannot be inserted into the query.
150 |
151 |     The date range filter will look like either
152 |
153 |         "filter" : {"range" : { "@timestamp" : { "gte" : "start-date",
154 |             "lt" : "end-date"} } }
155 |     or
156 |
157 |         "post_filter" : {"range" : { "@timestamp" : { "gte" : "start-date",
158 |             "lt" : "end-date"} } }
159 |
160 |     where 'start-date' and 'end-date' literals will be replaced by
161 |     the actual timestamps in the query.
162 |     """
163 |
164 |     dates = {'gte' : 'start-date', 'lt' : 'end-date'}
165 |     timestamp = {'@timestamp' : dates}
166 |     range_ = {'range' : timestamp}
167 |
168 |     if not 'filter' in query:
169 |         query['filter'] = range_
170 |     elif not 'post_filter' in query:
171 |         query['post_filter'] = range_
172 |     else:
173 |         raise RuntimeError("Cannot add a 'filter' or 'post_filter' \
174 |             date range to the query")
175 |
176 |     return query
177 |
178 |
179 | def replaceDateArgs(query, query_start_time, query_end_time):
180 |     """
181 |     Replace the date arguments in the range filter of the query.
182 |     """
183 |     # update whichever of 'filter' or 'post_filter' holds the '@timestamp'
184 |     # range that insertDateRangeFilter() added
185 |     if '@timestamp' in query.get('filter', {}).get('range', {}):
186 |         query['filter']['range']['@timestamp']['gte'] = query_start_time.isoformat()
187 |         query['filter']['range']['@timestamp']['lt'] = query_end_time.isoformat()
188 |     else:
189 |         query['post_filter']['range']['@timestamp']['gte'] = query_start_time.isoformat()
190 |         query['post_filter']['range']['@timestamp']['lt'] = query_end_time.isoformat()
191 |
192 |     return query
193 |
194 | def main():
195 |
196 |     setupLogging()
197 |     args = parseArguments()
198 |
199 |     # read the config file
200 |     try:
201 |         with open(args.file, "r") as config_file:
202 |             config = json.load(config_file)
203 |     except IOError:
204 |         print "Error opening file " + args.file
205 |         return
206 |
207 |
208 |     # The ElasticSearch client
209 |     es_client = Elasticsearch(args.es_host + ":" + str(args.es_port))
210 |
211 |     # The REST API client
212 |     engine_client = EngineApiClient(args.api_host, API_BASE_URL, args.api_port)
213 |
214 |     job_id = args.job_id
215 |     if job_id == None:
216 |         (http_status, response) = engine_client.createJob(json.dumps(config['job_config']))
217 |         job_id = response['id']
218 |         print "Created job with id " + str(job_id)
219 |
220 |     print "Using job id " + job_id
221 |
222 |     data_type = config['type']
223 |     raw_query = insertDateRangeFilter(config['search'])
224 |
225 |
226 |     timezone = UTC()
227 |     doc_count = 0
228 |     try:
229 |         query_end_time = datetime.now(timezone) - timedelta(seconds=args.update_interval)
230 |         while True:
231 |             query_start_time = query_end_time
232 |             query_end_time = datetime.now(timezone)
233 |             query_str = json.dumps(replaceDateArgs(raw_query, query_start_time,
234 |                 query_end_time))
235 |             index_name = logstashIndex(query_start_time)
236 |
237 |             skip = 0
238 |             try:
239 |                 # Query the documents from ElasticSearch and write to the Engine
240 |                 hits = es_client.search(index=index_name, doc_type=data_type,
241 |                     body=query_str, from_=skip, size=MAX_DOC_TAKE)
242 |             except elasticsearch.exceptions.NotFoundError:
243 |                 print "Error: missing logstash index '" + index_name + "'"
244 |                 # carry on with an empty result set rather than reusing stale results
245 |                 hits = {'hits' : {'hits' : [], 'total' : 0}}
246 |             # upload to the API
247 |             content = json.dumps(elasticSearchDocsToDicts(hits['hits']['hits']))
248 |
249 |             (http_status, response) = engine_client.upload(job_id, content)
250 |             if http_status != 202:
251 |                 print "Error uploading log content to the Engine"
252 |                 print http_status, json.dumps(response)
253 |
254 |
255 |             doc_count += len(hits['hits']['hits'])
256 |
257 |             # get any other docs
258 |             hitcount = int(hits['hits']['total'])
259 |             while hitcount > (skip + MAX_DOC_TAKE):
260
| skip += MAX_DOC_TAKE 261 | hits = es_client.search(index=index_name, doc_type=data_type, 262 | body=query_str, from_=skip, size=MAX_DOC_TAKE) 263 | 264 | content = json.dumps(elasticSearchDocsToDicts(hits['hits']['hits'])) 265 | 266 | (http_status, response) = engine_client.upload(job_id, content) 267 | if http_status != 202: 268 | print "Error uploading log content to the Engine" 269 | print json.dumps(response) 270 | 271 | 272 | doc_count += len(hits['hits']['hits']) 273 | 274 | print "Uploaded {0} records".format(str(doc_count)) 275 | 276 | duration = datetime.now(timezone) - query_end_time 277 | sleep_time = max(args.update_interval - duration.seconds, 0) 278 | print "sleeping for " + str(sleep_time) + " seconds" 279 | 280 | if sleep_time > 0.0: 281 | time.sleep(sleep_time) 282 | 283 | 284 | except KeyboardInterrupt: 285 | print "Interrupt caught closing job..." 286 | 287 | 288 | 289 | engine_client.close(job_id) 290 | 291 | 292 | if __name__ == "__main__": 293 | main() 294 | 295 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /cloudwatch/cloudWatchMetrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ############################################################################ 3 | # # 4 | # Copyright 2014 Prelert Ltd # 5 | # # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); # 7 | # you may not use this file except in compliance with the License. 
# 8 | # You may obtain a copy of the License at                                  #
9 | #                                                                          #
10 | #    http://www.apache.org/licenses/LICENSE-2.0                            #
11 | #                                                                          #
12 | # Unless required by applicable law or agreed to in writing, software      #
13 | # distributed under the License is distributed on an "AS IS" BASIS,        #
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15 | # See the License for the specific language governing permissions and      #
16 | # limitations under the License.                                           #
17 | #                                                                          #
18 | ############################################################################
19 |
20 | '''
21 | Script to pull metric data from AWS CloudWatch and analyze it in
22 | the Prelert Engine API. There are 2 modes of operation: historical,
23 | where stored metric data is extracted between 2 dates, and a continuous
24 | realtime mode where the preceding few minutes of data is queried in
25 | a loop.
26 |
27 | The path to a configuration file containing the AWS connection parameters
28 | must be passed to the script; the file should have the following properties:
29 |
30 | region=REGION
31 | aws_access_key_id=YOUR_ACCESS_ID
32 | aws_secret_access_key=YOUR_SECRET_KEY
33 |
34 | Where REGION is one of us-east-1, eu-west-1, etc.
35 |
36 | If the --start-date parameter is set then this will query historical data
37 | from CloudWatch until --end-date, or the current time if --end-date is not
38 | set. Otherwise the script will run in an infinite loop pulling realtime
39 | data; use Ctrl-C to quit the realtime mode as the script will catch
40 | the interrupt and handle the exit gracefully.
41 |
42 | To create a job with a specific ID use the --job-id argument. If the
43 | job already exists data will be sent to that job, otherwise a new job
44 | with the ID is created. If no job ID is specified one will be automatically
45 | generated by the API.
46 |
47 | Only EC2 metrics are monitored and only those belonging to an instance.
48 | Aggregated metrics by instance type and AMI metrics are ignored.
49 |
50 | Usage:
51 |     python cloudWatchMetrics.py awskey.conf
52 |
53 |     python cloudWatchMetrics.py --job-id=cloudwatch --start-date=2014-10-01 awskey.conf
54 | '''
55 |
56 | import argparse
57 | import ConfigParser
58 | from datetime import datetime, timedelta, tzinfo
59 | import json
60 | import StringIO
61 | import time
62 |
63 | import boto.ec2
64 | import boto.ec2.cloudwatch
65 | from boto.exception import BotoServerError
66 |
67 | from prelert.engineApiClient import EngineApiClient
68 |
69 |
70 | # Prelert Engine API default connection params
71 | API_HOST = 'localhost'
72 | API_PORT = 8080
73 | API_BASE_URL = 'engine/v2'
74 |
75 | ''' Interval between queries for new data from CloudWatch (seconds)'''
76 | UPDATE_INTERVAL=300
77 |
78 | ''' Interval between data points that are being fetched from CloudWatch (seconds)'''
79 | REPORTING_INTERVAL=60
80 |
81 | ''' In realtime mode run this many seconds behind realtime '''
82 | DELAY=600
83 |
84 | '''
85 | CloudWatch imposes a limit to the number of data points a query can return.
86 | The limit is currently 1440, allowing e.g. a daily query with a reporting interval
87 | of one minute (a day has 1440 minutes).
88 | '''
89 | MAX_DATAPOINTS_PER_QUERY = 1440
90 |
91 | '''
92 | Prelert Engine job configuration.
93 | Multiple detectors are configured, one for each metric, analyzed by instance id.
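
Illustrative example (not taken from the original code; the instance id and
values are hypothetical) of one of the JSON records that transposeMetrics()
below produces for this configuration:

    {"timestamp": "2014-10-01T00:05:00+00:00", "instance": "i-0123abcd",
     "CPUUtilization": 12.5, "NetworkIn": 10240.0, "NetworkOut": 2048.0}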
94 | ''' 95 | JOB_CONFIG = '{ %s\ 96 | "analysisConfig" : {\ 97 | "bucketSpan":' + str(UPDATE_INTERVAL) + ',\ 98 | "detectors" :[\ 99 | {"function":"mean", "fieldName":"DiskReadOps", "byFieldName":"instance"},\ 100 | {"function":"mean", "fieldName":"DiskReadBytes", "byFieldName":"instance"},\ 101 | {"function":"mean", "fieldName":"DiskWriteOps", "byFieldName":"instance"},\ 102 | {"function":"mean", "fieldName":"DiskWriteBytes", "byFieldName":"instance"},\ 103 | {"function":"mean", "fieldName":"NetworkIn", "byFieldName":"instance"},\ 104 | {"function":"mean", "fieldName":"NetworkOut", "byFieldName":"instance"},\ 105 | {"function":"mean", "fieldName":"CPUUtilization", "byFieldName":"instance"},\ 106 | {"function":"mean", "fieldName":"StatusCheckFailed", "byFieldName":"instance"},\ 107 | {"function":"mean", "fieldName":"StatusCheckFailed_Instance", "byFieldName":"instance"},\ 108 | {"function":"mean", "fieldName":"StatusCheckFailed_System", "byFieldName":"instance"}\ 109 | ]\ 110 | },\ 111 | "dataDescription" : {"format":"JSON","timeField":"timestamp","timeFormat":"yyyy-MM-dd\'T\'HH:mm:ssX"\ 112 | }\ 113 | }' 114 | 115 | 116 | 117 | class MetricRecord: 118 | ''' 119 | Simple holder class for the CloudWatch metrics. 120 | toJsonStr returns the metric in a format for the job 121 | configuration above. 122 | ''' 123 | def __init__(self, timestamp, instance, metric_name, metric_value): 124 | self.timestamp = timestamp 125 | self.instance = instance 126 | self.metric_name = metric_name 127 | self.metric_value = metric_value 128 | 129 | def toJsonStr(self): 130 | result = '{"timestamp":"' + self.timestamp.isoformat() + \ 131 | '", "instance":"' + self.instance + '", "metric_name":"' + \ 132 | self.metric_name + '", "Average":' + str(self.metric_value) + '}' 133 | 134 | return result 135 | 136 | 137 | class UTC(tzinfo): 138 | ''' UTC timezone class ''' 139 | def utcoffset(self, dt): 140 | return timedelta(0) 141 | 142 | def tzname(self, dt): 143 | return "UTC" 144 | 145 | def dst(self, dt): 146 | return timedelta(0) 147 | 148 | def replaceTimezoneWithUtc(date): 149 | return date.replace(tzinfo=UTC()) 150 | 151 | 152 | def parseArguments(): 153 | parser = argparse.ArgumentParser() 154 | 155 | parser.add_argument("config", help="The AWS connection parameters.") 156 | 157 | parser.add_argument("--api-host", help="The Prelert Engine API host, defaults to " 158 | + API_HOST, default=API_HOST, dest="api_host") 159 | parser.add_argument("--api-port", help="The Prelert Engine API port, defaults to " 160 | + str(API_PORT), default=API_PORT, dest="api_port") 161 | 162 | parser.add_argument("--job-id", help="Send data to this job. If not set a \ 163 | new job will be created.", default=None, dest="job_id") 164 | 165 | parser.add_argument("--start-date", help="Request data from this date. If not \ 166 | set then run in realtime mode. Dates must be in YYYY-MM-DD format", 167 | default=None, dest="start_date") 168 | parser.add_argument("--end-date", help="if --start-date is set then pull \ 169 | and analyze only the metric data between those dates. \ 170 | If --start-date is not set this argument has no meaning. \ 171 | Dates must be in YYYY-MM-DD format", 172 | default=None, dest="end_date") 173 | 174 | return parser.parse_args() 175 | 176 | def calculateIntervalBetweenQueries(reporting_interval): 177 | ''' 178 | For querying historic data, we can improve the performance by 179 | minimising the number of queries we fire against CloudWatch. 
180 | CloudWatch allows a query spanning a day given the reporting 181 | interval is a minute. Thus, we return the product of a the 182 | max number of data points and the reporting interval in seconds 183 | ''' 184 | return timedelta(seconds = MAX_DATAPOINTS_PER_QUERY * reporting_interval) 185 | 186 | 187 | def queryMetricRecords(metrics, start, end, reporting_interval): 188 | ''' 189 | Return the metrics sorted by date. 190 | The Average statistic is always taken 191 | ''' 192 | metric_records = [] 193 | for m in metrics: 194 | if 'InstanceId' not in m.dimensions: 195 | continue 196 | instance = m.dimensions['InstanceId'][0] 197 | 198 | datapoints = m.query(start, end, 'Average', period=reporting_interval) 199 | for dp in datapoints: 200 | # annoyingly Boto does not return datetimes with a timezone 201 | utc_time = replaceTimezoneWithUtc(dp['Timestamp']) 202 | mr = MetricRecord(utc_time, instance, m.name, dp['Average']) 203 | metric_records.append(mr) 204 | 205 | 206 | metric_records.sort(key=lambda r : r.timestamp) 207 | return metric_records 208 | 209 | 210 | def transposeMetrics(metrics): 211 | ''' 212 | Convert a list of metrics of the form 213 | {time_1, instance, metric_A, Average}, 214 | {time_1, instance, metric_B, Average}, 215 | {time_1, instance, metric_C, Average} 216 | {time_2, instance, metric_A, Average} 217 | ... 218 | 219 | To a single record so that there is 1 record for each time period. 220 | {time_1, instance, metric_A, metric_B, metric_C} 221 | {time_2, instance, metric_A, metric_B, metric_C} 222 | 223 | The input list must be ordered by timestamp 224 | ''' 225 | 226 | tranposed_metrics = [] 227 | current_record = None; 228 | current_time = datetime.fromtimestamp(0, UTC()); 229 | 230 | for metric in metrics: 231 | 232 | if current_time < metric.timestamp: 233 | current_time = metric.timestamp 234 | 235 | if current_record != None: 236 | tranposed_metrics.append(current_record) 237 | current_record = dict() 238 | current_record['timestamp'] = metric.timestamp.isoformat() 239 | current_record['instance'] = metric.instance 240 | 241 | current_record[metric.metric_name] = metric.metric_value 242 | 243 | return tranposed_metrics 244 | 245 | 246 | def runHistorical(job_id, start_date, end_date, cloudwatch_conn, engine_client): 247 | ''' 248 | Query and analyze the CloudWatch metrics from start_date to end_date. 249 | If end_date == None then run until the time now. 
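
    Sketch of the windowing (not from the original code): with the defaults
    above, calculateIntervalBetweenQueries(REPORTING_INTERVAL) returns a one
    day delta (1440 data points x 60 seconds = 86400 seconds), so a ten day
    historical range is processed in ten one-day [start, end) windows.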
250 | ''' 251 | 252 | end = start_date 253 | delta = calculateIntervalBetweenQueries(REPORTING_INTERVAL) 254 | 255 | while True: 256 | 257 | start = end 258 | end = start + delta 259 | if (end > end_date): 260 | end = end_date 261 | 262 | if start == end: 263 | break 264 | 265 | print "Querying metrics starting at time " + str(start.isoformat()) 266 | 267 | try: 268 | metrics = cloudwatch_conn.list_metrics(namespace='AWS/EC2') 269 | metric_records = queryMetricRecords(metrics, start, end, reporting_interval = REPORTING_INTERVAL) 270 | 271 | tranposed_metrics = transposeMetrics(metric_records) 272 | 273 | data = '' 274 | for met in tranposed_metrics: 275 | json_str = json.dumps(met) 276 | data += json_str + '\n' 277 | 278 | (http_status, response) = engine_client.upload(job_id, data) 279 | if http_status != 202: 280 | print "Error uploading metric data to the Engine" 281 | print http_status, json.dumps(response) 282 | 283 | except BotoServerError as error: 284 | print "Error querying CloudWatch" 285 | print error 286 | 287 | 288 | def runRealtime(job_id, cloudwatch_conn, engine_client): 289 | ''' 290 | Query the previous 5 minutes of metric data every 5 minutes 291 | then upload to the Prelert Engine. 292 | 293 | This function runs in an infinite loop but will catch the 294 | keyboard interrupt (Ctrl C) and exit gracefully 295 | ''' 296 | try: 297 | delay = timedelta(seconds=DELAY) 298 | end = datetime.utcnow() - delay - timedelta(seconds=UPDATE_INTERVAL) 299 | end = replaceTimezoneWithUtc(end) 300 | 301 | while True: 302 | 303 | start = end 304 | end = datetime.utcnow() - delay 305 | end = replaceTimezoneWithUtc(end) 306 | 307 | print "Querying metrics from " + str(start.isoformat()) + " to " + end.isoformat() 308 | 309 | try: 310 | metrics = cloudwatch_conn.list_metrics(namespace='AWS/EC2') 311 | metric_records = queryMetricRecords(metrics, start, end, reporting_interval = REPORTING_INTERVAL) 312 | tranposed_metrics = transposeMetrics(metric_records) 313 | 314 | data = '' 315 | for met in tranposed_metrics: 316 | json_str = json.dumps(met) 317 | data += json_str + '\n' 318 | 319 | 320 | (http_status, response) = engine_client.upload(job_id, data) 321 | if http_status != 202: 322 | print "Error uploading metric data to the Engine" 323 | print http_status, json.dumps(response) 324 | 325 | except BotoServerError as error: 326 | print "Error querying CloudWatch" 327 | print error 328 | 329 | now = datetime.utcnow() 330 | now = replaceTimezoneWithUtc(now) 331 | duration = now - delay - end 332 | sleep_time = max(UPDATE_INTERVAL - duration.seconds, 0) 333 | print "sleeping for " + str(sleep_time) + " seconds" 334 | if sleep_time > 0: 335 | time.sleep(sleep_time) 336 | 337 | except KeyboardInterrupt: 338 | print "Interrupt caught... terminating real time queries" 339 | return 340 | 341 | 342 | def createJob(job_id, client): 343 | ''' 344 | Create the job. If job_id == None then create the job with 345 | a default Id else use job_id. If the job already exists 346 | return job_id and continue. 347 | 348 | Returns the created job_id or None if the job could not 349 | be created. 
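
    Illustrative example (not from the original code; 'cloudwatch' is just an
    example id) of the two ways the JOB_CONFIG template above is filled in by
    this function:

        JOB_CONFIG % ''                          # let the API pick the job id
        JOB_CONFIG % '"id" : "cloudwatch",'      # request the id 'cloudwatch'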
350 | ''' 351 | 352 | # if no job id create a new job 353 | if job_id == None: 354 | # no job id in the config 355 | config = JOB_CONFIG % '' 356 | (http_status, response) = client.createJob(config) 357 | if http_status != 201: 358 | print "Error creating job" 359 | print response 360 | return None 361 | 362 | job_id = response['id'] 363 | print "Created job with automatic ID " + job_id 364 | else: 365 | (http_status, response) = client.getJob(job_id) 366 | if http_status == 404: 367 | # no job id in the config 368 | config = JOB_CONFIG % ('"id" : "' + job_id + '",') 369 | (http_status, response) = client.createJob(config) 370 | if http_status != 201: 371 | print "Error creating job" 372 | print response 373 | return None 374 | 375 | job_id = response['id'] 376 | print "Created job with ID " + job_id 377 | else: 378 | print "Using job with ID " + job_id 379 | 380 | return job_id 381 | 382 | 383 | def main(): 384 | args = parseArguments() 385 | 386 | # read the config file 387 | config = ConfigParser.RawConfigParser() 388 | try: 389 | # insert a section header into the config so 390 | # ConfigParser will read it without complaint 391 | with open(args.config, "r") as config_file: 392 | ini_str = '[root]\n' + config_file.read() 393 | ini_fp = StringIO.StringIO(ini_str) 394 | config.readfp(ini_fp) 395 | except IOError: 396 | print "Error opening file " + args.config 397 | return 398 | 399 | 400 | try: 401 | region = config.get('root', 'region') 402 | access_id = config.get('root', 'aws_access_key_id') 403 | secret_key = config.get('root', 'aws_secret_access_key') 404 | except ConfigParser.NoOptionError as e: 405 | print e 406 | return 407 | 408 | 409 | # AWS CloudWatch connection 410 | cloudwatch_conn = boto.ec2.cloudwatch.connect_to_region(region, 411 | aws_access_key_id=access_id, 412 | aws_secret_access_key=secret_key) 413 | 414 | if cloudwatch_conn == None: 415 | print "Error unknown region " + region 416 | return 417 | 418 | # The Prelert REST API client 419 | engine_client = EngineApiClient(args.api_host, API_BASE_URL, args.api_port) 420 | 421 | # If no job ID is supplied create a new job 422 | job_id = createJob(args.job_id, engine_client) 423 | if job_id == None: 424 | return 425 | 426 | # default start date is None meaning run realtime 427 | start_date = None 428 | if args.start_date != None: 429 | start_date = datetime.strptime(args.start_date, "%Y-%m-%d") 430 | start_date = replaceTimezoneWithUtc(start_date) 431 | 432 | if start_date == None: 433 | runRealtime(job_id, cloudwatch_conn, engine_client) 434 | else: 435 | # historical mode, check for an end date 436 | end_date = replaceTimezoneWithUtc(datetime.utcnow()) 437 | if args.end_date != None: 438 | end_date = datetime.strptime(args.end_date, "%Y-%m-%d") 439 | end_date = replaceTimezoneWithUtc(end_date) 440 | 441 | runHistorical(job_id, start_date, end_date, cloudwatch_conn, engine_client) 442 | 443 | 444 | print "Closing job..." 445 | engine_client.close(job_id) 446 | 447 | if __name__ == "__main__": 448 | main() 449 | -------------------------------------------------------------------------------- /prelert/engineApiClient/EngineApiClient.py: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | # # 3 | # Copyright 2015-2016 Prelert Ltd # 4 | # # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); # 6 | # you may not use this file except in compliance with the License. 
# 7 | # You may obtain a copy of the License at # 8 | # # 9 | # http://www.apache.org/licenses/LICENSE-2.0 # 10 | # # 11 | # Unless required by applicable law or agreed to in writing, software # 12 | # distributed under the License is distributed on an "AS IS" BASIS, # 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 14 | # See the License for the specific language governing permissions and # 15 | # limitations under the License. # 16 | # # 17 | ############################################################################ 18 | """ 19 | A simple HTTP client to the Prelert Engine REST API 20 | """ 21 | 22 | import httplib 23 | import urllib 24 | import json 25 | import logging 26 | 27 | class EngineApiClient: 28 | 29 | 30 | def __init__(self, host, base_url, port=8080): 31 | """ 32 | Create a HTTP connection to host:port 33 | host is the host machine 34 | base_url is the API URl this should contain the version number 35 | e.g. /engine/v2 36 | The default port is 8080 37 | """ 38 | self.host = host 39 | 40 | # ensure the base url starts with "/" 41 | if not base_url.startswith("/"): 42 | base_url = "/" + base_url 43 | 44 | logging.info("Connecting to Engine REST API at {0}:{1}{2}".format(host, 45 | port, base_url)) 46 | self.base_url = base_url 47 | self.connection = httplib.HTTPConnection(host, port) 48 | 49 | 50 | def getJob(self, job_id): 51 | """ 52 | Get the job for id. 53 | Returns a (http_status_code, response) tuple, if http_status_code != 200 54 | response is an error message 55 | """ 56 | 57 | return self._get(self.base_url + "/jobs/" + job_id, "job") 58 | 59 | def getJobs(self, skip=0, take=100): 60 | ''' 61 | Get the first page of jobs in the system. 62 | Defaults to the first 100 jobs use the skip and take parameters 63 | to get further pages. 64 | skip the first N jobs 65 | take a maxium of this number of jobs 66 | Returns a (http_status_code, response) tuple, if http_status_code != 200 67 | response is an error message 68 | ''' 69 | 70 | url = self.base_url + "/jobs?skip={0}&take={1}".format(skip, take) 71 | return self._get(url, "jobs") 72 | 73 | def createJob(self, payload): 74 | """ 75 | Create a new job. Payload is the Json format job creation string. 76 | Returns a (http_status_code, json) tuple. If http_status_code == 201 77 | the JSON result doc will have an 'id' field set to the newly created 78 | job id else json will be an error document. 79 | """ 80 | 81 | url = self.base_url + "/jobs" 82 | headers = {'Content-Type':'application/json'} 83 | 84 | return self._post(url, "Create job", headers, payload) 85 | 86 | def updateJob(self, job_id, payload): 87 | """ 88 | Updates a job with according to the given payload. 89 | 90 | :param job_id: the job id 91 | :param payload: The JSON payload. See Prelert Engine API docs for help. 92 | :return: (http_status_code, acknowledgement) tuple if it was successful, 93 | or (http_status_code, error_doc) if http_status_code != 200 94 | """ 95 | headers = {'Content-Type': 'application/json'} 96 | url = self.base_url + "/jobs/{0}/update".format(job_id) 97 | return self._put(url, 'Update job', headers=headers, payload=payload) 98 | 99 | def pauseJob(self, job_id): 100 | """ 101 | Pauses a job. 
102 | 103 | :param job_id: the job id 104 | :return: (http_status_code, acknowledgement) tuple if it was successful, 105 | or (http_status_code, error_doc) if http_status_code != 200 106 | """ 107 | url = self.base_url + "/jobs/{0}/pause".format(job_id) 108 | return self._post(url, 'Pause job', headers={}, payload=None) 109 | 110 | def resumeJob(self, job_id): 111 | """ 112 | Resumes a job. 113 | 114 | :param job_id: the job id 115 | :return: (http_status_code, acknowledgement) tuple if it was successful, 116 | or (http_status_code, error_doc) if http_status_code != 200 117 | """ 118 | url = self.base_url + "/jobs/{0}/resume".format(job_id) 119 | return self._post(url, 'Resume job', headers={}, payload=None) 120 | 121 | def upload(self, job_id, data, gzipped=False, store=False): 122 | """ 123 | Upload data to the jobs data endpoint. 124 | Data can be a string or an open file object. 125 | If the data is gzipped compressed set gzipped to True 126 | 127 | Returns a (http_status_code, response_data) tuple, if 128 | http_status_code != 202 response_data is an error message. 129 | """ 130 | endpoint = 'dataload' if store else 'data' 131 | (status, data) = self._uploadToEndpoint(job_id, data, endpoint, gzipped) 132 | 133 | if data: 134 | doc = json.loads(data) 135 | else: 136 | doc = dict() 137 | 138 | return (status, doc) 139 | 140 | 141 | def stream(self, job_id, data, gzipped=False): 142 | """ 143 | A Generator co-routine for uploading data in an *almost* asynchronous 144 | manner using chunked transfer encoding. This function uses the yield 145 | statment to receive a data record then chunk encodes the record and 146 | writes it into the open upload stream. 147 | 148 | First the generator must be initialised by calling send(None) 149 | this runs the code up to the first yield statement 150 | 151 | consumer = engineApiClient.stream(job_id, first_line_of_data) 152 | consumer.send(None) # init generator, runs code up to first yield 153 | 154 | After this data can be sent iteratively by repeatedly calling the send 155 | method with new data. CSV records must end in a newline character. 156 | 157 | for record in data: 158 | consumer.send(record + '\n') 159 | 160 | When all the data is sent call send with an empty string and the 161 | respone is returned. 162 | 163 | (http_status, response) = consumer.send('') 164 | 165 | """ 166 | 167 | url = self.base_url + "/data/" + job_id 168 | 169 | self.connection.connect() 170 | 171 | self.connection.putrequest("POST", url) 172 | self.connection.putheader("Connection", "Keep-Alive") 173 | self.connection.putheader("Transfer-Encoding", "chunked") 174 | self.connection.putheader("Content-Type", "application/x-www-form-urlencoded") 175 | if gzipped: 176 | self.connection.putheader('Content-Encoding', 'gzip') 177 | self.connection.endheaders() 178 | 179 | while data: 180 | # Send in chunked transfer encoding format. 
Write the hexidecimal 181 | # length of the data message followed by '\r\n' followed by the 182 | # data and another '\r\n' 183 | 184 | # strip the '0x' of the hex string 185 | data_len = hex(len(data))[2:] 186 | msg = data_len + '\r\n' + data + '\r\n' 187 | 188 | self.connection.send(msg) 189 | data = yield 190 | 191 | # End chunked transfer encoding by sending the zero length message 192 | msg = '0\r\n\r\n' 193 | self.connection.send(msg) 194 | 195 | response = self.connection.getresponse(); 196 | if response.status != 202: 197 | logging.error("Upload file response = " + str(response.status) 198 | + " " + response.reason) 199 | else: 200 | logging.debug("Upload response = " + str(response.status)) 201 | 202 | 203 | # read all of the response before another request can be made 204 | data = response.read() 205 | if data: 206 | doc = json.loads(data) 207 | else: 208 | doc = dict() 209 | 210 | self.connection.close() 211 | 212 | yield (response.status, doc) 213 | 214 | 215 | def close(self, job_id): 216 | """ 217 | Close the job once data has been streamed 218 | Returns a (http_status_code, response_data) tuple, if 219 | http_status_code != 200 response_data is an error object. 220 | """ 221 | 222 | url = self.base_url + "/data/" + job_id + "/close" 223 | 224 | return self._post(url, "Close", headers={}, payload=None) 225 | 226 | def flush(self, job_id, calc_interim=False, start_date=None, end_date=None, advance_time=None): 227 | """ 228 | Flush the job, such that when this call returns no data is 229 | being held in buffers. 230 | calc_interim If set to True then interim results for all 231 | incomplete buckets will be calculated 232 | start_date, end_date Must either be an epoch time or ISO 8601 format 233 | see the Prelert Engine API docs for help. These are only 234 | accepted if calc_interim is True and they limit the range 235 | of buckets for which interim results will be calculated. 236 | If only start_date is specified, interim results will be calculated 237 | only for the bucket indicated by start_date. 238 | advance_time Must either be an epoch time or ISO 8601 format 239 | see the Prelert Engine API docs for help. If set, final 240 | results will be calculated up to the time indicated by 241 | advance_time. 242 | 243 | Returns a (http_status_code, response_data) tuple, if 244 | http_status_code != 200 response_data is an error object. 245 | """ 246 | 247 | url = self.base_url + "/data/" + job_id + "/flush" 248 | 249 | calc_interim_arg = '?calcInterim=true' if calc_interim else '?calcInterim=false' 250 | start_arg = '&start=' + urllib.quote(start_date) if start_date else '' 251 | end_arg = '&end=' + urllib.quote(end_date) if end_date else '' 252 | advance_time_arg = '&advanceTime=' + urllib.quote(advance_time) if advance_time else '' 253 | 254 | url += calc_interim_arg + start_arg + end_arg + advance_time_arg 255 | 256 | return self._post(url, "Flush", headers={}, payload=None) 257 | 258 | def preview(self, job_id, data, gzipped=False): 259 | """ 260 | Upload data to the jobs preview endpoint, the response 261 | is text/csv preview of the uploaded data after the transforms 262 | have been applied. 263 | Data can be a string or an open file object. 264 | If the data is gzipped compressed set gzipped to True 265 | 266 | Returns a (http_status_code, response_data) tuple, if 267 | http_status_code != 202 response_data is an error message. 
268 | """ 269 | return self._uploadToEndpoint(job_id, data, 'preview', gzipped) 270 | 271 | 272 | def getBucket(self, job_id, bucket_timestamp, include_records=False, 273 | include_interim=False): 274 | ''' 275 | Get the individual result bucket for the job and bucket timestamp 276 | If include_records is True the anomaly records are nested in the 277 | resulting dictionary. 278 | If include_interim is True then interim results will be returned as 279 | well as final results. 280 | 281 | Returns a (http_status_code, bucket) tuple if successful else 282 | if http_status_code != 200 (http_status_code, error_doc) is 283 | returned 284 | ''' 285 | 286 | query_char = '?' 287 | query = '' 288 | if include_records: 289 | query = query_char + 'expand=true' 290 | query_char = '&' 291 | if include_interim: 292 | query += query_char + 'includeInterim=true' 293 | query_char = '&' 294 | 295 | url = self.base_url + "/results/{0}/{1}{2}".format(job_id, bucket_timestamp, query) 296 | 297 | return self._get(url, "bucket") 298 | 299 | def getBuckets(self, job_id, skip=0, take=100, include_records=False, 300 | normalized_probability_filter_value=None, anomaly_score_filter_value=None, 301 | include_interim=False): 302 | ''' 303 | Return a page of the job's buckets results. 304 | skip the first N buckets 305 | take a maximum of this number of buckets 306 | include_records Anomaly records are included in the buckets. 307 | normalized_probability_filter_value If not none return only the records with 308 | a normalizedProbability >= normalized_probability_filter_value 309 | anomaly_score_filter_value If not none return only the records with 310 | an anomalyScore >= anomaly_score_filter_value 311 | include_interim Should interim results be returned as well as final results? 312 | 313 | Returns a (http_status_code, buckets) tuple if successful else 314 | if http_status_code != 200 a (http_status_code, error_doc) is 315 | returned 316 | ''' 317 | 318 | query = '' 319 | if include_records: 320 | query = '&expand=true' 321 | 322 | if include_interim: 323 | query += '&includeInterim=true' 324 | 325 | if normalized_probability_filter_value: 326 | query += '&normalizedProbability=' + str(normalized_probability_filter_value) 327 | 328 | if anomaly_score_filter_value: 329 | query += '&anomalyScore=' + str(anomaly_score_filter_value) 330 | 331 | url = self.base_url + "/results/{0}/buckets?skip={1}&take={2}{3}".format( 332 | job_id, skip, take, query) 333 | 334 | return self._get(url, "buckets") 335 | 336 | 337 | def getBucketsByDate(self, job_id, start_date, end_date, include_records=False, 338 | normalized_probability_filter_value=None, anomaly_score_filter_value=None, 339 | include_interim=False): 340 | """ 341 | Return all the job's buckets results between 2 dates. If there is more 342 | than one page of results for the given data range this function will 343 | get them all appending the buckets in a list. A list of buckets is 344 | returned. 345 | 346 | start_date, end_date Must either be an epoch time or ISO 8601 format 347 | strings see the Prelert Engine API docs for help. 348 | include_records Anomaly records are included in the buckets 349 | normalized_probability_filter_value If not none return only the records with 350 | a normalizedProbability >= normalized_probability_filter_value 351 | anomaly_score_filter_value If not none return only the records with 352 | an anomalyScore >= anomaly_score_filter_value 353 | include_interim Should interim results be returned as well as final results? 
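
        Illustrative call (hypothetical job id and client variable, not from
        the original code), using an ISO 8601 start date and no end date:

            (status, buckets) = client.getBucketsByDate('myjob',
                start_date='2014-05-18T00:00:00Z', end_date=None)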
354 | 
355 |         Returns a (http_status_code, buckets) tuple if successful else
356 |         if http_status_code != 200 a (http_status_code, error_doc) is
357 |         returned
358 |         """
359 | 
360 |         skip = 0
361 |         take = 100
362 |         expand = ''
363 |         if include_records:
364 |             expand = '&expand=true'
365 | 
366 |         start_arg = ''
367 |         if start_date:
368 |             start_arg = '&start=' + urllib.quote(start_date)
369 | 
370 |         end_arg = ''
371 |         if end_date:
372 |             end_arg = '&end=' + urllib.quote(end_date)
373 | 
374 |         score_filter = ''
375 |         if normalized_probability_filter_value:
376 |             score_filter = '&normalizedProbability=' + str(normalized_probability_filter_value)
377 | 
378 |         if anomaly_score_filter_value:
379 |             score_filter += '&anomalyScore=' + str(anomaly_score_filter_value)
380 | 
381 |         include_interim_arg = ''
382 |         if include_interim:
383 |             include_interim_arg = '&includeInterim=true'
384 | 
385 |         url = self.base_url + "/results/{0}/buckets?skip={1}&take={2}{3}{4}{5}{6}{7}".format(job_id,
386 |             skip, take, expand, start_arg, end_arg, score_filter, include_interim_arg)
387 | 
388 |         self.connection.connect()
389 |         self.connection.request("GET", url)
390 |         response = self.connection.getresponse()
391 | 
392 |         if response.status != 200:
393 |             logging.error("Get buckets by date response = " + str(response.status) + " " + response.reason)
394 |             response_data = json.load(response)
395 |             self.connection.close()
396 |             return (response.status, response_data)
397 |         logging.debug("Get buckets by date response = " + str(response.status))
398 | 
399 | 
400 |         result = json.load(response)
401 |         buckets = result['documents']
402 | 
403 |         # is there another page of results?
404 |         while result['nextPage']:
405 |             skip += take
406 |             url = self.base_url + "/results/{0}/buckets?skip={1}&take={2}{3}{4}{5}{6}{7}".format(job_id,
407 |                 skip, take, expand, start_arg, end_arg, score_filter, include_interim_arg)
408 |             self.connection.request("GET", url)
409 |             response = self.connection.getresponse()
410 |             if response.status != 200:
411 |                 logging.error("Get buckets by date response = " + str(response.status) + " " + response.reason)
412 |                 message = json.load(response)
413 | 
414 |                 # json.load() has already parsed the response body into a
415 |                 # dictionary, so no second json.loads() pass is needed
416 | 
417 |                 self.connection.close()
418 |                 return (response.status, message)
419 | 
420 |             result = json.load(response)
421 |             buckets.extend(result['documents'])
422 | 
423 |         self.connection.close()
424 | 
425 |         return (200, buckets)
426 | 
427 | 
428 |     def getAllBuckets(self, job_id, include_records=False,
429 |                       normalized_probability_filter_value=None, anomaly_score_filter_value=None,
430 |                       include_interim=False):
431 |         """
432 |         Return all the job's bucket results. If more than one page
433 |         of buckets is available, continue with the next
434 |         page until all results have been read. A list of buckets is
435 |         returned.
436 | 
437 |         include_records Anomaly records are included in the buckets
438 |         normalized_probability_filter_value If not none return only the records with
439 |             a normalizedProbability >= normalized_probability_filter_value
440 |         anomaly_score_filter_value If not none return only the records with
441 |             an anomalyScore >= anomaly_score_filter_value
442 |         include_interim Should interim results be returned as well as final results?
443 | 
444 |         Returns a (http_status_code, buckets) tuple if successful else
445 |         if http_status_code != 200 a (http_status_code, error_doc) tuple
446 |         is returned
447 |         """
448 | 
449 |         skip = 0
450 |         take = 100
451 |         expand = ''
452 |         if include_records:
453 |             expand = '&expand=true'
454 | 
455 |         score_filter = ''
456 |         if normalized_probability_filter_value:
457 |             score_filter = '&normalizedProbability=' + str(normalized_probability_filter_value)
458 | 
459 |         if anomaly_score_filter_value:
460 |             score_filter += '&anomalyScore=' + str(anomaly_score_filter_value)
461 | 
462 |         include_interim_arg = ''
463 |         if include_interim:
464 |             include_interim_arg = '&includeInterim=true'
465 | 
466 |         url = self.base_url + "/results/{0}/buckets?skip={1}&take={2}{3}{4}{5}".format(
467 |             job_id, skip, take, expand, score_filter, include_interim_arg)
468 | 
469 | 
470 |         self.connection.connect()
471 |         self.connection.request("GET", url)
472 |         response = self.connection.getresponse()
473 | 
474 |         if response.status != 200:
475 |             logging.error("Get all buckets response = " + str(response.status) + " " + response.reason)
476 |             response_data = json.load(response)
477 |             self.connection.close()
478 |             return (response.status, response_data)
479 |         logging.debug("Get all buckets response = " + str(response.status))
480 | 
481 | 
482 |         result = json.load(response)
483 |         buckets = result['documents']
484 | 
485 |         # is there another page of results?
486 |         while result['nextPage']:
487 |             skip += take
488 |             url = self.base_url + "/results/{0}/buckets?skip={1}&take={2}{3}{4}{5}".format(
489 |                 job_id, skip, take, expand, score_filter, include_interim_arg)
490 | 
491 |             self.connection.request("GET", url)
492 |             response = self.connection.getresponse()
493 |             if response.status != 200:
494 |                 logging.error("Get all buckets response = " + str(response.status) + " " + response.reason)
495 | 
496 |                 message = json.load(response)
497 |                 # json.load() has already parsed the response body into a
498 |                 # dictionary, so no second json.loads() pass is needed
499 | 
500 |                 self.connection.close()
501 |                 return (response.status, message)
502 | 
503 |             result = json.load(response)
504 |             buckets.extend(result['documents'])
505 | 
506 |         self.connection.close()
507 | 
508 |         return (200, buckets)
509 | 
510 | 
511 |     def getRecords(self, job_id, skip=0, take=100, start_date=None,
512 |                    end_date=None, sort_field=None, sort_descending=True,
513 |                    normalized_probability_filter_value=None, anomaly_score_filter_value=None,
514 |                    include_interim=False):
515 |         """
516 |         Get a page of the job's anomaly records.
517 |         Records can be filtered by start & end date parameters and the scores.
518 | 
519 |         skip the first N records
520 |         take a maximum of this number of records
521 |         start_date, end_date Must either be an epoch time or ISO 8601 format;
522 |             see the Prelert Engine API docs for help
523 |         sort_field The field to sort the results by, ignored if None
524 |         sort_descending If sort_field is not None then sort records
525 |             in descending order if True else sort ascending
526 |         normalized_probability_filter_value If not none return only the records with
527 |             a normalizedProbability >= normalized_probability_filter_value
528 |         anomaly_score_filter_value If not none return only the records with
529 |             an anomalyScore >= anomaly_score_filter_value
530 |         include_interim Should interim results be returned as well as final results?
531 | 532 | Returns a (http_status_code, records) tuple if successful else 533 | if http_status_code != 200 a (http_status_code, error_doc) is 534 | returned 535 | """ 536 | 537 | start_arg = '' 538 | if start_date: 539 | start_arg = '&start=' + urllib.quote(start_date) 540 | 541 | end_arg = '' 542 | if end_date: 543 | end_arg = '&end=' + urllib.quote(end_date) 544 | 545 | sort_arg = '' 546 | if sort_field: 547 | sort_arg = "&sort=" + urllib.quote(sort_field) + '&desc=' + ('true' if sort_descending else 'false') 548 | 549 | filter_arg = '' 550 | if normalized_probability_filter_value: 551 | filter_arg = '&normalizedProbability=' + str(normalized_probability_filter_value) 552 | 553 | if anomaly_score_filter_value: 554 | filter_arg += '&anomalyScore=' + str(anomaly_score_filter_value) 555 | 556 | include_interim_arg = '' 557 | if include_interim: 558 | include_interim_arg = '&includeInterim=true' 559 | 560 | url = self.base_url + "/results/{0}/records?skip={1}&take={2}{3}{4}{5}{6}{7}".format( 561 | job_id, skip, take, start_arg, end_arg, sort_arg, filter_arg, include_interim_arg) 562 | 563 | return self._get(url, "records") 564 | 565 | 566 | def getCategoryDefinitions(self, job_id): 567 | """ 568 | Get a page of category definitions 569 | """ 570 | 571 | url = self.base_url + "/results/{0}/categorydefinitions".format(job_id) 572 | return self._get(url, "Category definitions") 573 | 574 | 575 | def getCategoryDefinition(self, job_id, category_id): 576 | """ 577 | Get a single category definition 578 | 579 | category_id is the id of the category 580 | """ 581 | url = self.base_url + "/results/{0}/categorydefinitions/{1}".format(job_id, category_id) 582 | return self._get(url, "Category definition") 583 | 584 | 585 | def getInfluencers(self, job_id, skip=0, take=100, start_date=None, 586 | end_date=None, sort_field=None, sort_descending=True, 587 | anomaly_score_filter_value=None, include_interim=False): 588 | """ 589 | Get a page of the job's influencers. 590 | Influencers can be filtered by start & end date parameters and the anomaly score. 591 | 592 | skip the first N influencers 593 | take a maximum of this number of influencers 594 | start_date, end_date Must either be an epoch time or ISO 8601 format 595 | see the Prelert Engine API docs for help 596 | sort_field The field to sort the influencers by, ignored if None 597 | sort_descending If sort_field is not None then sort influencers 598 | in descending order if True else sort ascending 599 | anomaly_score_filter_value If not none return only the influencers with 600 | an anomalyScore >= anomaly_score_filter_value 601 | include_interim Should interim influencers be returned as well as final ones? 
602 | 603 | Returns a (http_status_code, influencers) tuple if successful else 604 | if http_status_code != 200 a (http_status_code, error_doc) is 605 | returned 606 | """ 607 | 608 | start_arg = '' 609 | if start_date: 610 | start_arg = '&start=' + urllib.quote(start_date) 611 | 612 | end_arg = '' 613 | if end_date: 614 | end_arg = '&end=' + urllib.quote(end_date) 615 | 616 | sort_arg = '' 617 | if sort_field: 618 | sort_arg = "&sort=" + urllib.quote(sort_field) + '&desc=' + ('true' if sort_descending else 'false') 619 | 620 | filter_arg = '' 621 | if anomaly_score_filter_value: 622 | filter_arg += '&anomalyScore=' + str(anomaly_score_filter_value) 623 | 624 | include_interim_arg = '' 625 | if include_interim: 626 | include_interim_arg = '&includeInterim=true' 627 | 628 | url = self.base_url + '/results/{0}/influencers?skip={1}&take={2}{3}{4}{5}{6}{7}'.format( 629 | job_id, skip, take, start_arg, end_arg, sort_arg, filter_arg, include_interim_arg) 630 | 631 | return self._get(url, "influencers") 632 | 633 | 634 | def alerts_longpoll(self, job_id, normalized_probability_threshold=None, 635 | anomaly_score_threshold=None, timeout=None): 636 | """ 637 | Subscribe to the long poll alerts endpoint. Alerts are fired when 638 | a bucket has an anomaly score >= anomaly_score_threshold or the 639 | bucket has records with normalised probability >= 640 | normalized_probability_threshold. 641 | 642 | Returns a (http_status_code, alert) tuple if successful else 643 | if http_status_code != 200 a (http_status_code, error_doc) is 644 | returned 645 | 646 | If the long poll times out an alert object is returned but the 647 | 'timeout' field will be set true. 648 | """ 649 | 650 | query_char = '?' 651 | 652 | prob_arg = '' 653 | if normalized_probability_threshold: 654 | prob_arg = query_char + 'probability=' + str(normalized_probability_threshold) 655 | query_char = '&' 656 | 657 | score_arg = '' 658 | if anomaly_score_threshold: 659 | score_arg = query_char + 'score=' + str(anomaly_score_threshold) 660 | query_char = '&' 661 | 662 | timeout_arg = '' 663 | if timeout: 664 | timeout_arg = query_char + 'timeout=' + str(timeout) 665 | 666 | url = self.base_url + "/alerts_longpoll/{0}/{1}{2}{3}".format( 667 | job_id, prob_arg, score_arg, timeout_arg) 668 | 669 | return self._get(url, "alerts") 670 | 671 | 672 | def delete(self, job_id): 673 | """ 674 | Delete a job. 675 | Returns a (http_status_code, response_data) tuple, if 676 | http_status_code != 200 response_data is an error object. 
677 |         """
678 | 
679 |         url = self.base_url + "/jobs/" + job_id
680 |         return self._delete(url, 'Delete job')
681 | 
682 |     def getZippedLogs(self, job_id):
683 |         """
684 |         Download the zipped log files of a job and
685 |         return a tuple of (http_status_code, zip_data) if http_status_code
686 |         == 200 else the error is read into a json document and
687 |         returns (http_status_code, error_doc)
688 |         """
689 |         return self._get_logs(self.base_url + "/logs/" + job_id, 'zipped logs')
690 | 
691 |     def getJobLog(self, job_id, log_file_name):
692 |         """
693 |         Download a single named log file of a job and
694 |         return a tuple of (http_status_code, log_data) if http_status_code
695 |         == 200 else the error is read into a json document and
696 |         returns (http_status_code, error_doc)
697 |         """
698 |         return self._get_logs(self.base_url + "/logs/{0}/{1}".format(job_id, log_file_name), 'Specific job log')
699 | 
700 |     def getElasticsearchServerLogs(self):
701 |         """
702 |         Download the zipped log files of elasticsearch and
703 |         return a tuple of (http_status_code, zip_data) if http_status_code
704 |         == 200 else the error is read into a json document and
705 |         returns (http_status_code, error_doc)
706 |         """
707 |         return self._get_logs(self.base_url + "/logs/elasticsearch", 'Elasticsearch server logs')
708 | 
709 |     def getEngineApiServerLogs(self):
710 |         """
711 |         Download the zipped log files of the Engine API server and
712 |         return a tuple of (http_status_code, zip_data) if http_status_code
713 |         == 200 else the error is read into a json document and
714 |         returns (http_status_code, error_doc)
715 |         """
716 |         return self._get_logs(self.base_url + "/logs/engine_api", 'Engine API server logs')
717 | 
718 |     def _get_logs(self, url, request_description):
719 |         http_status_code, data = self._get(url, request_description, expects_json=False)
720 | 
721 |         # if not a 200 code the response is a JSON error message
722 |         if http_status_code == 200:
723 |             return http_status_code, data
724 |         else:
725 |             error = json.loads(data)
726 |             return http_status_code, error
727 | 
728 |     def getModelSnapshots(self, job_id, skip=0, take=100,
729 |                           start_date=None, end_date=None,
730 |                           sort_field=None, sort_descending=True,
731 |                           description=None):
732 |         """
733 |         Get a page of the job's model snapshots.
734 |         Model snapshots can be filtered by date or by description.
735 | 
736 |         :param job_id: the job id
737 |         :param skip: skips the first n model snapshots
738 |         :param take: specifies the number of snapshots to be returned
739 |         :param start_date: Must either be an epoch time or ISO 8601 format;
740 |             see the Prelert Engine API docs for help
741 |         :param end_date: Must either be an epoch time or ISO 8601 format;
742 |             see the Prelert Engine API docs for help
743 |         :param sort_field: If set, the snapshots will be sorted on the given field
744 |         :param sort_descending: The direction of the sorting (defaults to True)
745 |         :param description: If set, only results that match the description will be retrieved
746 |         :return: (http_status_code, model_snapshots) tuple if successful,
747 |             if not (i.e. http_status_code != 200) (http_status_code, error_doc) is returned
748 |         """
749 |         start_arg = ''
750 |         if start_date:
751 |             start_arg = '&start=' + urllib.quote(start_date)
752 | 
753 |         end_arg = ''
754 |         if end_date:
755 |             end_arg = '&end=' + urllib.quote(end_date)
756 | 
757 |         sort_arg = ''
758 |         if sort_field:
759 |             sort_arg = '&sort=' + urllib.quote(sort_field) + '&desc=' + ('true' if sort_descending else 'false')
760 | 
761 |         description_arg = ''
762 |         if description:
763 |             description_arg = '&description=' + urllib.quote(description)
764 | 
765 |         url = self.base_url + "/modelsnapshots/{0}?skip={1}&take={2}{3}{4}{5}{6}".format(
766 |             job_id, skip, take, start_arg, end_arg, sort_arg, description_arg)
767 | 
768 |         return self._get(url, "Model Snapshots")
769 | 
770 | 
771 |     def revertToSnapshot(self, job_id, time=None, snapshot_id=None, description=None, delete_intervening_results=False):
772 |         """
773 |         Revert to the most recent snapshot matching the specified criteria.
774 | 
775 |         :param job_id: the job id
776 |         :param time: revert to a snapshot with a timestamp no later than this time
777 |         :param snapshot_id: the snapshot ID of the snapshot to revert to
778 |         :param description: the description of the snapshot to revert to
779 |         :param delete_intervening_results: should the results be reset
780 |             back to the time of the snapshot? (Defaults to False)
781 |         :return: (http_status_code, reverted_snapshot) tuple if it was successful,
782 |             or (http_status_code, error_doc) if http_status_code != 200
783 |         """
784 |         delete_intervening_results_arg = 'deleteInterveningResults=' + ('true' if delete_intervening_results else 'false')
785 | 
786 |         time_arg = ''
787 |         if time:
788 |             time_arg = '&time=' + urllib.quote(time)
789 | 
790 |         snapshot_id_arg = ''
791 |         if snapshot_id:
792 |             snapshot_id_arg = '&snapshotId=' + urllib.quote(snapshot_id)
793 | 
794 |         description_arg = ''
795 |         if description:
796 |             description_arg = '&description=' + urllib.quote(description)
797 | 
798 |         url = self.base_url + "/modelsnapshots/{0}/revert?{1}{2}{3}{4}".format(
799 |             job_id, delete_intervening_results_arg, time_arg, snapshot_id_arg, description_arg)
800 | 
801 |         return self._post(url, "Revert to snapshot", headers={}, payload=None)
802 | 
803 | 
804 |     def updateModelSnapshotDescription(self, job_id, snapshot_id, description):
805 |         """
806 |         Updates the description of the model snapshot that matches the given snapshot id.
807 |         :param job_id: the job id
808 |         :param snapshot_id: the snapshot id
809 |         :param description: the new description
810 |         :return: (http_status_code, updated_snapshot) tuple if it was successful,
811 |             or (http_status_code, error_doc) if http_status_code != 200
812 |         """
813 |         headers = {'Content-Type': 'application/json'}
814 |         payload = {'description': description}
815 |         url = self.base_url + "/modelsnapshots/{0}/{1}/description".format(job_id, snapshot_id)
816 |         return self._put(url, 'Update model snapshot description', headers=headers, payload=json.dumps(payload))
817 | 
818 | 
819 |     def deleteModelSnapshot(self, job_id, snapshot_id):
820 |         """
821 |         Deletes the model snapshot that matches the given snapshot id.
822 |         :param job_id: the job id
823 |         :param snapshot_id: the snapshot id
824 |         :return: a (http_status_code, response_data) tuple, if
825 |             http_status_code != 200 response_data is an error object.
826 | """ 827 | url = self.base_url + "/modelsnapshots/{0}/{1}".format(job_id, snapshot_id) 828 | return self._delete(url, 'Delete model snapshot') 829 | 830 | def startScheduler(self, job_id, start_date='', end_date=''): 831 | """ 832 | Starts the scheduler of a job. 833 | 834 | :param job_id: the id of a scheduled job 835 | :param start_date: Must either be an epoch time or ISO 8601 format 836 | see the Prelert Engine API docs for help. When set, the scheduler 837 | starts from the specified time. If not set, the scheduler starts 838 | from epoch 0 or from just after the timestamp of latest processed record. 839 | :param end_date: Must either be an epoch time or ISO 8601 format 840 | see the Prelert Engine API docs for help. If set, the scheduler will 841 | analyse data up to the specified time and then it will stop. If not set, 842 | the scheduler will keep analysing data periodically. 843 | :return: (http_status_code, acknowledgement) tuple if it was successful, 844 | or (http_status_code, error_doc) if http_status_code != 200 845 | """ 846 | url = self.base_url + "/schedulers/{0}/start?start={1}&end={2}".format( 847 | job_id, urllib.quote(start_date), urllib.quote(end_date)) 848 | return self._post(url, 'Start scheduler', headers={}, payload=None) 849 | 850 | def stopScheduler(self, job_id): 851 | """ 852 | Stops the scheduler of a job. 853 | The call is blocking until the scheduler is stopped. 854 | 855 | :param job_id: the id of a scheduled job 856 | :return: (http_status_code, acknowledgement) tuple if it was successful, 857 | or (http_status_code, error_doc) if http_status_code != 200 858 | """ 859 | url = self.base_url + "/schedulers/{0}/stop".format(job_id) 860 | return self._post(url, 'Stop scheduler', headers={}, payload=None) 861 | 862 | def validateDetector(self, payload): 863 | """ 864 | Validates a detector. 865 | 866 | :param payload: The JSON payload as string. See Prelert Engine API docs for help. 867 | :return: (http_status_code, acknowledgement) tuple if it was successful, 868 | or (http_status_code, error_doc) if http_status_code != 200 869 | """ 870 | headers = {'Content-Type': 'application/json'} 871 | url = self.base_url + "/validate/detector" 872 | return self._post(url, 'Validate detector', headers, payload) 873 | 874 | def validateTransform(self, payload): 875 | """ 876 | Validates a transform. 877 | 878 | :param payload: The JSON payload as string. See Prelert Engine API docs for help. 879 | :return: (http_status_code, acknowledgement) tuple if it was successful, 880 | or (http_status_code, error_doc) if http_status_code != 200 881 | """ 882 | headers = {'Content-Type': 'application/json'} 883 | url = self.base_url + "/validate/transform" 884 | return self._post(url, 'Validate transform', headers, payload) 885 | 886 | def validateTransforms(self, payload): 887 | """ 888 | Validates an array of transforms. 889 | 890 | :param payload: The JSON payload as string. See Prelert Engine API docs for help. 891 | :return: (http_status_code, acknowledgement) tuple if it was successful, 892 | or (http_status_code, error_doc) if http_status_code != 200 893 | """ 894 | headers = {'Content-Type': 'application/json'} 895 | url = self.base_url + "/validate/transforms" 896 | return self._post(url, 'Validate transforms', headers, payload) 897 | 898 | def _get(self, url, request_description, expects_json=True): 899 | """ 900 | General GET request. 901 | 902 | request_description is used in log messages which are of the form 903 | 'Get ' + request_description + ' response = ...' 
904 |         If expects_json is True then the response will be parsed
905 |         into a JSON object
906 | 
907 |         Returns a (status code, JSON/dictionary object) tuple if expects_json
908 |         is True else (status code, response).
909 |         """
910 |         self.connection.connect()
911 |         self.connection.request("GET", url)
912 |         response = self.connection.getresponse()
913 | 
914 |         if response.status != 200:
915 |             logging.error("Get " + request_description + " response = " + str(response.status) + " "
916 |                 + response.reason)
917 |         else:
918 |             logging.debug("Get " + request_description + " response = " + str(response.status))
919 | 
920 |         data = response.read()
921 |         # close the connection before returning so it can be reused
922 |         self.connection.close()
923 | 
924 |         if not expects_json:
925 |             return (response.status, data)
926 | 
927 |         if data:
928 |             job = json.loads(data)
929 |         else:
930 |             job = dict()
931 |         return (response.status, job)
932 | 
933 | 
934 |     def _post(self, url, request_description, headers={}, payload=None):
935 |         """
936 |         General POST request.
937 |         If the response code is either 200, 201 or 202 then the request is
938 |         considered a success
939 | 
940 |         url is the target URL
941 |         headers is a dictionary object defining the request headers,
942 |             if not required use {}
943 |         payload is the data to be sent
944 |         request_description is used in log messages which are of the form
945 |             request_description + ' response = ...'
946 | 
947 |         Returns a (status code, JSON/dictionary object) tuple
948 |         """
949 |         return self._request_with_payload(url, request_description, headers, payload, 'POST')
950 | 
951 | 
952 |     def _put(self, url, request_description, headers={}, payload=None):
953 |         """
954 |         General PUT request.
955 |         If the response code is either 200, 201 or 202 then the request is
956 |         considered a success
957 | 
958 |         url is the target URL
959 |         headers is a dictionary object defining the request headers,
960 |             if not required use {}
961 |         payload is the data to be sent
962 |         request_description is used in log messages which are of the form
963 |             request_description + ' response = ...'
964 | 
965 |         Returns a (status code, JSON/dictionary object) tuple
966 |         """
967 |         return self._request_with_payload(url, request_description, headers, payload, 'PUT')
968 | 
969 | 
970 |     def _request_with_payload(self, url, request_description, headers, payload, method):
971 |         """
972 |         Executes a request that can have a payload with the given method (POST, PUT).
973 |         If the response code is either 200, 201 or 202 then the request is
974 |         considered a success
975 | 
976 |         url is the target URL
977 |         headers is a dictionary object defining the request headers,
978 |             if not required use {}
979 |         payload is the data to be sent
980 |         request_description is used in log messages which are of the form
981 |             request_description + ' response = ...'
982 | 
983 |         Returns a (status code, JSON/dictionary object) tuple
984 |         """
985 | 
986 |         self.connection.connect()
987 |         self.connection.request(method, url, payload, headers)
988 | 
989 |         response = self.connection.getresponse()
990 | 
991 |         if response.status not in (200, 201, 202):
992 |             logging.error(request_description + " response = " + str(response.status) + " "
993 |                 + response.reason)
994 |         else:
995 |             logging.debug(request_description + " response = " + str(response.status))
996 | 
997 |         data = response.read()
998 |         if data:
999 |             doc = json.loads(data)
1000 |         else:
1001 |             doc = dict()
1002 | 
1003 |         self.connection.close()
1004 | 
1005 |         return (response.status, doc)
1006 | 
1007 |     def _uploadToEndpoint(self, job_id, data, endpoint, gzipped=False):
1008 |         """
1009 |         Upload data to the specified endpoint.
1010 |         Data can be a string or an open file object.
1011 |         endpoint is the endpoint name and *should not* be surrounded by slashes '/'
1012 |         If the data is gzip compressed set gzipped to True.
1013 | 
1014 |         Returns a (http_status_code, response_data) tuple, if
1015 |         http_status_code != 202 response_data is an error message.
1016 | 
1017 |         """
1018 |         headers = {}
1019 |         if gzipped:
1020 |             headers['Content-Encoding'] = 'gzip'
1021 | 
1022 |         url = self.base_url + "/" + endpoint + "/" + job_id
1023 | 
1024 |         self.connection.connect()
1025 |         self.connection.request("POST", url, data, headers)
1026 |         response = self.connection.getresponse()
1027 |         if response.status != 202:
1028 |             logging.error(endpoint + " response = " + str(response.status)
1029 |                 + " " + response.reason)
1030 |         else:
1031 |             logging.debug(endpoint + " response = " + str(response.status))
1032 | 
1033 |         # read all of the response before another request can be made
1034 |         data = response.read()
1035 | 
1036 |         self.connection.close()
1037 | 
1038 |         return (response.status, data)
1039 | 
1040 |     def _delete(self, url, request_description):
1041 |         """
1042 |         General DELETE request.
1043 |         Returns a (http_status_code, response_data) tuple, if
1044 |         http_status_code != 200 response_data is an error object.
1045 |         """
1046 |         self.connection.connect()
1047 |         self.connection.request("DELETE", url)
1048 | 
1049 |         response = self.connection.getresponse()
1050 |         if response.status != 200:
1051 |             logging.error(request_description + " response = " + str(response.status)
1052 |                 + " " + response.reason)
1053 | 
1054 |         data = response.read()
1055 |         if data:
1056 |             msg = json.loads(data)
1057 |         else:
1058 |             msg = dict()
1059 | 
1060 |         self.connection.close()
1061 | 
1062 |         return (response.status, msg)
--------------------------------------------------------------------------------
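The client methods collected above are easiest to follow with a short end-to-end example. The sketch below is illustrative only: the host, port, base path, job id and result field names are placeholders rather than values taken from this repository, and the EngineApiClient constructor arguments are an assumption that should be checked against the class definition earlier in EngineApiClient.py.

from prelert.engineApiClient import EngineApiClient

HOST = 'localhost'         # placeholder Engine API host
PORT = 8080                # the examples in this repository default to port 8080
BASE_URL = '/engine/v2'    # assumed API base path - confirm against your server
JOB_ID = 'example-job'     # hypothetical job id

# assumed constructor signature; see the constructor defined earlier in this file
client = EngineApiClient(HOST, BASE_URL, PORT)

# Calculate interim results for incomplete buckets, then close the job
# once all data has been uploaded.
(http_status, response) = client.flush(JOB_ID, calc_interim=True)
(http_status, response) = client.close(JOB_ID)

# getAllBuckets pages through the results internally and returns a flat
# list of bucket documents filtered by anomaly score.
(http_status, buckets) = client.getAllBuckets(JOB_ID, anomaly_score_filter_value=50.0)
if http_status == 200:
    for bucket in buckets:
        # 'timestamp' and 'anomalyScore' are assumed result field names
        print("%s %s" % (bucket.get('timestamp'), bucket.get('anomalyScore')))

# The single-page endpoints return the raw paging document, which carries
# its results under 'documents' (as used by getBucketsByDate above).
(http_status, page) = client.getRecords(JOB_ID, take=20,
                                        sort_field='normalizedProbability',
                                        sort_descending=True)
if http_status == 200:
    for record in page['documents']:
        print(record)

Because the client relies on Python 2's urllib.quote, the sketch assumes a Python 2 interpreter, matching the rest of this file.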