├── LICENSE ├── README.md ├── anomaly_detector.py ├── arima_train.py ├── compare_fourier_prophet.py ├── data └── prometheus.example.com │ └── go_goroutines │ ├── 20180618 │ ├── 20180618.json.bz2 │ └── 201806182359.json.bz2 │ └── 20180617.json.bz2 ├── exp_smoothing_train.py ├── format_to_pandas.py ├── format_to_pandas_demo.sh ├── fourier_train.py ├── graphing_ts.py ├── holt_winters_train.py ├── imgs ├── anomaly_detection1.png ├── anomaly_detection2.png ├── arima3.png ├── compare_prophet_fourier3.png ├── compare_prophet_fourier4.png ├── compare_prophet_fourier5.png ├── detect_anomaly_accumulator.png ├── detect_anomaly_combined.png ├── detect_anomaly_tail_prob.png ├── example_ts.png ├── exp_smoothing3.png ├── forecasting_data.png ├── fourier3.png ├── fourier4.png ├── fourier_extrapolation.png ├── fourier_extrapolation_behind.png ├── histogram_graph.png ├── histogram_graph2.png ├── imgs │ ├── detect_anomaly_accumulator.png │ ├── detect_anomaly_combined.png │ ├── detect_anomaly_tail_prob.png │ ├── example_ts.png │ ├── fourier_extrapolation.png │ ├── fourier_extrapolation_behind.png │ └── partitioned_ts.png ├── metadata_instance_label_cloudprovider_aws_api_request_duration_seconds.png ├── metadata_instance_label_kubelet_docker.png ├── metadata_operation_label_kubelet_docker2.png ├── metadata_operation_label_legend_kubelet_docker.png ├── partitioned_ts.png ├── prophet.png ├── prophet3.png ├── summary_graph3.png ├── summary_graph4.png ├── t-sne_embedding.png └── t_sne_embedding2.png ├── metadata_analysis ├── get_single_ts_all.py ├── graph_metadata.py ├── graph_specific_ts.py ├── plot_metadata_labels.py └── t_sne_for_metadata.py ├── notebooks ├── .ipynb_checkpoints │ ├── ARIMA modelling-checkpoint.ipynb │ ├── Exponential Smoothing and ARIMA on Real Data-checkpoint.ipynb │ ├── Parse Json to Pandas Dataframes-checkpoint.ipynb │ └── Prophet Model Forecasting-checkpoint.ipynb ├── ARIMA modelling.ipynb ├── Anomaly Detection Decision Rules.ipynb ├── Counter Gauge Metric Analysis.ipynb ├── Exponential Smoothing Real Data.ipynb ├── Exponential Smoothing and ARIMA on Real Data.ipynb ├── Fourier Analysis Forecasting.ipynb ├── Parse Json to Pandas Dataframes.ipynb ├── Prophet Model Forecasting.ipynb ├── Verify Alerts with Spark and Explanatory Statistics - CEPH.ipynb └── imgs │ ├── arima.png │ ├── detect_anomaly_accumulator.png │ ├── detect_anomaly_combined.png │ ├── detect_anomaly_tail_prob.png │ ├── example_ts.png │ ├── exp_smoothing.png │ ├── fourier_extrapolation.png │ ├── fourier_extrapolation_behind.png │ ├── imgs │ ├── detect_anomaly_accumulator.png │ ├── detect_anomaly_combined.png │ ├── detect_anomaly_tail_prob.png │ ├── example_ts.png │ ├── fourier_extrapolation.png │ ├── fourier_extrapolation_behind.png │ └── partitioned_ts.png │ ├── kubelet_docker_instance_label.png │ ├── kubelet_docker_op_type_label.png │ ├── partitioned_ts.png │ ├── prophet.png │ └── t-sne_embedding.png ├── presentations ├── devconf_presentation.pdf ├── final_presentation.pdf ├── lightning_talk.ppdf ├── mid-summer_presentation.pdf └── pipeline_arch.png ├── prophet_train.py └── run_compare_mdls.sh /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Science on Prometheus Metrics 2 | ## **Table of Contents** 3 | 1. [Metric Types](#the-metrics) 4 | 2. [Metadata Analysis](#the-metadata) 5 | 3. [Data Preprocessing](#data-preprocessing) 6 | 4. [Time Series Forecasting](#forecasting) 7 | 5. [Anomaly Detection Decision Rules](#anomaly-detection-decision-rules) 8 | 6. [Anomaly Detection for Histograms and Summaries](#anomaly-detection-for-histograms-and-summaries) 9 | 7. [Conclusion](#conclusion) 10 | ## **The Metrics** 11 | #### Anatomy of Metrics 12 | For more information, visit https://prometheus.io/docs/concepts/metric_types/ 13 | 14 | Gauge Metric for a given target: 15 | * a time series 16 | 17 | Counter Metric for a given target: 18 | * a monotonically increasing time series 19 | 20 | Histogram Metric for a given target: 21 | * *a collection of bucket time series* - n buckets in the histogram correspond to n time series. The +Inf bucket time series is the same as the count time series. 22 | * *a count time series* - 23 | a cumulative count of all observations we have seen thus far 24 | * *a sum time series* - a cumulative sum of all observations we have seen thus far 25 | 26 | Summary Metric for a given target: 27 | * *quantile time series* - there are n quantiles corresponding to n time series 28 | * *count time series* - a cumulative count of all observations we have seen thus far 29 | * *sum time series* - a cumulative sum of all observations we have seen thus far 30 | 31 | #### Key Insights on Metrics 32 | 1. The instantaneous value of a counter is not useful on its own - it depends on when Prometheus decides to reset the counter and is often not indicative of the state of the system. Counters are only useful when we look at how they change over time. For this reason, when we talk about a counter we will automatically preprocess the counter into a difference series where difference(t) = raw(t) - raw(t-1). 33 | 2. The metrics are received and stored in the form of packets. All quantile packets are stored in a quantile folder, all count packets in a count folder, etc. We parse these packets and reformat them into dictionaries for each time series type and metric. The key refers to the metadata (target info and metric content labels) and the value is a pandas DataFrame with timestamps and values for each unique metadata configuration. Essentially, each key-value pair represents a single incoming time series. For more information, check out [this notebook](notebooks/Parse%20Json%20to%20Pandas%20Dataframes.ipynb). 34 | 35 | ## **The Metadata** 36 | 37 | A given time series is assigned a unique metadata label configuration which includes user-specified and default labels. We used some basic visualization techniques for metadata analysis, including graphing and dimensionality reduction. 38 | 39 | One of the default labels that occurs in every metric and metadata packet is the instance. The instance is an IP address which corresponds to the scraped target.
However, these IP addresses are refreshed over time. In the graphs below, we can see that certain instances pop up and go away at different times. Because an instance change typically results from only one or two targets going down, and most instances are re-assigned simultaneously, we cannot pinpoint the continuity between the old and new instance labels. 40 | 41 | Metadata analysis scripts can be found in [this folder](metadata_analysis/). 42 | ![](imgs/metadata_instance_label_kubelet_docker.png) 43 | **Graph 1:** Instance values over time for all targets of kubelet_docker_operation_latency_microseconds_quantile. Every value on the y-axis corresponds to a specific instance IP address (ex: ip-172-31-70-31.us-east-2.compute.internal). 44 | 45 | ![](imgs/metadata_instance_label_cloudprovider_aws_api_request_duration_seconds.png) 46 | **Graph 2:** Instance values over time for all targets of cloudprovider_aws_api_request_duration_seconds. Every value on the y-axis corresponds to a specific instance IP address (ex: ip-172-31-70-31.us-east-2.compute.internal). 47 | 48 | 49 | ![](imgs/metadata_operation_label_kubelet_docker2.png) 50 | **Graph 3:** operation_type values over time for all targets of kubelet_docker_operation_latency_microseconds_quantile. A legend is provided below. Note that all operation_types exist for all timestamps except for remove_image. 51 | ![](imgs/metadata_operation_label_legend_kubelet_docker.png) 52 | 53 | #### t-SNE Embedding of the Dataset 54 | t-SNE is a dimensionality reduction technique for mapping high-dimensional data into a lower dimension for visualization. In this case, our dimensions are the labels in the metadata. We used the standard t-SNE implementation from scikit-learn and represented our categorical data as numerical values from 1 to len(label values). The result is shown below. 55 | 56 | [Here](metadata_analysis/t_sne_for_metadata.py) is the script for generating these visualizations. 57 | ![](imgs/t_sne_embedding2.png) 58 | **Graph 4:** t-SNE embedding for all data points in kubelet_docker_operation_latency_microseconds_quantile. Note that the data seems to be clustered in some way. 59 | ![](imgs/t-sne_embedding.png) 60 | **Graph 5:** t-SNE embedding for all data points in kubelet_docker_operation_latency_microseconds_quantile colored by instance (there are too many instances for distinct colors, so colors are re-used). Note that the coloring corresponds to the clustering, which indicates that our data is likely clustered by instance. 61 | 62 | ## **Data Preprocessing** 63 | To run forecasting and time series analysis on our Prometheus data, we had to reformat the metric packets received from Prometheus into a format which can be fed into a forecasting model. We found Pandas DataFrames to be very useful for this purpose. 64 | 65 | I designed a script that converts a collection of JSON files in a specified directory into a dictionary of DataFrames housed in a pickle file. 66 | 67 | **Input Format**: a directory which houses .json or .json.bz2 files. Each of these files may contain one or more JSON packets. These files can be taken directly from a local folder or remotely from a Ceph folder (additional configuration may be needed for Ceph retrieval). These JSON files are intended to come from a single metric. 68 | 69 | **Output Format**: a pickle file which houses a dictionary of Pandas DataFrames. The keys are the metadata configurations in a string format.
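As a concrete illustration of this output format, here is a minimal sketch (not part of the repo's scripts) that loads the pickle produced by the configuration in `format_to_pandas_demo.sh` and iterates over the resulting dictionary; the file path mirrors that demo, and the `ds`/`y` column names follow `format_to_pandas.py`, but both are otherwise placeholders:

```python
import pickle

# Load the dictionary of DataFrames written by format_to_pandas.py
# (the output file is named <output_dir>/<metric>.pkl).
with open("data/prometheus.example.com_pkl/go_goroutines.pkl", "rb") as f:
    data_dictionary = pickle.load(f)

# Each key is the stringified metadata of one unique time series; each value
# is a DataFrame with a timestamp column ("ds") and a value column ("y").
for metadata, df in data_dictionary.items():
    print(metadata[:80], "->", df.shape[0], "rows")
```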
To use this output, here are a few helpful commands: 70 | 71 | 72 | `data_dictionary = pickle.load(my_output_file)` 73 | `data_dictionary["{'__name__': 'kubelet_docker_operations_latency_microseconds','beta_kubernetes_io_arch': 'amd64'..."]` 74 | 75 | **Or run the conversion manually using the command line arguments:** 76 | 77 | `python format_to_pandas.py --metric METRIC --input INPUT_DIR --output OUTPUT_DIR` 78 | 79 | For more information about the command line arguments, use the help message `./format_to_pandas.py --help` or have a look at the [sample run](format_to_pandas.py) on sample data. 80 | 81 | 82 | [This](https://docs.google.com/spreadsheets/d/1CB14X5xd1dPH2x9m_ko_2rfz6BrilPZPYcJA_kQWBUo/edit?usp=sharing) spreadsheet has a working list of metrics and their associated data sparsity. 83 | ## **Forecasting** 84 | 85 | ![](imgs/forecasting_data.png) 86 | **Graph 6:** A time series from kubelet_docker_operations_latency_microseconds with the following metadata: *{'name': 'http_request_duration_microseconds', 'app': 'openshift-web-console', 'handler': 'prometheus', 'instance': '10.129.0.18:8443', 'job': 'kubernetes-service-endpoints', 'kubernetes_name': 'webconsole', 'kubernetes_namespace': 'openshift-web-console', 'quantile': '0.9'}* 87 | #### Exponential Smoothing 88 | [This notebook](notebooks/Exponential%20Smoothing%20and%20ARIMA%20on%20Real%20Data.ipynb) has an introduction to Exponential Smoothing and a few examples. 89 | The implementation for this model came from the [statsmodels python package](http://www.statsmodels.org/dev/tsa.html). 90 | ![](imgs/exp_smoothing3.png) 91 | **Graph 7:** Exponential Smoothing on a Time Series. Note that the forecast (yhat) remains the same as the last training value. This means that this model does not take into account the seasonality or volatility of the series. 92 | #### ARIMA Modelling 93 | [This notebook](notebooks/ARIMA%20modelling.ipynb) has an introduction to ARIMA and a few examples of ARIMA modelling. 94 | The implementation for this model came from the [statsmodels python package](http://www.statsmodels.org/dev/tsa.html). 95 | ![](imgs/arima3.png) 96 | **Graph 8:** ARIMA Modelling on a Time Series. Note that the forecast (yhat) decays to the mean very quickly. It seems that this model does not take into account the seasonality of the data. For this example, we used ARIMA(1,0,1). 97 | #### Prophet Modelling 98 | [This notebook](notebooks/Prophet%20Model%20Forecasting.ipynb) has an introduction to Prophet and a few examples of Prophet modelling. [Here](https://peerj.com/preprints/3190.pdf) is Facebook's paper on Prophet modelling. 99 | ![](imgs/prophet.png) 100 | **Graph 9:** Prophet Modelling on a Time Series. Note that the model seems to train according to trend, and the bounds (yhat_upper and yhat_lower) are reasonably accurate. This specific example likely provides too little data for Prophet to detect anomalies accurately. 101 | ![](imgs/prophet3.png) 102 | **Graph 10:** Prophet Modelling on a Time Series from kubelet_docker_operations_latency_microseconds. Notice how there are large gaps in the training and testing data. This is characteristic of many of the time series we get from Prometheus because there are often dead times in the systems. Prophet seems to handle these gaps pretty well. 103 | #### Fourier Extrapolation 104 | [This notebook](notebooks/Fourier%20Analysis%20Forecasting.ipynb) has an introduction to Fourier Analysis and a few examples. 105 | ![](imgs/fourier3.png) 106 | **Graph 11:** Fourier Extrapolation on a Time Series.
Note that this model does an excellent job of reproducing the seasonality of the training set. It responds very well when there is a clear pattern in the data. 107 | ![](imgs/fourier4.png) 108 | **Graph 12:** Fourier Extrapolation on a Time Series from kubelet_docker_operations_latency_microseconds. Note that this forecast seems to be shifted upwards and reproduces the point-wise peaks in the training set. 109 | #### Model Comparisons 110 | Comparing Prophet and Fourier 111 | ![](imgs/compare_prophet_fourier4.png) 112 | **Graph 13:** A comparison between Prophet and Fourier for a time series from kubelet_docker_operations_latency_microseconds. Prophet does a good job of predicting the mean, while Fourier accurately predicts the seasonality of the data. 113 | ![](imgs/compare_prophet_fourier5.png) 114 | **Graph 14:** A comparison between Prophet and Fourier for a time series from kubelet_docker_operations_latency_microseconds. Prophet does a good job of predicting the mean, while Fourier accurately predicts the seasonality of the data. 115 | ![](imgs/compare_prophet_fourier3.png) 116 | **Graph 15:** A comparison between Prophet and Fourier for a time series from kubelet_docker_operations_latency_microseconds. A and B are the same forecasts from above, while C and D represent two specific types of data. C had a training set with a few point-wise extremes. We can see here that Fourier is very sensitive to those extremes and will attempt to model them in the forecast, while Prophet does a good job of identifying the mean. In D, we have a training set with a drastic change in seasonality halfway through. Again, Prophet seems rather robust against that sudden change, while Fourier continues the original pattern with a slight variation. 117 | #### Further Research 118 | * RNNs (LSTMs) 119 | * Generalization to additional metrics 120 | * Verification from domain experts 121 | ## **Anomaly Detection Decision Rules** 122 | [This notebook](notebooks/Anomaly%20Detection%20Decision%20Rules.ipynb) provides details on the anomaly detection decision rules we've employed. 123 | 124 | ## **Anomaly Detection for Histograms and Summaries** 125 | #### **Histogram and Summary Visualization** 126 | Let's begin by taking a look at the graphs of histogram and summary components over time. 127 | 128 | ![](imgs/histogram_graph.png) 129 | **Graph 18:** The buckets of apiserver_admission_controller_admission_latencies_seconds_count over time. While at first glance it looks like there is only the 976562.5 bucket, this is actually a graph of all the buckets. The buckets all have the same values, which indicates that they are likely misconfigured; they are nevertheless treated as the same value for our anomaly detection purposes. 130 | ![](imgs/histogram_graph2.png) 131 | **Graph 19:** The buckets of cloudprovider_aws_api_request_duration_seconds over time. From this graph we can see that a majority of the raw time series values lie between 0.25 and 10. 132 | ![](imgs/summary_graph3.png) 133 | **Graph 20:** The quantiles of kubelet_docker_operations_latency_microseconds over time. The 0.5 quantile will always be below the 0.9 quantile, which will always be below the 0.99 quantile. 134 | ![](imgs/summary_graph4.png) 135 | **Graph 21:** The quantiles of kubelet_docker_operations_latency_microseconds over time. Notice that the 0.99 quantile is much more volatile than the median.
This is because the 0.99 quantile represents the extreme values, whereas the 0.5 quantile represents the median, which is often much more stable. 136 | #### **Key Observations** 137 | Histograms and Summaries can be seen as collections of gauges and counters. We can treat each quantile/bucket, count, and sum as an individual time series and apply anomaly detection to them individually, or we can apply some form of anomaly detection based on their correlation. 138 | 139 | #### **Anomaly Detection Rule Outline** 140 | Below is an example of an alerting decision scheme for Summary metric alerting. It can also be extended to Histogram metric alerting by replacing the quantiles with buckets. 141 | Let's define the function AD(time series) as follows: 142 | 143 | 144 | AD (time series): 145 | Return True if anomaly detected 146 | Return False otherwise 147 | 148 | Let's define quantiles as the set of all quantile time series. 149 | Below is a block decision chain for when to send an alert if an anomaly is detected in some subset of the time series. A Python sketch of one reading of this chain is included after the Conclusion below. 150 | 151 | If 0.5 ∈ quantiles: 152 | If AD(0.5) OR [ AD(quant1) AND AD(quant2) AND AD(quant3) ] : (∀ quant1, quant2, quant3 ∈ quantiles) 153 | Send alert 154 | Else if AD(quant1) AND AD(quant2): (∀ quant1, quant2 ∈ quantiles) 155 | Send alert 156 | If AD(sum) AND AD(count): 157 | Send alert 158 | Else if AD(quant) AND [ AD(sum) OR AD(count) ] : (∀ quant ∈ quantiles) 159 | Send alert 160 | 161 | 162 | ## **Conclusion** 163 | My project aimed to explore the connections between metadata labels, time series forecasting, and anomaly detection in an attempt to gain valuable quantitative insights into Prometheus metrics. The summer started with a dive into Spark and Prophet modelling. We found Spark to be very clunky for quick analysis of Prometheus data, so we next moved to local data with Pandas DataFrames. With these DataFrames, we applied various metadata analysis and forecasting techniques to our data. 164 | 165 | In addition to the analysis here, my teammates worked on deploying a real-time anomaly detection pipeline on Prometheus metrics for sending automatic alerts to developers. 166 | 167 | For future work, we would like to do more forecasting and anomaly detection testing on a variety of metrics. Since we have the pipeline working, this will probably require setting up real-time anomaly detection for different targets and monitoring them in Grafana.
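#### **Sketch: Summary Alerting Decision Chain**

The short Python sketch below is one possible reading of the decision chain from the Anomaly Detection Rule Outline above; it is illustrative only and is not part of the repo's code. The `ad` callable stands in for any single-series anomaly detector (for example, a wrapper around the detectors in `anomaly_detector.py`), and the ∀ conditions in the outline are interpreted as "for some two/three distinct quantiles".

```python
def should_alert(ad, quantiles, sum_ts, count_ts):
    """Decide whether to alert for one Summary metric.

    ad        -- callable that returns True if a single time series is anomalous
    quantiles -- dict mapping quantile label (e.g. "0.5") to its time series
    sum_ts, count_ts -- the sum and count time series of the metric
    """
    q_flags = {q: ad(ts) for q, ts in quantiles.items()}
    n_anomalous = sum(q_flags.values())
    sum_flag, count_flag = ad(sum_ts), ad(count_ts)

    if "0.5" in quantiles:
        # alert if the median quantile is anomalous, or any three quantiles are
        if q_flags["0.5"] or n_anomalous >= 3:
            return True
    elif n_anomalous >= 2:
        # no median quantile reported: alert if any two quantiles are anomalous
        return True

    # sum/count based rules
    if sum_flag and count_flag:
        return True
    if n_anomalous >= 1 and (sum_flag or count_flag):
        return True
    return False
```

For a Histogram metric, the same function could be reused by passing the bucket time series in place of `quantiles`.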
168 | -------------------------------------------------------------------------------- /anomaly_detector.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from scipy.stats import norm 5 | 6 | import matplotlib.transforms as mtransforms 7 | 8 | class Accumulator: 9 | def __init__(self,thresh): 10 | self._counter = 0 11 | self.thresh = thresh 12 | def inc(self, val): 13 | self._counter += val 14 | def count(self): 15 | return self._counter 16 | 17 | class AnomalyDetector: 18 | def __init__(self, window=8000, small_window=80, epsilon=0.61, bounds_thresh=22000, peak_thresh=130000, acc_thresh=1000): 19 | # accumulator parameters 20 | self.large_window = window 21 | self.small_window = small_window 22 | self.epsilon = epsilon 23 | # tail probability parameters 24 | self.bounds_thresh = bounds_thresh 25 | self.peak_thresh = peak_thresh 26 | self.acc_thresh = acc_thresh 27 | 28 | def anomaly_tail_distribution(self, w, w_prime): 29 | if len(w) != self.large_window: 30 | return "ERROR: input values do not match window size" 31 | mu = np.mean(w) 32 | std = np.std(w) 33 | mu_bar = np.mean(w_prime) 34 | 35 | L_t = norm.sf(((mu_bar - mu)/std)) 36 | # print(L_t) 37 | if L_t >= 1 - self.epsilon: 38 | return 1 39 | return 0 40 | 41 | def anomaly_accumulator(self, y, y_hat): 42 | s_t = [] 43 | anomaly_inds = [] 44 | acc_thresh = self.acc_thresh 45 | acc = Accumulator(acc_thresh) 46 | for i in range(0, len(y_hat)): 47 | diff = y_hat[i] - y[i] 48 | if abs(diff) > self.bounds_thresh: 49 | # upper bound anomaly, increment counter 50 | acc.inc(1) 51 | elif y[i] > self.peak_thresh: 52 | # found peak, decrement so that acc will decay to 0 53 | acc.inc(-3) 54 | else: 55 | # no anomaly, decrement by 2 56 | acc.inc(-2) 57 | 58 | if acc.count() > acc.thresh: 59 | anomaly_inds.append(i) 60 | 61 | s_t.append(max(diff, 0)) 62 | return s_t, anomaly_inds 63 | def get_anomalies(self, y, y_hat): 64 | if len(y) != len(y_hat): 65 | return "ERROR: lengths of inputs do not match" 66 | s_t, anomaly_inds_acc = self.anomaly_accumulator(y, y_hat) 67 | cum_window = self.large_window+self.small_window 68 | 69 | anomaly_inds_tail = [] 70 | print("st:", len(s_t)) 71 | print("cum_wind:", cum_window) 72 | for i in range(0,(len(s_t)-cum_window)): 73 | window = s_t[i:int(i+self.large_window)] 74 | small_window = s_t[int(i+self.large_window):int(i+cum_window)] 75 | val = self.anomaly_tail_distribution(window, small_window) 76 | anomaly_inds_tail.append(val) 77 | anomaly_inds_tail = np.argwhere(anomaly_inds_tail).flatten() 78 | 79 | print("a_i_tail: ", len(anomaly_inds_tail)) 80 | print("a_i_accum: ", len(anomaly_inds_acc)) 81 | # get intersection of both 82 | set_tail = set(anomaly_inds_tail) 83 | set_acc = set(anomaly_inds_acc) 84 | flag_anomaly = set_tail.intersection(set_acc) 85 | return flag_anomaly 86 | 87 | def detect_anomalies(predictions, data): 88 | if len(predictions) != len(data) : 89 | raise IndexError 90 | 91 | # parameters 92 | lower_bound_thresh = predictions["yhat_lower"].min() 93 | upper_bound_thresh = predictions["yhat_upper"].max() 94 | diff_thresh = 2*data["values"].std() 95 | acc_thresh = int(0.1*np.shape(predictions)[0]) 96 | epsilon = .1 97 | 98 | diffs = [] 99 | acc = Accumulator(acc_thresh) 100 | preds = np.array(predictions["yhat"]) 101 | dat = np.array(data["values"]) 102 | for i in range(0, np.shape(predictions)[0]): 103 | diff = preds[i] - dat[i] 104 | if abs(diff) > diff_thresh: 105 | # upper bound 
anomaly, increment counter 106 | acc.inc(1) 107 | elif dat[i] < lower_bound_thresh: 108 | # found trough, decrement so that acc will decay to 0 109 | acc.inc(-3) 110 | elif dat[i] > upper_bound_thresh: 111 | # found peak, decrement so that acc will decay to 0 112 | acc.inc(-3) 113 | else: 114 | # no anomaly, decrement by 2 115 | acc.inc(-2) 116 | 117 | diffs.append(max(diff, 0)) 118 | 119 | if acc.count() > acc.thresh: 120 | acc_anomaly = True 121 | else: 122 | acc_anomaly = False 123 | w_size = int(0.8*len(data)) 124 | w_prime_size = len(data) - w_size 125 | 126 | w = diffs[0:w_size] 127 | w_prime = diffs[w_size:] 128 | 129 | w_mu = np.mean(w) 130 | w_std = np.std(w) 131 | w_prime_mu = np.mean(w_prime) 132 | 133 | if w_std == 0: 134 | L_t = 0 135 | else: 136 | L_t = 1 - norm.sf((w_prime_mu - w_mu)/w_std) 137 | 138 | print(L_t) 139 | if L_t >= 1 - epsilon: 140 | tail_prob_anomaly = True 141 | else: 142 | tail_prob_anomaly = False 143 | 144 | return acc_anomaly and tail_prob_anomaly 145 | 146 | 147 | 148 | def graph(train, test, forecast, anomalies, metric_name): 149 | len_train = len(train) 150 | fig = plt.figure(figsize=(20,10)) 151 | ax = plt.axes() 152 | ax.plot(np.array(train["timestamps"]), np.array(train["values"]), 'b', label = 'train', linewidth = 3) 153 | ax.plot(np.array(test["timestamps"]), np.array(test["values"]), 'g', label = 'test', linewidth = 3) 154 | ax.plot(np.array(forecast["ds"]), np.array(forecast["yhat"]), 'y', label = 'yhat') 155 | title = "Forecast for " + metric_name 156 | ax.set_title(title) 157 | ax.set_xlabel("Timestamp") 158 | ax.set_ylabel("Value") 159 | trans = mtransforms.blended_transform_factory(ax.transData, ax.transAxes) 160 | for a in anomalies: 161 | bool_arr = np.repeat(False,len(forecast)) 162 | for i in range(a,a+100): 163 | bool_arr[i] = True 164 | ax.fill_between(np.array(forecast["ds"]),0,1, where=bool_arr, facecolor='red', alpha=0.5, transform=trans) 165 | plt.legend(loc=3) 166 | plt.show() 167 | 168 | metric_name = "http_request_duration_microseconds_quantile_728" 169 | filename = "../fourier_forecasts/forecast_" + metric_name + ".pkl" 170 | pkl_file = open(filename, "rb") 171 | forecast = pickle.load(pkl_file) 172 | train = pickle.load(pkl_file) 173 | test = pickle.load(pkl_file) 174 | pkl_file.close() 175 | forecast = forecast[np.shape(train)[0]:] 176 | print(len(forecast)) 177 | print(len(test)) 178 | 179 | inc = 0 180 | anomaly_inds = [] 181 | for i in range(0,len(test)-100,100): 182 | if detect_anomalies(forecast[i:i+100], test[i:i+100]) : 183 | inc += 1 184 | anomaly_inds.append(i) 185 | print(inc) 186 | 187 | #ad = AnomalyDetector() 188 | #anomaly_inds = ad.get_anomalies(test, forecast[-len(test):]) 189 | graph(train, test, forecast, anomaly_inds, metric_name) 190 | -------------------------------------------------------------------------------- /arima_train.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import pandas as pd 4 | import warnings 5 | import matplotlib.pyplot as plt 6 | warnings.filterwarnings("ignore") 7 | import collections 8 | import argparse 9 | from statsmodels.tsa.arima_model import ARIMA 10 | import statsmodels.api as sm 11 | from datetime import datetime 12 | from pandas.tools.plotting import autocorrelation_plot 13 | 14 | 15 | class Exp_Smoothing: 16 | def __init__(self, train, test): 17 | self.train = np.array(train["values"]) 18 | self.ds_train = np.array(train["timestamps"]) 19 | self.test = np.array(test["values"]) 20 | 
self.ds_test = np.array(test["timestamps"]) 21 | 22 | def fit_model(self, n_predict): 23 | start_date = min(self.ds_train) 24 | print(type(start_date)) 25 | dates = sm.tsa.datetools.dates_from_range("2018m3", length=len(self.ds_train)) 26 | 27 | df_train = pd.Series(self.train, index=dates) 28 | #autocorrelation_plot(df_train) 29 | #plt.show() 30 | model = ARIMA(df_train, order=(1,0,1)) 31 | model_fit = model.fit(disp=0) 32 | self.forecast = model_fit.forecast(steps=len(test)) 33 | 34 | ds = self.ds_test 35 | 36 | self.forecast = pd.DataFrame({"ds": ds, "yhat": self.forecast[0]}) 37 | print(len(self.forecast["yhat"])) 38 | print(len(self.test)) 39 | return self.forecast 40 | 41 | def graph(self, metric_name, key): 42 | plt.figure(figsize=(40,10)) 43 | 44 | 45 | plt.plot(self.ds_train, self.train, 'b', label = 'train', linewidth = 3) 46 | print(np.array(self.forecast["yhat"])) 47 | plt.plot(self.ds_test, self.test, 'k', label = 'test', linewidth = 3) 48 | plt.plot(np.array(self.ds_test), np.array(self.forecast["yhat"]), 'g', label = 'yhat') 49 | # pl.plot(np.array(self.forecast["ds"]), np.array(self.forecast["yhat_upper"]), 'y', label = 'yhat_upper') 50 | # pl.plot(np.array(self.forecast["ds"]), np.array(self.forecast["yhat_lower"]), 'y', label = 'yhat_lower') 51 | 52 | plt.legend() 53 | plt.savefig("../testing/exp_smoothing_graphs/graph_" + metric_name + "_" + str(key) + ".png") 54 | plt.show() 55 | 56 | 57 | 58 | def calc_delta(vals): 59 | diff = vals - np.roll(vals, 1) 60 | diff[0] = 0 61 | return diff 62 | 63 | def monotonically_inc(vals): 64 | # check corner case 65 | if len(vals) == 1: 66 | return True 67 | diff = calc_delta(vals) 68 | diff[np.where(vals == 0)] = 0 69 | 70 | if ((diff < 0).sum() == 0): 71 | return True 72 | else: 73 | return False 74 | 75 | if __name__ == "__main__": 76 | parser = argparse.ArgumentParser(description="frun Prophet training on time series") 77 | 78 | parser.add_argument("--metric", type=str, help='metric name', required=True) 79 | 80 | parser.add_argument("--key", type=int, help='key number') 81 | 82 | args = parser.parse_args() 83 | 84 | metric_name = args.metric 85 | 86 | pkl_file = open("../pkl_data/" + metric_name + "_dataframes.pkl", "rb") 87 | dfs = pickle.load(pkl_file) 88 | pkl_file.close() 89 | key_vals = list(dfs.keys()) 90 | 91 | selected = [args.key] 92 | for ind in selected: 93 | key = key_vals[ind] 94 | df = dfs[key] 95 | df = df.sort_values(by=['timestamps']) 96 | 97 | print(key) 98 | df["values"] = df["values"].apply(pd.to_numeric) 99 | vals = np.array(df["values"].tolist()) 100 | 101 | # check if metric is a counter, if so, run AD on difference 102 | if monotonically_inc(vals): 103 | print("monotonically_inc") 104 | vals = calc_delta(vals) 105 | df["values"] = vals 106 | 107 | train = df[0:int(0.7*len(vals))] 108 | test = df[int(0.7*len(vals)):] 109 | print(len(test)) 110 | es = Exp_Smoothing(train, test) 111 | forecast = es.fit_model(len(test)) 112 | 113 | f = open("../testing/exp_smoothing_forecasts/forecast_" + metric_name + "_" + str(args.key) + ".pkl", "wb") 114 | pickle.dump(forecast, f) 115 | pickle.dump(train, f) 116 | pickle.dump(test,f) 117 | f.close() 118 | 119 | es.graph(metric_name, args.key) 120 | -------------------------------------------------------------------------------- /compare_fourier_prophet.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from matplotlib.pylab import plt 3 | import numpy as np 4 | import argparse 5 | 6 | def graph(train_df, test_df, 
p_forecast, f_forecast, metric, key): 7 | fig = plt.figure(figsize=(40,10)) 8 | forecast_ds = np.array(f_forecast["ds"]) 9 | print(len(forecast_ds)) 10 | print(len(train_df)) 11 | forecast_ds = forecast_ds[int(train_df["values"].count()):] 12 | 13 | 14 | plt.plot(np.array(train_df["ds"]), np.array(train_df["y"]),'b', label="train", linewidth=3) 15 | plt.plot(np.array(test_df["ds"]), np.array(test_df["y"]), 'k', label="test", linewidth=3) 16 | 17 | plt.savefig( "../testing/compare_fourier_prophet/" + str(key) + "_raw_" + metric + ".png", transparent=True) 18 | prophet = np.array(p_forecast["yhat"]) 19 | prophet_upper = np.array(p_forecast["yhat_upper"]) 20 | prophet_lower = np.array(p_forecast["yhat_lower"]) 21 | 22 | fourier = f_forecast["yhat"] 23 | fourier = fourier[len(train_df["values"]):] 24 | print(len(forecast_ds)) 25 | print(len(fourier)) 26 | plt.plot(forecast_ds, fourier, 'g', label="fourier_yhat", linewidth=3) 27 | plt.savefig( "../testing/compare_fourier_prophet/" + str(key) + "_fourier_" + metric + ".png", transparent=True) 28 | 29 | prophet = prophet[len(train_df["values"]):] 30 | prophet_upper = prophet_upper[len(train_df["values"]):] 31 | prophet_lower = prophet_lower[len(train_df["values"]):] 32 | plt.plot(forecast_ds, prophet, '*y', label="prophet_yhat", linewidth=3) 33 | plt.plot(forecast_ds, prophet_upper, 'y', label="yhat_upper", linewidth=3) 34 | plt.plot(forecast_ds, prophet_lower, 'y', label="yhat_lower", linewidth=3) 35 | 36 | 37 | plt.plot() 38 | plt.xlabel("Timestamp") 39 | plt.ylabel("Value") 40 | plt.legend(loc=1) 41 | plt.title("Prophet Model Forecast") 42 | plt.savefig( "../testing/compare_fourier_prophet/" + str(key) + "_compare_" + metric + ".png", transparent=True) 43 | plt.close() 44 | 45 | 46 | fig = plt.figure(figsize=(40,10)) 47 | forecast_ds = np.array(f_forecast["ds"]) 48 | forecast_ds = forecast_ds[len(train_df["values"]):] 49 | 50 | 51 | plt.plot(np.array(train_df["ds"]), np.array(train_df["y"]),'b', label="train", linewidth=3) 52 | plt.plot(np.array(test_df["ds"]), np.array(test_df["y"]), 'k', label="test", linewidth=3) 53 | 54 | prophet = np.array(p_forecast["yhat"]) 55 | prophet_upper = np.array(p_forecast["yhat_upper"]) 56 | prophet_lower = np.array(p_forecast["yhat_lower"]) 57 | prophet = prophet[len(train_df["values"]):] 58 | prophet_upper = prophet_upper[len(train_df["values"]):] 59 | prophet_lower = prophet_lower[len(train_df["values"]):] 60 | plt.plot(forecast_ds, prophet, '*y', label="prophet_yhat", linewidth=3) 61 | plt.plot(forecast_ds, prophet_upper, 'y', label="yhat_upper", linewidth=3) 62 | plt.plot(forecast_ds, prophet_lower, 'y', label="yhat_lower", linewidth=3) 63 | plt.savefig( "../testing/compare_fourier_prophet/" + str(key) + "_prophet_" + metric + ".png", transparent=True) 64 | plt.close() 65 | if __name__ == '__main__': 66 | parser = argparse.ArgumentParser(description="run Fourier training on time series") 67 | 68 | parser.add_argument("--metric", type=str, help='metric name', required=True) 69 | parser.add_argument("--key", type=int, help='key number') 70 | 71 | args = parser.parse_args() 72 | 73 | fname = "../prophet_forecasts/prophet_model_" + args.metric + "_" + str(args.key) + ".pkl" 74 | f = open(fname, "rb") 75 | p_forecast = pickle.load(f) 76 | print(len(p_forecast)) 77 | p_train = pickle.load(f) 78 | print(len(p_train)) 79 | p_test = pickle.load(f) 80 | print(len(p_test)) 81 | f.close() 82 | 83 | fname = "../fourier_forecasts/forecast_" + args.metric + "_" + str(args.key) + ".pkl" 84 | f = open(fname, "rb") 85 | 
f_forecast = pickle.load(f) 86 | print(len(f_forecast)) 87 | f_train = pickle.load(f) 88 | print(len(f_train)) 89 | f_test = pickle.load(f) 90 | print(len(f_test)) 91 | f.close() 92 | 93 | graph(p_train, p_test, p_forecast, f_forecast, args.metric, args.key) 94 | -------------------------------------------------------------------------------- /data/prometheus.example.com/go_goroutines/20180617.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/data/prometheus.example.com/go_goroutines/20180617.json.bz2 -------------------------------------------------------------------------------- /data/prometheus.example.com/go_goroutines/20180618/20180618.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/data/prometheus.example.com/go_goroutines/20180618/20180618.json.bz2 -------------------------------------------------------------------------------- /data/prometheus.example.com/go_goroutines/20180618/201806182359.json.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/data/prometheus.example.com/go_goroutines/20180618/201806182359.json.bz2 -------------------------------------------------------------------------------- /exp_smoothing_train.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import pandas as pd 4 | import warnings 5 | import matplotlib.pyplot as plt 6 | warnings.filterwarnings("ignore") 7 | import collections 8 | import argparse 9 | from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt 10 | 11 | 12 | class Exp_Smoothing: 13 | def __init__(self, train, test): 14 | self.train = np.array(train["values"]) 15 | self.ds_train = np.array(train["timestamps"]) 16 | self.test = np.array(test["values"]) 17 | self.ds_test = np.array(test["timestamps"]) 18 | 19 | def fit_model(self, n_predict): 20 | 21 | fit = SimpleExpSmoothing(self.train).fit() 22 | forecast = fit.forecast(n_predict) 23 | 24 | ds = self.ds_test 25 | 26 | self.forecast = pd.DataFrame({"ds": ds, "yhat": forecast}) 27 | 28 | return self.forecast 29 | 30 | def graph(self, metric_name, key): 31 | plt.figure(figsize=(40,10)) 32 | 33 | plt.plot(np.array(self.forecast["ds"]), np.array(self.forecast["yhat"]), 'g', label = 'yhat') 34 | plt.plot(self.ds_train, self.train, 'b', label = 'train', linewidth = 3) 35 | plt.plot(self.ds_test, self.test, 'k', label = 'test', linewidth = 3) 36 | # pl.plot(np.array(self.forecast["ds"]), np.array(self.forecast["yhat_upper"]), 'y', label = 'yhat_upper') 37 | # pl.plot(np.array(self.forecast["ds"]), np.array(self.forecast["yhat_lower"]), 'y', label = 'yhat_lower') 38 | 39 | plt.legend() 40 | plt.savefig("../testing/exp_smoothing_graphs/graph_" + metric_name + "_" + str(key) + ".png") 41 | plt.show() 42 | 43 | 44 | 45 | def calc_delta(vals): 46 | diff = vals - np.roll(vals, 1) 47 | diff[0] = 0 48 | return diff 49 | 50 | def monotonically_inc(vals): 51 | # check corner case 52 | if len(vals) == 1: 53 | return True 54 | diff = calc_delta(vals) 55 | diff[np.where(vals == 0)] = 0 56 | 57 | if ((diff < 0).sum() == 0): 58 | return True 59 | else: 60 | return False 61 | 62 | if __name__ 
== "__main__": 63 | parser = argparse.ArgumentParser(description="frun Prophet training on time series") 64 | 65 | parser.add_argument("--metric", type=str, help='metric name', required=True) 66 | 67 | parser.add_argument("--key", type=int, help='key number') 68 | 69 | args = parser.parse_args() 70 | 71 | metric_name = args.metric 72 | 73 | pkl_file = open("../pkl_data/" + metric_name + "_dataframes.pkl", "rb") 74 | dfs = pickle.load(pkl_file) 75 | pkl_file.close() 76 | key_vals = list(dfs.keys()) 77 | 78 | selected = [args.key] 79 | for ind in selected: 80 | key = key_vals[ind] 81 | df = dfs[key] 82 | df = df.sort_values(by=['timestamps']) 83 | 84 | print(key) 85 | df["values"] = df["values"].apply(pd.to_numeric) 86 | vals = np.array(df["values"].tolist()) 87 | 88 | # check if metric is a counter, if so, run AD on difference 89 | if monotonically_inc(vals): 90 | print("monotonically_inc") 91 | vals = calc_delta(vals) 92 | df["values"] = vals 93 | 94 | train = df[0:int(0.7*len(vals))] 95 | test = df[int(0.7*len(vals)):] 96 | 97 | es = Exp_Smoothing(train, test) 98 | forecast = es.fit_model(test.shape[0]) 99 | 100 | f = open("../testing/exp_smoothing_forecasts/forecast_" + metric_name + "_" + str(args.key) + ".pkl", "wb") 101 | pickle.dump(forecast, f) 102 | pickle.dump(train, f) 103 | pickle.dump(test,f) 104 | f.close() 105 | 106 | es.graph(metric_name, args.key) 107 | -------------------------------------------------------------------------------- /format_to_pandas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import json 3 | import pandas as pd 4 | import fnmatch 5 | import os 6 | import bz2 7 | import pickle 8 | import argparse 9 | import gc 10 | from pprint import pprint 11 | 12 | # read files in list and convert to pandas dataframes 13 | def load_files(files): 14 | dfs = {} 15 | for file in files: 16 | # check file format and read appropriately 17 | if file.endswith('json'): 18 | f = open(file, 'rb') 19 | else: 20 | f = bz2.BZ2File(file, 'rb') 21 | 22 | jsons = json.load(f) 23 | f.close() 24 | 25 | # iterate through packets in file 26 | for pkt in jsons: 27 | # create a new dataframe with packet timestamp and values 28 | df = pd.DataFrame.from_dict(pkt["values"]) 29 | df = df.rename( columns={0:"ds", 1:"y"}) 30 | df["ds"] = pd.to_datetime(df["ds"], unit='s') 31 | df = df.sort_values(by=["ds"]) 32 | df.y = pd.to_numeric(df['y'], errors='coerce') 33 | df = df.dropna() 34 | md = str(pkt["metric"]) 35 | # append generated dataframe and metadata to collection 36 | try: 37 | dfs[md] = dfs[md].append(df, ignore_index=True) 38 | except: 39 | dfs[md] = df 40 | return dfs 41 | 42 | # take a list of dataframes and their metadata and collapse to a 43 | # collection of unique time series (based on unique metadata) 44 | def collapse_to_unique(dfs_master, dfs_new): 45 | # iterate through metadata 46 | dfs_remaining = {} 47 | for md in dfs_new.keys(): 48 | try: 49 | # find metadata in our master list 50 | # if this throws an error, simply add it to the list 51 | dfs_master[md] = dfs_master[md].append(dfs_new[md], ignore_index=True) 52 | except: 53 | dfs_remaining[md] = dfs_new[md] 54 | return dfs_master, dfs_remaining 55 | 56 | # create pickle file containing data 57 | def save_checkpoint(pds, file): 58 | if file[-4:] != ".pkl": 59 | file = file + ".pkl" 60 | f = open(file, "wb") 61 | pickle.dump(pds, f) 62 | f.close() 63 | return file 64 | 65 | # load pickle file containing data 66 | def load_checkpoint(file): 67 | f = open(file, 
"rb") 68 | pds = pickle.load(f) 69 | f.close() 70 | return pds 71 | 72 | # load all files and convert to a list of pandas dataframes 73 | def convert_to_pandas(files, batch_size): 74 | checkpoints = [] 75 | # # separate files into batches 76 | batches = [files[batch_size*i:batch_size*(i+1)] for i in range(int(len(files)/batch_size) + 1)] 77 | print("Batches: ", len(batches)) 78 | i = 0 79 | for batch in batches: 80 | print("Load batch %i" % i, end="\r") 81 | i += 1 82 | # get new portion of dataframes and add to master set 83 | pds_new = load_files(batch) 84 | cp = save_checkpoint(pds_new, "raw_" + str(i)) 85 | checkpoints.append(cp) 86 | gc.collect() 87 | print("Loaded %i batches" % i) 88 | 89 | pds = [] 90 | # iterate checkpoint by checkpoint and add data to unique collection 91 | # of time series 92 | collapsed_fs = [] 93 | i = 0 94 | for cp in checkpoints: 95 | i += 1 96 | print("Processing batch %i" % i, end="\r") 97 | pds_new = load_checkpoint(cp) 98 | # load data in batches and combine dataframes 99 | for f in collapsed_fs: 100 | pds = load_checkpoint(f) 101 | pds, pds_new = collapse_to_unique(pds, pds_new) 102 | save_checkpoint(pds, f) 103 | gc.collect() 104 | if len(pds_new) > 0: 105 | f_new = save_checkpoint(pds_new, "collapsed_" + str(i)) 106 | # print("Generated ", f_new) 107 | collapsed_fs.append(f_new) 108 | gc.collect() 109 | print("Processed %i batches" % i) 110 | return pds 111 | 112 | # get main input arguments and return formatted data 113 | def read_input(data_folder, metric, batch_size): 114 | # metric-specific data folder 115 | folder = os.path.join(data_folder, metric) 116 | 117 | # get all files in folder 118 | files = [] 119 | for root, d_names, f_names in os.walk(folder): 120 | for f in f_names: 121 | if f.endswith('bz2') or f.endswith('json'): 122 | files.append(os.path.join(root, f)) 123 | files.sort() 124 | print("Processing %s files" % len(files)) 125 | 126 | pd_frames = convert_to_pandas(files, batch_size) 127 | 128 | return pd_frames 129 | 130 | # remove all temp pickle files generated during this program 131 | # TODO: use tempfiles for temporary files 132 | def combine_checkpoints(master_file): 133 | df = {} 134 | files = os.listdir() 135 | for file in files: 136 | if fnmatch.fnmatch(file, "collapsed_*.pkl"): 137 | try: 138 | f = open(file, "rb") 139 | dfs = pickle.load(f) 140 | f.close() 141 | df.update(dfs) 142 | except: 143 | continue 144 | os.system("rm " + file) 145 | elif fnmatch.fnmatch(file, "raw_*.pkl"): 146 | os.system("rm " + file) 147 | f = open(master_file + ".pkl", "wb") 148 | pickle.dump(df, f) 149 | f.close() 150 | 151 | def main(): 152 | print("Formatting Data") 153 | pd_frames = read_input(args.input, args.metric, args.batch_size) 154 | print("Conversion successful") 155 | 156 | os.makedirs(args.output) 157 | master_file = os.path.join(args.output, args.metric) 158 | 159 | combine_checkpoints(master_file) 160 | 161 | print("Saved data:", master_file) 162 | 163 | if __name__ == '__main__': 164 | 165 | parser = argparse.ArgumentParser(description="format time series data into an array of pandas dataframes. input folder architecture: input folder must contain a folder with the metric name. Inside the metric folder will be sum/, count/, quant/, or bucket/ according to the metric_type. ex: data/metric_name/files. 
data/ is input directory") 166 | 167 | parser.add_argument("--metric", type=str, help='metric name', required=True) 168 | 169 | parser.add_argument("-i", "--input", default='', help='input directory') 170 | 171 | parser.add_argument("-o", "--output", default='', help='output directory') 172 | 173 | parser.add_argument("--batch_size", default=1, type=int, help="number of data files to process at once. use this flag if handling big dataset (recommended: 20)") 174 | 175 | 176 | args = parser.parse_args() 177 | 178 | main() -------------------------------------------------------------------------------- /format_to_pandas_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # mkdir data/prometheus.example.com/ 4 | 5 | time python format_to_pandas.py \ 6 | --metric go_goroutines \ 7 | --input data/prometheus.example.com \ 8 | --output data/prometheus.example.com_pkl \ 9 | --batch_size 20 10 | -------------------------------------------------------------------------------- /fourier_train.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | from numpy import fft 4 | import pandas as pd 5 | import warnings 6 | import matplotlib.pyplot as plt 7 | warnings.filterwarnings("ignore") 8 | import collections 9 | import argparse 10 | 11 | class FourierForecast: 12 | def __init__(self, train, test): 13 | self.train = np.array(train["values"]) 14 | self.ds_train = np.array(train["timestamps"]) 15 | self.test = np.array(test["values"]) 16 | self.ds_test = np.array(test["timestamps"]) 17 | 18 | def fourierExtrapolation(self, n_predict, n_harm): 19 | n = self.train.size # number of harmonics in model 20 | t = np.arange(0, n) 21 | p = np.polyfit(t, self.train, 1) # find linear trend in x 22 | train_notrend = self.train - p[0] * t # detrended x 23 | train_freqdom = fft.fft(train_notrend) # detrended x in frequency domain 24 | f = fft.fftfreq(n) # frequencies 25 | indexes = np.arange(n).tolist() 26 | 27 | # sort indexes by frequency, lower -> higher 28 | indexes.sort(key = lambda i:np.absolute(f[i])) 29 | 30 | t = np.arange(0, n + n_predict) 31 | restored_sig = np.zeros(t.size) 32 | for i in indexes[:1 + n_harm * 2]: 33 | ampli = np.absolute(train_freqdom[i]) / n # amplitude 34 | phase = np.angle(train_freqdom[i]) # phase 35 | restored_sig += ampli * np.cos(2 * np.pi * f[i] * t + phase) 36 | return restored_sig + p[0] * t 37 | 38 | def fit_model(self, n_predict): 39 | 40 | minimum = np.min(self.train) 41 | stddev = np.std(self.train) 42 | 43 | upper = np.max(self.train) + stddev 44 | lower = minimum - stddev 45 | 46 | if minimum > 0: 47 | lower = max(0, lower) 48 | 49 | # n_harm = 1/3 of number of data points was chosen by visual inspection 50 | n_harm = int(len(self.train)/3) 51 | forecast = self.fourierExtrapolation(n_predict, n_harm) 52 | 53 | ds = np.append(self.ds_train, self.ds_test) 54 | 55 | self.forecast = pd.DataFrame({"ds": ds, "yhat": forecast, "yhat_upper": upper,"yhat_lower": lower}) 56 | 57 | return self.forecast 58 | 59 | def graph(self): 60 | plt.figure(figsize=(40,10)) 61 | # ds = np.arange(0, len(np.array(self.forecast["ds"]))) 62 | # ds_train = np.arange(0,len(self.ds_train)) 63 | # ds_test = np.arange(len(self.ds_train),len(self.ds_train) + len(self.ds_test)) 64 | # plt.plot(ds_train, self.train, 'b', label = 'train', linewidth = 3) 65 | # plt.plot(ds_test, self.test, 'g', label = 'test', linewidth = 3) 66 | # plt.plot(ds, 
np.array(self.forecast["yhat"]), 'y', label = 'yhat') 67 | ds_forecast = np.array(self.forecast["ds"]) 68 | forecast = np.array(self.forecast["yhat"]) 69 | 70 | ds_forecast = ds_forecast[len(self.ds_train):] 71 | forecast = forecast[len(self.ds_train):] 72 | plt.plot(self.ds_train, self.train, 'b', label = 'train', linewidth = 3) 73 | plt.plot(self.ds_test, self.test, 'g', label = 'test', linewidth = 3) 74 | plt.plot(ds_forecast,forecast, 'y', label = 'yhat') 75 | 76 | # plt.plot(np.array(self.forecast["ds"]), np.array(self.forecast["yhat_upper"]), 'y', label = 'yhat_upper') 77 | # plt.plot(np.array(self.forecast["ds"]), np.array(self.forecast["yhat_lower"]), 'y', label = 'yhat_lower') 78 | 79 | plt.legend() 80 | 81 | def calc_delta(vals): 82 | diff = vals - np.roll(vals, 1) 83 | diff[0] = 0 84 | return diff 85 | 86 | def monotonically_inc(vals): 87 | # check corner case 88 | if len(vals) == 1: 89 | return True 90 | diff = calc_delta(vals) 91 | diff[np.where(vals == 0)] = 0 92 | 93 | if ((diff < 0).sum() == 0): 94 | return True 95 | else: 96 | return False 97 | 98 | 99 | if __name__ == "__main__": 100 | parser = argparse.ArgumentParser(description="frun Prophet training on time series") 101 | 102 | parser.add_argument("--metric", type=str, help='metric name', required=True) 103 | 104 | parser.add_argument("--key", type=int, help='key number') 105 | 106 | args = parser.parse_args() 107 | 108 | metric_name = args.metric 109 | 110 | pkl_file = open("../pkl_data/" + metric_name + "_dataframes.pkl", "rb") 111 | #pkl_file = open("../data/real_data_test.pkl", "rb") 112 | dfs = pickle.load(pkl_file) 113 | pkl_file.close() 114 | key_vals = list(dfs.keys()) 115 | 116 | selected = [args.key] 117 | for ind in selected: 118 | key = key_vals[ind] 119 | #df = dfs["{'__name__': 'http_request_duration_microseconds', 'beta_kubernetes_io_arch': 'amd64', 'beta_kubernetes_io_instance_type': 'm4.xlarge', 'beta_kubernetes_io_os': 'linux', 'failure_domain_beta_kubernetes_io_region': 'us-east-2', 'failure_domain_beta_kubernetes_io_zone': 'us-east-2a', 'handler': 'prometheus', 'hostname': 'free-stg-node-compute-e0756', 'instance': 'ip-172-31-76-144.us-east-2.compute.internal', 'job': 'kubernetes-nodes-exporter', 'kubernetes_io_hostname': 'ip-172-31-76-144.us-east-2.compute.internal', 'logging_infra_fluentd': 'true', 'node_role_kubernetes_io_compute': 'true', 'quantile': '0.99', 'region': 'us-east-2', 'type': 'compute'}"] 120 | df = dfs[key] 121 | # df["timestamps"] = df["ds"] 122 | # df["values"] = df["y"] 123 | df = df.sort_values(by=['timestamps']) 124 | 125 | print(key) 126 | df["values"] = df["values"].apply(pd.to_numeric) 127 | vals = np.array(df["values"].tolist()) 128 | 129 | # check if metric is a counter, if so, run AD on difference 130 | if monotonically_inc(vals): 131 | print("monotonically_inc") 132 | vals = calc_delta(vals) 133 | df["values"] = vals 134 | 135 | train = df[0:int(0.7*len(vals))] 136 | test = df[int(0.7*len(vals)):] 137 | 138 | # graph(vals) 139 | ff = FourierForecast(train, test) 140 | forecast = ff.fit_model(test.shape[0]) 141 | 142 | f = open("../fourier_forecasts/forecast_" + metric_name + "_" + str(args.key) + ".pkl", "wb") 143 | pickle.dump(forecast, f) 144 | pickle.dump(train, f) 145 | pickle.dump(test,f) 146 | f.close() 147 | 148 | ff.graph() 149 | plt.savefig("../presentation/graphs/" + str(args.key) + "_" + args.metric + ".png", transparent=True) 150 | plt.close() 151 | -------------------------------------------------------------------------------- /graphing_ts.py: 
-------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | from numpy import fft 4 | import pandas as pd 5 | import warnings 6 | import matplotlib.pyplot as plt 7 | warnings.filterwarnings("ignore") 8 | from scipy.stats import chisquare 9 | import collections 10 | import bz2 11 | def fourierExtrapolation(x, n_predict, n_harm): 12 |     n = x.size 13 |     #n_harm = 100 # number of harmonics in model 14 |     t = np.arange(0, n) 15 |     p = np.polyfit(t, x, 1) # find linear trend in x 16 |     x_notrend = x - p[0] * t # detrended x 17 |     x_freqdom = fft.fft(x_notrend) # detrended x in frequency domain 18 |     f = fft.fftfreq(n) # frequencies 19 |     indexes = np.arange(n).tolist() 20 |     # sort indexes by frequency, lower -> higher 21 |     indexes.sort(key = lambda i:np.absolute(f[i])) 22 | 23 |     t = np.arange(0, n + n_predict) 24 |     restored_sig = np.zeros(t.size) 25 |     for i in indexes[:1 + n_harm * 2]: 26 |         ampli = np.absolute(x_freqdom[i]) / n # amplitude 27 |         phase = np.angle(x_freqdom[i]) # phase 28 |         restored_sig += ampli * np.cos(2 * np.pi * f[i] * t + phase) 29 |     return restored_sig + p[0] * t 30 | 31 | def fit_model(train, n_predict): 32 | 33 |     model = collections.namedtuple('model',['upper','lower','forecast']) 34 | 35 |     minimum = np.min(train) 36 |     stddev = np.std(train) 37 | 38 |     model.upper = np.max(train) + stddev 39 |     model.lower = minimum - stddev 40 |     if minimum > 0: 41 |         model.lower = max(0, model.lower) 42 | 43 |     # n_harm = 1/3 of number of data points was chosen by visual inspection 44 |     n_harm = int(len(train)/3) 45 | 46 |     model.forecast = fourierExtrapolation(train, n_predict, n_harm) 47 | 48 |     return model 49 | 50 | def window_AD(forecast, test, win_size): 51 |     num_bins = 5 52 | 53 |     new_forecast = forecast[-len(test):] 54 |     # windows = [np.arange(win_size*i,win_size*(i+1)) for i in range(int(len(test)/win_size) + 1)] 55 |     # windows[-1] = np.arange(windows[-1][0], len(test)) 56 |     win_test = test[1:win_size] 57 |     win_forecast = new_forecast[1:win_size] 58 |     p_vals = [] 59 |     for j in range(0, len(test)): 60 |         print(j+len(forecast)-len(test)) 61 |         win_test = test[1:win_size] 62 |         for i in range(0,len(test)): 63 |             test_hist, bin_edges = np.histogram(win_test, bins=num_bins) 64 |             big_vals = np.where(win_forecast > bin_edges[-1])[0] # np.where returns a tuple; keep the index array so len() counts matches 65 |             small_vals = np.where(win_forecast < bin_edges[0])[0] 66 |             f_hist, bin_edges = np.histogram(win_forecast, bins=bin_edges) 67 |             # print(np.sum(test_hist)) 68 |             # print(np.sum(f_hist)) 69 |             f_hist[-1] = f_hist[-1] + len(big_vals) 70 |             f_hist[0] = f_hist[0] + len(small_vals) 71 |             test_hist = [x+1 for x in test_hist] 72 |             f_hist = [x+1 for x in f_hist] 73 |             # print(test_hist) 74 |             # print(f_hist) 75 |             vals = chisquare(f_hist, f_exp=test_hist ) 76 |             # print(vals[1]) 77 |             p_vals.append(vals[1]) 78 |             # win_test = np.roll(win_test, 1) 79 |             new_forecast = np.roll(new_forecast, 1) 80 |             win_forecast = new_forecast[1:win_size] 81 |             # if p_val > .75: 82 |             #     return True 83 |         print(np.max(np.array(p_vals))) 84 |         p_vals = [] 85 |         test = np.roll(test, 1) 86 |     return False 87 | 88 | def detect_anomalies(model, test): 89 |     if np.max(test) > model.upper: 90 |         print("yep") 91 |         #return "point-wise anomaly - upper bound exceeded\nbound: " + str(model.upper) + "\nexceeded value: " + str(np.max(test)) 92 |     if np.min(test) < model.lower: 93 |         print('yep') 94 |         #return "point-wise anomaly - lower bound exceeded" 95 |     else: 96 |         # run histogram-based AD 97 |         if window_AD(model.forecast, test, 60): 98 |             return "5-min window anomaly
detected" 99 | return "running histogram-based AD" 100 | return "no anomalies detected" 101 | 102 | def graph(series): 103 | x_series = np.arange(series.size) 104 | plt.plot(x_series, series, 'b', label = 'x', linewidth = 3) 105 | # pl.plot(x_test, test, 'g*', label = 'x', linewidth = 3) 106 | #pl.plot(x_extrapolation, extrapolation, 'r', label = 'extrapolation') 107 | 108 | plt.legend() 109 | pkl_file = open("../pkl_data/http_request_duration_microseconds_quantile_dataframes.pkl", "rb") 110 | dfs = pickle.load(pkl_file) 111 | pkl_file.close() 112 | print(type(dfs)) 113 | key_vals = list(dfs.keys()) 114 | print(len(key_vals)) 115 | 116 | 117 | pkl_file = open("../data/real_data_test.pkl", "wb") 118 | pickle.dump(dfs, pkl_file) 119 | pkl_file.close() 120 | i = 0 121 | for key in key_vals[0:800]: 122 | print(key) 123 | df = dfs["{'__name__': 'http_request_duration_microseconds', 'beta_kubernetes_io_arch': 'amd64', 'beta_kubernetes_io_instance_type': 'm4.xlarge', 'beta_kubernetes_io_os': 'linux', 'failure_domain_beta_kubernetes_io_region': 'us-east-2', 'failure_domain_beta_kubernetes_io_zone': 'us-east-2a', 'handler': 'prometheus', 'hostname': 'free-stg-node-compute-e0756', 'instance': 'ip-172-31-76-144.us-east-2.compute.internal', 'job': 'kubernetes-nodes-exporter', 'kubernetes_io_hostname': 'ip-172-31-76-144.us-east-2.compute.internal', 'logging_infra_fluentd': 'true', 'node_role_kubernetes_io_compute': 'true', 'quantile': '0.99', 'region': 'us-east-2', 'type': 'compute'}"] 124 | # df = dfs[key] 125 | df["values"] = df["values"].apply(pd.to_numeric) 126 | df = df.sort_values(by=['timestamps']) 127 | vals = np.array(df["values"].tolist()) 128 | # train = vals[0:int(0.7*len(vals))] 129 | # test = vals[int(0.7*len(vals)):] 130 | # print(np.max(test)) 131 | # print(np.where(test == np.max(test))) 132 | # x_vals = np.arange(0,len(vals)) 133 | # x_test = x_vals[int(0.7*len(vals)):] 134 | # x_train = x_vals[0:int(0.7*len(vals))] 135 | # mdl = fit_model(train, len(test)) 136 | print(i) 137 | i += 1 138 | # print(detect_anomalies(mdl, test)) 139 | 140 | graph(vals) 141 | # plt.show() 142 | plt.savefig("../time_series_graphing/graphs_http_total/fourier_" + str(i) + ".png") 143 | plt.close() -------------------------------------------------------------------------------- /holt_winters_train.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import pandas as pd 4 | import warnings 5 | import matplotlib.pyplot as plt 6 | warnings.filterwarnings("ignore") 7 | import collections 8 | import argparse 9 | from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt 10 | 11 | 12 | class Exp_Smoothing: 13 | def __init__(self, train, test): 14 | self.train = np.array(train["values"]) 15 | self.ds_train = np.array(train["timestamps"]) 16 | self.test = np.array(test["values"]) 17 | self.ds_test = np.array(test["timestamps"]) 18 | 19 | def fit_model(self, n_predict): 20 | 21 | fit = ExponentialSmoothing(self.train, seasonal_periods=4, trend='add', seasonal='add').fit(use_boxcox=True) 22 | forecast = fit.forecast(n_predict) 23 | 24 | ds = self.ds_test 25 | 26 | self.forecast = pd.DataFrame({"ds": ds, "yhat": forecast}) 27 | 28 | return self.forecast 29 | 30 | def graph(self, metric_name, key): 31 | plt.figure(figsize=(40,10)) 32 | 33 | plt.plot(np.array(self.forecast["ds"]), np.array(self.forecast["yhat"]), 'y', label = 'yhat') 34 | plt.plot(self.ds_train, self.train, '*b', label = 'train', linewidth = 3) 35 | 
plt.plot(self.ds_test, self.test, '*g', label = 'test', linewidth = 3) 36 |         # pl.plot(np.array(self.forecast["ds"]), np.array(self.forecast["yhat_upper"]), 'y', label = 'yhat_upper') 37 |         # pl.plot(np.array(self.forecast["ds"]), np.array(self.forecast["yhat_lower"]), 'y', label = 'yhat_lower') 38 | 39 |         plt.legend() 40 |         plt.savefig("../testing/exp_smoothing_graphs/graph_" + metric_name + "_" + str(key) + ".png") 41 |         plt.show() 42 | 43 | 44 | 45 | def calc_delta(vals): 46 |     diff = vals - np.roll(vals, 1) 47 |     diff[0] = 0 48 |     return diff 49 | 50 | def monotonically_inc(vals): 51 |     # check corner case 52 |     if len(vals) == 1: 53 |         return True 54 |     diff = calc_delta(vals) 55 |     diff[np.where(vals == 0)] = 0 56 | 57 |     if ((diff < 0).sum() == 0): 58 |         return True 59 |     else: 60 |         return False 61 | 62 | if __name__ == "__main__": 63 |     parser = argparse.ArgumentParser(description="run Holt-Winters exponential smoothing training on time series") 64 | 65 |     parser.add_argument("--metric", type=str, help='metric name', required=True) 66 | 67 |     parser.add_argument("--key", type=int, help='key number') 68 | 69 |     args = parser.parse_args() 70 | 71 |     metric_name = args.metric 72 | 73 |     pkl_file = open("../pkl_data/" + metric_name + "_dataframes.pkl", "rb") 74 |     dfs = pickle.load(pkl_file) 75 |     pkl_file.close() 76 |     key_vals = list(dfs.keys()) 77 | 78 |     selected = [args.key] 79 |     for ind in selected: 80 |         key = key_vals[ind] 81 |         df = dfs[key] 82 |         df = df.sort_values(by=['timestamps']) 83 | 84 |         print(key) 85 |         df["values"] = df["values"].apply(pd.to_numeric) 86 |         vals = np.array(df["values"].tolist()) 87 | 88 |         # check if metric is a counter, if so, run AD on difference 89 |         if monotonically_inc(vals): 90 |             print("monotonically_inc") 91 |             vals = calc_delta(vals) 92 |             df["values"] = vals 93 | 94 |         train = df[0:int(0.7*len(vals))] 95 |         test = df[int(0.7*len(vals)):] 96 | 97 |         es = Exp_Smoothing(train, test) 98 |         forecast = es.fit_model(test.shape[0]) 99 | 100 |         f = open("../testing/exp_smoothing_forecasts/forecast_" + metric_name + "_" + str(args.key) + ".pkl", "wb") 101 |         pickle.dump(forecast, f) 102 |         pickle.dump(train, f) 103 |         pickle.dump(test,f) 104 |         f.close() 105 | 106 |         es.graph(metric_name, args.key) 107 | -------------------------------------------------------------------------------- /imgs/anomaly_detection1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/anomaly_detection1.png -------------------------------------------------------------------------------- /imgs/anomaly_detection2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/anomaly_detection2.png -------------------------------------------------------------------------------- /imgs/arima3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/arima3.png -------------------------------------------------------------------------------- /imgs/compare_prophet_fourier3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/compare_prophet_fourier3.png
-------------------------------------------------------------------------------- /imgs/compare_prophet_fourier4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/compare_prophet_fourier4.png -------------------------------------------------------------------------------- /imgs/compare_prophet_fourier5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/compare_prophet_fourier5.png -------------------------------------------------------------------------------- /imgs/detect_anomaly_accumulator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/detect_anomaly_accumulator.png -------------------------------------------------------------------------------- /imgs/detect_anomaly_combined.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/detect_anomaly_combined.png -------------------------------------------------------------------------------- /imgs/detect_anomaly_tail_prob.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/detect_anomaly_tail_prob.png -------------------------------------------------------------------------------- /imgs/example_ts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/example_ts.png -------------------------------------------------------------------------------- /imgs/exp_smoothing3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/exp_smoothing3.png -------------------------------------------------------------------------------- /imgs/forecasting_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/forecasting_data.png -------------------------------------------------------------------------------- /imgs/fourier3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/fourier3.png -------------------------------------------------------------------------------- /imgs/fourier4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/fourier4.png -------------------------------------------------------------------------------- /imgs/fourier_extrapolation.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/fourier_extrapolation.png -------------------------------------------------------------------------------- /imgs/fourier_extrapolation_behind.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/fourier_extrapolation_behind.png -------------------------------------------------------------------------------- /imgs/histogram_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/histogram_graph.png -------------------------------------------------------------------------------- /imgs/histogram_graph2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/histogram_graph2.png -------------------------------------------------------------------------------- /imgs/imgs/detect_anomaly_accumulator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/imgs/detect_anomaly_accumulator.png -------------------------------------------------------------------------------- /imgs/imgs/detect_anomaly_combined.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/imgs/detect_anomaly_combined.png -------------------------------------------------------------------------------- /imgs/imgs/detect_anomaly_tail_prob.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/imgs/detect_anomaly_tail_prob.png -------------------------------------------------------------------------------- /imgs/imgs/example_ts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/imgs/example_ts.png -------------------------------------------------------------------------------- /imgs/imgs/fourier_extrapolation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/imgs/fourier_extrapolation.png -------------------------------------------------------------------------------- /imgs/imgs/fourier_extrapolation_behind.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/imgs/fourier_extrapolation_behind.png -------------------------------------------------------------------------------- /imgs/imgs/partitioned_ts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/imgs/partitioned_ts.png 
-------------------------------------------------------------------------------- /imgs/metadata_instance_label_cloudprovider_aws_api_request_duration_seconds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/metadata_instance_label_cloudprovider_aws_api_request_duration_seconds.png -------------------------------------------------------------------------------- /imgs/metadata_instance_label_kubelet_docker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/metadata_instance_label_kubelet_docker.png -------------------------------------------------------------------------------- /imgs/metadata_operation_label_kubelet_docker2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/metadata_operation_label_kubelet_docker2.png -------------------------------------------------------------------------------- /imgs/metadata_operation_label_legend_kubelet_docker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/metadata_operation_label_legend_kubelet_docker.png -------------------------------------------------------------------------------- /imgs/partitioned_ts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/partitioned_ts.png -------------------------------------------------------------------------------- /imgs/prophet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/prophet.png -------------------------------------------------------------------------------- /imgs/prophet3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/prophet3.png -------------------------------------------------------------------------------- /imgs/summary_graph3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/summary_graph3.png -------------------------------------------------------------------------------- /imgs/summary_graph4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/summary_graph4.png -------------------------------------------------------------------------------- /imgs/t-sne_embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/t-sne_embedding.png -------------------------------------------------------------------------------- /imgs/t_sne_embedding2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/imgs/t_sne_embedding2.png -------------------------------------------------------------------------------- /metadata_analysis/get_single_ts_all.py: -------------------------------------------------------------------------------- 1 | metadata = {'__name__': 'kubelet_docker_operations_latency_microseconds', 'beta_kubernetes_io_arch': 'amd64', 'beta_kubernetes_io_instance_type': 'm4.xlarge', 'beta_kubernetes_io_os': 'linux', 'clam_controller_enabled': 'True', 'clam_server_enabled': 'True', 'failure_domain_beta_kubernetes_io_region': 'us-east-2', 'failure_domain_beta_kubernetes_io_zone': 'us-east-2a', 'fluentd_test': 'true', 'hostname': 'free-stg-master-5c6a9', 'instance': 'ip-172-31-73-251.us-east-2.compute.internal', 'job': 'kubernetes-nodes', 'kubernetes_io_hostname': 'ip-172-31-73-251.us-east-2.compute.internal', 'node_role_kubernetes_io_compute': 'true', 'node_role_kubernetes_io_master': 'true', 'operation_type': 'version', 'quantile': '0.5', 'region': 'us-east-2', 'type': 'master'} 2 | 3 | import json 4 | from datetime import datetime 5 | import matplotlib.pyplot as plt 6 | import matplotlib.dates as dt 7 | import pandas as pd 8 | import re 9 | import string 10 | import random 11 | import numpy as np 12 | import fnmatch 13 | import os 14 | import sys 15 | import bz2 16 | import pickle 17 | if len(sys.argv) != 2: 18 | print("incorrect number of command line arguments") 19 | print("received: ", len(sys.argv)) 20 | print("expected: 2") 21 | exit(1) 22 | 23 | m_name = sys.argv[1] 24 | metric_type = "" 25 | data_folder = "../data/" 26 | 27 | # find bucket/quantile, sum, and count files in metric folder 28 | filename_bkt = [] 29 | 30 | try: 31 | for file in os.listdir(data_folder + m_name + "/bucket/"): 32 | if fnmatch.fnmatch(file, "*.json.bz2"): 33 | metric_type = "hist" 34 | f_name = data_folder + m_name + "/bucket/" + file 35 | filename_bkt.append(f_name) 36 | except: 37 | for file in os.listdir(data_folder + m_name + "/quantile/quant2/"): 38 | if fnmatch.fnmatch(file, "*.json"): 39 | metric_type = "summary" 40 | f_name = data_folder + m_name + "/quantile/quant2/" + file 41 | filename_bkt.append(f_name) 42 | 43 | print("Metric: ", m_name) 44 | if metric_type == "hist": 45 | label = "le" 46 | elif metric_type == "summary": 47 | label = "quantile" 48 | else: 49 | print("no metric type detected") 50 | exit(1) 51 | 52 | print("Metric: ", m_name) 53 | if metric_type == "hist": 54 | label = "le" 55 | elif metric_type == "summary": 56 | label = "quantile" 57 | else: 58 | print("no metric type detected") 59 | exit(1) 60 | 61 | results_folder = "../results/" 62 | 63 | 64 | # load appropriate data 65 | inc = 0 66 | num_jsons = 10 67 | jsons_bkt = [] 68 | print(len(filename_bkt)) 69 | dfs = [] 70 | for file in filename_bkt: 71 | f = open(file, 'rb') 72 | # f = bz2.BZ2File(file, 'rb') 73 | one_json = json.load(f) 74 | f.close() 75 | for pkt in one_json: 76 | df = pd.DataFrame.from_dict(pkt["values"]) 77 | df = df.rename( columns={0:"timestamps", 1:"values"}) 78 | df["timestamps"] = pd.to_datetime(df["timestamps"], unit='s') 79 | df = df.sort_values(by=["timestamps"]) 80 | meta_keys = np.array(list(pkt["metric"].keys())) 81 | meta_vals = np.array(list(pkt["metric"].values())) 82 | md = dict(zip(meta_keys, meta_vals)) 83 | if md == metadata: 84 | dfs.append(df) 85 | print(len(dfs)) 86 | # if inc == num_jsons: 87 | # 
break 88 | print(inc) 89 | inc += 1 90 | 91 | 92 | file = "df_one_ts" + ".pkl" 93 | pickle_file = open(file, "wb") 94 | pickle.dump(dfs, pickle_file) 95 | pickle_file.close() 96 | 97 | 98 | -------------------------------------------------------------------------------- /metadata_analysis/graph_metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | import matplotlib.pyplot as plt 4 | import matplotlib.dates as dt 5 | import re 6 | import string 7 | import random 8 | import numpy as np 9 | import fnmatch 10 | import os 11 | import sys 12 | import bz2 13 | from matplotlib.backends.backend_pdf import PdfPages 14 | from matplotlib.collections import EventCollection 15 | import time 16 | 17 | colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] 18 | if len(sys.argv) != 2: 19 | print("incorrect number of command line arguments") 20 | print("received: ", len(sys.argv)) 21 | print("expected: 2") 22 | exit(1) 23 | 24 | m_name = sys.argv[1] 25 | metric_type = "" 26 | data_folder = "../data/" 27 | 28 | # find bucket/quantile, sum, and count files in metric folder 29 | filename_bkt = [] 30 | 31 | try: 32 | for file in os.listdir(data_folder + m_name + "/bucket/"): 33 | if fnmatch.fnmatch(file, "*.json.bz2"): 34 | metric_type = "hist" 35 | f_name = data_folder + m_name + "/bucket/" + file 36 | filename_bkt.append(f_name) 37 | except: 38 | for file in os.listdir(data_folder + m_name + "/quantile/"): 39 | if fnmatch.fnmatch(file, "*.json.bz2"): 40 | metric_type = "summary" 41 | f_name = data_folder + m_name + "/quantile/" + file 42 | filename_bkt.append(f_name) 43 | 44 | print("Metric: ", m_name) 45 | if metric_type == "hist": 46 | label = "le" 47 | elif metric_type == "summary": 48 | label = "quantile" 49 | else: 50 | print("no metric type detected") 51 | exit(1) 52 | 53 | results_folder = "../results/" 54 | png_name = results_folder + m_name + '_graphs.png' 55 | png_name2 = results_folder + m_name + '_graphs_legend.png' 56 | 57 | def parse_jsons(jsons, select_label="__name__"): 58 | times = [] 59 | master_md = {} 60 | md = [] 61 | for one_json in jsons: 62 | for pkt in one_json: 63 | timestamps = [] 64 | timestamps_int = [] 65 | for i in pkt["values"]: 66 | timestamps.append(datetime.fromtimestamp(float(i[0]))) 67 | metadata = pkt["metric"] 68 | for lbl in metadata.keys(): 69 | lbl_val = metadata[lbl] 70 | if lbl in master_md.keys(): 71 | if lbl_val in master_md[lbl]: 72 | continue 73 | else: 74 | master_md[lbl].append(lbl_val) 75 | else: 76 | temp_list = [] 77 | temp_list.append(lbl_val) 78 | master_md[lbl] = temp_list 79 | times.append(timestamps) 80 | md.append(metadata) 81 | return master_md, times, md 82 | 83 | 84 | fig = plt.figure(figsize=(20,10)) 85 | lbls = set() 86 | label_axis = {} 87 | select_label = "" 88 | while(True): 89 | master_md = [] 90 | times = [] 91 | times_int = [] 92 | md = [] 93 | #filename_bkt = filename_bkt[1:2] 94 | if len(filename_bkt) == 0: 95 | break 96 | print(len(filename_bkt)) 97 | 98 | jsons_bkt = [] 99 | for i in range(0,len(filename_bkt)): 100 | file = filename_bkt[i] 101 | print(i) 102 | print(file) 103 | f = bz2.BZ2File(file, 'rb') 104 | jsons_bkt.append(json.load(f)) 105 | f.close() 106 | if i >= 50: 107 | break 108 | 109 | try: 110 | filename_bkt = filename_bkt[50:] 111 | except: 112 | filename_bkt = [] 113 | 114 | master_md, times, md = parse_jsons(jsons_bkt, label) 115 | 116 | for lbl in master_md: 117 | print("\n==", lbl, len(master_md[lbl])) 118 | for lbl_val in 
master_md[lbl]: 119 | print("\t", lbl_val) 120 | 121 | if select_label == "": 122 | select_label = input("\n\nSelect a label to graph:\n") 123 | try: 124 | label_vals = master_md[select_label] 125 | except: 126 | print("Not a valid label. Exiting..") 127 | exit(1) 128 | 129 | graph = {} 130 | for md_i in range(0, len(md)): 131 | metadata = md[md_i] 132 | try: 133 | label_val = metadata[select_label] 134 | except: 135 | continue 136 | 137 | try: 138 | graph[label_val].extend(times[md_i]) 139 | except: 140 | graph[label_val] = times[md_i] 141 | 142 | 143 | for j in graph.keys(): 144 | lbls.add(j) 145 | print("number of label values: ", len(graph.keys())) 146 | 147 | for i in lbls: 148 | print(i) 149 | try: 150 | x = dt.date2num(graph[i]) 151 | except: 152 | continue 153 | try: 154 | val = label_axis[i] 155 | y = [val]*len(x) 156 | except: 157 | val = len(label_axis) 158 | label_axis[i] = val 159 | y = [val]*len(x) 160 | 161 | plt.plot(x, y, ',', color=colors[(val+1)%len(colors)]) 162 | 163 | 164 | del metadata 165 | del md 166 | del master_md 167 | del times 168 | 169 | title = select_label 170 | plt.gcf().autofmt_xdate() 171 | plt.suptitle(m_name) 172 | plt.title(title) 173 | plt.xlabel("Timestamp") 174 | plt.xticks(rotation=25) 175 | plt.ylabel("Value") 176 | plt.yticks(np.arange(len(label_axis.keys()))) 177 | 178 | ax = plt.gca() 179 | xfmt = dt.DateFormatter('%Y-%m-%d %H:%M:%S') 180 | ax.xaxis.set_major_formatter(xfmt) 181 | #plt.show() 182 | plt.savefig(png_name) 183 | plt.close() 184 | 185 | # plot the legend table 186 | plt.figure(figsize=(20,10)) 187 | print(label_axis.keys()) 188 | n_lbls = np.array(list(label_axis.keys())) 189 | n_lbls.shape = (len(lbls), 1) 190 | vals = np.array(list(label_axis.values())) 191 | vals.shape = (len(vals), 1) 192 | table_vals = np.append(vals, n_lbls, 1) 193 | t = plt.table(cellText=table_vals, colLabels=["Number", label], cellLoc='center', loc='center') 194 | # t.set_fontsize(18) 195 | t.scale(1,3) 196 | plt.axis("off") 197 | plt.title("LEGEND") 198 | plt.savefig(png_name2) -------------------------------------------------------------------------------- /metadata_analysis/graph_specific_ts.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | import matplotlib.pyplot as plt 4 | import matplotlib.dates as dt 5 | import re 6 | import string 7 | import random 8 | import numpy as np 9 | import fnmatch 10 | import os 11 | import sys 12 | import ast 13 | from matplotlib.backends.backend_pdf import PdfPages 14 | 15 | if len(sys.argv) != 2: 16 | print("incorrect number of command line arguments") 17 | print("received: ", len(sys.argv)) 18 | print("expected: 2") 19 | exit(1) 20 | 21 | file = sys.argv[1] 22 | lines = [line.rstrip('\n') for line in open(file)] 23 | m_name = lines[0] 24 | target_metadata = lines[1] 25 | target_metadata = target_metadata.replace("'", "\"") 26 | target_metadata = json.loads(target_metadata) 27 | 28 | data_folder = "../data/" 29 | metric_type = "hist" 30 | #metadata = "{'beta_kubernetes_io_arch': 'amd64', 'beta_kubernetes_io_fluentd_ds_ready': 'true', 'beta_kubernetes_io_instance_type': 'm4.xlarge', 'beta_kubernetes_io_os': 'linux', 'clam_controller_enabled': 'True', 'failure_domain_beta_kubernetes_io_region': 'us-east-2', 'failure_domain_beta_kubernetes_io_zone': 'us-east-2a', 'fluentd_test': 'true', 'hostname': 'free-stg-master-03fb6', 'instance': 'ip-172-31-78-254.us-east-2.compute.internal', 'job': 'kubernetes-nodes', 'kubernetes_io_hostname': 
'ip-172-31-78-254.us-east-2.compute.internal', 'node_role_kubernetes_io_master': 'true', 'operation_type': 'list_images', 'region': 'us-east-2', 'type': 'master'}" 31 | 32 | # find bucket/quantile, sum, and count files in metric folder 33 | for file in os.listdir(data_folder + m_name + "/"): 34 | if fnmatch.fnmatch(file, "bucket_*.json"): 35 | metric_type = "hist" 36 | filename_bkt = data_folder + m_name + "/" + file 37 | elif fnmatch.fnmatch(file, "quantile_*.json"): 38 | metric_type = "summary" 39 | filename_bkt = data_folder + m_name + "/" + file 40 | if fnmatch.fnmatch(file, "count_*.json"): 41 | filename_cnt = data_folder + m_name + "/" + file 42 | if fnmatch.fnmatch(file, "sum_*.json"): 43 | filename_sum = data_folder + m_name + "/" + file 44 | if metric_type == "hist" or metric_type == "summary": 45 | print("Metric: ", m_name) 46 | else: 47 | print("no metric type detected") 48 | exit(1) 49 | 50 | results_folder = "../results/" 51 | pp_graph = PdfPages(results_folder + m_name + '_graphs.pdf') 52 | pp_hist = PdfPages(results_folder + m_name + '_hists.pdf') 53 | 54 | # load appropriate data 55 | f = open(filename_bkt) 56 | jsonFile_bkt = json.load(f) 57 | f.close() 58 | 59 | f2 = open(filename_cnt) 60 | jsonFile_cnt = json.load(f2) 61 | f2.close() 62 | 63 | f3 = open(filename_sum) 64 | jsonFile_sum = json.load(f3) 65 | f3.close() 66 | 67 | # each index corresponds to one graph 68 | # each graph is a list of lists 69 | # list of list of lists 70 | # graph = all_series[i] 71 | # one_series = graph[i] 72 | # one_data_point = one_series[i] 73 | b_val = [] 74 | b_time = [] 75 | b_md = [] 76 | b_le = [] 77 | for pkt in jsonFile_bkt: 78 | timestamps = [] 79 | vals = [] 80 | for i in pkt["values"]: 81 | if i[1] != 'NaN': 82 | vals.append(float(i[1])) 83 | timestamps.append(datetime.fromtimestamp(float(i[0]))) 84 | metadata = pkt["metric"] 85 | if metric_type == "hist": 86 | le = metadata["le"] 87 | del metadata["le"] 88 | elif metric_type == "summary": 89 | le = metadata["quantile"] 90 | del metadata["quantile"] 91 | metric_name = metadata["__name__"] 92 | del metadata["__name__"] 93 | if metadata == target_metadata: 94 | metadata = str(metadata) 95 | if len(vals) > 0: 96 | b_val.append(vals) 97 | b_time.append(timestamps) 98 | b_md.append(metadata) 99 | b_le.append(le) 100 | 101 | s_val = [] 102 | s_time = [] 103 | s_md = [] 104 | for pkt in jsonFile_sum: 105 | timestamps = [] 106 | vals = [] 107 | for i in pkt["values"]: 108 | if i[1] != 'NaN': 109 | vals.append(float(i[1])) 110 | timestamps.append(datetime.fromtimestamp(float(i[0]))) 111 | metadata = pkt["metric"] 112 | metric_name = metadata["__name__"] 113 | del metadata["__name__"] 114 | metadata = str(metadata) 115 | if len(vals) > 0: 116 | s_val.append(vals) 117 | s_time.append(timestamps) 118 | s_md.append(metadata) 119 | 120 | c_val = [] 121 | c_time = [] 122 | c_md = [] 123 | for pkt in jsonFile_cnt: 124 | timestamps = [] 125 | vals = [] 126 | for i in pkt["values"]: 127 | if i[1] != 'NaN': 128 | vals.append(float(i[1])) 129 | timestamps.append(datetime.fromtimestamp(float(i[0]))) 130 | metadata = pkt["metric"] 131 | metric_name = metadata["__name__"] 132 | del metadata["__name__"] 133 | metadata = str(metadata) 134 | if len(vals) > 0: 135 | c_val.append(vals) 136 | c_time.append(timestamps) 137 | c_md.append(metadata) 138 | 139 | 140 | 141 | graphs = {} 142 | graph_label = [] 143 | graph_xs = {} 144 | for md_i in range(0,len(b_md)): 145 | metadata = str(b_md[md_i]) 146 | label = b_le[md_i] 147 | try: 148 | 
graphs[metadata][label].extend(b_val[md_i]) 149 | graph_xs[metadata][label].extend(b_time[md_i]) 150 | except: 151 | try: 152 | graphs[metadata][label] = b_val[md_i] 153 | graph_xs[metadata][label] = b_time[md_i] 154 | except: 155 | label_dict = {} 156 | label_dict[label] = b_val[md_i] 157 | label_t_dict = {} 158 | label_t_dict[label] = b_time[md_i] 159 | graphs[metadata] = label_dict 160 | graph_xs[metadata] = label_t_dict 161 | 162 | 163 | 164 | inc = 0 165 | print("number of graphs: ", len(graphs.keys())) 166 | for i in graphs.keys(): 167 | if (inc+1) % 50 == 0: 168 | pp_graph.close() 169 | pp_graph = PdfPages(results_folder + str(inc+1) + "_" + m_name + '_graphs.pdf') 170 | pp_hist.close() 171 | pp_hist = PdfPages(results_folder + str(inc+1) + "_" + m_name + '_hists.pdf') 172 | print(inc) 173 | graph_title = i 174 | xs = graph_xs[i] 175 | ys = graphs[i] 176 | #if graph_title == "{'instance': '172.31.65.74:8444', 'job': 'kubernetes-controllers', 'request': 'detach_volume'}": 177 | 178 | title = re.sub("(.{200})", "\\1\n", graph_title, 0, re.DOTALL) 179 | if len(graph_title) > 50: 180 | graph_title= graph_title[1:50] 181 | plt.figure(figsize=(20,10)) 182 | for j in ys.keys(): 183 | plt.plot(xs[j], ys[j], '*') 184 | plt.gcf().autofmt_xdate() 185 | plt.suptitle(metric_name) 186 | plt.title(title) 187 | plt.legend(ys.keys()) 188 | plt.xlabel("Timestamp") 189 | plt.ylabel("Value") 190 | 191 | #savefile = "graphs/" + insts[i] + "_" + graph_title + ".png" 192 | plt.savefig(pp_graph, format='pdf') 193 | plt.close() 194 | 195 | main_title = re.sub("(.{200})", "\\1\n", graph_title, 0, re.DOTALL) 196 | if len(graph_title) > 50: 197 | graph_title= graph_title[1:50] 198 | plt.figure(figsize=(20,10)) 199 | for j in ys.keys(): 200 | time = xs[j][0] 201 | break 202 | for j in range(0, len(s_time[inc])): 203 | if s_time[inc][j] == time: 204 | sum_val = s_val[inc][j] 205 | break 206 | for j in range(0, len(c_time[inc])): 207 | if c_time[inc][j] == time: 208 | count_val = c_val[inc][j] 209 | break 210 | 211 | 212 | graph_label = list(xs.keys()) 213 | tmp = graph_label 214 | tmp.sort() 215 | if metric_type == "hist": 216 | inf = tmp[0] 217 | 218 | # take away the +Inf bucket 219 | tmp = tmp[1::] 220 | 221 | # sort the remaining integers/floats 222 | tmp.sort(key=float) 223 | 224 | # append +Inf to the end 225 | tmp.append(inf) 226 | 227 | sorted_y = [] 228 | for j in tmp: 229 | for k in graph_label: 230 | if j == k: 231 | sorted_y.append(ys[k][0]) 232 | break 233 | 234 | graph_label = tmp 235 | bar_vals = np.arange(len(graph_label)) 236 | plt.bar(bar_vals, height =sorted_y) 237 | plt.xticks(bar_vals, graph_label) 238 | plt.gcf().autofmt_xdate() 239 | plt.suptitle(main_title) 240 | title = "Count: " + str(count_val) + ", Sum: " + str(sum_val) 241 | plt.title(title, fontsize=20) 242 | plt.xlabel("Bucket") 243 | plt.ylabel("Value" ) 244 | 245 | # #savefile = "hists/" + insts[i] + ".png" 246 | plt.savefig(pp_hist, format='pdf') 247 | plt.close() 248 | inc += 1 249 | 250 | pp_graph.close() 251 | pp_hist.close() -------------------------------------------------------------------------------- /metadata_analysis/plot_metadata_labels.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | import matplotlib.pyplot as plt 4 | import matplotlib.dates as dt 5 | import re 6 | import string 7 | import random 8 | import numpy as np 9 | import bz2 10 | from matplotlib.backends.backend_pdf import PdfPages 11 | pp = 
PdfPages('label_hists2.pdf') 12 | import os 13 | 14 | label = "instance" 15 | folder = "kubelet_docker_operations_latency_microseconds/" 16 | files = os.listdir(folder) 17 | jsons = [] 18 | 19 | inc = 0 20 | print(len(files)) 21 | md = [] 22 | for file in files: 23 | inc += 1 24 | print(inc) 25 | filen = folder + file 26 | try: 27 | f = bz2.BZ2File(filen, 'rb') 28 | jsonFile = json.load(f) 29 | f.close() 30 | except IsADirectoryError: 31 | continue 32 | for pkt in jsonFile: 33 | metadata = pkt["metric"] 34 | del metadata["__name__"] 35 | md.append(metadata) 36 | 37 | lbls = {} 38 | for i in range(0, len(md)): 39 | for key in md[i].keys(): 40 | if key in lbls.keys(): 41 | lbls[key].append(md[i][key]) 42 | else: 43 | lbls[key] = [md[i][key]] 44 | 45 | for key in lbls.keys(): 46 | vals = lbls[key] 47 | plt.figure(figsize=(10,5)) 48 | plt.hist(vals) 49 | #plt.gcf().autofmt_xdate() 50 | #plt.legend(lbl) 51 | plt.title(key) 52 | plt.xlabel("Label Value") 53 | plt.ylabel("Count") 54 | plt.savefig(pp, format='pdf') 55 | plt.close() 56 | 57 | pp.close() -------------------------------------------------------------------------------- /metadata_analysis/t_sne_for_metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | import matplotlib.pyplot as plt 4 | import matplotlib.dates as dt 5 | import re 6 | import string 7 | import random 8 | import numpy as np 9 | import fnmatch 10 | import os 11 | import sys 12 | import bz2 13 | from matplotlib.backends.backend_pdf import PdfPages 14 | from matplotlib.collections import EventCollection 15 | from sklearn.manifold import TSNE 16 | import time 17 | import pickle 18 | 19 | if len(sys.argv) != 2: 20 | print("incorrect number of command line arguments") 21 | print("received: ", len(sys.argv)) 22 | print("expected: 2") 23 | exit(1) 24 | 25 | m_name = sys.argv[1] 26 | metric_type = "" 27 | data_folder = "../data/" 28 | 29 | # find bucket/quantile, sum, and count files in metric folder 30 | filename_bkt = [] 31 | 32 | try: 33 | for file in os.listdir(data_folder + m_name + "/bucket/"): 34 | if fnmatch.fnmatch(file, "*.json.bz2"): 35 | metric_type = "hist" 36 | f_name = data_folder + m_name + "/bucket/" + file 37 | filename_bkt.append(f_name) 38 | except: 39 | for file in os.listdir(data_folder + m_name + "/quantile/"): 40 | if fnmatch.fnmatch(file, "*.json.bz2"): 41 | metric_type = "summary" 42 | f_name = data_folder + m_name + "/quantile/" + file 43 | filename_bkt.append(f_name) 44 | 45 | print("Metric: ", m_name) 46 | if metric_type == "hist": 47 | label = "le" 48 | elif metric_type == "summary": 49 | label = "quantile" 50 | else: 51 | print("no metric type detected") 52 | exit(1) 53 | 54 | results_folder = "../results/" 55 | png_name = results_folder + m_name + '_graphs.png' 56 | png_name2 = results_folder + m_name + '_graphs_legend.png' 57 | 58 | def parse_jsons(jsons, select_label="__name__"): 59 | X = np.zeros(shape=[1, 200]) 60 | master_labels = [] 61 | label_ints = [] 62 | mds = [] 63 | for one_json in jsons: 64 | for row in range(0, len(one_json)): 65 | metadata = one_json[row]["metric"] 66 | labels = list(metadata.keys()) 67 | label_vals = list(metadata.values()) 68 | x_feature = np.zeros(shape=[1,200]) 69 | for i in range(0, len(labels)): 70 | flag = True 71 | for j in range(0,len(master_labels)): 72 | if master_labels[j] == labels[i]: 73 | if label_vals[i] in label_ints[j]: 74 | x_feature[0,j] = label_ints[j][label_vals[i]] 75 | else: 76 | 
label_ints[j][label_vals[i]] = len(label_ints[j])+1 77 | flag = False 78 | if flag: 79 | master_labels.append(labels[i]) 80 | label_ints_tmp = {} 81 | label_ints_tmp[label_vals[i]] = 1 82 | x_feature[0,len(label_ints)] = label_ints_tmp[label_vals[i]] 83 | label_ints.append(label_ints_tmp) 84 | 85 | mds.append(metadata) 86 | X = np.vstack((X, x_feature)) 87 | X = X[1:,:] 88 | return X, master_labels, label_ints, mds 89 | 90 | jsons_bkt = [] 91 | for i in range(0,len(filename_bkt)): 92 | file = filename_bkt[i] 93 | print(i) 94 | print(file) 95 | f = bz2.BZ2File(file, 'rb') 96 | jsons_bkt.append(json.load(f)) 97 | f.close() 98 | if i >= 15: 99 | break 100 | 101 | X, master_labels, label_ints, mds = parse_jsons(jsons_bkt, label) 102 | 103 | X_embedded = TSNE(n_components=2).fit_transform(X) 104 | file = open("x_vals", "wb") 105 | pickle.dump(X, file) 106 | pickle.dump(X_embedded, file) 107 | pickle.dump(master_labels, file) 108 | pickle.dump(label_ints, file) 109 | pickle.dump(mds, file) 110 | file.close() 111 | 112 | 113 | print(X_embedded.shape) 114 | plt.figure(figsize=(20,10)) 115 | plt.scatter(X_embedded[:,0], X_embedded[:,1],cmap=plt.cm.Spectral) 116 | plt.show() 117 | # fig = plt.figure(figsize=(20,10)) 118 | # lbls = set() 119 | # label_axis = {} 120 | # select_label = "" 121 | # while(True): 122 | # master_md = [] 123 | # times = [] 124 | # times_int = [] 125 | # md = [] 126 | # #filename_bkt = filename_bkt[1:2] 127 | # if len(filename_bkt) == 0: 128 | # break 129 | # print(len(filename_bkt)) 130 | 131 | # jsons_bkt = [] 132 | # for i in range(0,len(filename_bkt)): 133 | # file = filename_bkt[i] 134 | # print(i) 135 | # print(file) 136 | # f = bz2.BZ2File(file, 'rb') 137 | # jsons_bkt.append(json.load(f)) 138 | # f.close() 139 | # if i >= 50: 140 | # break 141 | 142 | # try: 143 | # filename_bkt = filename_bkt[50:] 144 | # except: 145 | # filename_bkt = [] 146 | 147 | # master_md, times, md = parse_jsons(jsons_bkt, label) 148 | 149 | # for lbl in master_md: 150 | # print("\n==", lbl, len(master_md[lbl])) 151 | # for lbl_val in master_md[lbl]: 152 | # print("\t", lbl_val) 153 | 154 | # if select_label == "": 155 | # select_label = input("\n\nSelect a label to graph:\n") 156 | # try: 157 | # label_vals = master_md[select_label] 158 | # except: 159 | # print("Not a valid label. 
Exiting..") 160 | # exit(1) 161 | 162 | # graph = {} 163 | # for md_i in range(0, len(md)): 164 | # metadata = md[md_i] 165 | # try: 166 | # label_val = metadata[select_label] 167 | # except: 168 | # continue 169 | 170 | # try: 171 | # graph[label_val].extend(times[md_i]) 172 | # except: 173 | # graph[label_val] = times[md_i] 174 | 175 | 176 | # for j in graph.keys(): 177 | # lbls.add(j) 178 | # print("number of label values: ", len(graph.keys())) 179 | 180 | # for i in lbls: 181 | # print(i) 182 | # try: 183 | # x = dt.date2num(graph[i]) 184 | # except: 185 | # continue 186 | # try: 187 | # val = label_axis[i] 188 | # y = [val]*len(x) 189 | # except: 190 | # val = len(label_axis) 191 | # label_axis[i] = val 192 | # y = [val]*len(x) 193 | 194 | # plt.plot(x, y, ',', color=colors[(val+1)%len(colors)]) 195 | 196 | 197 | # del metadata 198 | # del md 199 | # del master_md 200 | # del times 201 | 202 | # title = select_label 203 | # plt.gcf().autofmt_xdate() 204 | # plt.suptitle(m_name) 205 | # plt.title(title) 206 | # plt.xlabel("Timestamp") 207 | # plt.xticks(rotation=25) 208 | # plt.ylabel("Value") 209 | # plt.yticks(np.arange(len(label_axis.keys()))) 210 | 211 | # ax = plt.gca() 212 | # xfmt = dt.DateFormatter('%Y-%m-%d %H:%M:%S') 213 | # ax.xaxis.set_major_formatter(xfmt) 214 | # #plt.show() 215 | # plt.savefig(png_name) 216 | # plt.close() 217 | 218 | # # plot the legend table 219 | # plt.figure(figsize=(20,10)) 220 | # print(label_axis.keys()) 221 | # n_lbls = np.array(list(label_axis.keys())) 222 | # n_lbls.shape = (len(lbls), 1) 223 | # vals = np.array(list(label_axis.values())) 224 | # vals.shape = (len(vals), 1) 225 | # table_vals = np.append(vals, n_lbls, 1) 226 | # t = plt.table(cellText=table_vals, colLabels=["Number", label], cellLoc='center', loc='center') 227 | # # t.set_fontsize(18) 228 | # t.scale(1,3) 229 | # plt.axis("off") 230 | # plt.title("LEGEND") 231 | # plt.savefig(png_name2) -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/Parse Json to Pandas Dataframes-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Parse Json to Pandas Dataframes\n", 8 | "This script is used to convert json packets into a dictionary where the key is a unique metadata configuration and the value is a Pandas dataframe. The Pandas dataframe has a ds column and a y column corresponding to the timestamp and corresponding value in the time series. The dictionary is then stored in a Pickle file." 
9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import json\n", 18 | "import pandas as pd\n", 19 | "import fnmatch\n", 20 | "import os\n", 21 | "import bz2\n", 22 | "import pickle\n", 23 | "import gc" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# read files in list and convert to pandas dataframes\n", 33 | "def load_files(files, file_format):\n", 34 | " dfs = {}\n", 35 | " for file in files:\n", 36 | " # check file format and read appropriately\n", 37 | " if file_format == \".json\":\n", 38 | " f = open(file, 'rb')\n", 39 | " else:\n", 40 | " f = bz2.BZ2File(file, 'rb')\n", 41 | " jsons = json.load(f)\n", 42 | " f.close()\n", 43 | "\n", 44 | " # iterate through packets in file\n", 45 | " for pkt in jsons:\n", 46 | " # create a new dataframe with packet timestamp and values\n", 47 | " df = pd.DataFrame.from_dict(pkt[\"values\"])\n", 48 | " df = df.rename( columns={0:\"ds\", 1:\"y\"})\n", 49 | " df[\"ds\"] = pd.to_datetime(df[\"ds\"], unit='s')\n", 50 | " df = df.sort_values(by=[\"ds\"])\n", 51 | " df.y = pd.to_numeric(df['y'], errors='coerce')\n", 52 | " df = df.dropna()\n", 53 | " md = str(pkt[\"metric\"])\n", 54 | " # append generated dataframe and metadata to collection\n", 55 | " try:\n", 56 | " dfs[md] = dfs[md].append(df, ignore_index=True)\n", 57 | " except:\n", 58 | " dfs[md] = df\n", 59 | " return dfs" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# take a list of dataframes and their metadata and collapse to a\n", 69 | "# collection of unique time series (based on unique metadata)\n", 70 | "def collapse_to_unique(dfs_master, dfs_new):\n", 71 | " # iterate through metadata\n", 72 | " dfs_remaining = {}\n", 73 | " for md in dfs_new.keys():\n", 74 | " try:\n", 75 | " # find metadata in our master list\n", 76 | " # if this throws an error, simply add it to the list\n", 77 | " dfs_master[md] = dfs_master[md].append(dfs_new[md], ignore_index=True)\n", 78 | " except:\n", 79 | " dfs_remaining[md] = dfs_new[md]\n", 80 | " return dfs_master, dfs_remaining" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# create pickle file containing data\n", 90 | "def save_checkpoint(pds, file):\n", 91 | " if file[-4:] != \".pkl\":\n", 92 | " file = file + \".pkl\"\n", 93 | " f = open(file, \"wb\")\n", 94 | " pickle.dump(pds, f)\n", 95 | " f.close()\n", 96 | " return file\n", 97 | "\n", 98 | "# load pickle file containing data\n", 99 | "def load_checkpoint(file):\n", 100 | " f = open(file, \"rb\")\n", 101 | " pds = pickle.load(f)\n", 102 | " f.close()\n", 103 | " return pds\n", 104 | "# remove all temp pickle files generated during this program\n", 105 | "def combine_checkpoints(master_file):\n", 106 | " df = {}\n", 107 | " files = os.listdir()\n", 108 | " for file in files:\n", 109 | " if fnmatch.fnmatch(file, \"collapsed_*.pkl\"):\n", 110 | " try:\n", 111 | " f = open(file, \"rb\")\n", 112 | " dfs = pickle.load(f)\n", 113 | " f.close()\n", 114 | " df.update(dfs)\n", 115 | " except:\n", 116 | " continue\n", 117 | " os.system(\"rm \" + file)\n", 118 | " elif fnmatch.fnmatch(file, \"raw_*.pkl\"):\n", 119 | " os.system(\"rm \" + file)\n", 120 | " f = open(master_file + \".pkl\", \"wb\")\n", 121 | " pickle.dump(df, f)\n", 122 | " f.close()" 123 | 
] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 5, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# load all files and convert to a list of pandas dataframes\n", 132 | "def convert_to_pandas(files, file_format, batch_size):\n", 133 | " checkpoints = []\n", 134 | " # # separate files into batches\n", 135 | " batches = [files[batch_size*i:batch_size*(i+1)] for i in range(int(len(files)/batch_size) + 1)]\n", 136 | " print(\"num_batches\", len(batches))\n", 137 | " i = 0\n", 138 | " for batch in batches:\n", 139 | " print(i)\n", 140 | " i += 1\n", 141 | " # get new portion of dataframes and add to master set\n", 142 | " pds_new = load_files(batch, file_format)\n", 143 | " cp = save_checkpoint(pds_new, \"raw_\" + str(i))\n", 144 | " checkpoints.append(cp)\n", 145 | " gc.collect()\n", 146 | "\n", 147 | " pds = []\n", 148 | " # iterate checkpoint by checkpoint and add data to unique collection\n", 149 | " # of time series\n", 150 | " collapsed_fs = []\n", 151 | " i = 0\n", 152 | " for cp in checkpoints:\n", 153 | " i += 1\n", 154 | " print(i)\n", 155 | " pds_new = load_checkpoint(cp)\n", 156 | " print(i)\n", 157 | " # load data in batches and combine dataframes\n", 158 | " for f in collapsed_fs:\n", 159 | " pds = load_checkpoint(f)\n", 160 | " pds, pds_new = collapse_to_unique(pds, pds_new)\n", 161 | " save_checkpoint(pds, f)\n", 162 | " gc.collect()\n", 163 | " if len(pds_new) > 0:\n", 164 | " f_new = save_checkpoint(pds_new, \"collapsed_\" + str(i)) \n", 165 | " print(\"Generated \", f_new)\n", 166 | " collapsed_fs.append(f_new) \n", 167 | " print(i)\n", 168 | " gc.collect()\n", 169 | " return pds" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 6, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "# get all appropriately formatted files in a folder\n", 179 | "def retrieve_filenames(path, file_format):\n", 180 | " filenames = []\n", 181 | " for file in os.listdir(path):\n", 182 | " # check if this file has correct ending (regex)\n", 183 | " if fnmatch.fnmatch(file, \"*\" + file_format):\n", 184 | " f_name = path + file\n", 185 | " filenames.append(f_name)\n", 186 | " return filenames\n", 187 | "\n", 188 | "# get main input arguments and return formatted data\n", 189 | "def read_input(data_folder, metric, file_format, batch_size):\n", 190 | " # metric-specific data folder\n", 191 | " folder = data_folder + metric + \"/\"\n", 192 | " # get all files in folder\n", 193 | " files = os.listdir(folder)\n", 194 | "\n", 195 | " # automatically detect metric type\n", 196 | " if \"quantile\" in files:\n", 197 | " metric_type = \"summary\"\n", 198 | " label = \"quantile\"\n", 199 | " filenames = retrieve_filenames(folder + \"quantile/\", file_format)\n", 200 | "# filenames_count = retrieve_filenames(folder + \"count/\", file_format)\n", 201 | "# filenames_sum = retrieve_filenames(folder + \"sum/\", file_format)\n", 202 | " elif \"bucket\" in files:\n", 203 | " metric_type = \"histogram\"\n", 204 | " label = \"le\"\n", 205 | " filenames = retrieve_filenames(folder + \"bucket/\", file_format)\n", 206 | "# filenames_count = retrieve_filenames(folder + \"count/\", file_format)\n", 207 | "# filenames_sum = retrieve_filenames(folder + \"sum/\", file_format)\n", 208 | " else:\n", 209 | " metric_type = \"counter/gauge\"\n", 210 | " label = \"\"\n", 211 | " filenames = retrieve_filenames(folder, file_format)\n", 212 | " \n", 213 | " pd_frames = convert_to_pandas(filenames, file_format, batch_size)\n", 214 | "\n", 215 | " 
return pd_frames" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 7, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "def main():\n", 225 | " print(\"Formatting Data\")\n", 226 | " pd_frames = read_input(input_dir, metric, fformat, batch_size)\n", 227 | " print(\"Conversion successful\")\n", 228 | "\n", 229 | " master_file = output_dir + metric\n", 230 | "\n", 231 | " combine_checkpoints(master_file)\n", 232 | "\n", 233 | " print(\"Saved data:\", master_file)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "Formatting Data\n", 246 | "num_batches 11\n", 247 | "0\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "if __name__ == '__main__':\n", 253 | "\n", 254 | " # input parameters\n", 255 | " metric = \"http_request_duration_microseconds\"\n", 256 | " fformat='.json.bz2'\n", 257 | " input_dir = \"data/\"\n", 258 | " output_dir = \"\"\n", 259 | " batch_size= 20\n", 260 | "\n", 261 | " main()" 262 | ] 263 | } 264 | ], 265 | "metadata": { 266 | "kernelspec": { 267 | "display_name": "Python 3", 268 | "language": "python", 269 | "name": "python3" 270 | }, 271 | "language_info": { 272 | "codemirror_mode": { 273 | "name": "ipython", 274 | "version": 3 275 | }, 276 | "file_extension": ".py", 277 | "mimetype": "text/x-python", 278 | "name": "python", 279 | "nbconvert_exporter": "python", 280 | "pygments_lexer": "ipython3", 281 | "version": "3.6.5" 282 | } 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 2 286 | } 287 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/Prophet Model Forecasting-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Requirement already satisfied: fbprophet in /home/nfrumkin/anaconda3/lib/python3.6/site-packages (0.3.post2)\n", 13 | "Requirement already satisfied: Cython>=0.22 in /home/nfrumkin/anaconda3/lib/python3.6/site-packages (from fbprophet) (0.28.2)\n", 14 | "Requirement already satisfied: pystan>=2.14 in /home/nfrumkin/anaconda3/lib/python3.6/site-packages (from fbprophet) (2.17.1.0)\n", 15 | "Requirement already satisfied: numpy>=1.10.0 in /home/nfrumkin/anaconda3/lib/python3.6/site-packages (from fbprophet) (1.14.3)\n", 16 | "Requirement already satisfied: pandas>=0.20.1 in /home/nfrumkin/anaconda3/lib/python3.6/site-packages (from fbprophet) (0.23.0)\n", 17 | "Requirement already satisfied: matplotlib>=2.0.0 in /home/nfrumkin/anaconda3/lib/python3.6/site-packages (from fbprophet) (2.2.2)\n", 18 | "Requirement already satisfied: python-dateutil>=2.5.0 in /home/nfrumkin/anaconda3/lib/python3.6/site-packages (from pandas>=0.20.1->fbprophet) (2.7.3)\n", 19 | "Requirement already satisfied: pytz>=2011k in /home/nfrumkin/anaconda3/lib/python3.6/site-packages (from pandas>=0.20.1->fbprophet) (2018.4)\n", 20 | "Requirement already satisfied: cycler>=0.10 in /home/nfrumkin/anaconda3/lib/python3.6/site-packages (from matplotlib>=2.0.0->fbprophet) (0.10.0)\n", 21 | "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /home/nfrumkin/anaconda3/lib/python3.6/site-packages (from matplotlib>=2.0.0->fbprophet) (2.2.0)\n", 22 | "Requirement 
already satisfied: six>=1.10 in /home/nfrumkin/anaconda3/lib/python3.6/site-packages (from matplotlib>=2.0.0->fbprophet) (1.11.0)\n", 23 | "Requirement already satisfied: kiwisolver>=1.0.1 in /home/nfrumkin/anaconda3/lib/python3.6/site-packages (from matplotlib>=2.0.0->fbprophet) (1.0.1)\n", 24 | "Requirement already satisfied: setuptools in /home/nfrumkin/anaconda3/lib/python3.6/site-packages (from kiwisolver>=1.0.1->matplotlib>=2.0.0->fbprophet) (39.1.0)\n", 25 | "\u001b[33mYou are using pip version 10.0.1, however version 18.0 is available.\n", 26 | "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "!pip install fbprophet\n", 32 | "from fbprophet import Prophet\n", 33 | "import pandas as pd\n", 34 | "import numpy as np\n", 35 | "import matplotlib.pylab as plt\n", 36 | "import datetime as dt\n", 37 | "import pickle" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "class ProphetForecast:\n", 47 | " def __init__(self, train, test):\n", 48 | " self.train = train\n", 49 | " self.test = test\n", 50 | "\n", 51 | " def fit_model(self, n_predict):\n", 52 | " m = Prophet(daily_seasonality=False, weekly_seasonality=False, yearly_seasonality=False)\n", 53 | " m.fit(self.train)\n", 54 | " future = m.make_future_dataframe(periods= len(self.test),freq= '1MIN')\n", 55 | " self.forecast = m.predict(future)\n", 56 | "\n", 57 | " return self.forecast\n", 58 | "\n", 59 | " def graph(self):\n", 60 | " fig = plt.figure(figsize=(40,10))\n", 61 | " plt.plot(np.array(self.train[\"ds\"]), np.array(self.train[\"y\"]),'b', label=\"train\", linewidth=3)\n", 62 | " plt.plot(np.array(self.test[\"ds\"]), np.array(self.test[\"y\"]), 'g', label=\"test\", linewidth=3)\n", 63 | "\n", 64 | " forecast_ds = np.array(self.forecast[\"ds\"])\n", 65 | " plt.plot(forecast_ds, np.array(self.forecast[\"yhat\"]), 'o', label=\"yhat\", linewidth=3)\n", 66 | " plt.plot(forecast_ds, np.array(self.forecast[\"yhat_upper\"]), 'y', label=\"yhat_upper\", linewidth=3)\n", 67 | " plt.plot(forecast_ds, np.array(self.forecast[\"yhat_lower\"]), 'y', label=\"yhat_lower\", linewidth=3)\n", 68 | " plt.xlabel(\"Timestamp\")\n", 69 | " plt.ylabel(\"Value\")\n", 70 | " plt.legend(loc=1)\n", 71 | " plt.title(\"Prophet Model Forecast\")\n", 72 | " plt.show()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "def calc_delta(vals):\n", 82 | " diff = vals - np.roll(vals, 1)\n", 83 | " diff[0] = 0\n", 84 | " return diff\n", 85 | "\n", 86 | "def monotonically_inc(vals):\n", 87 | " # check corner case\n", 88 | " if len(vals) == 1:\n", 89 | " return True\n", 90 | " diff = calc_delta(vals)\n", 91 | " diff[np.where(vals == 0)] = 0\n", 92 | "\n", 93 | " if ((diff < 0).sum() == 0):\n", 94 | " return True\n", 95 | " else:\n", 96 | " return False" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "metric_name = \"http_request_duration_microseconds_quantile\"\n", 106 | "pkl_file = open(\"data/\" + metric_name + \"_dataframes.pkl\", \"rb\")\n", 107 | "dfs = pickle.load(pkl_file)\n", 108 | "pkl_file.close()\n", 109 | "key_vals = list(dfs.keys())" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": 
"stream", 120 | "text": [ 121 | "{'__name__': 'http_request_duration_microseconds', 'beta_kubernetes_io_arch': 'amd64', 'beta_kubernetes_io_instance_type': 'm4.xlarge', 'beta_kubernetes_io_os': 'linux', 'failure_domain_beta_kubernetes_io_region': 'us-east-2', 'failure_domain_beta_kubernetes_io_zone': 'us-east-2a', 'handler': 'prometheus', 'hostname': 'free-stg-node-compute-e0756', 'instance': 'ip-172-31-76-144.us-east-2.compute.internal', 'job': 'kubernetes-nodes-exporter', 'kubernetes_io_hostname': 'ip-172-31-76-144.us-east-2.compute.internal', 'logging_infra_fluentd': 'true', 'node_role_kubernetes_io_compute': 'true', 'quantile': '0.99', 'region': 'us-east-2', 'type': 'compute'}\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "selected = [728,738]\n", 127 | "for ind in selected:\n", 128 | " key = key_vals[ind]\n", 129 | " df = dfs[key]\n", 130 | " df = df.sort_values(by=['timestamps'])\n", 131 | " print(key)\n", 132 | " df[\"values\"] = df[\"values\"].apply(pd.to_numeric)\n", 133 | " vals = np.array(df[\"values\"].tolist())\n", 134 | "\n", 135 | " df[\"ds\"] = df[\"timestamps\"]\n", 136 | " df[\"y\"] = df[\"values\"]\n", 137 | " # check if metric is a counter, if so, run AD on difference\n", 138 | " if monotonically_inc(vals):\n", 139 | " print(\"monotonically_inc\")\n", 140 | " vals = calc_delta(vals)\n", 141 | " df[\"values\"] = vals.tolist()\n", 142 | "\n", 143 | " train = df[0:int(0.7*len(vals))]\n", 144 | " test = df[int(0.7*len(vals)):]\n", 145 | "\n", 146 | " pf = ProphetForecast(train, test)\n", 147 | " forecast = pf.fit_model(len(test))\n", 148 | "\n", 149 | " pf.graph()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [] 158 | } 159 | ], 160 | "metadata": { 161 | "kernelspec": { 162 | "display_name": "Python 3", 163 | "language": "python", 164 | "name": "python3" 165 | }, 166 | "language_info": { 167 | "codemirror_mode": { 168 | "name": "ipython", 169 | "version": 3 170 | }, 171 | "file_extension": ".py", 172 | "mimetype": "text/x-python", 173 | "name": "python", 174 | "nbconvert_exporter": "python", 175 | "pygments_lexer": "ipython3", 176 | "version": "3.6.5" 177 | } 178 | }, 179 | "nbformat": 4, 180 | "nbformat_minor": 2 181 | } 182 | -------------------------------------------------------------------------------- /notebooks/Parse Json to Pandas Dataframes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Parse Json to Pandas Dataframes\n", 8 | "This script is used to convert json packets into a dictionary where the key is a unique metadata configuration and the value is a Pandas dataframe. The Pandas dataframe has a ds column and a y column corresponding to the timestamp and corresponding value in the time series. The dictionary is then stored in a Pickle file." 
9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import json\n", 18 | "import pandas as pd\n", 19 | "import fnmatch\n", 20 | "import os\n", 21 | "import bz2\n", 22 | "import pickle\n", 23 | "import gc" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# read files in list and convert to pandas dataframes\n", 33 | "def load_files(files, file_format):\n", 34 | " dfs = {}\n", 35 | " for file in files:\n", 36 | " # check file format and read appropriately\n", 37 | " if file_format == \".json\":\n", 38 | " f = open(file, 'rb')\n", 39 | " else:\n", 40 | " f = bz2.BZ2File(file, 'rb')\n", 41 | " jsons = json.load(f)\n", 42 | " f.close()\n", 43 | "\n", 44 | " # iterate through packets in file\n", 45 | " for pkt in jsons:\n", 46 | " # create a new dataframe with packet timestamp and values\n", 47 | " df = pd.DataFrame.from_dict(pkt[\"values\"])\n", 48 | " df = df.rename( columns={0:\"ds\", 1:\"y\"})\n", 49 | " df[\"ds\"] = pd.to_datetime(df[\"ds\"], unit='s')\n", 50 | " df = df.sort_values(by=[\"ds\"])\n", 51 | " df.y = pd.to_numeric(df['y'], errors='coerce')\n", 52 | " df = df.dropna()\n", 53 | " md = str(pkt[\"metric\"])\n", 54 | " # append generated dataframe and metadata to collection\n", 55 | " try:\n", 56 | " dfs[md] = dfs[md].append(df, ignore_index=True)\n", 57 | " except:\n", 58 | " dfs[md] = df\n", 59 | " return dfs" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# take a list of dataframes and their metadata and collapse to a\n", 69 | "# collection of unique time series (based on unique metadata)\n", 70 | "def collapse_to_unique(dfs_master, dfs_new):\n", 71 | " # iterate through metadata\n", 72 | " dfs_remaining = {}\n", 73 | " for md in dfs_new.keys():\n", 74 | " try:\n", 75 | " # find metadata in our master list\n", 76 | " # if this throws an error, simply add it to the list\n", 77 | " dfs_master[md] = dfs_master[md].append(dfs_new[md], ignore_index=True)\n", 78 | " except:\n", 79 | " dfs_remaining[md] = dfs_new[md]\n", 80 | " return dfs_master, dfs_remaining" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# create pickle file containing data\n", 90 | "def save_checkpoint(pds, file):\n", 91 | " if file[-4:] != \".pkl\":\n", 92 | " file = file + \".pkl\"\n", 93 | " f = open(file, \"wb\")\n", 94 | " pickle.dump(pds, f)\n", 95 | " f.close()\n", 96 | " return file\n", 97 | "\n", 98 | "# load pickle file containing data\n", 99 | "def load_checkpoint(file):\n", 100 | " f = open(file, \"rb\")\n", 101 | " pds = pickle.load(f)\n", 102 | " f.close()\n", 103 | " return pds\n", 104 | "# remove all temp pickle files generated during this program\n", 105 | "def combine_checkpoints(master_file):\n", 106 | " df = {}\n", 107 | " files = os.listdir()\n", 108 | " for file in files:\n", 109 | " if fnmatch.fnmatch(file, \"collapsed_*.pkl\"):\n", 110 | " try:\n", 111 | " f = open(file, \"rb\")\n", 112 | " dfs = pickle.load(f)\n", 113 | " f.close()\n", 114 | " df.update(dfs)\n", 115 | " except:\n", 116 | " continue\n", 117 | " os.system(\"rm \" + file)\n", 118 | " elif fnmatch.fnmatch(file, \"raw_*.pkl\"):\n", 119 | " os.system(\"rm \" + file)\n", 120 | " f = open(master_file + \".pkl\", \"wb\")\n", 121 | " pickle.dump(df, f)\n", 122 | " f.close()" 123 | 
] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 5, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# load all files and convert to a list of pandas dataframes\n", 132 | "def convert_to_pandas(files, file_format, batch_size):\n", 133 | " checkpoints = []\n", 134 | " # # separate files into batches\n", 135 | " batches = [files[batch_size*i:batch_size*(i+1)] for i in range(int(len(files)/batch_size) + 1)]\n", 136 | " print(\"num_batches\", len(batches))\n", 137 | " i = 0\n", 138 | " for batch in batches:\n", 139 | " print(i)\n", 140 | " i += 1\n", 141 | " # get new portion of dataframes and add to master set\n", 142 | " pds_new = load_files(batch, file_format)\n", 143 | " cp = save_checkpoint(pds_new, \"raw_\" + str(i))\n", 144 | " checkpoints.append(cp)\n", 145 | " gc.collect()\n", 146 | "\n", 147 | " pds = []\n", 148 | " # iterate checkpoint by checkpoint and add data to unique collection\n", 149 | " # of time series\n", 150 | " collapsed_fs = []\n", 151 | " i = 0\n", 152 | " for cp in checkpoints:\n", 153 | " i += 1\n", 154 | " print(i)\n", 155 | " pds_new = load_checkpoint(cp)\n", 156 | " print(i)\n", 157 | " # load data in batches and combine dataframes\n", 158 | " for f in collapsed_fs:\n", 159 | " pds = load_checkpoint(f)\n", 160 | " pds, pds_new = collapse_to_unique(pds, pds_new)\n", 161 | " save_checkpoint(pds, f)\n", 162 | " gc.collect()\n", 163 | " if len(pds_new) > 0:\n", 164 | " f_new = save_checkpoint(pds_new, \"collapsed_\" + str(i)) \n", 165 | " print(\"Generated \", f_new)\n", 166 | " collapsed_fs.append(f_new) \n", 167 | " print(i)\n", 168 | " gc.collect()\n", 169 | " return pds" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 6, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "# get all appropriately formatted files in a folder\n", 179 | "def retrieve_filenames(path, file_format):\n", 180 | " filenames = []\n", 181 | " for file in os.listdir(path):\n", 182 | " # check if this file has correct ending (regex)\n", 183 | " if fnmatch.fnmatch(file, \"*\" + file_format):\n", 184 | " f_name = path + file\n", 185 | " filenames.append(f_name)\n", 186 | " return filenames\n", 187 | "\n", 188 | "# get main input arguments and return formatted data\n", 189 | "def read_input(data_folder, metric, file_format, batch_size):\n", 190 | " # metric-specific data folder\n", 191 | " folder = data_folder + metric + \"/\"\n", 192 | " # get all files in folder\n", 193 | " files = os.listdir(folder)\n", 194 | "\n", 195 | " # automatically detect metric type\n", 196 | " if \"quantile\" in files:\n", 197 | " metric_type = \"summary\"\n", 198 | " label = \"quantile\"\n", 199 | " filenames = retrieve_filenames(folder + \"quantile/\", file_format)\n", 200 | "# filenames_count = retrieve_filenames(folder + \"count/\", file_format)\n", 201 | "# filenames_sum = retrieve_filenames(folder + \"sum/\", file_format)\n", 202 | " elif \"bucket\" in files:\n", 203 | " metric_type = \"histogram\"\n", 204 | " label = \"le\"\n", 205 | " filenames = retrieve_filenames(folder + \"bucket/\", file_format)\n", 206 | "# filenames_count = retrieve_filenames(folder + \"count/\", file_format)\n", 207 | "# filenames_sum = retrieve_filenames(folder + \"sum/\", file_format)\n", 208 | " else:\n", 209 | " metric_type = \"counter/gauge\"\n", 210 | " label = \"\"\n", 211 | " filenames = retrieve_filenames(folder, file_format)\n", 212 | " \n", 213 | " pd_frames = convert_to_pandas(filenames, file_format, batch_size)\n", 214 | "\n", 215 | " 
return pd_frames" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 7, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "def main():\n", 225 | " print(\"Formatting Data\")\n", 226 | " pd_frames = read_input(input_dir, metric, fformat, batch_size)\n", 227 | " print(\"Conversion successful\")\n", 228 | "\n", 229 | " master_file = output_dir + metric\n", 230 | "\n", 231 | " combine_checkpoints(master_file)\n", 232 | "\n", 233 | " print(\"Saved data:\", master_file)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 8, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "Formatting Data\n", 246 | "num_batches 11\n", 247 | "0\n", 248 | "1\n", 249 | "2\n", 250 | "3\n", 251 | "4\n", 252 | "5\n", 253 | "6\n", 254 | "7\n", 255 | "8\n", 256 | "9\n", 257 | "10\n", 258 | "1\n", 259 | "1\n", 260 | "Generated collapsed_1.pkl\n", 261 | "1\n", 262 | "2\n", 263 | "2\n", 264 | "Generated collapsed_2.pkl\n", 265 | "2\n", 266 | "3\n", 267 | "3\n", 268 | "Generated collapsed_3.pkl\n", 269 | "3\n", 270 | "4\n", 271 | "4\n", 272 | "Generated collapsed_4.pkl\n", 273 | "4\n", 274 | "5\n", 275 | "5\n", 276 | "Generated collapsed_5.pkl\n", 277 | "5\n", 278 | "6\n", 279 | "6\n", 280 | "Generated collapsed_6.pkl\n", 281 | "6\n", 282 | "7\n", 283 | "7\n", 284 | "Generated collapsed_7.pkl\n", 285 | "7\n", 286 | "8\n", 287 | "8\n", 288 | "Generated collapsed_8.pkl\n", 289 | "8\n", 290 | "9\n", 291 | "9\n", 292 | "Generated collapsed_9.pkl\n", 293 | "9\n", 294 | "10\n", 295 | "10\n", 296 | "Generated collapsed_10.pkl\n", 297 | "10\n", 298 | "11\n", 299 | "11\n", 300 | "11\n", 301 | "Conversion successful\n", 302 | "Saved data: http_request_duration_microseconds\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "if __name__ == '__main__':\n", 308 | "\n", 309 | " # input parameters\n", 310 | " metric = \"http_request_duration_microseconds\"\n", 311 | " fformat='.json.bz2'\n", 312 | " input_dir = \"data/\"\n", 313 | " output_dir = \"\"\n", 314 | " batch_size= 20\n", 315 | "\n", 316 | " main()" 317 | ] 318 | } 319 | ], 320 | "metadata": { 321 | "kernelspec": { 322 | "display_name": "Python 3", 323 | "language": "python", 324 | "name": "python3" 325 | }, 326 | "language_info": { 327 | "codemirror_mode": { 328 | "name": "ipython", 329 | "version": 3 330 | }, 331 | "file_extension": ".py", 332 | "mimetype": "text/x-python", 333 | "name": "python", 334 | "nbconvert_exporter": "python", 335 | "pygments_lexer": "ipython3", 336 | "version": "3.6.5" 337 | } 338 | }, 339 | "nbformat": 4, 340 | "nbformat_minor": 2 341 | } 342 | -------------------------------------------------------------------------------- /notebooks/imgs/arima.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/arima.png -------------------------------------------------------------------------------- /notebooks/imgs/detect_anomaly_accumulator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/detect_anomaly_accumulator.png -------------------------------------------------------------------------------- /notebooks/imgs/detect_anomaly_combined.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/detect_anomaly_combined.png -------------------------------------------------------------------------------- /notebooks/imgs/detect_anomaly_tail_prob.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/detect_anomaly_tail_prob.png -------------------------------------------------------------------------------- /notebooks/imgs/example_ts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/example_ts.png -------------------------------------------------------------------------------- /notebooks/imgs/exp_smoothing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/exp_smoothing.png -------------------------------------------------------------------------------- /notebooks/imgs/fourier_extrapolation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/fourier_extrapolation.png -------------------------------------------------------------------------------- /notebooks/imgs/fourier_extrapolation_behind.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/fourier_extrapolation_behind.png -------------------------------------------------------------------------------- /notebooks/imgs/imgs/detect_anomaly_accumulator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/imgs/detect_anomaly_accumulator.png -------------------------------------------------------------------------------- /notebooks/imgs/imgs/detect_anomaly_combined.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/imgs/detect_anomaly_combined.png -------------------------------------------------------------------------------- /notebooks/imgs/imgs/detect_anomaly_tail_prob.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/imgs/detect_anomaly_tail_prob.png -------------------------------------------------------------------------------- /notebooks/imgs/imgs/example_ts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/imgs/example_ts.png -------------------------------------------------------------------------------- /notebooks/imgs/imgs/fourier_extrapolation.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/imgs/fourier_extrapolation.png -------------------------------------------------------------------------------- /notebooks/imgs/imgs/fourier_extrapolation_behind.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/imgs/fourier_extrapolation_behind.png -------------------------------------------------------------------------------- /notebooks/imgs/imgs/partitioned_ts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/imgs/partitioned_ts.png -------------------------------------------------------------------------------- /notebooks/imgs/kubelet_docker_instance_label.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/kubelet_docker_instance_label.png -------------------------------------------------------------------------------- /notebooks/imgs/kubelet_docker_op_type_label.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/kubelet_docker_op_type_label.png -------------------------------------------------------------------------------- /notebooks/imgs/partitioned_ts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/partitioned_ts.png -------------------------------------------------------------------------------- /notebooks/imgs/prophet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/prophet.png -------------------------------------------------------------------------------- /notebooks/imgs/t-sne_embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/notebooks/imgs/t-sne_embedding.png -------------------------------------------------------------------------------- /presentations/devconf_presentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/presentations/devconf_presentation.pdf -------------------------------------------------------------------------------- /presentations/final_presentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/presentations/final_presentation.pdf -------------------------------------------------------------------------------- /presentations/lightning_talk.ppdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/presentations/lightning_talk.ppdf -------------------------------------------------------------------------------- /presentations/mid-summer_presentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/presentations/mid-summer_presentation.pdf -------------------------------------------------------------------------------- /presentations/pipeline_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AICoE/prometheus-data-science/72cc52b808274fc5b7b6623ef42e5abec7255667/presentations/pipeline_arch.png -------------------------------------------------------------------------------- /prophet_train.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from fbprophet import Prophet 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pylab as plt 6 | import datetime as dt 7 | import argparse 8 | 9 | class ProphetForecast: 10 | def __init__(self, train, test): 11 | self.train = train 12 | self.test = test 13 | 14 | def fit_model(self, n_predict): 15 | m = Prophet(daily_seasonality=False, weekly_seasonality=False, yearly_seasonality=False) 16 | m.fit(self.train) 17 | future = m.make_future_dataframe(periods= len(self.test),freq= '1MIN') 18 | self.forecast = m.predict(future) 19 | 20 | return self.forecast 21 | 22 | def graph(self): 23 | fig = plt.figure(figsize=(40,10)) 24 | # plt.plot(np.array(self.train["ds"]), np.array(self.train["y"]),'b', label="train", linewidth=3) 25 | # plt.plot(np.array(self.test["ds"]), np.array(self.test["y"]), 'g', label="test", linewidth=3) 26 | ds_forecast = np.array(self.forecast["ds"]) 27 | forecast = np.array(self.forecast["yhat"]) 28 | 29 | forecast_lower = np.array(self.forecast["yhat_lower"]) 30 | forecast_upper = np.array(self.forecast["yhat_upper"]) 31 | 32 | ds_forecast = ds_forecast[len(self.train["y"]):] 33 | forecast = forecast[len(self.train["y"]):] 34 | forecast_upper = forecast_upper[len(self.train["y"]):] 35 | forecast_lower = forecast_lower[len(self.train["y"]):] 36 | plt.plot(self.train["ds"], self.train["y"], 'b', label = 'train', linewidth = 3) 37 | plt.plot(self.test["ds"], self.test["y"], 'g', label = 'test', linewidth = 3) 38 | plt.plot(ds_forecast,forecast, 'y', label = 'yhat') 39 | forecast_ds = np.array(self.forecast["ds"]) 40 | # plt.plot(forecast_ds, np.array(self.forecast["yhat"]), 'o', label="yhat", linewidth=3) 41 | plt.plot(ds_forecast, forecast_upper, 'y', label="yhat_upper", linewidth=3) 42 | plt.plot(ds_forecast, forecast_lower, 'y', label="yhat_lower", linewidth=3) 43 | plt.xlabel("Timestamp") 44 | plt.ylabel("Value") 45 | plt.legend(loc=1) 46 | plt.title("Prophet Model Forecast") 47 | 48 | def calc_delta(vals): 49 | diff = vals - np.roll(vals, 1) 50 | diff[0] = 0 51 | return diff 52 | 53 | def monotonically_inc(vals): 54 | # check corner case 55 | if len(vals) == 1: 56 | return True 57 | diff = calc_delta(vals) 58 | diff[np.where(vals == 0)] = 0 59 | 60 | if ((diff < 0).sum() == 0): 61 | return True 62 | else: 63 | return False 64 | 65 | if __name__ == "__main__": 66 | 67 | parser = argparse.ArgumentParser(description="run Prophet training on time series") 68 | 69 | 
parser.add_argument("--metric", type=str, help='metric name', required=True) 70 | 71 | parser.add_argument("--key", type=int, help='key number') 72 | args = parser.parse_args() 73 | 74 | metric_name = args.metric 75 | # pkl_file = open("../pkl_data/" + metric_name + "_dataframes.pkl", "rb") 76 | pkl_file = open("../data/real_data_test.pkl", "rb") 77 | dfs = pickle.load(pkl_file) 78 | pkl_file.close() 79 | key_vals = list(dfs.keys()) 80 | 81 | selected = [args.key] 82 | for ind in selected: 83 | key = key_vals[ind] 84 | df = dfs[key] 85 | #df = dfs["{'__name__': 'kubelet_docker_operations_latency_microseconds', 'beta_kubernetes_io_arch': 'amd64', 'beta_kubernetes_io_os': 'linux', 'instance': 'cpt-0001.ocp.prod.upshift.eng.rdu2.redhat.com', 'job': 'kubernetes-nodes', 'kubernetes_io_hostname': 'cpt-0001.ocp.prod.upshift.eng.rdu2.redhat.com', 'operation_type': 'version', 'provider': 'rhos', 'quantile': '0.5', 'region': 'compute', 'size': 'small'}"] 86 | df["ds"] = df["timestamps"] 87 | df["y"] = df["values"] 88 | df = df.sort_values(by=['ds']) 89 | print(key) 90 | df["y"] = df["y"].apply(pd.to_numeric) 91 | vals = np.array(df["y"].tolist()) 92 | 93 | df["ds"] = df["ds"] 94 | df["y"] = df["y"] 95 | # check if metric is a counter, if so, run AD on difference 96 | if monotonically_inc(vals): 97 | print("monotonically_inc") 98 | vals = calc_delta(vals) 99 | df["y"] = vals.tolist() 100 | 101 | train = df[0:int(0.7*len(vals))] 102 | test = df[int(0.7*len(vals)):] 103 | 104 | pf = ProphetForecast(train, test) 105 | forecast = pf.fit_model(len(test)) 106 | 107 | f = open("../prophet_forecasts/prophet_model_" + metric_name + "_" + str(args.key) + ".pkl", "wb") 108 | pickle.dump(forecast,f) 109 | print(type(forecast)) 110 | pickle.dump(train, f) 111 | pickle.dump(test,f) 112 | f.close() 113 | 114 | pf.graph() 115 | plt.savefig("../presentation/graphs/prophet_" + str(args.key) + "_" + args.metric + ".png", transparent=True) 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /run_compare_mdls.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | METRIC='http_request_duration_microseconds_quantile' 4 | KEY=60 5 | python prophet_train.py --metric $METRIC --key $KEY 6 | python fourier_train.py --metric $METRIC --key $KEY 7 | python compare_fourier_prophet.py --metric $METRIC --key $KEY 8 | --------------------------------------------------------------------------------