├── README.md ├── images └── promdash.png ├── example_python ├── README.md ├── main.py ├── client.py └── server.py ├── example_golang ├── main.go ├── client.go └── server.go └── workshop.md /README.md: -------------------------------------------------------------------------------- 1 | ## This very outdated repo is archived and will not receive further changes. 2 | -------------------------------------------------------------------------------- /images/promdash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliusv/prometheus_workshop/HEAD/images/promdash.png -------------------------------------------------------------------------------- /example_python/README.md: -------------------------------------------------------------------------------- 1 | The original repository for the Python example is 2 | [brian-brazil/prometheus_workshop_python](https://github.com/brian-brazil/prometheus_workshop_python). 3 | -------------------------------------------------------------------------------- /example_python/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from server import Server 4 | from client import start_request_workers 5 | 6 | 7 | if __name__ == '__main__': 8 | Server().start() 9 | start_request_workers() 10 | -------------------------------------------------------------------------------- /example_golang/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "net/http" 6 | "os" 7 | "time" 8 | 9 | "github.com/justinas/alice" 10 | "github.com/streadway/handy/report" 11 | ) 12 | 13 | var ( 14 | addr = flag.String("listen-address", ":8080", "The address to listen on for HTTP requests.") 15 | 16 | start = time.Now() 17 | ) 18 | 19 | func main() { 20 | flag.Parse() 21 | 22 | http.HandleFunc("/api/", handleAPI) 23 | 24 | // Log every received HTTP request to stdout. 
25 | go http.ListenAndServe(*addr, alice.New( 26 | report.JSONMiddleware(os.Stdout), 27 | ).Then(http.DefaultServeMux)) 28 | 29 | startClient(*addr) 30 | 31 | select {} 32 | } 33 | -------------------------------------------------------------------------------- /example_python/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import math 4 | import urllib2 5 | import thread 6 | import time 7 | 8 | OSCILLATION_PERIOD_SECONDS = 300.0 9 | 10 | 11 | def send_request(method, path): 12 | data = None 13 | if method == 'POST': 14 | data = '' 15 | try: 16 | urllib2.urlopen('http://localhost:8081' + path, data) 17 | except urllib2.HTTPError: 18 | pass 19 | except: 20 | pass 21 | 22 | start = time.time() 23 | 24 | def oscillation_factor(): 25 | return 2 + math.sin(math.sin(2 * math.pi * (time.time() - start) / OSCILLATION_PERIOD_SECONDS)) 26 | 27 | def request_worker(method, path, sleep): 28 | while True: 29 | send_request(method, path) 30 | time.sleep(sleep * oscillation_factor()) 31 | 32 | def start_request_workers(): 33 | thread.start_new_thread(request_worker, ('GET', '/api/foo', .01)) 34 | thread.start_new_thread(request_worker, ('POST', '/api/foo', .15)) 35 | thread.start_new_thread(request_worker, ('GET', '/api/bar', .02)) 36 | thread.start_new_thread(request_worker, ('POST', '/api/foo', .1)) 37 | thread.start_new_thread(request_worker, ('GET', '/api/nonexistent', .5)) 38 | -------------------------------------------------------------------------------- /example_golang/client.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "flag" 6 | "math" 7 | "net/http" 8 | "time" 9 | ) 10 | 11 | var oscillationPeriod = flag.Duration("oscillation-period", 5*time.Minute, "The duration of the rate oscillation period.") 12 | 13 | func startClient(servAddr string) { 14 | 15 | oscillationFactor := func() float64 { 16 | return 2 + math.Sin(math.Sin(2*math.Pi*float64(time.Since(start))/float64(*oscillationPeriod))) 17 | } 18 | 19 | ignoreRequest := func(resp *http.Response, err error) { 20 | if err != nil { 21 | return 22 | } 23 | resp.Body.Close() 24 | } 25 | 26 | // GET /api/foo. 27 | go func() { 28 | for { 29 | ignoreRequest(http.Get("http://" + servAddr + "/api/foo")) 30 | time.Sleep(time.Duration(10*oscillationFactor()) * time.Millisecond) 31 | } 32 | }() 33 | // POST /api/foo. 34 | go func() { 35 | for { 36 | ignoreRequest(http.Post("http://"+servAddr+"/api/foo", "text/plain", &bytes.Buffer{})) 37 | time.Sleep(time.Duration(150*oscillationFactor()) * time.Millisecond) 38 | } 39 | }() 40 | // GET /api/bar. 41 | go func() { 42 | for { 43 | ignoreRequest(http.Get("http://" + servAddr + "/api/bar")) 44 | time.Sleep(time.Duration(20*oscillationFactor()) * time.Millisecond) 45 | } 46 | }() 47 | // POST /api/bar. 48 | go func() { 49 | for { 50 | ignoreRequest(http.Post("http://"+servAddr+"/api/bar", "text/plain", &bytes.Buffer{})) 51 | time.Sleep(time.Duration(100*oscillationFactor()) * time.Millisecond) 52 | } 53 | }() 54 | // GET /api/nonexistent. 
55 | go func() { 56 | for { 57 | ignoreRequest(http.Get("http://" + servAddr + "/api/nonexistent")) 58 | time.Sleep(time.Duration(500*oscillationFactor()) * time.Millisecond) 59 | } 60 | }() 61 | } 62 | -------------------------------------------------------------------------------- /example_python/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import random 4 | import threading 5 | import time 6 | from BaseHTTPServer import BaseHTTPRequestHandler 7 | from BaseHTTPServer import HTTPServer 8 | from SocketServer import ThreadingMixIn 9 | 10 | start = time.time() 11 | 12 | def generate_request_handler(average_latency_seconds, error_ratio, outage_duration_seconds): 13 | def f(self): 14 | in_outage = (time.time() - start) % (10 * outage_duration_seconds) < outage_duration_seconds 15 | sleep_time = max(0, random.normalvariate(average_latency_seconds, average_latency_seconds/10)) 16 | time.sleep(sleep_time * (3 if in_outage else 1)) 17 | if random.random() < error_ratio * (10 if in_outage else 1): 18 | self.send_response(500) 19 | else: 20 | self.send_response(200) 21 | self.end_headers() 22 | return f 23 | 24 | def handler_404(self): 25 | self.send_response(404) 26 | 27 | 28 | ROUTES = { 29 | ('GET', "/"): lambda self: self.wfile.write("Hello World!"), 30 | ('GET', "/favicon.ico"): lambda self: self.send_response(404), 31 | ('GET', "/api/foo"): generate_request_handler(.01, .005, 23.0), 32 | ('POST', "/api/foo"): generate_request_handler(.02, .02, 60.0), 33 | ('GET', "/api/bar"): generate_request_handler(.015, .00025, 13.0), 34 | ('POST', "/api/bar"): generate_request_handler(.05, .01, 47.0), 35 | } 36 | 37 | class Handler(BaseHTTPRequestHandler): 38 | def do_GET(self): 39 | ROUTES.get(('GET', self.path), handler_404)(self) 40 | 41 | def do_POST(self): 42 | ROUTES.get(('POST', self.path), handler_404)(self) 43 | 44 | class MultiThreadedHTTPServer(ThreadingMixIn, HTTPServer): 45 | pass 46 | 47 | class Server(threading.Thread): 48 | def run(self): 49 | httpd = MultiThreadedHTTPServer(('', 8081), Handler) 50 | httpd.serve_forever() 51 | -------------------------------------------------------------------------------- /example_golang/server.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "math/rand" 5 | "net/http" 6 | _ "net/http/pprof" 7 | "time" 8 | ) 9 | 10 | type responseOpts struct { 11 | baseLatency time.Duration 12 | errorRatio float64 13 | 14 | // Whenever 10*outageDuration has passed, an outage will be simulated 15 | // that lasts for outageDuration. During the outage, errorRatio is 16 | // increased by a factor of 10, and baseLatency by a factor of 3. At 17 | // start-up time, an outage is simulated, too (so that you can see the 18 | // effects right ahead and don't have to wait for 10*outageDuration). 
19 | outageDuration time.Duration 20 | } 21 | 22 | var opts = map[string]map[string]responseOpts{ 23 | "/api/foo": map[string]responseOpts{ 24 | "GET": responseOpts{ 25 | baseLatency: 10 * time.Millisecond, 26 | errorRatio: 0.005, 27 | outageDuration: 23 * time.Second, 28 | }, 29 | "POST": responseOpts{ 30 | baseLatency: 20 * time.Millisecond, 31 | errorRatio: 0.02, 32 | outageDuration: time.Minute, 33 | }, 34 | }, 35 | "/api/bar": map[string]responseOpts{ 36 | "GET": responseOpts{ 37 | baseLatency: 15 * time.Millisecond, 38 | errorRatio: 0.0025, 39 | outageDuration: 13 * time.Second, 40 | }, 41 | "POST": responseOpts{ 42 | baseLatency: 50 * time.Millisecond, 43 | errorRatio: 0.01, 44 | outageDuration: 47 * time.Second, 45 | }, 46 | }, 47 | } 48 | 49 | func handleAPI(w http.ResponseWriter, r *http.Request) { 50 | pathOpts, ok := opts[r.URL.Path] 51 | if !ok { 52 | http.Error(w, "Not Found", http.StatusNotFound) 53 | return 54 | } 55 | methodOpts, ok := pathOpts[r.Method] 56 | if !ok { 57 | http.Error(w, "Method not Allowed", http.StatusMethodNotAllowed) 58 | return 59 | } 60 | 61 | latencyFactor := time.Duration(1) 62 | errorFactor := 1. 63 | if time.Since(start)%(10*methodOpts.outageDuration) < methodOpts.outageDuration { 64 | latencyFactor *= 3 65 | errorFactor *= 10 66 | } 67 | time.Sleep( 68 | (methodOpts.baseLatency + time.Duration(rand.NormFloat64()*float64(methodOpts.baseLatency)/10)) * latencyFactor, 69 | ) 70 | if rand.Float64() <= methodOpts.errorRatio*errorFactor { 71 | http.Error(w, "Internal Server Error", http.StatusInternalServerError) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /workshop.md: -------------------------------------------------------------------------------- 1 | # Getting started 2 | 3 | ## Getting Prometheus 4 | Download the latest binary release of Prometheus for your platform from: 5 | 6 | https://github.com/prometheus/prometheus/releases 7 | 8 | Extract the contents into a new directory and change to that directory. 9 | 10 | Example for Linux: 11 | 12 | If you're using Prometheus 0.16.0, the tarball already extracts into a separate 13 | sub-directory: 14 | 15 | ``` 16 | wget https://github.com/prometheus/prometheus/releases/download/v1.0.1/prometheus-1.0.1.linux-amd64.tar.gz 17 | tar xfvz prometheus-1.0.1.linux-amd64.tar.gz 18 | cd prometheus-1.0.1.linux-amd64 19 | ``` 20 | 21 | ## Configuring Prometheus to monitor itself 22 | 23 | Take a look at the included example `prometheus.yml` configuration file. It 24 | configures global options, as well as a single job to scrape metrics from: the 25 | Prometheus server itself. 26 | 27 | Prometheus collects metrics from monitored targets by scraping metrics HTTP 28 | endpoints on these targets. Since Prometheus also exposes data in the same 29 | manner about itself, it may also be used to scrape and monitor its own health. 30 | While a Prometheus server which collects only data about itself is not very 31 | useful in practice, it is a good starting example. 32 | 33 | ## Starting Prometheus 34 | Start Prometheus. By default, Prometheus reads its config from a file 35 | called `prometheus.yml` in the current working directory, and it 36 | stores its database in a sub-directory called `data`, again relative 37 | to the current working directory. Both behaviors can be changed using 38 | the flags `-config.file` or `-storage.local.path`, respectively. 
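For orientation, the self-scraping setup described in the previous section boils down to a configuration roughly like the following (a sketch; the exact `prometheus.yml` shipped with your release may differ in details such as the intervals):

```
global:
  scrape_interval:     15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
```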
39 | 40 | ``` 41 | ./prometheus -config.file=prometheus.yml -storage.local.path=data 42 | ``` 43 | 44 | Prometheus should start up and it should show the targets it scrapes at 45 | [http://localhost:9090/targets](http://localhost:9090/targets). You 46 | will find [http://localhost:9090/metrics](http://localhost:9090/metrics) in the 47 | list of scraped targets. Give Prometheus a couple of seconds to start 48 | collecting data about itself from its own HTTP metrics endpoint. 49 | 50 | You can also verify that Prometheus is serving metrics about itself by 51 | navigating to its metrics exposure endpoint: 52 | [http://localhost:9090/metrics](http://localhost:9090/metrics). 53 | 54 | ## Using the expression browser 55 | The query interface at 56 | [http://localhost:9090/](http://localhost:9090/) allows you to 57 | explore metric data collected by the Prometheus server. At the moment, the 58 | server is only scraping itself. The collected metrics are already quite 59 | interesting, though. The *Console* tab shows the most recent value of metrics, 60 | while the *Graph* tab plots values over time. The latter can be quite expensive 61 | (for both the server and the browser). It is in general a good idea to try 62 | potentially expensive expressions in the *Console* tab first. Take a bit of 63 | time to play with the expression browser. Suggestions: 64 | 65 | * Evaluate `prometheus_local_storage_ingested_samples_total`, which shows you 66 | the total number of ingested samples over the lifetime of the server. In the 67 | *Graph* tab, it will show as steadily increasing. 68 | * The expression `prometheus_local_storage_ingested_samples_total[1m]` 69 | evaluates to all sample values of the metric in the last minute. It cannot be 70 | plotted as a graph, but in the *Console* tab, you see a list of the values with 71 | (Unix) timestamp. 72 | * `rate(prometheus_local_storage_ingested_samples_total[1m])` calculates the 73 | rate (increase per second) over the 1m timeframe. In other words, it tells you 74 | how many samples per second your server is ingesting. This expression can be 75 | plotted nicely, and it will become more interesting as you add more targets. 76 | 77 | ## Start the node exporter 78 | The node exporter is a server that exposes system statistics about the machine 79 | it is running on as Prometheus metrics. 80 | 81 | Download the latest node exporter binary release for your platform from: 82 | 83 | https://github.com/prometheus/node_exporter/releases 84 | 85 | Beware that the majority of the node exporter's functionality is 86 | Linux-specific, so its exposed metrics will be significantly reduced when 87 | running it on other platforms. 88 | 89 | Linux example: 90 | 91 | ``` 92 | wget https://github.com/prometheus/node_exporter/releases/download/0.12.0/node_exporter-0.12.0.linux-amd64.tar.gz 93 | tar xvfz node_exporter-0.12.0.linux-amd64.tar.gz 94 | cd node_exporter-0.12.0.linux-amd64 95 | ``` 96 | 97 | Start the node exporter: 98 | 99 | ``` 100 | ./node_exporter 101 | ``` 102 | 103 | ## Configure Prometheus to monitor the node exporter 104 | 105 | If you are not running your local node exporter under Linux, you might want to 106 | point your Prometheus server to a Linux node exporter run by one of your peers 107 | in the workshop. Or point it to a node exporter we are running during the 108 | workshop at 109 | [http://demo.robustperception.io:9100/metrics](http://demo.robustperception.io:9100/metrics). 
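Whichever node exporter you point at, you can quickly verify that it is reachable and serving metrics by fetching its metrics endpoint directly, for example:

```
curl -s http://demo.robustperception.io:9100/metrics | head
```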
110 | 111 | Add the following job configuration to the `scrape_configs:` section 112 | in `prometheus.yml` to monitor both your own and the demo node 113 | exporter: 114 | 115 | ``` 116 | - job_name: 'node' 117 | scrape_interval: '15s' 118 | static_configs: 119 | - targets: 120 | - 'localhost:9100' 121 | - 'demo.robustperception.io:9100' 122 | ``` 123 | 124 | Send your Prometheus server a `SIGHUP` to initiate a reload of the configuration: 125 | 126 | ``` 127 | killall -HUP prometheus 128 | ``` 129 | 130 | Then check the *Status* page of your Prometheus server to make sure the node 131 | exporter is scraped correctly. Shortly after, a whole lot of interesting 132 | metrics will show up in the expression browser, each of them starting with 133 | `node_`. (Reload the page to see them in the autocompletion.) As an example, 134 | have a look at `node_cpu`. 135 | 136 | The node exporter has a whole lot of modules to export machine 137 | metrics. Have a look at the 138 | [README.md](https://github.com/prometheus/node_exporter) to get an 139 | idea. While Prometheus is particularly good at collecting service 140 | metrics, correlating those with system metrics from individual 141 | machines can be immensely helpful. (Perhaps that one task that showed 142 | high latency yesterday was scheduled on a node with a lot of competing 143 | disk operations?) 144 | 145 | ## Use the node exporter to export the contents of a text file 146 | The *textfile* module of the node exporter can be used to expose static 147 | machine-level metrics (such as what role a machine has) or the outcome of 148 | machine-tied batch jobs (such as a Chef client run). To use it, create a 149 | directory for the text files to export and (re-)start the node exporter with 150 | the `-collector.textfile.directory` flag set. Finally, create a text file in 151 | that directory. 152 | 153 | ``` 154 | mkdir textfile-exports 155 | ./node_exporter --collector.textfile.directory=textfile-exports 156 | echo 'role{role="workshop_node_exporter"} 1' > textfile-exports/role.prom.$$ 157 | mv textfile-exports/role.prom.$$ textfile-exports/role.prom 158 | ``` 159 | 160 | For details, see the 161 | [documentation](https://github.com/prometheus/node_exporter#textfile-collector). 162 | 163 | ## Configuring targets with service discovery 164 | 165 | Above you have seen how to configure multiple targets. You can also 166 | have multiple `- targets: [...]` sub-sections in the `static_configs` 167 | section, each with a different set of labels. 168 | 169 | Prometheus adds an `instance` label with the hostname and port as the value to 170 | each metric scraped from any target. With that label, you can later aggregate 171 | or separate metrics from different targets. 172 | 173 | In practice, configuring many targets statically is often a 174 | maintenance burden. The solution is service discovery. Currently, 175 | Prometheus supports service discovery via a number of methods. Here, 176 | we will look at service discovery via DNS SRV records. To try out a 177 | DNS SRV record, we have created one for `_demo-node._tcp.prometheus.io`: 178 | 179 | ``` 180 | dig +short SRV _demo-node._tcp.prometheus.io 181 | ``` 182 | 183 | Only one host and port is returned (the already known `_demo-node._tcp.prometheus.io` 184 | on port 9100), but any number of host/port combinations could be part of the 185 | SRV record. Prometheus regularly polls the DNS information and dynamically 186 | adjusts the targets. 
To configure a job with DNS service discovery, add the 187 | following to `prometheus.yml`: 188 | 189 | ``` 190 | - job_name: 'discovered_node' 191 | dns_sd_configs: 192 | - names: 193 | - '_demo-node._tcp.prometheus.io' 194 | ``` 195 | 196 | # The expression language 197 | 198 | With more metrics collected by your Prometheus server, it is time to 199 | familiarize yourself a bit more with the expression language. For comprehensive 200 | documentation, check out the 201 | [querying chapter](http://prometheus.io/docs/querying/basics/). The following 202 | is meant as an inspiration for how to play with the metrics currently collected 203 | by your server. Evaluate them in the *Console* and *Graph* tab. For the latter, 204 | try different time ranges and the *stacked* option. 205 | 206 | ## The `rate()` function 207 | Prometheus internally organizes sample data in chunks. It performs a number of 208 | different chunk operations on them and exposes them as 209 | `prometheus_local_storage_chunk_ops_total`, which is comprised of a number of 210 | counters, one per possible chunk operation. To see a rate of chunk operations 211 | per second, use the rate function over a time range that should cover at least 212 | a handful of scrape intervals. 213 | 214 | ``` 215 | rate(prometheus_local_storage_chunk_ops_total[1m]) 216 | ``` 217 | 218 | Now you can see the rate for each chunk operation type. Note that the rate 219 | function handles counter resets (for example if a binary is restarted). 220 | Whenever a counter goes down, the function assumes that a counter reset has 221 | happened and the counter has started counting from `0`. 222 | 223 | ## The `sum` aggregation operator 224 | If you want to get the total rate for all operations, you need to sum up the 225 | rates: 226 | 227 | ``` 228 | sum(rate(prometheus_local_storage_chunk_ops_total[1m])) 229 | ``` 230 | 231 | Note that you need to take the sum of the rate, and not the rate of the sum. 232 | (Exercise for the reader: Why?) 233 | 234 | ## Select by label 235 | If you want to look only at the persist operation, you can filter by label with 236 | curly braces: 237 | 238 | ``` 239 | rate(prometheus_local_storage_chunk_ops_total{type="persist"}[1m]) 240 | ``` 241 | 242 | You can use multiple label pairs within the curly braces (comma-separated), and 243 | the match can be inverted (with `!=`) or performed with a regular expression 244 | (with `=~`, or `!~` for the inverted match). 245 | 246 | (Exercise: How to estimate the average number of samples per chunk?) 247 | 248 | ## Aggregate by label 249 | The metric `http_request_duration_microseconds_count` counts the number of HTTP 250 | requests processed. (Disregard the `duration_microseconds` part for now. It 251 | will be explained later.) If you look at it in the *Console* tab, you can see 252 | the many time series with that name. The metric is partitioned by handler, 253 | instance, and job, resulting in many sample values at any given time. We call 254 | that an instant vector. 255 | 256 | If you are only interested in which job is serving how many QPS, you can let 257 | the sum operator aggregate by job (resulting in the two jobs we are monitoring, 258 | the Prometheus itself and the node exporter): 259 | 260 | ``` 261 | sum(rate(http_request_duration_microseconds_count[5m])) by (job) 262 | ``` 263 | 264 | A combination of label pairs is possible, too. 
You can aggregate by job and 265 | instance (which is interesting if you have added an additional node exporter to 266 | your config): 267 | 268 | ``` 269 | sum(rate(http_request_duration_microseconds_count[5m])) by (job, instance) 270 | ``` 271 | 272 | Note that there is an alternative syntax with the `by` clause following 273 | directly the aggregation operator. This syntax is particularly useful in 274 | complex nested expressions, where it otherwise becomes difficult to spot which 275 | `by` clause belongs to which operator. 276 | 277 | ``` 278 | sum by (job, instance) (rate(http_request_duration_microseconds_count[5m])) 279 | ``` 280 | 281 | ## Arithmetic 282 | There is a metric `http_request_duration_microseconds_sum`, which sums up the 283 | duration of all HTTP requests. If the labels match, you can easily divide two 284 | instant vectors, yielding the average request duration in this case: 285 | 286 | ``` 287 | rate(http_request_duration_microseconds_sum[5m]) / rate(http_request_duration_microseconds_count[5m]) 288 | ``` 289 | 290 | You can aggregate as above if you do it separately for numerator and 291 | denominator: 292 | 293 | ``` 294 | sum(rate(http_request_duration_microseconds_sum[5m])) by (job) / sum(rate(http_request_duration_microseconds_count[5m])) by (job) 295 | ``` 296 | 297 | Things become more interesting if the labels do not match perfectly 298 | between two instant vectors or you want to match vector elements in a 299 | many-to-one or one-to-many fashion. See the 300 | [vector-matching section](http://prometheus.io/docs/querying/operators/#vector-matching) 301 | in the documentation for details. 302 | 303 | ## Summaries 304 | Rather than an average request duration, you will be more often interested in 305 | quantiles like the median or the 90th percentile. To serve that need, 306 | Prometheus offers summaries. `http_request_duration_microseconds` is a summary 307 | of HTTP request durations, and `http_request_duration_microseconds_sum` and 308 | `http_request_duration_microseconds_count` are merely byproducts of that 309 | summary. If you look at `http_request_duration_microseconds` in the expression 310 | browser, you see a multitude of time series, as the metric is now partitioned 311 | by quantile, too. An expression like 312 | `http_request_duration_microseconds{quantile="0.9"}` displays the 90th 313 | percentile request duration. You might be tempted to aggregate the result as 314 | you have done above. Not possible, unfortunately. Welcome to the wonderland of 315 | statistics. 316 | 317 | Read more about 318 | [histograms and summaries](http://prometheus.io/docs/practices/histograms/) 319 | in the documentation. 320 | 321 | ## Recording rules 322 | In your practical work with Prometheus at scale, you will pretty soon run into 323 | expressions that are very expensive and slow to evaluate. The remedy is 324 | *recording* rules, a way to tell Prometheus to pre-calculate expressions, 325 | saving the result in a new time series, which can then be used instead of the 326 | expensive expression. See the documentation for details: 327 | * [General documentation about rules](http://prometheus.io/docs/querying/rules/). 328 | * [Best practices for naming rules](http://prometheus.io/docs/practices/). 329 | 330 | # Instrument code: Go 331 | 332 | *This section is about instrumenting a Go application. 
If you prefer 333 | Python, continue with the next section.* 334 | 335 | ## The example application 336 | 337 | The example application is in the same GitHub repository as these 338 | instructions. If you have not done so yet, clone the repository: 339 | 340 | ``` 341 | $ cd $GOPATH/src/ 342 | $ mkdir -p github.com/juliusv 343 | $ cd github.com/juliusv 344 | $ git clone https://github.com/juliusv/prometheus_workshop.git 345 | $ cd prometheus_workshop/example_golang 346 | $ go get -d 347 | $ go build 348 | $ ./example_golang 349 | ``` 350 | 351 | Study the code to understand what it is doing. Note that the 352 | application has been kept very simple for demonstration purposes and 353 | implements a server and a client in the same binary. 354 | 355 | ## Instrument it 356 | Instrument the server part with Prometheus. Things to keep in mind: 357 | 358 | * What would be useful to instrument? 359 | * What would be good variable names? 360 | * How can I instrument in one place rather than many? 361 | * How can/should I use labels? 362 | * How to expose the `/metrics` endpoint? 363 | 364 | The following links will be helpful: 365 | * [Documentation for the Prometheus Go client library](https://godoc.org/github.com/prometheus/client_golang/prometheus). 366 | * [Instrumentation guidelines](http://prometheus.io/docs/practices/instrumentation/). 367 | * [Naming conventions](http://prometheus.io/docs/practices/naming/). 368 | 369 | If you are lost, you can look at instrumented code in the branch called 370 | `instrumented` in the GitHub repository above. Note that the example 371 | instrumentation is not necessarily ideal and/or complete. 372 | 373 | # Instrument Code: Python 374 | 375 | *This section is about instrumenting a Python application. If you 376 | prefer Go, continue with the previous section.* 377 | 378 | ## The example application 379 | 380 | The example application is in the same GitHub repository as these 381 | instructions. If you have not done so yet, clone the repository: 382 | 383 | ``` 384 | $ git clone https://github.com/juliusv/prometheus_workshop.git 385 | $ cd prometheus_workshop/example_python 386 | ``` 387 | 388 | Install the Prometheus Python client library: 389 | 390 | ``` 391 | $ pip install prometheus_client 392 | ``` 393 | 394 | If you don't want to install python libraries globally, pass the `--user` flag to pip. 395 | 396 | Run the example application: 397 | 398 | ``` 399 | $ python main.py 400 | ``` 401 | 402 | ## Instrument it 403 | Instrument the client and server with Prometheus. Things to keep in mind: 404 | 405 | * What would be useful to instrument? 406 | * What would be good variable names? 407 | * How can I instrument in one place rather than many? 408 | * How can/should I use labels? 409 | * How to expose the /metrics endpoint? 410 | 411 | The following links will be helpful: 412 | * [Documentation for the Prometheus Python client library](https://github.com/prometheus/client_python#prometheus-python-client). 413 | * [Instrumentation guidelines](http://prometheus.io/docs/practices/instrumentation/). 414 | * [Naming conventions](http://prometheus.io/docs/practices/naming/). 415 | 416 | # Dashboard Building: Console Templates 417 | Console templates are a built-in dashboarding system in the Prometheus server. 418 | They are based on Go's templating language, which is more strongly typed than a 419 | typical web templating engine. 420 | 421 | You can see an example at 422 | [http://localhost:9090/consoles/node.html](http://localhost:9090/consoles/node.html). 
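Before starting on the task below, it helps to see the basic skeleton such a template follows. The sketch below mirrors the pattern used by the shipped example consoles; the expression and the `http_requests_total` metric name are placeholders for whatever you exposed in your instrumented application:

```
{{template "head" .}}
{{template "prom_content_head" .}}
<h1>My Service</h1>

<h3>QPS</h3>
<div id="qpsGraph"></div>
<script>
new PromConsole.Graph({
  node: document.querySelector("#qpsGraph"),
  expr: "sum(rate(http_requests_total[5m]))"
})
</script>

{{template "prom_content_tail" .}}
{{template "tail"}}
```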
423 | 424 | Task: Create a dashboard of QPS, latency, and "up" servers for the Go/Python 425 | code you instrumented above. 426 | 427 | The `consoles` directory that was part of the Prometheus tar-ball 428 | unpacked above contains a number of examples you can take as a base to 429 | work off. Look at `cassandra.html` for a start. (You can also access 430 | the 431 | [consoles directory on GitHub](https://github.com/prometheus/prometheus/blob/master/consoles/cassandra.html).) 432 | 433 | # Dashboard Building: PromDash 434 | 435 | TODO: PromDash is deprecated. Replace this section with Grafana. See 436 | https://prometheus.io/docs/visualization/grafana/ 437 | 438 | PromDash is a browser-based dashboard builder for Prometheus. It is a Rails 439 | application and stores its dashboard metadata in a configurable SQL backend. 440 | The actual graph data is retrieved by the browser via AJAX requests from the 441 | configured Prometheus servers. 442 | 443 | Follow the installation procedure at https://github.com/prometheus/promdash/blob/master/README.md. 444 | 445 | Let's create a dashboard to monitor the health of the Prometheus instance 446 | itself: 447 | 448 | 1. Head over to http://localhost:3000 and click "New Dashboard". 449 | 2. Create a dashboard called "<username>-workshop" (you don't need to select a 450 | directory). PromDash will redirect you to your new, empty dashboard. 451 | 3. Set the "Range" input field just under the dashboard title to "30m" to show 452 | the last 30 minutes of data in the dashboard (feel free to play with the graph 453 | time range later). 454 | 455 | Let's create a graph that shows the ingested samples per second: 456 | 457 | 1. Click on the "Datasources" menu item in the header line of the empty graph. 458 | 2. Click "Add Expression" and set the expression to 459 | `rate(prometheus_local_storage_ingested_samples_total[1m])` 460 | The graph should show the per-second rate of ingested samples. 461 | 3. Let's give the graph a title. Open the "Graph and axis settings" graph menu 462 | and set the title to "Ingested samples [rate-1m]". 463 | 4. Open the "Legend Settings" graph menu and set "Show legend" to "never", 464 | since this graph only contains a single time series. 465 | 5. Press "Save Changes" to save your progress. 466 | 467 | Let's add another graph showing the rates of the various chunk operations: 468 | 469 | 1. Click the "Add Graph" button to add a second graph. 470 | 2. Add the following expression to the graph: 471 | 472 | `rate(prometheus_local_storage_chunk_ops_total[1m])` 473 | 474 | The graph should now show the per-second rate of chunk operations of various kinds. 475 | 3. Set the graph title to "Chunk ops [rate-1m]". 476 | 4. The legend currently shows all labels of the returned time series, although 477 | only the "chunk" label differs. To show only that label in the legend, click 478 | the "Legend Settings" tab and set the existing "Legend format" input to 479 | `{{type}}`. 480 | 5. Because a graph may have multiple expressions with different applicable 481 | legend format strings each, we still need to assign each legend format string 482 | to a particular expression. Open the "Datasources" graph menu again and in the 483 | "- Select format string -" dropdown, select the format string that you just 484 | created. 485 | 6. Press "Save Changes" to save your progress. 486 | 487 | Finally, let's add a gauge that shows the number of expression queries 488 | performed against your Prometheus server per second: 489 | 490 | 1. 
Click the "Add Gauge" button to add a gauge. 491 | 2. Set the gauge expression to: 492 | 493 | `scalar(sum(rate(http_request_duration_microseconds_count{handler=~"/api/query"}[1m])))` 494 | 495 | The gauge should now show the number of expression queries per second. Note 496 | that a gauge only supports queries which result in scalar values without any 497 | labels. Thus, the entire expression is wrapped in a scalar() call. 498 | 3. Under the "Chart settings" menu tab of your gauge, set the title to 499 | "Expression queries [rate-1m]". 500 | 4. Let's adjust the gauge's maximum value. In the "Chart settings" menu tab of 501 | the gauge, set the "Gauge Max" to a lower value that seems reasonable for the 502 | rate of queries your server is getting. For example, try setting it to 5. 503 | 5. Press "Save Changes" to save your progress. 504 | 505 | Your dashboard should now look somewhat like this: 506 | 507 | [![PromDash screenshot](/images/promdash.png)](#promdash) 508 | 509 | PromDash supports many more features which we will not be able to explore in 510 | this workshop. For example: 511 | 512 | * graphing multiple expressions from different servers in the same graph 513 | * mapping expressions to different axes and setting various axis options 514 | * building templatized dashboards using template variables 515 | * adding pie charts, Graphite graphs, or arbitrary web content 516 | 517 | For a more comprehensive overview, see the [PromDash 518 | documentation](http://prometheus.io/docs/visualization/promdash/). 519 | 520 | # Alerting 521 | 522 | With instrumentation and a meaningful dashboard in place, the time is 523 | ripe to think about alerting. Alerting rules are set up similarly to 524 | recording rules. See the 525 | [section about alerting rules](https://prometheus.io/docs/alerting/rules/) 526 | in the documentation. You can inspect the status of configured alerts 527 | in the Alerts section of the Prometheus server's status page 528 | [http://localhost:9090/alerts](http://localhost:9090/alerts). However, 529 | for proper notifications, you need to set up an 530 | [Alertmanager](https://github.com/prometheus/alertmanager). 531 | 532 | To play with the Alertmanager, you can download a release from 533 | https://github.com/prometheus/alertmanager/releases. 534 | 535 | TODO: Alertmanager setup instructions. 536 | 537 | In the workshop, we will run the Alertmanager without any configured 538 | notifications, just to see how alerts arrive there. In practice, you want to 539 | configure one of the many notification methods described 540 | [in the docmentation](http://prometheus.io/docs/alerting/alertmanager/). Pay 541 | special attention to the aggregation rules, which allow you to route alerts to 542 | different destinations. 543 | 544 | To point your Prometheus server to an Alertmanager, use the `-alertmanager.url` 545 | flag. 546 | 547 | Alerting rules use the same expression language as used for graphing 548 | before. Here is an example for a very fundamental alerting rule: 549 | 550 | ``` 551 | # Alert for any monitored instance that is unreachable for >2 minutes. 
552 | ALERT InstanceDown 553 | IF up == 0 554 | FOR 2m 555 | LABELS { 556 | severity="page" 557 | } 558 | ANNOTATIONS { 559 | summary = "Instance {{$labels.instance}} down", 560 | description = "{{$labels.instance}} of job {{$labels.job}} has been down for more than 2 minutes." 561 | 562 | } 563 | ``` 564 | 565 | Add the rule to a configured rule file, reload the config, and observe the 566 | _Alerts_ tab on the Prometheus server status page and the _Alerts_ tab on the 567 | Alertmanager status page while you start and stop jobs monitored by your 568 | server. 569 | 570 | For meaningful alerting, refer to the 571 | [best practices section about alerting](http://prometheus.io/docs/practices/alerting/). 572 | 573 | Create a useful alerting rule for your example application and try it out. Possible tasks: 574 | * Create an alert on your service being down. Stop the service and 575 | check if and when the alert fires. 576 | * The example service simulates short outages now and then. Create an 577 | alert that will detect them. 578 | * Modify the code to simulate other kinds of outages and create alerts to 579 | detect them. 580 | * Run the example service multiple times (on different ports). Create 581 | alerts that fire if a certain percentage of replicas are down. 582 | * Create an alert that fires if the disk is predicted to run full 583 | within the next six 584 | hours. ([Hint](http://www.robustperception.io/reduce-noise-from-disk-space-alerts/) 585 | for cheaters.) 586 | 587 | # Pushing Metrics 588 | 589 | Occasionally, you might need to push metrics that are not 590 | machine-related. (The latter would be exposed via the `textfile` 591 | module of the node exporter, see above.) The 592 | [Pushgateway](http://prometheus.io/docs/instrumenting/pushing/) is a 593 | possible solution in that case. Note that it is not meant to change 594 | Prometheus's semantics to a push-based model. 595 | 596 | To play with the Pushgateway, you can download a release from 597 | https://github.com/prometheus/pushgateway/releases or build one from source 598 | yourself. 599 | 600 | Configure your Prometheus server to scrape the Pushgateway. The scrape config 601 | for a Pushgateway should have `honor_labels` set to `true`. (Later, you can try 602 | out what happens if you leave it at its default value `false`.) 603 | 604 | ``` 605 | - job_name: 'pushgateway' 606 | scrape_interval: '15s' 607 | honor_labels: true 608 | static_configs: 609 | - targets: 610 | - 'localhost:9091' 611 | ``` 612 | 613 | Prometheus client libraries allow you to push to the Pushgateway, but you can 614 | also push in a very simple way using `curl`. Imagine a script that runs a 615 | database backup via some kind of (possibly distributed) cron solution. Upon 616 | successful completion, it should report the completion timestamp. 617 | 618 | ``` 619 | #!/bin/bash 620 | 621 | set -e 622 | 623 | # Some command that creates the backup. 624 | 625 | echo "db_backup_last_success_timestamp_seconds $(date +%s)" | curl --data-binary @- http://demo-node.prometheus.io:9091/metrics/job/foo_db 626 | ``` 627 | 628 | Check the status page of the Pushgateway and its `/metrics` endpoint for the 629 | pushed metrics, and then observe how it is ingested by Prometheus. Pay special 630 | attention to the difference between the scrape timestamp and the timestamp that 631 | is the value of the metric. How would you graph the age of the backup? How 632 | would you alert on a backup too old? 
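One possible answer, as a sketch: since the pushed value is a Unix timestamp, subtracting it from `time()` yields the age of the last successful backup in seconds, which you can graph directly:

```
time() - db_backup_last_success_timestamp_seconds
```

An alerting rule on top of that expression could then page when the backup is older than some threshold; the 25-hour limit below is just an example value:

```
ALERT DBBackupTooOld
  IF time() - db_backup_last_success_timestamp_seconds > 25 * 3600
  LABELS {
    severity="page"
  }
  ANNOTATIONS {
    summary = "Backup for job {{$labels.job}} is too old",
    description = "The last successful backup for job {{$labels.job}} finished more than 25 hours ago."
  }
```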
633 | --------------------------------------------------------------------------------