├── README.md ├── images └── promdash.png ├── example_python ├── README.md ├── main.py ├── client.py └── server.py ├── example_golang ├── main.go ├── client.go └── server.go └── workshop.md /README.md: -------------------------------------------------------------------------------- 1 | ## This very outdated repo is archived and will not receive further changes. 2 | -------------------------------------------------------------------------------- /images/promdash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliusv/prometheus_workshop/HEAD/images/promdash.png -------------------------------------------------------------------------------- /example_python/README.md: -------------------------------------------------------------------------------- 1 | The original repository for the Python example is 2 | [brian-brazil/prometheus_workshop_python](https://github.com/brian-brazil/prometheus_workshop_python). 3 | -------------------------------------------------------------------------------- /example_python/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from server import Server 4 | from client import start_request_workers 5 | 6 | 7 | if __name__ == '__main__': 8 | Server().start() 9 | start_request_workers() 10 | -------------------------------------------------------------------------------- /example_golang/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "net/http" 6 | "os" 7 | "time" 8 | 9 | "github.com/justinas/alice" 10 | "github.com/streadway/handy/report" 11 | ) 12 | 13 | var ( 14 | addr = flag.String("listen-address", ":8080", "The address to listen on for HTTP requests.") 15 | 16 | start = time.Now() 17 | ) 18 | 19 | func main() { 20 | flag.Parse() 21 | 22 | http.HandleFunc("/api/", handleAPI) 23 | 24 | // Log every received HTTP request to stdout. 
25 | go http.ListenAndServe(*addr, alice.New( 26 | report.JSONMiddleware(os.Stdout), 27 | ).Then(http.DefaultServeMux)) 28 | 29 | startClient(*addr) 30 | 31 | select {} 32 | } 33 | -------------------------------------------------------------------------------- /example_python/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import math 4 | import urllib2 5 | import thread 6 | import time 7 | 8 | OSCILLATION_PERIOD_SECONDS = 300.0 9 | 10 | 11 | def send_request(method, path): 12 | data = None 13 | if method == 'POST': 14 | data = '' 15 | try: 16 | urllib2.urlopen('http://localhost:8081' + path, data) 17 | except urllib2.HTTPError: 18 | pass 19 | except: 20 | pass 21 | 22 | start = time.time() 23 | 24 | def oscillation_factor(): 25 | return 2 + math.sin(math.sin(2 * math.pi * (time.time() - start) / OSCILLATION_PERIOD_SECONDS)) 26 | 27 | def request_worker(method, path, sleep): 28 | while True: 29 | send_request(method, path) 30 | time.sleep(sleep * oscillation_factor()) 31 | 32 | def start_request_workers(): 33 | thread.start_new_thread(request_worker, ('GET', '/api/foo', .01)) 34 | thread.start_new_thread(request_worker, ('POST', '/api/foo', .15)) 35 | thread.start_new_thread(request_worker, ('GET', '/api/bar', .02)) 36 | thread.start_new_thread(request_worker, ('POST', '/api/foo', .1)) 37 | thread.start_new_thread(request_worker, ('GET', '/api/nonexistent', .5)) 38 | -------------------------------------------------------------------------------- /example_golang/client.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "flag" 6 | "math" 7 | "net/http" 8 | "time" 9 | ) 10 | 11 | var oscillationPeriod = flag.Duration("oscillation-period", 5*time.Minute, "The duration of the rate oscillation period.") 12 | 13 | func startClient(servAddr string) { 14 | 15 | oscillationFactor := func() float64 { 16 | return 2 + math.Sin(math.Sin(2*math.Pi*float64(time.Since(start))/float64(*oscillationPeriod))) 17 | } 18 | 19 | ignoreRequest := func(resp *http.Response, err error) { 20 | if err != nil { 21 | return 22 | } 23 | resp.Body.Close() 24 | } 25 | 26 | // GET /api/foo. 27 | go func() { 28 | for { 29 | ignoreRequest(http.Get("http://" + servAddr + "/api/foo")) 30 | time.Sleep(time.Duration(10*oscillationFactor()) * time.Millisecond) 31 | } 32 | }() 33 | // POST /api/foo. 34 | go func() { 35 | for { 36 | ignoreRequest(http.Post("http://"+servAddr+"/api/foo", "text/plain", &bytes.Buffer{})) 37 | time.Sleep(time.Duration(150*oscillationFactor()) * time.Millisecond) 38 | } 39 | }() 40 | // GET /api/bar. 41 | go func() { 42 | for { 43 | ignoreRequest(http.Get("http://" + servAddr + "/api/bar")) 44 | time.Sleep(time.Duration(20*oscillationFactor()) * time.Millisecond) 45 | } 46 | }() 47 | // POST /api/bar. 48 | go func() { 49 | for { 50 | ignoreRequest(http.Post("http://"+servAddr+"/api/bar", "text/plain", &bytes.Buffer{})) 51 | time.Sleep(time.Duration(100*oscillationFactor()) * time.Millisecond) 52 | } 53 | }() 54 | // GET /api/nonexistent. 
55 | go func() { 56 | for { 57 | ignoreRequest(http.Get("http://" + servAddr + "/api/nonexistent")) 58 | time.Sleep(time.Duration(500*oscillationFactor()) * time.Millisecond) 59 | } 60 | }() 61 | } 62 | -------------------------------------------------------------------------------- /example_python/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import random 4 | import threading 5 | import time 6 | from BaseHTTPServer import BaseHTTPRequestHandler 7 | from BaseHTTPServer import HTTPServer 8 | from SocketServer import ThreadingMixIn 9 | 10 | start = time.time() 11 | 12 | def generate_request_handler(average_latency_seconds, error_ratio, outage_duration_seconds): 13 | def f(self): 14 | in_outage = (time.time() - start) % (10 * outage_duration_seconds) < outage_duration_seconds 15 | sleep_time = max(0, random.normalvariate(average_latency_seconds, average_latency_seconds/10)) 16 | time.sleep(sleep_time * (3 if in_outage else 1)) 17 | if random.random() < error_ratio * (10 if in_outage else 1): 18 | self.send_response(500) 19 | else: 20 | self.send_response(200) 21 | self.end_headers() 22 | return f 23 | 24 | def handler_404(self): 25 | self.send_response(404) 26 | 27 | 28 | ROUTES = { 29 | ('GET', "/"): lambda self: self.wfile.write("Hello World!"), 30 | ('GET', "/favicon.ico"): lambda self: self.send_response(404), 31 | ('GET', "/api/foo"): generate_request_handler(.01, .005, 23.0), 32 | ('POST', "/api/foo"): generate_request_handler(.02, .02, 60.0), 33 | ('GET', "/api/bar"): generate_request_handler(.015, .00025, 13.0), 34 | ('POST', "/api/bar"): generate_request_handler(.05, .01, 47.0), 35 | } 36 | 37 | class Handler(BaseHTTPRequestHandler): 38 | def do_GET(self): 39 | ROUTES.get(('GET', self.path), handler_404)(self) 40 | 41 | def do_POST(self): 42 | ROUTES.get(('POST', self.path), handler_404)(self) 43 | 44 | class MultiThreadedHTTPServer(ThreadingMixIn, HTTPServer): 45 | pass 46 | 47 | class Server(threading.Thread): 48 | def run(self): 49 | httpd = MultiThreadedHTTPServer(('', 8081), Handler) 50 | httpd.serve_forever() 51 | -------------------------------------------------------------------------------- /example_golang/server.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "math/rand" 5 | "net/http" 6 | _ "net/http/pprof" 7 | "time" 8 | ) 9 | 10 | type responseOpts struct { 11 | baseLatency time.Duration 12 | errorRatio float64 13 | 14 | // Whenever 10*outageDuration has passed, an outage will be simulated 15 | // that lasts for outageDuration. During the outage, errorRatio is 16 | // increased by a factor of 10, and baseLatency by a factor of 3. At 17 | // start-up time, an outage is simulated, too (so that you can see the 18 | // effects right ahead and don't have to wait for 10*outageDuration). 
19 | outageDuration time.Duration 20 | } 21 | 22 | var opts = map[string]map[string]responseOpts{ 23 | "/api/foo": map[string]responseOpts{ 24 | "GET": responseOpts{ 25 | baseLatency: 10 * time.Millisecond, 26 | errorRatio: 0.005, 27 | outageDuration: 23 * time.Second, 28 | }, 29 | "POST": responseOpts{ 30 | baseLatency: 20 * time.Millisecond, 31 | errorRatio: 0.02, 32 | outageDuration: time.Minute, 33 | }, 34 | }, 35 | "/api/bar": map[string]responseOpts{ 36 | "GET": responseOpts{ 37 | baseLatency: 15 * time.Millisecond, 38 | errorRatio: 0.0025, 39 | outageDuration: 13 * time.Second, 40 | }, 41 | "POST": responseOpts{ 42 | baseLatency: 50 * time.Millisecond, 43 | errorRatio: 0.01, 44 | outageDuration: 47 * time.Second, 45 | }, 46 | }, 47 | } 48 | 49 | func handleAPI(w http.ResponseWriter, r *http.Request) { 50 | pathOpts, ok := opts[r.URL.Path] 51 | if !ok { 52 | http.Error(w, "Not Found", http.StatusNotFound) 53 | return 54 | } 55 | methodOpts, ok := pathOpts[r.Method] 56 | if !ok { 57 | http.Error(w, "Method not Allowed", http.StatusMethodNotAllowed) 58 | return 59 | } 60 | 61 | latencyFactor := time.Duration(1) 62 | errorFactor := 1. 63 | if time.Since(start)%(10*methodOpts.outageDuration) < methodOpts.outageDuration { 64 | latencyFactor *= 3 65 | errorFactor *= 10 66 | } 67 | time.Sleep( 68 | (methodOpts.baseLatency + time.Duration(rand.NormFloat64()*float64(methodOpts.baseLatency)/10)) * latencyFactor, 69 | ) 70 | if rand.Float64() <= methodOpts.errorRatio*errorFactor { 71 | http.Error(w, "Internal Server Error", http.StatusInternalServerError) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /workshop.md: -------------------------------------------------------------------------------- 1 | # Getting started 2 | 3 | ## Getting Prometheus 4 | Download the latest binary release of Prometheus for your platform from: 5 | 6 | https://github.com/prometheus/prometheus/releases 7 | 8 | Extract the contents into a new directory and change to that directory. 9 | 10 | Example for Linux: 11 | 12 | If you're using Prometheus 0.16.0, the tarball already extracts into a separate 13 | sub-directory: 14 | 15 | ``` 16 | wget https://github.com/prometheus/prometheus/releases/download/v1.0.1/prometheus-1.0.1.linux-amd64.tar.gz 17 | tar xfvz prometheus-1.0.1.linux-amd64.tar.gz 18 | cd prometheus-1.0.1.linux-amd64 19 | ``` 20 | 21 | ## Configuring Prometheus to monitor itself 22 | 23 | Take a look at the included example `prometheus.yml` configuration file. It 24 | configures global options, as well as a single job to scrape metrics from: the 25 | Prometheus server itself. 26 | 27 | Prometheus collects metrics from monitored targets by scraping metrics HTTP 28 | endpoints on these targets. Since Prometheus also exposes data in the same 29 | manner about itself, it may also be used to scrape and monitor its own health. 30 | While a Prometheus server which collects only data about itself is not very 31 | useful in practice, it is a good starting example. 32 | 33 | ## Starting Prometheus 34 | Start Prometheus. By default, Prometheus reads its config from a file 35 | called `prometheus.yml` in the current working directory, and it 36 | stores its database in a sub-directory called `data`, again relative 37 | to the current working directory. Both behaviors can be changed using 38 | the flags `-config.file` or `-storage.local.path`, respectively. 
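For orientation, the self-scraping setup described in the previous section boils down to a configuration roughly like the following (a sketch; the exact `prometheus.yml` shipped with your release may differ in details such as the intervals):

```
global:
  scrape_interval:     15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
```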
39 | 40 | ``` 41 | ./prometheus -config.file=prometheus.yml -storage.local.path=data 42 | ``` 43 | 44 | Prometheus should start up and it should show the targets it scrapes at 45 | [http://localhost:9090/targets](http://localhost:9090/targets). You 46 | will find [http://localhost:9090/metrics](http://localhost:9090/metrics) in the 47 | list of scraped targets. Give Prometheus a couple of seconds to start 48 | collecting data about itself from its own HTTP metrics endpoint. 49 | 50 | You can also verify that Prometheus is serving metrics about itself by 51 | navigating to its metrics exposure endpoint: 52 | [http://localhost:9090/metrics](http://localhost:9090/metrics). 53 | 54 | ## Using the expression browser 55 | The query interface at 56 | [http://localhost:9090/](http://localhost:9090/) allows you to 57 | explore metric data collected by the Prometheus server. At the moment, the 58 | server is only scraping itself. The collected metrics are already quite 59 | interesting, though. The *Console* tab shows the most recent value of metrics, 60 | while the *Graph* tab plots values over time. The latter can be quite expensive 61 | (for both the server and the browser). It is in general a good idea to try 62 | potentially expensive expressions in the *Console* tab first. Take a bit of 63 | time to play with the expression browser. Suggestions: 64 | 65 | * Evaluate `prometheus_local_storage_ingested_samples_total`, which shows you 66 | the total number of ingested samples over the lifetime of the server. In the 67 | *Graph* tab, it will show as steadily increasing. 68 | * The expression `prometheus_local_storage_ingested_samples_total[1m]` 69 | evaluates to all sample values of the metric in the last minute. It cannot be 70 | plotted as a graph, but in the *Console* tab, you see a list of the values with 71 | (Unix) timestamp. 72 | * `rate(prometheus_local_storage_ingested_samples_total[1m])` calculates the 73 | rate (increase per second) over the 1m timeframe. In other words, it tells you 74 | how many samples per second your server is ingesting. This expression can be 75 | plotted nicely, and it will become more interesting as you add more targets. 76 | 77 | ## Start the node exporter 78 | The node exporter is a server that exposes system statistics about the machine 79 | it is running on as Prometheus metrics. 80 | 81 | Download the latest node exporter binary release for your platform from: 82 | 83 | https://github.com/prometheus/node_exporter/releases 84 | 85 | Beware that the majority of the node exporter's functionality is 86 | Linux-specific, so its exposed metrics will be significantly reduced when 87 | running it on other platforms. 88 | 89 | Linux example: 90 | 91 | ``` 92 | wget https://github.com/prometheus/node_exporter/releases/download/0.12.0/node_exporter-0.12.0.linux-amd64.tar.gz 93 | tar xvfz node_exporter-0.12.0.linux-amd64.tar.gz 94 | cd node_exporter-0.12.0.linux-amd64 95 | ``` 96 | 97 | Start the node exporter: 98 | 99 | ``` 100 | ./node_exporter 101 | ``` 102 | 103 | ## Configure Prometheus to monitor the node exporter 104 | 105 | If you are not running your local node exporter under Linux, you might want to 106 | point your Prometheus server to a Linux node exporter run by one of your peers 107 | in the workshop. Or point it to a node exporter we are running during the 108 | workshop at 109 | [http://demo.robustperception.io:9100/metrics](http://demo.robustperception.io:9100/metrics). 
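Whichever node exporter you point at, you can quickly verify that it is reachable and serving metrics by fetching its metrics endpoint directly, for example:

```
curl -s http://demo.robustperception.io:9100/metrics | head
```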
110 | 111 | Add the following job configuration to the `scrape_configs:` section 112 | in `prometheus.yml` to monitor both your own and the demo node 113 | exporter: 114 | 115 | ``` 116 | - job_name: 'node' 117 | scrape_interval: '15s' 118 | static_configs: 119 | - targets: 120 | - 'localhost:9100' 121 | - 'demo.robustperception.io:9100' 122 | ``` 123 | 124 | Send your Prometheus server a `SIGHUP` to initiate a reload of the configuration: 125 | 126 | ``` 127 | killall -HUP prometheus 128 | ``` 129 | 130 | Then check the *Status* page of your Prometheus server to make sure the node 131 | exporter is scraped correctly. Shortly after, a whole lot of interesting 132 | metrics will show up in the expression browser, each of them starting with 133 | `node_`. (Reload the page to see them in the autocompletion.) As an example, 134 | have a look at `node_cpu`. 135 | 136 | The node exporter has a whole lot of modules to export machine 137 | metrics. Have a look at the 138 | [README.md](https://github.com/prometheus/node_exporter) to get an 139 | idea. While Prometheus is particularly good at collecting service 140 | metrics, correlating those with system metrics from individual 141 | machines can be immensely helpful. (Perhaps that one task that showed 142 | high latency yesterday was scheduled on a node with a lot of competing 143 | disk operations?) 144 | 145 | ## Use the node exporter to export the contents of a text file 146 | The *textfile* module of the node exporter can be used to expose static 147 | machine-level metrics (such as what role a machine has) or the outcome of 148 | machine-tied batch jobs (such as a Chef client run). To use it, create a 149 | directory for the text files to export and (re-)start the node exporter with 150 | the `-collector.textfile.directory` flag set. Finally, create a text file in 151 | that directory. 152 | 153 | ``` 154 | mkdir textfile-exports 155 | ./node_exporter --collector.textfile.directory=textfile-exports 156 | echo 'role{role="workshop_node_exporter"} 1' > textfile-exports/role.prom.$$ 157 | mv textfile-exports/role.prom.$$ textfile-exports/role.prom 158 | ``` 159 | 160 | For details, see the 161 | [documentation](https://github.com/prometheus/node_exporter#textfile-collector). 162 | 163 | ## Configuring targets with service discovery 164 | 165 | Above you have seen how to configure multiple targets. You can also 166 | have multiple `- targets: [...]` sub-sections in the `static_configs` 167 | section, each with a different set of labels. 168 | 169 | Prometheus adds an `instance` label with the hostname and port as the value to 170 | each metric scraped from any target. With that label, you can later aggregate 171 | or separate metrics from different targets. 172 | 173 | In practice, configuring many targets statically is often a 174 | maintenance burden. The solution is service discovery. Currently, 175 | Prometheus supports service discovery via a number of methods. Here, 176 | we will look at service discovery via DNS SRV records. To try out a 177 | DNS SRV record, we have created one for `_demo-node._tcp.prometheus.io`: 178 | 179 | ``` 180 | dig +short SRV _demo-node._tcp.prometheus.io 181 | ``` 182 | 183 | Only one host and port is returned (the already known `_demo-node._tcp.prometheus.io` 184 | on port 9100), but any number of host/port combinations could be part of the 185 | SRV record. Prometheus regularly polls the DNS information and dynamically 186 | adjusts the targets. 
To configure a job with DNS service discovery, add the 187 | following to `prometheus.yml`: 188 | 189 | ``` 190 | - job_name: 'discovered_node' 191 | dns_sd_configs: 192 | - names: 193 | - '_demo-node._tcp.prometheus.io' 194 | ``` 195 | 196 | # The expression language 197 | 198 | With more metrics collected by your Prometheus server, it is time to 199 | familiarize yourself a bit more with the expression language. For comprehensive 200 | documentation, check out the 201 | [querying chapter](http://prometheus.io/docs/querying/basics/). The following 202 | is meant as an inspiration for how to play with the metrics currently collected 203 | by your server. Evaluate them in the *Console* and *Graph* tab. For the latter, 204 | try different time ranges and the *stacked* option. 205 | 206 | ## The `rate()` function 207 | Prometheus internally organizes sample data in chunks. It performs a number of 208 | different chunk operations on them and exposes them as 209 | `prometheus_local_storage_chunk_ops_total`, which is comprised of a number of 210 | counters, one per possible chunk operation. To see a rate of chunk operations 211 | per second, use the rate function over a time range that should cover at least 212 | a handful of scrape intervals. 213 | 214 | ``` 215 | rate(prometheus_local_storage_chunk_ops_total[1m]) 216 | ``` 217 | 218 | Now you can see the rate for each chunk operation type. Note that the rate 219 | function handles counter resets (for example if a binary is restarted). 220 | Whenever a counter goes down, the function assumes that a counter reset has 221 | happened and the counter has started counting from `0`. 222 | 223 | ## The `sum` aggregation operator 224 | If you want to get the total rate for all operations, you need to sum up the 225 | rates: 226 | 227 | ``` 228 | sum(rate(prometheus_local_storage_chunk_ops_total[1m])) 229 | ``` 230 | 231 | Note that you need to take the sum of the rate, and not the rate of the sum. 232 | (Exercise for the reader: Why?) 233 | 234 | ## Select by label 235 | If you want to look only at the persist operation, you can filter by label with 236 | curly braces: 237 | 238 | ``` 239 | rate(prometheus_local_storage_chunk_ops_total{type="persist"}[1m]) 240 | ``` 241 | 242 | You can use multiple label pairs within the curly braces (comma-separated), and 243 | the match can be inverted (with `!=`) or performed with a regular expression 244 | (with `=~`, or `!~` for the inverted match). 245 | 246 | (Exercise: How to estimate the average number of samples per chunk?) 247 | 248 | ## Aggregate by label 249 | The metric `http_request_duration_microseconds_count` counts the number of HTTP 250 | requests processed. (Disregard the `duration_microseconds` part for now. It 251 | will be explained later.) If you look at it in the *Console* tab, you can see 252 | the many time series with that name. The metric is partitioned by handler, 253 | instance, and job, resulting in many sample values at any given time. We call 254 | that an instant vector. 255 | 256 | If you are only interested in which job is serving how many QPS, you can let 257 | the sum operator aggregate by job (resulting in the two jobs we are monitoring, 258 | the Prometheus itself and the node exporter): 259 | 260 | ``` 261 | sum(rate(http_request_duration_microseconds_count[5m])) by (job) 262 | ``` 263 | 264 | A combination of label pairs is possible, too. 
You can aggregate by job and 265 | instance (which is interesting if you have added an additional node exporter to 266 | your config): 267 | 268 | ``` 269 | sum(rate(http_request_duration_microseconds_count[5m])) by (job, instance) 270 | ``` 271 | 272 | Note that there is an alternative syntax with the `by` clause following 273 | directly the aggregation operator. This syntax is particularly useful in 274 | complex nested expressions, where it otherwise becomes difficult to spot which 275 | `by` clause belongs to which operator. 276 | 277 | ``` 278 | sum by (job, instance) (rate(http_request_duration_microseconds_count[5m])) 279 | ``` 280 | 281 | ## Arithmetic 282 | There is a metric `http_request_duration_microseconds_sum`, which sums up the 283 | duration of all HTTP requests. If the labels match, you can easily divide two 284 | instant vectors, yielding the average request duration in this case: 285 | 286 | ``` 287 | rate(http_request_duration_microseconds_sum[5m]) / rate(http_request_duration_microseconds_count[5m]) 288 | ``` 289 | 290 | You can aggregate as above if you do it separately for numerator and 291 | denominator: 292 | 293 | ``` 294 | sum(rate(http_request_duration_microseconds_sum[5m])) by (job) / sum(rate(http_request_duration_microseconds_count[5m])) by (job) 295 | ``` 296 | 297 | Things become more interesting if the labels do not match perfectly 298 | between two instant vectors or you want to match vector elements in a 299 | many-to-one or one-to-many fashion. See the 300 | [vector-matching section](http://prometheus.io/docs/querying/operators/#vector-matching) 301 | in the documentation for details. 302 | 303 | ## Summaries 304 | Rather than an average request duration, you will be more often interested in 305 | quantiles like the median or the 90th percentile. To serve that need, 306 | Prometheus offers summaries. `http_request_duration_microseconds` is a summary 307 | of HTTP request durations, and `http_request_duration_microseconds_sum` and 308 | `http_request_duration_microseconds_count` are merely byproducts of that 309 | summary. If you look at `http_request_duration_microseconds` in the expression 310 | browser, you see a multitude of time series, as the metric is now partitioned 311 | by quantile, too. An expression like 312 | `http_request_duration_microseconds{quantile="0.9"}` displays the 90th 313 | percentile request duration. You might be tempted to aggregate the result as 314 | you have done above. Not possible, unfortunately. Welcome to the wonderland of 315 | statistics. 316 | 317 | Read more about 318 | [histograms and summaries](http://prometheus.io/docs/practices/histograms/) 319 | in the documentation. 320 | 321 | ## Recording rules 322 | In your practical work with Prometheus at scale, you will pretty soon run into 323 | expressions that are very expensive and slow to evaluate. The remedy is 324 | *recording* rules, a way to tell Prometheus to pre-calculate expressions, 325 | saving the result in a new time series, which can then be used instead of the 326 | expensive expression. See the documentation for details: 327 | * [General documentation about rules](http://prometheus.io/docs/querying/rules/). 328 | * [Best practices for naming rules](http://prometheus.io/docs/practices/). 329 | 330 | # Instrument code: Go 331 | 332 | *This section is about instrumenting a Go application. 
If you prefer 333 | Python, continue with the next section.* 334 | 335 | ## The example application 336 | 337 | The example application is in the same GitHub repository as these 338 | instructions. If you have not done so yet, clone the repository: 339 | 340 | ``` 341 | $ cd $GOPATH/src/ 342 | $ mkdir -p github.com/juliusv 343 | $ cd github.com/juliusv 344 | $ git clone https://github.com/juliusv/prometheus_workshop.git 345 | $ cd prometheus_workshop/example_golang 346 | $ go get -d 347 | $ go build 348 | $ ./example_golang 349 | ``` 350 | 351 | Study the code to understand what it is doing. Note that the 352 | application has been kept very simple for demonstration purposes and 353 | implements a server and a client in the same binary. 354 | 355 | ## Instrument it 356 | Instrument the server part with Prometheus. Things to keep in mind: 357 | 358 | * What would be useful to instrument? 359 | * What would be good variable names? 360 | * How can I instrument in one place rather than many? 361 | * How can/should I use labels? 362 | * How to expose the `/metrics` endpoint? 363 | 364 | The following links will be helpful: 365 | * [Documentation for the Prometheus Go client library](https://godoc.org/github.com/prometheus/client_golang/prometheus). 366 | * [Instrumentation guidelines](http://prometheus.io/docs/practices/instrumentation/). 367 | * [Naming conventions](http://prometheus.io/docs/practices/naming/). 368 | 369 | If you are lost, you can look at instrumented code in the branch called 370 | `instrumented` in the GitHub repository above. Note that the example 371 | instrumentation is not necessarily ideal and/or complete. 372 | 373 | # Instrument Code: Python 374 | 375 | *This section is about instrumenting a Python application. If you 376 | prefer Go, continue with the previous section.* 377 | 378 | ## The example application 379 | 380 | The example application is in the same GitHub repository as these 381 | instructions. If you have not done so yet, clone the repository: 382 | 383 | ``` 384 | $ git clone https://github.com/juliusv/prometheus_workshop.git 385 | $ cd prometheus_workshop/example_python 386 | ``` 387 | 388 | Install the Prometheus Python client library: 389 | 390 | ``` 391 | $ pip install prometheus_client 392 | ``` 393 | 394 | If you don't want to install python libraries globally, pass the `--user` flag to pip. 395 | 396 | Run the example application: 397 | 398 | ``` 399 | $ python main.py 400 | ``` 401 | 402 | ## Instrument it 403 | Instrument the client and server with Prometheus. Things to keep in mind: 404 | 405 | * What would be useful to instrument? 406 | * What would be good variable names? 407 | * How can I instrument in one place rather than many? 408 | * How can/should I use labels? 409 | * How to expose the /metrics endpoint? 410 | 411 | The following links will be helpful: 412 | * [Documentation for the Prometheus Python client library](https://github.com/prometheus/client_python#prometheus-python-client). 413 | * [Instrumentation guidelines](http://prometheus.io/docs/practices/instrumentation/). 414 | * [Naming conventions](http://prometheus.io/docs/practices/naming/). 415 | 416 | # Dashboard Building: Console Templates 417 | Console templates are a built-in dashboarding system in the Prometheus server. 418 | They are based on Go's templating language, which is more strongly typed than a 419 | typical web templating engine. 420 | 421 | You can see an example at 422 | [http://localhost:9090/consoles/node.html](http://localhost:9090/consoles/node.html). 
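Before starting on the task below, it helps to see the basic skeleton such a template follows. The sketch below mirrors the pattern used by the shipped example consoles; the expression and the `http_requests_total` metric name are placeholders for whatever you exposed in your instrumented application:

```
{{template "head" .}}
{{template "prom_content_head" .}}
<h1>My Service</h1>

<h3>QPS</h3>
<div id="qpsGraph"></div>
<script>
new PromConsole.Graph({
  node: document.querySelector("#qpsGraph"),
  expr: "sum(rate(http_requests_total[5m]))"
})
</script>

{{template "prom_content_tail" .}}
{{template "tail"}}
```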
423 | 424 | Task: Create a dashboard of QPS, latency, and "up" servers for the Go/Python 425 | code you instrumented above. 426 | 427 | The `consoles` directory that was part of the Prometheus tar-ball 428 | unpacked above contains a number of examples you can take as a base to 429 | work off. Look at `cassandra.html` for a start. (You can also access 430 | the 431 | [consoles directory on GitHub](https://github.com/prometheus/prometheus/blob/master/consoles/cassandra.html).) 432 | 433 | # Dashboard Building: PromDash 434 | 435 | TODO: PromDash is deprecated. Replace this section with Grafana. See 436 | https://prometheus.io/docs/visualization/grafana/ 437 | 438 | PromDash is a browser-based dashboard builder for Prometheus. It is a Rails 439 | application and stores its dashboard metadata in a configurable SQL backend. 440 | The actual graph data is retrieved by the browser via AJAX requests from the 441 | configured Prometheus servers. 442 | 443 | Follow the installation procedure at https://github.com/prometheus/promdash/blob/master/README.md. 444 | 445 | Let's create a dashboard to monitor the health of the Prometheus instance 446 | itself: 447 | 448 | 1. Head over to http://localhost:3000 and click "New Dashboard". 449 | 2. Create a dashboard called "<username>-workshop" (you don't need to select a 450 | directory). PromDash will redirect you to your new, empty dashboard. 451 | 3. Set the "Range" input field just under the dashboard title to "30m" to show 452 | the last 30 minutes of data in the dashboard (feel free to play with the graph 453 | time range later). 454 | 455 | Let's create a graph that shows the ingested samples per second: 456 | 457 | 1. Click on the "Datasources" menu item in the header line of the empty graph. 458 | 2. Click "Add Expression" and set the expression to 459 | `rate(prometheus_local_storage_ingested_samples_total[1m])` 460 | The graph should show the per-second rate of ingested samples. 461 | 3. Let's give the graph a title. Open the "Graph and axis settings" graph menu 462 | and set the title to "Ingested samples [rate-1m]". 463 | 4. Open the "Legend Settings" graph menu and set "Show legend" to "never", 464 | since this graph only contains a single time series. 465 | 5. Press "Save Changes" to save your progress. 466 | 467 | Let's add another graph showing the rates of the various chunk operations: 468 | 469 | 1. Click the "Add Graph" button to add a second graph. 470 | 2. Add the following expression to the graph: 471 | 472 | `rate(prometheus_local_storage_chunk_ops_total[1m])` 473 | 474 | The graph should now show the per-second rate of chunk operations of various kinds. 475 | 3. Set the graph title to "Chunk ops [rate-1m]". 476 | 4. The legend currently shows all labels of the returned time series, although 477 | only the "chunk" label differs. To show only that label in the legend, click 478 | the "Legend Settings" tab and set the existing "Legend format" input to 479 | `{{type}}`. 480 | 5. Because a graph may have multiple expressions with different applicable 481 | legend format strings each, we still need to assign each legend format string 482 | to a particular expression. Open the "Datasources" graph menu again and in the 483 | "- Select format string -" dropdown, select the format string that you just 484 | created. 485 | 6. Press "Save Changes" to save your progress. 486 | 487 | Finally, let's add a gauge that shows the number of expression queries 488 | performed against your Prometheus server per second: 489 | 490 | 1. 
Click the "Add Gauge" button to add a gauge. 491 | 2. Set the gauge expression to: 492 | 493 | `scalar(sum(rate(http_request_duration_microseconds_count{handler=~"/api/query"}[1m])))` 494 | 495 | The gauge should now show the number of expression queries per second. Note 496 | that a gauge only supports queries which result in scalar values without any 497 | labels. Thus, the entire expression is wrapped in a scalar() call. 498 | 3. Under the "Chart settings" menu tab of your gauge, set the title to 499 | "Expression queries [rate-1m]". 500 | 4. Let's adjust the gauge's maximum value. In the "Chart settings" menu tab of 501 | the gauge, set the "Gauge Max" to a lower value that seems reasonable for the 502 | rate of queries your server is getting. For example, try setting it to 5. 503 | 5. Press "Save Changes" to save your progress. 504 | 505 | Your dashboard should now look somewhat like this: 506 | 507 | [![PromDash screenshot](/images/promdash.png)](#promdash) 508 | 509 | PromDash supports many more features which we will not be able to explore in 510 | this workshop. For example: 511 | 512 | * graphing multiple expressions from different servers in the same graph 513 | * mapping expressions to different axes and setting various axis options 514 | * building templatized dashboards using template variables 515 | * adding pie charts, Graphite graphs, or arbitrary web content 516 | 517 | For a more comprehensive overview, see the [PromDash 518 | documentation](http://prometheus.io/docs/visualization/promdash/). 519 | 520 | # Alerting 521 | 522 | With instrumentation and a meaningful dashboard in place, the time is 523 | ripe to think about alerting. Alerting rules are set up similarly to 524 | recording rules. See the 525 | [section about alerting rules](https://prometheus.io/docs/alerting/rules/) 526 | in the documentation. You can inspect the status of configured alerts 527 | in the Alerts section of the Prometheus server's status page 528 | [http://localhost:9090/alerts](http://localhost:9090/alerts). However, 529 | for proper notifications, you need to set up an 530 | [Alertmanager](https://github.com/prometheus/alertmanager). 531 | 532 | To play with the Alertmanager, you can download a release from 533 | https://github.com/prometheus/alertmanager/releases. 534 | 535 | TODO: Alertmanager setup instructions. 536 | 537 | In the workshop, we will run the Alertmanager without any configured 538 | notifications, just to see how alerts arrive there. In practice, you want to 539 | configure one of the many notification methods described 540 | [in the docmentation](http://prometheus.io/docs/alerting/alertmanager/). Pay 541 | special attention to the aggregation rules, which allow you to route alerts to 542 | different destinations. 543 | 544 | To point your Prometheus server to an Alertmanager, use the `-alertmanager.url` 545 | flag. 546 | 547 | Alerting rules use the same expression language as used for graphing 548 | before. Here is an example for a very fundamental alerting rule: 549 | 550 | ``` 551 | # Alert for any monitored instance that is unreachable for >2 minutes. 
552 | ALERT InstanceDown 553 | IF up == 0 554 | FOR 2m 555 | LABELS { 556 | severity="page" 557 | } 558 | ANNOTATIONS { 559 | summary = "Instance {{$labels.instance}} down", 560 | description = "{{$labels.instance}} of job {{$labels.job}} has been down for more than 2 minutes." 561 | 562 | } 563 | ``` 564 | 565 | Add the rule to a configured rule file, reload the config, and observe the 566 | _Alerts_ tab on the Prometheus server status page and the _Alerts_ tab on the 567 | Alertmanager status page while you start and stop jobs monitored by your 568 | server. 569 | 570 | For meaningful alerting, refer to the 571 | [best practices section about alerting](http://prometheus.io/docs/practices/alerting/). 572 | 573 | Create a useful alerting rule for your example application and try it out. Possible tasks: 574 | * Create an alert on your service being down. Stop the service and 575 | check if and when the alert fires. 576 | * The example service simulates short outages now and then. Create an 577 | alert that will detect them. 578 | * Modify the code to simulate other kinds of outages and create alerts to 579 | detect them. 580 | * Run the example service multiple times (on different ports). Create 581 | alerts that fire if a certain percentage of replicas are down. 582 | * Create an alert that fires if the disk is predicted to run full 583 | within the next six 584 | hours. ([Hint](http://www.robustperception.io/reduce-noise-from-disk-space-alerts/) 585 | for cheaters.) 586 | 587 | # Pushing Metrics 588 | 589 | Occasionally, you might need to push metrics that are not 590 | machine-related. (The latter would be exposed via the `textfile` 591 | module of the node exporter, see above.) The 592 | [Pushgateway](http://prometheus.io/docs/instrumenting/pushing/) is a 593 | possible solution in that case. Note that it is not meant to change 594 | Prometheus's semantics to a push-based model. 595 | 596 | To play with the Pushgateway, you can download a release from 597 | https://github.com/prometheus/pushgateway/releases or build one from source 598 | yourself. 599 | 600 | Configure your Prometheus server to scrape the Pushgateway. The scrape config 601 | for a Pushgateway should have `honor_labels` set to `true`. (Later, you can try 602 | out what happens if you leave it at its default value `false`.) 603 | 604 | ``` 605 | - job_name: 'pushgateway' 606 | scrape_interval: '15s' 607 | honor_labels: true 608 | static_configs: 609 | - targets: 610 | - 'localhost:9091' 611 | ``` 612 | 613 | Prometheus client libraries allow you to push to the Pushgateway, but you can 614 | also push in a very simple way using `curl`. Imagine a script that runs a 615 | database backup via some kind of (possibly distributed) cron solution. Upon 616 | successful completion, it should report the completion timestamp. 617 | 618 | ``` 619 | #!/bin/bash 620 | 621 | set -e 622 | 623 | # Some command that creates the backup. 624 | 625 | echo "db_backup_last_success_timestamp_seconds $(date +%s)" | curl --data-binary @- http://demo-node.prometheus.io:9091/metrics/job/foo_db 626 | ``` 627 | 628 | Check the status page of the Pushgateway and its `/metrics` endpoint for the 629 | pushed metrics, and then observe how it is ingested by Prometheus. Pay special 630 | attention to the difference between the scrape timestamp and the timestamp that 631 | is the value of the metric. How would you graph the age of the backup? How 632 | would you alert on a backup too old? 
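One possible answer, as a sketch: since the pushed value is a Unix timestamp, subtracting it from `time()` yields the age of the last successful backup in seconds, which you can graph directly:

```
time() - db_backup_last_success_timestamp_seconds
```

An alerting rule on top of that expression could then page when the backup is older than some threshold; the 25-hour limit below is just an example value:

```
ALERT DBBackupTooOld
  IF time() - db_backup_last_success_timestamp_seconds > 25 * 3600
  LABELS {
    severity="page"
  }
  ANNOTATIONS {
    summary = "Backup for job {{$labels.job}} is too old",
    description = "The last successful backup for job {{$labels.job}} finished more than 25 hours ago."
  }
```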
633 | --------------------------------------------------------------------------------