├── 3
├── collectd
│ └── riemann.conf
└── riemann
│ ├── examplecom
│ └── etc
│ │ └── email.clj
│ ├── riemann.config
│ └── riemann.config_riemannmc
├── 4
├── collectd
│ ├── carbon.conf
│ └── grafana.conf
├── graphite
│ ├── carbon-cache-ubuntu.init
│ ├── carbon-cache@.service
│ ├── carbon-relay-ubuntu.init
│ ├── carbon-relay@.service
│ ├── carbon.conf
│ ├── graphite-api.service
│ ├── graphite-api.yaml
│ ├── graphite-carbon.default
│ ├── local_settings.py
│ └── whisper-calculator.py
└── riemann
│ ├── examplecom
│ └── etc
│ │ ├── email.clj
│ │ └── graphite.clj
│ ├── riemann.config
│ └── riemann.config_riemannmc
├── 7
├── collectd
│ ├── collectd.conf
│ └── collectd.d
│ │ ├── cpu.conf
│ │ ├── df.conf
│ │ ├── docker.conf
│ │ ├── memory.conf
│ │ ├── processes.conf
│ │ ├── swap.conf
│ │ └── write_riemann.conf
└── riemann
│ ├── examplecom
│ └── etc
│ │ ├── checks.clj
│ │ ├── collectd.clj
│ │ ├── email.clj
│ │ └── graphite.clj
│ └── riemann.config
├── 8
├── collectd
│ ├── elasticsearch.conf
│ ├── elasticsearch_collectd.py
│ ├── logstash.conf
│ ├── logstash_jmx.conf
│ └── rsyslogd.conf
├── logstash
│ └── logstash.conf
└── riemann
│ ├── examplecom
│ └── etc
│ │ ├── checks.clj
│ │ ├── collectd.clj
│ │ ├── email.clj
│ │ ├── graphite.clj
│ │ └── logstash.clj
│ └── riemann.config
├── 9
├── collectd
│ └── statsd.conf
└── riemann
│ ├── examplecom
│ └── etc
│ │ ├── checks.clj
│ │ ├── collectd.clj
│ │ ├── email.clj
│ │ ├── graphite.clj
│ │ └── logstash.clj
│ └── riemann.config
├── 10
├── grafana
│ └── riemann.js
└── riemann
│ ├── examplecom
│ └── etc
│ │ ├── checks.clj
│ │ ├── collectd.clj
│ │ ├── count-notifications.clj
│ │ ├── email.clj
│ │ ├── graphite.clj
│ │ ├── logstash.clj
│ │ ├── maintenance.clj
│ │ ├── pagerduty.clj
│ │ └── slack.clj
│ └── riemann.config
├── .gitignore
├── 11-13
├── collectd
│ ├── mysql.conf
│ └── tornado-api.conf
├── grafana
│ └── tornado-dashboard.json
├── logstash
│ ├── logstash.conf
│ └── patterns
│ │ ├── nginx
│ │ └── tornadoapi
├── riemann
│ ├── examplecom
│ │ ├── app
│ │ │ └── tornado.clj
│ │ └── etc
│ │ │ ├── checks.clj
│ │ │ ├── collectd.clj
│ │ │ ├── count-notifications.clj
│ │ │ ├── email.clj
│ │ │ ├── graphite.clj
│ │ │ ├── logstash.clj
│ │ │ ├── maintenance.clj
│ │ │ ├── pagerduty.clj
│ │ │ └── slack.clj
│ └── riemann.config
└── rsyslog
│ └── 35-aom-clojure-rest.conf
├── 5-6
├── collectd
│ ├── collectd.conf
│ └── collectd.d
│ │ ├── carbon.conf
│ │ ├── cpu.conf
│ │ ├── df.conf
│ │ ├── memory.conf
│ │ ├── processes.conf
│ │ ├── swap.conf
│ │ └── write_riemann.conf
└── riemann
│ ├── examplecom
│ └── etc
│ │ ├── checks.clj
│ │ ├── collectd.clj
│ │ ├── email.clj
│ │ └── graphite.clj
│ └── riemann.config
├── LICENSE
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | *.gem
2 | *.rbc
3 | /.config
4 | /coverage/
5 | /InstalledFiles
6 | /pkg/
7 | /spec/reports/
8 | /test/tmp/
9 | /test/version_tmp/
10 | /tmp/
11 |
12 | ## Specific to RubyMotion:
13 | .dat*
14 | .repl_history
15 | build/
16 |
17 | ## Documentation cache and generated files:
18 | /.yardoc/
19 | /_yardoc/
20 | /doc/
21 | /rdoc/
22 |
23 | ## Environment normalisation:
24 | /.bundle/
25 | /lib/bundler/man/
26 |
27 | # for a library or gem, you might want to ignore these files since the code is
28 | # intended to run in multiple environments; otherwise, check them in:
29 | # Gemfile.lock
30 | # .ruby-version
31 | # .ruby-gemset
32 |
33 | # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
34 | .rvmrc
35 |
--------------------------------------------------------------------------------
/10/grafana/riemann.js:
--------------------------------------------------------------------------------
1 | /* Original dashboard code modified from: https://github.com/bimlendu/GrafanaScriptedDashboards */
2 | /* Thanks to Bimlendu Mishra for developing the original! */
3 |
4 | /*global XMLHttpRequest: false */
5 |
6 | var window, document, ARGS, $, jQuery, moment, kbn;
7 | var graphite = 'http://graphitea.example.com:8888';
8 |
9 | // Specify defaults for URL arguments
10 | var arg_host = 'graphitea';
11 | var arg_span = 4;
12 | var arg_from = '6h';
13 | var arg_env = 'productiona';
14 | var arg_stack = 'hosts';
15 |
16 | if (!_.isUndefined(ARGS.span)) {
17 | arg_span = ARGS.span; // graph width
18 | }
19 | if (!_.isUndefined(ARGS.from)) {
20 | arg_from = ARGS.from; // show data from 'x' hours until now
21 | }
22 | if (!_.isUndefined(ARGS.host)) {
23 | arg_host = ARGS.host; // host name
24 | }
25 | if (!_.isUndefined(ARGS.env)) {
26 | arg_env = ARGS.env; // environment
27 | }
28 | if (!_.isUndefined(ARGS.stack)) {
29 | arg_stack = ARGS.stack; // stack (hosts or docker)
30 | }
31 |
32 | // Execute graphite-api /metrics/find query. Returns array of metric last names ( func('test.cpu-*') returns ['cpu-0','cpu-1',..] )
33 | function find_filter_values(query) {
34 | var search_url = graphite + '/metrics/find/?query=' + query;
35 | var res = [];
36 | var req = new XMLHttpRequest();
37 | req.open('GET', search_url, false);
38 | req.send(null);
39 | var obj = JSON.parse(req.responseText);
40 | var key;
41 | for (key in obj) {
42 | if (obj.hasOwnProperty(key)) {
43 | if (obj[key].hasOwnProperty("text")) {
44 | res.push(obj[key].text);
45 | }
46 | }
47 | }
48 | return res;
49 | }
50 |
51 | // Return dashboard filter_list. Optionally include 'All'
52 | function get_filter_object(name, query, show_all) {
53 | show_all = (show_all === undefined) ? true : show_all;
54 | var arr = find_filter_values(query);
55 | var opts = [];
56 | var i;
57 | for (i in arr) {
58 | if (arr.hasOwnProperty(i)) {
59 | opts.push({"text": arr[i], "value": arr[i]});
60 | }
61 | }
62 | if (show_all === true) {
63 | opts.unshift({"text": "All", "value": '{' + arr.join() + '}'});
64 | }
65 | return {
66 | type: "filter",
67 | name: name,
68 | query: query,
69 | options: opts,
70 | current: opts[0],
71 | includeAll: show_all
72 | };
73 | }
74 |
75 | /*
76 | Panel templates
77 | */
78 |
79 | function panel_cpu(title, prefix) {
80 | return {
81 | title: title,
82 | type: 'graphite',
83 | span: arg_span,
84 | renderer: "flot",
85 | y_formats: ["none"],
86 | grid: {max: null, min: 0},
87 | lines: true,
88 | fill: 2,
89 | linewidth: 1,
90 | tooltip: {
91 | value_type: 'individual',
92 | shared: true
93 | },
94 | stack: true,
95 | legend: {show: true},
96 | percentage: true,
97 | nullPointMode: "null",
98 | targets: [
99 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.wait,4)" },
100 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.user,4)" },
101 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.system,4)" },
102 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.steal,4)" },
103 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.interrupt,4)" },
104 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.nice,4)" },
105 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.idle,4)" },
106 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.softirq,4)" }
107 | ],
108 | aliasColors: {
109 | "user": "#508642",
110 | "system": "#EAB839",
111 | "wait": "#890F02",
112 | "steal": "#E24D42",
113 | "idle": "#6ED0E0",
114 | "nice": "#629E51",
115 | "irq": "#1F78C1",
116 | "intrpt": "#EF843C"
117 | }
118 | };
119 | }
120 |
121 | function panel_memory(title, prefix) {
122 | return {
123 | title: title,
124 | type: 'graphite',
125 | span: arg_span,
126 | y_formats: ["none"],
127 | grid: {max: null, min: 0},
128 | lines: true,
129 | fill: 2,
130 | linewidth: 1,
131 | stack: true,
132 | tooltip: {
133 | value_type: 'individual',
134 | shared: true
135 | },
136 | nullPointMode: "null",
137 | targets: [
138 | { "target": "aliasByNode(" + prefix + "[[host]].memory.used,4)" }
139 | ],
140 | aliasColors: {
141 | "used": "#ff6666",
142 | }
143 | };
144 | }
145 |
146 | function panel_loadavg(title, prefix) {
147 | return {
148 | title: title,
149 | type: 'graphite',
150 | span: arg_span,
151 | y_formats: ["none"],
152 | grid: {max: null, min: 0},
153 | lines: true,
154 | fill: 2,
155 | linewidth: 1,
156 | tooltip: {
157 | value_type: 'individual',
158 | shared: true
159 | },
160 | stack : true,
161 | nullPointMode: "null",
162 | targets: [
163 | { "target": "aliasByNode(" + prefix + "[[host]].load.*,4)" }
164 | ],
165 | aliasColors: {
166 | "midterm": "#629E51",
167 | "shortterm": "#1F78C1",
168 | "longterm": "#EF843C"
169 | }
170 | };
171 | }
172 |
173 | function panel_swap_size(title, prefix) {
174 | return {
175 | title: title,
176 | type: 'graphite',
177 | span: arg_span,
178 | y_formats: ["none"],
179 | grid: {max: null, min: 0, leftMin: 0},
180 | lines: true,
181 | fill: 2,
182 | linewidth: 1,
183 | tooltip: {
184 | value_type: 'individual',
185 | shared: true
186 | },
187 | stack: true,
188 | nullPointMode: "null",
189 | percentage: true,
190 | targets: [
191 | { "target": "aliasByNode(" + prefix + "[[host]].swap.{free,used,cached},4)" },
192 | ],
193 | aliasColors: {
194 | "used": "#ff6666",
195 | "cached": "#EAB839",
196 | "free": "#66b266"
197 | }
198 | };
199 | }
200 |
201 | function panel_disk_space(title, prefix) {
202 | return {
203 | title: title,
204 | type: 'graphite',
205 | span: arg_span,
206 | y_formats: ["none"],
207 | grid: {max: null, min: 0, leftMin: 0},
208 | lines: true,
209 | fill: 2,
210 | linewidth: 1,
211 | tooltip: {
212 | value_type: 'individual',
213 | shared: true
214 | },
215 | stack: true,
216 | nullPointMode: "null",
217 | targets: [
218 | { "target": "aliasByNode(" + prefix + "[[host]]." + "df.root.percent_bytes.used,6)" },
219 | ],
220 | aliasColors: {
221 | "used": "#e32636"
222 | }
223 | };
224 | }
225 |
226 | /*
227 | Row templates
228 | */
229 |
230 | function row_delimiter(title) {
231 | return {
232 | title: "_____ " + title,
233 | height: "20px",
234 | collapse: false,
235 | editable: false,
236 | collapsable: false,
237 | panels: [{
238 | title: title,
239 | editable: false,
240 | span: 12,
241 | type: "text",
242 | mode: "text"
243 | }]
244 | };
245 | }
246 |
247 | function row_cpu_memory(title, prefix) {
248 | return {
249 | title: title,
250 | height: '250px',
251 | collapse: false,
252 | panels: [
253 | panel_cpu('CPU %', prefix),
254 | panel_memory('Memory', prefix),
255 | panel_loadavg('Load avg', prefix)
256 | ]
257 | };
258 | }
259 |
260 | function row_swap_disk(title, prefix) {
261 | return {
262 | title: title,
263 | height: '250px',
264 | collapse: false,
265 | panels: [
266 | panel_swap_size('Swap size', prefix),
267 | panel_disk_space('Disk Space on root', prefix)
268 | ]
269 | };
270 | }
271 |
272 | /*jslint unparam: true, node: true */
273 | return function(callback) {
274 |
275 | // Setup some variables
276 | var dashboard;
277 |
278 | var prefix = arg_env + '.' + arg_stack + '.';
279 |
280 | var arg_filter = prefix + arg_host;
281 |
282 | // Set filter
283 |
284 | var dashboard_filter = {
285 | time: {
286 | from: "now-" + arg_from,
287 | to: "now"
288 | },
289 | list: [
290 | get_filter_object("host", arg_filter, false)
291 | ]
292 | };
293 |
294 | // Define pulldowns
295 |
296 | var pulldowns = [
297 | {
298 | type: "filtering",
299 | collapse: false,
300 | notice: false,
301 | enable: true
302 | },
303 | {
304 | type: "annotations",
305 | enable: false
306 | }
307 | ];
308 |
309 | // Initialize a skeleton with nothing but a rows array and service object
310 |
311 | dashboard = {
312 | rows : [],
313 | services : {}
314 | };
315 | dashboard.title = prefix + arg_host;
316 | dashboard.editable = false;
317 | dashboard.pulldowns = pulldowns;
318 | dashboard.services.filter = dashboard_filter;
319 |
320 | $.ajax({
321 | method: 'GET',
322 | url: '/'
323 | })
324 | .done(function (result) {
325 |
326 | // Construct dashboard rows
327 |
328 | dashboard.rows.push(
329 | row_cpu_memory('CPU, Memory, Load', prefix),
330 | row_swap_disk('Swap, Disk Space', prefix)
331 | );
332 |
333 | callback(dashboard);
334 | });
335 | }
336 |
--------------------------------------------------------------------------------
/10/riemann/examplecom/etc/checks.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.checks
2 | (:require [riemann.config :refer :all]
3 | [clojure.tools.logging :refer :all]
4 | [riemann.streams :refer :all]))
5 |
6 | (defn set_state [warning critical]
7 | (fn [event]
8 | (assoc event :state
9 | (condp < (:metric event)
10 | critical "critical"
11 | warning "warning"
12 | "ok"))))
13 |
14 | (defn check_threshold [srv window func warning critical & children]
15 | (where (service srv)
16 | (fixed-time-window window
17 | (smap func
18 | (where (< warning metric)
19 | (smap (set_state warning critical)
20 | (fn [event]
21 | (call-rescue event children))))))))
22 |
23 | (defn check_percentiles [srv window & children]
24 | (where (service srv)
25 | (percentiles window [0.5 0.95 0.99 1]
26 | (fn [event]
27 | (call-rescue event children)))))
28 |
--------------------------------------------------------------------------------
/10/riemann/examplecom/etc/collectd.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.collectd
2 | (:require [clojure.tools.logging :refer :all]
3 | [riemann.streams :refer :all]
4 | [clojure.string :as str]
5 | [clojure.walk :as walk]))
6 |
7 | (defn docker-attribute-map
8 | [attributes]
9 | (let [instance (str/split (str/replace attributes #"^.*\[(.*)\]$" "$1") #",")]
10 | (walk/keywordize-keys (into {} (for [pair instance] (apply hash-map (str/split pair #"=")))))))
11 |
12 | (defn docker-attributes
13 | [{:keys [plugin_instance] :as event}]
14 | (if-let [attributes (re-find #"^.*\[.*\]$" plugin_instance)]
15 | (merge event (docker-attribute-map attributes))
16 | event))
17 |
18 | (defn parse-docker-service-host
19 | [{:keys [type type_instance plugin_instance] :as event}]
20 | (let [host (re-find #"^\w+\.?\w+\.?\w+" (:plugin_instance event))
21 | service (cond-> (str (:type event)) (:type_instance event) (str "." (:type_instance event)))]
22 | (assoc event :service service :host host)))
23 |
24 | (defn plugin-map
25 |   "Parses labels from collectd plugin_instance"
26 | [plugin_instance]
27 | (let [instance (str/split (str/replace plugin_instance #"^.*\[(.*)\]$" "$1") #",")]
28 | (walk/keywordize-keys (into {} (for [pair instance] (apply hash-map (str/split pair #"=")))))))
29 |
30 | (defn parse-docker
31 | [& children]
32 | "Parses Docker events"
33 | (fn [event]
34 | (let [host (re-find #"^\w+\.?\w+\.?\w+" (:plugin_instance event))
35 | service (cond-> (str (:type event)) (:type_instance event) (str "." (:type_instance event)))
36 | event (assoc event :service service :host host)
37 | event (merge event (plugin-map (:plugin_instance event)))]
38 | (call-rescue event children))))
39 |
40 | (def default-services
41 | [{:service #"^load/load/(.*)$" :rewrite "load $1"}
42 | {:service #"^swap/percent-(.*)$" :rewrite "swap $1"}
43 | {:service #"^memory/percent-(.*)$" :rewrite "memory $1"}
44 | {:service #"^processes/ps_state-(.*)$" :rewrite "processes $1"}
45 | {:service #"^processes-(.*)/(.*)$" :rewrite "processes $1 $2"}
46 | {:service #"^cpu/percent-(.*)$" :rewrite "cpu $1"}
47 | {:service #"^df-(.*)/(df_complex|percent_bytes)-(.*)$" :rewrite "df $1 $2 $3"}
48 | {:service #"^interface-(.*)/if_(errors|packets|octets)/(tx|rx)$" :rewrite "nic $1 $3 $2"}
49 | {:service #"^protocols-(.*)/(.*)$" :rewrite "protocols $1 $2"}
50 | {:service #"^GenericJMX-(:?_|\/)?(.*)$" :rewrite "jmx $2"}
51 | {:service #"^haproxy\/(gauge|derive)-(.*)$" :rewrite "haproxy $2"}
52 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "$2"}
53 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "statsd $1 $2"}
54 | {:service #"^mysql-(.*)\/(counter|gauge)-(.*)$" :rewrite "mysql $1 $3"}
55 | {:service #"^dbi-(.*)\/(gauge|counter)-(.*)$" :rewrite "dbi $1 $3"}
56 | {:service #"^redis-(.*)$" :rewrite "redis $1"}])
57 |
58 | (defn rewrite-service-with
59 | [rules]
60 | (let [matcher (fn [s1 s2] (if (string? s1) (= s1 s2) (re-find s1 s2)))]
61 | (fn [{:keys [service] :as event}]
62 | (or
63 | (first
64 | (for [{:keys [rewrite] :as rule} rules
65 | :when (matcher (:service rule) service)]
66 | (assoc event :service
67 | (if (string? (:service rule))
68 | rewrite
69 | (str/replace service (:service rule) rewrite)))))
70 | event))))
71 |
72 | (def rewrite-service
73 | (rewrite-service-with default-services))
74 |
--------------------------------------------------------------------------------
/10/riemann/examplecom/etc/count-notifications.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.count-notifications
2 | (:require [riemann.streams :refer :all]))
3 |
4 | (defn count-notifications
5 | "Count notifications"
6 | [& children]
7 | (adjust [:service #(str % ".rate")]
8 | (tag "notification-rate"
9 | (rate 5
10 | (fn [event]
11 | (call-rescue event children))))))
12 |
--------------------------------------------------------------------------------
/10/riemann/examplecom/etc/email.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.email
2 | (:require [clojure.string :as str]
3 | [riemann.email :refer :all]))
4 |
5 | (defn format-subject
6 | "Format the email subject"
7 | [events]
8 | (apply format "Service %s is in state %s on host %s" (str/join ", " (map :service events)) (str/join ", " (map :state events)) (map :host events)))
9 |
10 | (def header "Monitoring notification from Riemann!\n\n")
11 | (def footer "This is an automated Riemann notification. Please do not reply.")
12 |
13 | (defn lookup
14 | "Lookup events in the index"
15 | [host service]
16 | (riemann.index/lookup (:index @riemann.config/core) host service))
17 |
18 | (defn round
19 | "Round numbers to 2 decimal places"
20 | [metric]
21 | (clojure.pprint/cl-format nil "~,2f" metric))
22 |
23 | (defn byte-to-gb [bytes] (/ bytes (* 1024.0 1024.0 1024.0)))
24 |
25 | (defn context
26 | "Add some contextual event data"
27 | [event]
28 | (str
29 | "Host context:\n"
30 | " CPU Utilization:\t"(round (+ (:metric (lookup (:host event) "cpu/percent-system")) (:metric (lookup (:host event) "cpu/percent-user")))) "%\n"
31 | " Memory Used:\t"(round (:metric (lookup (:host event) "memory/percent-used"))) "%\n"
32 | " Disk(root) %:\t\t"(round (:metric (lookup (:host event) "df-root/percent_bytes-used"))) "% used "
33 | " ("(round (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-used")))) " GB used of "
34 | (round (+ (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-used")))
35 | (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-free")))
36 | (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-reserved"))))) "GB)\n\n"
37 | "Grafana Dashboard:\n\n"
38 | " http://graphitea.example.com:3000/dashboard/script/riemann.js?host="(:host event)"\n\n"))
39 |
40 | (defn format-body
41 | "Format the email body"
42 | [events]
43 | (str/join "\n\n\n"
44 | (map
45 | (fn [event]
46 | (str
47 | header
48 | "Time:\t\t" (riemann.common/time-at (:time event)) "\n"
49 | "Host:\t\t" (:host event) "\n"
50 | "Service:\t\t" (:service event) "\n"
51 | "State:\t\t" (:state event) "\n"
52 | "Metric:\t\t" (if (ratio? (:metric event))
53 | (double (:metric event))
54 | (:metric event)) "\n"
55 | "Tags:\t\t[" (str/join ", " (:tags event)) "] \n"
56 | "\n"
57 | "Description:\t\t" (:description event)
58 | "\n\n"
59 | (context event)
60 | footer))
61 | events)))
62 |
63 | (def email (mailer {:from "riemann@example.com"
64 | :subject (fn [events] (format-subject events))
65 | :body (fn [events] (format-body events))
66 | }))
67 |
--------------------------------------------------------------------------------
/10/riemann/examplecom/etc/graphite.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.graphite
2 | (:require [clojure.string :as str]
3 | [riemann.config :refer :all]
4 | [riemann.graphite :refer :all]))
5 |
6 | (defn graphite-path-statsd [event]
7 | (let [host (:host event)
8 | app (re-find #"^.*?\." (:service event))
9 | service (str/replace-first (:service event) #"^.*?\." "")
10 | split-host (if host (str/split host #"\.") [])
11 | split-service (if service (str/split service #" ") [])]
12 | (str app, (str/join "." (concat (reverse split-host) split-service)))))
13 |
14 | (defn add-environment-to-graphite [event]
15 | (condp = (:plugin event)
16 | "docker"
17 | (if (:com.example.application event)
18 | (str "productiona.docker.", (:com.example.application event), ".", (riemann.graphite/graphite-path-percentiles event))
19 | (str "productiona.docker.", (riemann.graphite/graphite-path-percentiles event)))
20 | "statsd" (str "productiona.", (graphite-path-statsd event))
21 | (str "productiona.hosts.", (riemann.graphite/graphite-path-percentiles event))))
22 |
23 | (def graph (async-queue! :graphite {:queue-size 1000}
24 | (graphite {:host "graphitea" :path add-environment-to-graphite})))
25 |
--------------------------------------------------------------------------------
/10/riemann/examplecom/etc/logstash.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.logstash
2 | (:require [riemann.logstash :refer :all]))
3 |
4 | (def logstash (async-queue! :logstash {:queue-size 1000}
5 |                           (logstash {:host "logstash" :port 2003 :pool-size 20})))
6 |
--------------------------------------------------------------------------------
/10/riemann/examplecom/etc/maintenance.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.maintenance
2 | (:require [riemann.streams :refer :all]))
3 |
4 | (defn maintenance-mode?
5 | "Is it currently in maintenance mode?"
6 | [event]
7 | (->> '(and (= host (:host event))
8 | (= service (:service event))
9 | (= (:type event) "maintenance-mode"))
10 | (riemann.index/search (:index @core))
11 | first
12 | :state
13 | (= "active")))
14 |
--------------------------------------------------------------------------------
/10/riemann/examplecom/etc/pagerduty.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.pagerduty
2 | (:require [riemann.pagerduty :refer :all]
3 | [riemann.streams :refer :all]))
4 |
5 | (defn pd-format
6 | [event]
7 | {:incident_key (str (:host event) " " (:service event))
8 | :description (str "Host: " (:host event) " "
9 | (:service event) " is "
10 | (:state event) " ("
11 | (:metric event) ")")
12 | :details (assoc event :graphs (str "http://graphitea.example.com:3000/dashboard/script/riemann.js?host="(:host event)))})
13 |
14 | (def pd (pagerduty { :service-key "123ABC123" :formatter pd-format}))
15 |
16 | (defn page
17 | []
18 | (changed-state {:init "ok"}
19 | (where (state "ok")
20 | (:resolve pd)
21 | (else (:trigger pd)))))
22 |
--------------------------------------------------------------------------------
/10/riemann/examplecom/etc/slack.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.slack
2 | (:require [riemann.slack :refer :all]))
3 |
4 | (def credentials {:account "examplecom", :token "123ABC123ABC"})
5 |
6 | (defn slack-format
7 | "Format our Slack message"
8 | [event]
9 | (str "Service " (:service event) " on host " (:host event) " is in state " (:state event) ".\n"
10 | "See http://graphitea.example.com:3000/dashboard/script/riemann.js?host="(:host event) ))
11 |
12 | (defn slacker
13 | "Send notifications to Slack"
14 | [& {:keys [recipient]
15 | :or {recipient "#monitoring"}}]
16 | (slack credentials {:username "Riemann bot"
17 | :channel recipient
18 | :formatter (fn [e] { :text (slack-format e) } )
19 | :icon ":smile:"}))
20 |
--------------------------------------------------------------------------------
/10/riemann/riemann.config:
--------------------------------------------------------------------------------
1 | (logging/init {:file "/var/log/riemann/riemann.log"})
2 |
3 | (require 'riemann.client)
4 | (require '[examplecom.etc.email :refer :all])
5 | (require '[examplecom.etc.graphite :refer :all])
6 | (require '[examplecom.etc.collectd :refer :all])
7 |
8 | (let [host "0.0.0.0"]
9 | (repl-server {:host "127.0.0.1"})
10 | (tcp-server {:host host})
11 | (udp-server {:host host})
12 | (ws-server {:host host}))
13 |
14 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]})
15 |
16 | (let [index (index)
17 | downstream (batch 100 1/10
18 | (async-queue! :agg { :queue-size 1e3
19 | :core-pool-size 4
20 | :max-pool-size 32}
21 | (forward
22 | (riemann.client/tcp-client :host "riemannmc"))))]
23 |
24 | ; Inbound events will be passed to these streams:
25 | (streams
26 | (default :ttl 60
27 | ; Index all events immediately.
28 | (where (not (tagged "notification"))
29 | index)
30 |
31 | (tagged "collectd"
32 | (where (not (= (:plugin event) "docker"))
33 | (smap rewrite-service graph))
34 |
35 | (where (= (:plugin event) "docker")
36 | (smap (comp parse-docker-service-host docker-attributes rewrite-service) graph))
37 |
38 | (tagged "notification"
39 | (where (not (maintenance-mode? event))
40 | (changed-state {:init "ok"}
41 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count$" "$1"]
42 | (email "james@example.com")))))
43 |
44 |     (where (and (expired? event)
45 |                 (service #"^processes-.+\/ps_count\/processes")
46 |                 (not (maintenance-mode? event)))
47 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count\/processes$" "$1"]
48 | (email "james@example.com"))))
49 |
50 | (where (service #"^riemann.*")
51 | graph
52 |
53 | downstream))))
54 |
--------------------------------------------------------------------------------
/11-13/collectd/mysql.conf:
--------------------------------------------------------------------------------
1 |
2 | Globals true
3 |
4 |
5 | ModulePath "/usr/lib/collectd/mysql/"
6 |
7 |
8 |
9 | Import mysql
10 |
11 | Host "localhost"
12 | Port 3306
13 | User "collectd"
14 | Password "collectd"
15 |
16 |
17 |
18 | LoadPlugin processes
19 |
20 | Process "mysqld"
21 |
22 |
23 | LoadPlugin dbi
24 |
25 |
26 | Statement "SELECT COUNT(*) AS value FROM items;"
27 | MinVersion 50000
28 |
29 | Type "gauge"
30 | InstancePrefix "tornado_item_count"
31 | ValuesFrom "value"
32 |
33 |
34 |
35 | Statement "SELECT SUM(price) AS total_price FROM items"
36 | MinVersion 50000
37 |
38 | Type "gauge"
39 | InstancePrefix "item_sold_total_price"
40 | ValuesFrom "total_price"
41 |
42 |
43 |
44 | Statement "SELECT MAX(thread_id), timer_wait/1000000000 AS exec_time_ms
45 | FROM events_statements_history_long
46 | WHERE digest_text = 'INSERT INTO `items` ( `title` , TEXT , `price` , `id` ) VALUES (...)';"
47 | MinVersion 50000
48 |
49 | Type "gauge"
50 | InstancePrefix "insert_query_time"
51 | ValuesFrom "exec_time_ms"
52 |
53 |
54 |
55 | Driver "mysql"
56 | DriverOption "host" "localhost"
57 | DriverOption "username" "collectd"
58 | DriverOption "password" "collectd"
59 | DriverOption "dbname" "items"
60 | SelectDB "items"
61 | Query "get_item_count"
62 | Query "item_sold_total_price"
63 |
64 |
65 | Driver "mysql"
66 | DriverOption "host" "localhost"
67 | DriverOption "username" "collectd"
68 | DriverOption "password" "collectd"
69 | DriverOption "dbname" "performance_schema"
70 | Query "insert_query_time"
71 |
72 |
73 |
--------------------------------------------------------------------------------
/11-13/collectd/tornado-api.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin processes
2 |
3 | ProcessMatch "tornado-api" "-jar tornado-api"
4 |
5 |
6 | LoadPlugin java
7 |
8 | JVMARG "-Djava.class.path=/usr/share/collectd/java/collectd-api.jar:/usr/share/collectd/java/generic-jmx.jar"
9 | LoadPlugin "org.collectd.java.GenericJMX"
10 |
11 |
12 | ObjectName "java.lang:type=GarbageCollector,*"
13 | InstancePrefix "gc-"
14 | InstanceFrom "name"
15 |
16 | Type "derive"
17 | Table false
18 | Attribute "CollectionCount"
19 | InstancePrefix "count"
20 |
21 |
22 |
23 | ObjectName "java.lang:type=GarbageCollector,*"
24 | InstancePrefix "gc-"
25 | InstanceFrom "name"
26 |
27 | Type "derive"
28 | Table false
29 | Attribute "CollectionTime"
30 | InstancePrefix "time"
31 |
32 |
33 |
34 | ObjectName "java.lang:type=MemoryPool,*"
35 | InstancePrefix "memory_pool-"
36 | InstanceFrom "name"
37 |
38 | Type "memory"
39 | Table true
40 | Attribute "Usage"
41 |
42 |
43 |
44 | ObjectName "java.lang:type=Memory"
45 | InstancePrefix "memory-heap"
46 |
47 | Type "memory"
48 | Table true
49 | Attribute "HeapMemoryUsage"
50 |
51 |
52 |
53 | ObjectName "java.lang:type=Memory"
54 | InstancePrefix "memory-nonheap"
55 |
56 | Type "memory"
57 | Table true
58 | Attribute "NonHeapMemoryUsage"
59 |
60 |
61 |
62 | ObjectName "java.lang:type=Threading"
63 | InstancePrefix "threading"
64 |
65 | Type "gauge"
66 | Table false
67 | Attribute "ThreadCount"
68 | InstancePrefix "count"
69 |
70 |
71 |
72 | ObjectName "java.lang:type=Threading"
73 | InstancePrefix "threading"
74 |
75 | Type "gauge"
76 | Table false
77 | Attribute "DaemonThreadCount"
78 | InstancePrefix "count-daemon"
79 |
80 |
81 |
82 | ServiceURL "service:jmx:rmi:///jndi/rmi://localhost:8855/jmxrmi"
83 | Collect "memory_pool"
84 | Collect "memory-heap"
85 | Collect "memory-nonheap"
86 | Collect "gc-count"
87 | Collect "gc-time"
88 | Collect "thread"
89 | Collect "thread-daemon"
90 |
91 |
92 |
93 |
--------------------------------------------------------------------------------
/11-13/grafana/tornado-dashboard.json:
--------------------------------------------------------------------------------
1 | {
2 | "id": 6,
3 | "title": "Tornado",
4 | "originalTitle": "Tornado",
5 | "tags": [],
6 | "style": "dark",
7 | "timezone": "browser",
8 | "editable": true,
9 | "hideControls": false,
10 | "sharedCrosshair": false,
11 | "rows": [
12 | {
13 | "collapse": false,
14 | "editable": true,
15 | "height": "250px",
16 | "panels": [
17 | {
18 | "cacheTimeout": null,
19 | "colorBackground": false,
20 | "colorValue": false,
21 | "colors": [
22 | "rgba(50, 172, 45, 0.97)",
23 | "rgba(237, 129, 40, 0.89)",
24 | "rgba(245, 54, 54, 0.9)"
25 | ],
26 | "datasource": null,
27 | "editable": true,
28 | "error": false,
29 | "format": "none",
30 | "id": 19,
31 | "interval": null,
32 | "isNew": true,
33 | "links": [],
34 | "maxDataPoints": 100,
35 | "nullPointMode": "connected",
36 | "nullText": null,
37 | "postfix": "",
38 | "postfixFontSize": "50%",
39 | "prefix": "",
40 | "prefixFontSize": "50%",
41 | "span": 6,
42 | "sparkline": {
43 | "fillColor": "rgba(31, 118, 189, 0.18)",
44 | "full": true,
45 | "lineColor": "rgb(31, 120, 193)",
46 | "show": true
47 | },
48 | "targets": [
49 | {
50 | "refId": "A",
51 | "target": "productiona.hosts.tornado-db.dbi.items.tornado_item_count",
52 | "textEditor": true
53 | }
54 | ],
55 | "thresholds": "",
56 | "title": "Item Count",
57 | "type": "singlestat",
58 | "valueFontSize": "80%",
59 | "valueMaps": [
60 | {
61 | "op": "=",
62 | "text": "N/A",
63 | "value": "null"
64 | }
65 | ],
66 | "valueName": "avg"
67 | },
68 | {
69 | "cacheTimeout": null,
70 | "colorBackground": false,
71 | "colorValue": false,
72 | "colors": [
73 | "rgba(50, 172, 45, 0.97)",
74 | "rgba(237, 129, 40, 0.89)",
75 | "rgba(245, 54, 54, 0.9)"
76 | ],
77 | "datasource": null,
78 | "editable": true,
79 | "error": false,
80 | "format": "currencyUSD",
81 | "id": 21,
82 | "interval": null,
83 | "isNew": true,
84 | "links": [],
85 | "maxDataPoints": 100,
86 | "nullPointMode": "connected",
87 | "nullText": null,
88 | "postfix": "",
89 | "postfixFontSize": "50%",
90 | "prefix": "",
91 | "prefixFontSize": "50%",
92 | "span": 6,
93 | "sparkline": {
94 | "fillColor": "rgba(31, 118, 189, 0.18)",
95 | "full": true,
96 | "lineColor": "rgb(31, 120, 193)",
97 | "show": true
98 | },
99 | "targets": [
100 | {
101 | "refId": "A",
102 | "target": "productiona.hosts.tornado-db.dbi.items.item_sold_total_price",
103 | "textEditor": true
104 | }
105 | ],
106 | "thresholds": "",
107 | "title": "Total price of items sold",
108 | "type": "singlestat",
109 | "valueFontSize": "80%",
110 | "valueMaps": [
111 | {
112 | "op": "=",
113 | "text": "N/A",
114 | "value": "null"
115 | }
116 | ],
117 | "valueName": "avg"
118 | },
119 | {
120 | "aliasColors": {},
121 | "bars": false,
122 | "datasource": null,
123 | "editable": true,
124 | "error": false,
125 | "fill": 1,
126 | "grid": {
127 | "leftLogBase": 1,
128 | "leftMax": null,
129 | "leftMin": null,
130 | "rightLogBase": 1,
131 | "rightMax": null,
132 | "rightMin": null,
133 | "threshold1": null,
134 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
135 | "threshold2": null,
136 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
137 | },
138 | "id": 23,
139 | "isNew": true,
140 | "legend": {
141 | "avg": false,
142 | "current": false,
143 | "max": false,
144 | "min": false,
145 | "show": true,
146 | "total": false,
147 | "values": false
148 | },
149 | "lines": true,
150 | "linewidth": 2,
151 | "links": [],
152 | "nullPointMode": "connected",
153 | "percentage": false,
154 | "pointradius": 5,
155 | "points": false,
156 | "renderer": "flot",
157 | "seriesOverrides": [],
158 | "span": 6,
159 | "stack": false,
160 | "steppedLine": false,
161 | "targets": [
162 | {
163 | "refId": "A",
164 | "target": "alias(sumSeriesWithWildcards(productiona.hosts.*.statsd.gauge.tornado.api.item.sold.total, 2), 'Tornado API servers')",
165 | "textEditor": true
166 | }
167 | ],
168 | "timeFrom": null,
169 | "timeShift": null,
170 | "title": "Tornado API Sold Total $",
171 | "tooltip": {
172 | "shared": true,
173 | "value_type": "cumulative"
174 | },
175 | "type": "graph",
176 | "x-axis": true,
177 | "y-axis": true,
178 | "y_formats": [
179 | "short",
180 | "short"
181 | ]
182 | },
183 | {
184 | "aliasColors": {},
185 | "bars": false,
186 | "datasource": null,
187 | "editable": true,
188 | "error": false,
189 | "fill": 1,
190 | "grid": {
191 | "leftLogBase": 1,
192 | "leftMax": null,
193 | "leftMin": null,
194 | "rightLogBase": 1,
195 | "rightMax": null,
196 | "rightMin": null,
197 | "threshold1": null,
198 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
199 | "threshold2": null,
200 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
201 | },
202 | "id": 17,
203 | "isNew": true,
204 | "legend": {
205 | "avg": false,
206 | "current": false,
207 | "max": false,
208 | "min": false,
209 | "show": true,
210 | "total": false,
211 | "values": false
212 | },
213 | "lines": true,
214 | "linewidth": 2,
215 | "links": [],
216 | "nullPointMode": "connected",
217 | "percentage": false,
218 | "pointradius": 5,
219 | "points": false,
220 | "renderer": "flot",
221 | "seriesOverrides": [],
222 | "span": 6,
223 | "stack": false,
224 | "steppedLine": false,
225 | "targets": [
226 | {
227 | "refId": "A",
228 | "target": "alias(sumSeriesWithWildcards(productiona.hosts.*.statsd.gauge.tornado.api.item.bought.total, 2), 'Tornado API servers')",
229 | "textEditor": true
230 | }
231 | ],
232 | "timeFrom": null,
233 | "timeShift": null,
234 | "title": "Tornado API Bought Total $",
235 | "tooltip": {
236 | "shared": true,
237 | "value_type": "cumulative"
238 | },
239 | "type": "graph",
240 | "x-axis": true,
241 | "y-axis": true,
242 | "y_formats": [
243 | "short",
244 | "short"
245 | ]
246 | },
247 | {
248 | "aliasColors": {},
249 | "bars": false,
250 | "datasource": null,
251 | "editable": true,
252 | "error": false,
253 | "fill": 1,
254 | "grid": {
255 | "leftLogBase": 1,
256 | "leftMax": null,
257 | "leftMin": null,
258 | "rightLogBase": 1,
259 | "rightMax": null,
260 | "rightMin": null,
261 | "threshold1": null,
262 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
263 | "threshold2": null,
264 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
265 | },
266 | "id": 16,
267 | "isNew": true,
268 | "legend": {
269 | "avg": false,
270 | "current": false,
271 | "max": false,
272 | "min": false,
273 | "show": true,
274 | "total": false,
275 | "values": false
276 | },
277 | "lines": true,
278 | "linewidth": 2,
279 | "links": [],
280 | "nullPointMode": "connected",
281 | "percentage": false,
282 | "pointradius": 5,
283 | "points": false,
284 | "renderer": "flot",
285 | "seriesOverrides": [],
286 | "span": 6,
287 | "stack": false,
288 | "steppedLine": false,
289 | "targets": [
290 | {
291 | "refId": "A",
292 | "target": "aliasByNode(productiona.hosts.*.tornado.api.request.99, 2)",
293 | "textEditor": false
294 | }
295 | ],
296 | "timeFrom": null,
297 | "timeShift": null,
298 | "title": "Tornado API Request Time 0.99",
299 | "tooltip": {
300 | "shared": true,
301 | "value_type": "cumulative"
302 | },
303 | "type": "graph",
304 | "x-axis": true,
305 | "y-axis": true,
306 | "y_formats": [
307 | "short",
308 | "short"
309 | ]
310 | },
311 | {
312 | "aliasColors": {},
313 | "bars": false,
314 | "datasource": null,
315 | "editable": true,
316 | "error": false,
317 | "fill": 1,
318 | "grid": {
319 | "leftLogBase": 1,
320 | "leftMax": null,
321 | "leftMin": null,
322 | "rightLogBase": 1,
323 | "rightMax": null,
324 | "rightMin": null,
325 | "threshold1": null,
326 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
327 | "threshold2": null,
328 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
329 | },
330 | "id": 14,
331 | "isNew": true,
332 | "legend": {
333 | "avg": false,
334 | "current": false,
335 | "max": false,
336 | "min": false,
337 | "show": true,
338 | "total": false,
339 | "values": false
340 | },
341 | "lines": true,
342 | "linewidth": 2,
343 | "links": [],
344 | "nullPointMode": "connected",
345 | "percentage": false,
346 | "pointradius": 5,
347 | "points": false,
348 | "renderer": "flot",
349 | "seriesOverrides": [
350 | {}
351 | ],
352 | "span": 6,
353 | "stack": false,
354 | "steppedLine": false,
355 | "targets": [
356 | {
357 | "refId": "A",
358 | "target": "aliasByNode(productiona.hosts.*.tornado.api.request.rate,2)",
359 | "textEditor": false
360 | }
361 | ],
362 | "timeFrom": null,
363 | "timeShift": null,
364 | "title": "Tornado API Request rate",
365 | "tooltip": {
366 | "shared": true,
367 | "value_type": "cumulative"
368 | },
369 | "type": "graph",
370 | "x-axis": true,
371 | "y-axis": true,
372 | "y_formats": [
373 | "short",
374 | "short"
375 | ]
376 | },
377 | {
378 | "aliasColors": {},
379 | "bars": false,
380 | "datasource": null,
381 | "editable": true,
382 | "error": false,
383 | "fill": 1,
384 | "grid": {
385 | "leftLogBase": 1,
386 | "leftMax": null,
387 | "leftMin": null,
388 | "rightLogBase": 1,
389 | "rightMax": null,
390 | "rightMin": null,
391 | "threshold1": null,
392 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
393 | "threshold2": null,
394 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
395 | },
396 | "id": 13,
397 | "isNew": true,
398 | "legend": {
399 | "avg": false,
400 | "current": false,
401 | "max": false,
402 | "min": false,
403 | "show": true,
404 | "total": false,
405 | "values": false
406 | },
407 | "lines": true,
408 | "linewidth": 2,
409 | "links": [],
410 | "nullPointMode": "connected",
411 | "percentage": false,
412 | "pointradius": 5,
413 | "points": false,
414 | "renderer": "flot",
415 | "seriesOverrides": [],
416 | "span": 6,
417 | "stack": false,
418 | "steppedLine": false,
419 | "targets": [
420 | {
421 | "refId": "A",
422 | "target": "aliasByNode(productiona.hosts.tornado-proxy.haproxy.frontend.tornado-www.5xx_error_percentage,2)",
423 | "textEditor": true
424 | }
425 | ],
426 | "timeFrom": null,
427 | "timeShift": null,
428 | "title": "Tornado 5xx Error Percentage",
429 | "tooltip": {
430 | "shared": true,
431 | "value_type": "cumulative"
432 | },
433 | "type": "graph",
434 | "x-axis": true,
435 | "y-axis": true,
436 | "y_formats": [
437 | "short",
438 | "short"
439 | ]
440 | },
441 | {
442 | "aliasColors": {},
443 | "bars": false,
444 | "datasource": null,
445 | "editable": true,
446 | "error": false,
447 | "fill": 1,
448 | "grid": {
449 | "leftLogBase": 1,
450 | "leftMax": null,
451 | "leftMin": null,
452 | "rightLogBase": 1,
453 | "rightMax": null,
454 | "rightMin": null,
455 | "threshold1": null,
456 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
457 | "threshold2": null,
458 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
459 | },
460 | "id": 22,
461 | "isNew": true,
462 | "legend": {
463 | "avg": false,
464 | "current": false,
465 | "max": false,
466 | "min": false,
467 | "show": true,
468 | "total": false,
469 | "values": false
470 | },
471 | "lines": true,
472 | "linewidth": 2,
473 | "links": [],
474 | "nullPointMode": "connected",
475 | "percentage": false,
476 | "pointradius": 5,
477 | "points": false,
478 | "renderer": "flot",
479 | "seriesOverrides": [],
480 | "span": 6,
481 | "stack": false,
482 | "steppedLine": false,
483 | "targets": [
484 | {
485 | "refId": "A",
486 | "target": "aliasByNode(productiona.hosts.tornado-db.mysql.aborted_connection_rate,2)",
487 | "textEditor": false
488 | }
489 | ],
490 | "timeFrom": null,
491 | "timeShift": null,
492 | "title": "MySQL Aborted Connection rate",
493 | "tooltip": {
494 | "shared": true,
495 | "value_type": "cumulative"
496 | },
497 | "type": "graph",
498 | "x-axis": true,
499 | "y-axis": true,
500 | "y_formats": [
501 | "short",
502 | "short"
503 | ]
504 | },
505 | {
506 | "aliasColors": {},
507 | "bars": false,
508 | "datasource": null,
509 | "editable": true,
510 | "error": false,
511 | "fill": 1,
512 | "grid": {
513 | "leftLogBase": 1,
514 | "leftMax": null,
515 | "leftMin": null,
516 | "rightLogBase": 1,
517 | "rightMax": null,
518 | "rightMin": null,
519 | "threshold1": null,
520 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
521 | "threshold2": null,
522 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
523 | },
524 | "id": 18,
525 | "isNew": true,
526 | "legend": {
527 | "avg": false,
528 | "current": false,
529 | "max": false,
530 | "min": false,
531 | "show": true,
532 | "total": false,
533 | "values": false
534 | },
535 | "lines": true,
536 | "linewidth": 2,
537 | "links": [],
538 | "nullPointMode": "connected",
539 | "percentage": false,
540 | "pointradius": 5,
541 | "points": false,
542 | "renderer": "flot",
543 | "seriesOverrides": [],
544 | "span": 6,
545 | "stack": false,
546 | "steppedLine": false,
547 | "targets": [
548 | {
549 | "refId": "A",
550 | "target": "aliasByNode(productiona.hosts.tornado-db.dbi.performance_schema.insert_query_time.99,2)",
551 | "textEditor": true
552 | }
553 | ],
554 | "timeFrom": null,
555 | "timeShift": null,
556 | "title": "Tornado API Item Insert 0.99",
557 | "tooltip": {
558 | "shared": true,
559 | "value_type": "cumulative"
560 | },
561 | "type": "graph",
562 | "x-axis": true,
563 | "y-axis": true,
564 | "y_formats": [
565 | "short",
566 | "short"
567 | ]
568 | }
569 | ],
570 | "title": "Row"
571 | },
572 | {
573 | "collapse": false,
574 | "editable": true,
575 | "height": "250px",
576 | "panels": [
577 | {
578 | "aliasColors": {},
579 | "bars": false,
580 | "datasource": null,
581 | "editable": true,
582 | "error": false,
583 | "fill": 1,
584 | "grid": {
585 | "leftLogBase": 1,
586 | "leftMax": null,
587 | "leftMin": null,
588 | "rightLogBase": 1,
589 | "rightMax": null,
590 | "rightMin": null,
591 | "threshold1": null,
592 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
593 | "threshold2": null,
594 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
595 | },
596 | "id": 12,
597 | "legend": {
598 | "avg": false,
599 | "current": false,
600 | "max": false,
601 | "min": false,
602 | "show": true,
603 | "total": false,
604 | "values": false
605 | },
606 | "lines": true,
607 | "linewidth": 2,
608 | "links": [],
609 | "nullPointMode": "connected",
610 | "percentage": false,
611 | "pointradius": 5,
612 | "points": false,
613 | "renderer": "flot",
614 | "seriesOverrides": [],
615 | "span": 4,
616 | "stack": false,
617 | "steppedLine": false,
618 | "targets": [
619 | {
620 | "refId": "A",
621 | "target": "groupByNode(productiona.hosts.{tornado-redis,tornado-db}.cpu.{user,system},2,'sumSeries')",
622 | "textEditor": true
623 | }
624 | ],
625 | "timeFrom": null,
626 | "timeShift": null,
627 | "title": "DB Tier CPU Usage",
628 | "tooltip": {
629 | "shared": true,
630 | "value_type": "individual"
631 | },
632 | "type": "graph",
633 | "x-axis": true,
634 | "y-axis": true,
635 | "y_formats": [
636 | "short",
637 | "short"
638 | ]
639 | },
640 | {
641 | "aliasColors": {},
642 | "bars": false,
643 | "datasource": null,
644 | "editable": true,
645 | "error": false,
646 | "fill": 1,
647 | "grid": {
648 | "leftLogBase": 1,
649 | "leftMax": null,
650 | "leftMin": null,
651 | "rightLogBase": 1,
652 | "rightMax": null,
653 | "rightMin": null,
654 | "threshold1": null,
655 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
656 | "threshold2": null,
657 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
658 | },
659 | "id": 11,
660 | "legend": {
661 | "avg": false,
662 | "current": false,
663 | "max": false,
664 | "min": false,
665 | "show": true,
666 | "total": false,
667 | "values": false
668 | },
669 | "lines": true,
670 | "linewidth": 2,
671 | "links": [],
672 | "nullPointMode": "connected",
673 | "percentage": false,
674 | "pointradius": 5,
675 | "points": false,
676 | "renderer": "flot",
677 | "seriesOverrides": [],
678 | "span": 4,
679 | "stack": false,
680 | "steppedLine": false,
681 | "targets": [
682 | {
683 | "refId": "A",
684 | "target": "groupByNode(productiona.hosts.{tornado-api1,tornado-api2}.cpu.{user,system},2,'sumSeries')",
685 | "textEditor": true
686 | }
687 | ],
688 | "timeFrom": null,
689 | "timeShift": null,
690 | "title": "App Tier CPU Usage",
691 | "tooltip": {
692 | "shared": true,
693 | "value_type": "individual"
694 | },
695 | "type": "graph",
696 | "x-axis": true,
697 | "y-axis": true,
698 | "y_formats": [
699 | "short",
700 | "short"
701 | ]
702 | },
703 | {
704 | "aliasColors": {},
705 | "bars": false,
706 | "datasource": null,
707 | "editable": true,
708 | "error": false,
709 | "fill": 1,
710 | "grid": {
711 | "leftLogBase": 1,
712 | "leftMax": null,
713 | "leftMin": null,
714 | "rightLogBase": 1,
715 | "rightMax": null,
716 | "rightMin": null,
717 | "threshold1": null,
718 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
719 | "threshold2": null,
720 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
721 | },
722 | "id": 15,
723 | "isNew": true,
724 | "legend": {
725 | "avg": false,
726 | "current": false,
727 | "max": false,
728 | "min": false,
729 | "show": true,
730 | "total": false,
731 | "values": false
732 | },
733 | "lines": true,
734 | "linewidth": 2,
735 | "links": [],
736 | "nullPointMode": "connected",
737 | "percentage": false,
738 | "pointradius": 5,
739 | "points": false,
740 | "renderer": "flot",
741 | "seriesOverrides": [],
742 | "span": 4,
743 | "stack": false,
744 | "steppedLine": false,
745 | "targets": [
746 | {
747 | "refId": "A",
748 | "target": "groupByNode(productiona.hosts.{tornado-proxy,tornado-web1,tornado-web2}.cpu.{user,system},2,'sumSeries')",
749 | "textEditor": true
750 | }
751 | ],
752 | "timeFrom": null,
753 | "timeShift": null,
754 | "title": "Web Tier CPU Usage",
755 | "tooltip": {
756 | "shared": true,
757 | "value_type": "cumulative"
758 | },
759 | "type": "graph",
760 | "x-axis": true,
761 | "y-axis": true,
762 | "y_formats": [
763 | "short",
764 | "short"
765 | ]
766 | }
767 | ],
768 | "title": "New row"
769 | },
770 | {
771 | "collapse": false,
772 | "editable": true,
773 | "height": "250px",
774 | "panels": [
775 | {
776 | "aliasColors": {},
777 | "bars": false,
778 | "datasource": null,
779 | "editable": true,
780 | "error": false,
781 | "fill": 1,
782 | "grid": {
783 | "leftLogBase": 1,
784 | "leftMax": null,
785 | "leftMin": null,
786 | "rightLogBase": 1,
787 | "rightMax": null,
788 | "rightMin": null,
789 | "threshold1": null,
790 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
791 | "threshold2": null,
792 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
793 | },
794 | "id": 3,
795 | "legend": {
796 | "avg": false,
797 | "current": false,
798 | "max": false,
799 | "min": false,
800 | "show": true,
801 | "total": false,
802 | "values": false
803 | },
804 | "lines": true,
805 | "linewidth": 2,
806 | "links": [],
807 | "nullPointMode": "connected",
808 | "percentage": false,
809 | "pointradius": 5,
810 | "points": false,
811 | "renderer": "flot",
812 | "seriesOverrides": [],
813 | "span": 6,
814 | "stack": false,
815 | "steppedLine": false,
816 | "targets": [
817 | {
818 | "refId": "A",
819 | "target": "aliasByNode(productiona.hosts.{tornado-proxy,tornado-web1,tornado-web2,tornado-api1,tornado-api2,tornado-redis,tornado-db}.swap.used, 2)",
820 | "textEditor": true
821 | }
822 | ],
823 | "timeFrom": null,
824 | "timeShift": null,
825 | "title": "Tornado Swap",
826 | "tooltip": {
827 | "shared": true,
828 | "value_type": "cumulative"
829 | },
830 | "type": "graph",
831 | "x-axis": true,
832 | "y-axis": true,
833 | "y_formats": [
834 | "short",
835 | "short"
836 | ]
837 | },
838 | {
839 | "aliasColors": {},
840 | "bars": false,
841 | "datasource": null,
842 | "editable": true,
843 | "error": false,
844 | "fill": 1,
845 | "grid": {
846 | "leftLogBase": 1,
847 | "leftMax": null,
848 | "leftMin": null,
849 | "rightLogBase": 1,
850 | "rightMax": null,
851 | "rightMin": null,
852 | "threshold1": null,
853 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
854 | "threshold2": null,
855 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
856 | },
857 | "id": 2,
858 | "legend": {
859 | "avg": false,
860 | "current": false,
861 | "max": false,
862 | "min": false,
863 | "show": true,
864 | "total": false,
865 | "values": false
866 | },
867 | "lines": true,
868 | "linewidth": 2,
869 | "links": [],
870 | "nullPointMode": "connected",
871 | "percentage": false,
872 | "pointradius": 5,
873 | "points": false,
874 | "renderer": "flot",
875 | "seriesOverrides": [],
876 | "span": 6,
877 | "stack": false,
878 | "steppedLine": false,
879 | "targets": [
880 | {
881 | "refId": "A",
882 | "target": "aliasByNode(productiona.hosts.{tornado-proxy,tornado-web1,tornado-web2,tornado-api1,tornado-api2,tornado-redis,tornado-db}.memory.used,2)",
883 | "textEditor": true
884 | }
885 | ],
886 | "timeFrom": null,
887 | "timeShift": null,
888 | "title": "Tornado Memory Usage",
889 | "tooltip": {
890 | "shared": true,
891 | "value_type": "cumulative"
892 | },
893 | "type": "graph",
894 | "x-axis": true,
895 | "y-axis": true,
896 | "y_formats": [
897 | "short",
898 | "short"
899 | ]
900 | }
901 | ],
902 | "title": "New row"
903 | },
904 | {
905 | "collapse": false,
906 | "editable": true,
907 | "height": "250px",
908 | "panels": [
909 | {
910 | "aliasColors": {},
911 | "bars": false,
912 | "datasource": null,
913 | "editable": true,
914 | "error": false,
915 | "fill": 1,
916 | "grid": {
917 | "leftLogBase": 1,
918 | "leftMax": null,
919 | "leftMin": null,
920 | "rightLogBase": 1,
921 | "rightMax": null,
922 | "rightMin": null,
923 | "threshold1": null,
924 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
925 | "threshold2": null,
926 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
927 | },
928 | "id": 5,
929 | "legend": {
930 | "avg": false,
931 | "current": false,
932 | "max": false,
933 | "min": false,
934 | "show": true,
935 | "total": false,
936 | "values": false
937 | },
938 | "lines": true,
939 | "linewidth": 2,
940 | "links": [],
941 | "nullPointMode": "connected",
942 | "percentage": false,
943 | "pointradius": 5,
944 | "points": false,
945 | "renderer": "flot",
946 | "seriesOverrides": [],
947 | "span": 6,
948 | "stack": false,
949 | "steppedLine": false,
950 | "targets": [
951 | {
952 | "refId": "A",
953 | "target": "aliasByNode(productiona.hosts.{tornado-proxy,tornado-web1,tornado-web2,tornado-api1,tornado-api2,tornado-redis,tornado-db}.load.shortterm,2)",
954 | "textEditor": true
955 | }
956 | ],
957 | "timeFrom": null,
958 | "timeShift": null,
959 | "title": "Tornado Load Average (short-term)",
960 | "tooltip": {
961 | "shared": true,
962 | "value_type": "cumulative"
963 | },
964 | "type": "graph",
965 | "x-axis": true,
966 | "y-axis": true,
967 | "y_formats": [
968 | "short",
969 | "short"
970 | ]
971 | },
972 | {
973 | "aliasColors": {},
974 | "bars": false,
975 | "datasource": null,
976 | "editable": true,
977 | "error": false,
978 | "fill": 1,
979 | "grid": {
980 | "leftLogBase": 1,
981 | "leftMax": null,
982 | "leftMin": null,
983 | "rightLogBase": 1,
984 | "rightMax": null,
985 | "rightMin": null,
986 | "threshold1": null,
987 | "threshold1Color": "rgba(216, 200, 27, 0.27)",
988 | "threshold2": null,
989 | "threshold2Color": "rgba(234, 112, 112, 0.22)"
990 | },
991 | "id": 4,
992 | "legend": {
993 | "avg": false,
994 | "current": false,
995 | "max": false,
996 | "min": false,
997 | "show": true,
998 | "total": false,
999 | "values": false
1000 | },
1001 | "lines": true,
1002 | "linewidth": 2,
1003 | "links": [],
1004 | "nullPointMode": "connected",
1005 | "percentage": false,
1006 | "pointradius": 5,
1007 | "points": false,
1008 | "renderer": "flot",
1009 | "seriesOverrides": [],
1010 | "span": 6,
1011 | "stack": false,
1012 | "steppedLine": false,
1013 | "targets": [
1014 | {
1015 | "refId": "A",
1016 | "target": "aliasByNode(productiona.hosts.{tornado-proxy,tornado-web1,tornado-web2,tornado-api1,tornado-api2,tornado-redis,tornado-db}.df.root.percent_bytes.used, 2)",
1017 | "textEditor": true
1018 | }
1019 | ],
1020 | "timeFrom": null,
1021 | "timeShift": null,
1022 | "title": "Tornado disk used on /",
1023 | "tooltip": {
1024 | "shared": true,
1025 | "value_type": "cumulative"
1026 | },
1027 | "type": "graph",
1028 | "x-axis": true,
1029 | "y-axis": true,
1030 | "y_formats": [
1031 | "short",
1032 | "short"
1033 | ]
1034 | }
1035 | ],
1036 | "title": "New row"
1037 | }
1038 | ],
1039 | "time": {
1040 | "from": "now-12h",
1041 | "to": "now"
1042 | },
1043 | "timepicker": {
1044 | "collapse": false,
1045 | "enable": true,
1046 | "notice": false,
1047 | "now": true,
1048 | "refresh_intervals": [
1049 | "5s",
1050 | "10s",
1051 | "30s",
1052 | "1m",
1053 | "5m",
1054 | "15m",
1055 | "30m",
1056 | "1h",
1057 | "2h",
1058 | "1d"
1059 | ],
1060 | "status": "Stable",
1061 | "time_options": [
1062 | "5m",
1063 | "15m",
1064 | "1h",
1065 | "6h",
1066 | "12h",
1067 | "24h",
1068 | "2d",
1069 | "7d",
1070 | "30d"
1071 | ],
1072 | "type": "timepicker"
1073 | },
1074 | "templating": {
1075 | "list": []
1076 | },
1077 | "annotations": {
1078 | "list": []
1079 | },
1080 | "refresh": "30s",
1081 | "schemaVersion": 8,
1082 | "version": 41,
1083 | "links": []
1084 | }
--------------------------------------------------------------------------------
/11-13/logstash/logstash.conf:
--------------------------------------------------------------------------------
# Logstash pipeline: ships syslog, HAProxy, Nginx, and Tornado API events
# to Riemann (as metrics) and to Elasticsearch (as documents).
input {
  tcp {
    port => 5514
    type => "syslog"
  }
  # Events forwarded from Riemann as JSON over TCP.
  tcp {
    port => 2003
    type => "riemann"
    codec => "json"
  }
  udp {
    port => 5514
    type => "syslog"
  }
  file {
    path => [ "/var/log/syslog", "/var/log/auth.log" ]
    type => "syslog"
  }
}
filter {
  if [type] == "syslog" {
    # Parse the syslog envelope; optionally captures Docker-style
    # program/container_name/container_id triples and a PID.
    grok {
      match => { "message" => "(?:%{SYSLOGTIMESTAMP:syslog_timestamp}|%{TIMESTAMP_ISO8601:syslog_timestamp}) %{SYSLOGHOST:syslog_hostname} %{DATA:syslog_program}(?:\/%{DATA:container_name}\/%{DATA:container_id})?(?:\[%{POSINT:syslog_pid}\])?: %{GREEDYDATA:syslog_message}" }
      remove_field => ["message"]
    }
    syslog_pri { }
    date {
      match => [ "syslog_timestamp", "MMM d HH:mm:ss", "MMM dd HH:mm:ss", "ISO8601" ]
    }
    if [syslog_program] == "tornado-haproxy" {
      grok {
        match => ["syslog_message", "%{HAPROXYHTTPBASE}"]
        remove_field => ["syslog_message"]
        # FIX: this add_field hash (and the one in the nginx block below)
        # was previously left unclosed, leaving the filter section two
        # closing braces short and making the config fail to parse.
        add_field => { "tags" => "tornado" }
      }
    }
    if [syslog_program] == "tornado-nginx-access" {
      grok {
        patterns_dir => "/etc/logstash/patterns"
        match => { "syslog_message" => "%{NGINXACCESS}" }
        remove_field => ["syslog_message"]
        add_field => { "tags" => "tornado" }
      }
    }
    if [syslog_program] == "tornado-api" {
      grok {
        patterns_dir => "/etc/logstash/patterns"
        match => { "syslog_message" => "%{TORNADOAPI}" }
        remove_field => ["syslog_message"]
        add_field => { "tags" => "tornado" }
      }
    }
  }
}
output {
  # HAProxy request durations become tornado.proxy.request metrics.
  if [syslog_program] == "tornado-haproxy" {
    riemann {
      host => "riemanna"
      sender => "%{syslog_hostname}"
      map_fields => true
      riemann_event => {
        "service" => "tornado.proxy.request"
        "metric" => "%{time_duration}"
        "state" => "ok"
      }
    }
  }
  # Nginx response sizes become tornado.web.request metrics.
  if [syslog_program] == "tornado-nginx-access" {
    riemann {
      host => "riemanna"
      sender => "%{syslog_hostname}"
      map_fields => true
      riemann_event => {
        "service" => "tornado.web.request"
        "metric" => "%{body_bytes_sent}"
        "state" => "ok"
      }
    }
  }
  # API request timings (only events where the grok pattern captured
  # app_request_time) become tornado.api.request metrics.
  if [syslog_program] == "tornado-api" and [app_request_time] {
    riemann {
      host => "riemanna"
      sender => "%{syslog_hostname}"
      map_fields => true
      riemann_event => {
        "service" => "tornado.api.request"
        "metric" => "%{app_request_time}"
        "state" => "ok"
      }
    }
  }
  elasticsearch {
    sniffing => true
    hosts => "esa1.example.com"
  }
}
97 |
--------------------------------------------------------------------------------
/11-13/logstash/patterns/nginx:
--------------------------------------------------------------------------------
1 | NGINXACCESS %{IPORHOST:remote_addr} - %{USERNAME:remote_user} \[%{HTTPDATE:time_local}\] "%{WORD:http_method} %{URIPATHPARAM:http_request} HTTP/%{NUMBER:http_version}" %{INT:http_status} %{INT:body_bytes_sent} %{QS:http_referer} %{QS:http_user_agent}
2 |
--------------------------------------------------------------------------------
/11-13/logstash/patterns/tornadoapi:
--------------------------------------------------------------------------------
1 | TORNADOAPI %{TIMESTAMP_ISO8601:app_timestamp} %{URIHOST:app_host} %{DATA:app_severity} %{SYSLOG5424SD} - nil %{DATA:app_request_state} \:%{DATA:app_verb} %{DATA:app_path} for %{URIHOST:app_source} (?:in \(%{INT:app_request_time:int} ms\) Status: %{INT:app_status_code:int}|%{GREEDYDATA:app_request})
2 |
--------------------------------------------------------------------------------
/11-13/riemann/examplecom/app/tornado.clj:
--------------------------------------------------------------------------------
(ns examplecom.app.tornado
  "Monitoring streams for Tornado"
  ;; FIX: this file calls check_ratio, check_percentiles and create_rate
  ;; (defined in examplecom.etc.checks) and rewrite-service (defined in
  ;; examplecom.etc.collectd), but previously required neither namespace.
  (:require [riemann.config :refer :all]
            [clojure.tools.logging :refer :all]
            [examplecom.etc.checks :refer :all]
            [examplecom.etc.collectd :refer :all]
            [riemann.folds :as folds]
            [riemann.streams :refer :all]))
7 |
(defn alert_graph
  "Alert and graph on events"
  ;; FIX: the docstring was previously placed after the argument vector,
  ;; where it is just a discarded expression rather than a docstring.
  []
  (sdo
    ;; Page on transitions into critical and notify Slack on transitions
    ;; into warning; changed-state suppresses repeat notifications while
    ;; the state stays the same.
    (changed-state {:init "ok"}
      (where (state "critical")
        (page))
      (where (state "warning")
        (slacker)))
    ;; Always graph, after rewriting collectd-style service names.
    (smap rewrite-service graph)))
18 |
(defn webtier
  "Checks for the Tornado Web Tier"
  []
  ;; Two HAProxy backend servers are expected to be active.
  (let [active_servers 2.0]
    (sdo
      ;; Notify Slack when the number of active tornado-web backend
      ;; servers drops below the expected count; :state encodes how many
      ;; servers remain (0 -> critical, 1 -> warning, 2 -> ok).
      (where (and (service "haproxy/gauge-backend.tornado-web.active_servers")
                  (< metric active_servers))
        (adjust #(assoc % :service "tornado-web active servers"
                        :type_instance nil
                        :state (condp = (:metric %)
                                 0.0 "critical"
                                 1.0 "warning"
                                 2.0 "ok"))
          ;; Only fire when the metric actually changes; seeding :init with
          ;; the expected count avoids an alert on startup.
          (changed :metric {:init active_servers}
            (slacker))))
      ;; Percentage of 5xx responses over total requests on the tornado-www
      ;; frontend: warn above 0.5%, critical above 1%.
      (check_ratio "haproxy/derive-frontend.tornado-www.response_5xx"
                   "haproxy/derive-frontend.tornado-www.request_total"
                   "haproxy.frontend.tornado-www.5xx_error_percentage"
                   0.5 1
                   (alert_graph)))))
39 |
(defn apptier
  "Checks for the Tornado App Tier"
  []
  (sdo
    ;; API price check: 666 appears to be the expected canary value from
    ;; the curl_json poll -- TODO confirm against the collectd config.
    ;; Slack on any other value; page if the check stops reporting.
    (where (service "curl_json-tornado-api/gauge-price")
      (where (!= metric 666)
        (slacker))
      (expired
        (page)))
    ;; Graph every tornado.api.* metric.
    (where (service #"^tornado.api.")
      (smap rewrite-service graph))
    ;; JVM heap usage as a percentage of max: warn at 80%, critical at 90%.
    (check_ratio "GenericJMX-memory-heap/memory-used"
                 "GenericJMX-memory-heap/memory-max"
                 "jmx.memory-heap.percentage_used"
                 80 90
                 (alert_graph))
    ;; Request rate: count each request event as 1 over 1-second windows.
    (where (service "tornado.api.request")
      (with { :service "tornado.api.request.rate" :metric 1 }
        (rate 1
          (smap rewrite-service graph))))
    ;; Request-time percentiles over 10-second windows; Slack when the
    ;; 99th percentile first crosses 100.0 (units per the upstream
    ;; app_request_time field -- milliseconds per the grok pattern).
    (check_percentiles "tornado.api.request" 10
      (smap rewrite-service graph)
      (where (and (service "tornado.api.request 0.99") (>= metric 100.0))
        (changed-state { :init "ok"}
          (slacker))))))
65 |
(defn datatier
  "Check for the Tornado Data Tier"
  []
  (sdo
    ;; MySQL connection usage as a percentage of max_connections:
    ;; warn at 80%, critical at 90%.
    (check_ratio "mysql-status/gauge-Max_used_connections"
                 "mysql-variables/gauge-max_connections"
                 "mysql.max_connection_percentage"
                 80 90
                 (alert_graph))
    ;; Rate of aborted MySQL connections over 5-second windows.
    (create_rate "mysql-status/counter-Aborted_connects" 5)
    ;; Insert-query-time percentiles over 10-second windows; Slack when
    ;; the 99th percentile first rises to 3.0 or above.
    (check_percentiles "dbi-performance_schema/gauge-insert_query_time" 10
      (smap rewrite-service graph)
      (where (and (service "dbi-performance_schema/gauge-insert_query_time 0.99") (>= metric 3.0))
        (changed-state { :init "ok"}
          (slacker))))))
81 |
(defn checks
  "Routes each Tornado event to the stream for its tier, matching on the
  event host; hosts that match no tier are logged and dropped."
  []
  (splitp re-matches host
    ;; Web tier: the proxy and both web servers.
    #"tornado-(proxy|web1|web2)" (webtier)
    ;; App tier: both API servers.
    #"tornado-(api1|api2)" (apptier)
    ;; Data tier: database and cache hosts.
    #"tornado-(db|redis)" (datatier)
    ;; Default branch: log anything unrecognised.
    (fn [event] (info "Catchall" (:host event)))))
94 |
--------------------------------------------------------------------------------
/11-13/riemann/examplecom/etc/checks.clj:
--------------------------------------------------------------------------------
(ns examplecom.etc.checks
  ;; FIX: check_ratio below uses folds/quotient-sloppy, but riemann.folds
  ;; was previously not required, so loading this namespace would fail.
  (:require [riemann.config :refer :all]
            [clojure.tools.logging :refer :all]
            [riemann.folds :as folds]
            [riemann.streams :refer :all]))
5 |
(defn set_state
  "Returns a function that stamps an event's :state from its :metric:
  above critical -> \"critical\", above warning -> \"warning\",
  otherwise \"ok\"."
  [warning critical]
  (fn [event]
    (let [metric (:metric event)]
      (assoc event :state
             (cond
               (< critical metric) "critical"
               (< warning metric)  "warning"
               :else               "ok")))))
13 |
(defn create_rate
  "For events with service srv, computes a rate over window seconds and
  graphs it under the service name \"<srv> rate\"."
  [srv window]
  (let [rate-service (str srv " rate")]
    (where (service srv)
      (with {:service rate-service}
        (rate window
          (smap rewrite-service graph))))))
18 |
(defn check_ratio
  "Checks the ratio between two events: pairs the latest events for
  services srv1 and srv2, emits their quotient as a percentage under the
  service name newsrv with :state derived from the warning/critical
  thresholds, and passes the new event to children."
  ;; FIX: the docstring was previously placed after the argument vector,
  ;; where it is just a discarded expression rather than a docstring.
  [srv1 srv2 newsrv warning critical & children]
  ;; project keeps the most recent event from each service so the fold
  ;; always sees both operands.
  (project [(service srv1)
            (service srv2)]
    ;; quotient-sloppy yields 0 instead of raising on division by zero.
    (smap folds/quotient-sloppy
      (fn [event] (let [percenta (* (float (:metric event)) 100)
                        new-event (assoc event :metric percenta
                                         :service (str newsrv)
                                         :type_instance nil
                                         :state (condp < percenta
                                                  critical "critical"
                                                  warning "warning"
                                                  "ok"))]
                    (call-rescue new-event children))))))
33 |
(defn check_threshold
  "Collects events for service srv into fixed time windows of window
  seconds, applies func to each window (e.g. a fold over the windowed
  events), and, for results above warning, sets :state from the
  warning/critical thresholds before passing the event to children.
  Results at or below warning are dropped."
  [srv window func warning critical & children]
  (where (service srv)
    (fixed-time-window window
      (smap func
        ;; Only events whose metric exceeds the warning threshold proceed.
        (where (< warning metric)
          (smap (set_state warning critical)
            (fn [event]
              (call-rescue event children))))))))
42 |
(defn check_percentiles
  "Emits the 0.5, 0.95, 0.99 and maximum (1) percentiles of service srv
  over window-second intervals, passing each percentile event to
  children."
  [srv window & children]
  (where (service srv)
    (percentiles window [0.5 0.95 0.99 1]
      (fn [event]
        (call-rescue event children)))))
48 |
--------------------------------------------------------------------------------
/11-13/riemann/examplecom/etc/collectd.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.collectd
2 | (:require [clojure.tools.logging :refer :all]
3 | [riemann.streams :refer :all]
4 | [clojure.string :as str]
5 | [clojure.walk :as walk]))
6 |
(defn docker-attribute-map
  "Parses a bracketed attribute string like \"name[k1=v1,k2=v2]\" into a
  map of keywordised keys, e.g. {:k1 \"v1\" :k2 \"v2\"}."
  [attributes]
  (let [inner (str/replace attributes #"^.*\[(.*)\]$" "$1")
        pairs (str/split inner #",")
        kv-maps (map #(apply hash-map (str/split % #"=")) pairs)]
    (walk/keywordize-keys (into {} kv-maps))))
11 |
(defn docker-attributes
  "If the event's :plugin_instance ends in a bracketed attribute list,
  merges the parsed attributes into the event; otherwise returns the
  event unchanged."
  [{:keys [plugin_instance] :as event}]
  (let [attributes (re-find #"^.*\[.*\]$" plugin_instance)]
    (if attributes
      (merge event (docker-attribute-map attributes))
      event)))
17 |
(defn parse-docker-service-host
  "Derives :host from the leading dotted name in :plugin_instance and
  :service from :type, appending \".type_instance\" when present."
  [{:keys [type type_instance plugin_instance] :as event}]
  (let [host (re-find #"^\w+\.?\w+\.?\w+" plugin_instance)
        service (if type_instance
                  (str type "." type_instance)
                  (str type))]
    (assoc event :service service :host host)))
23 |
(def default-services
  ;; Rewrite rules consumed by rewrite-service-with. Rules are tried in
  ;; order and the first whose :service pattern matches wins; $1/$2/...
  ;; in :rewrite refer to the regex capture groups.
  [{:service #"^load/load/(.*)$" :rewrite "load $1"}
   {:service #"^swap/percent-(.*)$" :rewrite "swap $1"}
   {:service #"^memory/percent-(.*)$" :rewrite "memory $1"}
   {:service #"^processes/ps_state-(.*)$" :rewrite "processes $1"}
   {:service #"^processes-(.*)/(.*)$" :rewrite "processes $1 $2"}
   {:service #"^cpu/percent-(.*)$" :rewrite "cpu $1"}
   {:service #"^df-(.*)/(df_complex|percent_bytes)-(.*)$" :rewrite "df $1 $2 $3"}
   {:service #"^interface-(.*)/if_(errors|packets|octets)/(tx|rx)$" :rewrite "nic $1 $3 $2"}
   {:service #"^protocols-(.*)/(.*)$" :rewrite "protocols $1 $2"}
   {:service #"^GenericJMX-(:?_|\/)?(.*)$" :rewrite "jmx $2"}
   {:service #"^haproxy\/(gauge|derive)-(.*)$" :rewrite "haproxy $2"}
   {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "$2"}
   {:service #"^mysql-(.*)\/(counter|gauge)-(.*)$" :rewrite "mysql $1 $3"}
   {:service #"^dbi-(.*)\/(gauge|counter)-(.*)$" :rewrite "dbi $1 $3"}
   {:service #"^redis-(.*)$" :rewrite "redis $1"}])
40 |
(defn rewrite-service-with
  "Returns a function that rewrites an event's :service using `rules`.

  Each rule has a :service matcher (a string for exact equality, a
  regex for re-find) and a :rewrite template. The first matching rule
  is applied: a string matcher replaces the service wholesale with
  :rewrite, while a regex matcher uses str/replace so $1-style capture
  references in :rewrite are substituted. Events matching no rule pass
  through unchanged."
  [rules]
  (let [matcher (fn [s1 s2] (if (string? s1) (= s1 s2) (re-find s1 s2)))]
    (fn [{:keys [service] :as event}]
      (or
        (first
          (for [{:keys [rewrite] :as rule} rules
                :when (matcher (:service rule) service)]
            (assoc event :service
              (if (string? (:service rule))
                rewrite
                (str/replace service (:service rule) rewrite)))))
        event))))
54 |
(def rewrite-service
  ;; Service-rewriting function built from the default-services rules.
  (rewrite-service-with default-services))
57 |
--------------------------------------------------------------------------------
/11-13/riemann/examplecom/etc/count-notifications.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.count-notifications
2 | (:require [riemann.streams :refer :all]))
3 |
(defn count-notifications
  "Count notifications.

  Renames the service to \"<service>.rate\", tags the event
  \"notification-rate\", turns the event stream into a rate over
  5-second intervals and forwards the result to `children`."
  [& children]
  (adjust [:service #(str % ".rate")]
    (tag "notification-rate"
      (rate 5
        (fn [event]
          (call-rescue event children))))))
12 |
--------------------------------------------------------------------------------
/11-13/riemann/examplecom/etc/email.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.email
2 | (:require [clojure.string :as str]
3 | [riemann.email :refer :all]))
4 |
(defn format-subject
  "Format the email subject.

  Joins the :service, :state and :host values of all `events` with
  commas and interpolates them into a single subject line.
  BUG FIX: the original used `apply`, which spread the host sequence
  into extra format arguments — only the first host was printed, and
  an empty event seq threw MissingFormatArgumentException. Hosts are
  now joined exactly like services and states."
  [events]
  (format "Service %s is in state %s on host %s"
          (str/join ", " (map :service events))
          (str/join ", " (map :state events))
          (str/join ", " (map :host events))))
9 |
10 | (def header "Monitoring notification from Riemann!\n\n")
11 | (def footer "This is an automated Riemann notification. Please do not reply.")
12 |
(defn lookup
  "Lookup events in the index.

  Returns the indexed event for `host`/`service` from Riemann's core
  index, or nil when nothing is indexed for that pair."
  [host service]
  (riemann.index/lookup (:index @riemann.config/core) host service))
17 |
(defn round
  "Round numbers to 2 decimal places.

  Returns a string (cl-format's ~,2f directive), not a number."
  [metric]
  (clojure.pprint/cl-format nil "~,2f" metric))
22 |
23 | (defn byte-to-gb [bytes] (/ bytes (* 1024.0 1024.0 1024.0)))
24 |
(defn context
  "Add some contextual event data.

  Builds a host-context string (CPU, memory and root-disk usage) by
  looking up the host's collectd metrics in the Riemann index, plus a
  Grafana dashboard link for the host.
  NOTE(review): if any lookup returns nil (metric not yet indexed),
  (:metric nil) is nil and the arithmetic below will throw — confirm
  this only runs for hosts with indexed collectd data."
  [event]
  (str
    "Host context:\n"
    " CPU Utilization:\t"(round (+ (:metric (lookup (:host event) "cpu/percent-system")) (:metric (lookup (:host event) "cpu/percent-user")))) "%\n"
    " Memory Used:\t"(round (:metric (lookup (:host event) "memory/percent-used"))) "%\n"
    " Disk(root) %:\t\t"(round (:metric (lookup (:host event) "df-root/percent_bytes-used"))) "% used "
    " ("(round (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-used")))) " GB used of "
    (round (+ (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-used")))
              (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-free")))
              (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-reserved"))))) "GB)\n\n"
    "Grafana Dashboard:\n\n"
    " http://graphitea.example.com:3000/dashboard/script/riemann.js?host="(:host event)"\n\n"))
39 |
(defn format-body
  "Format the email body.

  Renders one section per event (header, timestamp, host, service,
  state, metric, tags, description, host context, footer) and joins
  the sections with blank lines. Ratio metrics are converted to
  doubles so they print as decimals."
  [events]
  (str/join "\n\n\n"
    (map
      (fn [event]
        (str
          header
          "Time:\t\t" (riemann.common/time-at (:time event)) "\n"
          "Host:\t\t" (:host event) "\n"
          "Service:\t\t" (:service event) "\n"
          "State:\t\t" (:state event) "\n"
          "Metric:\t\t" (if (ratio? (:metric event))
                          (double (:metric event))
                          (:metric event)) "\n"
          "Tags:\t\t[" (str/join ", " (:tags event)) "] \n"
          "\n"
          "Description:\t\t" (:description event)
          "\n\n"
          (context event)
          footer))
      events)))
62 |
(def email
  ;; Email notifier using our subject/body formatters. The formatter
  ;; functions are passed directly — wrapping them in (fn [events] ...)
  ;; as the original did is equivalent but redundant.
  (mailer {:from "riemann@example.com"
           :subject format-subject
           :body format-body}))
67 |
--------------------------------------------------------------------------------
/11-13/riemann/examplecom/etc/graphite.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.graphite
2 | (:require [clojure.string :as str]
3 | [riemann.config :refer :all]
4 | [riemann.graphite :refer :all]))
5 |
(defn graphite-path-statsd
  "Build a Graphite path for a statsd-sourced event.

  Keeps the leading \"app.\" prefix of :service, then appends the
  reversed host components and the remaining space-separated service
  words joined with dots, e.g. host \"a.example.com\" and service
  \"app.foo bar\" yield \"app.com.example.a.foo.bar\"."
  [event]
  (let [host (:host event)
        app (re-find #"^.*?\." (:service event))
        service (str/replace-first (:service event) #"^.*?\." "")
        split-host (if host (str/split host #"\.") [])
        split-service (if service (str/split service #" ") [])]
    (str app, (str/join "." (concat (reverse split-host) split-service)))))
13 |
(defn add-environment-to-graphite
  "Prefix the Graphite path according to the event's origin plugin:
  docker events go under productiona.docker (keyed by the
  com.example.application container label when present), statsd events
  under productiona using statsd-style paths, and everything else
  under productiona.hosts."
  [event]
  (condp = (:plugin event)
    "docker"
    (if (:com.example.application event)
      (str "productiona.docker.", (:com.example.application event), ".", (riemann.graphite/graphite-path-percentiles event))
      (str "productiona.docker.", (riemann.graphite/graphite-path-percentiles event)))
    "statsd" (str "productiona.", (graphite-path-statsd event))
    (str "productiona.hosts.", (riemann.graphite/graphite-path-percentiles event))))
22 |
(def graph
  ;; Graphite writer behind a bounded (1000 event) async queue so slow
  ;; Graphite writes cannot block the stream processing threads.
  (async-queue! :graphite {:queue-size 1000}
    (graphite {:host "graphitea" :path add-environment-to-graphite})))
25 |
--------------------------------------------------------------------------------
/11-13/riemann/examplecom/etc/logstash.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.logstash
2 | (:require [riemann.logstash :refer :all]))
3 |
(def logstash
  ;; Logstash forwarder behind a bounded (1000 event) async queue.
  ;; NOTE(review): riemann.logstash documents a :pool-size option;
  ;; :port-size here looks like a typo and would be ignored — confirm
  ;; against the riemann logstash plugin documentation.
  (async-queue! :logstash {:queue-size 1000}
    (logstash {:host "logstash" :port 2003 :port-size 20})))
6 |
--------------------------------------------------------------------------------
/11-13/riemann/examplecom/etc/maintenance.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.maintenance
2 | (:require [riemann.streams :refer :all]))
3 |
(defn maintenance-mode?
  "Is it currently in maintenance mode?

  Searches the Riemann index for a maintenance-mode event matching this
  event's host and service and returns true when its :state is
  \"active\".
  NOTE(review): the query form is quoted with ', so (:host event) and
  (:service event) are passed to riemann.index/search as literal lists
  rather than evaluated values — verify the search call really
  substitutes these; otherwise this predicate can never match."
  [event]
  (->> '(and (= host (:host event))
             (= service (:service event))
             (= (:type event) "maintenance-mode"))
       (riemann.index/search (:index @core))
       first
       :state
       (= "active")))
14 |
--------------------------------------------------------------------------------
/11-13/riemann/examplecom/etc/pagerduty.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.pagerduty
2 | (:require [riemann.pagerduty :refer :all]
3 | [riemann.streams :refer :all]))
4 |
(defn pd-format
  "Build the PagerDuty payload for an event: incident key, a one-line
  description, and the full event (plus a Grafana link) as details."
  [event]
  (let [{:keys [host service state metric]} event
        graph-url (str "http://graphitea.example.com:3000/dashboard/script/riemann.js?host=" host)]
    {:incident_key (str host " " service)
     :description (str "Host: " host " "
                       service " is "
                       state " ("
                       metric ")")
     :details (assoc event :graphs graph-url)}))
13 |
14 | (def pd (pagerduty { :service-key "123ABC123" :formatter pd-format}))
15 |
(defn page
  "Page on state changes: resolve the PagerDuty incident when a
  service returns to ok, trigger (or re-trigger) it on any other
  state change."
  []
  (changed-state {:init "ok"}
    (where (state "ok")
      (:resolve pd)
      (else (:trigger pd)))))
22 |
--------------------------------------------------------------------------------
/11-13/riemann/examplecom/etc/slack.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.slack
2 | (:require [riemann.slack :refer :all]))
3 |
4 | (def credentials {:account "examplecom", :token "123ABC123ABC"})
5 |
(defn slack-format
  "Format our Slack message"
  [event]
  (let [{:keys [service host state]} event]
    (str "Service " service " on host " host " is in state " state ".\n"
         "See http://graphitea.example.com:3000/dashboard/script/riemann.js?host=" host)))
11 |
(defn slacker
  "Send notifications to Slack.

  Optional :recipient keyword argument selects the channel; defaults
  to \"#monitoring\". Messages are posted as \"Riemann bot\" using
  slack-format for the text."
  [& {:keys [recipient]
      :or {recipient "#monitoring"}}]
  (slack credentials {:username "Riemann bot"
                      :channel recipient
                      :formatter (fn [e] { :text (slack-format e) } )
                      :icon ":smile:"}))
20 |
--------------------------------------------------------------------------------
/11-13/riemann/riemann.config:
--------------------------------------------------------------------------------
1 | (logging/init {:file "/var/log/riemann/riemann.log"})
2 |
3 | (require 'riemann.client)
4 | (require '[examplecom.etc.email :refer :all])
5 | (require '[examplecom.etc.graphite :refer :all])
6 | (require '[examplecom.etc.collectd :refer :all])
7 | (require '[examplecom.app.tornado :as tornado])
8 |
;; Listen on all interfaces for TCP/UDP/websocket events; the REPL
;; server stays loopback-only.
(let [host "0.0.0.0"]
  (repl-server {:host "127.0.0.1"})
  (tcp-server {:host host})
  (udp-server {:host host})
  (ws-server {:host host}))
14 |
15 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]})
16 |
(let [index (index)
      ;; Batch up to 100 events (or every 1/10s) and forward them to the
      ;; riemannmc aggregation host via a bounded async queue so
      ;; forwarding cannot block stream processing.
      downstream (batch 100 1/10
                   (async-queue! :agg { :queue-size 1e3
                                        :core-pool-size 4
                                        :max-pool-size 32}
                     (forward
                       (riemann.client/tcp-client :host "riemannmc"))))]

  ; Inbound events will be passed to these streams:
  (streams
    (default :ttl 60
      ; Index all events immediately.
      ; (notification events are synthetic and are kept out of the index)
      (where (not (tagged "notification"))
        index)

      ; Tornado application checks live in examplecom.app.tornado.
      (tagged "tornado"
        (tornado/checks))

      ; collectd events: rewrite service names, graph them, and raise
      ; notifications for process state changes and expirations.
      (tagged "collectd"
        (where (not (= (:plugin event) "docker"))
          (smap rewrite-service graph))

        ; Docker events also need host/service derived from the
        ; plugin_instance and container attributes merged in.
        (where (= (:plugin event) "docker")
          (smap (comp parse-docker-service-host docker-attributes rewrite-service) graph))

        (tagged "notification"
          (changed-state {:init "ok"}
            (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count$" "$1"]
              (email "james@example.com"))))

        ; A vanished process stops reporting: alert when its ps_count
        ; events expire from the index.
        (where (and (expired? event)
                    (service #"^processes-.+\/ps_count\/processes"))
          (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count\/processes$" "$1"]
            (email "james@example.com"))))

      ; Graph Riemann's own metrics and forward them downstream.
      (where (service #"^riemann.*")
        graph

        downstream))))
56 |
--------------------------------------------------------------------------------
/11-13/rsyslog/35-aom-clojure-rest.conf:
--------------------------------------------------------------------------------
1 | module(load="imfile" PollingInterval="10")
2 |
3 | input(type="imfile"
4 | File="/var/log/aom-clojure-rest.log"
5 | StateFile="aom_clojure_rest"
6 | Tag="aom-clojure-rest:"
7 | Severity="info"
8 | Facility="local7")
9 |
--------------------------------------------------------------------------------
/3/collectd/riemann.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin processes
2 |
3 | ProcessMatch riemann "riemann.bin start"
4 |
5 |
6 | LoadPlugin java
7 |
8 | JVMARG "-Djava.class.path=/usr/share/collectd/java/collectd-api.jar:/usr/share/collectd/java/generic-jmx.jar"
9 | LoadPlugin "org.collectd.java.GenericJMX"
10 |
11 |
12 | ObjectName "java.lang:type=GarbageCollector,*"
13 | InstancePrefix "gc-"
14 | InstanceFrom "name"
15 |
16 | Type "derive"
17 | Table false
18 | Attribute "CollectionCount"
19 | InstancePrefix "count"
20 |
21 |
22 |
23 | ObjectName "java.lang:type=GarbageCollector,*"
24 | InstancePrefix "gc-"
25 | InstanceFrom "name"
26 |
27 | Type "derive"
28 | Table false
29 | Attribute "CollectionTime"
30 | InstancePrefix "time"
31 |
32 |
33 |
34 | ObjectName "java.lang:type=MemoryPool,*"
35 | InstancePrefix "memory_pool-"
36 | InstanceFrom "name"
37 |
38 | Type "memory"
39 | Table true
40 | Attribute "Usage"
41 |
42 |
43 |
44 | ObjectName "java.lang:type=Memory"
45 | InstancePrefix "memory-heap"
46 |
47 | Type "memory"
48 | Table true
49 | Attribute "HeapMemoryUsage"
50 |
51 |
52 |
53 | ObjectName "java.lang:type=Memory"
54 | InstancePrefix "memory-nonheap"
55 |
56 | Type "memory"
57 | Table true
58 | Attribute "NonHeapMemoryUsage"
59 |
60 |
61 |
62 | ObjectName "java.lang:type=Threading"
63 | InstancePrefix "threading"
64 |
65 | Type "gauge"
66 | Table false
67 | Attribute "ThreadCount"
68 | InstancePrefix "count"
69 |
70 |
71 |
72 | ObjectName "java.lang:type=Threading"
73 | InstancePrefix "threading"
74 |
75 | Type "gauge"
76 | Table false
77 | Attribute "DaemonThreadCount"
78 | InstancePrefix "count-daemon"
79 |
80 |
81 |
82 | ServiceURL "service:jmx:rmi:///jndi/rmi://localhost:8855/jmxrmi"
83 | Collect "memory_pool"
84 | Collect "memory-heap"
85 | Collect "memory-nonheap"
86 | Collect "gc-count"
87 | Collect "gc-time"
88 | Collect "thread"
89 | Collect "thread-daemon"
90 |
91 |
92 |
93 |
--------------------------------------------------------------------------------
/3/riemann/examplecom/etc/email.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.email
2 | (:require [riemann.email :refer :all]))
3 |
4 | (def email (mailer {:from "riemann@example.com"}))
5 |
--------------------------------------------------------------------------------
/3/riemann/riemann.config:
--------------------------------------------------------------------------------
1 | (logging/init {:file "/var/log/riemann/riemann.log"})
2 |
3 | (require 'riemann.client)
4 | (require '[examplecom.etc.email :refer :all])
5 |
6 | (let [host "0.0.0.0"]
7 | (repl-server {:host "127.0.0.1"})
8 | (tcp-server {:host host})
9 | (udp-server {:host host})
10 | (ws-server {:host host}))
11 |
12 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]})
13 |
(let [index (index)
      ; Batch events and forward them to the riemannmc aggregation host
      ; through a bounded async queue so forwarding can't block streams.
      downstream (batch 100 1/10
                   (async-queue! :agg { :queue-size 1e3
                                        :core-pool-size 4
                                        :max-pool-size 32}
                     (forward
                       (riemann.client/tcp-client :host "riemannmc"))))]

  ; Inbound events will be passed to these streams:
  (streams
    (default :ttl 60
      ; Index all events immediately.
      index

      ; Send all events to the log file.
      ;#(info %)
      ; BUG FIX: the where test must be (> metric 0.10); the original
      ; (metric > 0.10) is not valid where-syntax and would attempt to
      ; call the event's metric value as a function at runtime.
      (where (and (service "disk /") (> metric 0.10))
        #(info "Disk space on / is over 10%!" %))

      (where (service #"^riemann.*")

        downstream))))
36 |
--------------------------------------------------------------------------------
/3/riemann/riemann.config_riemannmc:
--------------------------------------------------------------------------------
1 | (logging/init {:file "/var/log/riemann/riemann.log"})
2 |
3 | (require 'riemann.client)
4 | (require '[examplecom.etc.email :refer :all])
5 |
6 | (let [host "0.0.0.0"]
7 | (repl-server {:host "127.0.0.1"})
8 | (tcp-server {:host host})
9 | (udp-server {:host host})
10 | (ws-server {:host host}))
11 |
12 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]})
13 |
(let [index (index)]

  (streams
    (default :ttl 60
      ; Index all events immediately.
      index

      ; Notify (at most once per two minutes) when Riemann's own
      ; metrics expire from the index.
      ; BUG FIX: (email ...) must be a child stream of where, not an
      ; extra argument inside the (service ...) predicate; the original
      ; also left the let and streams forms unclosed.
      (expired
        (throttle 1 120
          (where (service #"^riemann.*")
            (email "james@example.com")))))))
25 |
--------------------------------------------------------------------------------
/4/collectd/carbon.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin processes
2 |
3 |
4 | ProcessMatch "carbon-cache" "python.+carbon-cache"
5 | ProcessMatch "carbon-relay" "python.+carbon-relay"
6 |
7 |
8 |
9 | ProcessMatch "graphite-api" "graphite_api.app:app"
10 |
11 |
12 |
13 |
14 | Instance "carbon-cache"
15 |
16 | DataSource "processes"
17 | WarningMin 2
18 | FailureMin 1
19 |
20 | Instance "carbon-relay"
21 |
22 | DataSource "processes"
23 | FailureMin 1
24 |
25 | Instance "graphite-api"
26 |
27 | DataSource "processes"
28 | FailureMin 2
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/4/collectd/grafana.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin processes
2 |
<Plugin processes>
  Process "grafana-server"
</Plugin>
6 |
--------------------------------------------------------------------------------
/4/graphite/carbon-cache-ubuntu.init:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Initscript for carbon-cache processes
4 | # Jason Dixon
5 | #
6 | # You must set the variables below. The
7 | # CACHE_INSTANCES variable should be set to the
8 | # number of carbon-cache instances you have
9 | # configured in your carbon.conf. Note that
10 | # they must be numerically indexed from 1.
11 | # (e.g. [cache:1], [cache:2], [cache:3]
12 |
13 | ### BEGIN INIT INFO
14 | # Provides: carbon-cache
15 | # Required-Start: $remote_fs $syslog
16 | # Required-Stop: $remote_fs $syslog
17 | # Default-Start: 2 3 4 5
18 | # Default-Stop: 0 1 6
19 | # Short-Description: Start carbon-cache daemon at boot time
20 | # Description: Runs the carbon-cache daemon.
21 | ### END INIT INFO
22 |
23 | PATH=/sbin:/usr/sbin:/bin:/usr/bin
24 | NAME=carbon-cache
25 | DAEMON=/usr/bin/$NAME
26 | DAEMON_ARGS="--config=/etc/carbon/carbon.conf --logdir=/var/log/carbon/"
27 | SCRIPTNAME=/etc/init.d/$NAME
28 | PID_DIR=/var/run
29 |
30 | set -e
31 |
32 | test -x $DAEMON || exit 0
33 |
34 | [ -r /etc/default/graphite-carbon ] && . /etc/default/graphite-carbon
35 |
36 | case "$1" in
37 |
38 | start)
39 | for INSTANCE in $(seq 1 $CACHE_INSTANCES); do
40 | echo -n "Starting ${NAME}-${INSTANCE}: "
41 | PID="${PID_DIR}/${NAME}-${INSTANCE}.pid"
42 | if start-stop-daemon --start --quiet --pidfile $PID --exec $DAEMON -- $DAEMON_ARGS start --pidfile=$PID --instance=${INSTANCE}
43 | then
44 | echo "succeeded"
45 | else
46 | echo "failed"
47 | fi
48 | done
49 | ${0} status
50 | ;;
51 |
52 | stop)
53 | for INSTANCE in $(seq 1 $CACHE_INSTANCES); do
54 | echo -n "Stopping ${NAME}-${INSTANCE}: "
55 | PID="${PID_DIR}/${NAME}-${INSTANCE}.pid"
56 | $DAEMON stop $DAEMON_ARGS --pidfile=$PID --instance=${INSTANCE}
57 | echo "stopped"
58 | rm -f $PID
59 | done
60 | exit 0
61 | ;;
62 |
63 | restart)
64 | ${0} stop
65 | ${0} start
66 | ;;
67 |
68 | status)
69 | for INSTANCE in $(seq 1 $CACHE_INSTANCES); do
70 | if [ -f "${PID_DIR}/${NAME}-${INSTANCE}.pid" ]; then
71 | PID=`cat "${PID_DIR}/${NAME}-${INSTANCE}.pid"`
72 |
73 | echo -n "${NAME}-${INSTANCE} (pid: $PID): "
74 | if ps -p $PID >/dev/null; then
75 | echo "running"
76 | else
77 | echo "failed"
78 | fi
79 | else
80 | echo "${NAME}-${INSTANCE} not running"
81 | fi
82 | done
83 | for INSTANCE in $(seq 1 $CACHE_INSTANCES); do
84 | if [ ! -f "${PID_DIR}/${NAME}-${INSTANCE}.pid" ]; then
85 | exit 1
86 | fi
87 | done
88 | exit 0
89 | ;;
90 |
91 | *)
92 | echo "Usage: /etc/init.d/${NAME} {start|stop|restart|status}" >%2
93 | exit 1
94 | ;;
95 |
96 | esac
97 |
98 | exit 0
99 |
--------------------------------------------------------------------------------
/4/graphite/carbon-cache@.service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=carbon-cache instance %i (graphite)
3 |
4 | [Service]
5 | ExecStartPre=/bin/rm -f /var/run/carbon-cache-%i.pid
6 | ExecStart=/usr/bin/carbon-cache --config=/etc/carbon/carbon.conf --pidfile=/var/run/carbon-cache-%i.pid --logdir=/var/log/carbon/ --instance=%i start
7 | Type=forking
8 | PIDFile=/var/run/carbon-cache-%i.pid
9 |
10 | [Install]
11 | WantedBy=multi-user.target
12 |
--------------------------------------------------------------------------------
/4/graphite/carbon-relay-ubuntu.init:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Initscript for carbon-relay processes
4 | # Jason Dixon
5 | #
6 | # You must set the variables below. The
7 | # RELAY_INSTANCES variable should be set to the
8 | # number of carbon-relay instances you have
9 | # configured in your carbon.conf. Note that
10 | # they must be numerically indexed from 1.
11 | # (e.g. [relay:1], [relay:2], [relay:3]
12 |
13 | ### BEGIN INIT INFO
14 | # Provides: carbon-relay
15 | # Required-Start: $remote_fs $syslog
16 | # Required-Stop: $remote_fs $syslog
17 | # Default-Start: 2 3 4 5
18 | # Default-Stop: 0 1 6
19 | # Short-Description: Start carbon-relay daemon at boot time
20 | # Description: Runs the carbon-relay daemon.
21 | ### END INIT INFO
22 |
23 | PATH=/sbin:/usr/sbin:/bin:/usr/bin
24 | NAME=carbon-relay
25 | DAEMON=/usr/bin/$NAME
26 | DAEMON_ARGS="--config=/etc/carbon/carbon.conf --logdir=/var/log/carbon/"
27 | SCRIPTNAME=/etc/init.d/$NAME
28 | PID_DIR=/var/run
29 |
30 | set -e
31 |
32 | test -x $DAEMON || exit 0
33 |
34 | [ -r /etc/default/graphite-carbon ] && . /etc/default/graphite-carbon
35 |
36 | case "$1" in
37 |
38 | start)
39 | for INSTANCE in $(seq 1 $RELAY_INSTANCES); do
40 | echo -n "Starting ${NAME}-${INSTANCE}: "
41 | PID="${PID_DIR}/${NAME}-${INSTANCE}.pid"
42 | if start-stop-daemon --start --quiet --pidfile $PID --exec $DAEMON -- $DAEMON_ARGS start --pidfile=$PID --instance=${INSTANCE}
43 | then
44 | echo "succeeded"
45 | else
46 | echo "failed"
47 | fi
48 | done
49 | ${0} status
50 | ;;
51 |
52 | stop)
53 | for INSTANCE in $(seq 1 $RELAY_INSTANCES); do
54 | echo -n "Stopping ${NAME}-${INSTANCE}: "
55 | PID="${PID_DIR}/${NAME}-${INSTANCE}.pid"
56 | $DAEMON stop $DAEMON_ARGS --pidfile=$PID --instance=${INSTANCE}
57 | echo "stopped"
58 | rm -f $PID
59 | done
60 | exit 0
61 | ;;
62 |
63 | restart)
64 | ${0} stop
65 | ${0} start
66 | ;;
67 |
68 | status)
69 | for INSTANCE in $(seq 1 $RELAY_INSTANCES); do
70 | if [ -f "${PID_DIR}/${NAME}-${INSTANCE}.pid" ]; then
71 | PID=`cat "${PID_DIR}/${NAME}-${INSTANCE}.pid"`
72 |
73 | echo -n "${NAME}-${INSTANCE} (pid: $PID): "
74 | if ps -p $PID >/dev/null; then
75 | echo "running"
76 | else
77 | echo "failed"
78 | fi
79 | else
80 | echo "${NAME}-${INSTANCE} not running"
81 | fi
82 | done
83 | for INSTANCE in $(seq 1 $RELAY_INSTANCES); do
84 | if [ ! -f "${PID_DIR}/${NAME}-${INSTANCE}.pid" ]; then
85 | exit 1
86 | fi
87 | done
88 | exit 0
89 | ;;
90 |
91 | *)
92 | echo "Usage: /etc/init.d/${NAME} {start|stop|restart|status}" >%2
93 | exit 1
94 | ;;
95 |
96 | esac
97 |
98 | exit 0
99 |
--------------------------------------------------------------------------------
/4/graphite/carbon-relay@.service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=carbon-relay instance %i (graphite)
3 |
4 | [Service]
5 | ExecStartPre=/bin/rm -f /var/run/carbon-relay-%i.pid
6 | ExecStart=/usr/bin/carbon-relay --config=/etc/carbon/carbon.conf --pidfile=/var/run/carbon-relay-%i.pid --logdir=/var/log/carbon/ --instance=%i start
7 | Type=forking
8 | PIDFile=/var/run/carbon-relay-%i.pid
9 |
10 | [Install]
11 | WantedBy=multi-user.target
12 |
--------------------------------------------------------------------------------
/4/graphite/carbon.conf:
--------------------------------------------------------------------------------
1 | [cache]
2 | STORAGE_DIR = /var/lib/graphite/
3 | CONF_DIR = /etc/carbon/
4 | LOG_DIR = /var/log/carbon/
5 | PID_DIR = /var/run/
6 | LOCAL_DATA_DIR = /var/lib/graphite/whisper/
7 | USER = _graphite
8 | ENABLE_LOGROTATION = True
9 | LINE_RECEIVER_INTERFACE = 127.0.0.1
10 | PICKLE_RECEIVER_INTERFACE = 127.0.0.1
11 | CACHE_QUERY_INTERFACE = 127.0.0.1
12 | LOG_UPDATES = False
13 | LOG_CACHE_HITS = False
14 |
15 | [cache:1]
16 | LINE_RECEIVER_PORT = 2013
17 | PICKLE_RECEIVER_PORT = 2014
18 | CACHE_QUERY_PORT = 7012
19 |
20 | [cache:2]
21 | LINE_RECEIVER_PORT = 2023
22 | PICKLE_RECEIVER_PORT = 2024
23 | CACHE_QUERY_PORT = 7022
24 |
25 | [relay]
26 | USER = _graphite
27 | LINE_RECEIVER_INTERFACE = 0.0.0.0
28 | LINE_RECEIVER_PORT = 2003
29 | PICKLE_RECEIVER_INTERFACE = 0.0.0.0
30 | PICKLE_RECEIVER_PORT = 2004
31 | RELAY_METHOD = consistent-hashing
32 | REPLICATION_FACTOR = 1
33 | DESTINATIONS = 127.0.0.1:2014:1, 127.0.0.1:2024:2
34 |
--------------------------------------------------------------------------------
/4/graphite/graphite-api.service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=graphite-api (graphite)
3 |
4 | [Service]
5 | ExecStartPre=/bin/rm -f /var/run/graphite-api.pid
6 | ExecStart=/usr/bin/gunicorn --pid /var/run/graphite-api.pid -b 0.0.0.0:8888 -w 2 --daemon graphite_api.app:app
7 | Type=forking
8 | PIDFile=/var/run/graphite-api.pid
9 |
10 | [Install]
11 | WantedBy=multi-user.target
12 |
--------------------------------------------------------------------------------
/4/graphite/graphite-api.yaml:
--------------------------------------------------------------------------------
1 | search_index: /var/lib/graphite/api_search_index
2 | finders:
3 | - graphite_api.finders.whisper.WhisperFinder
4 | functions:
5 | - graphite_api.functions.SeriesFunctions
6 | - graphite_api.functions.PieFunctions
7 | whisper:
8 | directories:
9 | - /var/lib/graphite/whisper
10 | carbon:
11 | hosts:
12 | - 127.0.0.1:7012
13 | - 127.0.0.1:7022
14 | timeout: 1
15 | retry_delay: 15
16 | carbon_prefix: carbon
17 | replication_factor: 1
18 | time_zone: UTC
19 |
--------------------------------------------------------------------------------
/4/graphite/graphite-carbon.default:
--------------------------------------------------------------------------------
1 | # Change to true, to enable carbon-cache on boot
2 | CARBON_CACHE_ENABLED=true
3 | RELAY_INSTANCES=1
4 | CACHE_INSTANCES=2
5 |
--------------------------------------------------------------------------------
/4/graphite/local_settings.py:
--------------------------------------------------------------------------------
## Graphite local_settings.py
# Edit this file to customize the default Graphite webapp settings
#
# Additional customizations to Django settings can be added to this file as well

# BUG FIX: the original misspelled this as SECRET_KET. Django only
# reads SECRET_KEY, so the typo left the webapp running with the
# package's insecure default key.
SECRET_KEY = 'tqIaQJEnthL5zVRKgBYR4KkcSzks98F55LRKdZo821tC9pwvCr7Bf5edqTIcr2Gemmttr3FXTMCofzH0zdEaNHpcCstiN7zFZxuUeCxB7rHLbD7VYwlh0gGSstIgkMyvYXLHc6bnwlClioGNI0GFdaVg8xrfnD9gr7W0ESL5O9luLVrRLwpLbZKoEV93DXwMBINTqXemgupVFJnBdUhWZMFfWzRiNDr0pvCawFl5ZVC7Y8fVXy4dj7hSOGzumV9i'
TIME_ZONE = 'America/New_York'
USE_REMOTE_USER_AUTHENTICATION = True
# Verbose graphite-web logging (rendering, cache and metric access).
LOG_RENDERING_PERFORMANCE = True
LOG_CACHE_PERFORMANCE = True
LOG_METRIC_ACCESS = True
# Filesystem layout for the graphite-web installation.
GRAPHITE_ROOT = '/usr/share/graphite-web'
CONF_DIR = '/etc/graphite'
STORAGE_DIR = '/var/lib/graphite/whisper'
CONTENT_DIR = '/usr/share/graphite-web/static'
WHISPER_DIR = '/var/lib/graphite/whisper'
LOG_DIR = '/var/log/graphite'
INDEX_FILE = '/var/lib/graphite/search_index'  # Search index file

# Graphite-web metadata database (PostgreSQL on localhost).
DATABASES = {
    'default': {
        'NAME': 'graphite',
        'ENGINE': 'django.db.backends.postgresql_psycopg2',
        'USER': 'graphite',
        'PASSWORD': 'strongpassword',
        'HOST': '127.0.0.1',
        'PORT': ''
    }
}
30 |
--------------------------------------------------------------------------------
/4/graphite/whisper-calculator.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
## Original source - https://gist.github.com/jjmaestro/5774063
5 |
6 |
def archive_to_bytes(archive):
    """Calculate the on-disk size in bytes of a whisper archive spec.

    `archive` is a storage-schemas.conf style string such as
    "1m:24h,5m:3m" — comma-separated resolution:retention pairs where
    each duration is an integer suffixed with s/m/h/d/y.
    Returns 0 for a spec describing no points.
    """
    def to_seconds(s):
        # Convert a "10m"-style duration into seconds.
        SECONDS_IN_A = {
            's': 1,
            'm': 1 * 60,
            'h': 1 * 60 * 60,
            'd': 1 * 60 * 60 * 24,
            'y': 1 * 60 * 60 * 24 * 365,
        }

        return int(s[:-1]) * SECONDS_IN_A[s[-1]]

    # BUG FIX: the original read the global ``args.archive`` instead of
    # the ``archive`` parameter, so the function only worked when called
    # from this script's __main__ block.
    archives = [[to_seconds(part) for part in point.split(':')]
                for point in archive.split(',')]

    # Header/record sizes from the whisper file format.
    SIZE_METADATA = 2 * 4 + 4 + 4  # 16 [!2LfL]
    SIZE_ARCHIVE_INFO = 3 * 4      # 12 [!3L]+
    SIZE_POINT = 4 + 8             # 12 [!Ld]+

    size = 0
    for resolution, retention in archives:
        # Floor division keeps the original Python 2 integer semantics
        # and is also correct under Python 3.
        size += SIZE_ARCHIVE_INFO + SIZE_POINT * retention // resolution

    if size:
        size += SIZE_METADATA

    return size
34 |
35 |
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description="Calculates the size of the whisper storage for the given \
archive (in resolution:retention format, e.g. 1m:24h,5m:3m)"
    )
    parser.add_argument(
        'archive',
        help="Archive in storage-schemas.conf format (resolution:retention)"
    )

    args = parser.parse_args()

    # FIX: parenthesized print works under both Python 2 and Python 3;
    # the original bare print statement was Python 2 only.
    print("{} >> {} bytes".format(args.archive, archive_to_bytes(args.archive)))
51 |
52 |
--------------------------------------------------------------------------------
/4/riemann/examplecom/etc/email.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.email
2 | (:require [riemann.email :refer :all]))
3 |
4 | (def email (mailer {:from "riemann@example.com"}))
5 |
--------------------------------------------------------------------------------
/4/riemann/examplecom/etc/graphite.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.graphite
2 | (:require [riemann.config :refer :all]
3 | [riemann.graphite :refer :all]))
4 |
5 | (defn add-environment-to-graphite [event] (str "productiona.hosts.", (riemann.graphite/graphite-path-percentiles event)))
6 |
(def graph
  ;; Graphite writer behind a bounded (1000 event) async queue so slow
  ;; Graphite writes cannot block the stream processing threads.
  (async-queue! :graphite {:queue-size 1000}
    (graphite {:host "graphitea" :path add-environment-to-graphite})))
9 |
--------------------------------------------------------------------------------
/4/riemann/riemann.config:
--------------------------------------------------------------------------------
1 | (logging/init {:file "/var/log/riemann/riemann.log"})
2 |
3 | (require 'riemann.client)
4 | (require '[examplecom.etc.email :refer :all])
5 | (require '[examplecom.etc.graphite :refer :all])
6 |
7 | (let [host "0.0.0.0"]
8 | (repl-server {:host "127.0.0.1"})
9 | (tcp-server {:host host})
10 | (udp-server {:host host})
11 | (ws-server {:host host}))
12 |
13 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]})
14 |
(let [index (index)
      ;; Batch up to 100 events (or every 1/10s) and forward them to
      ;; the riemannmc aggregation host via a bounded async queue.
      downstream (batch 100 1/10
                   (async-queue! :agg { :queue-size 1e3
                                        :core-pool-size 4
                                        :max-pool-size 32}
                     (forward
                       (riemann.client/tcp-client :host "riemannmc"))))]

  ; Inbound events will be passed to these streams:
  (streams
    (default :ttl 60
      ; Index all events immediately.
      index

      ; Send all events to the log file.
      #(info %)

      ; Graph Riemann's own metrics and forward them downstream.
      (where (service #"^riemann.*")
        graph

        downstream))))
36 |
--------------------------------------------------------------------------------
/4/riemann/riemann.config_riemannmc:
--------------------------------------------------------------------------------
1 | ; -*- mode: clojure; -*-
2 | ; vim: filetype=clojure
3 | (include "/etc/riemann/include")
4 |
(let [index (index)]

  (streams
    (default :ttl 60
      ; Index all events immediately.
      index

      ; Graph Riemann's own metrics. graph is defined in the included
      ; /etc/riemann/include files.
      (where (service #"^riemann.*")
        graph)

      ; At most one expired-event notification every two minutes.
      (expired
        (throttle 1 120
          (where (service #"^riemann.*") (email "james@example.com")))))))
18 |
--------------------------------------------------------------------------------
/5-6/collectd/collectd.conf:
--------------------------------------------------------------------------------
1 | TypesDB "/usr/share/collectd/types.db"
2 |
3 | Interval 2
4 | CheckThresholds true
5 | WriteQueueLimitHigh 5000
6 | WriteQueueLimitLow 5000
7 |
8 | LoadPlugin logfile
9 |
10 | <Plugin logfile>
11 |   LogLevel "info"
12 |   File "/var/log/collectd.log"
13 |   Timestamp true
14 | </Plugin>
15 |
16 | LoadPlugin threshold
17 |
18 | Include "/etc/collectd.d/*.conf"
19 |
--------------------------------------------------------------------------------
/5-6/collectd/collectd.d/carbon.conf:
--------------------------------------------------------------------------------
1 | <Plugin processes>
2 |   ProcessMatch "carbon-cache" "python.+carbon-cache"
3 |   ProcessMatch "carbon-relay" "python.+carbon-relay"
4 | </Plugin>
5 |
6 | <Plugin "threshold">
7 |   <Plugin "processes">
8 |     Instance "carbon-cache"
9 |     <Type "ps_count">
10 |       DataSource "processes"
11 |       WarningMin 2
12 |       FailureMin 1
13 |     </Type>
14 |   </Plugin>
15 | </Plugin>
16 |
--------------------------------------------------------------------------------
/5-6/collectd/collectd.d/cpu.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin cpu
2 | <Plugin cpu>
3 |   ValuesPercentage true
4 |   ReportByCpu false
5 | </Plugin>
6 |
--------------------------------------------------------------------------------
/5-6/collectd/collectd.d/df.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin df
2 | <Plugin df>
3 |   MountPoint "/"
4 |   ValuesPercentage true
5 | </Plugin>
6 |
--------------------------------------------------------------------------------
/5-6/collectd/collectd.d/memory.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin memory
2 | <Plugin memory>
3 |   ValuesPercentage true
4 | </Plugin>
5 |
--------------------------------------------------------------------------------
/5-6/collectd/collectd.d/processes.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin processes
2 | <Plugin processes>
3 |   Process "collectd"
4 | </Plugin>
5 |
6 | <Plugin "threshold">
7 |   <Plugin "processes">
8 |     <Type "ps_count">
9 |       DataSource "processes"
10 |       FailureMin 1
11 |     </Type>
12 |   </Plugin>
13 | </Plugin>
14 |
--------------------------------------------------------------------------------
/5-6/collectd/collectd.d/swap.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin swap
2 | <Plugin swap>
3 |   ValuesPercentage true
4 | </Plugin>
5 |
--------------------------------------------------------------------------------
/5-6/collectd/collectd.d/write_riemann.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin write_riemann
2 | <Plugin write_riemann>
3 |   <Node "riemanna">
4 |     Host "riemanna.example.com"
5 |     Port "5555"
6 |     Protocol TCP
7 |     StoreRates false
8 |     CheckThresholds true
9 |     TTLFactor 30.0
10 |   </Node>
11 |   Tag "collectd"
12 | </Plugin>
13 |
--------------------------------------------------------------------------------
/5-6/riemann/examplecom/etc/checks.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.checks
2 | (:require [riemann.config :refer :all]
3 | [clojure.tools.logging :refer :all]
4 | [riemann.streams :refer :all]))
5 |
6 | (defn set_state [warning critical]
7 | (fn [event]
8 | (assoc event :state
9 | (condp < (:metric event)
10 | critical "critical"
11 | warning "warning"
12 | "ok"))))
13 |
14 | (defn check_threshold [srv window func warning critical & children]
15 | (where (service srv)
16 | (fixed-time-window window
17 | (smap func
18 | (where (< warning metric)
19 | (smap (set_state warning critical)
20 | (fn [event]
21 | (call-rescue event children))))))))
22 |
23 | (defn check_percentiles [srv window & children]
24 | (where (service srv)
25 | (percentiles window [0.5 0.95 0.99 1]
26 | (fn [event]
27 | (call-rescue event children)))))
28 |
--------------------------------------------------------------------------------
/5-6/riemann/examplecom/etc/collectd.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.collectd
2 | (:require [clojure.tools.logging :refer :all]
3 | [riemann.streams :refer :all]
4 | [clojure.string :as str]))
5 |
6 | (def default-services
7 | [{:service #"^load/load/(.*)$" :rewrite "load $1"}
8 | {:service #"^swap/percent-(.*)$" :rewrite "swap $1"}
9 | {:service #"^memory/percent-(.*)$" :rewrite "memory $1"}
10 | {:service #"^processes/ps_state-(.*)$" :rewrite "processes $1"}
11 | {:service #"^cpu/percent-(.*)$" :rewrite "cpu $1"}
12 | {:service #"^df-(.*)/(df_complex|percent_bytes)-(.*)$" :rewrite "df $1 $2 $3"}
13 | {:service #"^interface-(.*)/if_(errors|packets|octets)/(tx|rx)$" :rewrite "nic $1 $3 $2"}])
14 |
15 | (defn rewrite-service-with
16 | [rules]
17 | (let [matcher (fn [s1 s2] (if (string? s1) (= s1 s2) (re-find s1 s2)))]
18 | (fn [{:keys [service] :as event}]
19 | (or
20 | (first
21 | (for [{:keys [rewrite] :as rule} rules
22 | :when (matcher (:service rule) service)]
23 | (assoc event :service
24 | (if (string? (:service rule))
25 | rewrite
26 | (str/replace service (:service rule) rewrite)))))
27 | event))))
28 |
29 | (def rewrite-service
30 | (rewrite-service-with default-services))
31 |
--------------------------------------------------------------------------------
/5-6/riemann/examplecom/etc/email.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.email
2 | (:require [riemann.email :refer :all]))
3 |
4 | (def email (mailer {:from "riemann@example.com"}))
5 |
--------------------------------------------------------------------------------
/5-6/riemann/examplecom/etc/graphite.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.graphite
2 | (:require [riemann.config :refer :all]
3 | [riemann.graphite :refer :all]))
4 |
5 | (defn add-environment-to-graphite [event] (str "productiona.hosts.", (riemann.graphite/graphite-path-percentiles event)))
6 |
7 | (def graph (async-queue! :graphite {:queue-size 1000}
8 | (graphite {:host "graphitea" :path add-environment-to-graphite})))
9 |
--------------------------------------------------------------------------------
/5-6/riemann/riemann.config:
--------------------------------------------------------------------------------
1 | (logging/init {:file "/var/log/riemann/riemann.log"})
2 |
3 | (require 'riemann.client)
4 | (require '[examplecom.etc.email :refer :all])
5 | (require '[examplecom.etc.graphite :refer :all])
6 | (require '[examplecom.etc.collectd :refer :all])
7 |
8 | (let [host "0.0.0.0"]
9 | (repl-server {:host "127.0.0.1"})
10 | (tcp-server {:host host})
11 | (udp-server {:host host})
12 | (ws-server {:host host}))
13 |
14 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]})
15 |
16 | (let [index (index)
17 | downstream (batch 100 1/10
18 | (async-queue! :agg { :queue-size 1e3
19 | :core-pool-size 4
20 | :max-pool-size 32}
21 | (forward
22 | (riemann.client/tcp-client :host "riemannmc"))))]
23 |
24 | ; Inbound events will be passed to these streams:
25 | (streams
26 | (default :ttl 60
27 | ; Index all events immediately.
28 | (where (not (tagged "notification"))
29 | index)
30 |
31 | (tagged "collectd"
32 | (smap rewrite-service graph)
33 |
34 | (tagged "notification"
35 | (changed-state {:init "ok"}
36 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count$" "$1"]
37 | (email "james@example.com"))))
38 |
39 | (where (and (expired? event)
40 | (service #"^processes-.+\/ps_count\/processes"))
41 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count\/processes$" "$1"]
42 | (email "james@example.com"))))
43 |
44 | (where (service #"^riemann.*")
45 | graph
46 |
47 | downstream))))
48 |
--------------------------------------------------------------------------------
/7/collectd/collectd.conf:
--------------------------------------------------------------------------------
1 | TypesDB "/usr/share/collectd/types.db"
2 |
3 | Interval 2
4 | CheckThresholds true
5 |
6 | LoadPlugin logfile
7 |
8 | <Plugin logfile>
9 |   LogLevel "info"
10 |   File "/var/log/collectd.log"
11 |   Timestamp true
12 | </Plugin>
13 |
14 | LoadPlugin threshold
15 |
16 | Include "/etc/collectd.d/*.conf"
17 |
--------------------------------------------------------------------------------
/7/collectd/collectd.d/cpu.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin cpu
2 | <Plugin cpu>
3 |   ValuesPercentage true
4 |   ReportByCpu false
5 | </Plugin>
6 |
--------------------------------------------------------------------------------
/7/collectd/collectd.d/df.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin df
2 | <Plugin df>
3 |   MountPoint "/"
4 |   ValuesPercentage true
5 | </Plugin>
6 |
--------------------------------------------------------------------------------
/7/collectd/collectd.d/docker.conf:
--------------------------------------------------------------------------------
1 | TypesDB "/usr/lib/collectd/docker/dockerplugin.db"
2 | LoadPlugin python
3 |
4 | <Plugin python>
5 |   ModulePath "/usr/lib/collectd/docker"
6 |   Import "dockerplugin"
7 |
8 |   <Module dockerplugin>
9 |     BaseURL "unix://var/run/docker.sock"
10 |     Timeout 3
11 |   </Module>
12 | </Plugin>
13 |
14 | <Plugin processes>
15 |   Process "docker"
16 | </Plugin>
17 |
--------------------------------------------------------------------------------
/7/collectd/collectd.d/memory.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin memory
2 | <Plugin memory>
3 |   ValuesPercentage true
4 | </Plugin>
5 |
--------------------------------------------------------------------------------
/7/collectd/collectd.d/processes.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin processes
2 | <Plugin processes>
3 |   Process "collectd"
4 | </Plugin>
5 |
6 | <Plugin "threshold">
7 |   <Plugin "processes">
8 |     <Type "ps_count">
9 |       DataSource "processes"
10 |       FailureMin 1
11 |     </Type>
12 |   </Plugin>
13 | </Plugin>
14 |
--------------------------------------------------------------------------------
/7/collectd/collectd.d/swap.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin swap
2 | <Plugin swap>
3 |   ValuesPercentage true
4 | </Plugin>
5 |
--------------------------------------------------------------------------------
/7/collectd/collectd.d/write_riemann.conf:
--------------------------------------------------------------------------------
1 | LoadPlugin write_riemann
2 | <Plugin write_riemann>
3 |   <Node "riemanna">
4 |     Host "riemanna.example.com"
5 |     Port "5555"
6 |     Protocol TCP
7 |     StoreRates false
8 |     CheckThresholds true
9 |     TTLFactor 30.0
10 |   </Node>
11 |   Tag "collectd"
12 | </Plugin>
13 |
--------------------------------------------------------------------------------
/7/riemann/examplecom/etc/checks.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.checks
2 | (:require [riemann.config :refer :all]
3 | [clojure.tools.logging :refer :all]
4 | [riemann.streams :refer :all]))
5 |
6 | (defn set_state [warning critical]
7 | (fn [event]
8 | (assoc event :state
9 | (condp < (:metric event)
10 | critical "critical"
11 | warning "warning"
12 | "ok"))))
13 |
14 | (defn check_threshold [srv window func warning critical & children]
15 | (where (service srv)
16 | (fixed-time-window window
17 | (smap func
18 | (where (< warning metric)
19 | (smap (set_state warning critical)
20 | (fn [event]
21 | (call-rescue event children))))))))
22 |
23 | (defn check_percentiles [srv window & children]
24 | (where (service srv)
25 | (percentiles window [0.5 0.95 0.99 1]
26 | (fn [event]
27 | (call-rescue event children)))))
28 |
--------------------------------------------------------------------------------
/7/riemann/examplecom/etc/collectd.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.collectd
2 | (:require [clojure.tools.logging :refer :all]
3 | [riemann.streams :refer :all]
4 | [clojure.string :as str]
5 | [clojure.walk :as walk]))
6 |
7 | (defn docker-attribute-map
8 | [attributes]
9 | (let [instance (str/split (str/replace attributes #"^.*\[(.*)\]$" "$1") #",")]
10 | (walk/keywordize-keys (into {} (for [pair instance] (apply hash-map (str/split pair #"=")))))))
11 |
12 | (defn docker-attributes
13 | [{:keys [plugin_instance] :as event}]
14 | (if-let [attributes (re-find #"^.*\[.*\]$" plugin_instance)]
15 | (merge event (docker-attribute-map attributes))
16 | event))
17 |
18 | (defn parse-docker-service-host
19 | [{:keys [type type_instance plugin_instance] :as event}]
20 | (let [host (re-find #"^\w+\.?\w+\.?\w+" (:plugin_instance event))
21 | service (cond-> (str (:type event)) (:type_instance event) (str "." (:type_instance event)))]
22 | (assoc event :service service :host host)))
23 |
24 | (def default-services
25 | [{:service #"^load/load/(.*)$" :rewrite "load $1"}
26 | {:service #"^swap/percent-(.*)$" :rewrite "swap $1"}
27 | {:service #"^memory/percent-(.*)$" :rewrite "memory $1"}
28 | {:service #"^processes/ps_state-(.*)$" :rewrite "processes $1"}
29 | {:service #"^processes-(.*)/(.*)$" :rewrite "processes $1 $2"}
30 | {:service #"^cpu/percent-(.*)$" :rewrite "cpu $1"}
31 | {:service #"^df-(.*)/(df_complex|percent_bytes)-(.*)$" :rewrite "df $1 $2 $3"}
32 | {:service #"^interface-(.*)/if_(errors|packets|octets)/(tx|rx)$" :rewrite "nic $1 $3 $2"}
33 | {:service #"^protocols-(.*)/(.*)$" :rewrite "protocols $1 $2"}
34 | {:service #"^GenericJMX-(:?_|\/)?(.*)$" :rewrite "jmx $2"}
35 | {:service #"^haproxy\/(gauge|derive)-(.*)$" :rewrite "haproxy $2"}
36 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "$2"}
37 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "statsd $1 $2"}
38 | {:service #"^mysql-(.*)\/(counter|gauge)-(.*)$" :rewrite "mysql $1 $3"}
39 | {:service #"^dbi-(.*)\/(gauge|counter)-(.*)$" :rewrite "dbi $1 $3"}
40 | {:service #"^redis-(.*)$" :rewrite "redis $1"}])
41 |
42 | (defn rewrite-service-with
43 | [rules]
44 | (let [matcher (fn [s1 s2] (if (string? s1) (= s1 s2) (re-find s1 s2)))]
45 | (fn [{:keys [service] :as event}]
46 | (or
47 | (first
48 | (for [{:keys [rewrite] :as rule} rules
49 | :when (matcher (:service rule) service)]
50 | (assoc event :service
51 | (if (string? (:service rule))
52 | rewrite
53 | (str/replace service (:service rule) rewrite)))))
54 | event))))
55 |
56 | (def rewrite-service
57 | (rewrite-service-with default-services))
58 |
--------------------------------------------------------------------------------
/7/riemann/examplecom/etc/email.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.email
2 | (:require [riemann.email :refer :all]))
3 |
4 | (def email (mailer {:from "riemann@example.com"}))
5 |
--------------------------------------------------------------------------------
/7/riemann/examplecom/etc/graphite.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.graphite
2 | (:require [riemann.config :refer :all]
3 | [riemann.graphite :refer :all]))
4 |
5 | (defn add-environment-to-graphite [event]
6 | (condp = (:plugin event)
7 | "docker"
8 | (if (:com.example.application event)
9 | (str "productiona.docker.", (:com.example.application event), ".", (riemann.graphite/graphite-path-percentiles event))
10 | (str "productiona.docker.", (riemann.graphite/graphite-path-percentiles event)))
11 | (str "productiona.hosts.", (riemann.graphite/graphite-path-percentiles event))))
12 |
13 | (def graph (async-queue! :graphite {:queue-size 1000}
14 | (graphite {:host "graphitea" :path add-environment-to-graphite})))
15 |
--------------------------------------------------------------------------------
/7/riemann/riemann.config:
--------------------------------------------------------------------------------
1 | (logging/init {:file "/var/log/riemann/riemann.log"})
2 |
3 | (require 'riemann.client)
4 | (require '[examplecom.etc.email :refer :all])
5 | (require '[examplecom.etc.graphite :refer :all])
6 | (require '[examplecom.etc.collectd :refer :all])
7 |
8 | (let [host "0.0.0.0"]
9 | (repl-server {:host "127.0.0.1"})
10 | (tcp-server {:host host})
11 | (udp-server {:host host})
12 | (ws-server {:host host}))
13 |
14 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]})
15 |
16 | (let [index (index)
17 | downstream (batch 100 1/10
18 | (async-queue! :agg { :queue-size 1e3
19 | :core-pool-size 4
20 | :max-pool-size 32}
21 | (forward
22 | (riemann.client/tcp-client :host "riemannmc"))))]
23 |
24 | ; Inbound events will be passed to these streams:
25 | (streams
26 | (default :ttl 60
27 | ; Index all events immediately.
28 | (where (not (tagged "notification"))
29 | index)
30 |
31 | (tagged "collectd"
32 | (where (not (= (:plugin event) "docker"))
33 | (smap rewrite-service graph))
34 |
35 | (where (= (:plugin event) "docker")
36 | (smap (comp parse-docker-service-host docker-attributes rewrite-service) graph))
37 |
38 | (tagged "notification"
39 | (changed-state {:init "ok"}
40 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count$" "$1"]
41 | (email "james@example.com"))))
42 |
43 | (where (and (expired? event)
44 | (service #"^processes-.+\/ps_count\/processes"))
45 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count\/processes$" "$1"]
46 | (email "james@example.com"))))
47 |
48 | (where (service #"^riemann.*")
49 | graph
50 |
51 | downstream))))
52 |
--------------------------------------------------------------------------------
/8/collectd/elasticsearch.conf:
--------------------------------------------------------------------------------
1 | <LoadPlugin python>
2 |   Globals true
3 | </LoadPlugin>
4 |
5 | <Plugin python>
6 |   ModulePath "/usr/lib/collectd/"
7 |
8 |   Import "elasticsearch_collectd"
9 |
10 |   <Module elasticsearch_collectd>
11 |     Verbose false
12 |     Cluster "productiona"
13 |   </Module>
14 | </Plugin>
15 |
16 | LoadPlugin processes
17 | <Plugin processes>
18 |   Process "elasticsearch"
19 | </Plugin>
20 |
21 |
--------------------------------------------------------------------------------
/8/collectd/elasticsearch_collectd.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | #Copyright 2014 Jeremy Carroll
3 | #
4 | #Licensed under the Apache License, Version 2.0 (the "License");
5 | #you may not use this file except in compliance with the License.
6 | #You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | #Unless required by applicable law or agreed to in writing, software
11 | #distributed under the License is distributed on an "AS IS" BASIS,
12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | #See the License for the specific language governing permissions and
14 | #limitations under the License.
15 |
16 |
17 | import collectd
18 | import json
19 | import urllib2
20 | import socket
21 | import collections
22 | from distutils.version import StrictVersion
23 |
24 |
25 | ES_CLUSTER = "elasticsearch"
26 | ES_HOST = "localhost"
27 | ES_PORT = 9200
28 |
29 | # ES indexes must be fully qualified. E.g. _all, index1,index2
30 | # To do: Handle glob syntax for index names.
31 | ES_INDEX = [ ]
32 |
33 | ENABLE_INDEX_STATS = False
34 | ENABLE_NODE_STATS = True
35 |
36 | VERBOSE_LOGGING = False
37 |
38 | Stat = collections.namedtuple('Stat', ('type', 'path'))
39 |
40 | # Indices are cluster wide, metrics should be collected from only one server
41 | # in the cluster or from an external probe server.
42 | INDEX_STATS = {
43 |
44 | # === ElasticSearch 0.90.x and higher ===
45 | "v('{es_version}') >= v('0.90.0')": {
46 |
47 | ## PRIMARIES
48 | # DOCS
49 | "indices.{index_name}.primaries.docs.count" : Stat("counter", "indices.%s.primaries.docs.count"),
50 | "indices.{index_name}.primaries.docs.deleted" : Stat("counter", "indices.%s.primaries.docs.deleted"),
51 | # STORE
52 | "indices.{index_name}.primaries.store.size_in_bytes" : Stat("bytes", "indices.%s.primaries.store.size_in_bytes"),
53 | "indices.{index_name}.primaries.store.throttle_time_in_millis" : Stat("counter", "indices.%s.primaries.store.throttle_time_in_millis"),
54 | # INDEXING
55 | "indices.{index_name}.primaries.indexing.index_total" : Stat("counter", "indices.%s.primaries.indexing.index_total"),
56 | "indices.{index_name}.primaries.indexing.index_time_in_millis" : Stat("counter", "indices.%s.primaries.indexing.index_time_in_millis"),
57 | "indices.{index_name}.primaries.indexing.index_current" : Stat("gauge", "indices.%s.primaries.indexing.index_current"),
58 | "indices.{index_name}.primaries.indexing.delete_total" : Stat("counter", "indices.%s.primaries.indexing.delete_total"),
59 | "indices.{index_name}.primaries.indexing.delete_time_in_millis" : Stat("counter", "indices.%s.primaries.indexing.delete_time_in_millis"),
60 | "indices.{index_name}.primaries.indexing.delete_current" : Stat("gauge", "indices.%s.primaries.indexing.delete_current"),
61 | # GET
62 | "indices.{index_name}.primaries.get.time_in_millis" : Stat("counter", "indices.%s.primaries.get.time_in_millis"),
63 | "indices.{index_name}.primaries.get.exists_total" : Stat("counter", "indices.%s.primaries.get.exists_total"),
64 | "indices.{index_name}.primaries.get.exists_time_in_millis" : Stat("counter", "indices.%s.primaries.get.exists_time_in_millis"),
65 | "indices.{index_name}.primaries.get.missing_total" : Stat("counter", "indices.%s.primaries.get.missing_total"),
66 | "indices.{index_name}.primaries.get.missing_time_in_millis" : Stat("counter", "indices.%s.primaries.get.missing_time_in_millis"),
67 | "indices.{index_name}.primaries.get.current" : Stat("gauge", "indices.%s.primaries.get.current"),
68 | # SEARCH
69 | "indices.{index_name}.primaries.search.open_contexts" : Stat("gauge", "indices.%s.primaries.search.open_contexts"),
70 | "indices.{index_name}.primaries.search.query_total" : Stat("counter", "indices.%s.primaries.search.query_total"),
71 | "indices.{index_name}.primaries.search.query_time_in_millis" : Stat("counter", "indices.%s.primaries.search.query_time_in_millis"),
72 | "indices.{index_name}.primaries.search.query_current" : Stat("gauge", "indices.%s.primaries.search.query_current"),
73 | "indices.{index_name}.primaries.search.fetch_total" : Stat("counter", "indices.%s.primaries.search.fetch_total"),
74 | "indices.{index_name}.primaries.search.fetch_time_in_millis" : Stat("counter", "indices.%s.primaries.search.fetch_time_in_millis"),
75 | "indices.{index_name}.primaries.search.fetch_current" : Stat("gauge", "indices.%s.primaries.search.fetch_current"),
76 | # MERGES
77 | "indices.{index_name}.primaries.merges.current" : Stat("gauge", "indices.%s.primaries.merges.current"),
78 | "indices.{index_name}.primaries.merges.current_docs" : Stat("gauge", "indices.%s.primaries.merges.current_docs"),
79 | "indices.{index_name}.primaries.merges.current_size_in_bytes" : Stat("bytes", "indices.%s.primaries.merges.current_size_in_bytes"),
80 | "indices.{index_name}.primaries.merges.total" : Stat("counter", "indices.%s.primaries.merges.total"),
81 | "indices.{index_name}.primaries.merges.total_time_in_millis" : Stat("counter", "indices.%s.primaries.merges.total_time_in_millis"),
82 | "indices.{index_name}.primaries.merges.total_docs" : Stat("counter", "indices.%s.primaries.merges.total_docs"),
83 | "indices.{index_name}.primaries.merges.total_size_in_bytes" : Stat("bytes", "indices.%s.primaries.merges.total_size_in_bytes"),
84 | # REFRESH
85 | "indices.{index_name}.primaries.refresh.total" : Stat("counter", "indices.%s.primaries.refresh.total"),
86 | "indices.{index_name}.primaries.refresh.total_time_in_millis" : Stat("counter", "indices.%s.primaries.refresh.total_time_in_millis"),
87 | # FLUSH
88 | "indices.{index_name}.primaries.flush.total" : Stat("counter", "indices.%s.primaries.flush.total"),
89 | "indices.{index_name}.primaries.flush.total_time_in_millis" : Stat("counter", "indices.%s.primaries.flush.total_time_in_millis"),
90 | # WARMER
91 | "indices.{index_name}.primaries.warmer.current" : Stat("gauge", "indices.%s.primaries.warmer.current"),
92 | "indices.{index_name}.primaries.warmer.total" : Stat("counter", "indices.%s.primaries.warmer.total"),
93 | "indices.{index_name}.primaries.warmer.total_time_in_millis" : Stat("counter", "indices.%s.primaries.warmer.total_time_in_millis"),
94 | # FILTER_CACHE
95 | "indices.{index_name}.primaries.filter_cache.memory_size_in_bytes" : Stat("bytes", "indices.%s.primaries.filter_cache.memory_size_in_bytes"),
96 | "indices.{index_name}.primaries.filter_cache.evictions" : Stat("counter", "indices.%s.primaries.filter_cache.evictions"),
97 | # ID_CACHE
98 | "indices.{index_name}.primaries.id_cache.memory_size_in_bytes" : Stat("bytes", "indices.%s.primaries.id_cache.memory_size_in_bytes"),
99 | # FIELDDATA
100 | "indices.{index_name}.primaries.fielddata.memory_size_in_bytes" : Stat("bytes", "indices.%s.primaries.fielddata.memory_size_in_bytes"),
101 | "indices.{index_name}.primaries.fielddata.evictions" : Stat("counter", "indices.%s.primaries.fielddata.evictions"),
102 | # PERCOLATE
103 | "indices.{index_name}.primaries.percolate.total" : Stat("counter", "indices.%s.primaries.percolate.total"),
104 | "indices.{index_name}.primaries.percolate.time_in_millis" : Stat("counter", "indices.%s.primaries.percolate.time_in_millis"),
105 | "indices.{index_name}.primaries.percolate.current" : Stat("gauge", "indices.%s.primaries.percolate.current"),
106 | "indices.{index_name}.primaries.percolate.memory_size_in_bytes" : Stat("bytes", "indices.%s.primaries.percolate.memory_size_in_bytes"),
107 | "indices.{index_name}.primaries.percolate.queries" : Stat("counter", "indices.%s.primaries.percolate.queries"),
108 | # COMPLETION
109 | "indices.{index_name}.primaries.completion.size_in_bytes" : Stat("bytes", "indices.%s.primaries.completion.size_in_bytes"),
110 | # SEGMENTS
111 | "indices.{index_name}.primaries.segments.count" : Stat("counter", "indices.%s.primaries.segments.count"),
112 | "indices.{index_name}.primaries.segments.memory_in_bytes" : Stat("bytes", "indices.%s.primaries.segments.memory_in_bytes"),
113 | "indices.{index_name}.primaries.segments.index_writer_memory_in_bytes" : Stat("bytes", "indices.%s.primaries.segments.index_writer_memory_in_bytes"),
114 | "indices.{index_name}.primaries.segments.version_map_memory_in_bytes" : Stat("bytes", "indices.%s.primaries.segments.version_map_memory_in_bytes"),
115 | # TRANSLOG
116 | "indices.{index_name}.primaries.translog.operations" : Stat("counter", "indices.%s.primaries.translog.operations"),
117 | "indices.{index_name}.primaries.translog.size_in_bytes" : Stat("bytes", "indices.%s.primaries.translog.size_in_bytes"),
118 | # SUGGEST
119 | "indices.{index_name}.primaries.suggest.total" : Stat("counter", "indices.%s.primaries.suggest.total"),
120 | "indices.{index_name}.primaries.suggest.time_in_millis" : Stat("counter", "indices.%s.primaries.suggest.time_in_millis"),
121 | "indices.{index_name}.primaries.suggest.current" : Stat("gauge", "indices.%s.primaries.suggest.current"),
122 |
123 | ## TOTAL ##
124 | # DOCS
125 | "indices.{index_name}.total.docs.count" : Stat("gauge", "indices.%s.total.docs.count"),
126 | "indices.{index_name}.total.docs.deleted" : Stat("gauge", "indices.%s.total.docs.deleted"),
127 | # STORE
128 | "indices.{index_name}.total.store.size_in_bytes" : Stat("gauge", "indices.%s.total.store.size_in_bytes"),
129 | "indices.{index_name}.total.store.throttle_time_in_millis" : Stat("counter", "indices.%s.total.store.throttle_time_in_millis"),
130 | # INDEXING
131 | "indices.{index_name}.total.indexing.index_total" : Stat("counter", "indices.%s.total.indexing.index_total"),
132 | "indices.{index_name}.total.indexing.index_time_in_millis" : Stat("counter", "indices.%s.total.indexing.index_time_in_millis"),
133 | "indices.{index_name}.total.indexing.index_current" : Stat("gauge", "indices.%s.total.indexing.index_current"),
134 | "indices.{index_name}.total.indexing.delete_total" : Stat("counter", "indices.%s.total.indexing.delete_total"),
135 | "indices.{index_name}.total.indexing.delete_time_in_millis" : Stat("counter", "indices.%s.total.indexing.delete_time_in_millis"),
136 | "indices.{index_name}.total.indexing.delete_current" : Stat("gauge", "indices.%s.total.indexing.delete_current"),
137 | # GET
138 | "indices.{index_name}.total.get.total" : Stat("counter", "indices.%s.total.get.total"),
139 | "indices.{index_name}.total.get.time_in_millis" : Stat("counter", "indices.%s.total.get.time_in_millis"),
140 | "indices.{index_name}.total.get.exists_total" : Stat("counter", "indices.%s.total.get.exists_total"),
141 | "indices.{index_name}.total.get.exists_time_in_millis" : Stat("counter", "indices.%s.total.get.exists_time_in_millis"),
142 | "indices.{index_name}.total.get.missing_total" : Stat("counter", "indices.%s.total.get.missing_total"),
143 | "indices.{index_name}.total.get.missing_time_in_millis" : Stat("counter", "indices.%s.total.get.missing_time_in_millis"),
144 | "indices.{index_name}.total.get.current" : Stat("gauge", "indices.%s.total.get.current"),
145 | # SEARCH
146 | "indices.{index_name}.total.search.open_contexts" : Stat("gauge", "indices.%s.total.search.open_contexts"),
147 | "indices.{index_name}.total.search.query_total" : Stat("counter", "indices.%s.total.search.query_total"),
148 | "indices.{index_name}.total.search.query_time_in_millis" : Stat("counter", "indices.%s.total.search.query_time_in_millis"),
149 | "indices.{index_name}.total.search.query_current" : Stat("gauge", "indices.%s.total.search.query_current"),
150 | "indices.{index_name}.total.search.fetch_total" : Stat("counter", "indices.%s.total.search.fetch_total"),
151 | }
152 | }
153 |
154 | NODE_STATS = {
155 |
156 | # === ElasticSearch 0.90.x and higher ===
157 | "v('{es_version}') >= v('0.90.0')": {
158 | ## DOCS
159 | 'indices.docs.count': Stat("gauge", "nodes.%s.indices.docs.count"),
160 | 'indices.docs.deleted': Stat("counter", "nodes.%s.indices.docs.deleted"),
161 |
162 | ## STORE
163 | 'indices.store.size': Stat("bytes", "nodes.%s.indices.store.size_in_bytes"),
164 |
165 | ## INDEXING
166 | 'indices.indexing.index-total': Stat("counter", "nodes.%s.indices.indexing.index_total"),
167 | 'indices.indexing.index-time': Stat("counter", "nodes.%s.indices.indexing.index_time_in_millis"),
168 | 'indices.indexing.delete-total': Stat("counter", "nodes.%s.indices.indexing.delete_total"),
169 | 'indices.indexing.delete-time': Stat("counter", "nodes.%s.indices.indexing.delete_time_in_millis"),
170 | 'indices.indexing.index-current': Stat("gauge", "nodes.%s.indices.indexing.index_current"),
171 | 'indices.indexing.delete-current': Stat("gauge", "nodes.%s.indices.indexing.delete_current"),
172 |
173 | ## GET
174 | 'indices.get.total': Stat("counter", "nodes.%s.indices.get.total"),
175 | 'indices.get.time': Stat("counter", "nodes.%s.indices.get.time_in_millis"),
176 | 'indices.get.exists-total': Stat("counter", "nodes.%s.indices.get.exists_total"),
177 | 'indices.get.exists-time': Stat("counter", "nodes.%s.indices.get.exists_time_in_millis"),
178 | 'indices.get.missing-total': Stat("counter", "nodes.%s.indices.get.missing_total"),
179 | 'indices.get.missing-time': Stat("counter", "nodes.%s.indices.get.missing_time_in_millis"),
180 | 'indices.get.current': Stat("gauge", "nodes.%s.indices.get.current"),
181 |
182 | ## SEARCH
183 | 'indices.search.query-current': Stat("gauge", "nodes.%s.indices.search.query_current"),
184 | 'indices.search.query-total': Stat("counter", "nodes.%s.indices.search.query_total"),
185 | 'indices.search.query-time': Stat("counter", "nodes.%s.indices.search.query_time_in_millis"),
186 | 'indices.search.fetch-current': Stat("gauge", "nodes.%s.indices.search.fetch_current"),
187 | 'indices.search.fetch-total': Stat("counter", "nodes.%s.indices.search.fetch_total"),
188 | 'indices.search.fetch-time': Stat("counter", "nodes.%s.indices.search.fetch_time_in_millis"),
189 |
190 | # JVM METRICS #
191 | ##GC
192 | 'jvm.gc.time': Stat("counter", "nodes.%s.jvm.gc.collectors.young.collection_time_in_millis"),
193 | 'jvm.gc.count': Stat("counter", "nodes.%s.jvm.gc.collectors.young.collection_count"),
194 | 'jvm.gc.old-time': Stat("counter", "nodes.%s.jvm.gc.collectors.old.collection_time_in_millis"),
195 | 'jvm.gc.old-count': Stat("counter", "nodes.%s.jvm.gc.collectors.old.collection_count"),
196 |
197 | ## MEM
198 | 'jvm.mem.heap-committed': Stat("bytes", "nodes.%s.jvm.mem.heap_committed_in_bytes"),
199 | 'jvm.mem.heap-used': Stat("bytes", "nodes.%s.jvm.mem.heap_used_in_bytes"),
200 | 'jvm.mem.heap-used-percent': Stat("percent", "nodes.%s.jvm.mem.heap_used_percent"),
201 | 'jvm.mem.non-heap-committed': Stat("bytes", "nodes.%s.jvm.mem.non_heap_committed_in_bytes"),
202 | 'jvm.mem.non-heap-used': Stat("bytes", "nodes.%s.jvm.mem.non_heap_used_in_bytes"),
203 |
204 | ## THREADS
205 | 'jvm.threads.count': Stat("gauge", "nodes.%s.jvm.threads.count"),
206 | 'jvm.threads.peak': Stat("gauge", "nodes.%s.jvm.threads.peak_count"),
207 |
208 | # TRANSPORT METRICS #
209 | 'transport.server_open': Stat("gauge", "nodes.%s.transport.server_open"),
210 | 'transport.rx.count': Stat("counter", "nodes.%s.transport.rx_count"),
211 | 'transport.rx.size': Stat("bytes", "nodes.%s.transport.rx_size_in_bytes"),
212 | 'transport.tx.count': Stat("counter", "nodes.%s.transport.tx_count"),
213 | 'transport.tx.size': Stat("bytes", "nodes.%s.transport.tx_size_in_bytes"),
214 |
215 | # HTTP METRICS #
216 | 'http.current_open': Stat("gauge", "nodes.%s.http.current_open"),
217 | 'http.total_open': Stat("counter", "nodes.%s.http.total_opened"),
218 |
219 | # PROCESS METRICS #
220 | 'process.open_file_descriptors': Stat("gauge", "nodes.%s.process.open_file_descriptors"),
221 | },
222 |
223 | # === ElasticSearch 0.90.x only ===
224 | "v('0.90.0') <= v('{es_version}') < v('1.0.0')": {
225 | ##CPU
226 | 'process.cpu.percent': Stat("gauge", "nodes.%s.process.cpu.percent")
227 | },
228 |
229 | # === ElasticSearch 1.0.0 or greater ===
230 | "v('{es_version}') >= v('1.0.0')": {
231 | ## STORE
232 | 'indices.store.throttle-time': Stat("counter", "nodes.%s.indices.store.throttle_time_in_millis"),
233 |
234 | ##SEARCH
235 | 'indices.search.open-contexts': Stat("gauge", "nodes.%s.indices.search.open_contexts"),
236 |
237 | ##CACHE
238 | 'indices.cache.field.eviction': Stat("counter", "nodes.%s.indices.fielddata.evictions"),
239 | 'indices.cache.field.size': Stat("bytes", "nodes.%s.indices.fielddata.memory_size_in_bytes"),
240 | 'indices.cache.filter.evictions': Stat("counter", "nodes.%s.indices.filter_cache.evictions"),
241 | 'indices.cache.filter.size': Stat("bytes", "nodes.%s.indices.filter_cache.memory_size_in_bytes"),
242 |
243 | ## FLUSH
244 | 'indices.flush.total': Stat("counter", "nodes.%s.indices.flush.total"),
245 | 'indices.flush.time': Stat("counter", "nodes.%s.indices.flush.total_time_in_millis"),
246 |
247 | ## MERGES
248 | 'indices.merges.current': Stat("gauge", "nodes.%s.indices.merges.current"),
249 | 'indices.merges.current-docs': Stat("gauge", "nodes.%s.indices.merges.current_docs"),
250 | 'indices.merges.current-size': Stat("bytes", "nodes.%s.indices.merges.current_size_in_bytes"),
251 | 'indices.merges.total': Stat("counter", "nodes.%s.indices.merges.total"),
252 | 'indices.merges.total-docs': Stat("gauge", "nodes.%s.indices.merges.total_docs"),
253 | 'indices.merges.total-size': Stat("bytes", "nodes.%s.indices.merges.total_size_in_bytes"),
254 | 'indices.merges.time': Stat("counter", "nodes.%s.indices.merges.total_time_in_millis"),
255 |
256 | ## REFRESH
257 | 'indices.refresh.total': Stat("counter", "nodes.%s.indices.refresh.total"),
258 | 'indices.refresh.time': Stat("counter", "nodes.%s.indices.refresh.total_time_in_millis"),
259 |
260 | ## SEGMENTS
261 | 'indices.segments.count': Stat("gauge", "nodes.%s.indices.segments.count"),
262 | 'indices.segments.size': Stat("bytes", "nodes.%s.indices.segments.memory_in_bytes"),
263 |
264 | ## TRANSLOG
265 | 'indices.translog.operations': Stat("gauge", "nodes.%s.indices.translog.operations"),
266 | 'indices.translog.size': Stat("bytes", "nodes.%s.indices.translog.size_in_bytes"),
267 | },
268 |
269 | # DICT: ElasticSearch 1.3.0 or greater
270 | "v('{es_version}') >= v('1.3.0')": {
271 | 'indices.segments.index-writer-memory': Stat("bytes", "nodes.%s.indices.segments.index_writer_memory_in_bytes"),
272 | 'indices.segments.index-memory': Stat("bytes", "nodes.%s.indices.segments.memory_in_bytes"),
273 | }
274 | }
275 |
276 | STATS_CUR = {}
277 |
278 | def check_es_version(rule, version):
279 | log_verbose('Elasticsearch version rule: %s' % (rule.format(es_version=version)) )
280 | v = StrictVersion
281 | eval_string = rule.format(es_version=version)
282 | return eval(eval_string)
283 |
284 |
285 | def generate_metric_set(rules, es_version):
286 | """
287 | @breif - Given an initial set of metrics with the elasticsearch version and the
288 | requested metrics to be fetched, parse all pre-defined metrics and
289 | return a sythesised set of metrics which is compatiable with existing
290 | functions.
291 |
292 | @rules - a struction which contains a rule to be evaluated when evaluting
293 | which metrics to be appended to the returned data set.
294 |
295 | @es_version - the Elasticsearch version.
296 | """
297 | synthesised_metrics = {}
298 |
299 | for k in rules.keys():
300 | if check_es_version(k, es_version):
301 | log_verbose("Adding %s" % k)
302 | synthesised_metrics.update(rules[k])
303 |
304 | return synthesised_metrics
305 |
306 |
307 | # FUNCTION: Collect stats from JSON result
308 | def lookup_node_stat(stat, metrics, json):
309 | node = json['nodes'].keys()[0]
310 | val = dig_it_up(json, metrics[stat].path % node)
311 |
312 | # Check to make sure we have a valid result
313 | # dig_it_up returns False if no match found
314 | if not isinstance(val, bool):
315 | return int(val)
316 | else:
317 | return None
318 |
319 |
320 | def lookup_index_stat(stat, metrics, json):
321 | indices = json['indices'].keys()
322 |
323 | for index in indices:
324 | formatted_stat = stat.format(index_name=index)
325 | val = index_dig_it_up(json, metrics[stat].path, index )
326 |
327 | # Check to make sure we have a valid result
328 | # dig_it_up returns False if no match found
329 | if not isinstance(val, bool):
330 | return int(val)
331 | else:
332 | return None
333 |
334 |
335 | def log_verbose(msg):
336 | if VERBOSE_LOGGING == True:
337 | collectd.warning('elasticsearch plugin [verbose]: %s' % msg)
338 |
339 |
340 | def configure_callback(conf):
341 | """Received configuration information"""
342 | global ES_HOST, ES_PORT, VERBOSE_LOGGING, ES_CLUSTER, ES_INDEX, ENABLE_INDEX_STATS, ENABLE_NODE_STATS
343 | for node in conf.children:
344 | if node.key == 'Host':
345 | ES_HOST = node.values[0]
346 | elif node.key == 'Port':
347 | ES_PORT = int(node.values[0])
348 | elif node.key == 'Verbose':
349 | VERBOSE_LOGGING = bool(node.values[0])
350 | elif node.key == 'Cluster':
351 | ES_CLUSTER = node.values[0]
352 | elif node.key == 'Indexes':
353 | ES_INDEX = node.values
354 | log_verbose('Indexes to query: %s' % (str(ES_INDEX)))
355 | elif node.key == 'EnableIndexStats':
356 | ENABLE_INDEX_STATS = bool(node.values[0])
357 | log_verbose("Enable Index Stats : %s" % ENABLE_INDEX_STATS)
358 | elif node.key == 'EnableNodeStats':
359 | ENABLE_NODE_STATS = bool(node.values[0])
360 | log_verbose("Enable Node Stats : %s" % ENABLE_NODE_STATS)
361 | else:
362 | collectd.warning('elasticsearch plugin: Ignoring unknown config key: %s.' % node.key)
363 |
364 | log_verbose('Configured with host=%s, port=%s' % (ES_HOST, ES_PORT))
365 |
366 |
367 |
368 | def fetch_url(url):
369 | try:
370 | result = json.load(urllib2.urlopen(url, timeout=10))
371 | except urllib2.URLError, e:
372 | collectd.error('elasticsearch plugin: Error connecting to %s - %r' % (url, e))
373 | return None
374 | return result
375 |
376 |
377 |
378 | def fetch_stats():
379 | global ES_CLUSTER, ES_HOST, ES_PORT, STATS_CUR, ES_INDEX, ENABLE_NODE_STATS, ENABLE_INDEX_STATS
380 |
381 | NODE_STATS_URL = {
382 | "v('{es_version}') >= v('0.90.0')": '{url}_cluster/nodes/_local/stats?http=true&process=true&jvm=true&transport=true',
383 | "v('{es_version}') >= v('1.0.0')" : '{url}_nodes/_local/stats/transport,http,process,jvm,indices'
384 | }
385 |
386 | node_stats_url = ""
387 | base_url = 'http://' + ES_HOST + ':' + str(ES_PORT) + '/'
388 | server_info = fetch_url(base_url)
389 | version = server_info['version']['number']
390 |
391 | # Get the cluster name.
392 | if server_info.has_key("cluster_name"):
393 | ES_CLUSTER = server_info["cluster_name"]
394 | else:
395 | ES_CLUSTER = fetch_url(base_url+"_nodes")['cluster_name']
396 |
397 | log_verbose('Elasticsearch cluster: %s version : %s' % (ES_CLUSTER, version))
398 |
399 | # Node statistics
400 | if ENABLE_NODE_STATS:
401 | node_metrics = {}
402 | for k in NODE_STATS_URL.keys():
403 | if check_es_version(k, str(version)):
404 | node_stats_url = NODE_STATS_URL[k].format(url=base_url)
405 | log_verbose('Node url : %s' % node_stats_url)
406 |
407 | node_metrics.update(generate_metric_set(NODE_STATS, version))
408 |
409 | # FIXME: Re-add the thread pool statistics.
410 | # # add info on thread pools
411 | # for pool in ['generic', 'index', 'get', 'snapshot', 'merge', 'optimize', 'bulk', 'warmer', 'flush', 'search', 'refresh']:
412 | # for attr in ['threads', 'queue', 'active', 'largest']:
413 | # path = 'thread_pool.{0}.{1}'.format(pool, attr)
414 | # node_metrics[path] = Stat("gauge", 'nodes.%s.{0}'.format(path))
415 | # for attr in ['completed', 'rejected']:
416 | # path = 'thread_pool.{0}.{1}'.format(pool, attr)
417 | # node_metrics[path] = Stat("counter", 'nodes.%s.{0}'.format(path))
418 |
419 | node_json = fetch_url(node_stats_url)
420 | parse_node_stats(node_metrics, node_json)
421 | log_verbose('Node stats processed')
422 |
423 | # Indexes statistics
424 | if ENABLE_INDEX_STATS:
425 | index_metrics = {}
426 | log_verbose('Checking index.')
427 | for k in ES_INDEX:
428 | index_stats_url = base_url + k + "/_stats"
429 | index_metrics.update(generate_metric_set(INDEX_STATS, version))
430 | log_verbose('Index statistics url : %s' % index_stats_url)
431 |
432 | index_json = fetch_url(index_stats_url)
433 | parse_index_stats(index_metrics, index_json, k)
434 |
435 | return True
436 |
437 |
438 |
439 | def parse_node_stats(metrics, json):
440 | """Parse stats response from ElasticSearch"""
441 | for name, key in metrics.iteritems():
442 | result = lookup_node_stat(name, metrics, json)
443 | dispatch_stat(result, name, key)
444 | return True
445 |
446 |
447 | def parse_index_stats(metrics, json, index):
448 | """Parse stats response from ElasticSearch"""
449 | for name, key in metrics.iteritems():
450 | result = lookup_index_stat(name, metrics, json)
451 | dispatch_stat(result, name.format(index_name=index), key)
452 | return True
453 |
454 |
455 | def dispatch_stat(result, name, key):
456 | """Read a key from info response data and dispatch a value"""
457 | if result is None:
458 | collectd.warning('elasticsearch plugin: Value not found for %s' % name)
459 | return
460 | estype = key.type
461 | value = int(result)
462 | log_verbose('Sending value[%s]: %s=%s' % (estype, name, value))
463 |
464 | val = collectd.Values(plugin='elasticsearch')
465 | val.plugin_instance = ES_CLUSTER
466 | val.type = estype
467 | val.type_instance = name
468 | val.values = [value]
469 | val.meta={'0': True}
470 | val.dispatch()
471 |
472 |
473 | def read_callback():
474 | log_verbose('Read callback called')
475 | stats = fetch_stats()
476 |
477 |
478 |
479 | def dig_it_up(obj, path):
480 | try:
481 | if type(path) in (str, unicode):
482 | path = path.split('.')
483 | return reduce(lambda x, y: x[y], path, obj)
484 | except:
485 | return False
486 |
487 |
488 | def index_dig_it_up(obj, path, index_name):
489 | try:
490 | if type(path) in (str, unicode):
491 | path = path.split('.')
492 | path[1] = path[1] % index_name
493 | return reduce(lambda x, y: x[y], path, obj)
494 | except:
495 | return False
496 |
497 |
498 |
499 | collectd.register_config(configure_callback)
500 | collectd.register_read(read_callback)
501 |
--------------------------------------------------------------------------------
/8/collectd/logstash.conf:
--------------------------------------------------------------------------------
LoadPlugin processes
<Plugin processes>
  ProcessMatch "logstash" "logstash\/runner.rb"
</Plugin>
--------------------------------------------------------------------------------
/8/collectd/logstash_jmx.conf:
--------------------------------------------------------------------------------
LoadPlugin java
<Plugin "java">
  JVMARG "-Djava.class.path=/usr/share/collectd/java/collectd-api.jar:/usr/share/collectd/java/generic-jmx.jar"
  LoadPlugin "org.collectd.java.GenericJMX"
  <Plugin "GenericJMX">
    <MBean "gc-count">
      ObjectName "java.lang:type=GarbageCollector,*"
      InstancePrefix "gc-"
      InstanceFrom "name"
      <Value>
        Type "derive"
        Table false
        Attribute "CollectionCount"
        InstancePrefix "count"
      </Value>
    </MBean>
    <MBean "gc-time">
      ObjectName "java.lang:type=GarbageCollector,*"
      InstancePrefix "gc-"
      InstanceFrom "name"
      <Value>
        Type "derive"
        Table false
        Attribute "CollectionTime"
        InstancePrefix "time"
      </Value>
    </MBean>
    <MBean "memory_pool">
      ObjectName "java.lang:type=MemoryPool,*"
      InstancePrefix "memory_pool-"
      InstanceFrom "name"
      <Value>
        Type "memory"
        Table true
        Attribute "Usage"
      </Value>
    </MBean>
    <MBean "memory-heap">
      ObjectName "java.lang:type=Memory"
      InstancePrefix "memory-heap"
      <Value>
        Type "memory"
        Table true
        Attribute "HeapMemoryUsage"
      </Value>
    </MBean>
    <MBean "memory-nonheap">
      ObjectName "java.lang:type=Memory"
      InstancePrefix "memory-nonheap"
      <Value>
        Type "memory"
        Table true
        Attribute "NonHeapMemoryUsage"
      </Value>
    </MBean>
    <MBean "thread">
      ObjectName "java.lang:type=Threading"
      InstancePrefix "threading"
      <Value>
        Type "gauge"
        Table false
        Attribute "ThreadCount"
        InstancePrefix "count"
      </Value>
    </MBean>
    <MBean "thread-daemon">
      ObjectName "java.lang:type=Threading"
      InstancePrefix "threading"
      <Value>
        Type "gauge"
        Table false
        Attribute "DaemonThreadCount"
        InstancePrefix "count-daemon"
      </Value>
    </MBean>
    <Connection>
      ServiceURL "service:jmx:rmi:///jndi/rmi://localhost:8855/jmxrmi"
      Collect "memory_pool"
      Collect "memory-heap"
      Collect "memory-nonheap"
      Collect "gc-count"
      Collect "gc-time"
      Collect "thread"
      Collect "thread-daemon"
    </Connection>
  </Plugin>
</Plugin>
--------------------------------------------------------------------------------
/8/collectd/rsyslogd.conf:
--------------------------------------------------------------------------------
LoadPlugin processes
<Plugin processes>
  Process "rsyslogd"
</Plugin>
--------------------------------------------------------------------------------
/8/logstash/logstash.conf:
--------------------------------------------------------------------------------
1 | input {
2 | tcp {
3 | port => 5514
4 | type => syslog
5 | }
6 | tcp {
7 | port => 2003
8 | type => "riemann"
9 | codec => "json"
10 | }
11 | udp {
12 | port => 5514
13 | type => syslog
14 | }
15 | file {
16 | path => [ "/var/log/syslog", "/var/log/auth.log" ]
17 | type => "syslog"
18 | }
19 | }
20 | filter {
21 | if [type] == "syslog" {
22 | grok {
23 | match => { "message" => "(?:%{SYSLOGTIMESTAMP:syslog_timestamp}|%{TIMESTAMP_ISO8601:syslog_timestamp}) %{SYSLOGHOST:syslog_hostname} %{DATA:syslog_program}(?:\/%{DATA:container_name}\/%{DATA:container_id})?(?:\[%{POSINT:syslog_pid}\])?: %{GREEDYDATA:syslog_message}" }
24 | remove_field => ["message"]
25 | }
26 | syslog_pri { }
27 | date {
28 | match => [ "syslog_timestamp", "MMM d HH:mm:ss", "MMM dd HH:mm:ss", "ISO8601" ]
29 | }
30 | }
31 | }
32 | output {
33 | stdout { }
34 | elasticsearch {
35 | sniffing => true
36 | hosts => "esa1.example.com"
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/8/riemann/examplecom/etc/checks.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.checks
2 | (:require [riemann.config :refer :all]
3 | [clojure.tools.logging :refer :all]
4 | [riemann.streams :refer :all]))
5 |
6 | (defn set_state [warning critical]
7 | (fn [event]
8 | (assoc event :state
9 | (condp < (:metric event)
10 | critical "critical"
11 | warning "warning"
12 | "ok"))))
13 |
14 | (defn check_threshold [srv window func warning critical & children]
15 | (where (service srv)
16 | (fixed-time-window window
17 | (smap func
18 | (where (< warning metric)
19 | (smap (set_state warning critical)
20 | (fn [event]
21 | (call-rescue event children))))))))
22 |
23 | (defn check_percentiles [srv window & children]
24 | (where (service srv)
25 | (percentiles window [0.5 0.95 0.99 1]
26 | (fn [event]
27 | (call-rescue event children)))))
28 |
--------------------------------------------------------------------------------
/8/riemann/examplecom/etc/collectd.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.collectd
2 | (:require [clojure.tools.logging :refer :all]
3 | [riemann.streams :refer :all]
4 | [clojure.string :as str]
5 | [clojure.walk :as walk]))
6 |
7 | (defn docker-attribute-map
8 | [attributes]
9 | (let [instance (str/split (str/replace attributes #"^.*\[(.*)\]$" "$1") #",")]
10 | (walk/keywordize-keys (into {} (for [pair instance] (apply hash-map (str/split pair #"=")))))))
11 |
12 | (defn docker-attributes
13 | [{:keys [plugin_instance] :as event}]
14 | (if-let [attributes (re-find #"^.*\[.*\]$" plugin_instance)]
15 | (merge event (docker-attribute-map attributes))
16 | event))
17 |
18 | (defn parse-docker-service-host
19 | [{:keys [type type_instance plugin_instance] :as event}]
20 | (let [host (re-find #"^\w+\.?\w+\.?\w+" (:plugin_instance event))
21 | service (cond-> (str (:type event)) (:type_instance event) (str "." (:type_instance event)))]
22 | (assoc event :service service :host host)))
23 |
24 | (def default-services
25 | [{:service #"^load/load/(.*)$" :rewrite "load $1"}
26 | {:service #"^swap/percent-(.*)$" :rewrite "swap $1"}
27 | {:service #"^memory/percent-(.*)$" :rewrite "memory $1"}
28 | {:service #"^processes/ps_state-(.*)$" :rewrite "processes $1"}
29 | {:service #"^processes-(.*)/(.*)$" :rewrite "processes $1 $2"}
30 | {:service #"^cpu/percent-(.*)$" :rewrite "cpu $1"}
31 | {:service #"^df-(.*)/(df_complex|percent_bytes)-(.*)$" :rewrite "df $1 $2 $3"}
32 | {:service #"^interface-(.*)/if_(errors|packets|octets)/(tx|rx)$" :rewrite "nic $1 $3 $2"}
33 | {:service #"^protocols-(.*)/(.*)$" :rewrite "protocols $1 $2"}
34 | {:service #"^GenericJMX-(:?_|\/)?(.*)$" :rewrite "jmx $2"}
35 | {:service #"^haproxy\/(gauge|derive)-(.*)$" :rewrite "haproxy $2"}
36 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "$2"}
37 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "statsd $1 $2"}
38 | {:service #"^mysql-(.*)\/(counter|gauge)-(.*)$" :rewrite "mysql $1 $3"}
39 | {:service #"^dbi-(.*)\/(gauge|counter)-(.*)$" :rewrite "dbi $1 $3"}
40 | {:service #"^redis-(.*)$" :rewrite "redis $1"}])
41 |
42 | (defn rewrite-service-with
43 | [rules]
44 | (let [matcher (fn [s1 s2] (if (string? s1) (= s1 s2) (re-find s1 s2)))]
45 | (fn [{:keys [service] :as event}]
46 | (or
47 | (first
48 | (for [{:keys [rewrite] :as rule} rules
49 | :when (matcher (:service rule) service)]
50 | (assoc event :service
51 | (if (string? (:service rule))
52 | rewrite
53 | (str/replace service (:service rule) rewrite)))))
54 | event))))
55 |
56 | (def rewrite-service
57 | (rewrite-service-with default-services))
58 |
--------------------------------------------------------------------------------
/8/riemann/examplecom/etc/email.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.email
2 | (:require [riemann.email :refer :all]))
3 |
4 | (def email (mailer {:from "riemann@example.com"}))
5 |
--------------------------------------------------------------------------------
/8/riemann/examplecom/etc/graphite.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.graphite
2 | (:require [clojure.string :as str]
3 | [riemann.config :refer :all]
4 | [riemann.graphite :refer :all]))
5 |
6 | (defn add-environment-to-graphite [event]
7 | (condp = (:plugin event)
8 | "docker"
9 | (if (:com.example.application event)
10 | (str "productiona.docker.", (:com.example.application event), ".", (riemann.graphite/graphite-path-percentiles event))
11 | (str "productiona.docker.", (riemann.graphite/graphite-path-percentiles event)))
12 | (str "productiona.hosts.", (riemann.graphite/graphite-path-percentiles event))))
13 |
14 | (def graph (async-queue! :graphite {:queue-size 1000}
15 | (graphite {:host "graphitea" :path add-environment-to-graphite})))
16 |
--------------------------------------------------------------------------------
/8/riemann/examplecom/etc/logstash.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.logstash
2 | (:require [riemann.logstash :refer :all]))
3 |
4 | (def logstash (async-queue! :logstash {:queue-size 1000}
5 | (logstash {:host "logstash" :port 2003 :port-size 20})))
6 |
--------------------------------------------------------------------------------
/8/riemann/riemann.config:
--------------------------------------------------------------------------------
1 | (logging/init {:file "/var/log/riemann/riemann.log"})
2 |
3 | (require 'riemann.client)
4 | (require '[examplecom.etc.email :refer :all])
5 | (require '[examplecom.etc.graphite :refer :all])
6 | (require '[examplecom.etc.collectd :refer :all])
7 |
8 | (let [host "0.0.0.0"]
9 | (repl-server {:host "127.0.0.1"})
10 | (tcp-server {:host host})
11 | (udp-server {:host host})
12 | (ws-server {:host host}))
13 |
14 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]})
15 |
16 | (let [index (index)
17 | downstream (batch 100 1/10
18 | (async-queue! :agg { :queue-size 1e3
19 | :core-pool-size 4
20 | :max-pool-size 32}
21 | (forward
22 | (riemann.client/tcp-client :host "riemannmc"))))]
23 |
24 | ; Inbound events will be passed to these streams:
25 | (streams
26 | (default :ttl 60
27 | ; Index all events immediately.
28 | (where (not (tagged "notification"))
29 | index)
30 |
31 | (tagged "collectd"
32 | (where (not (= (:plugin event) "docker"))
33 | (smap rewrite-service graph))
34 |
35 | (where (= (:plugin event) "docker")
36 | (smap (comp parse-docker-service-host docker-attributes rewrite-service) graph))
37 |
38 | (tagged "notification"
39 | (changed-state {:init "ok"}
40 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count$" "$1"]
41 | (email "james@example.com"))))
42 |
43 | (where (and (expired? event)
44 | (service #"^processes-.+\/ps_count\/processes"))
45 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count\/processes$" "$1"]
46 | (email "james@example.com"))))
47 |
48 | (where (service #"^riemann.*")
49 | graph
50 |
51 | downstream))))
52 |
--------------------------------------------------------------------------------
/9/collectd/statsd.conf:
--------------------------------------------------------------------------------
LoadPlugin statsd
<Plugin statsd>
  Host "localhost"
  Port "8125"
  TimerPercentile 90
  TimerPercentile 99
  TimerLower true
  TimerUpper true
  TimerSum true
  TimerCount true
</Plugin>
--------------------------------------------------------------------------------
/9/riemann/examplecom/etc/checks.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.checks
2 | (:require [riemann.config :refer :all]
3 | [clojure.tools.logging :refer :all]
4 | [riemann.streams :refer :all]))
5 |
6 | (defn set_state [warning critical]
7 | (fn [event]
8 | (assoc event :state
9 | (condp < (:metric event)
10 | critical "critical"
11 | warning "warning"
12 | "ok"))))
13 |
14 | (defn check_threshold [srv window func warning critical & children]
15 | (where (service srv)
16 | (fixed-time-window window
17 | (smap func
18 | (where (< warning metric)
19 | (smap (set_state warning critical)
20 | (fn [event]
21 | (call-rescue event children))))))))
22 |
23 | (defn check_percentiles [srv window & children]
24 | (where (service srv)
25 | (percentiles window [0.5 0.95 0.99 1]
26 | (fn [event]
27 | (call-rescue event children)))))
28 |
--------------------------------------------------------------------------------
/9/riemann/examplecom/etc/collectd.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.collectd
2 | (:require [clojure.tools.logging :refer :all]
3 | [riemann.streams :refer :all]
4 | [clojure.string :as str]
5 | [clojure.walk :as walk]))
6 |
7 | (defn docker-attribute-map
8 | [attributes]
9 | (let [instance (str/split (str/replace attributes #"^.*\[(.*)\]$" "$1") #",")]
10 | (walk/keywordize-keys (into {} (for [pair instance] (apply hash-map (str/split pair #"=")))))))
11 |
12 | (defn docker-attributes
13 | [{:keys [plugin_instance] :as event}]
14 | (if-let [attributes (re-find #"^.*\[.*\]$" plugin_instance)]
15 | (merge event (docker-attribute-map attributes))
16 | event))
17 |
18 | (defn parse-docker-service-host
19 | [{:keys [type type_instance plugin_instance] :as event}]
20 | (let [host (re-find #"^\w+\.?\w+\.?\w+" (:plugin_instance event))
21 | service (cond-> (str (:type event)) (:type_instance event) (str "." (:type_instance event)))]
22 | (assoc event :service service :host host)))
23 |
24 | (def default-services
25 | [{:service #"^load/load/(.*)$" :rewrite "load $1"}
26 | {:service #"^swap/percent-(.*)$" :rewrite "swap $1"}
27 | {:service #"^memory/percent-(.*)$" :rewrite "memory $1"}
28 | {:service #"^processes/ps_state-(.*)$" :rewrite "processes $1"}
29 | {:service #"^processes-(.*)/(.*)$" :rewrite "processes $1 $2"}
30 | {:service #"^cpu/percent-(.*)$" :rewrite "cpu $1"}
31 | {:service #"^df-(.*)/(df_complex|percent_bytes)-(.*)$" :rewrite "df $1 $2 $3"}
32 | {:service #"^interface-(.*)/if_(errors|packets|octets)/(tx|rx)$" :rewrite "nic $1 $3 $2"}
33 | {:service #"^protocols-(.*)/(.*)$" :rewrite "protocols $1 $2"}
34 | {:service #"^GenericJMX-(:?_|\/)?(.*)$" :rewrite "jmx $2"}
35 | {:service #"^haproxy\/(gauge|derive)-(.*)$" :rewrite "haproxy $2"}
36 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "$2"}
37 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "statsd $1 $2"}
38 | {:service #"^mysql-(.*)\/(counter|gauge)-(.*)$" :rewrite "mysql $1 $3"}
39 | {:service #"^dbi-(.*)\/(gauge|counter)-(.*)$" :rewrite "dbi $1 $3"}
40 | {:service #"^redis-(.*)$" :rewrite "redis $1"}])
41 |
42 | (defn rewrite-service-with
43 | [rules]
44 | (let [matcher (fn [s1 s2] (if (string? s1) (= s1 s2) (re-find s1 s2)))]
45 | (fn [{:keys [service] :as event}]
46 | (or
47 | (first
48 | (for [{:keys [rewrite] :as rule} rules
49 | :when (matcher (:service rule) service)]
50 | (assoc event :service
51 | (if (string? (:service rule))
52 | rewrite
53 | (str/replace service (:service rule) rewrite)))))
54 | event))))
55 |
56 | (def rewrite-service
57 | (rewrite-service-with default-services))
58 |
--------------------------------------------------------------------------------
/9/riemann/examplecom/etc/email.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.email
2 | (:require [riemann.email :refer :all]))
3 |
4 | (def email (mailer {:from "riemann@example.com"}))
5 |
--------------------------------------------------------------------------------
/9/riemann/examplecom/etc/graphite.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.graphite
2 | (:require [clojure.string :as str]
3 | [riemann.config :refer :all]
4 | [riemann.graphite :refer :all]))
5 |
6 | (defn graphite-path-statsd [event]
7 | (let [host (:host event)
8 | app (re-find #"^.*?\." (:service event))
9 | service (str/replace-first (:service event) #"^.*?\." "")
10 | split-host (if host (str/split host #"\.") [])
11 | split-service (if service (str/split service #" ") [])]
12 | (str app, (str/join "." (concat (reverse split-host) split-service)))))
13 |
14 | (defn add-environment-to-graphite [event]
15 | (condp = (:plugin event)
16 | "docker"
17 | (if (:com.example.application event)
18 | (str "productiona.docker.", (:com.example.application event), ".", (riemann.graphite/graphite-path-percentiles event))
19 | (str "productiona.docker.", (riemann.graphite/graphite-path-percentiles event)))
20 | "statsd" (str "productiona.", (graphite-path-statsd event))
21 | (str "productiona.hosts.", (riemann.graphite/graphite-path-percentiles event))))
22 |
23 | (def graph (async-queue! :graphite {:queue-size 1000}
24 | (graphite {:host "graphitea" :path add-environment-to-graphite})))
25 |
--------------------------------------------------------------------------------
/9/riemann/examplecom/etc/logstash.clj:
--------------------------------------------------------------------------------
1 | (ns examplecom.etc.logstash
2 | (:require [riemann.logstash :refer :all]))
3 |
4 | (def logstash (async-queue! :logstash {:queue-size 1000}
5 | (logstash {:host "logstash" :port 2003 :port-size 20})))
6 |
--------------------------------------------------------------------------------
/9/riemann/riemann.config:
--------------------------------------------------------------------------------
1 | (logging/init {:file "/var/log/riemann/riemann.log"})
2 |
3 | (require 'riemann.client)
4 | (require '[examplecom.etc.email :refer :all])
5 | (require '[examplecom.etc.graphite :refer :all])
6 | (require '[examplecom.etc.collectd :refer :all])
7 |
8 | (let [host "0.0.0.0"]
9 | (repl-server {:host "127.0.0.1"})
10 | (tcp-server {:host host})
11 | (udp-server {:host host})
12 | (ws-server {:host host}))
13 |
14 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]})
15 |
16 | (let [index (index)
17 | downstream (batch 100 1/10
18 | (async-queue! :agg { :queue-size 1e3
19 | :core-pool-size 4
20 | :max-pool-size 32}
21 | (forward
22 | (riemann.client/tcp-client :host "riemannmc"))))]
23 |
24 | ; Inbound events will be passed to these streams:
25 | (streams
26 | (default :ttl 60
27 | ; Index all events immediately.
28 | (where (not (tagged "notification"))
29 | index)
30 |
31 | (tagged "collectd"
32 | (where (not (= (:plugin event) "docker"))
33 | (smap rewrite-service graph))
34 |
35 | (where (= (:plugin event) "docker")
36 | (smap (comp parse-docker-service-host docker-attributes rewrite-service) graph))
37 |
38 | (tagged "notification"
39 | (changed-state {:init "ok"}
40 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count$" "$1"]
41 | (email "james@example.com"))))
42 |
43 | (where (and (expired? event)
44 | (service #"^processes-.+\/ps_count\/processes"))
45 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count\/processes$" "$1"]
46 | (email "james@example.com"))))
47 |
48 | (where (service #"^riemann.*")
49 | graph
50 |
51 | downstream))))
52 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # The Art of Monitoring code repository
2 |
3 | The source code to accompany the [The Art of
4 | Monitoring](http://artofmonitoring.com) book.
5 |
6 | Each directory contains the code for one or more chapters.
7 |
--------------------------------------------------------------------------------