├── 3 ├── collectd │ └── riemann.conf └── riemann │ ├── examplecom │ └── etc │ │ └── email.clj │ ├── riemann.config │ └── riemann.config_riemannmc ├── 4 ├── collectd │ ├── carbon.conf │ └── grafana.conf ├── graphite │ ├── carbon-cache-ubuntu.init │ ├── carbon-cache@.service │ ├── carbon-relay-ubuntu.init │ ├── carbon-relay@.service │ ├── carbon.conf │ ├── graphite-api.service │ ├── graphite-api.yaml │ ├── graphite-carbon.default │ ├── local_settings.py │ └── whisper-calculator.py └── riemann │ ├── examplecom │ └── etc │ │ ├── email.clj │ │ └── graphite.clj │ ├── riemann.config │ └── riemann.config_riemannmc ├── 7 ├── collectd │ ├── collectd.conf │ └── collectd.d │ │ ├── cpu.conf │ │ ├── df.conf │ │ ├── docker.conf │ │ ├── memory.conf │ │ ├── processes.conf │ │ ├── swap.conf │ │ └── write_riemann.conf └── riemann │ ├── examplecom │ └── etc │ │ ├── checks.clj │ │ ├── collectd.clj │ │ ├── email.clj │ │ └── graphite.clj │ └── riemann.config ├── 8 ├── collectd │ ├── elasticsearch.conf │ ├── elasticsearch_collectd.py │ ├── logstash.conf │ ├── logstash_jmx.conf │ └── rsyslogd.conf ├── logstash │ └── logstash.conf └── riemann │ ├── examplecom │ └── etc │ │ ├── checks.clj │ │ ├── collectd.clj │ │ ├── email.clj │ │ ├── graphite.clj │ │ └── logstash.clj │ └── riemann.config ├── 9 ├── collectd │ └── statsd.conf └── riemann │ ├── examplecom │ └── etc │ │ ├── checks.clj │ │ ├── collectd.clj │ │ ├── email.clj │ │ ├── graphite.clj │ │ └── logstash.clj │ └── riemann.config ├── 10 ├── grafana │ └── riemann.js └── riemann │ ├── examplecom │ └── etc │ │ ├── checks.clj │ │ ├── collectd.clj │ │ ├── count-notifications.clj │ │ ├── email.clj │ │ ├── graphite.clj │ │ ├── logstash.clj │ │ ├── maintenance.clj │ │ ├── pagerduty.clj │ │ └── slack.clj │ └── riemann.config ├── .gitignore ├── 11-13 ├── collectd │ ├── mysql.conf │ └── tornado-api.conf ├── grafana │ └── tornado-dashboard.json ├── logstash │ ├── logstash.conf │ └── patterns │ │ ├── nginx │ │ └── tornadoapi ├── riemann │ ├── 
examplecom │ │ ├── app │ │ │ └── tornado.clj │ │ └── etc │ │ │ ├── checks.clj │ │ │ ├── collectd.clj │ │ │ ├── count-notifications.clj │ │ │ ├── email.clj │ │ │ ├── graphite.clj │ │ │ ├── logstash.clj │ │ │ ├── maintenance.clj │ │ │ ├── pagerduty.clj │ │ │ └── slack.clj │ └── riemann.config └── rsyslog │ └── 35-aom-clojure-rest.conf ├── 5-6 ├── collectd │ ├── collectd.conf │ └── collectd.d │ │ ├── carbon.conf │ │ ├── cpu.conf │ │ ├── df.conf │ │ ├── memory.conf │ │ ├── processes.conf │ │ ├── swap.conf │ │ └── write_riemann.conf └── riemann │ ├── examplecom │ └── etc │ │ ├── checks.clj │ │ ├── collectd.clj │ │ ├── email.clj │ │ └── graphite.clj │ └── riemann.config ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | /.config 4 | /coverage/ 5 | /InstalledFiles 6 | /pkg/ 7 | /spec/reports/ 8 | /test/tmp/ 9 | /test/version_tmp/ 10 | /tmp/ 11 | 12 | ## Specific to RubyMotion: 13 | .dat* 14 | .repl_history 15 | build/ 16 | 17 | ## Documentation cache and generated files: 18 | /.yardoc/ 19 | /_yardoc/ 20 | /doc/ 21 | /rdoc/ 22 | 23 | ## Environment normalisation: 24 | /.bundle/ 25 | /lib/bundler/man/ 26 | 27 | # for a library or gem, you might want to ignore these files since the code is 28 | # intended to run in multiple environments; otherwise, check them in: 29 | # Gemfile.lock 30 | # .ruby-version 31 | # .ruby-gemset 32 | 33 | # unless supporting rvm < 1.11.0 or doing something fancy, ignore this: 34 | .rvmrc 35 | -------------------------------------------------------------------------------- /10/grafana/riemann.js: -------------------------------------------------------------------------------- 1 | /* Original dashboard code modified from: https://github.com/bimlendu/GrafanaScriptedDashboards /* 2 | /* Thanks to Bimlendu Mishra for developing the original! 
/* 3 | 4 | /*global XMLHttpRequest: false */ 5 | 6 | var window, document, ARGS, $, jQuery, moment, kbn; 7 | var graphite = 'http://graphitea.example.com:8888'; 8 | 9 | // Specify defaults for URL arguments 10 | var arg_host = 'graphitea'; 11 | var arg_span = 4; 12 | var arg_from = '6h'; 13 | var arg_env = 'productiona'; 14 | var arg_stack = 'hosts'; 15 | 16 | if (!_.isUndefined(ARGS.span)) { 17 | arg_span = ARGS.span; // graph width 18 | } 19 | if (!_.isUndefined(ARGS.from)) { 20 | arg_from = ARGS.from; // show data from 'x' hours until now 21 | } 22 | if (!_.isUndefined(ARGS.host)) { 23 | arg_host = ARGS.host; // host name 24 | } 25 | if (!_.isUndefined(ARGS.env)) { 26 | arg_env = ARGS.env; // environment 27 | } 28 | if (!_.isUndefined(ARGS.stack)) { 29 | arg_stack = ARGS.stack; // stack (hosts or docker) 30 | } 31 | 32 | // Execute graphite-api /metrics/find query. Returns array of metric last names ( func('test.cpu-*') returns ['cpu-0','cpu-1',..] ) 33 | function find_filter_values(query) { 34 | var search_url = graphite + '/metrics/find/?query=' + query; 35 | var res = []; 36 | var req = new XMLHttpRequest(); 37 | req.open('GET', search_url, false); 38 | req.send(null); 39 | var obj = JSON.parse(req.responseText); 40 | var key; 41 | for (key in obj) { 42 | if (obj.hasOwnProperty(key)) { 43 | if (obj[key].hasOwnProperty("text")) { 44 | res.push(obj[key].text); 45 | } 46 | } 47 | } 48 | return res; 49 | } 50 | 51 | // Return dashboard filter_list. Optionally include 'All' 52 | function get_filter_object(name, query, show_all) { 53 | show_all = (show_all === undefined) ? 
true : show_all; 54 | var arr = find_filter_values(query); 55 | var opts = []; 56 | var i; 57 | for (i in arr) { 58 | if (arr.hasOwnProperty(i)) { 59 | opts.push({"text": arr[i], "value": arr[i]}); 60 | } 61 | } 62 | if (show_all === true) { 63 | opts.unshift({"text": "All", "value": '{' + arr.join() + '}'}); 64 | } 65 | return { 66 | type: "filter", 67 | name: name, 68 | query: query, 69 | options: opts, 70 | current: opts[0], 71 | includeAll: show_all 72 | }; 73 | } 74 | 75 | /* 76 | Panel templates 77 | */ 78 | 79 | function panel_cpu(title, prefix) { 80 | return { 81 | title: title, 82 | type: 'graphite', 83 | span: arg_span, 84 | renderer: "flot", 85 | y_formats: ["none"], 86 | grid: {max: null, min: 0}, 87 | lines: true, 88 | fill: 2, 89 | linewidth: 1, 90 | tooltip: { 91 | value_type: 'individual', 92 | shared: true 93 | }, 94 | stack: true, 95 | legend: {show: true}, 96 | percentage: true, 97 | nullPointMode: "null", 98 | targets: [ 99 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.wait,4)" }, 100 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.user,4)" }, 101 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.system,4)" }, 102 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.steal,4)" }, 103 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.interrupt,4)" }, 104 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.nice,4)" }, 105 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.idle,4)" }, 106 | { "target": "aliasByNode(" + prefix + "[[host]].cpu.softirq,4)" } 107 | ], 108 | aliasColors: { 109 | "user": "#508642", 110 | "system": "#EAB839", 111 | "wait": "#890F02", 112 | "steal": "#E24D42", 113 | "idle": "#6ED0E0", 114 | "nice": "#629E51", 115 | "irq": "#1F78C1", 116 | "intrpt": "#EF843C" 117 | } 118 | }; 119 | } 120 | 121 | function panel_memory(title, prefix) { 122 | return { 123 | title: title, 124 | type: 'graphite', 125 | span: arg_span, 126 | y_formats: ["none"], 127 | grid: {max: null, min: 0}, 128 | lines: true, 129 | 
fill: 2, 130 | linewidth: 1, 131 | stack: true, 132 | tooltip: { 133 | value_type: 'individual', 134 | shared: true 135 | }, 136 | nullPointMode: "null", 137 | targets: [ 138 | { "target": "aliasByNode(" + prefix + "[[host]].memory.used,4)" } 139 | ], 140 | aliasColors: { 141 | "used": "#ff6666", 142 | } 143 | }; 144 | } 145 | 146 | function panel_loadavg(title, prefix) { 147 | return { 148 | title: title, 149 | type: 'graphite', 150 | span: arg_span, 151 | y_formats: ["none"], 152 | grid: {max: null, min: 0}, 153 | lines: true, 154 | fill: 2, 155 | linewidth: 1, 156 | tooltip: { 157 | value_type: 'individual', 158 | shared: true 159 | }, 160 | stack : true, 161 | nullPointMode: "null", 162 | targets: [ 163 | { "target": "aliasByNode(" + prefix + "[[host]].load.*,4)" } 164 | ], 165 | aliasColors: { 166 | "midterm": "#629E51", 167 | "shortterm": "#1F78C1", 168 | "longterm": "#EF843C" 169 | } 170 | }; 171 | } 172 | 173 | function panel_swap_size(title, prefix) { 174 | return { 175 | title: title, 176 | type: 'graphite', 177 | span: arg_span, 178 | y_formats: ["none"], 179 | grid: {max: null, min: 0, leftMin: 0}, 180 | lines: true, 181 | fill: 2, 182 | linewidth: 1, 183 | tooltip: { 184 | value_type: 'individual', 185 | shared: true 186 | }, 187 | stack: true, 188 | nullPointMode: "null", 189 | percentage: true, 190 | targets: [ 191 | { "target": "aliasByNode(" + prefix + "[[host]].swap.{free,used,cached},4)" }, 192 | ], 193 | aliasColors: { 194 | "used": "#ff6666", 195 | "cached": "#EAB839", 196 | "free": "#66b266" 197 | } 198 | }; 199 | } 200 | 201 | function panel_disk_space(title, prefix) { 202 | return { 203 | title: title, 204 | type: 'graphite', 205 | span: arg_span, 206 | y_formats: ["none"], 207 | grid: {max: null, min: 0, leftMin: 0}, 208 | lines: true, 209 | fill: 2, 210 | linewidth: 1, 211 | tooltip: { 212 | value_type: 'individual', 213 | shared: true 214 | }, 215 | stack: true, 216 | nullPointMode: "null", 217 | targets: [ 218 | { "target": 
"aliasByNode(" + prefix + "[[host]]." + "df.root.percent_bytes.used,6)" }, 219 | ], 220 | aliasColors: { 221 | "used": "#e32636" 222 | } 223 | }; 224 | } 225 | 226 | /* 227 | Row templates 228 | */ 229 | 230 | function row_delimiter(title) { 231 | return { 232 | title: "_____ " + title, 233 | height: "20px", 234 | collapse: false, 235 | editable: false, 236 | collapsable: false, 237 | panels: [{ 238 | title: title, 239 | editable: false, 240 | span: 12, 241 | type: "text", 242 | mode: "text" 243 | }] 244 | }; 245 | } 246 | 247 | function row_cpu_memory(title, prefix) { 248 | return { 249 | title: title, 250 | height: '250px', 251 | collapse: false, 252 | panels: [ 253 | panel_cpu('CPU %', prefix), 254 | panel_memory('Memory', prefix), 255 | panel_loadavg('Load avg', prefix) 256 | ] 257 | }; 258 | } 259 | 260 | function row_swap_disk(title, prefix) { 261 | return { 262 | title: title, 263 | height: '250px', 264 | collapse: false, 265 | panels: [ 266 | panel_swap_size('Swap size', prefix), 267 | panel_disk_space('Disk Space on root', prefix) 268 | ] 269 | }; 270 | } 271 | 272 | /*jslint unparam: true, node: true */ 273 | return function(callback) { 274 | 275 | // Setup some variables 276 | var dashboard; 277 | 278 | var prefix = arg_env + '.' 
+ arg_stack + '.'; 279 | 280 | var arg_filter = prefix + arg_host; 281 | 282 | // Set filter 283 | 284 | var dashboard_filter = { 285 | time: { 286 | from: "now-" + arg_from, 287 | to: "now" 288 | }, 289 | list: [ 290 | get_filter_object("host", arg_filter, false) 291 | ] 292 | }; 293 | 294 | // Define pulldowns 295 | 296 | var pulldowns = [ 297 | { 298 | type: "filtering", 299 | collapse: false, 300 | notice: false, 301 | enable: true 302 | }, 303 | { 304 | type: "annotations", 305 | enable: false 306 | } 307 | ]; 308 | 309 | // Initialize a skeleton with nothing but a rows array and service object 310 | 311 | dashboard = { 312 | rows : [], 313 | services : {} 314 | }; 315 | dashboard.title = prefix + arg_host; 316 | dashboard.editable = false; 317 | dashboard.pulldowns = pulldowns; 318 | dashboard.services.filter = dashboard_filter; 319 | 320 | $.ajax({ 321 | method: 'GET', 322 | url: '/' 323 | }) 324 | .done(function (result) { 325 | 326 | // Construct dashboard rows 327 | 328 | dashboard.rows.push( 329 | row_cpu_memory('CPU, Memory, Load', prefix), 330 | row_swap_disk('Swap, Disk Space', prefix) 331 | ); 332 | 333 | callback(dashboard); 334 | }); 335 | } 336 | -------------------------------------------------------------------------------- /10/riemann/examplecom/etc/checks.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.checks 2 | (:require [riemann.config :refer :all] 3 | [clojure.tools.logging :refer :all] 4 | [riemann.streams :refer :all])) 5 | 6 | (defn set_state [warning critical] 7 | (fn [event] 8 | (assoc event :state 9 | (condp < (:metric event) 10 | critical "critical" 11 | warning "warning" 12 | "ok")))) 13 | 14 | (defn check_threshold [srv window func warning critical & children] 15 | (where (service srv) 16 | (fixed-time-window window 17 | (smap func 18 | (where (< warning metric) 19 | (smap (set_state warning critical) 20 | (fn [event] 21 | (call-rescue event children)))))))) 22 | 23 | 
(defn check_percentiles
  "Computes the 0.5/0.95/0.99/max percentiles of service `srv` over `window`
  seconds and passes each percentile event to `children`."
  [srv window & children]
  (where (service srv)
    (percentiles window [0.5 0.95 0.99 1]
      (fn [event]
        (call-rescue event children)))))
-------------------------------------------------------------------------------- /10/riemann/examplecom/etc/collectd.clj: --------------------------------------------------------------------------------
(ns examplecom.etc.collectd
  (:require [clojure.tools.logging :refer :all]
            [riemann.streams :refer :all]
            [clojure.string :as str]
            [clojure.walk :as walk]))

(defn docker-attribute-map
  "Parses a '[k=v,k=v]' suffix into a map with keyword keys."
  [attributes]
  (let [instance (str/split (str/replace attributes #"^.*\[(.*)\]$" "$1") #",")]
    (walk/keywordize-keys (into {} (for [pair instance] (apply hash-map (str/split pair #"=")))))))

(defn docker-attributes
  "Merges any '[...]' attributes found in :plugin_instance into the event;
  returns the event unchanged when there are none."
  [{:keys [plugin_instance] :as event}]
  (if-let [attributes (re-find #"^.*\[.*\]$" plugin_instance)]
    (merge event (docker-attribute-map attributes))
    event))

(defn parse-docker-service-host
  "Derives :host (leading dotted name in :plugin_instance) and :service
  (:type, optionally suffixed with .:type_instance) for Docker events."
  [{:keys [type type_instance plugin_instance] :as event}]
  (let [host (re-find #"^\w+\.?\w+\.?\w+" plugin_instance)
        service (cond-> (str type) type_instance (str "." type_instance))]
    (assoc event :service service :host host)))

;; NOTE(review): identical to docker-attribute-map above — kept because both
;; names are public; candidates for consolidation.
(defn plugin-map
  "Parses labels from collectd plugin_instance"
  [plugin_instance]
  (let [instance (str/split (str/replace plugin_instance #"^.*\[(.*)\]$" "$1") #",")]
    (walk/keywordize-keys (into {} (for [pair instance] (apply hash-map (str/split pair #"=")))))))

(defn parse-docker
  "Parses Docker events"
  ;; FIX: the docstring used to sit *after* the argument vector, where it was
  ;; a discarded body expression rather than documentation.
  [& children]
  (fn [event]
    (let [host (re-find #"^\w+\.?\w+\.?\w+" (:plugin_instance event))
          service (cond-> (str (:type event)) (:type_instance event) (str "." (:type_instance event)))
          event (assoc event :service service :host host)
          event (merge event (plugin-map (:plugin_instance event)))]
      (call-rescue event children))))

;; Service-name rewrite rules: first matching rule wins (see rewrite-service-with).
;; NOTE(review): the second statsd rule can never fire because the first statsd
;; rule matches the same pattern — presumably one of them should be removed;
;; left as-is pending confirmation. The GenericJMX group `(:?_|\/)` is unusual
;; but self-consistent: `(.*)` is group 2, matching the "$2" rewrite.
(def default-services
  [{:service #"^load/load/(.*)$" :rewrite "load $1"}
   {:service #"^swap/percent-(.*)$" :rewrite "swap $1"}
   {:service #"^memory/percent-(.*)$" :rewrite "memory $1"}
   {:service #"^processes/ps_state-(.*)$" :rewrite "processes $1"}
   {:service #"^processes-(.*)/(.*)$" :rewrite "processes $1 $2"}
   {:service #"^cpu/percent-(.*)$" :rewrite "cpu $1"}
   {:service #"^df-(.*)/(df_complex|percent_bytes)-(.*)$" :rewrite "df $1 $2 $3"}
   {:service #"^interface-(.*)/if_(errors|packets|octets)/(tx|rx)$" :rewrite "nic $1 $3 $2"}
   {:service #"^protocols-(.*)/(.*)$" :rewrite "protocols $1 $2"}
   {:service #"^GenericJMX-(:?_|\/)?(.*)$" :rewrite "jmx $2"}
   {:service #"^haproxy\/(gauge|derive)-(.*)$" :rewrite "haproxy $2"}
   {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "$2"}
   {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "statsd $1 $2"}
   {:service #"^mysql-(.*)\/(counter|gauge)-(.*)$" :rewrite "mysql $1 $3"}
   {:service #"^dbi-(.*)\/(gauge|counter)-(.*)$" :rewrite "dbi $1 $3"}
   {:service #"^redis-(.*)$" :rewrite "redis $1"}])

(defn rewrite-service-with
  "Returns a function of an event that rewrites :service using the first
  matching rule in `rules` (string rules match exactly, regex rules via
  re-find); events with no matching rule pass through unchanged."
  [rules]
  (let [matcher (fn [s1 s2] (if (string? s1) (= s1 s2) (re-find s1 s2)))]
    (fn [{:keys [service] :as event}]
      (or
        (first
          (for [{:keys [rewrite] :as rule} rules
                :when (matcher (:service rule) service)]
            (assoc event :service
                   (if (string?
(:service rule))
                     ;; string rule: replace the whole service name
                     rewrite
                     ;; regex rule: substitute capture groups ($1, $2, ...)
                     (str/replace service (:service rule) rewrite)))))
        event))))

;; Pre-built rewriter over the default collectd rule set.
(def rewrite-service
  (rewrite-service-with default-services))
-------------------------------------------------------------------------------- /10/riemann/examplecom/etc/count-notifications.clj: --------------------------------------------------------------------------------
(ns examplecom.etc.count-notifications
  (:require [riemann.streams :refer :all]))

(defn count-notifications
  "Count notifications"
  ;; Renames the service to '<service>.rate', tags it 'notification-rate',
  ;; and emits a per-5-second rate to `children`.
  [& children]
  (adjust [:service #(str % ".rate")]
    (tag "notification-rate"
      (rate 5
        (fn [event]
          (call-rescue event children))))))
-------------------------------------------------------------------------------- /10/riemann/examplecom/etc/email.clj: --------------------------------------------------------------------------------
(ns examplecom.etc.email
  (:require [clojure.string :as str]
            [riemann.email :refer :all]))

(defn format-subject
  "Format the email subject"
  ;; NOTE(review): with multiple events, `apply` spreads every :host as an
  ;; extra format argument; java.util.Formatter silently ignores extras, so
  ;; only the first host appears in the subject — confirm this is intended.
  [events]
  (apply format "Service %s is in state %s on host %s" (str/join ", " (map :service events)) (str/join ", " (map :state events)) (map :host events)))

(def header "Monitoring notification from Riemann!\n\n")
(def footer "This is an automated Riemann notification. Please do not reply.")

(defn lookup
  "Lookup events in the index"
  ;; Reaches into the running core's index; only meaningful inside a loaded
  ;; Riemann config.
  [host service]
  (riemann.index/lookup (:index @riemann.config/core) host service))

(defn round
  "Round numbers to 2 decimal places"
  [metric]
  (clojure.pprint/cl-format nil "~,2f" metric))

;; Converts bytes to (floating-point) gigabytes.
(defn byte-to-gb [bytes] (/ bytes (* 1024.0 1024.0 1024.0)))

(defn context
  "Add some contextual event data"
  ;; Builds a plain-text host summary (CPU, memory, root disk) by looking up
  ;; related collectd metrics in the index, plus a Grafana dashboard link.
  ;; NOTE(review): assumes all of these index lookups succeed; a missing
  ;; metric would make (:metric nil) nil and break the arithmetic — confirm.
  [event]
  (str
    "Host context:\n"
    " CPU Utilization:\t"(round (+ (:metric (lookup (:host event) "cpu/percent-system")) (:metric (lookup (:host event) "cpu/percent-user")))) "%\n"
    " Memory Used:\t"(round (:metric (lookup (:host event) "memory/percent-used"))) "%\n"
    " Disk(root) %:\t\t"(round (:metric (lookup (:host event) "df-root/percent_bytes-used"))) "% used "
    " ("(round (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-used")))) " GB used of "
    (round (+ (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-used")))
              (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-free")))
              (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-reserved"))))) "GB)\n\n"
    "Grafana Dashboard:\n\n"
    " http://graphitea.example.com:3000/dashboard/script/riemann.js?host="(:host event)"\n\n"))

(defn format-body
  "Format the email body"
  ;; One block per event, separated by blank lines; each block is the header,
  ;; the event's fields, the host context and the footer.
  [events]
  (str/join "\n\n\n"
            (map
             (fn [event]
               (str
                header
                "Time:\t\t" (riemann.common/time-at (:time event)) "\n"
                "Host:\t\t" (:host event) "\n"
                "Service:\t\t" (:service event) "\n"
                "State:\t\t" (:state event) "\n"
                ;; Ratios render as fractions (e.g. 1/3); convert to double.
                "Metric:\t\t" (if (ratio? (:metric event))
                                (double (:metric event))
                                (:metric event)) "\n"
                "Tags:\t\t[" (str/join ", " (:tags event)) "] \n"
                "\n"
                "Description:\t\t" (:description event)
                "\n\n"
                (context event)
                footer))
             events)))

;; Email notifier used by the Riemann config: (email "someone@example.com").
(def email (mailer {:from "riemann@example.com"
                    :subject (fn [events] (format-subject events))
                    :body (fn [events] (format-body events))
                    }))
-------------------------------------------------------------------------------- /10/riemann/examplecom/etc/graphite.clj: --------------------------------------------------------------------------------
(ns examplecom.etc.graphite
  (:require [clojure.string :as str]
            [riemann.config :refer :all]
            [riemann.graphite :refer :all]))

;; Builds a Graphite path for statsd events: the leading 'app.' component of
;; the service, then the host components reversed, then the remaining
;; space-separated service words joined with dots.
(defn graphite-path-statsd [event]
  (let [host (:host event)
        app (re-find #"^.*?\." (:service event))
        service (str/replace-first (:service event) #"^.*?\." "")
        split-host (if host (str/split host #"\.") [])
        split-service (if service (str/split service #" ") [])]
    (str app, (str/join "." (concat (reverse split-host) split-service)))))

;; Prefixes Graphite paths with the environment and source: docker events go
;; under productiona.docker (optionally namespaced by the
;; com.example.application attribute), statsd events under productiona., and
;; everything else under productiona.hosts.
(defn add-environment-to-graphite [event]
  (condp = (:plugin event)
    "docker"
    (if (:com.example.application event)
      (str "productiona.docker.", (:com.example.application event), ".", (riemann.graphite/graphite-path-percentiles event))
      (str "productiona.docker.", (riemann.graphite/graphite-path-percentiles event)))
    "statsd" (str "productiona.", (graphite-path-statsd event))
    (str "productiona.hosts.", (riemann.graphite/graphite-path-percentiles event))))

;; Async, queued Graphite writer stream.
(def graph (async-queue!
:graphite {:queue-size 1000}
             (graphite {:host "graphitea" :path add-environment-to-graphite})))
-------------------------------------------------------------------------------- /10/riemann/examplecom/etc/logstash.clj: --------------------------------------------------------------------------------
(ns examplecom.etc.logstash
  (:require [riemann.logstash :refer :all]))

;; Async, queued Logstash writer stream.
;; NOTE(review): `:port-size` looks unusual — verify against the
;; riemann.logstash option name (possibly :pool-size).
(def logstash (async-queue! :logstash {:queue-size 1000}
                (logstash {:host "logstash" :port 2003 :port-size 20})))
-------------------------------------------------------------------------------- /10/riemann/examplecom/etc/maintenance.clj: --------------------------------------------------------------------------------
(ns examplecom.etc.maintenance
  (:require [riemann.streams :refer :all]))

(defn maintenance-mode?
  "Is it currently in maintenance mode?"
  ;; Searches the index for an active maintenance-mode event matching this
  ;; event's host and service; true when one exists with :state "active".
  [event]
  (->> '(and (= host (:host event))
             (= service (:service event))
             (= (:type event) "maintenance-mode"))
       (riemann.index/search (:index @core))
       first
       :state
       (= "active")))
-------------------------------------------------------------------------------- /10/riemann/examplecom/etc/pagerduty.clj: --------------------------------------------------------------------------------
(ns examplecom.etc.pagerduty
  (:require [riemann.pagerduty :refer :all]
            [riemann.streams :refer :all]))

(defn pd-format
  "Builds the PagerDuty payload: incident key, a one-line description and the
  full event (plus a Grafana link) as details."
  [event]
  (let [host (:host event)
        service (:service event)]
    {:incident_key (str host " " service)
     :description (str "Host: " host " "
                       service " is "
                       (:state event) " ("
                       (:metric event) ")")
     :details (assoc event :graphs (str "http://graphitea.example.com:3000/dashboard/script/riemann.js?host="(:host event)))}))

;; PagerDuty client configured with our service key and formatter.
(def pd (pagerduty { :service-key "123ABC123" :formatter pd-format}))

(defn page
  "Pages on state changes: resolves the incident when the event returns to
  ok, triggers it otherwise."
  []
  (changed-state {:init "ok"}
    (where (state "ok")
      (:resolve pd)
      (else (:trigger pd)))))
-------------------------------------------------------------------------------- /10/riemann/examplecom/etc/slack.clj: --------------------------------------------------------------------------------
(ns examplecom.etc.slack
  (:require [riemann.slack :refer :all]))

;; Slack team + API token used by slacker below.
(def credentials {:account "examplecom", :token "123ABC123ABC"})

(defn slack-format
  "Format our Slack message"
  [event]
  (str "Service " (:service event) " on host " (:host event) " is in state " (:state event) ".\n"
       "See http://graphitea.example.com:3000/dashboard/script/riemann.js?host="(:host event) ))

(defn slacker
  "Send notifications to Slack"
  ;; Optional :recipient keyword argument; defaults to #monitoring.
  [& {:keys [recipient]
      :or {recipient "#monitoring"}}]
  (slack credentials {:username "Riemann bot"
                      :channel recipient
                      :formatter (fn [e] { :text (slack-format e) } )
                      :icon ":smile:"}))
-------------------------------------------------------------------------------- /10/riemann/riemann.config: --------------------------------------------------------------------------------
(logging/init {:file "/var/log/riemann/riemann.log"})

(require 'riemann.client)
(require '[examplecom.etc.email :refer :all])
(require '[examplecom.etc.graphite :refer :all])
(require '[examplecom.etc.collectd :refer :all])
;; FIX: the streams below call maintenance-mode?, which lives in
;; examplecom.etc.maintenance and was never required.
(require '[examplecom.etc.maintenance :refer :all])

;; Listen on all interfaces (REPL restricted to localhost).
(let [host "0.0.0.0"]
  (repl-server {:host "127.0.0.1"})
  (tcp-server {:host host})
  (udp-server {:host host})
  (ws-server {:host host}))

;; Expire indexed events every 10s, preserving these keys on expiry.
(periodically-expire 10 {:keep-keys [:host :service :tags :state :description :metric]})

(let [index (index)
      ;; Batch and forward events to the riemannmc aggregation server via an
      ;; async queue so downstream slowness cannot block local streams.
      downstream (batch 100 1/10
                   (async-queue! :agg {:queue-size 1e3
                                       :core-pool-size 4
                                       :max-pool-size 32}
                     (forward
                       (riemann.client/tcp-client :host "riemannmc"))))]

  ; Inbound events will be passed to these streams:
  (streams
    (default :ttl 60
      ; Index all events immediately.
28 | (where (not (tagged "notification")) 29 | index) 30 | 31 | (tagged "collectd" 32 | (where (not (= (:plugin event) "docker")) 33 | (smap rewrite-service graph)) 34 | 35 | (where (= (:plugin event) "docker") 36 | (smap (comp parse-docker-service-host docker-attributes rewrite-service) graph)) 37 | 38 | (tagged "notification" 39 | (where (not (maintenance-mode? event)) 40 | (changed-state {:init "ok"} 41 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count$" "$1"] 42 | (email "james@example.com"))))) 43 | 44 | (where (and (expired? event) 45 | (service #"^processes-.+\/ps_count\/processes")) 46 | (not (maintenance-mode? event)) 47 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count\/processes$" "$1"] 48 | (email "james@example.com")))) 49 | 50 | (where (service #"^riemann.*") 51 | graph 52 | 53 | downstream)))) 54 | -------------------------------------------------------------------------------- /11-13/collectd/mysql.conf: -------------------------------------------------------------------------------- 1 | 2 | Globals true 3 | 4 | 5 | ModulePath "/usr/lib/collectd/mysql/" 6 | 7 | 8 | 9 | Import mysql 10 | 11 | Host "localhost" 12 | Port 3306 13 | User "collectd" 14 | Password "collectd" 15 | 16 | 17 | 18 | LoadPlugin processes 19 | 20 | Process "mysqld" 21 | 22 | 23 | LoadPlugin dbi 24 | 25 | 26 | Statement "SELECT COUNT(*) AS value FROM items;" 27 | MinVersion 50000 28 | 29 | Type "gauge" 30 | InstancePrefix "tornado_item_count" 31 | ValuesFrom "value" 32 | 33 | 34 | 35 | Statement "SELECT SUM(price) AS total_price FROM items" 36 | MinVersion 50000 37 | 38 | Type "gauge" 39 | InstancePrefix "item_sold_total_price" 40 | ValuesFrom "total_price" 41 | 42 | 43 | 44 | Statement "SELECT MAX(thread_id), timer_wait/1000000000 AS exec_time_ms 45 | FROM events_statements_history_long 46 | WHERE digest_text = 'INSERT INTO `items` ( `title` , TEXT , `price` , `id` ) VALUES (...)';" 47 | MinVersion 50000 48 | 49 | Type "gauge" 50 | 
InstancePrefix "insert_query_time" 51 | ValuesFrom "exec_time_ms" 52 | 53 | 54 | 55 | Driver "mysql" 56 | DriverOption "host" "localhost" 57 | DriverOption "username" "collectd" 58 | DriverOption "password" "collectd" 59 | DriverOption "dbname" "items" 60 | SelectDB "items" 61 | Query "get_item_count" 62 | Query "item_sold_total_price" 63 | 64 | 65 | Driver "mysql" 66 | DriverOption "host" "localhost" 67 | DriverOption "username" "collectd" 68 | DriverOption "password" "collectd" 69 | DriverOption "dbname" "performance_schema" 70 | Query "insert_query_time" 71 | 72 | 73 | -------------------------------------------------------------------------------- /11-13/collectd/tornado-api.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin processes 2 | 3 | ProcessMatch "tornado-api" "-jar tornado-api" 4 | 5 | 6 | LoadPlugin java 7 | 8 | JVMARG "-Djava.class.path=/usr/share/collectd/java/collectd-api.jar:/usr/share/collectd/java/generic-jmx.jar" 9 | LoadPlugin "org.collectd.java.GenericJMX" 10 | 11 | 12 | ObjectName "java.lang:type=GarbageCollector,*" 13 | InstancePrefix "gc-" 14 | InstanceFrom "name" 15 | 16 | Type "derive" 17 | Table false 18 | Attribute "CollectionCount" 19 | InstancePrefix "count" 20 | 21 | 22 | 23 | ObjectName "java.lang:type=GarbageCollector,*" 24 | InstancePrefix "gc-" 25 | InstanceFrom "name" 26 | 27 | Type "derive" 28 | Table false 29 | Attribute "CollectionTime" 30 | InstancePrefix "time" 31 | 32 | 33 | 34 | ObjectName "java.lang:type=MemoryPool,*" 35 | InstancePrefix "memory_pool-" 36 | InstanceFrom "name" 37 | 38 | Type "memory" 39 | Table true 40 | Attribute "Usage" 41 | 42 | 43 | 44 | ObjectName "java.lang:type=Memory" 45 | InstancePrefix "memory-heap" 46 | 47 | Type "memory" 48 | Table true 49 | Attribute "HeapMemoryUsage" 50 | 51 | 52 | 53 | ObjectName "java.lang:type=Memory" 54 | InstancePrefix "memory-nonheap" 55 | 56 | Type "memory" 57 | Table true 58 | Attribute "NonHeapMemoryUsage" 59 | 
60 | 61 | 62 | ObjectName "java.lang:type=Threading" 63 | InstancePrefix "threading" 64 | 65 | Type "gauge" 66 | Table false 67 | Attribute "ThreadCount" 68 | InstancePrefix "count" 69 | 70 | 71 | 72 | ObjectName "java.lang:type=Threading" 73 | InstancePrefix "threading" 74 | 75 | Type "gauge" 76 | Table false 77 | Attribute "DaemonThreadCount" 78 | InstancePrefix "count-daemon" 79 | 80 | 81 | 82 | ServiceURL "service:jmx:rmi:///jndi/rmi://localhost:8855/jmxrmi" 83 | Collect "memory_pool" 84 | Collect "memory-heap" 85 | Collect "memory-nonheap" 86 | Collect "gc-count" 87 | Collect "gc-time" 88 | Collect "thread" 89 | Collect "thread-daemon" 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /11-13/grafana/tornado-dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": 6, 3 | "title": "Tornado", 4 | "originalTitle": "Tornado", 5 | "tags": [], 6 | "style": "dark", 7 | "timezone": "browser", 8 | "editable": true, 9 | "hideControls": false, 10 | "sharedCrosshair": false, 11 | "rows": [ 12 | { 13 | "collapse": false, 14 | "editable": true, 15 | "height": "250px", 16 | "panels": [ 17 | { 18 | "cacheTimeout": null, 19 | "colorBackground": false, 20 | "colorValue": false, 21 | "colors": [ 22 | "rgba(50, 172, 45, 0.97)", 23 | "rgba(237, 129, 40, 0.89)", 24 | "rgba(245, 54, 54, 0.9)" 25 | ], 26 | "datasource": null, 27 | "editable": true, 28 | "error": false, 29 | "format": "none", 30 | "id": 19, 31 | "interval": null, 32 | "isNew": true, 33 | "links": [], 34 | "maxDataPoints": 100, 35 | "nullPointMode": "connected", 36 | "nullText": null, 37 | "postfix": "", 38 | "postfixFontSize": "50%", 39 | "prefix": "", 40 | "prefixFontSize": "50%", 41 | "span": 6, 42 | "sparkline": { 43 | "fillColor": "rgba(31, 118, 189, 0.18)", 44 | "full": true, 45 | "lineColor": "rgb(31, 120, 193)", 46 | "show": true 47 | }, 48 | "targets": [ 49 | { 50 | "refId": "A", 51 | "target": 
"productiona.hosts.tornado-db.dbi.items.tornado_item_count", 52 | "textEditor": true 53 | } 54 | ], 55 | "thresholds": "", 56 | "title": "Item Count", 57 | "type": "singlestat", 58 | "valueFontSize": "80%", 59 | "valueMaps": [ 60 | { 61 | "op": "=", 62 | "text": "N/A", 63 | "value": "null" 64 | } 65 | ], 66 | "valueName": "avg" 67 | }, 68 | { 69 | "cacheTimeout": null, 70 | "colorBackground": false, 71 | "colorValue": false, 72 | "colors": [ 73 | "rgba(50, 172, 45, 0.97)", 74 | "rgba(237, 129, 40, 0.89)", 75 | "rgba(245, 54, 54, 0.9)" 76 | ], 77 | "datasource": null, 78 | "editable": true, 79 | "error": false, 80 | "format": "currencyUSD", 81 | "id": 21, 82 | "interval": null, 83 | "isNew": true, 84 | "links": [], 85 | "maxDataPoints": 100, 86 | "nullPointMode": "connected", 87 | "nullText": null, 88 | "postfix": "", 89 | "postfixFontSize": "50%", 90 | "prefix": "", 91 | "prefixFontSize": "50%", 92 | "span": 6, 93 | "sparkline": { 94 | "fillColor": "rgba(31, 118, 189, 0.18)", 95 | "full": true, 96 | "lineColor": "rgb(31, 120, 193)", 97 | "show": true 98 | }, 99 | "targets": [ 100 | { 101 | "refId": "A", 102 | "target": "productiona.hosts.tornado-db.dbi.items.item_sold_total_price", 103 | "textEditor": true 104 | } 105 | ], 106 | "thresholds": "", 107 | "title": "Total price of items sold", 108 | "type": "singlestat", 109 | "valueFontSize": "80%", 110 | "valueMaps": [ 111 | { 112 | "op": "=", 113 | "text": "N/A", 114 | "value": "null" 115 | } 116 | ], 117 | "valueName": "avg" 118 | }, 119 | { 120 | "aliasColors": {}, 121 | "bars": false, 122 | "datasource": null, 123 | "editable": true, 124 | "error": false, 125 | "fill": 1, 126 | "grid": { 127 | "leftLogBase": 1, 128 | "leftMax": null, 129 | "leftMin": null, 130 | "rightLogBase": 1, 131 | "rightMax": null, 132 | "rightMin": null, 133 | "threshold1": null, 134 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 135 | "threshold2": null, 136 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 137 | }, 138 | "id": 23, 139 | 
"isNew": true, 140 | "legend": { 141 | "avg": false, 142 | "current": false, 143 | "max": false, 144 | "min": false, 145 | "show": true, 146 | "total": false, 147 | "values": false 148 | }, 149 | "lines": true, 150 | "linewidth": 2, 151 | "links": [], 152 | "nullPointMode": "connected", 153 | "percentage": false, 154 | "pointradius": 5, 155 | "points": false, 156 | "renderer": "flot", 157 | "seriesOverrides": [], 158 | "span": 6, 159 | "stack": false, 160 | "steppedLine": false, 161 | "targets": [ 162 | { 163 | "refId": "A", 164 | "target": "alias(sumSeriesWithWildcards(productiona.hosts.*.statsd.gauge.tornado.api.item.sold.total, 2), 'Tornado API servers')", 165 | "textEditor": true 166 | } 167 | ], 168 | "timeFrom": null, 169 | "timeShift": null, 170 | "title": "Tornado API Sold Total $", 171 | "tooltip": { 172 | "shared": true, 173 | "value_type": "cumulative" 174 | }, 175 | "type": "graph", 176 | "x-axis": true, 177 | "y-axis": true, 178 | "y_formats": [ 179 | "short", 180 | "short" 181 | ] 182 | }, 183 | { 184 | "aliasColors": {}, 185 | "bars": false, 186 | "datasource": null, 187 | "editable": true, 188 | "error": false, 189 | "fill": 1, 190 | "grid": { 191 | "leftLogBase": 1, 192 | "leftMax": null, 193 | "leftMin": null, 194 | "rightLogBase": 1, 195 | "rightMax": null, 196 | "rightMin": null, 197 | "threshold1": null, 198 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 199 | "threshold2": null, 200 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 201 | }, 202 | "id": 17, 203 | "isNew": true, 204 | "legend": { 205 | "avg": false, 206 | "current": false, 207 | "max": false, 208 | "min": false, 209 | "show": true, 210 | "total": false, 211 | "values": false 212 | }, 213 | "lines": true, 214 | "linewidth": 2, 215 | "links": [], 216 | "nullPointMode": "connected", 217 | "percentage": false, 218 | "pointradius": 5, 219 | "points": false, 220 | "renderer": "flot", 221 | "seriesOverrides": [], 222 | "span": 6, 223 | "stack": false, 224 | "steppedLine": false, 225 | 
"targets": [ 226 | { 227 | "refId": "A", 228 | "target": "alias(sumSeriesWithWildcards(productiona.hosts.*.statsd.gauge.tornado.api.item.bought.total, 2), 'Tornado API servers')", 229 | "textEditor": true 230 | } 231 | ], 232 | "timeFrom": null, 233 | "timeShift": null, 234 | "title": "Tornado API Bought Total $", 235 | "tooltip": { 236 | "shared": true, 237 | "value_type": "cumulative" 238 | }, 239 | "type": "graph", 240 | "x-axis": true, 241 | "y-axis": true, 242 | "y_formats": [ 243 | "short", 244 | "short" 245 | ] 246 | }, 247 | { 248 | "aliasColors": {}, 249 | "bars": false, 250 | "datasource": null, 251 | "editable": true, 252 | "error": false, 253 | "fill": 1, 254 | "grid": { 255 | "leftLogBase": 1, 256 | "leftMax": null, 257 | "leftMin": null, 258 | "rightLogBase": 1, 259 | "rightMax": null, 260 | "rightMin": null, 261 | "threshold1": null, 262 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 263 | "threshold2": null, 264 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 265 | }, 266 | "id": 16, 267 | "isNew": true, 268 | "legend": { 269 | "avg": false, 270 | "current": false, 271 | "max": false, 272 | "min": false, 273 | "show": true, 274 | "total": false, 275 | "values": false 276 | }, 277 | "lines": true, 278 | "linewidth": 2, 279 | "links": [], 280 | "nullPointMode": "connected", 281 | "percentage": false, 282 | "pointradius": 5, 283 | "points": false, 284 | "renderer": "flot", 285 | "seriesOverrides": [], 286 | "span": 6, 287 | "stack": false, 288 | "steppedLine": false, 289 | "targets": [ 290 | { 291 | "refId": "A", 292 | "target": "aliasByNode(productiona.hosts.*.tornado.api.request.99, 2)", 293 | "textEditor": false 294 | } 295 | ], 296 | "timeFrom": null, 297 | "timeShift": null, 298 | "title": "Tornado API Request Time 0.99", 299 | "tooltip": { 300 | "shared": true, 301 | "value_type": "cumulative" 302 | }, 303 | "type": "graph", 304 | "x-axis": true, 305 | "y-axis": true, 306 | "y_formats": [ 307 | "short", 308 | "short" 309 | ] 310 | }, 311 | { 
312 | "aliasColors": {}, 313 | "bars": false, 314 | "datasource": null, 315 | "editable": true, 316 | "error": false, 317 | "fill": 1, 318 | "grid": { 319 | "leftLogBase": 1, 320 | "leftMax": null, 321 | "leftMin": null, 322 | "rightLogBase": 1, 323 | "rightMax": null, 324 | "rightMin": null, 325 | "threshold1": null, 326 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 327 | "threshold2": null, 328 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 329 | }, 330 | "id": 14, 331 | "isNew": true, 332 | "legend": { 333 | "avg": false, 334 | "current": false, 335 | "max": false, 336 | "min": false, 337 | "show": true, 338 | "total": false, 339 | "values": false 340 | }, 341 | "lines": true, 342 | "linewidth": 2, 343 | "links": [], 344 | "nullPointMode": "connected", 345 | "percentage": false, 346 | "pointradius": 5, 347 | "points": false, 348 | "renderer": "flot", 349 | "seriesOverrides": [ 350 | {} 351 | ], 352 | "span": 6, 353 | "stack": false, 354 | "steppedLine": false, 355 | "targets": [ 356 | { 357 | "refId": "A", 358 | "target": "aliasByNode(productiona.hosts.*.tornado.api.request.rate,2)", 359 | "textEditor": false 360 | } 361 | ], 362 | "timeFrom": null, 363 | "timeShift": null, 364 | "title": "Tornado API Request rate", 365 | "tooltip": { 366 | "shared": true, 367 | "value_type": "cumulative" 368 | }, 369 | "type": "graph", 370 | "x-axis": true, 371 | "y-axis": true, 372 | "y_formats": [ 373 | "short", 374 | "short" 375 | ] 376 | }, 377 | { 378 | "aliasColors": {}, 379 | "bars": false, 380 | "datasource": null, 381 | "editable": true, 382 | "error": false, 383 | "fill": 1, 384 | "grid": { 385 | "leftLogBase": 1, 386 | "leftMax": null, 387 | "leftMin": null, 388 | "rightLogBase": 1, 389 | "rightMax": null, 390 | "rightMin": null, 391 | "threshold1": null, 392 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 393 | "threshold2": null, 394 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 395 | }, 396 | "id": 13, 397 | "isNew": true, 398 | "legend": { 399 | "avg": 
false, 400 | "current": false, 401 | "max": false, 402 | "min": false, 403 | "show": true, 404 | "total": false, 405 | "values": false 406 | }, 407 | "lines": true, 408 | "linewidth": 2, 409 | "links": [], 410 | "nullPointMode": "connected", 411 | "percentage": false, 412 | "pointradius": 5, 413 | "points": false, 414 | "renderer": "flot", 415 | "seriesOverrides": [], 416 | "span": 6, 417 | "stack": false, 418 | "steppedLine": false, 419 | "targets": [ 420 | { 421 | "refId": "A", 422 | "target": "aliasByNode(productiona.hosts.tornado-proxy.haproxy.frontend.tornado-www.5xx_error_percentage,2)", 423 | "textEditor": true 424 | } 425 | ], 426 | "timeFrom": null, 427 | "timeShift": null, 428 | "title": "Tornado 5xx Error Percentage", 429 | "tooltip": { 430 | "shared": true, 431 | "value_type": "cumulative" 432 | }, 433 | "type": "graph", 434 | "x-axis": true, 435 | "y-axis": true, 436 | "y_formats": [ 437 | "short", 438 | "short" 439 | ] 440 | }, 441 | { 442 | "aliasColors": {}, 443 | "bars": false, 444 | "datasource": null, 445 | "editable": true, 446 | "error": false, 447 | "fill": 1, 448 | "grid": { 449 | "leftLogBase": 1, 450 | "leftMax": null, 451 | "leftMin": null, 452 | "rightLogBase": 1, 453 | "rightMax": null, 454 | "rightMin": null, 455 | "threshold1": null, 456 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 457 | "threshold2": null, 458 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 459 | }, 460 | "id": 22, 461 | "isNew": true, 462 | "legend": { 463 | "avg": false, 464 | "current": false, 465 | "max": false, 466 | "min": false, 467 | "show": true, 468 | "total": false, 469 | "values": false 470 | }, 471 | "lines": true, 472 | "linewidth": 2, 473 | "links": [], 474 | "nullPointMode": "connected", 475 | "percentage": false, 476 | "pointradius": 5, 477 | "points": false, 478 | "renderer": "flot", 479 | "seriesOverrides": [], 480 | "span": 6, 481 | "stack": false, 482 | "steppedLine": false, 483 | "targets": [ 484 | { 485 | "refId": "A", 486 | "target": 
"aliasByNode(productiona.hosts.tornado-db.mysql.aborted_connection_rate,2)", 487 | "textEditor": false 488 | } 489 | ], 490 | "timeFrom": null, 491 | "timeShift": null, 492 | "title": "MySQL Aborted Connection rate", 493 | "tooltip": { 494 | "shared": true, 495 | "value_type": "cumulative" 496 | }, 497 | "type": "graph", 498 | "x-axis": true, 499 | "y-axis": true, 500 | "y_formats": [ 501 | "short", 502 | "short" 503 | ] 504 | }, 505 | { 506 | "aliasColors": {}, 507 | "bars": false, 508 | "datasource": null, 509 | "editable": true, 510 | "error": false, 511 | "fill": 1, 512 | "grid": { 513 | "leftLogBase": 1, 514 | "leftMax": null, 515 | "leftMin": null, 516 | "rightLogBase": 1, 517 | "rightMax": null, 518 | "rightMin": null, 519 | "threshold1": null, 520 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 521 | "threshold2": null, 522 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 523 | }, 524 | "id": 18, 525 | "isNew": true, 526 | "legend": { 527 | "avg": false, 528 | "current": false, 529 | "max": false, 530 | "min": false, 531 | "show": true, 532 | "total": false, 533 | "values": false 534 | }, 535 | "lines": true, 536 | "linewidth": 2, 537 | "links": [], 538 | "nullPointMode": "connected", 539 | "percentage": false, 540 | "pointradius": 5, 541 | "points": false, 542 | "renderer": "flot", 543 | "seriesOverrides": [], 544 | "span": 6, 545 | "stack": false, 546 | "steppedLine": false, 547 | "targets": [ 548 | { 549 | "refId": "A", 550 | "target": "aliasByNode(productiona.hosts.tornado-db.dbi.performance_schema.insert_query_time.99,2)", 551 | "textEditor": true 552 | } 553 | ], 554 | "timeFrom": null, 555 | "timeShift": null, 556 | "title": "Tornado API Item Insert 0.99", 557 | "tooltip": { 558 | "shared": true, 559 | "value_type": "cumulative" 560 | }, 561 | "type": "graph", 562 | "x-axis": true, 563 | "y-axis": true, 564 | "y_formats": [ 565 | "short", 566 | "short" 567 | ] 568 | } 569 | ], 570 | "title": "Row" 571 | }, 572 | { 573 | "collapse": false, 574 | 
"editable": true, 575 | "height": "250px", 576 | "panels": [ 577 | { 578 | "aliasColors": {}, 579 | "bars": false, 580 | "datasource": null, 581 | "editable": true, 582 | "error": false, 583 | "fill": 1, 584 | "grid": { 585 | "leftLogBase": 1, 586 | "leftMax": null, 587 | "leftMin": null, 588 | "rightLogBase": 1, 589 | "rightMax": null, 590 | "rightMin": null, 591 | "threshold1": null, 592 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 593 | "threshold2": null, 594 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 595 | }, 596 | "id": 12, 597 | "legend": { 598 | "avg": false, 599 | "current": false, 600 | "max": false, 601 | "min": false, 602 | "show": true, 603 | "total": false, 604 | "values": false 605 | }, 606 | "lines": true, 607 | "linewidth": 2, 608 | "links": [], 609 | "nullPointMode": "connected", 610 | "percentage": false, 611 | "pointradius": 5, 612 | "points": false, 613 | "renderer": "flot", 614 | "seriesOverrides": [], 615 | "span": 4, 616 | "stack": false, 617 | "steppedLine": false, 618 | "targets": [ 619 | { 620 | "refId": "A", 621 | "target": "groupByNode(productiona.hosts.{tornado-redis,tornado-db}.cpu.{user,system},2,'sumSeries')", 622 | "textEditor": true 623 | } 624 | ], 625 | "timeFrom": null, 626 | "timeShift": null, 627 | "title": "DB Tier CPU Usage", 628 | "tooltip": { 629 | "shared": true, 630 | "value_type": "individual" 631 | }, 632 | "type": "graph", 633 | "x-axis": true, 634 | "y-axis": true, 635 | "y_formats": [ 636 | "short", 637 | "short" 638 | ] 639 | }, 640 | { 641 | "aliasColors": {}, 642 | "bars": false, 643 | "datasource": null, 644 | "editable": true, 645 | "error": false, 646 | "fill": 1, 647 | "grid": { 648 | "leftLogBase": 1, 649 | "leftMax": null, 650 | "leftMin": null, 651 | "rightLogBase": 1, 652 | "rightMax": null, 653 | "rightMin": null, 654 | "threshold1": null, 655 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 656 | "threshold2": null, 657 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 658 | }, 659 | "id": 11, 
660 | "legend": { 661 | "avg": false, 662 | "current": false, 663 | "max": false, 664 | "min": false, 665 | "show": true, 666 | "total": false, 667 | "values": false 668 | }, 669 | "lines": true, 670 | "linewidth": 2, 671 | "links": [], 672 | "nullPointMode": "connected", 673 | "percentage": false, 674 | "pointradius": 5, 675 | "points": false, 676 | "renderer": "flot", 677 | "seriesOverrides": [], 678 | "span": 4, 679 | "stack": false, 680 | "steppedLine": false, 681 | "targets": [ 682 | { 683 | "refId": "A", 684 | "target": "groupByNode(productiona.hosts.{tornado-api1,tornado-api2}.cpu.{user,system},2,'sumSeries')", 685 | "textEditor": true 686 | } 687 | ], 688 | "timeFrom": null, 689 | "timeShift": null, 690 | "title": "App Tier CPU Usage", 691 | "tooltip": { 692 | "shared": true, 693 | "value_type": "individual" 694 | }, 695 | "type": "graph", 696 | "x-axis": true, 697 | "y-axis": true, 698 | "y_formats": [ 699 | "short", 700 | "short" 701 | ] 702 | }, 703 | { 704 | "aliasColors": {}, 705 | "bars": false, 706 | "datasource": null, 707 | "editable": true, 708 | "error": false, 709 | "fill": 1, 710 | "grid": { 711 | "leftLogBase": 1, 712 | "leftMax": null, 713 | "leftMin": null, 714 | "rightLogBase": 1, 715 | "rightMax": null, 716 | "rightMin": null, 717 | "threshold1": null, 718 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 719 | "threshold2": null, 720 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 721 | }, 722 | "id": 15, 723 | "isNew": true, 724 | "legend": { 725 | "avg": false, 726 | "current": false, 727 | "max": false, 728 | "min": false, 729 | "show": true, 730 | "total": false, 731 | "values": false 732 | }, 733 | "lines": true, 734 | "linewidth": 2, 735 | "links": [], 736 | "nullPointMode": "connected", 737 | "percentage": false, 738 | "pointradius": 5, 739 | "points": false, 740 | "renderer": "flot", 741 | "seriesOverrides": [], 742 | "span": 4, 743 | "stack": false, 744 | "steppedLine": false, 745 | "targets": [ 746 | { 747 | "refId": "A", 748 | 
"target": "groupByNode(productiona.hosts.{tornado-proxy,tornado-web1,tornado-web2}.cpu.{user,system},2,'sumSeries')", 749 | "textEditor": true 750 | } 751 | ], 752 | "timeFrom": null, 753 | "timeShift": null, 754 | "title": "Web Tier CPU Usage", 755 | "tooltip": { 756 | "shared": true, 757 | "value_type": "cumulative" 758 | }, 759 | "type": "graph", 760 | "x-axis": true, 761 | "y-axis": true, 762 | "y_formats": [ 763 | "short", 764 | "short" 765 | ] 766 | } 767 | ], 768 | "title": "New row" 769 | }, 770 | { 771 | "collapse": false, 772 | "editable": true, 773 | "height": "250px", 774 | "panels": [ 775 | { 776 | "aliasColors": {}, 777 | "bars": false, 778 | "datasource": null, 779 | "editable": true, 780 | "error": false, 781 | "fill": 1, 782 | "grid": { 783 | "leftLogBase": 1, 784 | "leftMax": null, 785 | "leftMin": null, 786 | "rightLogBase": 1, 787 | "rightMax": null, 788 | "rightMin": null, 789 | "threshold1": null, 790 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 791 | "threshold2": null, 792 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 793 | }, 794 | "id": 3, 795 | "legend": { 796 | "avg": false, 797 | "current": false, 798 | "max": false, 799 | "min": false, 800 | "show": true, 801 | "total": false, 802 | "values": false 803 | }, 804 | "lines": true, 805 | "linewidth": 2, 806 | "links": [], 807 | "nullPointMode": "connected", 808 | "percentage": false, 809 | "pointradius": 5, 810 | "points": false, 811 | "renderer": "flot", 812 | "seriesOverrides": [], 813 | "span": 6, 814 | "stack": false, 815 | "steppedLine": false, 816 | "targets": [ 817 | { 818 | "refId": "A", 819 | "target": "aliasByNode(productiona.hosts.{tornado-proxy,tornado-web1,tornado-web2,tornado-api1,tornado-api2,tornado-redis,tornado-db}.swap.used, 2)", 820 | "textEditor": true 821 | } 822 | ], 823 | "timeFrom": null, 824 | "timeShift": null, 825 | "title": "Tornado Swap", 826 | "tooltip": { 827 | "shared": true, 828 | "value_type": "cumulative" 829 | }, 830 | "type": "graph", 831 | 
"x-axis": true, 832 | "y-axis": true, 833 | "y_formats": [ 834 | "short", 835 | "short" 836 | ] 837 | }, 838 | { 839 | "aliasColors": {}, 840 | "bars": false, 841 | "datasource": null, 842 | "editable": true, 843 | "error": false, 844 | "fill": 1, 845 | "grid": { 846 | "leftLogBase": 1, 847 | "leftMax": null, 848 | "leftMin": null, 849 | "rightLogBase": 1, 850 | "rightMax": null, 851 | "rightMin": null, 852 | "threshold1": null, 853 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 854 | "threshold2": null, 855 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 856 | }, 857 | "id": 2, 858 | "legend": { 859 | "avg": false, 860 | "current": false, 861 | "max": false, 862 | "min": false, 863 | "show": true, 864 | "total": false, 865 | "values": false 866 | }, 867 | "lines": true, 868 | "linewidth": 2, 869 | "links": [], 870 | "nullPointMode": "connected", 871 | "percentage": false, 872 | "pointradius": 5, 873 | "points": false, 874 | "renderer": "flot", 875 | "seriesOverrides": [], 876 | "span": 6, 877 | "stack": false, 878 | "steppedLine": false, 879 | "targets": [ 880 | { 881 | "refId": "A", 882 | "target": "aliasByNode(productiona.hosts.{tornado-proxy,tornado-web1,tornado-web2,tornado-api1,tornado-api2,tornado-redis,tornado-db}.memory.used,2)", 883 | "textEditor": true 884 | } 885 | ], 886 | "timeFrom": null, 887 | "timeShift": null, 888 | "title": "Tornado Memory Usage", 889 | "tooltip": { 890 | "shared": true, 891 | "value_type": "cumulative" 892 | }, 893 | "type": "graph", 894 | "x-axis": true, 895 | "y-axis": true, 896 | "y_formats": [ 897 | "short", 898 | "short" 899 | ] 900 | } 901 | ], 902 | "title": "New row" 903 | }, 904 | { 905 | "collapse": false, 906 | "editable": true, 907 | "height": "250px", 908 | "panels": [ 909 | { 910 | "aliasColors": {}, 911 | "bars": false, 912 | "datasource": null, 913 | "editable": true, 914 | "error": false, 915 | "fill": 1, 916 | "grid": { 917 | "leftLogBase": 1, 918 | "leftMax": null, 919 | "leftMin": null, 920 | 
"rightLogBase": 1, 921 | "rightMax": null, 922 | "rightMin": null, 923 | "threshold1": null, 924 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 925 | "threshold2": null, 926 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 927 | }, 928 | "id": 5, 929 | "legend": { 930 | "avg": false, 931 | "current": false, 932 | "max": false, 933 | "min": false, 934 | "show": true, 935 | "total": false, 936 | "values": false 937 | }, 938 | "lines": true, 939 | "linewidth": 2, 940 | "links": [], 941 | "nullPointMode": "connected", 942 | "percentage": false, 943 | "pointradius": 5, 944 | "points": false, 945 | "renderer": "flot", 946 | "seriesOverrides": [], 947 | "span": 6, 948 | "stack": false, 949 | "steppedLine": false, 950 | "targets": [ 951 | { 952 | "refId": "A", 953 | "target": "aliasByNode(productiona.hosts.{tornado-proxy,tornado-web1,tornado-web2,tornado-api1,tornado-api2,tornado-redis,tornado-db}.load.shortterm,2)", 954 | "textEditor": true 955 | } 956 | ], 957 | "timeFrom": null, 958 | "timeShift": null, 959 | "title": "Tornado Load Average (short-term)", 960 | "tooltip": { 961 | "shared": true, 962 | "value_type": "cumulative" 963 | }, 964 | "type": "graph", 965 | "x-axis": true, 966 | "y-axis": true, 967 | "y_formats": [ 968 | "short", 969 | "short" 970 | ] 971 | }, 972 | { 973 | "aliasColors": {}, 974 | "bars": false, 975 | "datasource": null, 976 | "editable": true, 977 | "error": false, 978 | "fill": 1, 979 | "grid": { 980 | "leftLogBase": 1, 981 | "leftMax": null, 982 | "leftMin": null, 983 | "rightLogBase": 1, 984 | "rightMax": null, 985 | "rightMin": null, 986 | "threshold1": null, 987 | "threshold1Color": "rgba(216, 200, 27, 0.27)", 988 | "threshold2": null, 989 | "threshold2Color": "rgba(234, 112, 112, 0.22)" 990 | }, 991 | "id": 4, 992 | "legend": { 993 | "avg": false, 994 | "current": false, 995 | "max": false, 996 | "min": false, 997 | "show": true, 998 | "total": false, 999 | "values": false 1000 | }, 1001 | "lines": true, 1002 | "linewidth": 2, 1003 | 
"links": [], 1004 | "nullPointMode": "connected", 1005 | "percentage": false, 1006 | "pointradius": 5, 1007 | "points": false, 1008 | "renderer": "flot", 1009 | "seriesOverrides": [], 1010 | "span": 6, 1011 | "stack": false, 1012 | "steppedLine": false, 1013 | "targets": [ 1014 | { 1015 | "refId": "A", 1016 | "target": "aliasByNode(productiona.hosts.{tornado-proxy,tornado-web1,tornado-web2,tornado-api1,tornado-api2,tornado-redis,tornado-db}.df.root.percent_bytes.used, 2)", 1017 | "textEditor": true 1018 | } 1019 | ], 1020 | "timeFrom": null, 1021 | "timeShift": null, 1022 | "title": "Tornado disk used on /", 1023 | "tooltip": { 1024 | "shared": true, 1025 | "value_type": "cumulative" 1026 | }, 1027 | "type": "graph", 1028 | "x-axis": true, 1029 | "y-axis": true, 1030 | "y_formats": [ 1031 | "short", 1032 | "short" 1033 | ] 1034 | } 1035 | ], 1036 | "title": "New row" 1037 | } 1038 | ], 1039 | "time": { 1040 | "from": "now-12h", 1041 | "to": "now" 1042 | }, 1043 | "timepicker": { 1044 | "collapse": false, 1045 | "enable": true, 1046 | "notice": false, 1047 | "now": true, 1048 | "refresh_intervals": [ 1049 | "5s", 1050 | "10s", 1051 | "30s", 1052 | "1m", 1053 | "5m", 1054 | "15m", 1055 | "30m", 1056 | "1h", 1057 | "2h", 1058 | "1d" 1059 | ], 1060 | "status": "Stable", 1061 | "time_options": [ 1062 | "5m", 1063 | "15m", 1064 | "1h", 1065 | "6h", 1066 | "12h", 1067 | "24h", 1068 | "2d", 1069 | "7d", 1070 | "30d" 1071 | ], 1072 | "type": "timepicker" 1073 | }, 1074 | "templating": { 1075 | "list": [] 1076 | }, 1077 | "annotations": { 1078 | "list": [] 1079 | }, 1080 | "refresh": "30s", 1081 | "schemaVersion": 8, 1082 | "version": 41, 1083 | "links": [] 1084 | } -------------------------------------------------------------------------------- /11-13/logstash/logstash.conf: -------------------------------------------------------------------------------- 1 | input { 2 | tcp { 3 | port => 5514 4 | type => syslog 5 | } 6 | tcp { 7 | port => 2003 8 | type => "riemann" 9 | 
codec => "json"
  }
  # Listen for syslog datagrams as well as the TCP stream above.
  udp {
    port => 5514
    type => syslog
  }
  # Tail the local system logs directly.
  file {
    path => [ "/var/log/syslog", "/var/log/auth.log" ]
    type => "syslog"
  }
}

filter {
  if [type] == "syslog" {
    # Split the raw syslog line into timestamp/host/program/pid/message,
    # optionally capturing a Docker container name/id from the program field.
    grok {
      match => { "message" => "(?:%{SYSLOGTIMESTAMP:syslog_timestamp}|%{TIMESTAMP_ISO8601:syslog_timestamp}) %{SYSLOGHOST:syslog_hostname} %{DATA:syslog_program}(?:\/%{DATA:container_name}\/%{DATA:container_id})?(?:\[%{POSINT:syslog_pid}\])?: %{GREEDYDATA:syslog_message}" }
      remove_field => ["message"]
    }
    syslog_pri { }
    # NOTE(review): classic syslog pads single-digit days ("MMM  d", two
    # spaces) -- confirm against the emitters before relying on the first
    # pattern below.
    date {
      match => [ "syslog_timestamp", "MMM d HH:mm:ss", "MMM dd HH:mm:ss", "ISO8601" ]
    }
    # FIX: these per-program blocks were previously left unclosed, which
    # nested each check inside the previous one and left the filter section
    # unbalanced. Each block is now self-contained.
    if [syslog_program] == "tornado-haproxy" {
      grok {
        match => ["syslog_message", "%{HAPROXYHTTPBASE}"]
        remove_field => ["syslog_message"]
        add_field => { "tags" => "tornado" }
      }
    }
    if [syslog_program] == "tornado-nginx-access" {
      grok {
        patterns_dir => "/etc/logstash/patterns"
        match => { "syslog_message" => "%{NGINXACCESS}" }
        remove_field => ["syslog_message"]
        add_field => { "tags" => "tornado" }
      }
    }
    if [syslog_program] == "tornado-api" {
      grok {
        patterns_dir => "/etc/logstash/patterns"
        match => { "syslog_message" => "%{TORNADOAPI}" }
        remove_field => ["syslog_message"]
        add_field => { "tags" => "tornado" }
      }
    }
  }
}

output {
  # Forward HAProxy request durations to Riemann.
  if [syslog_program] == "tornado-haproxy" {
    riemann {
      host => "riemanna"
      sender => "%{syslog_hostname}"
      map_fields => true
      riemann_event => {
        "service" => "tornado.proxy.request"
        "metric" => "%{time_duration}"
        "state" => "ok"
      }
    }
  }
  # Forward Nginx access-log response sizes to Riemann.
  if [syslog_program] == "tornado-nginx-access" {
    riemann {
      host => "riemanna"
      sender => "%{syslog_hostname}"
      map_fields => true
      riemann_event => {
        "service" => "tornado.web.request"
        "metric" => "%{body_bytes_sent}"
        "state" => "ok"
      }
78 | } 79 | } 80 | if [syslog_program] == "tornado-api" and [app_request_time] { 81 | riemann { 82 | host => "riemanna" 83 | sender => "%{syslog_hostname}" 84 | map_fields => true 85 | riemann_event => { 86 | "service" => "tornado.api.request" 87 | "metric" => "%{app_request_time}" 88 | "state" => "ok" 89 | } 90 | } 91 | } 92 | elasticsearch { 93 | sniffing => true 94 | hosts => "esa1.example.com" 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /11-13/logstash/patterns/nginx: -------------------------------------------------------------------------------- 1 | NGINXACCESS %{IPORHOST:remote_addr} - %{USERNAME:remote_user} \[%{HTTPDATE:time_local}\] "%{WORD:http_method} %{URIPATHPARAM:http_request} HTTP/%{NUMBER:http_version}" %{INT:http_status} %{INT:body_bytes_sent} %{QS:http_referer} %{QS:http_user_agent} 2 | -------------------------------------------------------------------------------- /11-13/logstash/patterns/tornadoapi: -------------------------------------------------------------------------------- 1 | TORNADOAPI %{TIMESTAMP_ISO8601:app_timestamp} %{URIHOST:app_host} %{DATA:app_severity} %{SYSLOG5424SD} - nil %{DATA:app_request_state} \:%{DATA:app_verb} %{DATA:app_path} for %{URIHOST:app_source} (?:in \(%{INT:app_request_time:int} ms\) Status: %{INT:app_status_code:int}|%{GREEDYDATA:app_request}) 2 | -------------------------------------------------------------------------------- /11-13/riemann/examplecom/app/tornado.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.app.tornado 2 | "Monitoring streams for Tornado" 3 | (:require [riemann.config :refer :all] 4 | [clojure.tools.logging :refer :all] 5 | [riemann.folds :as folds] 6 | [riemann.streams :refer :all])) 7 | 8 | (defn alert_graph 9 | [] 10 | "Alert and graph on events" 11 | (sdo 12 | (changed-state {:init "ok"} 13 | (where (state "critical") 14 | (page)) 15 | (where (state "warning") 16 | 
(slacker))) 17 | (smap rewrite-service graph))) 18 | 19 | (defn webtier 20 | "Checks for the Tornado Web Tier" 21 | [] 22 | (let [active_servers 2.0] 23 | (sdo 24 | (where (and (service "haproxy/gauge-backend.tornado-web.active_servers") 25 | (< metric active_servers)) 26 | (adjust #(assoc % :service "tornado-web active servers" 27 | :type_instance nil 28 | :state (condp = (:metric %) 29 | 0.0 "critical" 30 | 1.0 "warning" 31 | 2.0 "ok")) 32 | (changed :metric {:init active_servers} 33 | (slacker)))) 34 | (check_ratio "haproxy/derive-frontend.tornado-www.response_5xx" 35 | "haproxy/derive-frontend.tornado-www.request_total" 36 | "haproxy.frontend.tornado-www.5xx_error_percentage" 37 | 0.5 1 38 | (alert_graph))))) 39 | 40 | (defn apptier 41 | "Checks for the Tornado App Tier" 42 | [] 43 | (sdo 44 | (where (service "curl_json-tornado-api/gauge-price") 45 | (where (!= metric 666) 46 | (slacker)) 47 | (expired 48 | (page))) 49 | (where (service #"^tornado.api.") 50 | (smap rewrite-service graph)) 51 | (check_ratio "GenericJMX-memory-heap/memory-used" 52 | "GenericJMX-memory-heap/memory-max" 53 | "jmx.memory-heap.percentage_used" 54 | 80 90 55 | (alert_graph)) 56 | (where (service "tornado.api.request") 57 | (with { :service "tornado.api.request.rate" :metric 1 } 58 | (rate 1 59 | (smap rewrite-service graph)))) 60 | (check_percentiles "tornado.api.request" 10 61 | (smap rewrite-service graph) 62 | (where (and (service "tornado.api.request 0.99") (>= metric 100.0)) 63 | (changed-state { :init "ok"} 64 | (slacker)))))) 65 | 66 | (defn datatier 67 | "Check for the Tornado Data Tier" 68 | [] 69 | (sdo 70 | (check_ratio "mysql-status/gauge-Max_used_connections" 71 | "mysql-variables/gauge-max_connections" 72 | "mysql.max_connection_percentage" 73 | 80 90 74 | (alert_graph)) 75 | (create_rate "mysql-status/counter-Aborted_connects" 5) 76 | (check_percentiles "dbi-performance_schema/gauge-insert_query_time" 10 77 | (smap rewrite-service graph) 78 | (where (and (service 
"dbi-performance_schema/gauge-insert_query_time 0.99") (>= metric 3.0))
                 (changed-state { :init "ok"}
                   (slacker))))))

(defn checks
  "Handles events for Tornado: routes each event to the tier-specific
  check stream (web/app/data) by matching its host name; unmatched hosts
  are logged via the catch-all."
  []
  (let [web-tier-hosts #"tornado-(proxy|web1|web2)"
        app-tier-hosts #"tornado-(api1|api2)"
        db-tier-hosts #"tornado-(db|redis)"]
    (splitp re-matches host
      web-tier-hosts (webtier)
      app-tier-hosts (apptier)
      db-tier-hosts (datatier)
      #(info "Catchall" (:host %)))))
-------------------------------------------------------------------------------- /11-13/riemann/examplecom/etc/checks.clj: --------------------------------------------------------------------------------
(ns examplecom.etc.checks
  (:require [riemann.config :refer :all]
            [clojure.tools.logging :refer :all]
            ;; FIX: check_ratio calls folds/quotient-sloppy, but riemann.folds
            ;; was never required, so loading this namespace would fail.
            [riemann.folds :as folds]
            [riemann.streams :refer :all]))

(defn set_state
  "Returns a function that stamps an event's :state from its :metric:
  above `critical` -> \"critical\", above `warning` -> \"warning\",
  otherwise \"ok\"."
  [warning critical]
  (fn [event]
    (assoc event :state
           (condp < (:metric event)
             critical "critical"
             warning "warning"
             "ok"))))

(defn create_rate
  "Derives a \"<srv> rate\" service from events for `srv` over `window`
  seconds and graphs the rewritten result."
  [srv window]
  (where (service srv)
    (with {:service (str srv " rate")}
      (rate window (smap rewrite-service graph)))))

(defn check_ratio
  "Checks the percentage ratio between two matching service events and
  passes a re-stated event (metric = srv1/srv2 * 100, service = newsrv)
  to children. FIX: docstring moved before the arg vector -- it previously
  followed it, where Clojure treats it as a discarded expression."
  [srv1 srv2 newsrv warning critical & children]
  (project [(service srv1)
            (service srv2)]
    (smap folds/quotient-sloppy
      (fn [event] (let [percenta (* (float (:metric event)) 100)
                        new-event (assoc event :metric percenta
                                         :service (str newsrv)
                                         :type_instance nil
                                         :state (condp < percenta
                                                  critical "critical"
                                                  warning "warning"
                                                  "ok"))]
                    (call-rescue new-event children))))))

(defn check_threshold
  "Applies fold `func` to a fixed time window of events for `srv`; when the
  folded metric exceeds `warning`, re-states the event via set_state and
  passes it to children."
  [srv window func warning critical & children]
  (where (service srv)
    (fixed-time-window window
      (smap func
        (where (< warning metric)
          (smap (set_state warning critical)
            (fn [event]
              (call-rescue event children))))))))

(defn check_percentiles [srv
window & children] 44 | (where (service srv) 45 | (percentiles window [0.5 0.95 0.99 1] 46 | (fn [event] 47 | (call-rescue event children))))) 48 | -------------------------------------------------------------------------------- /11-13/riemann/examplecom/etc/collectd.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.collectd 2 | (:require [clojure.tools.logging :refer :all] 3 | [riemann.streams :refer :all] 4 | [clojure.string :as str] 5 | [clojure.walk :as walk])) 6 | 7 | (defn docker-attribute-map 8 | [attributes] 9 | (let [instance (str/split (str/replace attributes #"^.*\[(.*)\]$" "$1") #",")] 10 | (walk/keywordize-keys (into {} (for [pair instance] (apply hash-map (str/split pair #"="))))))) 11 | 12 | (defn docker-attributes 13 | [{:keys [plugin_instance] :as event}] 14 | (if-let [attributes (re-find #"^.*\[.*\]$" plugin_instance)] 15 | (merge event (docker-attribute-map attributes)) 16 | event)) 17 | 18 | (defn parse-docker-service-host 19 | [{:keys [type type_instance plugin_instance] :as event}] 20 | (let [host (re-find #"^\w+\.?\w+\.?\w+" (:plugin_instance event)) 21 | service (cond-> (str (:type event)) (:type_instance event) (str "." 
(:type_instance event)))] 22 | (assoc event :service service :host host))) 23 | 24 | (def default-services 25 | [{:service #"^load/load/(.*)$" :rewrite "load $1"} 26 | {:service #"^swap/percent-(.*)$" :rewrite "swap $1"} 27 | {:service #"^memory/percent-(.*)$" :rewrite "memory $1"} 28 | {:service #"^processes/ps_state-(.*)$" :rewrite "processes $1"} 29 | {:service #"^processes-(.*)/(.*)$" :rewrite "processes $1 $2"} 30 | {:service #"^cpu/percent-(.*)$" :rewrite "cpu $1"} 31 | {:service #"^df-(.*)/(df_complex|percent_bytes)-(.*)$" :rewrite "df $1 $2 $3"} 32 | {:service #"^interface-(.*)/if_(errors|packets|octets)/(tx|rx)$" :rewrite "nic $1 $3 $2"} 33 | {:service #"^protocols-(.*)/(.*)$" :rewrite "protocols $1 $2"} 34 | {:service #"^GenericJMX-(:?_|\/)?(.*)$" :rewrite "jmx $2"} 35 | {:service #"^haproxy\/(gauge|derive)-(.*)$" :rewrite "haproxy $2"} 36 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "$2"} 37 | {:service #"^mysql-(.*)\/(counter|gauge)-(.*)$" :rewrite "mysql $1 $3"} 38 | {:service #"^dbi-(.*)\/(gauge|counter)-(.*)$" :rewrite "dbi $1 $3"} 39 | {:service #"^redis-(.*)$" :rewrite "redis $1"}]) 40 | 41 | (defn rewrite-service-with 42 | [rules] 43 | (let [matcher (fn [s1 s2] (if (string? s1) (= s1 s2) (re-find s1 s2)))] 44 | (fn [{:keys [service] :as event}] 45 | (or 46 | (first 47 | (for [{:keys [rewrite] :as rule} rules 48 | :when (matcher (:service rule) service)] 49 | (assoc event :service 50 | (if (string? 
(defn format-subject
  "Format the email subject line, joining the services, states and
  hosts of every event in the notification."
  [events]
  ;; BUG FIX: the original used
  ;;   (apply format fmt svc-str state-str (map :host events))
  ;; which spliced each host in as a separate format argument, so every
  ;; host after the first was silently dropped (format ignores extra
  ;; args). Join the hosts like the services and states instead.
  (format "Service %s is in state %s on host %s"
          (str/join ", " (map :service events))
          (str/join ", " (map :state events))
          (str/join ", " (map :host events))))
(defn context
  "Build a human-readable context block for an email notification:
  current CPU, memory and root-disk utilisation for the event's host,
  looked up live from the Riemann index, plus a Grafana dashboard link.
  NOTE(review): assumes every lookup returns an indexed event with a
  numeric :metric — a host missing any of these services would throw
  here; confirm callers only reach this for fully-monitored hosts."
  [event]
  (str
    "Host context:\n"
    ;; CPU utilisation = system% + user% from the collectd cpu plugin.
    " CPU Utilization:\t" (round (+ (:metric (lookup (:host event) "cpu/percent-system")) (:metric (lookup (:host event) "cpu/percent-user")))) "%\n"
    " Memory Used:\t" (round (:metric (lookup (:host event) "memory/percent-used"))) "%\n"
    " Disk(root) %:\t\t" (round (:metric (lookup (:host event) "df-root/percent_bytes-used"))) "% used "
    " (" (round (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-used")))) " GB used of "
    ;; Total root-disk size in GB = used + free + reserved.
    (round (+ (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-used")))
              (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-free")))
              (byte-to-gb (:metric (lookup (:host event) "df-root/df_complex-reserved"))))) "GB)\n\n"
    "Grafana Dashboard:\n\n"
    " http://graphitea.example.com:3000/dashboard/script/riemann.js?host=" (:host event) "\n\n"))
(defn graphite-path-statsd
  "Build a Graphite metric path for a statsd-sourced event: the statsd
  app prefix (up to and including the first dot of :service), then the
  host components in reverse order, then the remaining service words,
  all dot-joined."
  [event]
  (let [{:keys [host service]} event
        app-prefix    (re-find #"^.*?\." service)
        remainder     (str/replace-first service #"^.*?\." "")
        host-parts    (if host (reverse (str/split host #"\.")) [])
        service-parts (if remainder (str/split remainder #" ") [])]
    (str app-prefix (str/join "." (concat host-parts service-parts)))))
(defn pd-format
  "Convert a Riemann event into a PagerDuty incident payload:
  a stable incident key (host + service), a one-line description,
  and the full event (plus a Grafana dashboard URL) as details."
  [{:keys [host service state metric] :as event}]
  (let [dashboard (str "http://graphitea.example.com:3000/dashboard/script/riemann.js?host=" host)]
    {:incident_key (str host " " service)
     :description  (str "Host: " host " "
                        service " is "
                        state " ("
                        metric ")")
     :details      (assoc event :graphs dashboard)}))
(defn slack-format
  "Render a two-line Slack notification for a Riemann event: a state
  summary plus a link to the host's Grafana dashboard."
  [{:keys [service host state]}]
  (str "Service " service
       " on host " host
       " is in state " state ".\n"
       "See http://graphitea.example.com:3000/dashboard/script/riemann.js?host=" host))
:agg { :queue-size 1e3 20 | :core-pool-size 4 21 | :max-pool-size 32} 22 | (forward 23 | (riemann.client/tcp-client :host "riemannmc"))))] 24 | 25 | ; Inbound events will be passed to these streams: 26 | (streams 27 | (default :ttl 60 28 | ; Index all events immediately. 29 | (where (not (tagged "notification")) 30 | index) 31 | 32 | (tagged "tornado" 33 | (tornado/checks)) 34 | 35 | (tagged "collectd" 36 | (where (not (= (:plugin event) "docker")) 37 | (smap rewrite-service graph)) 38 | 39 | (where (= (:plugin event) "docker") 40 | (smap (comp parse-docker-service-host docker-attributes rewrite-service) graph)) 41 | 42 | (tagged "notification" 43 | (changed-state {:init "ok"} 44 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count$" "$1"] 45 | (email "james@example.com")))) 46 | 47 | (where (and (expired? event) 48 | (service #"^processes-.+\/ps_count\/processes")) 49 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count\/processes$" "$1"] 50 | (email "james@example.com")))) 51 | 52 | (where (service #"^riemann.*") 53 | graph 54 | 55 | downstream)))) 56 | -------------------------------------------------------------------------------- /11-13/rsyslog/35-aom-clojure-rest.conf: -------------------------------------------------------------------------------- 1 | module(load="imfile" PollingInterval="10") 2 | 3 | input(type="imfile" 4 | File="/var/log/aom-clojure-rest.log" 5 | StateFile="aom_clojure_rest" 6 | Tag="aom-clojure-rest:" 7 | Severity="info" 8 | Facility="local7") 9 | -------------------------------------------------------------------------------- /3/collectd/riemann.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin processes 2 | 3 | ProcessMatch riemann "riemann.bin start" 4 | 5 | 6 | LoadPlugin java 7 | 8 | JVMARG "-Djava.class.path=/usr/share/collectd/java/collectd-api.jar:/usr/share/collectd/java/generic-jmx.jar" 9 | LoadPlugin "org.collectd.java.GenericJMX" 10 | 11 | 
12 | ObjectName "java.lang:type=GarbageCollector,*" 13 | InstancePrefix "gc-" 14 | InstanceFrom "name" 15 | 16 | Type "derive" 17 | Table false 18 | Attribute "CollectionCount" 19 | InstancePrefix "count" 20 | 21 | 22 | 23 | ObjectName "java.lang:type=GarbageCollector,*" 24 | InstancePrefix "gc-" 25 | InstanceFrom "name" 26 | 27 | Type "derive" 28 | Table false 29 | Attribute "CollectionTime" 30 | InstancePrefix "time" 31 | 32 | 33 | 34 | ObjectName "java.lang:type=MemoryPool,*" 35 | InstancePrefix "memory_pool-" 36 | InstanceFrom "name" 37 | 38 | Type "memory" 39 | Table true 40 | Attribute "Usage" 41 | 42 | 43 | 44 | ObjectName "java.lang:type=Memory" 45 | InstancePrefix "memory-heap" 46 | 47 | Type "memory" 48 | Table true 49 | Attribute "HeapMemoryUsage" 50 | 51 | 52 | 53 | ObjectName "java.lang:type=Memory" 54 | InstancePrefix "memory-nonheap" 55 | 56 | Type "memory" 57 | Table true 58 | Attribute "NonHeapMemoryUsage" 59 | 60 | 61 | 62 | ObjectName "java.lang:type=Threading" 63 | InstancePrefix "threading" 64 | 65 | Type "gauge" 66 | Table false 67 | Attribute "ThreadCount" 68 | InstancePrefix "count" 69 | 70 | 71 | 72 | ObjectName "java.lang:type=Threading" 73 | InstancePrefix "threading" 74 | 75 | Type "gauge" 76 | Table false 77 | Attribute "DaemonThreadCount" 78 | InstancePrefix "count-daemon" 79 | 80 | 81 | 82 | ServiceURL "service:jmx:rmi:///jndi/rmi://localhost:8855/jmxrmi" 83 | Collect "memory_pool" 84 | Collect "memory-heap" 85 | Collect "memory-nonheap" 86 | Collect "gc-count" 87 | Collect "gc-time" 88 | Collect "thread" 89 | Collect "thread-daemon" 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /3/riemann/examplecom/etc/email.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.email 2 | (:require [riemann.email :refer :all])) 3 | 4 | (def email (mailer {:from "riemann@example.com"})) 5 | 
;; Riemann config for the riemannmc aggregation host: index everything,
;; and email (throttled) when a riemann* service expires from the index.
(logging/init {:file "/var/log/riemann/riemann.log"})

(require 'riemann.client)
(require '[examplecom.etc.email :refer :all])

(let [host "0.0.0.0"]
  (repl-server {:host "127.0.0.1"})
  (tcp-server  {:host host})
  (udp-server  {:host host})
  (ws-server   {:host host}))

(periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]})

(let [index (index)]

  (streams
    (default :ttl 60
      ; Index all events immediately.
      index

      ; Notify by email, at most once every 2 minutes, when a riemann*
      ; service expires from the index.
      (expired
        (throttle 1 120
          ;; BUG FIX: (email ...) was nested inside the (service ...)
          ;; predicate form and the expression was one paren short, so
          ;; the config failed to load. The email stream must be a
          ;; child of (where ...), after the closed predicate.
          (where (service #"^riemann.*")
            (email "james@example.com")))))))
21 | ### END INIT INFO 22 | 23 | PATH=/sbin:/usr/sbin:/bin:/usr/bin 24 | NAME=carbon-cache 25 | DAEMON=/usr/bin/$NAME 26 | DAEMON_ARGS="--config=/etc/carbon/carbon.conf --logdir=/var/log/carbon/" 27 | SCRIPTNAME=/etc/init.d/$NAME 28 | PID_DIR=/var/run 29 | 30 | set -e 31 | 32 | test -x $DAEMON || exit 0 33 | 34 | [ -r /etc/default/graphite-carbon ] && . /etc/default/graphite-carbon 35 | 36 | case "$1" in 37 | 38 | start) 39 | for INSTANCE in $(seq 1 $CACHE_INSTANCES); do 40 | echo -n "Starting ${NAME}-${INSTANCE}: " 41 | PID="${PID_DIR}/${NAME}-${INSTANCE}.pid" 42 | if start-stop-daemon --start --quiet --pidfile $PID --exec $DAEMON -- $DAEMON_ARGS start --pidfile=$PID --instance=${INSTANCE} 43 | then 44 | echo "succeeded" 45 | else 46 | echo "failed" 47 | fi 48 | done 49 | ${0} status 50 | ;; 51 | 52 | stop) 53 | for INSTANCE in $(seq 1 $CACHE_INSTANCES); do 54 | echo -n "Stopping ${NAME}-${INSTANCE}: " 55 | PID="${PID_DIR}/${NAME}-${INSTANCE}.pid" 56 | $DAEMON stop $DAEMON_ARGS --pidfile=$PID --instance=${INSTANCE} 57 | echo "stopped" 58 | rm -f $PID 59 | done 60 | exit 0 61 | ;; 62 | 63 | restart) 64 | ${0} stop 65 | ${0} start 66 | ;; 67 | 68 | status) 69 | for INSTANCE in $(seq 1 $CACHE_INSTANCES); do 70 | if [ -f "${PID_DIR}/${NAME}-${INSTANCE}.pid" ]; then 71 | PID=`cat "${PID_DIR}/${NAME}-${INSTANCE}.pid"` 72 | 73 | echo -n "${NAME}-${INSTANCE} (pid: $PID): " 74 | if ps -p $PID >/dev/null; then 75 | echo "running" 76 | else 77 | echo "failed" 78 | fi 79 | else 80 | echo "${NAME}-${INSTANCE} not running" 81 | fi 82 | done 83 | for INSTANCE in $(seq 1 $CACHE_INSTANCES); do 84 | if [ ! 
#!/bin/sh

# Initscript for carbon-relay processes
# Jason Dixon
#
# You must set RELAY_INSTANCES (usually via /etc/default/graphite-carbon)
# to the number of carbon-relay instances configured in carbon.conf.
# They must be numerically indexed from 1
# (e.g. [relay:1], [relay:2], [relay:3]).

### BEGIN INIT INFO
# Provides:          carbon-relay
# Required-Start:    $remote_fs $syslog
# Required-Stop:     $remote_fs $syslog
# Default-Start:     2 3 4 5
# Default-Stop:      0 1 6
# Short-Description: Start carbon-relay daemon at boot time
# Description:       Runs the carbon-relay daemon.
### END INIT INFO

PATH=/sbin:/usr/sbin:/bin:/usr/bin
NAME=carbon-relay
DAEMON=/usr/bin/$NAME
DAEMON_ARGS="--config=/etc/carbon/carbon.conf --logdir=/var/log/carbon/"
SCRIPTNAME=/etc/init.d/$NAME
PID_DIR=/var/run

set -e

# Bail out quietly if the daemon is not installed.
test -x $DAEMON || exit 0

# Pull in RELAY_INSTANCES and friends.
[ -r /etc/default/graphite-carbon ] && . /etc/default/graphite-carbon

case "$1" in

  start)
    for INSTANCE in $(seq 1 $RELAY_INSTANCES); do
      echo -n "Starting ${NAME}-${INSTANCE}: "
      PID="${PID_DIR}/${NAME}-${INSTANCE}.pid"
      if start-stop-daemon --start --quiet --pidfile $PID --exec $DAEMON -- $DAEMON_ARGS start --pidfile=$PID --instance=${INSTANCE}
      then
        echo "succeeded"
      else
        echo "failed"
      fi
    done
    ${0} status
  ;;

  stop)
    for INSTANCE in $(seq 1 $RELAY_INSTANCES); do
      echo -n "Stopping ${NAME}-${INSTANCE}: "
      PID="${PID_DIR}/${NAME}-${INSTANCE}.pid"
      $DAEMON stop $DAEMON_ARGS --pidfile=$PID --instance=${INSTANCE}
      echo "stopped"
      rm -f $PID
    done
    exit 0
  ;;

  restart)
    ${0} stop
    ${0} start
  ;;

  status)
    for INSTANCE in $(seq 1 $RELAY_INSTANCES); do
      if [ -f "${PID_DIR}/${NAME}-${INSTANCE}.pid" ]; then
        PID=`cat "${PID_DIR}/${NAME}-${INSTANCE}.pid"`

        echo -n "${NAME}-${INSTANCE} (pid: $PID): "
        if ps -p $PID >/dev/null; then
          echo "running"
        else
          echo "failed"
        fi
      else
        echo "${NAME}-${INSTANCE} not running"
      fi
    done
    # Exit non-zero if any instance is missing its pidfile.
    for INSTANCE in $(seq 1 $RELAY_INSTANCES); do
      if [ ! -f "${PID_DIR}/${NAME}-${INSTANCE}.pid" ]; then
        exit 1
      fi
    done
    exit 0
  ;;

  *)
    # BUG FIX: was ">%2", which redirected the usage message into a
    # literal file named "%2"; usage errors belong on stderr (">&2").
    echo "Usage: /etc/init.d/${NAME} {start|stop|restart|status}" >&2
    exit 1
  ;;

esac

exit 0
-------------------------------------------------------------------------------- /4/graphite/graphite-api.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=graphite-api (graphite) 3 | 4 | [Service] 5 | ExecStartPre=/bin/rm -f /var/run/graphite-api.pid 6 | ExecStart=/usr/bin/gunicorn --pid /var/run/graphite-api.pid -b 0.0.0.0:8888 -w 2 --daemon graphite_api.app:app 7 | Type=forking 8 | PIDFile=/var/run/graphite-api.pid 9 | 10 | [Install] 11 | WantedBy=multi-user.target 12 | -------------------------------------------------------------------------------- /4/graphite/graphite-api.yaml: -------------------------------------------------------------------------------- 1 | search_index: /var/lib/graphite/api_search_index 2 | finders: 3 | - graphite_api.finders.whisper.WhisperFinder 4 | functions: 5 | - graphite_api.functions.SeriesFunctions 6 | - graphite_api.functions.PieFunctions 7 | whisper: 8 | directories: 9 | - /var/lib/graphite/whisper 10 | carbon: 11 | hosts: 12 | - 127.0.0.1:7012 13 | - 127.0.0.1:7022 14 | timeout: 1 15 | retry_delay: 15 16 | carbon_prefix: carbon 17 | replication_factor: 1 18 | time_zone: UTC 19 | -------------------------------------------------------------------------------- /4/graphite/graphite-carbon.default: -------------------------------------------------------------------------------- 1 | # Change to true, to enable carbon-cache on boot 2 | CARBON_CACHE_ENABLED=true 3 | RELAY_INSTANCES=1 4 | CACHE_INSTANCES=2 5 | -------------------------------------------------------------------------------- /4/graphite/local_settings.py: -------------------------------------------------------------------------------- 1 | ## Graphite local_settings.py 2 | # Edit this file to customize the default Graphite webapp settings 3 | # 4 | # Additional customizations to Django settings can be added to this file as well 5 | 6 | SECRET_KET = 
def archive_to_bytes(archive):
    """Return the on-disk size, in bytes, of a whisper database created
    with the given retention definition.

    archive -- a storage-schemas.conf style string of comma-separated
               resolution:retention pairs, e.g. "1m:24h,5m:90d".

    Returns 0 for an archive definition that yields no points.

    BUG FIX: the original ignored its ``archive`` parameter and
    re-parsed the global ``args.archive``, so the function only worked
    when called from this script's __main__ block.
    """
    def to_seconds(s):
        # Map a suffixed duration ("10m", "24h", ...) to seconds.
        SECONDS_IN_A = {
            's': 1,
            'm': 1 * 60,
            'h': 1 * 60 * 60,
            'd': 1 * 60 * 60 * 24,
            'y': 1 * 60 * 60 * 24 * 365,
        }
        return int(s[:-1]) * SECONDS_IN_A[s[-1]]

    pairs = [[to_seconds(part) for part in point.split(':')]
             for point in archive.split(',')]

    # Struct sizes from the whisper file format.
    SIZE_METADATA = 2 * 4 + 4 + 4  # 16 [!2LfL]
    SIZE_ARCHIVE_INFO = 3 * 4      # 12 [!3L]+
    SIZE_POINT = 4 + 8             # 12 [!Ld]+

    size = 0
    for resolution, retention in pairs:
        # Floor division preserves the original Python-2 integer
        # semantics of (SIZE_POINT * retention) / resolution.
        size += SIZE_ARCHIVE_INFO + SIZE_POINT * retention // resolution

    if size:
        size += SIZE_METADATA

    return size
:graphite {:queue-size 1000} 8 | (graphite {:host "graphitea" :path add-environment-to-graphite}))) 9 | -------------------------------------------------------------------------------- /4/riemann/riemann.config: -------------------------------------------------------------------------------- 1 | (logging/init {:file "/var/log/riemann/riemann.log"}) 2 | 3 | (require 'riemann.client) 4 | (require '[examplecom.etc.email :refer :all]) 5 | (require '[examplecom.etc.graphite :refer :all]) 6 | 7 | (let [host "0.0.0.0"] 8 | (repl-server {:host "127.0.0.1"}) 9 | (tcp-server {:host host}) 10 | (udp-server {:host host}) 11 | (ws-server {:host host})) 12 | 13 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]}) 14 | 15 | (let [index (index) 16 | downstream (batch 100 1/10 17 | (async-queue! :agg { :queue-size 1e3 18 | :core-pool-size 4 19 | :max-pool-size 32} 20 | (forward 21 | (riemann.client/tcp-client :host "riemannmc"))))] 22 | 23 | ; Inbound events will be passed to these streams: 24 | (streams 25 | (default :ttl 60 26 | ; Index all events immediately. 27 | index 28 | 29 | ; Send all events to the log file. 30 | #(info %) 31 | 32 | (where (service #"^riemann.*") 33 | graph 34 | 35 | downstream)))) 36 | -------------------------------------------------------------------------------- /4/riemann/riemann.config_riemannmc: -------------------------------------------------------------------------------- 1 | ; -*- mode: clojure; -*- 2 | ; vim: filetype=clojure 3 | (include "/etc/riemann/include") 4 | 5 | (let [index (index)] 6 | 7 | (streams 8 | (default :ttl 60 9 | ; Index all events immediately. 
10 | index 11 | 12 | (where (service #"^riemann.*") 13 | graph) 14 | 15 | (expired 16 | (throttle 1 120 17 | (where (service #"^riemann.*") (email "james@example.com"))))))) 18 | -------------------------------------------------------------------------------- /5-6/collectd/collectd.conf: -------------------------------------------------------------------------------- 1 | TypesDB "/usr/share/collectd/types.db" 2 | 3 | Interval 2 4 | CheckThresholds true 5 | WriteQueueLimitHigh 5000 6 | WriteQueueLimitLow 5000 7 | 8 | LoadPlugin logfile 9 | 10 | 11 | LogLevel "info" 12 | File "/var/log/collectd.log" 13 | Timestamp true 14 | 15 | 16 | LoadPlugin threshold 17 | 18 | Include "/etc/collectd.d/*.conf" 19 | -------------------------------------------------------------------------------- /5-6/collectd/collectd.d/carbon.conf: -------------------------------------------------------------------------------- 1 | 2 | ProcessMatch "carbon-cache" "python.+carbon-cache" 3 | ProcessMatch "carbon-relay" "python.+carbon-relay" 4 | 5 | 6 | 7 | 8 | Instance "carbon-cache" 9 | 10 | DataSource "processes" 11 | WarningMin 2 12 | FailureMin 1 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /5-6/collectd/collectd.d/cpu.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin cpu 2 | 3 | ValuesPercentage true 4 | ReportByCpu false 5 | 6 | -------------------------------------------------------------------------------- /5-6/collectd/collectd.d/df.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin df 2 | 3 | MountPoint "/" 4 | ValuesPercentage true 5 | 6 | -------------------------------------------------------------------------------- /5-6/collectd/collectd.d/memory.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin memory 2 | 3 | ValuesPercentage true 4 | 5 | 
-------------------------------------------------------------------------------- /5-6/collectd/collectd.d/processes.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin processes 2 | 3 | Process "collectd" 4 | 5 | 6 | 7 | 8 | 9 | DataSource "processes" 10 | FailureMin 1 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /5-6/collectd/collectd.d/swap.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin swap 2 | 3 | ValuesPercentage true 4 | 5 | -------------------------------------------------------------------------------- /5-6/collectd/collectd.d/write_riemann.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin write_riemann 2 | 3 | 4 | Host "riemanna.example.com" 5 | Port "5555" 6 | Protocol TCP 7 | StoreRates false 8 | CheckThresholds true 9 | TTLFactor 30.0 10 | 11 | Tag "collectd" 12 | 13 | -------------------------------------------------------------------------------- /5-6/riemann/examplecom/etc/checks.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.checks 2 | (:require [riemann.config :refer :all] 3 | [clojure.tools.logging :refer :all] 4 | [riemann.streams :refer :all])) 5 | 6 | (defn set_state [warning critical] 7 | (fn [event] 8 | (assoc event :state 9 | (condp < (:metric event) 10 | critical "critical" 11 | warning "warning" 12 | "ok")))) 13 | 14 | (defn check_threshold [srv window func warning critical & children] 15 | (where (service srv) 16 | (fixed-time-window window 17 | (smap func 18 | (where (< warning metric) 19 | (smap (set_state warning critical) 20 | (fn [event] 21 | (call-rescue event children)))))))) 22 | 23 | (defn check_percentiles [srv window & children] 24 | (where (service srv) 25 | (percentiles window [0.5 0.95 0.99 1] 26 | (fn [event] 27 | (call-rescue event children))))) 28 | 
-------------------------------------------------------------------------------- /5-6/riemann/examplecom/etc/collectd.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.collectd 2 | (:require [clojure.tools.logging :refer :all] 3 | [riemann.streams :refer :all] 4 | [clojure.string :as str])) 5 | 6 | (def default-services 7 | [{:service #"^load/load/(.*)$" :rewrite "load $1"} 8 | {:service #"^swap/percent-(.*)$" :rewrite "swap $1"} 9 | {:service #"^memory/percent-(.*)$" :rewrite "memory $1"} 10 | {:service #"^processes/ps_state-(.*)$" :rewrite "processes $1"} 11 | {:service #"^cpu/percent-(.*)$" :rewrite "cpu $1"} 12 | {:service #"^df-(.*)/(df_complex|percent_bytes)-(.*)$" :rewrite "df $1 $2 $3"} 13 | {:service #"^interface-(.*)/if_(errors|packets|octets)/(tx|rx)$" :rewrite "nic $1 $3 $2"}]) 14 | 15 | (defn rewrite-service-with 16 | [rules] 17 | (let [matcher (fn [s1 s2] (if (string? s1) (= s1 s2) (re-find s1 s2)))] 18 | (fn [{:keys [service] :as event}] 19 | (or 20 | (first 21 | (for [{:keys [rewrite] :as rule} rules 22 | :when (matcher (:service rule) service)] 23 | (assoc event :service 24 | (if (string? 
(:service rule)) 25 | rewrite 26 | (str/replace service (:service rule) rewrite))))) 27 | event)))) 28 | 29 | (def rewrite-service 30 | (rewrite-service-with default-services)) 31 | -------------------------------------------------------------------------------- /5-6/riemann/examplecom/etc/email.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.email 2 | (:require [riemann.email :refer :all])) 3 | 4 | (def email (mailer {:from "riemann@example.com"})) 5 | -------------------------------------------------------------------------------- /5-6/riemann/examplecom/etc/graphite.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.graphite 2 | (:require [riemann.config :refer :all] 3 | [riemann.graphite :refer :all])) 4 | 5 | (defn add-environment-to-graphite [event] (str "productiona.hosts.", (riemann.graphite/graphite-path-percentiles event))) 6 | 7 | (def graph (async-queue! :graphite {:queue-size 1000} 8 | (graphite {:host "graphitea" :path add-environment-to-graphite}))) 9 | -------------------------------------------------------------------------------- /5-6/riemann/riemann.config: -------------------------------------------------------------------------------- 1 | (logging/init {:file "/var/log/riemann/riemann.log"}) 2 | 3 | (require 'riemann.client) 4 | (require '[examplecom.etc.email :refer :all]) 5 | (require '[examplecom.etc.graphite :refer :all]) 6 | (require '[examplecom.etc.collectd :refer :all]) 7 | 8 | (let [host "0.0.0.0"] 9 | (repl-server {:host "127.0.0.1"}) 10 | (tcp-server {:host host}) 11 | (udp-server {:host host}) 12 | (ws-server {:host host})) 13 | 14 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]}) 15 | 16 | (let [index (index) 17 | downstream (batch 100 1/10 18 | (async-queue! 
:agg { :queue-size 1e3 19 | :core-pool-size 4 20 | :max-pool-size 32} 21 | (forward 22 | (riemann.client/tcp-client :host "riemannmc"))))] 23 | 24 | ; Inbound events will be passed to these streams: 25 | (streams 26 | (default :ttl 60 27 | ; Index all events immediately. 28 | (where (not (tagged "notification")) 29 | index) 30 | 31 | (tagged "collectd" 32 | (smap rewrite-service graph) 33 | 34 | (tagged "notification" 35 | (changed-state {:init "ok"} 36 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count$" "$1"] 37 | (email "james@example.com")))) 38 | 39 | (where (and (expired? event) 40 | (service #"^processes-.+\/ps_count\/processes")) 41 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count\/processes$" "$1"] 42 | (email "james@example.com")))) 43 | 44 | (where (service #"^riemann.*") 45 | graph 46 | 47 | downstream)))) 48 | -------------------------------------------------------------------------------- /7/collectd/collectd.conf: -------------------------------------------------------------------------------- 1 | TypesDB "/usr/share/collectd/types.db" 2 | 3 | Interval 2 4 | CheckThresholds true 5 | 6 | LoadPlugin logfile 7 | 8 | 9 | LogLevel "info" 10 | File "/var/log/collectd.log" 11 | Timestamp true 12 | 13 | 14 | LoadPlugin threshold 15 | 16 | Include "/etc/collectd.d/*.conf" 17 | -------------------------------------------------------------------------------- /7/collectd/collectd.d/cpu.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin cpu 2 | 3 | ValuesPercentage true 4 | ReportByCpu false 5 | 6 | -------------------------------------------------------------------------------- /7/collectd/collectd.d/df.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin df 2 | 3 | MountPoint "/" 4 | ValuesPercentage true 5 | 6 | -------------------------------------------------------------------------------- 
/7/collectd/collectd.d/docker.conf: -------------------------------------------------------------------------------- 1 | TypesDB "/usr/lib/collectd/docker/dockerplugin.db" 2 | LoadPlugin python 3 | 4 | 5 | ModulePath "/usr/lib/collectd/docker" 6 | Import "dockerplugin" 7 | 8 | 9 | BaseURL "unix://var/run/docker.sock" 10 | Timeout 3 11 | 12 | 13 | 14 | 15 | Process "docker" 16 | 17 | -------------------------------------------------------------------------------- /7/collectd/collectd.d/memory.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin memory 2 | 3 | ValuesPercentage true 4 | 5 | -------------------------------------------------------------------------------- /7/collectd/collectd.d/processes.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin processes 2 | 3 | Process "collectd" 4 | 5 | 6 | 7 | 8 | 9 | DataSource "processes" 10 | FailureMin 1 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /7/collectd/collectd.d/swap.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin swap 2 | 3 | ValuesPercentage true 4 | 5 | -------------------------------------------------------------------------------- /7/collectd/collectd.d/write_riemann.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin write_riemann 2 | 3 | 4 | Host "riemanna.example.com" 5 | Port "5555" 6 | Protocol TCP 7 | StoreRates false 8 | CheckThresholds true 9 | TTLFactor 30.0 10 | 11 | Tag "collectd" 12 | 13 | -------------------------------------------------------------------------------- /7/riemann/examplecom/etc/checks.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.checks 2 | (:require [riemann.config :refer :all] 3 | [clojure.tools.logging :refer :all] 4 | [riemann.streams :refer :all])) 5 | 6 | 
(defn set_state [warning critical] 7 | (fn [event] 8 | (assoc event :state 9 | (condp < (:metric event) 10 | critical "critical" 11 | warning "warning" 12 | "ok")))) 13 | 14 | (defn check_threshold [srv window func warning critical & children] 15 | (where (service srv) 16 | (fixed-time-window window 17 | (smap func 18 | (where (< warning metric) 19 | (smap (set_state warning critical) 20 | (fn [event] 21 | (call-rescue event children)))))))) 22 | 23 | (defn check_percentiles [srv window & children] 24 | (where (service srv) 25 | (percentiles window [0.5 0.95 0.99 1] 26 | (fn [event] 27 | (call-rescue event children))))) 28 | -------------------------------------------------------------------------------- /7/riemann/examplecom/etc/collectd.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.collectd 2 | (:require [clojure.tools.logging :refer :all] 3 | [riemann.streams :refer :all] 4 | [clojure.string :as str] 5 | [clojure.walk :as walk])) 6 | 7 | (defn docker-attribute-map 8 | [attributes] 9 | (let [instance (str/split (str/replace attributes #"^.*\[(.*)\]$" "$1") #",")] 10 | (walk/keywordize-keys (into {} (for [pair instance] (apply hash-map (str/split pair #"="))))))) 11 | 12 | (defn docker-attributes 13 | [{:keys [plugin_instance] :as event}] 14 | (if-let [attributes (re-find #"^.*\[.*\]$" plugin_instance)] 15 | (merge event (docker-attribute-map attributes)) 16 | event)) 17 | 18 | (defn parse-docker-service-host 19 | [{:keys [type type_instance plugin_instance] :as event}] 20 | (let [host (re-find #"^\w+\.?\w+\.?\w+" (:plugin_instance event)) 21 | service (cond-> (str (:type event)) (:type_instance event) (str "." 
(:type_instance event)))] 22 | (assoc event :service service :host host))) 23 | 24 | (def default-services 25 | [{:service #"^load/load/(.*)$" :rewrite "load $1"} 26 | {:service #"^swap/percent-(.*)$" :rewrite "swap $1"} 27 | {:service #"^memory/percent-(.*)$" :rewrite "memory $1"} 28 | {:service #"^processes/ps_state-(.*)$" :rewrite "processes $1"} 29 | {:service #"^processes-(.*)/(.*)$" :rewrite "processes $1 $2"} 30 | {:service #"^cpu/percent-(.*)$" :rewrite "cpu $1"} 31 | {:service #"^df-(.*)/(df_complex|percent_bytes)-(.*)$" :rewrite "df $1 $2 $3"} 32 | {:service #"^interface-(.*)/if_(errors|packets|octets)/(tx|rx)$" :rewrite "nic $1 $3 $2"} 33 | {:service #"^protocols-(.*)/(.*)$" :rewrite "protocols $1 $2"} 34 | {:service #"^GenericJMX-(:?_|\/)?(.*)$" :rewrite "jmx $2"} 35 | {:service #"^haproxy\/(gauge|derive)-(.*)$" :rewrite "haproxy $2"} 36 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "$2"} 37 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "statsd $1 $2"} 38 | {:service #"^mysql-(.*)\/(counter|gauge)-(.*)$" :rewrite "mysql $1 $3"} 39 | {:service #"^dbi-(.*)\/(gauge|counter)-(.*)$" :rewrite "dbi $1 $3"} 40 | {:service #"^redis-(.*)$" :rewrite "redis $1"}]) 41 | 42 | (defn rewrite-service-with 43 | [rules] 44 | (let [matcher (fn [s1 s2] (if (string? s1) (= s1 s2) (re-find s1 s2)))] 45 | (fn [{:keys [service] :as event}] 46 | (or 47 | (first 48 | (for [{:keys [rewrite] :as rule} rules 49 | :when (matcher (:service rule) service)] 50 | (assoc event :service 51 | (if (string? 
(:service rule)) 52 | rewrite 53 | (str/replace service (:service rule) rewrite))))) 54 | event)))) 55 | 56 | (def rewrite-service 57 | (rewrite-service-with default-services)) 58 | -------------------------------------------------------------------------------- /7/riemann/examplecom/etc/email.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.email 2 | (:require [riemann.email :refer :all])) 3 | 4 | (def email (mailer {:from "riemann@example.com"})) 5 | -------------------------------------------------------------------------------- /7/riemann/examplecom/etc/graphite.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.graphite 2 | (:require [riemann.config :refer :all] 3 | [riemann.graphite :refer :all])) 4 | 5 | (defn add-environment-to-graphite [event] 6 | (condp = (:plugin event) 7 | "docker" 8 | (if (:com.example.application event) 9 | (str "productiona.docker.", (:com.example.application event), ".", (riemann.graphite/graphite-path-percentiles event)) 10 | (str "productiona.docker.", (riemann.graphite/graphite-path-percentiles event))) 11 | (str "productiona.hosts.", (riemann.graphite/graphite-path-percentiles event)))) 12 | 13 | (def graph (async-queue! 
:graphite {:queue-size 1000} 14 | (graphite {:host "graphitea" :path add-environment-to-graphite}))) 15 | -------------------------------------------------------------------------------- /7/riemann/riemann.config: -------------------------------------------------------------------------------- 1 | (logging/init {:file "/var/log/riemann/riemann.log"}) 2 | 3 | (require 'riemann.client) 4 | (require '[examplecom.etc.email :refer :all]) 5 | (require '[examplecom.etc.graphite :refer :all]) 6 | (require '[examplecom.etc.collectd :refer :all]) 7 | 8 | (let [host "0.0.0.0"] 9 | (repl-server {:host "127.0.0.1"}) 10 | (tcp-server {:host host}) 11 | (udp-server {:host host}) 12 | (ws-server {:host host})) 13 | 14 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]}) 15 | 16 | (let [index (index) 17 | downstream (batch 100 1/10 18 | (async-queue! :agg { :queue-size 1e3 19 | :core-pool-size 4 20 | :max-pool-size 32} 21 | (forward 22 | (riemann.client/tcp-client :host "riemannmc"))))] 23 | 24 | ; Inbound events will be passed to these streams: 25 | (streams 26 | (default :ttl 60 27 | ; Index all events immediately. 28 | (where (not (tagged "notification")) 29 | index) 30 | 31 | (tagged "collectd" 32 | (where (not (= (:plugin event) "docker")) 33 | (smap rewrite-service graph)) 34 | 35 | (where (= (:plugin event) "docker") 36 | (smap (comp parse-docker-service-host docker-attributes rewrite-service) graph)) 37 | 38 | (tagged "notification" 39 | (changed-state {:init "ok"} 40 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count$" "$1"] 41 | (email "james@example.com")))) 42 | 43 | (where (and (expired? 
event) 44 | (service #"^processes-.+\/ps_count\/processes")) 45 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count\/processes$" "$1"] 46 | (email "james@example.com")))) 47 | 48 | (where (service #"^riemann.*") 49 | graph 50 | 51 | downstream)))) 52 | -------------------------------------------------------------------------------- /8/collectd/elasticsearch.conf: -------------------------------------------------------------------------------- 1 | 2 | Globals true 3 | 4 | 5 | 6 | ModulePath "/usr/lib/collectd/" 7 | 8 | Import "elasticsearch_collectd" 9 | 10 | 11 | Verbose false 12 | Cluster "productiona" 13 | 14 | 15 | 16 | LoadPlugin processes 17 | 18 | Process "elasticsearch" 19 | 20 | 21 | -------------------------------------------------------------------------------- /8/collectd/elasticsearch_collectd.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | #Copyright 2014 Jeremy Carroll 3 | # 4 | #Licensed under the Apache License, Version 2.0 (the "License"); 5 | #you may not use this file except in compliance with the License. 6 | #You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | #Unless required by applicable law or agreed to in writing, software 11 | #distributed under the License is distributed on an "AS IS" BASIS, 12 | #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | #See the License for the specific language governing permissions and 14 | #limitations under the License. 15 | 16 | 17 | import collectd 18 | import json 19 | import urllib2 20 | import socket 21 | import collections 22 | from distutils.version import StrictVersion 23 | 24 | 25 | ES_CLUSTER = "elasticsearch" 26 | ES_HOST = "localhost" 27 | ES_PORT = 9200 28 | 29 | # ES indexes must be fully qualified. E.g. _all, index1,index2 30 | # To do: Handle glob sytanx for index names. 
31 | ES_INDEX = [ ] 32 | 33 | ENABLE_INDEX_STATS = False 34 | ENABLE_NODE_STATS = True 35 | 36 | VERBOSE_LOGGING = False 37 | 38 | Stat = collections.namedtuple('Stat', ('type', 'path')) 39 | 40 | # Indices are cluster wide, metrics should be collected from only one server 41 | # in the cluster or from an external probe server. 42 | INDEX_STATS = { 43 | 44 | # === ElasticSearch 0.90.x and higher === 45 | "v('{es_version}') >= v('0.90.0')": { 46 | 47 | ## PRIMARIES 48 | # DOCS 49 | "indices.{index_name}.primaries.docs.count" : Stat("counter", "indices.%s.primaries.docs.count"), 50 | "indices.{index_name}.primaries.docs.deleted" : Stat("counter", "indices.%s.primaries.docs.deleted"), 51 | # STORE 52 | "indices.{index_name}.primaries.store.size_in_bytes" : Stat("bytes", "indices.%s.primaries.store.size_in_bytes"), 53 | "indices.{index_name}.primaries.store.throttle_time_in_millis" : Stat("counter", "indices.%s.primaries.store.throttle_time_in_millis"), 54 | # INDEXING 55 | "indices.{index_name}.primaries.indexing.index_total" : Stat("counter", "indices.%s.primaries.indexing.index_total"), 56 | "indices.{index_name}.primaries.indexing.index_time_in_millis" : Stat("counter", "indices.%s.primaries.indexing.index_time_in_millis"), 57 | "indices.{index_name}.primaries.indexing.index_current" : Stat("gauge", "indices.%s.primaries.indexing.index_current"), 58 | "indices.{index_name}.primaries.indexing.delete_total" : Stat("counter", "indices.%s.primaries.indexing.delete_total"), 59 | "indices.{index_name}.primaries.indexing.delete_time_in_millis" : Stat("counter", "indices.%s.primaries.indexing.delete_time_in_millis"), 60 | "indices.{index_name}.primaries.indexing.delete_current" : Stat("gauge", "indices.%s.primaries.indexing.delete_current"), 61 | # GET 62 | "indices.{index_name}.primaries.get.time_in_millis" : Stat("counter", "indices.%s.primaries.get.time_in_millis"), 63 | "indices.{index_name}.primaries.get.exists_total" : Stat("counter", 
"indices.%s.primaries.get.exists_total"), 64 | "indices.{index_name}.primaries.get.exists_time_in_millis" : Stat("counter", "indices.%s.primaries.get.exists_time_in_millis"), 65 | "indices.{index_name}.primaries.get.missing_total" : Stat("counter", "indices.%s.primaries.get.missing_total"), 66 | "indices.{index_name}.primaries.get.missing_time_in_millis" : Stat("counter", "indices.%s.primaries.get.missing_time_in_millis"), 67 | "indices.{index_name}.primaries.get.current" : Stat("gauge", "indices.%s.primaries.get.current"), 68 | # SEARCH 69 | "indices.{index_name}.primaries.search.open_contexts" : Stat("gauge", "indices.%s.primaries.search.open_contexts"), 70 | "indices.{index_name}.primaries.search.query_total" : Stat("counter", "indices.%s.primaries.search.query_total"), 71 | "indices.{index_name}.primaries.search.query_time_in_millis" : Stat("counter", "indices.%s.primaries.search.query_time_in_millis"), 72 | "indices.{index_name}.primaries.search.query_current" : Stat("gauge", "indices.%s.primaries.search.query_current"), 73 | "indices.{index_name}.primaries.search.fetch_total" : Stat("counter", "indices.%s.primaries.search.fetch_total"), 74 | "indices.{index_name}.primaries.search.fetch_time_in_millis" : Stat("counter", "indices.%s.primaries.search.fetch_time_in_millis"), 75 | "indices.{index_name}.primaries.search.fetch_current" : Stat("gauge", "indices.%s.primaries.search.fetch_current"), 76 | # MERGES 77 | "indices.{index_name}.primaries.merges.current" : Stat("gauge", "indices.%s.primaries.merges.current"), 78 | "indices.{index_name}.primaries.merges.current_docs" : Stat("gauge", "indices.%s.primaries.merges.current_docs"), 79 | "indices.{index_name}.primaries.merges.current_size_in_bytes" : Stat("bytes", "indices.%s.primaries.merges.current_size_in_bytes"), 80 | "indices.{index_name}.primaries.merges.total" : Stat("counter", "indices.%s.primaries.merges.total"), 81 | "indices.{index_name}.primaries.merges.total_time_in_millis" : Stat("counter", 
"indices.%s.primaries.merges.total_time_in_millis"), 82 | "indices.{index_name}.primaries.merges.total_docs" : Stat("counter", "indices.%s.primaries.merges.total_docs"), 83 | "indices.{index_name}.primaries.merges.total_size_in_bytes" : Stat("bytes", "indices.%s.primaries.merges.total_size_in_bytes"), 84 | # REFRESH 85 | "indices.{index_name}.primaries.refresh.total" : Stat("counter", "indices.%s.primaries.refresh.total"), 86 | "indices.{index_name}.primaries.refresh.total_time_in_millis" : Stat("counter", "indices.%s.primaries.refresh.total_time_in_millis"), 87 | # FLUSH 88 | "indices.{index_name}.primaries.flush.total" : Stat("counter", "indices.%s.primaries.flush.total"), 89 | "indices.{index_name}.primaries.flush.total_time_in_millis" : Stat("counter", "indices.%s.primaries.flush.total_time_in_millis"), 90 | # WARMER 91 | "indices.{index_name}.primaries.warmer.current" : Stat("gauge", "indices.%s.primaries.warmer.current"), 92 | "indices.{index_name}.primaries.warmer.total" : Stat("counter", "indices.%s.primaries.warmer.total"), 93 | "indices.{index_name}.primaries.warmer.total_time_in_millis" : Stat("counter", "indices.%s.primaries.warmer.total_time_in_millis"), 94 | # FILTER_CACHE 95 | "indices.{index_name}.primaries.filter_cache.memory_size_in_bytes" : Stat("bytes", "indices.%s.primaries.filter_cache.memory_size_in_bytes"), 96 | "indices.{index_name}.primaries.filter_cache.evictions" : Stat("counter", "indices.%s.primaries.filter_cache.evictions"), 97 | # ID_CACHE 98 | "indices.{index_name}.primaries.id_cache.memory_size_in_bytes" : Stat("bytes", "indices.%s.primaries.id_cache.memory_size_in_bytes"), 99 | # FIELDDATA 100 | "indices.{index_name}.primaries.fielddata.memory_size_in_bytes" : Stat("bytes", "indices.%s.primaries.fielddata.memory_size_in_bytes"), 101 | "indices.{index_name}.primaries.fielddata.evictions" : Stat("counter", "indices.%s.primaries.fielddata.evictions"), 102 | # PERCOLATE 103 | "indices.{index_name}.primaries.percolate.total" : 
Stat("counter", "indices.%s.primaries.percolate.total"), 104 | "indices.{index_name}.primaries.percolate.time_in_millis" : Stat("counter", "indices.%s.primaries.percolate.time_in_millis"), 105 | "indices.{index_name}.primaries.percolate.current" : Stat("gauge", "indices.%s.primaries.percolate.current"), 106 | "indices.{index_name}.primaries.percolate.memory_size_in_bytes" : Stat("bytes", "indices.%s.primaries.percolate.memory_size_in_bytes"), 107 | "indices.{index_name}.primaries.percolate.queries" : Stat("counter", "indices.%s.primaries.percolate.queries"), 108 | # COMPELTION 109 | "indices.{index_name}.primaries.completion.size_in_bytes" : Stat("bytes", "indices.%s.primaries.completion.size_in_bytes"), 110 | # SEGMENTS 111 | "indices.{index_name}.primaries.segments.count" : Stat("counter", "indices.%s.primaries.segments.count"), 112 | "indices.{index_name}.primaries.segments.memory_in_bytes" : Stat("bytes", "indices.%s.primaries.segments.memory_in_bytes"), 113 | "indices.{index_name}.primaries.segments.index_writer_memory_in_bytes" : Stat("bytes", "indices.%s.primaries.segments.index_writer_memory_in_bytes"), 114 | "indices.{index_name}.primaries.segments.version_map_memory_in_bytes" : Stat("bytes", "indices.%s.primaries.segments.version_map_memory_in_bytes"), 115 | # TRANSLOG 116 | "indices.{index_name}.primaries.translog.operations" : Stat("counter", "indices.%s.primaries.translog.operations"), 117 | "indices.{index_name}.primaries.translog.size_in_bytes" : Stat("bytes", "indices.%s.primaries.translog.size_in_bytes"), 118 | # SUGGEST 119 | "indices.{index_name}.primaries.suggest.total" : Stat("counter", "indices.%s.primaries.suggest.total"), 120 | "indices.{index_name}.primaries.suggest.time_in_millis" : Stat("counter", "indices.%s.primaries.suggest.time_in_millis"), 121 | "indices.{index_name}.primaries.suggest.current" : Stat("gauge", "indices.%s.primaries.suggest.current"), 122 | 123 | ## TOTAL ## 124 | # DOCS 125 | "indices.{index_name}.total.docs.count" : 
Stat("gauge", "indices.%s.total.docs.count"), 126 | "indices.{index_name}.total.docs.deleted" : Stat("gauge", "indices.%s.total.docs.deleted"), 127 | # STORE 128 | "indices.{index_name}.total.store.size_in_bytes" : Stat("gauge", "indices.%s.total.store.size_in_bytes"), 129 | "indices.{index_name}.total.store.throttle_time_in_millis" : Stat("counter", "indices.%s.total.store.throttle_time_in_millis"), 130 | # INDEXING 131 | "indices.{index_name}.total.indexing.index_total" : Stat("counter", "indices.%s.total.indexing.index_total"), 132 | "indices.{index_name}.total.indexing.index_time_in_millis" : Stat("counter", "indices.%s.total.indexing.index_time_in_millis"), 133 | "indices.{index_name}.total.indexing.index_current" : Stat("gauge", "indices.%s.total.indexing.index_current"), 134 | "indices.{index_name}.total.indexing.delete_total" : Stat("counter", "indices.%s.total.indexing.delete_total"), 135 | "indices.{index_name}.total.indexing.delete_time_in_millis" : Stat("counter", "indices.%s.total.indexing.delete_time_in_millis"), 136 | "indices.{index_name}.total.indexing.delete_current" : Stat("gauge", "indices.%s.total.indexing.delete_current"), 137 | # GET 138 | "indices.{index_name}.total.get.total" : Stat("counter", "indices.%s.total.get.total"), 139 | "indices.{index_name}.total.get.time_in_millis" : Stat("counter", "indices.%s.total.get.time_in_millis"), 140 | "indices.{index_name}.total.get.exists_total" : Stat("counter", "indices.%s.total.get.exists_total"), 141 | "indices.{index_name}.total.get.exists_time_in_millis" : Stat("counter", "indices.%s.total.get.exists_time_in_millis"), 142 | "indices.{index_name}.total.get.missing_total" : Stat("counter", "indices.%s.total.get.missing_total"), 143 | "indices.{index_name}.total.get.missing_time_in_millis" : Stat("counter", "indices.%s.total.get.missing_time_in_millis"), 144 | "indices.{index_name}.total.get.current" : Stat("gauge", "indices.%s.total.get.current"), 145 | # SEARCH 146 | 
"indices.{index_name}.total.search.open_contexts" : Stat("gauge", "indices.%s.total.search.open_contexts"), 147 | "indices.{index_name}.total.search.query_total" : Stat("counter", "indices.%s.total.search.query_total"), 148 | "indices.{index_name}.total.search.query_time_in_millis" : Stat("counter", "indices.%s.total.search.query_time_in_millis"), 149 | "indices.{index_name}.total.search.query_current" : Stat("gauge", "indices.%s.total.search.query_current"), 150 | "indices.{index_name}.total.search.fetch_total" : Stat("counter", "indices.%s.total.search.fetch_total"), 151 | } 152 | } 153 | 154 | NODE_STATS = { 155 | 156 | # === ElasticSearch 0.90.x and higher === 157 | "v('{es_version}') >= v('0.90.0')": { 158 | ## DOCS 159 | 'indices.docs.count': Stat("gauge", "nodes.%s.indices.docs.count"), 160 | 'indices.docs.deleted': Stat("counter", "nodes.%s.indices.docs.deleted"), 161 | 162 | ## STORE 163 | 'indices.store.size': Stat("bytes", "nodes.%s.indices.store.size_in_bytes"), 164 | 165 | ## INDEXING 166 | 'indices.indexing.index-total': Stat("counter", "nodes.%s.indices.indexing.index_total"), 167 | 'indices.indexing.index-time': Stat("counter", "nodes.%s.indices.indexing.index_time_in_millis"), 168 | 'indices.indexing.delete-total': Stat("counter", "nodes.%s.indices.indexing.delete_total"), 169 | 'indices.indexing.delete-time': Stat("counter", "nodes.%s.indices.indexing.delete_time_in_millis"), 170 | 'indices.indexing.index-current': Stat("gauge", "nodes.%s.indices.indexing.index_current"), 171 | 'indices.indexing.delete-current': Stat("gauge", "nodes.%s.indices.indexing.delete_current"), 172 | 173 | ## GET 174 | 'indices.get.total': Stat("counter", "nodes.%s.indices.get.total"), 175 | 'indices.get.time': Stat("counter", "nodes.%s.indices.get.time_in_millis"), 176 | 'indices.get.exists-total': Stat("counter", "nodes.%s.indices.get.exists_total"), 177 | 'indices.get.exists-time': Stat("counter", "nodes.%s.indices.get.exists_time_in_millis"), 178 | 
'indices.get.missing-total': Stat("counter", "nodes.%s.indices.get.missing_total"), 179 | 'indices.get.missing-time': Stat("counter", "nodes.%s.indices.get.missing_time_in_millis"), 180 | 'indices.get.current': Stat("gauge", "nodes.%s.indices.get.current"), 181 | 182 | ## SEARCH 183 | 'indices.search.query-current': Stat("gauge", "nodes.%s.indices.search.query_current"), 184 | 'indices.search.query-total': Stat("counter", "nodes.%s.indices.search.query_total"), 185 | 'indices.search.query-time': Stat("counter", "nodes.%s.indices.search.query_time_in_millis"), 186 | 'indices.search.fetch-current': Stat("gauge", "nodes.%s.indices.search.fetch_current"), 187 | 'indices.search.fetch-total': Stat("counter", "nodes.%s.indices.search.fetch_total"), 188 | 'indices.search.fetch-time': Stat("counter", "nodes.%s.indices.search.fetch_time_in_millis"), 189 | 190 | # JVM METRICS # 191 | ##GC 192 | 'jvm.gc.time': Stat("counter", "nodes.%s.jvm.gc.collectors.young.collection_time_in_millis"), 193 | 'jvm.gc.count': Stat("counter", "nodes.%s.jvm.gc.collectors.young.collection_count"), 194 | 'jvm.gc.old-time': Stat("counter", "nodes.%s.jvm.gc.collectors.old.collection_time_in_millis"), 195 | 'jvm.gc.old-count': Stat("counter", "nodes.%s.jvm.gc.collectors.old.collection_count"), 196 | 197 | ## MEM 198 | 'jvm.mem.heap-committed': Stat("bytes", "nodes.%s.jvm.mem.heap_committed_in_bytes"), 199 | 'jvm.mem.heap-used': Stat("bytes", "nodes.%s.jvm.mem.heap_used_in_bytes"), 200 | 'jvm.mem.heap-used-percent': Stat("percent", "nodes.%s.jvm.mem.heap_used_percent"), 201 | 'jvm.mem.non-heap-committed': Stat("bytes", "nodes.%s.jvm.mem.non_heap_committed_in_bytes"), 202 | 'jvm.mem.non-heap-used': Stat("bytes", "nodes.%s.jvm.mem.non_heap_used_in_bytes"), 203 | 204 | ## THREADS 205 | 'jvm.threads.count': Stat("gauge", "nodes.%s.jvm.threads.count"), 206 | 'jvm.threads.peak': Stat("gauge", "nodes.%s.jvm.threads.peak_count"), 207 | 208 | # TRANSPORT METRICS # 209 | 'transport.server_open': Stat("gauge", 
"nodes.%s.transport.server_open"), 210 | 'transport.rx.count': Stat("counter", "nodes.%s.transport.rx_count"), 211 | 'transport.rx.size': Stat("bytes", "nodes.%s.transport.rx_size_in_bytes"), 212 | 'transport.tx.count': Stat("counter", "nodes.%s.transport.tx_count"), 213 | 'transport.tx.size': Stat("bytes", "nodes.%s.transport.tx_size_in_bytes"), 214 | 215 | # HTTP METRICS # 216 | 'http.current_open': Stat("gauge", "nodes.%s.http.current_open"), 217 | 'http.total_open': Stat("counter", "nodes.%s.http.total_opened"), 218 | 219 | # PROCESS METRICS # 220 | 'process.open_file_descriptors': Stat("gauge", "nodes.%s.process.open_file_descriptors"), 221 | }, 222 | 223 | # === ElasticSearch 0.90.x only === 224 | "v('0.90.0') <= v('{es_version}') < v('1.0.0')": { 225 | ##CPU 226 | 'process.cpu.percent': Stat("gauge", "nodes.%s.process.cpu.percent") 227 | }, 228 | 229 | # === ElasticSearch 1.0.0 or greater === 230 | "v('{es_version}') >= v('1.0.0')": { 231 | ## STORE 232 | 'indices.store.throttle-time': Stat("counter", "nodes.%s.indices.store.throttle_time_in_millis"), 233 | 234 | ##SEARCH 235 | 'indices.search.open-contexts': Stat("gauge", "nodes.%s.indices.search.open_contexts"), 236 | 237 | ##CACHE 238 | 'indices.cache.field.eviction': Stat("counter", "nodes.%s.indices.fielddata.evictions"), 239 | 'indices.cache.field.size': Stat("bytes", "nodes.%s.indices.fielddata.memory_size_in_bytes"), 240 | 'indices.cache.filter.evictions': Stat("counter", "nodes.%s.indices.filter_cache.evictions"), 241 | 'indices.cache.filter.size': Stat("bytes", "nodes.%s.indices.filter_cache.memory_size_in_bytes"), 242 | 243 | ## FLUSH 244 | 'indices.flush.total': Stat("counter", "nodes.%s.indices.flush.total"), 245 | 'indices.flush.time': Stat("counter", "nodes.%s.indices.flush.total_time_in_millis"), 246 | 247 | ## MERGES 248 | 'indices.merges.current': Stat("gauge", "nodes.%s.indices.merges.current"), 249 | 'indices.merges.current-docs': Stat("gauge", "nodes.%s.indices.merges.current_docs"), 250 
| 'indices.merges.current-size': Stat("bytes", "nodes.%s.indices.merges.current_size_in_bytes"), 251 | 'indices.merges.total': Stat("counter", "nodes.%s.indices.merges.total"), 252 | 'indices.merges.total-docs': Stat("gauge", "nodes.%s.indices.merges.total_docs"), 253 | 'indices.merges.total-size': Stat("bytes", "nodes.%s.indices.merges.total_size_in_bytes"), 254 | 'indices.merges.time': Stat("counter", "nodes.%s.indices.merges.total_time_in_millis"), 255 | 256 | ## REFRESH 257 | 'indices.refresh.total': Stat("counter", "nodes.%s.indices.refresh.total"), 258 | 'indices.refresh.time': Stat("counter", "nodes.%s.indices.refresh.total_time_in_millis"), 259 | 260 | ## SEGMENTS 261 | 'indices.segments.count': Stat("gauge", "nodes.%s.indices.segments.count"), 262 | 'indices.segments.size': Stat("bytes", "nodes.%s.indices.segments.memory_in_bytes"), 263 | 264 | ## TRANSLOG 265 | 'indices.translog.operations': Stat("gauge", "nodes.%s.indices.translog.operations"), 266 | 'indices.translog.size': Stat("bytes", "nodes.%s.indices.translog.size_in_bytes"), 267 | }, 268 | 269 | # DICT: ElasticSearch 1.3.0 or greater 270 | "v('{es_version}') >= v('1.3.0')": { 271 | 'indices.segments.index-writer-memory': Stat("bytes", "nodes.%s.indices.segments.index_writer_memory_in_bytes"), 272 | 'indices.segments.index-memory': Stat("bytes", "nodes.%s.indices.segments.memory_in_bytes"), 273 | } 274 | } 275 | 276 | STATS_CUR = {} 277 | 278 | def check_es_version(rule, version): 279 | log_verbose('Elasticsearch version rule: %s' % (rule.format(es_version=version)) ) 280 | v = StrictVersion 281 | eval_string = rule.format(es_version=version) 282 | return eval(eval_string) 283 | 284 | 285 | def generate_metric_set(rules, es_version): 286 | """ 287 | @breif - Given an initial set of metrics with the elasticsearch version and the 288 | requested metrics to be fetched, parse all pre-defined metrics and 289 | return a sythesised set of metrics which is compatiable with existing 290 | functions. 
@rules - a structure which contains a rule to be evaluated when evaluating 293 | which metrics are to be appended to the returned data set.
log_verbose('Indexes to query: %s' % (str(ES_INDEX))) 355 | elif node.key == 'EnableIndexStats': 356 | ENABLE_INDEX_STATS = bool(node.values[0]) 357 | log_verbose("Enable Index Stats : %s" % ENABLE_INDEX_STATS) 358 | elif node.key == 'EnableNodeStats': 359 | ENABLE_NODE_STATS = bool(node.values[0]) 360 | log_verbose("Enable Node Stats : %s" % ENABLE_NODE_STATS) 361 | else: 362 | collectd.warning('elasticsearch plugin: Ignoring unknown config key: %s.' % node.key) 363 | 364 | log_verbose('Configured with host=%s, port=%s' % (ES_HOST, ES_PORT)) 365 | 366 | 367 | 368 | def fetch_url(url): 369 | try: 370 | result = json.load(urllib2.urlopen(url, timeout=10)) 371 | except urllib2.URLError, e: 372 | collectd.error('elasticsearch plugin: Error connecting to %s - %r' % (url, e)) 373 | return None 374 | return result 375 | 376 | 377 | 378 | def fetch_stats(): 379 | global ES_CLUSTER, ES_HOST, ES_PORT, STATS_CUR, ES_INDEX, ENABLE_NODE_STATS, ENABLE_INDEX_STATS 380 | 381 | NODE_STATS_URL = { 382 | "v('{es_version}') >= v('0.90.0')": '{url}_cluster/nodes/_local/stats?http=true&process=true&jvm=true&transport=true', 383 | "v('{es_version}') >= v('1.0.0')" : '{url}_nodes/_local/stats/transport,http,process,jvm,indices' 384 | } 385 | 386 | node_stats_url = "" 387 | base_url = 'http://' + ES_HOST + ':' + str(ES_PORT) + '/' 388 | server_info = fetch_url(base_url) 389 | version = server_info['version']['number'] 390 | 391 | # Get the cluster name. 
392 | if server_info.has_key("cluster_name"): 393 | ES_CLUSTER = server_info["cluster_name"] 394 | else: 395 | ES_CLUSTER = fetch_url(base_url+"_nodes")['cluster_name'] 396 | 397 | log_verbose('Elasticsearch cluster: %s version : %s' % (ES_CLUSTER, version)) 398 | 399 | # Node statistics 400 | if ENABLE_NODE_STATS: 401 | node_metrics = {} 402 | for k in NODE_STATS_URL.keys(): 403 | if check_es_version(k, str(version)): 404 | node_stats_url = NODE_STATS_URL[k].format(url=base_url) 405 | log_verbose('Node url : %s' % node_stats_url) 406 | 407 | node_metrics.update(generate_metric_set(NODE_STATS, version)) 408 | 409 | # FIXME: Re-add the thread pool statistics. 410 | # # add info on thread pools 411 | # for pool in ['generic', 'index', 'get', 'snapshot', 'merge', 'optimize', 'bulk', 'warmer', 'flush', 'search', 'refresh']: 412 | # for attr in ['threads', 'queue', 'active', 'largest']: 413 | # path = 'thread_pool.{0}.{1}'.format(pool, attr) 414 | # node_metrics[path] = Stat("gauge", 'nodes.%s.{0}'.format(path)) 415 | # for attr in ['completed', 'rejected']: 416 | # path = 'thread_pool.{0}.{1}'.format(pool, attr) 417 | # node_metrics[path] = Stat("counter", 'nodes.%s.{0}'.format(path)) 418 | 419 | node_json = fetch_url(node_stats_url) 420 | parse_node_stats(node_metrics, node_json) 421 | log_verbose('Node stats processed') 422 | 423 | # Indexes statistics 424 | if ENABLE_INDEX_STATS: 425 | index_metrics = {} 426 | log_verbose('Checking index.') 427 | for k in ES_INDEX: 428 | index_stats_url = base_url + k + "/_stats" 429 | index_metrics.update(generate_metric_set(INDEX_STATS, version)) 430 | log_verbose('Index statistics url : %s' % index_stats_url) 431 | 432 | index_json = fetch_url(index_stats_url) 433 | parse_index_stats(index_metrics, index_json, k) 434 | 435 | return True 436 | 437 | 438 | 439 | def parse_node_stats(metrics, json): 440 | """Parse stats response from ElasticSearch""" 441 | for name, key in metrics.iteritems(): 442 | result = lookup_node_stat(name, 
metrics, json) 443 | dispatch_stat(result, name, key) 444 | return True 445 | 446 | 447 | def parse_index_stats(metrics, json, index): 448 | """Parse stats response from ElasticSearch""" 449 | for name, key in metrics.iteritems(): 450 | result = lookup_index_stat(name, metrics, json) 451 | dispatch_stat(result, name.format(index_name=index), key) 452 | return True 453 | 454 | 455 | def dispatch_stat(result, name, key): 456 | """Read a key from info response data and dispatch a value""" 457 | if result is None: 458 | collectd.warning('elasticsearch plugin: Value not found for %s' % name) 459 | return 460 | estype = key.type 461 | value = int(result) 462 | log_verbose('Sending value[%s]: %s=%s' % (estype, name, value)) 463 | 464 | val = collectd.Values(plugin='elasticsearch') 465 | val.plugin_instance = ES_CLUSTER 466 | val.type = estype 467 | val.type_instance = name 468 | val.values = [value] 469 | val.meta={'0': True} 470 | val.dispatch() 471 | 472 | 473 | def read_callback(): 474 | log_verbose('Read callback called') 475 | stats = fetch_stats() 476 | 477 | 478 | 479 | def dig_it_up(obj, path): 480 | try: 481 | if type(path) in (str, unicode): 482 | path = path.split('.') 483 | return reduce(lambda x, y: x[y], path, obj) 484 | except: 485 | return False 486 | 487 | 488 | def index_dig_it_up(obj, path, index_name): 489 | try: 490 | if type(path) in (str, unicode): 491 | path = path.split('.') 492 | path[1] = path[1] % index_name 493 | return reduce(lambda x, y: x[y], path, obj) 494 | except: 495 | return False 496 | 497 | 498 | 499 | collectd.register_config(configure_callback) 500 | collectd.register_read(read_callback) 501 | -------------------------------------------------------------------------------- /8/collectd/logstash.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin processes 2 | 3 | ProcessMatch "logstash" "logstash\/runner.rb" 4 | 5 | 
-------------------------------------------------------------------------------- /8/collectd/logstash_jmx.conf: -------------------------------------------------------------------------------- 
# Collect JVM metrics from Logstash via JMX using collectd's GenericJMX plugin.
# NOTE(review): the <Plugin>/<MBean>/<Value>/<Connection> block markup below was
# stripped from this dump by HTML rendering and has been restored from the
# visible attributes and Collect names — verify against the upstream file.
LoadPlugin java

<Plugin "java">
  # Classpath for the collectd Java bindings and the GenericJMX plugin jar.
  JVMARG "-Djava.class.path=/usr/share/collectd/java/collectd-api.jar:/usr/share/collectd/java/generic-jmx.jar"
  LoadPlugin "org.collectd.java.GenericJMX"

  <Plugin "GenericJMX">
    # Garbage collection: cumulative collection count per collector.
    <MBean "gc-count">
      ObjectName "java.lang:type=GarbageCollector,*"
      InstancePrefix "gc-"
      InstanceFrom "name"
      <Value>
        Type "derive"
        Table false
        Attribute "CollectionCount"
        InstancePrefix "count"
      </Value>
    </MBean>

    # Garbage collection: cumulative collection time per collector.
    <MBean "gc-time">
      ObjectName "java.lang:type=GarbageCollector,*"
      InstancePrefix "gc-"
      InstanceFrom "name"
      <Value>
        Type "derive"
        Table false
        Attribute "CollectionTime"
        InstancePrefix "time"
      </Value>
    </MBean>

    # Memory usage per JVM memory pool (Usage is a composite: init/used/committed/max).
    <MBean "memory_pool">
      ObjectName "java.lang:type=MemoryPool,*"
      InstancePrefix "memory_pool-"
      InstanceFrom "name"
      <Value>
        Type "memory"
        Table true
        Attribute "Usage"
      </Value>
    </MBean>

    # Aggregate heap memory usage.
    <MBean "memory-heap">
      ObjectName "java.lang:type=Memory"
      InstancePrefix "memory-heap"
      <Value>
        Type "memory"
        Table true
        Attribute "HeapMemoryUsage"
      </Value>
    </MBean>

    # Aggregate non-heap (metaspace, code cache, etc.) memory usage.
    <MBean "memory-nonheap">
      ObjectName "java.lang:type=Memory"
      InstancePrefix "memory-nonheap"
      <Value>
        Type "memory"
        Table true
        Attribute "NonHeapMemoryUsage"
      </Value>
    </MBean>

    # Current live thread count.
    <MBean "thread">
      ObjectName "java.lang:type=Threading"
      InstancePrefix "threading"
      <Value>
        Type "gauge"
        Table false
        Attribute "ThreadCount"
        InstancePrefix "count"
      </Value>
    </MBean>

    # Current live daemon thread count.
    <MBean "thread-daemon">
      ObjectName "java.lang:type=Threading"
      InstancePrefix "threading"
      <Value>
        Type "gauge"
        Table false
        Attribute "DaemonThreadCount"
        InstancePrefix "count-daemon"
      </Value>
    </MBean>

    # Connect to Logstash's JMX endpoint on localhost:8855 and collect all
    # of the MBean blocks defined above.
    <Connection>
      ServiceURL "service:jmx:rmi:///jndi/rmi://localhost:8855/jmxrmi"
      Collect "memory_pool"
      Collect "memory-heap"
      Collect "memory-nonheap"
      Collect "gc-count"
      Collect "gc-time"
      Collect "thread"
      Collect "thread-daemon"
    </Connection>
  </Plugin>
</Plugin>
-------------------------------------------------------------------------------- /8/collectd/rsyslogd.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin processes 2 | 3 | Process "rsyslogd" 4 | 5 | -------------------------------------------------------------------------------- /8/logstash/logstash.conf: -------------------------------------------------------------------------------- 1 | input { 2 | tcp { 3 | port => 5514 4 | type => syslog 5 | } 6 | tcp { 7 | port => 2003 8 | type => "riemann" 9 | codec => "json" 10 | } 11 | udp { 12 | port => 5514 13 | type => syslog 14 | } 15 | file { 16 | path => [ "/var/log/syslog", "/var/log/auth.log" ] 17 | type => "syslog" 18 | } 19 | } 20 | filter { 21 | if [type] == "syslog" { 22 | grok { 23 | match => { "message" => "(?:%{SYSLOGTIMESTAMP:syslog_timestamp}|%{TIMESTAMP_ISO8601:syslog_timestamp}) %{SYSLOGHOST:syslog_hostname} %{DATA:syslog_program}(?:\/%{DATA:container_name}\/%{DATA:container_id})?(?:\[%{POSINT:syslog_pid}\])?: %{GREEDYDATA:syslog_message}" } 24 | remove_field => ["message"] 25 | } 26 | syslog_pri { } 27 | date { 28 | match => [ "syslog_timestamp", "MMM d HH:mm:ss", "MMM dd HH:mm:ss", "ISO8601" ] 29 | } 30 | } 31 | } 32 | output { 33 | stdout { } 34 | elasticsearch { 35 | sniffing => true 36 | hosts => "esa1.example.com" 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /8/riemann/examplecom/etc/checks.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.checks 2 | (:require [riemann.config :refer :all] 3 | [clojure.tools.logging :refer :all] 4 | [riemann.streams :refer :all])) 5 | 6 | (defn set_state [warning critical] 7 | (fn [event] 8 | (assoc event :state 9 | (condp < (:metric event) 10 | critical "critical" 11 | warning "warning" 12 | "ok")))) 13 | 14 | (defn check_threshold [srv window func warning critical & children] 15 | (where (service srv) 16 
| (fixed-time-window window 17 | (smap func 18 | (where (< warning metric) 19 | (smap (set_state warning critical) 20 | (fn [event] 21 | (call-rescue event children)))))))) 22 | 23 | (defn check_percentiles [srv window & children] 24 | (where (service srv) 25 | (percentiles window [0.5 0.95 0.99 1] 26 | (fn [event] 27 | (call-rescue event children))))) 28 | -------------------------------------------------------------------------------- /8/riemann/examplecom/etc/collectd.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.collectd 2 | (:require [clojure.tools.logging :refer :all] 3 | [riemann.streams :refer :all] 4 | [clojure.string :as str] 5 | [clojure.walk :as walk])) 6 | 7 | (defn docker-attribute-map 8 | [attributes] 9 | (let [instance (str/split (str/replace attributes #"^.*\[(.*)\]$" "$1") #",")] 10 | (walk/keywordize-keys (into {} (for [pair instance] (apply hash-map (str/split pair #"="))))))) 11 | 12 | (defn docker-attributes 13 | [{:keys [plugin_instance] :as event}] 14 | (if-let [attributes (re-find #"^.*\[.*\]$" plugin_instance)] 15 | (merge event (docker-attribute-map attributes)) 16 | event)) 17 | 18 | (defn parse-docker-service-host 19 | [{:keys [type type_instance plugin_instance] :as event}] 20 | (let [host (re-find #"^\w+\.?\w+\.?\w+" (:plugin_instance event)) 21 | service (cond-> (str (:type event)) (:type_instance event) (str "." 
(:type_instance event)))] 22 | (assoc event :service service :host host))) 23 | 24 | (def default-services 25 | [{:service #"^load/load/(.*)$" :rewrite "load $1"} 26 | {:service #"^swap/percent-(.*)$" :rewrite "swap $1"} 27 | {:service #"^memory/percent-(.*)$" :rewrite "memory $1"} 28 | {:service #"^processes/ps_state-(.*)$" :rewrite "processes $1"} 29 | {:service #"^processes-(.*)/(.*)$" :rewrite "processes $1 $2"} 30 | {:service #"^cpu/percent-(.*)$" :rewrite "cpu $1"} 31 | {:service #"^df-(.*)/(df_complex|percent_bytes)-(.*)$" :rewrite "df $1 $2 $3"} 32 | {:service #"^interface-(.*)/if_(errors|packets|octets)/(tx|rx)$" :rewrite "nic $1 $3 $2"} 33 | {:service #"^protocols-(.*)/(.*)$" :rewrite "protocols $1 $2"} 34 | {:service #"^GenericJMX-(:?_|\/)?(.*)$" :rewrite "jmx $2"} 35 | {:service #"^haproxy\/(gauge|derive)-(.*)$" :rewrite "haproxy $2"} 36 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "$2"} 37 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "statsd $1 $2"} 38 | {:service #"^mysql-(.*)\/(counter|gauge)-(.*)$" :rewrite "mysql $1 $3"} 39 | {:service #"^dbi-(.*)\/(gauge|counter)-(.*)$" :rewrite "dbi $1 $3"} 40 | {:service #"^redis-(.*)$" :rewrite "redis $1"}]) 41 | 42 | (defn rewrite-service-with 43 | [rules] 44 | (let [matcher (fn [s1 s2] (if (string? s1) (= s1 s2) (re-find s1 s2)))] 45 | (fn [{:keys [service] :as event}] 46 | (or 47 | (first 48 | (for [{:keys [rewrite] :as rule} rules 49 | :when (matcher (:service rule) service)] 50 | (assoc event :service 51 | (if (string? 
(:service rule)) 52 | rewrite 53 | (str/replace service (:service rule) rewrite))))) 54 | event)))) 55 | 56 | (def rewrite-service 57 | (rewrite-service-with default-services)) 58 | -------------------------------------------------------------------------------- /8/riemann/examplecom/etc/email.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.email 2 | (:require [riemann.email :refer :all])) 3 | 4 | (def email (mailer {:from "riemann@example.com"})) 5 | -------------------------------------------------------------------------------- /8/riemann/examplecom/etc/graphite.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.graphite 2 | (:require [clojure.string :as str] 3 | [riemann.config :refer :all] 4 | [riemann.graphite :refer :all])) 5 | 6 | (defn add-environment-to-graphite [event] 7 | (condp = (:plugin event) 8 | "docker" 9 | (if (:com.example.application event) 10 | (str "productiona.docker.", (:com.example.application event), ".", (riemann.graphite/graphite-path-percentiles event)) 11 | (str "productiona.docker.", (riemann.graphite/graphite-path-percentiles event))) 12 | (str "productiona.hosts.", (riemann.graphite/graphite-path-percentiles event)))) 13 | 14 | (def graph (async-queue! :graphite {:queue-size 1000} 15 | (graphite {:host "graphitea" :path add-environment-to-graphite}))) 16 | -------------------------------------------------------------------------------- /8/riemann/examplecom/etc/logstash.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.logstash 2 | (:require [riemann.logstash :refer :all])) 3 | 4 | (def logstash (async-queue! 
:logstash {:queue-size 1000} 5 | (logstash {:host "logstash" :port 2003 :port-size 20}))) 6 | -------------------------------------------------------------------------------- /8/riemann/riemann.config: -------------------------------------------------------------------------------- 1 | (logging/init {:file "/var/log/riemann/riemann.log"}) 2 | 3 | (require 'riemann.client) 4 | (require '[examplecom.etc.email :refer :all]) 5 | (require '[examplecom.etc.graphite :refer :all]) 6 | (require '[examplecom.etc.collectd :refer :all]) 7 | 8 | (let [host "0.0.0.0"] 9 | (repl-server {:host "127.0.0.1"}) 10 | (tcp-server {:host host}) 11 | (udp-server {:host host}) 12 | (ws-server {:host host})) 13 | 14 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]}) 15 | 16 | (let [index (index) 17 | downstream (batch 100 1/10 18 | (async-queue! :agg { :queue-size 1e3 19 | :core-pool-size 4 20 | :max-pool-size 32} 21 | (forward 22 | (riemann.client/tcp-client :host "riemannmc"))))] 23 | 24 | ; Inbound events will be passed to these streams: 25 | (streams 26 | (default :ttl 60 27 | ; Index all events immediately. 28 | (where (not (tagged "notification")) 29 | index) 30 | 31 | (tagged "collectd" 32 | (where (not (= (:plugin event) "docker")) 33 | (smap rewrite-service graph)) 34 | 35 | (where (= (:plugin event) "docker") 36 | (smap (comp parse-docker-service-host docker-attributes rewrite-service) graph)) 37 | 38 | (tagged "notification" 39 | (changed-state {:init "ok"} 40 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count$" "$1"] 41 | (email "james@example.com")))) 42 | 43 | (where (and (expired? 
event) 44 | (service #"^processes-.+\/ps_count\/processes")) 45 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count\/processes$" "$1"] 46 | (email "james@example.com")))) 47 | 48 | (where (service #"^riemann.*") 49 | graph 50 | 51 | downstream)))) 52 | -------------------------------------------------------------------------------- /9/collectd/statsd.conf: -------------------------------------------------------------------------------- 1 | LoadPlugin statsd 2 | 3 | 4 | Host "localhost" 5 | Port "8125" 6 | TimerPercentile 90 7 | TimerPercentile 99 8 | TimerLower true 9 | TimerUpper true 10 | TimerSum true 11 | TimerCount true 12 | 13 | -------------------------------------------------------------------------------- /9/riemann/examplecom/etc/checks.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.checks 2 | (:require [riemann.config :refer :all] 3 | [clojure.tools.logging :refer :all] 4 | [riemann.streams :refer :all])) 5 | 6 | (defn set_state [warning critical] 7 | (fn [event] 8 | (assoc event :state 9 | (condp < (:metric event) 10 | critical "critical" 11 | warning "warning" 12 | "ok")))) 13 | 14 | (defn check_threshold [srv window func warning critical & children] 15 | (where (service srv) 16 | (fixed-time-window window 17 | (smap func 18 | (where (< warning metric) 19 | (smap (set_state warning critical) 20 | (fn [event] 21 | (call-rescue event children)))))))) 22 | 23 | (defn check_percentiles [srv window & children] 24 | (where (service srv) 25 | (percentiles window [0.5 0.95 0.99 1] 26 | (fn [event] 27 | (call-rescue event children))))) 28 | -------------------------------------------------------------------------------- /9/riemann/examplecom/etc/collectd.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.collectd 2 | (:require [clojure.tools.logging :refer :all] 3 | [riemann.streams :refer :all] 4 | [clojure.string :as str] 
5 | [clojure.walk :as walk])) 6 | 7 | (defn docker-attribute-map 8 | [attributes] 9 | (let [instance (str/split (str/replace attributes #"^.*\[(.*)\]$" "$1") #",")] 10 | (walk/keywordize-keys (into {} (for [pair instance] (apply hash-map (str/split pair #"="))))))) 11 | 12 | (defn docker-attributes 13 | [{:keys [plugin_instance] :as event}] 14 | (if-let [attributes (re-find #"^.*\[.*\]$" plugin_instance)] 15 | (merge event (docker-attribute-map attributes)) 16 | event)) 17 | 18 | (defn parse-docker-service-host 19 | [{:keys [type type_instance plugin_instance] :as event}] 20 | (let [host (re-find #"^\w+\.?\w+\.?\w+" (:plugin_instance event)) 21 | service (cond-> (str (:type event)) (:type_instance event) (str "." (:type_instance event)))] 22 | (assoc event :service service :host host))) 23 | 24 | (def default-services 25 | [{:service #"^load/load/(.*)$" :rewrite "load $1"} 26 | {:service #"^swap/percent-(.*)$" :rewrite "swap $1"} 27 | {:service #"^memory/percent-(.*)$" :rewrite "memory $1"} 28 | {:service #"^processes/ps_state-(.*)$" :rewrite "processes $1"} 29 | {:service #"^processes-(.*)/(.*)$" :rewrite "processes $1 $2"} 30 | {:service #"^cpu/percent-(.*)$" :rewrite "cpu $1"} 31 | {:service #"^df-(.*)/(df_complex|percent_bytes)-(.*)$" :rewrite "df $1 $2 $3"} 32 | {:service #"^interface-(.*)/if_(errors|packets|octets)/(tx|rx)$" :rewrite "nic $1 $3 $2"} 33 | {:service #"^protocols-(.*)/(.*)$" :rewrite "protocols $1 $2"} 34 | {:service #"^GenericJMX-(:?_|\/)?(.*)$" :rewrite "jmx $2"} 35 | {:service #"^haproxy\/(gauge|derive)-(.*)$" :rewrite "haproxy $2"} 36 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "$2"} 37 | {:service #"^statsd\/(gauge|derive|latency)-(.*)$" :rewrite "statsd $1 $2"} 38 | {:service #"^mysql-(.*)\/(counter|gauge)-(.*)$" :rewrite "mysql $1 $3"} 39 | {:service #"^dbi-(.*)\/(gauge|counter)-(.*)$" :rewrite "dbi $1 $3"} 40 | {:service #"^redis-(.*)$" :rewrite "redis $1"}]) 41 | 42 | (defn rewrite-service-with 43 | [rules] 44 | (let 
[matcher (fn [s1 s2] (if (string? s1) (= s1 s2) (re-find s1 s2)))] 45 | (fn [{:keys [service] :as event}] 46 | (or 47 | (first 48 | (for [{:keys [rewrite] :as rule} rules 49 | :when (matcher (:service rule) service)] 50 | (assoc event :service 51 | (if (string? (:service rule)) 52 | rewrite 53 | (str/replace service (:service rule) rewrite))))) 54 | event)))) 55 | 56 | (def rewrite-service 57 | (rewrite-service-with default-services)) 58 | -------------------------------------------------------------------------------- /9/riemann/examplecom/etc/email.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.email 2 | (:require [riemann.email :refer :all])) 3 | 4 | (def email (mailer {:from "riemann@example.com"})) 5 | -------------------------------------------------------------------------------- /9/riemann/examplecom/etc/graphite.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.graphite 2 | (:require [clojure.string :as str] 3 | [riemann.config :refer :all] 4 | [riemann.graphite :refer :all])) 5 | 6 | (defn graphite-path-statsd [event] 7 | (let [host (:host event) 8 | app (re-find #"^.*?\." (:service event)) 9 | service (str/replace-first (:service event) #"^.*?\." "") 10 | split-host (if host (str/split host #"\.") []) 11 | split-service (if service (str/split service #" ") [])] 12 | (str app, (str/join "." 
(concat (reverse split-host) split-service))))) 13 | 14 | (defn add-environment-to-graphite [event] 15 | (condp = (:plugin event) 16 | "docker" 17 | (if (:com.example.application event) 18 | (str "productiona.docker.", (:com.example.application event), ".", (riemann.graphite/graphite-path-percentiles event)) 19 | (str "productiona.docker.", (riemann.graphite/graphite-path-percentiles event))) 20 | "statsd" (str "productiona.", (graphite-path-statsd event)) 21 | (str "productiona.hosts.", (riemann.graphite/graphite-path-percentiles event)))) 22 | 23 | (def graph (async-queue! :graphite {:queue-size 1000} 24 | (graphite {:host "graphitea" :path add-environment-to-graphite}))) 25 | -------------------------------------------------------------------------------- /9/riemann/examplecom/etc/logstash.clj: -------------------------------------------------------------------------------- 1 | (ns examplecom.etc.logstash 2 | (:require [riemann.logstash :refer :all])) 3 | 4 | (def logstash (async-queue! :logstash {:queue-size 1000} 5 | (logstash {:host "logstash" :port 2003 :port-size 20}))) 6 | -------------------------------------------------------------------------------- /9/riemann/riemann.config: -------------------------------------------------------------------------------- 1 | (logging/init {:file "/var/log/riemann/riemann.log"}) 2 | 3 | (require 'riemann.client) 4 | (require '[examplecom.etc.email :refer :all]) 5 | (require '[examplecom.etc.graphite :refer :all]) 6 | (require '[examplecom.etc.collectd :refer :all]) 7 | 8 | (let [host "0.0.0.0"] 9 | (repl-server {:host "127.0.0.1"}) 10 | (tcp-server {:host host}) 11 | (udp-server {:host host}) 12 | (ws-server {:host host})) 13 | 14 | (periodically-expire 10 {:keep-keys [:host :service :tags, :state, :description, :metric]}) 15 | 16 | (let [index (index) 17 | downstream (batch 100 1/10 18 | (async-queue! 
:agg { :queue-size 1e3 19 | :core-pool-size 4 20 | :max-pool-size 32} 21 | (forward 22 | (riemann.client/tcp-client :host "riemannmc"))))] 23 | 24 | ; Inbound events will be passed to these streams: 25 | (streams 26 | (default :ttl 60 27 | ; Index all events immediately. 28 | (where (not (tagged "notification")) 29 | index) 30 | 31 | (tagged "collectd" 32 | (where (not (= (:plugin event) "docker")) 33 | (smap rewrite-service graph)) 34 | 35 | (where (= (:plugin event) "docker") 36 | (smap (comp parse-docker-service-host docker-attributes rewrite-service) graph)) 37 | 38 | (tagged "notification" 39 | (changed-state {:init "ok"} 40 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count$" "$1"] 41 | (email "james@example.com")))) 42 | 43 | (where (and (expired? event) 44 | (service #"^processes-.+\/ps_count\/processes")) 45 | (adjust [:service clojure.string/replace #"^processes-(.*)\/ps_count\/processes$" "$1"] 46 | (email "james@example.com")))) 47 | 48 | (where (service #"^riemann.*") 49 | graph 50 | 51 | downstream)))) 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Art of Monitoring code repository 2 | 3 | The source code to accompany the [The Art of 4 | Monitoring](http://artofmonitoring.com) book. 5 | 6 | Each directory contains the code for one or more chapters. 7 | --------------------------------------------------------------------------------