├── .gitignore ├── hosts ├── ansible.cfg ├── roles └── common │ ├── handlers │ └── main.yml │ ├── files │ ├── noroot.conf │ └── vimrc │ └── tasks │ └── main.yml ├── requirements.yml ├── group_vars └── all │ ├── grafana.yml │ ├── vars.yml │ ├── vault.yml │ └── prometheus.yml ├── playbook.yml └── am_templates └── my_telegram.tmpl /.gitignore: -------------------------------------------------------------------------------- 1 | .vault_pass 2 | *.swp 3 | -------------------------------------------------------------------------------- /hosts: -------------------------------------------------------------------------------- 1 | [monitoring] 2 | 95.217.22.53 3 | -------------------------------------------------------------------------------- /ansible.cfg: -------------------------------------------------------------------------------- 1 | [defaults] 2 | vault_password_file = .vault_pass 3 | inventory = hosts 4 | -------------------------------------------------------------------------------- /roles/common/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart sshd 3 | systemd: name=ssh state=restarted 4 | -------------------------------------------------------------------------------- /roles/common/files/noroot.conf: -------------------------------------------------------------------------------- 1 | PermitRootLogin no 2 | PasswordAuthentication no 3 | X11Forwarding no 4 | -------------------------------------------------------------------------------- /requirements.yml: -------------------------------------------------------------------------------- 1 | --- 2 | collections: 3 | - name: prometheus.prometheus 4 | type: galaxy 5 | version: "0.23.0" 6 | - name: grafana.grafana 7 | type: galaxy 8 | version: "5.7.0" 9 | 10 | roles: 11 | - name: caddy_ansible.caddy_ansible 12 | -------------------------------------------------------------------------------- /group_vars/all/grafana.yml: 
-------------------------------------------------------------------------------- 1 | --- 2 | grafana_ini: 3 | security: 4 | admin_user: "{{ v_grafana_admin_user }}" 5 | admin_password: "{{ v_grafana_admin_password }}" 6 | 7 | grafana_datasources: 8 | - name: Prometheus 9 | type: prometheus 10 | access: proxy 11 | url: "http://127.0.0.1:9090" 12 | isDefault: true 13 | 14 | grafana_dashboards: 15 | - dashboard_id: 7362 16 | revision_id: 5 17 | datasource: Prometheus 18 | - dashboard_id: 1860 19 | revision_id: 37 20 | datasource: Prometheus 21 | - dashboard_id: 9628 22 | revision_id: 8 23 | datasource: Prometheus 24 | -------------------------------------------------------------------------------- /playbook.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | gather_facts: no 4 | become: yes 5 | roles: 6 | - { role: common, tags: common } 7 | 8 | - hosts: all 9 | tasks: 10 | - name: Check that we're not a root user 11 | when: ansible_env.USER != admin 12 | fail: 13 | msg: Please restart the playbook with {{ admin }} user. 
14 | 15 | - hosts: all 16 | become: yes 17 | roles: 18 | - { role: caddy_ansible.caddy_ansible, tags: caddy } 19 | - { role: prometheus.prometheus.node_exporter, tags: exporter } 20 | - { role: prometheus.prometheus.prometheus, tags: prometheus } 21 | - { role: prometheus.prometheus.alertmanager, tags: alerts } 22 | - { role: grafana.grafana.grafana, tags: grafana } 23 | -------------------------------------------------------------------------------- /group_vars/all/vars.yml: -------------------------------------------------------------------------------- 1 | --- 2 | admin: zverik 3 | 4 | caddy_config: | 5 | monitor.izv.ee { 6 | reverse_proxy 127.0.0.1:3000 7 | } 8 | p.monitor.izv.ee { 9 | reverse_proxy 127.0.0.1:9090 10 | } 11 | a.monitor.izv.ee { 12 | reverse_proxy 127.0.0.1:9093 13 | } 14 | 15 | alertmanager_web_external_url: "https://a.monitor.izv.ee" 16 | alertmanager_receivers: 17 | - name: telegram 18 | telegram_configs: 19 | - bot_token: "{{ v_telegram_token }}" 20 | chat_id: -1002270253193 21 | message: !unsafe '{{ template "my.telegram.message" . 
}}' 22 | 23 | alertmanager_route: 24 | group_by: ['alertname', 'cluster', 'service'] 25 | group_wait: 30s 26 | group_interval: 5m 27 | repeat_interval: 3h 28 | receiver: telegram 29 | 30 | alertmanager_template_files: ["am_templates/*.tmpl"] 31 | -------------------------------------------------------------------------------- /am_templates/my_telegram.tmpl: -------------------------------------------------------------------------------- 1 | {{- define "my.severity.emoji" -}} 2 | {{- $priority := "" -}} 3 | {{- $severity := index .Labels "severity" -}} 4 | {{- if (eq $severity "critical") -}} 5 | {{- $priority = "🚨" -}} 6 | {{- else if (eq $severity "warning") -}} 7 | {{- $priority = "⚠️" -}} 8 | {{- else if (eq $severity "info") -}} 9 | {{- $priority = "ℹ️" -}} 10 | {{- end -}} 11 | {{- $priority -}} 12 | {{- end -}} 13 | 14 | {{- define "my.alert.desc" -}} 15 | {{- index .Annotations "summary" -}} 16 | {{- $desc := index .Annotations "description" -}} 17 | {{ if (ne $desc "") }} 18 | {{ $desc }}{{ end }} 19 | {{ .GeneratorURL }} 20 | {{ end }} 21 | 22 | {{- define "my.telegram.message" -}} 23 | {{- range .Alerts.Firing -}} 24 | {{- template "my.severity.emoji" . }} [FIRING] {{ template "my.alert.desc" . -}} 25 | {{- end -}} 26 | {{- range .Alerts.Resolved -}} 27 | {{ template "my.severity.emoji" . }} [Resolved] {{ template "my.alert.desc" . 
-}} 28 | {{- end -}} 29 | {{- end -}} 30 | -------------------------------------------------------------------------------- /group_vars/all/vault.yml: -------------------------------------------------------------------------------- 1 | $ANSIBLE_VAULT;1.1;AES256 2 | 65383262366461653064323738306664306132616231636439363261613232326530386232646265 3 | 6137393764333631656139653035323462393763633762390a643339346439363964633838643635 4 | 62343037653737356564613730313062343732326335343138653333623566356132366465633966 5 | 3736343338366364350a306438333031616365343534636338346366396564303066663635653131 6 | 38396234363937383032393732313933326565323234363138396634643161393139626465376632 7 | 65343537656236313035393330323637363461376431363562306433313634633736346330373132 8 | 37656539313765376262363934666231323866363138636237313962393337363838363630323636 9 | 32333436346363313436623934393261396465366231313531643934323062353766333566333564 10 | 39613865656162633331316238323532316465353432393434626564633635636265363830316563 11 | 34363236626461633665663036303539616266396466626637313964303234326165383062356233 12 | 39613832663435623739383932623163646664333131386131353135666233333936633561346237 13 | 39326133626535383564313638396136313761393134393737343733383133386336663836633763 14 | 63373265663731323066326338383564653235386131643439663262653333356365 15 | -------------------------------------------------------------------------------- /roles/common/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install python for Ansible 3 | raw: test -e /usr/bin/python3 || (apt -qy update && apt install -y python3) 4 | register: output 5 | changed_when: output.stdout 6 | 7 | - name: Ensure apt cache is up to date 8 | apt: update_cache=yes cache_valid_time=3600 upgrade=dist 9 | changed_when: False 10 | 11 | - name: install setfacl support 12 | apt: name=acl 13 | 14 | - name: Create {{ admin }} user 15 | user: 16 | name: "{{ 
admin }}" 17 | shell: /bin/bash 18 | 19 | - name: Add a ssh key to {{ admin }} 20 | authorized_key: 21 | user: "{{ admin }}" 22 | state: present 23 | key: ecdsa-sha2-nistp384 AAAAE2VjZHNhLXNoYTItbmlzdHAzODQAAAAIbmlzdHAzODQAAABhBP2SWls1C24jvQ9F3MVFQ2Lyl0zgWheBqellShhlF5d8RWP61/C8pr/ejiowEDxtTrvrd0iJum6amOvFp9I5eAXJGIoVxQiD0Y5ae+TaaVfsu/5XlwedQin31OmkdnRyrA== zverik@fedora 24 | 25 | - name: Add {{ admin }} to sudoers 26 | # Install the drop-in read-only and let visudo syntax-check it first, so a malformed entry can never break sudo on the host. 27 | copy: 28 | content: "{{ admin }} ALL=(ALL) NOPASSWD:ALL\n" 29 | dest: /etc/sudoers.d/admin 30 | mode: "0440" 31 | validate: /usr/sbin/visudo -cf %s 32 | 33 | - name: Remove ssh authorized_key 34 | file: 35 | path: /root/.ssh/authorized_keys 36 | state: absent 37 | 38 | - name: Forbid root ssh login 39 | copy: 40 | src: noroot.conf 41 | dest: /etc/ssh/sshd_config.d/ 42 | notify: restart sshd 43 | 44 | - name: Install useful packages 45 | apt: 46 | name: 47 | - tmux 48 | - htop 49 | - ncdu 50 | - vim 51 | state: present 52 | 53 | - name: Upload vimrc 54 | copy: 55 | src: vimrc 56 | dest: /home/{{ admin }}/.vimrc 57 | owner: "{{ admin }}" 58 | -------------------------------------------------------------------------------- /roles/common/files/vimrc: -------------------------------------------------------------------------------- 1 | " An example for a vimrc file. 2 | " 3 | " Maintainer: Bram Moolenaar 4 | " Last change: 2011 Apr 15 5 | " 6 | " To use it, copy it to 7 | " for Unix and OS/2: ~/.vimrc 8 | " for Amiga: s:.vimrc 9 | " for MS-DOS and Win32: $VIM\_vimrc 10 | " for OpenVMS: sys$login:.vimrc 11 | 12 | " Use Vim settings, rather than Vi settings (much better!). 13 | " This must be first, because it changes other options as a side effect. 
14 | set nocompatible 15 | 16 | " allow backspacing over everything in insert mode 17 | set backspace=indent,eol,start 18 | set softtabstop=2 19 | set shiftwidth=2 20 | set fileencodings=utf-8,cp1251,dos 21 | set expandtab 22 | " set noexpandtab 23 | set tabstop=2 24 | set fileformats+=dos 25 | set nohlsearch 26 | 27 | "syntax enable 28 | set background=dark 29 | "colorscheme solarized 30 | 31 | if &term =~ "xterm" 32 | set t_Co=256 33 | endif 34 | 35 | if has("vms") 36 | set nobackup " do not keep a backup file, use versions instead 37 | else 38 | set backup " keep a backup file 39 | set backupdir=~/.vim/backup,/tmp 40 | endif 41 | set history=50 " keep 50 lines of command line history 42 | set ruler " show the cursor position all the time 43 | set showcmd " display incomplete commands 44 | set incsearch " do incremental searching 45 | 46 | " For Win32 GUI: remove 't' flag from 'guioptions': no tearoff menu entries 47 | " let &guioptions = substitute(&guioptions, "t", "", "g") 48 | 49 | " Don't use Ex mode, use Q for formatting 50 | map Q gq 51 | 52 | " CTRL-U in insert mode deletes a lot. Use CTRL-G u to first break undo, 53 | " so that you can undo CTRL-U after inserting a line break. 54 | inoremap <C-U> <C-G>u<C-U> 55 | 56 | " In many terminal emulators the mouse works just fine, thus enable it. 57 | "if has('mouse') 58 | " set mouse=a 59 | "endif 60 | 61 | " Switch syntax highlighting on, when the terminal has colors 62 | " Also switch on highlighting the last used search pattern. 63 | if &t_Co > 2 || has("gui_running") 64 | syntax on 65 | set nohlsearch 66 | endif 67 | 68 | " Only do this part when compiled with support for autocommands. 69 | if has("autocmd") 70 | 71 | " Enable file type detection. 72 | " Use the default filetype settings, so that mail gets 'tw' set to 72, 73 | " 'cindent' is on in C files, etc. 74 | " Also load indent files, to automatically do language-dependent indenting. 
75 | filetype plugin indent on 76 | 77 | " Put these in an autocmd group, so that we can delete them easily. 78 | augroup vimrcEx 79 | au! 80 | 81 | " For all text files set 'textwidth' to 78 characters. 82 | "autocmd FileType text setlocal textwidth=78 83 | 84 | " When editing a file, always jump to the last known cursor position. 85 | " Don't do it when the position is invalid or when inside an event handler 86 | " (happens when dropping a file on gvim). 87 | " Also don't do it when the mark is in the first line, that is the default 88 | " position when opening a file. 89 | autocmd BufReadPost * 90 | \ if line("'\"") > 1 && line("'\"") <= line("$") | 91 | \ exe "normal! g`\"" | 92 | \ endif 93 | 94 | augroup END 95 | 96 | else 97 | 98 | set autoindent " always set autoindenting on 99 | 100 | endif " has("autocmd") 101 | 102 | " Convenient command to see the difference between the current buffer and the 103 | " file it was loaded from, thus the changes you made. 104 | " Only define it when not defined already. 
105 | if !exists(":DiffOrig") 106 | command DiffOrig vert new | set bt=nofile | r ++edit # | 0d_ | diffthis 107 | \ | wincmd p | diffthis 108 | endif 109 | 110 | au BufNewFile,BufRead *.py 111 | \ set tabstop=4 | 112 | \ set softtabstop=4 | 113 | \ set shiftwidth=4 | 114 | \ set expandtab | 115 | \ set autoindent | 116 | \ set fileformat=unix 117 | 118 | au BufNewFile,BufRead *.js,*.html,*.css 119 | \ set tabstop=2 | 120 | \ set softtabstop=2 | 121 | \ set shiftwidth=2 122 | 123 | set shiftwidth=2 124 | "set expandtab 125 | 126 | highlight BadWhitespace ctermbg=red guibg=darkred 127 | au BufRead,BufNewFile *.py,*.pyw,*.c,*.h match BadWhitespace /\s\+$/ 128 | -------------------------------------------------------------------------------- /group_vars/all/prometheus.yml: -------------------------------------------------------------------------------- 1 | --- 2 | prometheus_web_external_url: "https://p.monitor.izv.ee" 3 | prometheus_storage_retention: "180d" 4 | 5 | prometheus_global: 6 | scrape_interval: 10s 7 | scrape_timeout: 9s 8 | evaluation_interval: 15s 9 | 10 | prometheus_alertmanager_config: 11 | - scheme: http 12 | static_configs: 13 | - targets: 14 | - "127.0.0.1:9093" 15 | 16 | prometheus_scrape_configs: 17 | - job_name: prometheus 18 | static_configs: 19 | - targets: ['127.0.0.1:9090'] 20 | - job_name: izv_node 21 | static_configs: 22 | - targets: ['izv.ee:9100'] 23 | - job_name: izv_mysql 24 | static_configs: 25 | - targets: ['izv.ee:9104'] 26 | - job_name: tile_node 27 | static_configs: 28 | - targets: ['tile.osmz.ru:9100'] 29 | - job_name: tile_psql 30 | static_configs: 31 | - targets: ['tile.osmz.ru:9187'] 32 | 33 | prometheus_alert_rules: 34 | - alert: Watchdog 35 | expr: vector(1) 36 | for: 10m 37 | labels: 38 | severity: info 39 | annotations: 40 | summary: 'Ensure entire alerting pipeline is functional' 41 | - alert: InstanceDown 42 | expr: 'up == 0' 43 | for: 5m 44 | labels: 45 | severity: critical 46 | annotations: 47 | summary: '{% raw 
%}Instance {{ $labels.instance }} down{% endraw %}' 48 | - alert: MySQLDown 49 | expr: 'mysql_up == 0' 50 | for: 5m 51 | labels: 52 | severity: critical 53 | annotations: 54 | summary: '{% raw %}MySQL on instance {{ $labels.instance }} down{% endraw %}' 55 | - alert: PostgresDown 56 | expr: 'pg_up == 0' 57 | for: 5m 58 | labels: 59 | severity: critical 60 | annotations: 61 | summary: '{% raw %}PostgreSQL on instance {{ $labels.instance }} down{% endraw %}' 62 | - alert: RebootRequired 63 | expr: 'node_reboot_required > 0' 64 | labels: 65 | severity: warning 66 | annotations: 67 | description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}' 68 | - alert: NodeFilesystemAlmostOutOfSpace 69 | annotations: 70 | description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}' 71 | summary: 'Filesystem has less than 2% space left.' 72 | expr: "(\n node_filesystem_avail_bytes{fstype!=\"\"} / node_filesystem_size_bytes{fstype!=\"\"} * 100 < 2\nand\n node_filesystem_readonly{fstype!=\"\"} == 0\n)\n" 73 | for: 1h 74 | labels: 75 | severity: critical 76 | - alert: NodeFilesystemAlmostOutOfFiles 77 | annotations: 78 | description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}' 79 | summary: 'Filesystem has less than 3% inodes left.' 80 | expr: "(\n node_filesystem_files_free{fstype!=\"\"} / node_filesystem_files{fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{fstype!=\"\"} == 0\n)\n" 81 | for: 1h 82 | labels: 83 | severity: critical 84 | - alert: NodeNetworkReceiveErrs 85 | annotations: 86 | description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}' 87 | summary: 'Network interface is reporting many receive errors.' 
88 | expr: "increase(node_network_receive_errs_total[2m]) > 10\n" 89 | for: 1h 90 | labels: 91 | severity: warning 92 | - alert: NodeNetworkTransmitErrs 93 | annotations: 94 | description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}' 95 | summary: 'Network interface is reporting many transmit errors.' 96 | expr: "increase(node_network_transmit_errs_total[2m]) > 10\n" 97 | for: 1h 98 | labels: 99 | severity: warning 100 | - alert: NodeHighNumberConntrackEntriesUsed 101 | annotations: 102 | description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}' 103 | summary: 'Number of conntrack are getting close to the limit' 104 | expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n" 105 | labels: 106 | severity: warning 107 | - alert: NodeClockSkewDetected # detail text is stored under "description", the annotation the Telegram template renders; its threshold matches the 0.05s used in expr 108 | annotations: 109 | description: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.{% endraw %}' 110 | summary: 'Clock skew detected.' 111 | expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n" 112 | for: 10m 113 | labels: 114 | severity: warning 115 | - alert: NodeClockNotSynchronising # detail text is stored under "description", the annotation the Telegram template renders 116 | annotations: 117 | description: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}' 118 | summary: 'Clock not synchronising.' 119 | expr: "min_over_time(node_timex_sync_status[5m]) == 0\n" 120 | for: 10m 121 | labels: 122 | severity: warning 123 | --------------------------------------------------------------------------------