├── LICENSE ├── README.md ├── ansible.cfg ├── inventory ├── playbook.yml └── roles ├── alertmanager ├── files │ └── alertmanager.yml ├── handlers │ └── main.yml ├── tasks │ └── main.yml ├── templates │ └── init.service.j2 └── vars │ └── main.yml ├── prometheus ├── files │ └── alertrules.yml ├── handlers │ └── main.yml ├── tasks │ └── main.yml ├── templates │ ├── init.service.j2 │ └── prometheus.conf.j2 └── vars │ └── main.yml └── prometheus_node_exporter ├── tasks └── main.yml ├── templates └── init.service.j2 └── vars └── main.yml /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Mitesh Sharma 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prometheus with AlertManager using Ansible 2 | 3 | In this project, we configure Prometheus and node_exporter together with Alertmanager. We track the "up" state of each node_exporter instance; if an instance goes down, Prometheus triggers an alert to Alertmanager, which forwards it to Slack. 4 | 5 | ## Getting Started 6 | 7 | Step 1: Update the Slack webhook `slack_api_url` parameter in the Alertmanager config file roles/alertmanager/files/alertmanager.yml. 8 | 9 | Step 2: Update the IP addresses of the instances in the inventory file. 10 | 11 | Step 3: Run the Ansible command below to set up the Prometheus, node_exporter and Alertmanager services. 12 | 13 | Ansible command: ansible-playbook playbook.yml 14 | -------------------------------------------------------------------------------- /ansible.cfg: -------------------------------------------------------------------------------- 1 | [defaults] 2 | inventory = ./inventory 3 | remote_user = ec2-user -------------------------------------------------------------------------------- /inventory: -------------------------------------------------------------------------------- 1 | [prometheus] 2 | 18.222.106.129 3 | 4 | [node_exporter] 5 | 18.224.63.194 6 | 7 | [alertmanager] 8 | 18.216.247.122 -------------------------------------------------------------------------------- /playbook.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: alertmanager 3 | become: yes 4 | become_user: root 5 | become_method: sudo 6 | roles: 7 | - alertmanager 8 | 9 | - hosts: node_exporter 10 | become: yes 11 | become_user: root 12 | become_method: sudo 13 | roles: 14 | - prometheus_node_exporter 15 | 16 | - hosts: prometheus 17 | become: yes 18 | become_user: root 19 | become_method: sudo 20 | roles: 21 | - prometheus 
--------------------------------------------------------------------------------
/roles/alertmanager/files/alertmanager.yml:
--------------------------------------------------------------------------------
1 | # Alertmanager configuration: routes InstanceDown alerts to a Slack channel.
2 | global:
3 |   # Placeholder value; replace with the real Slack incoming-webhook URL.
4 |   slack_api_url: "https://hooks.slack.com/services/SlackWebhookAPI"
5 |
6 | route:
7 |   # Default receiver for the root route. Alertmanager refuses to load a
8 |   # configuration whose root route does not name one.
9 |   receiver: 'alert-team'
10 |   group_by: ['instance', 'severity']
11 |   group_wait: 30s
12 |   group_interval: 5m
13 |   repeat_interval: 3h
14 |   routes:
15 |     - match:
16 |         alertname: InstanceDown
17 |       receiver: 'alert-team'
18 |
19 | receivers:
20 |   - name: 'alert-team'
21 |     slack_configs:
22 |       - channel: "#webhook-test"
23 |         text: "summary: {{ .CommonAnnotations.summary }}\ndescription: {{ .CommonAnnotations.description }}"
--------------------------------------------------------------------------------
/roles/alertmanager/handlers/main.yml:
--------------------------------------------------------------------------------
1 | # Triggered via `notify: systemd_reload` after the unit file is rewritten.
2 | - name: Reload systemd
3 |   systemd:
4 |     daemon_reload: yes
5 |   listen: systemd_reload
--------------------------------------------------------------------------------
/roles/alertmanager/tasks/main.yml:
--------------------------------------------------------------------------------
1 | - name: Creating alertmanager user group
2 |   group: name="{{groupId}}"
3 |   become: true
4 |
5 | - name: Creating alertmanager user
6 |   user:
7 |     name: "{{userId}}"
8 |     group: "{{groupId}}"
9 |     system: yes
10 |     shell: "/sbin/nologin"  # service account, no interactive login
11 |     comment: "{{userId}} nologin User"
12 |     createhome: "no"
13 |     state: present
14 |
15 | - name: Download alertmanager
16 |   unarchive:
17 |     src: "https://github.com/prometheus/alertmanager/releases/download/v{{ version }}/alertmanager-{{ version }}.linux-amd64.tar.gz"
18 |     dest: /tmp/
19 |     remote_src: yes  # the managed host downloads and unpacks the archive
20 |
21 | - name: Copy alertmanager executable to bin
22 |   copy:
23 |     src: "/tmp/alertmanager-{{ version }}.linux-amd64/alertmanager"
24 |     dest: "/usr/local/bin/alertmanager"
25 |     owner: "{{userId}}"
26 |     group: "{{groupId}}"
27 |     remote_src: yes  # src is a path on the managed host, not the controller
28 |     mode: "0755"
29 
|
30 | - name: Delete alertmanager tmp folder
31 |   file:
32 |     path: '/tmp/alertmanager-{{ version }}.linux-amd64'
33 |     state: absent
34 |
35 | - name: Creates data directory
36 |   file:
37 |     path: "/data/alertmanager/"
38 |     state: directory
39 |     owner: "{{userId}}"
40 |     group: "{{groupId}}"
41 |     mode: "0755"
42 |
43 | - name: Creates config directory
44 |   file:
45 |     path: "/etc/alertmanager/"
46 |     state: directory
47 |     owner: "{{userId}}"
48 |     group: "{{groupId}}"
49 |     mode: "0755"
50 |
51 | # The result is registered so the service can be restarted when the
52 | # configuration actually changed.
53 | - name: Copy config file
54 |   copy:
55 |     src: "{{ role_path }}/files/alertmanager.yml"
56 |     dest: /etc/alertmanager/alertmanager.yml
57 |   register: alertmanager_config
58 |
59 | - name: Copy systemd init file
60 |   template:
61 |     src: init.service.j2
62 |     dest: /etc/systemd/system/alertmanager.service
63 |   register: alertmanager_unit
64 |   notify: systemd_reload
65 |
66 | # Handlers only run at the end of the play, so the unit is reloaded here
67 | # (daemon_reload) before it is started. A changed config or unit file
68 | # forces a restart; otherwise the service is only ensured to be running.
69 | - name: Start alertmanager service
70 |   systemd:
71 |     name: alertmanager
72 |     state: "{{ 'restarted' if (alertmanager_config.changed or alertmanager_unit.changed) else 'started' }}"
73 |     enabled: yes
74 |     daemon_reload: yes
75 |
76 | # The HTTP listener may not be up the instant systemd reports the unit as
77 | # started, so poll for a short while instead of failing on the first try.
78 | - name: Check if alertmanager is accessible
79 |   uri:
80 |     url: http://localhost:9093
81 |     method: GET
82 |     status_code: 200
83 |   register: alertmanager_probe
84 |   until: alertmanager_probe.status | default(0) == 200
85 |   retries: 10
86 |   delay: 3
--------------------------------------------------------------------------------
/roles/alertmanager/templates/init.service.j2:
--------------------------------------------------------------------------------
1 | # systemd unit for {{serviceName}}; values come from the role's vars/main.yml.
2 | [Unit]
3 | Description={{serviceName}}
4 | Wants=network-online.target
5 | After=network-online.target
6 |
7 | [Service]
8 | User={{ userId }}
9 | Group={{ groupId }}
10 | # Always restart after a 2 second pause; an interval of 0 turns off
11 | # systemd's start rate limiting so restarts are never refused.
12 | Restart=always
13 | RestartSec=2
14 | StartLimitInterval=0
15 | Type=simple
16 | ExecStart={{ exec_command }}
17 |
18 | [Install]
19 | WantedBy=multi-user.target
--------------------------------------------------------------------------------
/roles/alertmanager/vars/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | serviceName: "alertmanager"
3 | userId: "alertmanager"
4 | groupId: "alertmanager"
5 | exec_command: "/usr/local/bin/alertmanager --config.file=/etc/alertmanager/alertmanager.yml
--storage.path=/data/alertmanager"
6 | version: "0.15.3"
--------------------------------------------------------------------------------
/roles/prometheus/files/alertrules.yml:
--------------------------------------------------------------------------------
1 | # Prometheus alerting rules: fire when a scrape target has been down
2 | # (up == 0) for one minute.
3 | groups:
4 |   - name: alert.rules
5 |     rules:
6 |       - alert: InstanceDown
7 |         expr: up == 0
8 |         for: 1m
9 |         labels:
10 |           severity: "critical"
11 |         annotations:
12 |           summary: "Endpoint {{ $labels.instance }} down"
13 |           description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
--------------------------------------------------------------------------------
/roles/prometheus/handlers/main.yml:
--------------------------------------------------------------------------------
1 | # Handlers run in the order they are defined here, not in notify order, so
2 | # the systemd reload is listed first: a changed unit file is re-read
3 | # before the service is restarted.
4 | - name: Reload systemd
5 |   systemd:
6 |     daemon_reload: yes
7 |   listen: systemd_reload
8 |
9 | - name: Restart the Prometheus service
10 |   service:
11 |     name: prometheus
12 |     state: restarted
13 |   listen: event_restart_prometheus
--------------------------------------------------------------------------------
/roles/prometheus/tasks/main.yml:
--------------------------------------------------------------------------------
1 | - name: Creating prometheus user group
2 |   group: name="{{groupId}}"
3 |   become: true
4 |
5 | - name: Creating prometheus user
6 |   user:
7 |     name: "{{userId}}"
8 |     group: "{{groupId}}"
9 |     system: yes
10 |     shell: "/sbin/nologin"  # service account, no interactive login
11 |     comment: "{{userId}} nologin User"
12 |     createhome: "no"
13 |     state: present
14 |
15 | - name: Install prometheus
16 |   unarchive:
17 |     src: "https://github.com/prometheus/prometheus/releases/download/v{{ version }}/prometheus-{{ version }}.linux-amd64.tar.gz"
18 |     dest: /tmp/
19 |     remote_src: yes  # the managed host downloads and unpacks the archive
20 |
21 | - name: Copy prometheus file to bin
22 |   copy:
23 |     src: "/tmp/prometheus-{{ version }}.linux-amd64/prometheus"
24 |     dest: "/usr/local/bin/prometheus"
25 |     owner: "{{userId}}"
26 |     group: "{{groupId}}"
27 |     remote_src: yes  # src is a path on the managed host, not the controller
28 |     mode: "0755"
29 |
30 | - 
name: Delete prometheus tmp folder
31 |   file:
32 |     path: '/tmp/prometheus-{{ version }}.linux-amd64'
33 |     state: absent
34 |
35 | - name: Creates data directory
36 |   file:
37 |     path: "/data/prometheus/"
38 |     state: directory
39 |     owner: "{{userId}}"
40 |     group: "{{groupId}}"
41 |     mode: "0755"
42 |
43 | - name: Creates config directory
44 |   file:
45 |     path: "/etc/prometheus/"
46 |     state: directory
47 |     owner: "{{userId}}"
48 |     group: "{{groupId}}"
49 |     mode: "0755"
50 |
51 | # Changes to the config, the rules or the unit file notify the restart
52 | # handler; without it a re-run would leave the old settings active.
53 | - name: config file
54 |   template:
55 |     src: prometheus.conf.j2
56 |     dest: /etc/prometheus/prometheus.conf
57 |   notify: event_restart_prometheus
58 |
59 | - name: alert config file
60 |   copy:
61 |     src: "{{ role_path }}/files/alertrules.yml"
62 |     dest: /etc/prometheus/alert.rules.yml
63 |   notify: event_restart_prometheus
64 |
65 | - name: Copy systemd init file
66 |   template:
67 |     src: init.service.j2
68 |     dest: /etc/systemd/system/prometheus.service
69 |   notify:
70 |     - systemd_reload
71 |     - event_restart_prometheus
72 |
73 | # Handlers only run at the end of the play, so systemd is told to re-read
74 | # its unit files here, before the service is started.
75 | - name: Start prometheus service
76 |   systemd:
77 |     name: prometheus
78 |     state: started
79 |     enabled: yes
80 |     daemon_reload: yes
81 |
82 | # The HTTP listener may not be up the instant systemd reports the unit as
83 | # started, so poll for a short while instead of failing on the first try.
84 | - name: Check if prometheus is accessible
85 |   uri:
86 |     url: http://localhost:9090
87 |     method: GET
88 |     status_code: 200
89 |   register: prometheus_probe
90 |   until: prometheus_probe.status | default(0) == 200
91 |   retries: 10
92 |   delay: 3
93 |
--------------------------------------------------------------------------------
/roles/prometheus/templates/init.service.j2:
--------------------------------------------------------------------------------
1 | # systemd unit for {{serviceName}}; values come from the role's vars/main.yml.
2 | [Unit]
3 | Description={{serviceName}}
4 | Wants=network-online.target
5 | After=network-online.target
6 |
7 | [Service]
8 | User={{ userId }}
9 | Group={{ groupId }}
10 | # Always restart after a 2 second pause; an interval of 0 turns off
11 | # systemd's start rate limiting so restarts are never refused.
12 | Restart=always
13 | RestartSec=2
14 | StartLimitInterval=0
15 | Type=simple
16 | ExecStart={{ exec_command }}
17 |
18 | [Install]
19 | WantedBy=multi-user.target
--------------------------------------------------------------------------------
/roles/prometheus/templates/prometheus.conf.j2:
--------------------------------------------------------------------------------
1 | global:
2 |   scrape_interval: 15s
3 |
4 | scrape_configs:
5 |   - job_name: 'prometheus'
6 |     scrape_interval: 5s
7 | 
static_configs:
8 |       - targets: ['localhost:9090']
9 |   - job_name: 'node_exporter'
10 |     scrape_interval: 5s
11 |     static_configs:
12 |       # One target per host in the [node_exporter] inventory group. Written
13 |       # as an inline list so an empty group still renders valid YAML ([]),
14 |       # and a node_exporter sharing this host is scraped as well.
15 |       - targets: [{% for host in groups['node_exporter'] %}'{{ host }}:9100'{% if not loop.last %}, {% endif %}{% endfor %}]
16 |
17 | rule_files:
18 |   - alert.rules.yml
19 |
20 | alerting:
21 |   alertmanagers:
22 |     - static_configs:
23 |         # One target per host in the [alertmanager] inventory group,
24 |         # including an Alertmanager that shares this host.
25 |         - targets: [{% for host in groups['alertmanager'] %}'{{ host }}:9093'{% if not loop.last %}, {% endif %}{% endfor %}]
--------------------------------------------------------------------------------
/roles/prometheus/vars/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | serviceName: "prometheus"
3 | userId: "prometheus"
4 | groupId: "prometheus"
5 | exec_command: "/usr/local/bin/prometheus --config.file=/etc/prometheus/prometheus.conf --storage.tsdb.path=/data/prometheus --storage.tsdb.retention=2d"
6 | version: "2.3.2"
--------------------------------------------------------------------------------
/roles/prometheus_node_exporter/tasks/main.yml:
--------------------------------------------------------------------------------
1 | - name: Creating node_exporter user group
2 |   group: name="{{groupId}}"
3 |   become: true
4 |
5 | - name: Creating node_exporter user
6 |   user:
7 |     name: "{{userId}}"
8 |     group: "{{groupId}}"
9 |     system: yes
10 |     shell: "/sbin/nologin"  # service account, no interactive login
11 |     comment: "{{userId}} nologin User"
12 |     createhome: "no"
13 |     state: present
14 |
15 | - name: Install prometheus node exporter
16 |   unarchive:
17 |     src: "https://github.com/prometheus/node_exporter/releases/download/v{{ version }}/node_exporter-{{ version }}.linux-amd64.tar.gz"
18 |     dest: /tmp/
19 |     remote_src: yes  # the managed host downloads and unpacks the archive
20 |
21 | - name: Copy prometheus node exporter file to bin
22 |   copy:
23 |     src: "/tmp/node_exporter-{{ version }}.linux-amd64/node_exporter"
24 |     dest: "/usr/local/bin/node_exporter"
25 |     owner: 
"{{userId}}"
26 |     group: "{{groupId}}"
27 |     remote_src: yes
28 |     mode: "0755"
29 |
30 | - name: Delete node exporter tmp folder
31 |   file:
32 |     path: '/tmp/node_exporter-{{ version }}.linux-amd64'
33 |     state: absent
34 |
35 | # The result is registered so the service can be restarted when the unit
36 | # file actually changed (this role defines no handlers).
37 | - name: Copy systemd init file
38 |   template:
39 |     src: init.service.j2
40 |     dest: /etc/systemd/system/node_exporter.service
41 |   register: node_exporter_unit
42 |
43 | # daemon_reload makes systemd re-read the unit written above before the
44 | # service is started or restarted.
45 | - name: Start node_exporter service
46 |   systemd:
47 |     name: node_exporter
48 |     state: "{{ 'restarted' if node_exporter_unit.changed else 'started' }}"
49 |     enabled: yes
50 |     daemon_reload: yes
51 |
52 | # The HTTP listener may not be up the instant systemd reports the unit as
53 | # started, so poll for a short while instead of failing on the first try.
54 | - name: Check if node exporter emits metrics
55 |   uri:
56 |     url: http://127.0.0.1:9100/metrics
57 |     method: GET
58 |     status_code: 200
59 |   register: node_exporter_probe
60 |   until: node_exporter_probe.status | default(0) == 200
61 |   retries: 10
62 |   delay: 3
--------------------------------------------------------------------------------
/roles/prometheus_node_exporter/templates/init.service.j2:
--------------------------------------------------------------------------------
1 | # systemd unit for {{serviceName}}; values come from the role's vars/main.yml.
2 | [Unit]
3 | Description={{serviceName}}
4 | Wants=network-online.target
5 | After=network-online.target
6 |
7 | [Service]
8 | User={{ userId }}
9 | Group={{ groupId }}
10 | Restart=on-failure
11 | Type=simple
12 | ExecStart={{ exec_command }}
13 |
14 | [Install]
15 | WantedBy=multi-user.target
--------------------------------------------------------------------------------
/roles/prometheus_node_exporter/vars/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | serviceName: "node_exporter"
3 | userId: "node_exporter"
4 | groupId: "node_exporter"
5 | exec_command: "/usr/local/bin/node_exporter"
6 | # Quoted so the version is always read as a string, as in the other roles.
7 | version: "0.16.0"
--------------------------------------------------------------------------------