├── .gitignore ├── LICENSE ├── README.md ├── ansible.cfg ├── group_vars └── all.default ├── host_vars └── c1.default ├── hosts ├── roles ├── basic │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── aliases │ │ └── sshd_config │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── main.cf │ │ ├── ntp-cn.conf │ │ └── ntp.conf ├── cgroup │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── cgconfig.conf │ │ └── cgrules.conf.default │ └── tasks │ │ └── main.yml ├── drivers │ ├── README.md │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── elastalert │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ └── elastalert.conf │ ├── tasks │ │ └── main.yml │ └── templates │ │ └── config.yaml ├── elk │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── apache2.yml │ │ ├── apache2_pipeline.json │ │ ├── jvm.options │ │ ├── nginx.yml │ │ └── system.yml │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── 02-beats-input.conf │ │ ├── 30-elasticsearch-output.conf │ │ ├── elasticsearch.yml │ │ ├── filebeat.yml │ │ ├── kibana.conf │ │ └── kibana.yml ├── ganglia │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── cpu_stats.py │ │ ├── gpu.sh │ │ ├── netstats.py │ │ └── temg.sh │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── avail-monitor.sh │ │ ├── ganglia.conf │ │ ├── gmetad.conf │ │ ├── gmond-cn.conf │ │ └── gmond.conf ├── mpi │ ├── README.md │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── network │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── 20auto-upgrades │ │ └── sources.list │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── 60-config.yaml │ │ ├── 70-config.yaml │ │ ├── apt.conf │ │ ├── dnsmasq.conf │ │ ├── hosts │ │ ├── map.hosts │ │ └── proxy-set ├── python │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ └── pip.conf │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── home.pth │ │ └── spack.pth ├── restic │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ └── ignorefile │ └── tasks │ │ └── main.yml ├── slurm │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── access.conf │ │ ├── cgroup.conf │ │ ├── pam-common-session │ │ └── pam-sshd │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── gres.conf │ │ ├── slurm.conf │ │ ├── slurmdbd.conf │ │ └── smail.sh ├── spack │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── compilers.yaml │ │ ├── modules.yaml │ │ ├── packages.yaml │ │ ├── repo.yaml │ │ └── repos.yaml │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── pyinstall.sh │ │ └── spack-load ├── storage │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── exports │ │ └── tmpreaper.conf └── user │ ├── README.md │ ├── defaults │ └── main.yml │ ├── files │ ├── memory.conf │ └── nproc.conf │ └── tasks │ └── main.yml └── site.yml /.gitignore: -------------------------------------------------------------------------------- 1 | vars/ 2 | meta/ 3 | handlers/ 4 | tests/ 5 | *_vars/*.yml 6 | *.pub 7 | munge.key 8 | site.retry 9 | ._README.md 10 | *.fuse* 11 | cgrules.conf 12 | hosts_test 13 | site_test.yml 14 | roles/test/ 15 | master.json 16 | c1.json 17 | c4.json 18 | pass.yaml 19 | site_test.retry 20 | roles/elastalert/files/elastalert/rules/ 21 | elastic-certificates.p12 22 | elastic-stack-ca.p12 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 
MIT License
2 | 
3 | Copyright (c) 2019
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # HPC-BUILD-ANSIBLE-PLAYBOOKS
2 | 
3 | *Everyone can build their own supercomputer!*
4 | 
5 | ## Glossary
6 | 
7 | HPC: High-performance computing
8 | 
9 | Cluster: Many machines connected at the hardware level by a switch.
10 | 
11 | Node: A single machine.
12 | 
13 | Login node: A node that can be accessed from the outside network.
14 | 
15 | Master node: A node that runs central services for the cluster. In our setup, login and master are the same node, the [ln] group.
16 | 
17 | Compute nodes: Nodes for running jobs, the [cn] group.
18 | 
19 | GPU nodes: Nodes equipped with Nvidia GPUs, the [gn] group.
20 | 
21 | ## Usage
22 | 
23 | Uncomment the relevant roles you want to run in site.yml. You should review the README and tasks/main.yml carefully for each role you'd like to run.
24 | 
25 | ```bash
26 | $ cd 
27 | $ ansible-playbook -i hosts site.yml -Kvv ## enter the sudo user password at the next prompt
28 | ```
29 | 
30 | ### Some notes
31 | 
32 | * The playbooks should be located in a directory with permission 700, since they contain lots of secret information and normal users shouldn't be able to access them.
33 | 
34 | ### A possible approach to start from scratch
35 | 
36 | * Install the OS on the master node and directly `apt install ansible`. Then git clone this repo somewhere locally. Configure the inventory files and host_vars to include all possible machines with the knowledge of their MAC addresses.
37 | * Possibly do a first Ansible run on the master only, with a complete inventory file (to generate the complete hosts file and the MAC-IP binding DHCP service); the `network` and `basic` roles are suggested. Running `sudo ip addr add <ip>/<mask> dev <nic>` is recommended before applying the network role (see the sketch after this list).
38 | * Install the OS on the compute nodes, either by hand or by some bare-metal provisioning mechanism from the master. (Note that the playbooks here don't cover provisioning setups.)
39 | * Make sure the sudo user with uid 1000 is the same (name, password) on all machines, and ensure the ssh server is running with the admin's pubkey in authorized_keys on all machines.
40 | * Plug all machines together via the switch and run the Ansible playbooks from the beginning again on the master.
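For the first master-only run mentioned in the list above, the sequence could look roughly like the following sketch. It assumes the example values from `group_vars/all.default` (LAN IP 192.168.1.10/24 on NIC eno1) and that only the `network` and `basic` roles are left uncommented in site.yml; the repo path is a placeholder, so adjust everything to your own setup.

```bash
# Give the master a temporary LAN address before the network role takes over
# (192.168.1.10/24 and eno1 are the example values from group_vars/all.default)
sudo ip addr add 192.168.1.10/24 dev eno1

# Run the playbook against the master/login node only (the [ln] group)
cd /path/to/this/repo        # wherever you cloned the playbooks
ansible-playbook -i hosts site.yml --limit ln -Kvv
```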
41 | 
42 | ### Possible workflows beyond these playbooks
43 | 
44 | Ansible cannot do everything, and some flexible and risky jobs you may want to do by hand directly.
45 | 
46 | * Manage local hard disks if there are any: partition, format and mount them on the master node. If any local mounts are needed on compute nodes (which is rare for an HPC-style setup), you may want to add them to the `basic` role to keep things simple. This must be done before running the `basic` role, which makes the NFS mounts possible.
47 | * Initial disk quota configuration, if you want to limit users' disk usage on certain filesystems. This must be done before running the `user` role, so that newly created users are automatically limited by quota.
48 | * Manage slurm accounts, QOS and users with `sacctmgr`. This can only be done after the `slurm` role, when slurm is configured and running. Besides, it should be done before the `user` role, so that new users are automatically added to some account or limited by some QOS policy.
49 | * Install necessary external software for numerical computation; the common ones are Intel Parallel Studio, Mathematica and Matlab. Further manage and install packages with spack and the conda provided by Intel Parallel Studio.
50 | 
51 | ### Limitations
52 | 
53 | The ansible playbooks here are limited to a small cluster setup, with only one master/login node and possibly several dozen homogeneous compute nodes.
54 | 
55 | For a larger cluster setup, there should be more than one login node, and different master nodes may play different roles (some provide disk storage, some the slurm database, some the slurm controller, some backup…). Besides, at such scale the compute nodes are highly likely to be heterogeneous (some with big memory, some with GPU resources...), so more detailed setups and carefully designed slurm configurations are needed. It is in principle OK to generalize our playbooks for such large HPC clusters, but that takes more effort than directly applying the playbooks here.
56 | 
57 | ## Platform information
58 | 
59 | These ansible playbooks are not platform independent; instead, they are strongly tied to the Ubuntu 18.04 server distribution.
60 | 
61 | ### Suggestions on possible changes for different platforms
62 | 
63 | * For a different version of Ubuntu:
64 |     - Some apt package names and config paths might be different, especially the packages related to slurm.
65 |     - Netplan may not work in old versions of Ubuntu.
66 | 
67 | * For a totally different Linux distribution like CentOS:
68 |     - Apt should be replaced with yum or some other package manager.
69 |     - The names of many packages and services might change.
70 |     - Network setup might take a different approach.
71 | 
72 | * For an OS beyond Linux:
73 |     - You must be very experienced with this stuff.
I have no specific suggestion for you :)
74 | 
-------------------------------------------------------------------------------- /ansible.cfg: --------------------------------------------------------------------------------
1 | [defaults]
2 | inventory = ./hosts
3 | host_key_checking = False
4 | 
-------------------------------------------------------------------------------- /group_vars/all.default: --------------------------------------------------------------------------------
1 | ansible_python_interpreter: "/usr/bin/python3"
2 | timezone: "Asia/Shanghai"
3 | admin: ubuntu
4 | ## admin user account
5 | netmask: 255.255.255.0
6 | ## netmask for cluster LAN
7 | mask: 24
8 | ## corresponding netmask bits
9 | ip_range: 192.168.1.0
10 | ntp_server: ntp.tuna.tsinghua.edu.cn
11 | wan_ip: 10.0.0.10
12 | ## WAN ip for login node
13 | wan_gateway: 10.0.0.1
14 | wan_mask: 25
15 | ## netmask bits for WAN
16 | master_ip: 192.168.1.10
17 | ## LAN ip for master/login node
18 | master_name: master
19 | ## hostname of master node
20 | dhcp_start_ip: 192.168.1.40
21 | ## dhcp ip range start
22 | dhcp_end_ip: 192.168.1.127
23 | ## dhcp ip range end
24 | dns_server:
25 |   - 8.8.8.8
26 | ln_lan_nic: eno1
27 | ln_wan_nic: eno2
28 | cluster_domain: hpc.cluster
29 | cluster_name: hpc
30 | env_vars:
31 |   ## possible environment variables that you want to export for ansible roles
32 |   http_proxy: http://
33 |   https_proxy: http://
34 |   ftp_proxy: http://
35 | 
-------------------------------------------------------------------------------- /host_vars/c1.default: --------------------------------------------------------------------------------
1 | ip: 192.168.1.21
2 | mac: 00:00:00:00:00:00
3 | 
-------------------------------------------------------------------------------- /hosts: --------------------------------------------------------------------------------
1 | [cn]
2 | c[1:14]
3 | 
4 | [general]
5 | c[1:8]
6 | 
7 | [hyper]
8 | c[10:14]
9 | 
10 | [ln]
11 | master
12 | 
13 | [gn]
14 | master
15 | c9
16 | 
17 | [sn]
18 | master
19 | c8
20 | 
-------------------------------------------------------------------------------- /roles/basic/README.md: --------------------------------------------------------------------------------
1 | Basic
2 | =========
3 | 
4 | This role is designed to configure all the basic stuff after network settings, such as ntp, locale, timezone, ssh, mail config and some basic package installation.
5 | 
6 | Requirements
7 | ------------
8 | 
9 | This is usually the second role to run. After running the network and basic roles plus the following storage role, you would have a basic cluster infrastructure.
10 | 
11 | Role Variables
12 | --------------
13 | 
14 | `aptpacks` is the list of apt packages installed on all nodes.
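If you want extra packages on top of the role defaults, one option is to override the whole list at run time with `--extra-vars`. A sketch is below; `htop` is just an arbitrary example package, and since the JSON form replaces the list entirely, repeat the defaults you still want.

```bash
# Override aptpacks for a single run; the list below is the role default plus htop
ansible-playbook -i hosts site.yml -K \
  -e '{"aptpacks": ["tree", "ntp", "make", "cmake", "python", "gfortran", "unzip", "openjdk-8-jdk", "pandoc", "postfix", "htop"]}'
```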
15 | -------------------------------------------------------------------------------- /roles/basic/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for basic 3 | aptpacks: 4 | - tree 5 | - ntp 6 | - make 7 | - cmake 8 | - python 9 | - gfortran 10 | - unzip 11 | - openjdk-8-jdk 12 | - pandoc 13 | - postfix 14 | -------------------------------------------------------------------------------- /roles/basic/files/aliases: -------------------------------------------------------------------------------- 1 | # See man 5 aliases for format 2 | postmaster: root 3 | -------------------------------------------------------------------------------- /roles/basic/files/sshd_config: -------------------------------------------------------------------------------- 1 | # $OpenBSD: sshd_config,v 1.101 2017/03/14 07:19:07 djm Exp $ 2 | 3 | # This is the sshd server system-wide configuration file. See 4 | # sshd_config(5) for more information. 5 | 6 | # This sshd was compiled with PATH=/usr/bin:/bin:/usr/sbin:/sbin 7 | 8 | # The strategy used for options in the default sshd_config shipped with 9 | # OpenSSH is to specify options with their default value where 10 | # possible, but leave them commented. Uncommented options override the 11 | # default value. 12 | 13 | #Port 22 14 | #AddressFamily any 15 | #ListenAddress 0.0.0.0 16 | #ListenAddress :: 17 | 18 | #HostKey /etc/ssh/ssh_host_rsa_key 19 | #HostKey /etc/ssh/ssh_host_ecdsa_key 20 | #HostKey /etc/ssh/ssh_host_ed25519_key 21 | 22 | # Ciphers and keying 23 | #RekeyLimit default none 24 | 25 | # Logging 26 | #SyslogFacility AUTH 27 | #LogLevel INFO 28 | 29 | # Authentication: 30 | 31 | #LoginGraceTime 2m 32 | PermitRootLogin no 33 | StrictModes yes 34 | #MaxAuthTries 6 35 | #MaxSessions 10 36 | 37 | #PubkeyAuthentication yes 38 | 39 | # Expect .ssh/authorized_keys2 to be disregarded by default in future. 40 | #AuthorizedKeysFile .ssh/authorized_keys .ssh/authorized_keys2 41 | 42 | #AuthorizedPrincipalsFile none 43 | 44 | #AuthorizedKeysCommand none 45 | #AuthorizedKeysCommandUser nobody 46 | 47 | # For this to work you will also need host keys in /etc/ssh/ssh_known_hosts 48 | #HostbasedAuthentication no 49 | # Change to yes if you don't trust ~/.ssh/known_hosts for 50 | # HostbasedAuthentication 51 | #IgnoreUserKnownHosts no 52 | # Don't read the user's ~/.rhosts and ~/.shosts files 53 | #IgnoreRhosts yes 54 | 55 | # To disable tunneled clear text passwords, change to no here! 56 | #PasswordAuthentication yes 57 | PermitEmptyPasswords no 58 | 59 | # Change to yes to enable challenge-response passwords (beware issues with 60 | # some PAM modules and threads) 61 | ChallengeResponseAuthentication no 62 | 63 | # Kerberos options 64 | #KerberosAuthentication no 65 | #KerberosOrLocalPasswd yes 66 | #KerberosTicketCleanup yes 67 | #KerberosGetAFSToken no 68 | 69 | # GSSAPI options 70 | #GSSAPIAuthentication no 71 | #GSSAPICleanupCredentials yes 72 | #GSSAPIStrictAcceptorCheck yes 73 | #GSSAPIKeyExchange no 74 | 75 | # Set this to 'yes' to enable PAM authentication, account processing, 76 | # and session processing. If this is enabled, PAM authentication will 77 | # be allowed through the ChallengeResponseAuthentication and 78 | # PasswordAuthentication. Depending on your PAM configuration, 79 | # PAM authentication via ChallengeResponseAuthentication may bypass 80 | # the setting of "PermitRootLogin without-password". 
81 | # If you just want the PAM account and session checks to run without 82 | # PAM authentication, then enable this but set PasswordAuthentication 83 | # and ChallengeResponseAuthentication to 'no'. 84 | UsePAM yes 85 | 86 | #AllowAgentForwarding yes 87 | #AllowTcpForwarding yes 88 | #GatewayPorts no 89 | X11Forwarding yes 90 | #X11DisplayOffset 10 91 | #X11UseLocalhost yes 92 | #PermitTTY yes 93 | PrintMotd no 94 | PrintLastLog no 95 | #TCPKeepAlive yes 96 | #UseLogin no 97 | #PermitUserEnvironment no 98 | #Compression delayed 99 | #ClientAliveInterval 0 100 | #ClientAliveCountMax 3 101 | #UseDNS no 102 | #PidFile /var/run/sshd.pid 103 | #MaxStartups 10:30:100 104 | #PermitTunnel no 105 | #ChrootDirectory none 106 | #VersionAddendum none 107 | 108 | # no default banner path 109 | #Banner none 110 | 111 | # Allow client to pass locale environment variables 112 | AcceptEnv LANG LC_* 113 | 114 | # override default of no subsystems 115 | Subsystem sftp /usr/lib/openssh/sftp-server 116 | 117 | # Example of overriding settings on a per-user basis 118 | #Match User anoncvs 119 | # X11Forwarding no 120 | # AllowTcpForwarding no 121 | # PermitTTY no 122 | # ForceCommand cvs server 123 | PasswordAuthentication yes 124 | -------------------------------------------------------------------------------- /roles/basic/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for basic 3 | - name: ensure en locale 4 | become: yes 5 | locale_gen: 6 | name: en_US.UTF-8 7 | state: present 8 | - name: ensure timezone 9 | become: yes 10 | timezone: 11 | name: "{{ timezone }}" 12 | register: tz 13 | - name: sync the timezone of rsyslog 14 | become: yes 15 | when: tz.changed 16 | service: 17 | name: rsyslog 18 | state: restarted 19 | - name: install some apt packs on nodes 20 | become: yes 21 | apt: 22 | name: "{{ aptpacks }}" 23 | state: present 24 | update_cache: yes 25 | - name: ensure ntp sevice is started 26 | become: yes 27 | service: 28 | name: ntp 29 | state: started 30 | enabled: yes 31 | - name: update ntp config file on ln node 32 | become: yes 33 | template: 34 | src: ../templates/ntp.conf 35 | dest: /etc/ntp.conf 36 | owner: root 37 | backup: yes 38 | register: lnntp 39 | when: inventory_hostname in groups['ln'] 40 | - name: restart ntp service on lg node 41 | become: yes 42 | service: 43 | name: ntp 44 | state: restarted 45 | when: inventory_hostname in groups['ln'] and lnntp.changed 46 | - name: update ntp config file on cn nodes 47 | become: yes 48 | template: 49 | src: ntp-cn.conf 50 | dest: /etc/ntp.conf 51 | register: ntpconfig 52 | when: inventory_hostname in groups['cn'] 53 | - name: restart ntp service on cn nodes 54 | become: yes 55 | service: 56 | name: ntp 57 | state: restarted 58 | when: inventory_hostname in groups['cn'] and ntpconfig.changed 59 | - name: update ssh config in all nodes 60 | become: yes 61 | copy: 62 | src: sshd_config 63 | dest: /etc/ssh/sshd_config 64 | backup: yes 65 | register: sshdupdate 66 | - name: ensure ssh server is started 67 | become: yes 68 | service: 69 | name: sshd 70 | state: started 71 | - name: restart ssh 72 | become: yes 73 | service: 74 | name: sshd 75 | state: restarted 76 | when: sshdupdate.changed 77 | - name: remove welcome message 78 | become: yes 79 | file: 80 | path: /etc/update-motd.d/ 81 | mode: '0644' 82 | state: directory 83 | recurse: yes 84 | - name: postfix config 85 | become: yes 86 | template: 87 | src: main.cf 88 | dest: /etc/postfix/main.cf 89 | backup: yes 90 | 
register: postconf 91 | - name: postfix alias file 92 | copy: 93 | src: aliases 94 | dest: /etc/aliases 95 | become: yes 96 | register: alias 97 | - name: ensure postfix running 98 | service: 99 | name: postfix 100 | state: started 101 | enabled: yes 102 | become: yes 103 | - name: new aliases db 104 | become: yes 105 | command: "newaliases" 106 | when: alias.changed 107 | - name: reload postfix 108 | service: 109 | name: postfix 110 | state: reloaded 111 | become: yes 112 | when: postconf.changed 113 | - name: stop snapd 114 | become: yes 115 | service: 116 | name: snapd 117 | state: stopped 118 | enabled: no 119 | -------------------------------------------------------------------------------- /roles/basic/templates/main.cf: -------------------------------------------------------------------------------- 1 | # Debian specific: Specifying a file name will cause the first 2 | # line of that file to be used as the name. The Debian default 3 | # is /etc/mailname. 4 | 5 | smtpd_banner = $myhostname ESMTP $mail_name (Ubuntu) 6 | biff = no 7 | 8 | # appending .domain is the MUA's job. 9 | append_dot_mydomain = no 10 | 11 | # Uncomment the next line to generate "delayed mail" warnings 12 | #delay_warning_time = 4h 13 | 14 | readme_directory = no 15 | 16 | # See http://www.postfix.org/COMPATIBILITY_README.html -- default to 2 on 17 | # fresh installs. 18 | compatibility_level = 2 19 | 20 | # TLS parameters 21 | smtpd_tls_cert_file=/etc/ssl/certs/ssl-cert-snakeoil.pem 22 | smtpd_tls_key_file=/etc/ssl/private/ssl-cert-snakeoil.key 23 | smtpd_use_tls=yes 24 | smtpd_tls_session_cache_database = btree:${data_directory}/smtpd_scache 25 | smtp_tls_session_cache_database = btree:${data_directory}/smtp_scache 26 | 27 | # See /usr/share/doc/postfix/TLS_README.gz in the postfix-doc package for 28 | # information on enabling SSL in the smtp client. 29 | 30 | smtpd_relay_restrictions = permit_mynetworks permit_sasl_authenticated defer_unauth_destination 31 | myhostname = {{ inventory_hostname }}.localdomain 32 | alias_maps = hash:/etc/aliases 33 | alias_database = hash:/etc/aliases 34 | mydestination = {{ inventory_hostname }}.localdomain, {{ inventory_hostname }}, localhost.localdomain, , localhost 35 | relayhost = 36 | mynetworks = 127.0.0.0/8 [::ffff:127.0.0.0]/104 [::1]/128 37 | mailbox_size_limit = 0 38 | recipient_delimiter = + 39 | inet_interfaces = all 40 | inet_protocols = all 41 | myorigin = {{ inventory_hostname }}.localdomain 42 | -------------------------------------------------------------------------------- /roles/basic/templates/ntp-cn.conf: -------------------------------------------------------------------------------- 1 | # /etc/ntp.conf, configuration for ntpd; see ntp.conf(5) for help 2 | 3 | driftfile /var/lib/ntp/ntp.drift 4 | 5 | # Leap seconds definition provided by tzdata 6 | leapfile /usr/share/zoneinfo/leap-seconds.list 7 | 8 | # Enable this if you want statistics to be logged. 9 | #statsdir /var/log/ntpstats/ 10 | 11 | statistics loopstats peerstats clockstats 12 | filegen loopstats file loopstats type day enable 13 | filegen peerstats file peerstats type day enable 14 | filegen clockstats file clockstats type day enable 15 | 16 | # Specify one or more NTP servers. 17 | 18 | # Use servers from the NTP Pool Project. Approved by Ubuntu Technical Board 19 | # on 2011-02-08 (LP: #104525). See http://www.pool.ntp.org/join.html for 20 | # more information. 
21 | # pool 0.ubuntu.pool.ntp.org iburst 22 | # pool 1.ubuntu.pool.ntp.org iburst 23 | # pool 2.ubuntu.pool.ntp.org iburst 24 | # pool 3.ubuntu.pool.ntp.org iburst 25 | 26 | # Use Ubuntu's ntp server as a fallback. 27 | # pool ntp.ubuntu.com 28 | 29 | # Access control configuration; see /usr/share/doc/ntp-doc/html/accopt.html for 30 | # details. The web page 31 | # might also be helpful. 32 | # 33 | # Note that "restrict" applies to both servers and clients, so a configuration 34 | # that might be intended to block requests from certain clients could also end 35 | # up blocking replies from your own upstream servers. 36 | 37 | # By default, exchange time with everybody, but don't allow configuration. 38 | restrict -4 default kod notrap nomodify nopeer noquery limited 39 | restrict -6 default kod notrap nomodify nopeer noquery limited 40 | 41 | # Local users may interrogate the ntp server more closely. 42 | restrict 127.0.0.1 43 | restrict ::1 44 | 45 | # Needed for adding pool entries 46 | restrict source notrap nomodify noquery 47 | 48 | # Clients from this (example!) subnet have unlimited access, but only if 49 | # cryptographically authenticated. 50 | #restrict 192.168.123.0 mask 255.255.255.0 notrust 51 | 52 | 53 | # If you want to provide time to your local subnet, change the next line. 54 | # (Again, the address is an example only.) 55 | #broadcast 192.168.123.255 56 | 57 | # If you want to listen to time broadcasts on your local subnet, de-comment the 58 | # next lines. Please do this only if you trust everybody on the network! 59 | #disable auth 60 | #broadcastclient 61 | 62 | #Changes recquired to use pps synchonisation as explained in documentation: 63 | #http://www.ntp.org/ntpfaq/NTP-s-config-adv.htm#AEN3918 64 | 65 | #server 127.127.8.1 mode 135 prefer # Meinberg GPS167 with PPS 66 | #fudge 127.127.8.1 time1 0.0042 # relative to PPS for my hardware 67 | 68 | #server 127.127.22.1 # ATOM(PPS) 69 | #fudge 127.127.22.1 flag3 1 # enable PPS API 70 | server {{ master_name }} prefer 71 | -------------------------------------------------------------------------------- /roles/basic/templates/ntp.conf: -------------------------------------------------------------------------------- 1 | # /etc/ntp.conf, configuration for ntpd; see ntp.conf(5) for help 2 | 3 | driftfile /var/lib/ntp/ntp.drift 4 | 5 | # Leap seconds definition provided by tzdata 6 | leapfile /usr/share/zoneinfo/leap-seconds.list 7 | 8 | # Enable this if you want statistics to be logged. 9 | #statsdir /var/log/ntpstats/ 10 | 11 | # logconfig =syncstatus +sysstatus 12 | 13 | statistics loopstats peerstats clockstats 14 | filegen loopstats file loopstats type day enable 15 | filegen peerstats file peerstats type day enable 16 | filegen clockstats file clockstats type day enable 17 | 18 | # Specify one or more NTP servers. 19 | 20 | # Use servers from the NTP Pool Project. Approved by Ubuntu Technical Board 21 | # on 2011-02-08 (LP: #104525). See http://www.pool.ntp.org/join.html for 22 | # more information. 23 | # pool 0.ubuntu.pool.ntp.org iburst 24 | # pool 1.ubuntu.pool.ntp.org iburst 25 | # pool 2.ubuntu.pool.ntp.org iburst 26 | # pool 3.ubuntu.pool.ntp.org iburst 27 | 28 | # Use Ubuntu's ntp server as a fallback. 29 | # pool ntp.ubuntu.com 30 | 31 | # Access control configuration; see /usr/share/doc/ntp-doc/html/accopt.html for 32 | # details. The web page 33 | # might also be helpful. 
34 | # 35 | # Note that "restrict" applies to both servers and clients, so a configuration 36 | # that might be intended to block requests from certain clients could also end 37 | # up blocking replies from your own upstream servers. 38 | 39 | # By default, exchange time with everybody, but don't allow configuration. 40 | restrict -4 default kod notrap nomodify nopeer noquery limited 41 | restrict -6 default kod notrap nomodify nopeer noquery limited 42 | 43 | # Local users may interrogate the ntp server more closely. 44 | restrict 127.0.0.1 45 | restrict ::1 46 | 47 | # Needed for adding pool entries 48 | restrict source notrap nomodify noquery 49 | 50 | # Clients from this (example!) subnet have unlimited access, but only if 51 | # cryptographically authenticated. 52 | #restrict 192.168.123.0 mask 255.255.255.0 notrust 53 | 54 | restrict {{ ntp_server }} 55 | restrict {{ ip_range }} mask {{netmask}} nomodify 56 | 57 | # If you want to provide time to your local subnet, change the next line. 58 | # (Again, the address is an example only.) 59 | #broadcast 192.168.123.255 60 | 61 | # If you want to listen to time broadcasts on your local subnet, de-comment the 62 | # next lines. Please do this only if you trust everybody on the network! 63 | #disable auth 64 | #broadcastclient 65 | 66 | #Changes recquired to use pps synchonisation as explained in documentation: 67 | #http://www.ntp.org/ntpfaq/NTP-s-config-adv.htm#AEN3918 68 | 69 | server {{ ntp_server}} prefer 70 | 71 | #server 127.127.8.1 mode 135 prefer # Meinberg GPS167 with PPS 72 | #fudge 127.127.8.1 time1 0.0042 # relative to PPS for my hardware 73 | 74 | #server 127.127.22.1 # ATOM(PPS) 75 | #fudge 127.127.22.1 flag3 1 # enable PPS API 76 | -------------------------------------------------------------------------------- /roles/cgroup/README.md: -------------------------------------------------------------------------------- 1 | CGroup 2 | ========= 3 | 4 | This role is designed to manage cgroup and resource limit by user or app basis. 5 | 6 | Requirements 7 | ------------ 8 | 9 | You must have these user created defined in `cgrules.conf`. For example, ELK stack users restriction must be added after install of ELK stack. 10 | 11 | Templates and Files 12 | -------------- 13 | 14 | cgrules.conf and cgconfig.conf in files dir are very **specific**. You may want to change them based on your needs and your hardware specs. We only provide an example file named after `cgrules.conf.default`, you should rename the file without `.default` before applying the role. 15 | 16 | Distribution related 17 | --------------- 18 | 19 | It is worth noting, the cgroup auto classify system is very different in CentOS. So be careful on this role, if your distribution is not Ubuntu. 
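After the role has run, a quick sanity check is to confirm that the hierarchy from `cgconfig.conf` exists and that `cgrulesengd` is classifying processes of the restricted users. A sketch follows; `test` is the example user from `cgrules.conf.default`, and the group names come from the example `cgconfig.conf`.

```bash
# The groups defined in /etc/cgconfig.conf should show up in the cgroup tree
lscgroup | grep -E 'service|userhard|usersoft'

# The rules daemon started by the role should be running
pgrep -a cgrulesengd

# A process owned by the restricted user "test" should land in usersoft/
cat /proc/$(pgrep -n -u test)/cgroup
```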
20 | -------------------------------------------------------------------------------- /roles/cgroup/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for cgroup -------------------------------------------------------------------------------- /roles/cgroup/files/cgconfig.conf: -------------------------------------------------------------------------------- 1 | group service { 2 | cpuset { 3 | cpuset.cpus="0-13"; 4 | cpuset.mems=0; 5 | } 6 | } 7 | group userhard { 8 | cpuset { 9 | cpuset.cpus="14-27,42-55"; 10 | cpuset.mems=1; 11 | } 12 | } 13 | group usersoft { 14 | cpu { 15 | cpu.shares=500; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /roles/cgroup/files/cgrules.conf.default: -------------------------------------------------------------------------------- 1 | elasticsearch cpuset service/ 2 | kibana cpuset service/ 3 | logstash cpuset service/ 4 | test cpuset usersoft/ 5 | -------------------------------------------------------------------------------- /roles/cgroup/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for cgroup 3 | - name: install cgroup tool 4 | become: yes 5 | apt: 6 | name: cgroup-tools 7 | state: present 8 | when: inventory_hostname in groups['ln'] 9 | - name: copy cgconfig 10 | become: yes 11 | copy: 12 | src: cgconfig.conf 13 | dest: /etc/cgconfig.conf 14 | when: inventory_hostname in groups['ln'] 15 | register: cgconf 16 | - name: update cgroup fs 17 | become: yes 18 | when: inventory_hostname in groups['ln'] and cgconf.changed 19 | command: "/usr/sbin/cgconfigparser -l /etc/cgconfig.conf" 20 | - name: copy cgrules 21 | become: yes 22 | copy: 23 | src: cgrules.conf 24 | dest: /etc/cgrules.conf 25 | when: inventory_hostname in groups['ln'] 26 | register: cgruleconf 27 | - name: restart cgd 28 | become: yes 29 | # command: "kill `ps aux|grep cgrulesengd|head -n 1|awk '{print $2}' && /usr/sbin/cgrulesengd" 30 | shell: "/usr/sbin/cgrulesengd" 31 | when: inventory_hostname in groups['ln'] and cgruleconf.changed 32 | -------------------------------------------------------------------------------- /roles/drivers/README.md: -------------------------------------------------------------------------------- 1 | Drivers 2 | ========= 3 | 4 | Install Nvidia drivers on nodes in [gn] group. 5 | 6 | Role Variables 7 | -------------- 8 | 9 | See defaults/main.yml. One should sepcify the driver versions, and it can be done in a more finer way, such as `driver_name: "nvidia-driver-418=418.56-0ubuntu0~gpu18.04.1"`. 10 | 11 | Notes 12 | -------------- 13 | 14 | Before running this role, you mat need to run `sudo apt-get purge nvidia*` on [gn] to ensure the preinstalled drivers deleted. 15 | 16 | After the installation of GPU drivers, a reboot is necessary. The reboot thing is not controlled by the role, so reboot the machines by hand. 
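Since the role deliberately does not reboot anything, an ad-hoc Ansible run is one convenient way to bounce the GPU nodes and verify the driver afterwards. This is only a sketch: the `reboot` module requires Ansible 2.7+, so on older versions fall back to something like `-m shell -a reboot`.

```bash
# Reboot every node in the [gn] group (prompts for the sudo password)
ansible gn -i hosts -b -K -m reboot

# Once they are back, confirm the driver is loaded
ansible gn -i hosts -m command -a nvidia-smi
```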
17 | -------------------------------------------------------------------------------- /roles/drivers/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for drivers 3 | driver_name: "nvidia-driver-430" 4 | -------------------------------------------------------------------------------- /roles/drivers/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for drivers 3 | - name: install python3 apt for apt repo task 4 | become: yes 5 | apt: 6 | name: python3-apt 7 | state: present 8 | - name: add nvidia repo 9 | become: yes 10 | environment: "{{ env_vars }}" 11 | apt_repository: 12 | repo: "ppa:graphics-drivers/ppa" 13 | state: present 14 | update_cache: yes 15 | when: inventory_hostname in groups['gn'] 16 | - name: install gpu driver 17 | become: yes 18 | apt: 19 | name: "{{ driver_name }}" 20 | state: present 21 | when: inventory_hostname in groups['gn'] 22 | -------------------------------------------------------------------------------- /roles/elastalert/README.md: -------------------------------------------------------------------------------- 1 | Elastalert 2 | ========= 3 | 4 | This role is designed to integrate elastalert by Yelp into ELK stacks. 5 | 6 | Requirements 7 | ------------ 8 | 9 | You must run `elk` role first to set up the ELK stack with elasticsearch database. 10 | Also, you should setup index on elasticsearch by `elastalert-create-index`. 11 | 12 | Templates and Files 13 | -------------- 14 | Files in elastalert/rules should be added by hands, please refer to the doc of elastalert on how to write yaml files for alert rules. 15 | -------------------------------------------------------------------------------- /roles/elastalert/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for elastalert 3 | es_user: elastic 4 | es_pass: 123456notgood 5 | -------------------------------------------------------------------------------- /roles/elastalert/files/elastalert.conf: -------------------------------------------------------------------------------- 1 | [program:elastalert] 2 | command=/usr/bin/elastalert --config /etc/elastalert/config.yaml --verbose 3 | process_name=elastalert 4 | autostart=true 5 | autorestart=true 6 | startsecs=15 7 | stopsignal=INT 8 | stopasgroup=true 9 | killasgroup=true 10 | stderr_logfile=/var/log/elastalert_stderr.log 11 | stderr_logfile_maxbytes=5MB 12 | 13 | -------------------------------------------------------------------------------- /roles/elastalert/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for elastalert 3 | - name: install some necessary apt packages 4 | become: yes 5 | apt: 6 | name: "{{ item }}" 7 | state: present 8 | update_cache: yes 9 | when: inventory_hostname in groups['ln'] 10 | with_items: 11 | - supervisor 12 | - elastalert 13 | - name: ensure supervisord is started 14 | become: yes 15 | service: 16 | name: supervisor 17 | state: started 18 | when: inventory_hostname in groups['ln'] 19 | - name: move elastalert conf to supervisord 20 | become: yes 21 | copy: 22 | src: elastalert.conf 23 | dest: /etc/supervisor/conf.d/elastalert.conf 24 | when: inventory_hostname in groups['ln'] 25 | register: supconf 26 | - name: reload supervisord if conf changed 27 | become: yes 28 | service: 29 | name: supervisor 30 | state: reloaded 31 | when: 
inventory_hostname in groups['ln'] and supconf.changed 32 | - name: create config directory 33 | become: yes 34 | file: 35 | state: directory 36 | path: /etc/elastalert 37 | mode: '700' 38 | when: inventory_hostname in groups['ln'] 39 | - name: copy elastalert configs to etc 40 | become: yes 41 | copy: 42 | src: elastalert/ 43 | dest: /etc/elastalert/ 44 | when: inventory_hostname in groups['ln'] 45 | register: eaconf 46 | - name: render main config to etc 47 | become: yes 48 | template: 49 | src: config.yaml 50 | dest: /etc/elastalert/config.yaml 51 | when: inventory_hostname in groups['ln'] 52 | register: eaconf2 53 | - name: supervisor start elastalert 54 | become: yes 55 | supervisorctl: 56 | name: elastalert 57 | state: started 58 | when: inventory_hostname in groups['ln'] 59 | - name: restart elastalert if conf changed 60 | become: yes 61 | supervisorctl: 62 | name: elastalert 63 | state: restarted 64 | when: inventory_hostname in groups['ln'] and (eaconf.changed or eaconf2.changed) 65 | -------------------------------------------------------------------------------- /roles/elastalert/templates/config.yaml: -------------------------------------------------------------------------------- 1 | # This is the folder that contains the rule yaml files 2 | # Any .yaml file will be loaded as a rule 3 | rules_folder: "/etc/elastalert/rules" 4 | 5 | # How often ElastAlert will query Elasticsearch 6 | # The unit can be anything from weeks to seconds 7 | run_every: 8 | minutes: 10 9 | 10 | # ElastAlert will buffer results from the most recent 11 | # period of time, in case some log sources are not in real time 12 | buffer_time: 13 | minutes: 15 14 | 15 | # The Elasticsearch hostname for metadata writeback 16 | # Note that every rule can have its own Elasticsearch host 17 | es_host: "{{ es_host }}" 18 | 19 | # The Elasticsearch port 20 | es_port: 9200 21 | 22 | es_username: "{{ es_user }}" 23 | 24 | es_password: "{{ es_pass }}" 25 | 26 | # The AWS region to use. Set this when using AWS-managed elasticsearch 27 | #aws_region: us-east-1 28 | 29 | # The AWS profile to use. Use this if you are using an aws-cli profile. 30 | # See http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html 31 | # for details 32 | #profile: test 33 | 34 | # Optional URL prefix for Elasticsearch 35 | #es_url_prefix: elasticsearch 36 | 37 | # Connect with TLS to Elasticsearch 38 | #use_ssl: True 39 | 40 | # Verify TLS certificates 41 | #verify_certs: True 42 | 43 | # GET request with body is the default option for Elasticsearch. 44 | # If it fails for some reason, you can pass 'GET', 'POST' or 'source'. 
45 | # See http://elasticsearch-py.readthedocs.io/en/master/connection.html?highlight=send_get_body_as#transport 46 | # for details 47 | #es_send_get_body_as: GET 48 | 49 | # Option basic-auth username and password for Elasticsearch 50 | #es_username: someusername 51 | #es_password: somepassword 52 | 53 | # Use SSL authentication with client certificates client_cert must be 54 | # a pem file containing both cert and key for client 55 | #verify_certs: True 56 | #ca_certs: /path/to/cacert.pem 57 | #client_cert: /path/to/client_cert.pem 58 | #client_key: /path/to/client_key.key 59 | 60 | # The index on es_host which is used for metadata storage 61 | # This can be a unmapped index, but it is recommended that you run 62 | # elastalert-create-index to set a mapping 63 | writeback_index: elastalert_status 64 | 65 | # If an alert fails for some reason, ElastAlert will retry 66 | # sending the alert until this time period has elapsed 67 | alert_time_limit: 68 | days: 2 69 | 70 | # Custom logging configuration 71 | # If you want to setup your own logging configuration to log into 72 | # files as well or to Logstash and/or modify log levels, use 73 | # the configuration below and adjust to your needs. 74 | # Note: if you run ElastAlert with --verbose/--debug, the log level of 75 | # the "elastalert" logger is changed to INFO, if not already INFO/DEBUG. 76 | #logging: 77 | # version: 1 78 | # incremental: false 79 | # disable_existing_loggers: false 80 | # formatters: 81 | # logline: 82 | # format: '%(asctime)s %(levelname)+8s %(name)+20s %(message)s' 83 | # 84 | # handlers: 85 | # console: 86 | # class: logging.StreamHandler 87 | # formatter: logline 88 | # level: DEBUG 89 | # stream: ext://sys.stderr 90 | # 91 | # file: 92 | # class : logging.FileHandler 93 | # formatter: logline 94 | # level: DEBUG 95 | # filename: elastalert.log 96 | # 97 | # loggers: 98 | # elastalert: 99 | # level: WARN 100 | # handlers: [] 101 | # propagate: true 102 | # 103 | # elasticsearch: 104 | # level: WARN 105 | # handlers: [] 106 | # propagate: true 107 | # 108 | # elasticsearch.trace: 109 | # level: WARN 110 | # handlers: [] 111 | # propagate: true 112 | # 113 | # '': # root logger 114 | # level: WARN 115 | # handlers: 116 | # - console 117 | # - file 118 | # propagate: false 119 | -------------------------------------------------------------------------------- /roles/elk/README.md: -------------------------------------------------------------------------------- 1 | ELK 2 | ========= 3 | 4 | This role is designed to configure a minimal ELK (elasticsearch+logstash+kibana+filebeat) stack for logging system. 5 | It also enables the user authetication of elastisearch. 6 | 7 | Requirements 8 | ------------ 9 | 10 | Java 8 should be installed, which is done by `basic` role. 11 | 12 | python3-passlib should be installed to confige http authetication, which is done by `ganglia` role 13 | 14 | Role Variables 15 | -------------- 16 | 17 | See defaults/main.yml. It is worth noting, when running the role at the first time to configure the whole stack, you should run it with filebeat_init as no and as yes each once. After the first run with filebeat_init as no, you can return to command line set es password by `sudo /usr/share/elasticsearch/bin/elasticsearch-setup-passwords interactive`. After this, run the role with filebeat_init as yes to finish the initial configurations. Afterward, you should keep filebeat_init to no, unless you want to reconfigure modules in filebeats. 
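Concretely, the first-time sequence could look like the following sketch. It assumes the `elk` role is uncommented in site.yml and uses `--extra-vars` to flip `filebeat_init` for the second pass.

```bash
# First pass: install and configure the stack (filebeat_init defaults to no)
ansible-playbook -i hosts site.yml -K

# Set the built-in elasticsearch passwords interactively (on the first es host)
sudo /usr/share/elasticsearch/bin/elasticsearch-setup-passwords interactive

# Second pass: enable filebeat modules and load the ingest pipelines
ansible-playbook -i hosts site.yml -K -e '{"filebeat_init": true}'
```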
18 | 19 | Templates and Files 20 | -------------- 21 | 22 | Notes 23 | -------------- 24 | 25 | The initial configuration is in general referenced on [this post](https://www.digitalocean.com/community/tutorials/how-to-install-elasticsearch-logstash-and-kibana-elastic-stack-on-ubuntu-18-04#step-3-%E2%80%94-installing-and-configuring-logstash) with generalization to multiple distributed filebeats. 26 | 27 | We further add multiple features from the minimal infrastructure: user authetication, multiple modules from filebeat, correct timestamps and no filters in logstash. 28 | 29 | Also note nginx http authentication might be conflict with kibana intrinsic ones, so don't set http auth twice. -------------------------------------------------------------------------------- /roles/elk/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for elk 3 | kibana_web_port: 8080 4 | es_host: 5 | - master 6 | - c9 7 | kb_port: 5601 8 | kb_user: "kibana" 9 | kb_pass: "654321alsobad" 10 | es_user: "elastic" 11 | es_pass: "654321alsobad" 12 | filebeat_init: no 13 | filebeat_ln_modules: 14 | - system 15 | - nginx 16 | - apache2 17 | - mysql 18 | - iptables 19 | filebeat_cn_modules: 20 | - system 21 | -------------------------------------------------------------------------------- /roles/elk/files/apache2.yml: -------------------------------------------------------------------------------- 1 | - module: apache2 2 | # Access logs 3 | access: 4 | enabled: true 5 | 6 | # Set custom paths for the log files. If left empty, 7 | # Filebeat will choose the paths depending on your OS. 8 | #var.paths: 9 | 10 | # Error logs 11 | error: 12 | enabled: true 13 | 14 | # Set custom paths for the log files. If left empty, 15 | # Filebeat will choose the paths depending on your OS. 16 | #var.paths: 17 | var.convert_timezone: true 18 | -------------------------------------------------------------------------------- /roles/elk/files/apache2_pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Pipeline for parsing apache2 error logs", 3 | "processors": [ 4 | { 5 | "grok": { 6 | "field": "message", 7 | "patterns": [ 8 | "\\[%{APACHE_TIME:apache2.error.timestamp}\\] \\[%{LOGLEVEL:apache2.error.level}\\]( \\[client %{IPORHOST:apache2.error.client}\\])? %{GREEDYDATA:apache2.error.message}", 9 | "\\[%{APACHE_TIME:apache2.error.timestamp}\\] \\[%{DATA:apache2.error.module}:%{LOGLEVEL:apache2.error.level}\\] \\[pid %{NUMBER:apache2.error.pid}(:tid %{NUMBER:apache2.error.tid})?\\]( \\[client %{IPORHOST:apache2.error.client}\\])? 
%{GREEDYDATA:apache2.error.message1}" 10 | ], 11 | "pattern_definitions": { 12 | "APACHE_TIME": "%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{YEAR}" 13 | }, 14 | "ignore_missing": true 15 | } 16 | }, 17 | { 18 | "remove":{ 19 | "field": "message" 20 | } 21 | }, 22 | { 23 | "rename": { 24 | "field": "apache2.error.message1", 25 | "target_field": "apache2.error.message", 26 | "ignore_failure": true 27 | } 28 | }, 29 | { 30 | "date": { 31 | "field": "apache2.error.timestamp", 32 | "target_field": "@timestamp", 33 | "formats": ["EEE MMM dd H:m:s yyyy", "EEE MMM dd H:m:s.SSSSSS yyyy"], 34 | {< if .convert_timezone >}"timezone": "{{ beat.timezone }}",{< end >} 35 | "ignore_failure": true 36 | } 37 | }, 38 | { 39 | "remove": { 40 | "field": "apache2.error.timestamp", 41 | "ignore_failure": true 42 | } 43 | } 44 | ], 45 | "on_failure" : [{ 46 | "set" : { 47 | "field" : "error.message", 48 | "value" : "{{ _ingest.on_failure_message }}" 49 | } 50 | }] 51 | } 52 | -------------------------------------------------------------------------------- /roles/elk/files/jvm.options: -------------------------------------------------------------------------------- 1 | -Xmx8g 2 | -Xms8g 3 | -XX:+UseConcMarkSweepGC 4 | -XX:CMSInitiatingOccupancyFraction=75 5 | -XX:+UseCMSInitiatingOccupancyOnly 6 | -Des.networkaddress.cache.ttl=60 7 | -Des.networkaddress.cache.negative.ttl=10 8 | -XX:+AlwaysPreTouch 9 | -Xss1m 10 | -Djava.awt.headless=true 11 | -Dfile.encoding=UTF-8 12 | -Djna.nosys=true 13 | -XX:-OmitStackTraceInFastThrow 14 | -Dio.netty.noUnsafe=true 15 | -Dio.netty.noKeySetOptimization=true 16 | -Dio.netty.recycler.maxCapacityPerThread=0 17 | -Dlog4j.shutdownHookEnabled=false 18 | -Dlog4j2.disable.jmx=true 19 | -Djava.io.tmpdir=${ES_TMPDIR} 20 | -XX:+HeapDumpOnOutOfMemoryError 21 | -XX:HeapDumpPath=/var/lib/elasticsearch 22 | -XX:ErrorFile=/var/log/elasticsearch/hs_err_pid%p.log 23 | 8:-XX:+PrintGCDetails 24 | 8:-XX:+PrintGCDateStamps 25 | 8:-XX:+PrintTenuringDistribution 26 | 8:-XX:+PrintGCApplicationStoppedTime 27 | 8:-Xloggc:/var/log/elasticsearch/gc.log 28 | 8:-XX:+UseGCLogFileRotation 29 | 8:-XX:NumberOfGCLogFiles=32 30 | 8:-XX:GCLogFileSize=64m 31 | 9-:-Xlog:gc*,gc+age=trace,safepoint:file=/var/log/elasticsearch/gc.log:utctime,pid,tags:filecount=32,filesize=64m 32 | 9-:-Djava.locale.providers=COMPAT 33 | 10-:-XX:UseAVX=2 34 | -------------------------------------------------------------------------------- /roles/elk/files/nginx.yml: -------------------------------------------------------------------------------- 1 | - module: nginx 2 | # Access logs 3 | access: 4 | enabled: true 5 | 6 | # Set custom paths for the log files. If left empty, 7 | # Filebeat will choose the paths depending on your OS. 8 | #var.paths: 9 | 10 | # Convert the timestamp to UTC. Requires Elasticsearch >= 6.1. 11 | #var.convert_timezone: true 12 | 13 | # Error logs 14 | error: 15 | enabled: true 16 | 17 | # Set custom paths for the log files. If left empty, 18 | # Filebeat will choose the paths depending on your OS. 19 | #var.paths: 20 | 21 | # Convert the timestamp to UTC. Requires Elasticsearch >= 6.1. 22 | var.convert_timezone: true 23 | -------------------------------------------------------------------------------- /roles/elk/files/system.yml: -------------------------------------------------------------------------------- 1 | - module: system 2 | # Syslog 3 | syslog: 4 | enabled: true 5 | 6 | # Set custom paths for the log files. If left empty, 7 | # Filebeat will choose the paths depending on your OS. 
8 | #var.paths: 9 | 10 | # Convert the timestamp to UTC. Requires Elasticsearch >= 6.1. 11 | var.convert_timezone: true 12 | 13 | # Authorization logs 14 | auth: 15 | enabled: true 16 | 17 | # Set custom paths for the log files. If left empty, 18 | # Filebeat will choose the paths depending on your OS. 19 | #var.paths: 20 | 21 | # Convert the timestamp to UTC. Requires Elasticsearch >= 6.1. 22 | var.convert_timezone: true 23 | -------------------------------------------------------------------------------- /roles/elk/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for elk 3 | - name: install python3 apt for apt repo task 4 | become: yes 5 | apt: 6 | name: python3-apt 7 | state: present 8 | - name: add elastic apt key 9 | become: yes 10 | apt_key: 11 | url: https://artifacts.elastic.co/GPG-KEY-elasticsearch 12 | - name: add elastic repos for apt 13 | become: yes 14 | apt_repository: 15 | repo: "deb https://artifacts.elastic.co/packages/6.x/apt stable main" 16 | state: present 17 | update_cache: yes 18 | - name: install elastic 19 | become: yes 20 | apt: 21 | name: elasticsearch=6.8.0 22 | state: present 23 | when: inventory_hostname in es_host 24 | - name: elastic config 25 | become: yes 26 | template: 27 | src: elasticsearch.yml 28 | dest: /etc/elasticsearch/elasticsearch.yml 29 | owner: root 30 | group: elasticsearch 31 | mode: 0640 32 | when: inventory_hostname in es_host 33 | register: esconf 34 | - name: jvm option 35 | become: yes 36 | copy: 37 | src: jvm.options 38 | dest: /etc/elasticsearch/jvm.options 39 | owner: root 40 | group: elasticsearch 41 | mode: 0640 42 | when: inventory_hostname in es_host 43 | register: jvmconf 44 | - name: ensure cert directory exist 45 | become: yes 46 | file: 47 | path: /etc/elasticsearch/certs 48 | state: directory 49 | when: inventory_hostname in es_host 50 | register: cert 51 | - name: generate ssl ca 52 | when: inventory_hostname == es_host[0] and cert.changed 53 | become: yes 54 | command: '/usr/share/elasticsearch/bin/elasticsearch-certutil ca --pass "" --out /etc/elasticsearch/certs/elastic-stack-ca.p12' 55 | - name: generate ssl cert 56 | when: inventory_hostname == es_host[0] and cert.changed 57 | become: yes 58 | command: '/usr/share/elasticsearch/bin/elasticsearch-certutil cert --ca /etc/elasticsearch/certs/elastic-stack-ca.p12 --pass "" --out {{ role_path }}/files/elastic-certificates.p12 --ca-pass ""' 59 | - name: chown ssl key files 60 | become: yes 61 | when: inventory_hostname == es_host[0] and cert.changed 62 | file: 63 | owner: "{{ admin }}" 64 | path: "{{ role_path }}/files/elastic-certificates.p12" 65 | - name: copy ssl key files 66 | become: yes 67 | copy: 68 | src: elastic-certificates.p12 69 | dest: /etc/elasticsearch/certs/elastic-certificates.p12 70 | when: inventory_hostname in es_host 71 | register: sslconf 72 | - name: ensure es is started 73 | become: yes 74 | service: 75 | name: elasticsearch 76 | state: started 77 | enabled: yes 78 | when: inventory_hostname in es_host 79 | - name: elasticsearch restart 80 | become: yes 81 | service: 82 | name: elasticsearch 83 | state: restarted 84 | enabled: yes 85 | when: inventory_hostname in es_host and (esconf.changed or sslconf.changed or jvmconf.changed) 86 | - name: install kibana 87 | become: yes 88 | apt: 89 | name: kibana=6.8.0 90 | state: present 91 | when: inventory_hostname in groups['ln'] 92 | - name: configure kibana 93 | become: yes 94 | template: 95 | src: kibana.yml 96 | dest: 
/etc/kibana/kibana.yml 97 | mode: 0600 98 | owner: kibana 99 | when: inventory_hostname in groups['ln'] 100 | register: kbconf 101 | - name: ensure kibana is started 102 | become: yes 103 | service: 104 | name: kibana 105 | state: started 106 | enabled: yes 107 | when: inventory_hostname in groups['ln'] 108 | - name: service kibana enable and start 109 | become: yes 110 | service: 111 | name: kibana 112 | state: restarted 113 | enabled: yes 114 | when: inventory_hostname in groups['ln'] and kbconf.changed 115 | - name: install nginx 116 | become: yes 117 | apt: 118 | name: nginx 119 | state: present 120 | when: inventory_hostname in groups['ln'] 121 | - name: delete default nginx page 122 | become: yes 123 | file: 124 | path: /etc/nginx/sites-enabled/default 125 | state: absent 126 | when: inventory_hostname in groups['ln'] 127 | register: rmdefault 128 | - name: nginx kibana server set 129 | template: 130 | src: kibana.conf 131 | dest: /etc/nginx/sites-enabled/kibana.conf 132 | become: yes 133 | when: inventory_hostname in groups['ln'] 134 | register: kibanaserver 135 | - name: ensure nginx is started 136 | become: yes 137 | service: 138 | name: nginx 139 | state: started 140 | enabled: yes 141 | when: inventory_hostname in groups['ln'] 142 | - name: restart nginx 143 | become: yes 144 | service: 145 | name: nginx 146 | state: restarted 147 | enabled: yes 148 | when: inventory_hostname in groups['ln'] and (rmdefault.changed or kibanaserver.changed) 149 | - name: install logstash 150 | become: yes 151 | apt: 152 | name: logstash=1:6.8.0-1 153 | state: present 154 | when: inventory_hostname in groups['ln'] 155 | - name: config logstash 156 | template: 157 | src: "{{ item }}" 158 | dest: "/etc/logstash/conf.d/{{ item }}" 159 | owner: logstash 160 | mode: 0600 161 | become: yes 162 | when: inventory_hostname in groups['ln'] 163 | register: logstashconf 164 | with_items: 165 | - "02-beats-input.conf" 166 | - "30-elasticsearch-output.conf" 167 | - name: ensure logstash is started 168 | service: 169 | name: logstash 170 | state: started 171 | when: inventory_hostname in groups['ln'] 172 | become: yes 173 | - name: restart logstash service 174 | become: yes 175 | service: 176 | name: logstash 177 | state: restarted 178 | enabled: yes 179 | when: inventory_hostname in groups['ln'] and logstashconf.changed 180 | - name: install filebeat 181 | become: yes 182 | apt: 183 | name: filebeat=6.8.0 184 | state: present 185 | - name: config filebeat 186 | become: yes 187 | template: 188 | src: filebeat.yml 189 | dest: /etc/filebeat/filebeat.yml 190 | owner: root 191 | mode: 0600 192 | register: filebeatconf 193 | - name: filebeat modules file 194 | become: yes 195 | copy: 196 | src: "{{ item }}" 197 | dest: "/etc/filebeat/modules.d/{{ item }}" 198 | with_items: 199 | - nginx.yml 200 | - system.yml 201 | - apache2.yml 202 | when: inventory_hostname in groups['ln'] 203 | - name: filebeat modules file on cns 204 | become: yes 205 | copy: 206 | src: "{{ item }}" 207 | dest: "/etc/filebeat/modules.d/{{ item }}" 208 | with_items: 209 | - system.yml 210 | when: inventory_hostname in groups['cn'] 211 | - name: ensure fb is started 212 | become: yes 213 | service: 214 | name: filebeat 215 | state: started 216 | - name: restart filebeat 217 | become: yes 218 | service: 219 | name: filebeat 220 | state: restarted 221 | when: filebeatconf.changed 222 | ## the following tasks is used for initialization of fb 223 | - name: stop filebeat for init 224 | become: yes 225 | service: 226 | name: filebeat 227 | state: stopped 
228 | when: filebeat_init 229 | - name: hack apache2 error pipelines enabling it supporting timezone convert 230 | become: yes 231 | when: inventory_hostname in groups['ln'] and filebeat_init 232 | copy: 233 | dest: '/usr/share/filebeat/module/apache2/error/ingest/pipeline.json' 234 | src: 'apache2_pipeline.json' 235 | - name: delete all exisiting pipelines 236 | shell: "unset http_proxy&&curl -XDELETE -u {{ es_user }}:{{ es_pass }} 'http://{{ es_host[0] }}:9200/_ingest/pipeline/filebeat*'" 237 | when: inventory_hostname in groups['ln'] and filebeat_init 238 | - name: enable filebeat module in login node 239 | become: yes 240 | command: "filebeat modules enable {{item}}" 241 | with_items: 242 | "{{ filebeat_ln_modules }}" 243 | when: inventory_hostname in groups['ln'] and filebeat_init 244 | register: r 245 | changed_when: r.stdout.startswith("Enable") 246 | - name: enable filebeat module in compute nodes 247 | become: yes 248 | command: "filebeat modules enable {{item}}" 249 | with_items: 250 | "{{ filebeat_cn_modules }}" 251 | when: inventory_hostname in groups['cn'] and filebeat_init 252 | register: rcn 253 | changed_when: rcn.stdout.startswith("Enable") 254 | - name: filebeat setup init 255 | become: yes 256 | shell: "unset http_proxy&&filebeat setup -e -E output.logstash.enabled=false -E output.elasticsearch.hosts=[{{ hostvars[es_host[0]]['ip'] }}:9200] -E output.elasticsearch.username={{ es_user }} -E output.elasticsearch.password={{ es_pass }}" 257 | when: inventory_hostname in groups['ln'] and filebeat_init 258 | - name: filebeat add pipelines into ES 259 | become: yes 260 | shell: "unset http_proxy&&filebeat setup --pipelines --modules {{ filebeat_ln_modules|join(',') }} -E output.logstash.enabled=false -E output.elasticsearch.hosts=['{{ es_host[0] }}:9200'] -E output.elasticsearch.username={{ es_user }} -E output.elasticsearch.password={{ es_pass }} -M system.auth.var.convert_timezone=true -M system.syslog.var.convert_timezone=true -M nginx.error.var.convert_timezone=true -M apache2.error.var.convert_timezone=true" 261 | when: inventory_hostname in groups['ln'] and filebeat_init 262 | - name: start filebeat again 263 | become: yes 264 | service: 265 | name: filebeat 266 | state: started 267 | when: filebeat_init 268 | -------------------------------------------------------------------------------- /roles/elk/templates/02-beats-input.conf: -------------------------------------------------------------------------------- 1 | input { 2 | beats { 3 | port => 5044 4 | ssl => false 5 | host => "{{ master_ip }}" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /roles/elk/templates/30-elasticsearch-output.conf: -------------------------------------------------------------------------------- 1 | output { 2 | if [@metadata][pipeline] { 3 | elasticsearch { 4 | hosts => ["{{ es_host[0] }}:9200"] 5 | manage_template => false 6 | index => "%{[@metadata][beat]}-%{[@metadata][version]}-%{+YYYY.MM.dd}" 7 | pipeline => "%{[@metadata][pipeline]}" 8 | user => "{{ es_user }}" 9 | password => "{{ es_pass }}" 10 | } 11 | } else { 12 | elasticsearch { 13 | hosts => ["{{ es_host[0] }}:9200"] 14 | manage_template => false 15 | index => "%{[@metadata][beat]}-%{[@metadata][version]}-%{+YYYY.MM.dd}" 16 | user => "{{ es_user }}" 17 | password => "{{ es_pass }}" 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /roles/elk/templates/elasticsearch.yml: 
-------------------------------------------------------------------------------- 1 | # ======================== Elasticsearch Configuration ========================= 2 | # 3 | # NOTE: Elasticsearch comes with reasonable defaults for most settings. 4 | # Before you set out to tweak and tune the configuration, make sure you 5 | # understand what are you trying to accomplish and the consequences. 6 | # 7 | # The primary way of configuring a node is via this file. This template lists 8 | # the most important settings you may want to configure for a production cluster. 9 | # 10 | # Please consult the documentation for further information on configuration options: 11 | # https://www.elastic.co/guide/en/elasticsearch/reference/index.html 12 | # 13 | # ---------------------------------- Cluster ----------------------------------- 14 | # 15 | # Use a descriptive name for your cluster: 16 | # 17 | cluster.name: hpc2-es 18 | # 19 | # ------------------------------------ Node ------------------------------------ 20 | # 21 | # Use a descriptive name for the node: 22 | # 23 | node.name: {{ inventory_hostname }} 24 | # 25 | # Add custom attributes to the node: 26 | # 27 | #node.attr.rack: r1 28 | # 29 | # ----------------------------------- Paths ------------------------------------ 30 | # 31 | # Path to directory where to store the data (separate multiple locations by comma): 32 | # 33 | path.data: /var/lib/elasticsearch 34 | # 35 | # Path to log files: 36 | # 37 | path.logs: /var/log/elasticsearch 38 | # 39 | # ----------------------------------- Memory ----------------------------------- 40 | # 41 | # Lock the memory on startup: 42 | # 43 | #bootstrap.memory_lock: true 44 | # 45 | # Make sure that the heap size is set to about half the memory available 46 | # on the system and that the owner of the process is allowed to use this 47 | # limit. 48 | # 49 | # Elasticsearch performs poorly when the system is swapping the memory. 50 | # 51 | # ---------------------------------- Network ----------------------------------- 52 | # 53 | # Set the bind address to a specific IP (IPv4 or IPv6): 54 | # 55 | network.host: {{ hostvars[inventory_hostname]['ip'] }} 56 | # 57 | # Set a custom port for HTTP: 58 | # 59 | #http.port: 9200 60 | # 61 | # For more information, consult the network module documentation. 62 | # 63 | # --------------------------------- Discovery ---------------------------------- 64 | # 65 | # Pass an initial list of hosts to perform discovery when new node is started: 66 | # The default list of hosts is ["127.0.0.1", "[::1]"] 67 | # 68 | discovery.zen.ping.unicast.hosts: [{% for h in es_host %} "{{ h }}" {{ "," if not loop.last else "" }} {% endfor %}] 69 | # 70 | # Prevent the "split brain" by configuring the majority of nodes (total number of master-eligible nodes / 2 + 1): 71 | # 72 | discovery.zen.minimum_master_nodes: 2 73 | # 74 | # For more information, consult the zen discovery module documentation. 75 | # 76 | # ---------------------------------- Gateway ----------------------------------- 77 | # 78 | # Block initial recovery after a full cluster restart until N nodes are started: 79 | # 80 | #gateway.recover_after_nodes: 3 81 | # 82 | # For more information, consult the gateway module documentation. 
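# As a concrete illustration (hypothetical host list, not part of this role's defaults): with
# es_host set to [master, c1, c2], the Jinja loop in the Discovery section above renders,
# modulo whitespace, as
#   discovery.zen.ping.unicast.hosts: [ "master", "c1", "c2" ]
# and minimum_master_nodes: 2 then matches the quorum formula quoted above for three
# master-eligible nodes, floor(3 / 2) + 1 = 2.
#
# NOTE: the xpack TLS settings near the end of this template reference a keystore at
# certs/elastic-certificates.p12 (relative to the Elasticsearch config directory), which this
# role does not create. As an illustrative sketch only, assuming the certutil tool bundled with
# the 6.x .deb packages and its default install paths, the keystore could be generated once and
# copied to every node roughly like this:
#   /usr/share/elasticsearch/bin/elasticsearch-certutil ca --out elastic-stack-ca.p12
#   /usr/share/elasticsearch/bin/elasticsearch-certutil cert --ca elastic-stack-ca.p12 --out elastic-certificates.p12
#   mkdir -p /etc/elasticsearch/certs
#   install -o root -g elasticsearch -m 0640 elastic-certificates.p12 /etc/elasticsearch/certs/
# The paths and filenames above are assumptions about the package layout, not part of this role.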
83 | # 84 | # ---------------------------------- Various ----------------------------------- 85 | # 86 | # Require explicit names when deleting indices: 87 | # 88 | #action.destructive_requires_name: true 89 | 90 | xpack.security.enabled: true 91 | xpack.security.transport.ssl.enabled: true 92 | xpack.security.transport.ssl.verification_mode: certificate 93 | xpack.security.transport.ssl.keystore.path: certs/elastic-certificates.p12 94 | xpack.security.transport.ssl.truststore.path: certs/elastic-certificates.p12 95 | 96 | ## miscs 97 | -------------------------------------------------------------------------------- /roles/elk/templates/filebeat.yml: -------------------------------------------------------------------------------- 1 | ###################### Filebeat Configuration Example ######################### 2 | 3 | # This file is an example configuration file highlighting only the most common 4 | # options. The filebeat.reference.yml file from the same directory contains all the 5 | # supported options with more comments. You can use it as a reference. 6 | # 7 | # You can find the full configuration reference here: 8 | # https://www.elastic.co/guide/en/beats/filebeat/index.html 9 | 10 | # For more available modules and options, please see the filebeat.reference.yml sample 11 | # configuration file. 12 | 13 | #=========================== Filebeat inputs ============================= 14 | 15 | filebeat.inputs: 16 | 17 | # Each - is an input. Most options can be set at the input level, so 18 | # you can use different inputs for various configurations. 19 | # Below are the input specific configurations. 20 | 21 | - type: log 22 | 23 | # Change to true to enable this input configuration. 24 | enabled: false 25 | 26 | # Paths that should be crawled and fetched. Glob based paths. 27 | paths: 28 | - /var/log/*.log 29 | #- c:\programdata\elasticsearch\logs\* 30 | 31 | # Exclude lines. A list of regular expressions to match. It drops the lines that are 32 | # matching any regular expression from the list. 33 | #exclude_lines: ['^DBG'] 34 | 35 | # Include lines. A list of regular expressions to match. It exports the lines that are 36 | # matching any regular expression from the list. 37 | #include_lines: ['^ERR', '^WARN'] 38 | 39 | # Exclude files. A list of regular expressions to match. Filebeat drops the files that 40 | # are matching any regular expression from the list. By default, no files are dropped. 41 | #exclude_files: ['.gz$'] 42 | 43 | # Optional additional fields. These fields can be freely picked 44 | # to add additional information to the crawled log files for filtering 45 | #fields: 46 | # level: debug 47 | # review: 1 48 | 49 | ### Multiline options 50 | 51 | # Multiline can be used for log messages spanning multiple lines. This is common 52 | # for Java Stack Traces or C-Line Continuation 53 | 54 | # The regexp Pattern that has to be matched. The example pattern matches all lines starting with [ 55 | #multiline.pattern: ^\[ 56 | 57 | # Defines if the pattern set under pattern should be negated or not. Default is false. 58 | #multiline.negate: false 59 | 60 | # Match can be set to "after" or "before". It is used to define if lines should be append to a pattern 61 | # that was (not) matched before or after or as long as a pattern is not matched based on negate. 
62 | # Note: After is the equivalent to previous and before is the equivalent to to next in Logstash 63 | #multiline.match: after 64 | 65 | 66 | #============================= Filebeat modules =============================== 67 | 68 | filebeat.config.modules: 69 | # Glob pattern for configuration loading 70 | path: ${path.config}/modules.d/*.yml 71 | 72 | # Set to true to enable config reloading 73 | reload.enabled: false 74 | 75 | # Period on which files under path should be checked for changes 76 | #reload.period: 10s 77 | 78 | #==================== Elasticsearch template setting ========================== 79 | 80 | setup.template.settings: 81 | index.number_of_shards: 3 82 | #index.codec: best_compression 83 | #_source.enabled: false 84 | 85 | #================================ General ===================================== 86 | 87 | # The name of the shipper that publishes the network data. It can be used to group 88 | # all the transactions sent by a single shipper in the web interface. 89 | #name: 90 | 91 | # The tags of the shipper are included in their own field with each 92 | # transaction published. 93 | #tags: ["service-X", "web-tier"] 94 | 95 | # Optional fields that you can specify to add additional information to the 96 | # output. 97 | #fields: 98 | # env: staging 99 | 100 | 101 | #============================== Dashboards ===================================== 102 | # These settings control loading the sample dashboards to the Kibana index. Loading 103 | # the dashboards is disabled by default and can be enabled either by setting the 104 | # options here, or by using the `-setup` CLI flag or the `setup` command. 105 | #setup.dashboards.enabled: false 106 | 107 | # The URL from where to download the dashboards archive. By default this URL 108 | # has a value which is computed based on the Beat name and version. For released 109 | # versions, this URL points to the dashboard archive on the artifacts.elastic.co 110 | # website. 111 | #setup.dashboards.url: 112 | 113 | #============================== Kibana ===================================== 114 | 115 | # Starting with Beats version 6.0.0, the dashboards are loaded via the Kibana API. 116 | # This requires a Kibana endpoint configuration. 117 | setup.kibana: 118 | 119 | # Kibana Host 120 | # Scheme and port can be left out and will be set to the default (http and 5601) 121 | # In case you specify and additional path, the scheme is required: http://localhost:5601/path 122 | # IPv6 addresses should always be defined as: https://[2001:db8::1]:5601 123 | #host: "localhost:5601" 124 | 125 | # Kibana Space ID 126 | # ID of the Kibana Space into which the dashboards should be loaded. By default, 127 | # the Default Space will be used. 128 | #space.id: 129 | 130 | #============================= Elastic Cloud ================================== 131 | 132 | # These settings simplify using filebeat with the Elastic Cloud (https://cloud.elastic.co/). 133 | 134 | # The cloud.id setting overwrites the `output.elasticsearch.hosts` and 135 | # `setup.kibana.host` options. 136 | # You can find the `cloud.id` in the Elastic Cloud web UI. 137 | #cloud.id: 138 | 139 | # The cloud.auth setting overwrites the `output.elasticsearch.username` and 140 | # `output.elasticsearch.password` settings. The format is `:`. 141 | #cloud.auth: 142 | 143 | #================================ Outputs ===================================== 144 | 145 | # Configure what output to use when sending the data collected by the beat. 
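# In this role the Elasticsearch output below stays commented out and only the Logstash output
# is active. With a hypothetical master_ip of 10.0.0.1, the template renders that section as
#   output.logstash:
#     hosts: ["10.0.0.1:5044"]
# i.e. events are shipped (without SSL) to the beats input on port 5044 defined in
# 02-beats-input.conf.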
146 | 147 | #-------------------------- Elasticsearch output ------------------------------ 148 | #output.elasticsearch: 149 | # Array of hosts to connect to. 150 | # hosts: ["localhost:9200"] 151 | 152 | # Enabled ilm (beta) to use index lifecycle management instead daily indices. 153 | #ilm.enabled: false 154 | 155 | # Optional protocol and basic auth credentials. 156 | #protocol: "https" 157 | #username: "elastic" 158 | #password: "changeme" 159 | 160 | #----------------------------- Logstash output -------------------------------- 161 | output.logstash: 162 | # The Logstash hosts 163 | hosts: ["{{ master_ip }}:5044"] 164 | ssl: 165 | enabled: false 166 | # Optional SSL. By default is off. 167 | # List of root certificates for HTTPS server verifications 168 | #ssl.certificate_authorities: ["/etc/pki/root/ca.pem"] 169 | 170 | # Certificate for SSL client authentication 171 | #ssl.certificate: "/etc/pki/client/cert.pem" 172 | 173 | # Client Certificate Key 174 | #ssl.key: "/etc/pki/client/cert.key" 175 | 176 | #================================ Processors ===================================== 177 | 178 | # Configure processors to enhance or manipulate events generated by the beat. 179 | 180 | processors: 181 | - add_host_metadata: ~ 182 | - add_cloud_metadata: ~ 183 | 184 | #================================ Logging ===================================== 185 | 186 | # Sets log level. The default log level is info. 187 | # Available log levels are: error, warning, info, debug 188 | #logging.level: debug 189 | 190 | # At debug level, you can selectively enable logging only for some components. 191 | # To enable all selectors use ["*"]. Examples of other selectors are "beat", 192 | # "publish", "service". 193 | #logging.selectors: ["*"] 194 | 195 | #============================== Xpack Monitoring =============================== 196 | # filebeat can export internal metrics to a central Elasticsearch monitoring 197 | # cluster. This requires xpack monitoring to be enabled in Elasticsearch. The 198 | # reporting is disabled by default. 199 | 200 | # Set to true to enable the monitoring reporter. 201 | #xpack.monitoring.enabled: false 202 | 203 | # Uncomment to send the metrics to Elasticsearch. Most settings from the 204 | # Elasticsearch output are accepted here as well. Any setting that is not set is 205 | # automatically inherited from the Elasticsearch output configuration, so if you 206 | # have the Elasticsearch output configured, you can simply uncomment the 207 | # following line. 208 | #xpack.monitoring.elasticsearch: 209 | -------------------------------------------------------------------------------- /roles/elk/templates/kibana.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen {{ kibana_web_port }}; 3 | 4 | location / { 5 | proxy_pass http://localhost:{{ kb_port }}; 6 | proxy_http_version 1.1; 7 | proxy_set_header Upgrade $http_upgrade; 8 | proxy_set_header Connection 'upgrade'; 9 | proxy_set_header Host $host; 10 | proxy_cache_bypass $http_upgrade; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /roles/elk/templates/kibana.yml: -------------------------------------------------------------------------------- 1 | # Set es user and password for kibana to connect 2 | elasticsearch.username: "{{ kb_user }}" 3 | elasticsearch.password: "{{ kb_pass }}" 4 | 5 | # Kibana is served by a back end server. This setting specifies the port to use. 
6 | server.port: {{ kb_port }} 7 | 8 | # Specifies the address to which the Kibana server will bind. IP addresses and host names are both valid values. 9 | # The default is 'localhost', which usually means remote machines will not be able to connect. 10 | # To allow connections from remote users, set this parameter to a non-loopback address. 11 | #server.host: "localhost" 12 | 13 | # Enables you to specify a path to mount Kibana at if you are running behind a proxy. 14 | # Use the `server.rewriteBasePath` setting to tell Kibana if it should remove the basePath 15 | # from requests it receives, and to prevent a deprecation warning at startup. 16 | # This setting cannot end in a slash. 17 | #server.basePath: "" 18 | 19 | # Specifies whether Kibana should rewrite requests that are prefixed with 20 | # `server.basePath` or require that they are rewritten by your reverse proxy. 21 | # This setting was effectively always `false` before Kibana 6.3 and will 22 | # default to `true` starting in Kibana 7.0. 23 | #server.rewriteBasePath: false 24 | 25 | # The maximum payload size in bytes for incoming server requests. 26 | #server.maxPayloadBytes: 1048576 27 | 28 | # The Kibana server's name. This is used for display purposes. 29 | #server.name: "your-hostname" 30 | 31 | # The URLs of the Elasticsearch instances to use for all your queries. 32 | elasticsearch.hosts: ["http://{{ es_host[0] }}:9200"] 33 | 34 | # When this setting's value is true Kibana uses the hostname specified in the server.host 35 | # setting. When the value of this setting is false, Kibana uses the hostname of the host 36 | # that connects to this Kibana instance. 37 | #elasticsearch.preserveHost: true 38 | 39 | # Kibana uses an index in Elasticsearch to store saved searches, visualizations and 40 | # dashboards. Kibana creates a new index if the index doesn't already exist. 41 | #kibana.index: ".kibana" 42 | 43 | # The default application to load. 44 | #kibana.defaultAppId: "home" 45 | 46 | # If your Elasticsearch is protected with basic authentication, these settings provide 47 | # the username and password that the Kibana server uses to perform maintenance on the Kibana 48 | # index at startup. Your Kibana users still need to authenticate with Elasticsearch, which 49 | # is proxied through the Kibana server. 50 | #elasticsearch.username: "user" 51 | #elasticsearch.password: "pass" 52 | 53 | # Enables SSL and paths to the PEM-format SSL certificate and SSL key files, respectively. 54 | # These settings enable SSL for outgoing requests from the Kibana server to the browser. 55 | #server.ssl.enabled: false 56 | #server.ssl.certificate: /path/to/your/server.crt 57 | #server.ssl.key: /path/to/your/server.key 58 | 59 | # Optional settings that provide the paths to the PEM-format SSL certificate and key files. 60 | # These files validate that your Elasticsearch backend uses the same key files. 61 | #elasticsearch.ssl.certificate: /path/to/your/client.crt 62 | #elasticsearch.ssl.key: /path/to/your/client.key 63 | 64 | # Optional setting that enables you to specify a path to the PEM file for the certificate 65 | # authority for your Elasticsearch instance. 66 | #elasticsearch.ssl.certificateAuthorities: [ "/path/to/your/CA.pem" ] 67 | 68 | # To disregard the validity of SSL certificates, change this setting's value to 'none'. 69 | #elasticsearch.ssl.verificationMode: full 70 | 71 | # Time in milliseconds to wait for Elasticsearch to respond to pings. Defaults to the value of 72 | # the elasticsearch.requestTimeout setting. 
73 | #elasticsearch.pingTimeout: 1500 74 | 75 | # Time in milliseconds to wait for responses from the back end or Elasticsearch. This value 76 | # must be a positive integer. 77 | #elasticsearch.requestTimeout: 30000 78 | 79 | # List of Kibana client-side headers to send to Elasticsearch. To send *no* client-side 80 | # headers, set this value to [] (an empty list). 81 | #elasticsearch.requestHeadersWhitelist: [ authorization ] 82 | 83 | # Header names and values that are sent to Elasticsearch. Any custom headers cannot be overwritten 84 | # by client-side headers, regardless of the elasticsearch.requestHeadersWhitelist configuration. 85 | #elasticsearch.customHeaders: {} 86 | 87 | # Time in milliseconds for Elasticsearch to wait for responses from shards. Set to 0 to disable. 88 | #elasticsearch.shardTimeout: 30000 89 | 90 | # Time in milliseconds to wait for Elasticsearch at Kibana startup before retrying. 91 | #elasticsearch.startupTimeout: 5000 92 | 93 | # Logs queries sent to Elasticsearch. Requires logging.verbose set to true. 94 | #elasticsearch.logQueries: false 95 | 96 | # Specifies the path where Kibana creates the process ID file. 97 | #pid.file: /var/run/kibana.pid 98 | 99 | # Enables you specify a file where Kibana stores log output. 100 | #logging.dest: stdout 101 | 102 | # Set the value of this setting to true to suppress all logging output. 103 | #logging.silent: false 104 | 105 | # Set the value of this setting to true to suppress all logging output other than error messages. 106 | #logging.quiet: false 107 | 108 | # Set the value of this setting to true to log all events, including system usage information 109 | # and all requests. 110 | #logging.verbose: false 111 | 112 | # Set the interval in milliseconds to sample system and process performance 113 | # metrics. Minimum is 100ms. Defaults to 5000. 114 | #ops.interval: 5000 115 | 116 | # Specifies locale to be used for all localizable strings, dates and number formats. 117 | #i18n.locale: "en" 118 | -------------------------------------------------------------------------------- /roles/ganglia/README.md: -------------------------------------------------------------------------------- 1 | Ganglia 2 | ========= 3 | 4 | This role is designed to configure ganglia monitoring tools on the cluster. 5 | 6 | 7 | Role Variables 8 | -------------- 9 | 10 | See defaults/main.yml. ganglia_url is the url to access ganglia webfrontend. For example, you can visit the web interface by http:///. 
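With the default ganglia_url of ganglia and a login node reachable as, say, master, the web frontend would be served at http://master/ganglia/ (the hostname here is only an example). The Apache site protects it with HTTP basic auth, so a quick smoke test could look like:

    curl -u admin:123456notgood http://master/ganglia/

where admin / 123456notgood are merely the role defaults (ganglia_http_user and ganglia_http_pass in defaults/main.yml) and should be overridden for any real deployment.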
-------------------------------------------------------------------------------- /roles/ganglia/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for ganglia 3 | ganglia_url: ganglia 4 | ganglia_http_user: admin 5 | ganglia_http_pass: 123456notgood 6 | # the following one is used to configure the general avail monitoring scripts 7 | num_nfs_cn: 4 8 | num_ext_ln: 3 9 | disk_warning: 350 10 | master_nic_no: 5 11 | memory_avail_warning: 5000000 12 | -------------------------------------------------------------------------------- /roles/ganglia/files/cpu_stats.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import traceback 3 | import os 4 | import re 5 | import time 6 | import copy 7 | 8 | METRICS = { 9 | 'time': 0, 10 | 'data': {} 11 | } 12 | 13 | # Got these from /proc/softirqs 14 | softirq_pos = { 15 | 'hi': 1, 16 | 'timer': 2, 17 | 'nettx': 3, 18 | 'netrx': 4, 19 | 'block': 5, 20 | 'blockiopoll': 6, 21 | 'tasklet': 7, 22 | 'sched': 8, 23 | 'hrtimer': 9, 24 | 'rcu': 10 25 | } 26 | 27 | LAST_METRICS = copy.deepcopy(METRICS) 28 | METRICS_CACHE_MAX = 5 29 | 30 | 31 | stat_file = "/proc/stat" 32 | 33 | ############################################################################### 34 | # 35 | ############################################################################### 36 | 37 | 38 | def get_metrics(): 39 | """Return all metrics""" 40 | 41 | global METRICS, LAST_METRICS 42 | 43 | if (time.time() - METRICS['time']) > METRICS_CACHE_MAX: 44 | 45 | try: 46 | file = open(stat_file, 'r') 47 | 48 | except IOError: 49 | return 0 50 | 51 | # convert to dict 52 | metrics = {} 53 | for line in file: 54 | parts = re.split("\s+", line) 55 | metrics[parts[0]] = list(parts[1:]) 56 | 57 | # update cache 58 | LAST_METRICS = copy.deepcopy(METRICS) 59 | METRICS = { 60 | 'time': time.time(), 61 | 'data': metrics 62 | } 63 | 64 | return [METRICS, LAST_METRICS] 65 | 66 | 67 | def get_value(name): 68 | """Return a value for the requested metric""" 69 | 70 | metrics = get_metrics()[0] 71 | 72 | NAME_PREFIX = "cpu_" 73 | 74 | name = name.replace(NAME_PREFIX, "") # remove prefix from name 75 | 76 | try: 77 | result = metrics['data'][name][0] 78 | except StandardError: 79 | result = 0 80 | 81 | return result 82 | 83 | 84 | def get_delta(name): 85 | """Return change over time for the requested metric""" 86 | 87 | # get metrics 88 | [curr_metrics, last_metrics] = get_metrics() 89 | 90 | NAME_PREFIX = "cpu_" 91 | 92 | name = name.replace(NAME_PREFIX, "") # remove prefix from name 93 | 94 | if name == "procs_created": 95 | name = "processes" 96 | 97 | try: 98 | delta = (float(curr_metrics['data'][name][0]) - float(last_metrics['data'][name][0])) / (curr_metrics['time'] - last_metrics['time']) 99 | if delta < 0: 100 | print name + " is less 0" 101 | delta = 0 102 | except KeyError: 103 | delta = 0.0 104 | 105 | return delta 106 | 107 | ############################################################################## 108 | # SoftIRQ has multiple values which are defined in a dictionary at the top 109 | ############################################################################## 110 | 111 | 112 | def get_softirq_delta(name): 113 | """Return change over time for the requested metric""" 114 | 115 | # get metrics 116 | [curr_metrics, last_metrics] = get_metrics() 117 | 118 | NAME_PREFIX = "softirq_" 119 | 120 | name = name[len(NAME_PREFIX):] # remove prefix from name 121 | 122 | index = 
softirq_pos[name] 123 | 124 | try: 125 | delta = (float(curr_metrics['data']['softirq'][index]) - float(last_metrics['data']['softirq'][index])) / (curr_metrics['time'] - last_metrics['time']) 126 | if delta < 0: 127 | print name + " is less 0" 128 | delta = 0 129 | except KeyError: 130 | delta = 0.0 131 | 132 | return delta 133 | 134 | 135 | def create_desc(skel, prop): 136 | d = skel.copy() 137 | for k, v in prop.iteritems(): 138 | d[k] = v 139 | return d 140 | 141 | 142 | def metric_init(params): 143 | global descriptors, metric_map, Desc_Skel 144 | 145 | descriptors = [] 146 | 147 | Desc_Skel = { 148 | 'name' : 'XXX', 149 | 'orig_name' : 'XXX', 150 | 'call_back' : get_delta, 151 | 'time_max' : 60, 152 | 'value_type' : 'float', 153 | 'format' : '%.0f', 154 | 'units' : 'XXX', 155 | 'slope' : 'both', # zero|positive|negative|both 156 | 'description' : '', 157 | 'groups' : 'cpu', 158 | } 159 | 160 | descriptors.append(create_desc(Desc_Skel, { 161 | "name" : "cpu_ctxt", 162 | "units" : "ctxs/sec", 163 | "description": "Context Switches", 164 | })) 165 | 166 | descriptors.append(create_desc(Desc_Skel, { 167 | "name" : "procs_created", 168 | "units" : "proc/sec", 169 | "description": "Number of processes and threads created", 170 | })) 171 | 172 | descriptors.append(create_desc(Desc_Skel, { 173 | "name" : "cpu_intr", 174 | "units" : "intr/sec", 175 | "description": "Interrupts serviced", 176 | })) 177 | 178 | descriptors.append(create_desc(Desc_Skel, { 179 | "name" : "procs_blocked", 180 | "units" : "processes", 181 | "call_back" : get_value, 182 | "description": "Processes blocked", 183 | })) 184 | 185 | descriptors.append(create_desc(Desc_Skel, { 186 | "name" : "softirq", 187 | "units" : "ops/s", 188 | "description": "Soft IRQs", 189 | })) 190 | 191 | descriptors.append(create_desc(Desc_Skel, { 192 | "name" : "softirq_hi", 193 | "units" : "ops/s", 194 | 'groups' : 'softirq', 195 | "call_back" : get_softirq_delta 196 | })) 197 | 198 | descriptors.append(create_desc(Desc_Skel, { 199 | "name" : "softirq_timer", 200 | "units" : "ops/s", 201 | 'groups' : 'softirq', 202 | "call_back" : get_softirq_delta 203 | })) 204 | 205 | descriptors.append(create_desc(Desc_Skel, { 206 | "name" : "softirq_nettx", 207 | "units" : "ops/s", 208 | 'groups' : 'softirq', 209 | "call_back" : get_softirq_delta 210 | })) 211 | 212 | descriptors.append(create_desc(Desc_Skel, { 213 | "name" : "softirq_netrx", 214 | "units" : "ops/s", 215 | 'groups' : 'softirq', 216 | "call_back" : get_softirq_delta 217 | })) 218 | 219 | descriptors.append(create_desc(Desc_Skel, { 220 | "name" : "softirq_block", 221 | "units" : "ops/s", 222 | 'groups' : 'softirq', 223 | "call_back" : get_softirq_delta 224 | })) 225 | 226 | descriptors.append(create_desc(Desc_Skel, { 227 | "name" : "softirq_blockiopoll", 228 | "units" : "ops/s", 229 | 'groups' : 'softirq', 230 | "call_back" : get_softirq_delta 231 | })) 232 | 233 | descriptors.append(create_desc(Desc_Skel, { 234 | "name" : "softirq_tasklet", 235 | "units" : "ops/s", 236 | 'groups' : 'softirq', 237 | "call_back" : get_softirq_delta 238 | })) 239 | 240 | descriptors.append(create_desc(Desc_Skel, { 241 | "name" : "softirq_sched", 242 | "units" : "ops/s", 243 | 'groups' : 'softirq', 244 | "call_back" : get_softirq_delta 245 | })) 246 | 247 | descriptors.append(create_desc(Desc_Skel, { 248 | "name" : "softirq_hrtimer", 249 | "units" : "ops/s", 250 | 'groups' : 'softirq', 251 | "call_back" : get_softirq_delta 252 | })) 253 | 254 | descriptors.append(create_desc(Desc_Skel, { 255 | "name" : 
"softirq_rcu", 256 | "units" : "ops/s", 257 | 'groups' : 'softirq', 258 | "call_back" : get_softirq_delta 259 | })) 260 | 261 | # We need a metric_map that maps metric_name to the index in /proc/meminfo 262 | metric_map = {} 263 | 264 | for d in descriptors: 265 | metric_name = d['name'] 266 | metric_map[metric_name] = {"name": d['orig_name'], "units": d['units']} 267 | 268 | return descriptors 269 | 270 | 271 | def metric_cleanup(): 272 | '''Clean up the metric module.''' 273 | pass 274 | 275 | 276 | #This code is for debugging and unit testing 277 | if __name__ == '__main__': 278 | metric_init({}) 279 | while True: 280 | for d in descriptors: 281 | v = d['call_back'](d['name']) 282 | print '%s = %s' % (d['name'], v) 283 | print 'Sleeping 15 seconds' 284 | time.sleep(5) 285 | -------------------------------------------------------------------------------- /roles/ganglia/files/gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # written by refraction-ray@Aug 2019 3 | 4 | GMETRIC=/usr/bin/gmetric 5 | LOG=/tmp/.gpu.log 6 | 7 | nvidia-smi -q > $LOG 8 | 9 | a=( $(cat $LOG |grep "Fan Speed"|awk '{print $4}') ) 10 | j=1 11 | for i in "${a[@]}"; do 12 | $GMETRIC -t float -n "gpu_${j}_fan_speed" -g "GPU" -u "percent" -v $i 13 | j=$[$j+1] 14 | done 15 | a=( $(cat $LOG |grep "GPU Current Temp"|awk '{print $5}') ) 16 | j=1 17 | for i in "${a[@]}"; do 18 | $GMETRIC -t float -n "gpu_${j}_temp" -g "GPU" -u "Celcius" -v $i 19 | j=$[$j+1] 20 | done 21 | a=( $(cat $LOG |grep "Power Draw"|awk '{print $4}') ) 22 | j=1 23 | for i in "${a[@]}"; do 24 | $GMETRIC -t float -n "gpu_${j}_power_draw" -g "GPU" -u "Watt" -v $i 25 | j=$[$j+1] 26 | done 27 | a=( $(cat $LOG |grep "FB Memory Usage" -A 3|grep "Used"|awk '{print $3}') ) 28 | j=1 29 | for i in "${a[@]}"; do 30 | $GMETRIC -t float -n "gpu_${j}_mem_used" -g "GPU" -u "MiB" -v $i 31 | j=$[$j+1] 32 | done 33 | a=( $(cat $LOG |grep "Utilization" -A 3|grep "Gpu"|awk '{print $3}') ) 34 | j=1 35 | for i in "${a[@]}"; do 36 | $GMETRIC -t float -n "gpu_${j}_utilization" -g "GPU" -u "persent" -v $i 37 | j=$[$j+1] 38 | done 39 | -------------------------------------------------------------------------------- /roles/ganglia/files/netstats.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import time 4 | import copy 5 | import string 6 | 7 | PARAMS = {} 8 | 9 | METRICS = { 10 | 'time' : 0, 11 | 'data' : {} 12 | } 13 | 14 | stats_files = [ "/proc/net/netstat", "/proc/net/snmp" ] 15 | 16 | LAST_METRICS = copy.deepcopy(METRICS) 17 | METRICS_CACHE_MAX = 5 18 | 19 | stats_pos = {} 20 | 21 | def get_metrics(): 22 | """Return all metrics""" 23 | 24 | global METRICS, LAST_METRICS 25 | 26 | if (time.time() - METRICS['time']) > METRICS_CACHE_MAX: 27 | 28 | new_metrics = {} 29 | 30 | for file in stats_files: 31 | try: 32 | file = open(file, 'r') 33 | 34 | except IOError: 35 | return 0 36 | 37 | # convert to dict 38 | metrics = {} 39 | for line in file: 40 | if re.match("(.*): [0-9]", line): 41 | count = 0 42 | metrics = re.split("\s+", line) 43 | metric_group = metrics[0].replace(":", "").lower() 44 | new_metrics[metric_group] = dict() 45 | for value in metrics: 46 | # Skip first 47 | if count > 0 and value >= 0 and count in stats_pos[metric_group]: 48 | metric_name = stats_pos[metric_group][count] 49 | new_metrics[metric_group][metric_name] = value 50 | count += 1 51 | 52 | file.close() 53 | 54 | # update cache 55 | LAST_METRICS = copy.deepcopy(METRICS) 56 | 
METRICS = { 57 | 'time': time.time(), 58 | 'data': new_metrics 59 | } 60 | 61 | return [METRICS, LAST_METRICS] 62 | 63 | 64 | def get_value(name): 65 | """Return a value for the requested metric""" 66 | 67 | metrics = get_metrics()[0] 68 | 69 | name = name[len(NAME_PREFIX):] # remove prefix from name 70 | 71 | try: 72 | result = metrics['data'][name] 73 | except StandardError: 74 | result = 0 75 | 76 | return result 77 | 78 | 79 | def get_delta(name): 80 | """Return change over time for the requested metric""" 81 | 82 | # get metrics 83 | [curr_metrics, last_metrics] = get_metrics() 84 | 85 | parts = name.split("_") 86 | group = parts[0] 87 | metric = "_".join(parts[1:]) 88 | 89 | try: 90 | delta = (float(curr_metrics['data'][group][metric]) - float(last_metrics['data'][group][metric])) /(curr_metrics['time'] - last_metrics['time']) 91 | if delta < 0: 92 | print name + " is less 0" 93 | delta = 0 94 | except KeyError: 95 | delta = 0.0 96 | 97 | return delta 98 | 99 | 100 | def get_tcploss_percentage(name): 101 | 102 | # get metrics 103 | [curr_metrics, last_metrics] = get_metrics() 104 | 105 | try: 106 | pct = 100 * (float(curr_metrics['data']['tcpext']["tcploss"]) - float(last_metrics["data"]['tcpext']["tcploss"])) / (float(curr_metrics['data']['tcp']['outsegs']) + float(curr_metrics['data']['tcp']['insegs']) - float(last_metrics['data']['tcp']['insegs']) - float(last_metrics['data']['tcp']['outsegs'])) 107 | if pct < 0: 108 | print name + " is less 0" 109 | pct = 0 110 | except KeyError: 111 | pct = 0.0 112 | except ZeroDivisionError: 113 | pct = 0.0 114 | 115 | return pct 116 | 117 | def get_retrans_percentage(name): 118 | 119 | # get metrics 120 | [curr_metrics, last_metrics] = get_metrics() 121 | 122 | try: 123 | pct = 100 * (float(curr_metrics['data']['tcp']["retranssegs"]) - float(last_metrics['data']['tcp']["retranssegs"])) / (float(curr_metrics['data']['tcp']['outsegs']) + float(curr_metrics['data']['tcp']['insegs']) - float(last_metrics['data']['tcp']['insegs']) - float(last_metrics['data']['tcp']['outsegs'])) 124 | if pct < 0: 125 | print name + " is less 0" 126 | pct = 0 127 | except KeyError: 128 | pct = 0.0 129 | except ZeroDivisionError: 130 | pct = 0.0 131 | return pct 132 | 133 | 134 | def create_desc(skel, prop): 135 | d = skel.copy() 136 | for k,v in prop.iteritems(): 137 | d[k] = v 138 | return d 139 | 140 | def metric_init(params): 141 | global descriptors, metric_map, Desc_Skel 142 | 143 | descriptors = [] 144 | 145 | Desc_Skel = { 146 | 'name' : 'XXX', 147 | 'call_back' : get_delta, 148 | 'time_max' : 60, 149 | 'value_type' : 'float', 150 | 'format' : '%.5f', 151 | 'units' : 'count/s', 152 | 'slope' : 'both', # zero|positive|negative|both 153 | 'description' : 'XXX', 154 | 'groups' : 'XXX', 155 | } 156 | 157 | #################################################################################### 158 | # Let's figure out what metrics are available 159 | # 160 | # Read /proc/net/netstat 161 | #################################################################################### 162 | for file in stats_files: 163 | try: 164 | file = open(file, 'r') 165 | 166 | except IOError: 167 | return 0 168 | 169 | # Find mapping 170 | for line in file: 171 | # Lines with 172 | if not re.match("(.*): [0-9]", line): 173 | count = 0 174 | mapping = re.split("\s+", line) 175 | metric_group = mapping[0].replace(":", "").lower() 176 | stats_pos[metric_group] = dict() 177 | for metric in mapping: 178 | # Skip first 179 | if count > 0 and metric != "": 180 | lowercase_metric = metric.lower() 
181 | stats_pos[metric_group][count] = lowercase_metric 182 | count += 1 183 | 184 | file.close() 185 | 186 | for group in stats_pos: 187 | for item in stats_pos[group]: 188 | descriptors.append(create_desc(Desc_Skel, { 189 | "name" : group + "_" + stats_pos[group][item], 190 | "description": stats_pos[group][item], 191 | 'groups' : group 192 | })) 193 | 194 | descriptors.append(create_desc(Desc_Skel, { 195 | "name" : "tcpext_" + "tcploss_percentage", 196 | "call_back" : get_tcploss_percentage, 197 | "description": "TCP percentage loss, tcploss / insegs + outsegs", 198 | "units" : "pct", 199 | 'groups' : 'tcpext' 200 | })) 201 | 202 | descriptors.append(create_desc(Desc_Skel, { 203 | "name" : "tcp_" + "retrans_percentage", 204 | "call_back" : get_retrans_percentage, 205 | "description": "TCP retrans percentage, retranssegs / insegs + outsegs", 206 | "units" : "pct", 207 | 'groups' : 'tcp' 208 | })) 209 | 210 | return descriptors 211 | 212 | def metric_cleanup(): 213 | '''Clean up the metric module.''' 214 | pass 215 | 216 | #This code is for debugging and unit testing 217 | if __name__ == '__main__': 218 | descriptors = metric_init(PARAMS) 219 | while True: 220 | for d in descriptors: 221 | v = d['call_back'](d['name']) 222 | print '%s = %s' % (d['name'], v) 223 | print 'Sleeping 15 seconds' 224 | time.sleep(15) 225 | -------------------------------------------------------------------------------- /roles/ganglia/files/temg.sh: -------------------------------------------------------------------------------- 1 | SENSORS=/usr/bin/sensors 2 | GMETRIC=/usr/bin/gmetric 3 | 4 | let count=0 5 | sum=0.0 6 | for temp in $($SENSORS | grep "^Core" | grep -e '+.*C' | cut -f 2 -d '+' | cut -f 1 -d ' ' | sed 's/°C//'); do 7 | sum=$(echo $sum+$temp | bc) 8 | # echo $temp, $sum 9 | let count+=1 10 | done 11 | temp=$(echo "$sum/$count" | bc) 12 | 13 | $GMETRIC -t float -n "cpu_temp" -u "Celcius" -v $temp 14 | 15 | if [ $temp -gt 89 ]; then 16 | logger -t gangalia-monitor temperature_too_high: $temp 17 | fi 18 | 19 | -------------------------------------------------------------------------------- /roles/ganglia/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for ganglia 3 | - name: install necessary package for ganglia on ln node 4 | become: yes 5 | apt: 6 | name: "{{ item }}" 7 | state: present 8 | when: inventory_hostname in groups['ln'] 9 | with_items: 10 | - ganglia-monitor=3.6.0-7ubuntu2 11 | - ganglia-webfrontend 12 | - gmetad 13 | - ganglia-monitor-python=3.6.0-7ubuntu2 14 | - lm-sensors 15 | - name: install package for ganglia on cn nodes 16 | become: yes 17 | apt: 18 | name: "{{ item }}" 19 | state: present 20 | with_items: 21 | - ganglia-monitor=3.6.0-7ubuntu2 22 | - ganglia-monitor-python=3.6.0-7ubuntu2 23 | - lm-sensors 24 | when: inventory_hostname in groups['cn'] 25 | - name: hack on netstats.py 26 | become: yes 27 | copy: 28 | src: "{{ item }}" 29 | dest: "/usr/lib/ganglia/python_modules/{{ item }}" 30 | register: pymd 31 | with_items: 32 | - netstats.py 33 | - cpu_stats.py 34 | - name: config gmond in cn nodes 35 | become: yes 36 | template: 37 | src: gmond-cn.conf 38 | dest: /etc/ganglia/gmond.conf 39 | backup: yes 40 | when: inventory_hostname in groups['cn'] 41 | register: gmondc 42 | - name: ensure gmond is started 43 | become: yes 44 | service: 45 | name: ganglia-monitor 46 | state: started 47 | when: inventory_hostname in groups['cn'] 48 | - name: restart gmond in cn nodes 49 | become: yes 50 | service: 51 | name: 
ganglia-monitor 52 | state: restarted 53 | when: inventory_hostname in groups['cn'] and (gmondc.changed or pymd.changed) 54 | - name: config gmond in ln nodes 55 | become: yes 56 | template: 57 | src: gmond.conf 58 | dest: /etc/ganglia/gmond.conf 59 | backup: yes 60 | when: inventory_hostname in groups['ln'] 61 | register: gmondl 62 | - name: ensure gmond started in ln 63 | become: yes 64 | service: 65 | name: ganglia-monitor 66 | state: started 67 | enabled: yes 68 | when: inventory_hostname in groups['ln'] 69 | - name: restart gmond in ln node 70 | become: yes 71 | service: 72 | name: ganglia-monitor 73 | state: restarted 74 | when: inventory_hostname in groups['ln'] and (gmondl.changed or pymd.changed) 75 | - name: config gmetad in ln nodes 76 | become: yes 77 | template: 78 | src: gmetad.conf 79 | dest: /etc/ganglia/gmetad.conf 80 | when: inventory_hostname in groups['ln'] 81 | register: gmetadc 82 | - name: ensure gmetad started in ln 83 | become: yes 84 | service: 85 | name: gmetad 86 | state: started 87 | enabled: yes 88 | when: inventory_hostname in groups['ln'] 89 | - name: restart gmetad in ln node 90 | become: yes 91 | service: 92 | name: gmetad 93 | state: restarted 94 | when: inventory_hostname in groups['ln'] and gmetadc.changed 95 | - name: config web interface of ganglia 96 | become: yes 97 | template: 98 | src: ganglia.conf 99 | dest: /etc/apache2/sites-enabled/ganglia.conf 100 | when: inventory_hostname in groups['ln'] 101 | register: gweb 102 | - name: patch on ganglia webfront cluster view 103 | lineinfile: 104 | path: /usr/share/ganglia-webfrontend/cluster_view.php 105 | regexp: "context_metrics =" 106 | line: " $context_metrics = array();" 107 | become: yes 108 | when: inventory_hostname in groups['ln'] 109 | - name: install passlib for password protection on apache 110 | become: yes 111 | apt: 112 | name: python3-passlib 113 | state: present 114 | when: inventory_hostname in groups['ln'] 115 | - name: setup apache passwd 116 | htpasswd: 117 | path: /etc/apache2/.htpasswd 118 | name: "{{ ganglia_http_user }}" 119 | password: "{{ ganglia_http_pass }}" 120 | owner: root 121 | group: www-data 122 | mode: 0640 123 | become: yes 124 | when: inventory_hostname in groups['ln'] 125 | register: passwd 126 | - name: ensure apache is started 127 | become: yes 128 | service: 129 | name: apache2 130 | state: started 131 | enabled: yes 132 | when: inventory_hostname in groups['ln'] 133 | - name: restart apache2 134 | become: yes 135 | service: 136 | name: apache2 137 | state: restarted 138 | when: inventory_hostname in groups['ln'] and (gweb.changed or passwd.changed) 139 | - name: copy the temperature sensor script 140 | copy: 141 | src: temg.sh 142 | dest: "{{ lookup('env', 'HOME') }}/.temg.sh" 143 | when: inventory_hostname in groups['ln'] 144 | - name: add cpu temperature to crontab 145 | cron: 146 | job: "/bin/bash {{ lookup('env', 'HOME') }}/.temg.sh" 147 | name: "temperature monitoring" 148 | become: yes 149 | - name: copy gpu monitoring script 150 | copy: 151 | src: gpu.sh 152 | dest: "{{ lookup('env', 'HOME') }}/.gpu.sh" 153 | when: inventory_hostname in groups['ln'] 154 | - name: add gpu script to crontab 155 | become: yes 156 | when: inventory_hostname in groups['gn'] 157 | cron: 158 | minute: "*/2" 159 | job: "/bin/bash {{ lookup('env', 'HOME') }}/.gpu.sh" 160 | name: "gpu monitoring" 161 | - name: copy avail monitoring script 162 | template: 163 | src: avail-monitor.sh 164 | dest: "{{ lookup('env', 'HOME') }}/.avail-monitor.sh" 165 | when: inventory_hostname in 
groups['ln'] 166 | - name: add avail script to crontab on all nodes 167 | become: yes 168 | cron: 169 | minute: "*/3" 170 | job: "/bin/bash {{ lookup('env', 'HOME') }}/.avail-monitor.sh" 171 | name: "avail monitoring" 172 | - name: no mail from crontab 173 | become: yes 174 | cronvar: 175 | name: "MAILTO" 176 | value: '""' 177 | -------------------------------------------------------------------------------- /roles/ganglia/templates/avail-monitor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # refraction-ray @ AUG 2019 3 | # Work in Progress 4 | ############################ 5 | ## general variables 6 | hostname=$(hostname) 7 | tag="avail-monitor-warning" 8 | 9 | ## slurm nodes availability 10 | errornum=$(sinfo -N|grep "down\|alloc\*\|idle\*\|mix\*"|wc -l) 11 | if [ $errornum -ne 0 ]; then 12 | logger -t $tag "${errornum} nodes are down in slurm" 13 | fi 14 | 15 | ## wanip fixed check 16 | if [ "$hostname" == "{{ master_name }}" ]; then 17 | wanip=$(/sbin/ifconfig|grep {{ ln_wan_nic }} -A1|grep inet|awk '{print $2}') 18 | if [ "$wanip" != "{{ wan_ip }}" ]; then 19 | logger -t $tag "wan ip for master has changed to ${wanip}" 20 | fi 21 | fi 22 | 23 | ## filesystem mount check 24 | if [ "$hostname" == "{{ master_name}}" ]; then 25 | extnum=$(df -T|grep ext4|wc -l) 26 | if [ $extnum -ne {{ num_ext_ln }} ]; then 27 | logger -t $tag "ext4 mount is missing" 28 | fi 29 | elif [ "$hostname" == "c8" ]; then 30 | nfsnum=$(df -T|grep nfs4|wc -l) 31 | if [ $nfsnum -ne 3 ]; then 32 | logger -t $tag "nfs4 mount is missing" 33 | fi 34 | else 35 | nfsnum=$(df -T|grep nfs4|wc -l) 36 | if [ $nfsnum -ne {{ num_nfs_cn }} ]; then 37 | logger -t $tag "nfs4 mount is missing" 38 | fi 39 | fi 40 | 41 | ## load check on master 42 | if [ "$hostname" == "{{ master_name}}" ]; then 43 | l=$(uptime|awk '{print $12}') 44 | if (( $(echo "$l > 80" |bc -l) )); then 45 | logger -t $tag "the load on master cpu is too high: ${l}" 46 | fi 47 | fi 48 | 49 | ## check the disk usage 50 | st=$(df -HT|grep "/dev/sda2"|awk '{print $4}') 51 | stn=${st%?} 52 | if [ $stn -gt {{ disk_warning }} ]; then 53 | logger -t $tag "the disk usage is too much" 54 | fi 55 | 56 | ## check the memory usage on master 57 | if [ "$hostname" == "{{ master_name}}" ]; then 58 | mem=$(free|grep Mem|awk '{print $7}') 59 | if [ $mem -lt {{ memory_avail_warning }} ]; then 60 | logger -t $tag "available memory is draining on master!" 61 | fi 62 | fi 63 | 64 | ## check nics 65 | if [ "$hostname" == "{{ master_name }}" ]; then 66 | nonic=$(/sbin/ifconfig|grep flags|wc -l) 67 | if [ $nonic -ne {{ master_nic_no }} ]; then 68 | logger -t $tag "nics seem to be missing on master!" 69 | fi 70 | fi 71 | 72 | ## check zombie processes 73 | nozo=$(ps axo pid=,stat=|grep Z|wc -l) 74 | if [ $nozo -gt 1 ]; then 75 | logger -t $tag "there are several zombie processes! 
on ${hostname}" 76 | fi 77 | -------------------------------------------------------------------------------- /roles/ganglia/templates/ganglia.conf: -------------------------------------------------------------------------------- 1 | Alias /{{ ganglia_url }} /usr/share/ganglia-webfrontend 2 | 3 | 4 | AllowOverride All 5 | Order allow,deny 6 | Allow from all 7 | Deny from none 8 | AuthType Basic 9 | AuthName "Restricted Content" 10 | AuthUserFile /etc/apache2/.htpasswd 11 | Require valid-user 12 | 13 | 14 | -------------------------------------------------------------------------------- /roles/ganglia/templates/gmetad.conf: -------------------------------------------------------------------------------- 1 | # This is an example of a Ganglia Meta Daemon configuration file 2 | # http://ganglia.sourceforge.net/ 3 | # 4 | # 5 | #------------------------------------------------------------------------------- 6 | # Setting the debug_level to 1 will keep daemon in the forground and 7 | # show only error messages. Setting this value higher than 1 will make 8 | # gmetad output debugging information and stay in the foreground. 9 | # default: 0 10 | # debug_level 10 11 | # 12 | #------------------------------------------------------------------------------- 13 | # What to monitor. The most important section of this file. 14 | # 15 | # The data_source tag specifies either a cluster or a grid to 16 | # monitor. If we detect the source is a cluster, we will maintain a complete 17 | # set of RRD databases for it, which can be used to create historical 18 | # graphs of the metrics. If the source is a grid (it comes from another gmetad), 19 | # we will only maintain summary RRDs for it. 20 | # 21 | # Format: 22 | data_source "{{ cluster_name }}" 60 {{ master_name }} 23 | # 24 | # The keyword 'data_source' must immediately be followed by a unique 25 | # string which identifies the source, then an optional polling interval in 26 | # seconds. The source will be polled at this interval on average. 27 | # If the polling interval is omitted, 15sec is asssumed. 28 | # 29 | # If you choose to set the polling interval to something other than the default, 30 | # note that the web frontend determines a host as down if its TN value is less 31 | # than 4 * TMAX (20sec by default). Therefore, if you set the polling interval 32 | # to something around or greater than 80sec, this will cause the frontend to 33 | # incorrectly display hosts as down even though they are not. 34 | # 35 | # A list of machines which service the data source follows, in the 36 | # format ip:port, or name:port. If a port is not specified then 8649 37 | # (the default gmond port) is assumed. 38 | # default: There is no default value 39 | # 40 | # data_source "my cluster" 10 localhost my.machine.edu:8649 1.2.3.5:8655 41 | # data_source "my grid" 50 1.3.4.7:8655 grid.org:8651 grid-backup.org:8651 42 | # data_source "another source" 1.3.4.7:8655 1.3.4.8 43 | 44 | # data_source "my cluster" localhost 45 | 46 | # 47 | # Round-Robin Archives 48 | # You can specify custom Round-Robin archives here (defaults are listed below) 49 | # 50 | # Old Default RRA: Keep 1 hour of metrics at 15 second resolution. 1 day at 6 minute 51 | # RRAs "RRA:AVERAGE:0.5:1:244" "RRA:AVERAGE:0.5:24:244" "RRA:AVERAGE:0.5:168:244" "RRA:AVERAGE:0.5:672:244" \ 52 | # "RRA:AVERAGE:0.5:5760:374" 53 | # New Default RRA 54 | # Keep 5856 data points at 15 second resolution assuming 15 second (default) polling. 
That's 1 day 55 | # Two weeks of data points at 1 minute resolution (average) 56 | #RRAs "RRA:AVERAGE:0.5:1:5856" "RRA:AVERAGE:0.5:4:20160" "RRA:AVERAGE:0.5:40:52704" 57 | 58 | # 59 | #------------------------------------------------------------------------------- 60 | # Scalability mode. If on, we summarize over downstream grids, and respect 61 | # authority tags. If off, we take on 2.5.0-era behavior: we do not wrap our output 62 | # in tags, we ignore all tags we see, and always assume 63 | # we are the "authority" on data source feeds. This approach does not scale to 64 | # large groups of clusters, but is provided for backwards compatibility. 65 | # default: on 66 | # scalable off 67 | # 68 | #------------------------------------------------------------------------------- 69 | # The name of this Grid. All the data sources above will be wrapped in a GRID 70 | # tag with this name. 71 | # default: unspecified 72 | # gridname "MyGrid" 73 | # 74 | #------------------------------------------------------------------------------- 75 | # The authority URL for this grid. Used by other gmetads to locate graphs 76 | # for our data sources. Generally points to a ganglia/ 77 | # website on this machine. 78 | # default: "http://hostname/ganglia/", 79 | # where hostname is the name of this machine, as defined by gethostname(). 80 | # authority "http://mycluster.org/newprefix/" 81 | # 82 | #------------------------------------------------------------------------------- 83 | # List of machines this gmetad will share XML with. Localhost 84 | # is always trusted. 85 | # default: There is no default value 86 | # trusted_hosts 127.0.0.1 169.229.50.165 my.gmetad.org 87 | # 88 | #------------------------------------------------------------------------------- 89 | # If you want any host which connects to the gmetad XML to receive 90 | # data, then set this value to "on" 91 | # default: off 92 | # all_trusted on 93 | # 94 | #------------------------------------------------------------------------------- 95 | # If you don't want gmetad to setuid then set this to off 96 | # default: on 97 | # setuid off 98 | # 99 | #------------------------------------------------------------------------------- 100 | # User gmetad will setuid to (defaults to "nobody") 101 | # default: "nobody" 102 | # setuid_username "nobody" 103 | # 104 | #------------------------------------------------------------------------------- 105 | # Umask to apply to created rrd files and grid directory structure 106 | # default: 0 (files are public) 107 | # umask 022 108 | # 109 | #------------------------------------------------------------------------------- 110 | # The port gmetad will answer requests for XML 111 | # default: 8651 112 | # xml_port 8651 113 | # 114 | #------------------------------------------------------------------------------- 115 | # The port gmetad will answer queries for XML. This facility allows 116 | # simple subtree and summation views of the XML tree. 
117 | # default: 8652 118 | # interactive_port 8652 119 | # 120 | #------------------------------------------------------------------------------- 121 | # The number of threads answering XML requests 122 | # default: 4 123 | # server_threads 10 124 | # 125 | #------------------------------------------------------------------------------- 126 | # Where gmetad stores its round-robin databases 127 | # default: "/var/lib/ganglia/rrds" 128 | # rrd_rootdir "/some/other/place" 129 | # 130 | #------------------------------------------------------------------------------- 131 | # List of metric prefixes this gmetad will not summarize at cluster or grid level. 132 | # default: There is no default value 133 | # unsummarized_metrics diskstat CPU 134 | # 135 | #------------------------------------------------------------------------------- 136 | # In earlier versions of gmetad, hostnames were handled in a case 137 | # sensitive manner 138 | # If your hostname directories have been renamed to lower case, 139 | # set this option to 0 to disable backward compatibility. 140 | # From version 3.2, backwards compatibility will be disabled by default. 141 | # default: 1 (for gmetad < 3.2) 142 | # default: 0 (for gmetad >= 3.2) 143 | case_sensitive_hostnames 0 144 | 145 | #------------------------------------------------------------------------------- 146 | # It is now possible to export all the metrics collected by gmetad directly to 147 | # graphite by setting the following attributes. 148 | # 149 | # The hostname or IP address of the Graphite server 150 | # default: unspecified 151 | # carbon_server "my.graphite.box" 152 | # 153 | # The port and protocol on which Graphite is listening 154 | # default: 2003 155 | # carbon_port 2003 156 | # 157 | # default: tcp 158 | # carbon_protocol udp 159 | # 160 | # **Deprecated in favor of graphite_path** A prefix to prepend to the 161 | # metric names exported by gmetad. Graphite uses dot- 162 | # separated paths to organize and refer to metrics. 163 | # default: unspecified 164 | # graphite_prefix "datacenter1.gmetad" 165 | # 166 | # A user-definable graphite path. Graphite uses dot- 167 | # separated paths to organize and refer to metrics. 168 | # For reverse compatibility graphite_prefix will be prepended to this 169 | # path, but this behavior should be considered deprecated. 
170 | # This path may include 3 variables that will be replaced accordingly: 171 | # %s -> source (cluster name) 172 | # %h -> host (host name) 173 | # %m -> metric (metric name) 174 | # default: graphite_prefix.%s.%h.%m 175 | # graphite_path "datacenter1.gmetad.%s.%h.%m 176 | 177 | # Number of milliseconds gmetad will wait for a response from the graphite server 178 | # default: 500 179 | # carbon_timeout 500 180 | 181 | #------------------------------------------------------------------------------- 182 | # Memcached configuration (if it has been compiled in) 183 | # Format documentation at http://docs.libmemcached.org/libmemcached_configuration.html 184 | # default: "" 185 | # memcached_parameters "--SERVER=127.0.0.1" 186 | # 187 | 188 | -------------------------------------------------------------------------------- /roles/ganglia/templates/gmond-cn.conf: -------------------------------------------------------------------------------- 1 | /* This configuration is as close to 2.5.x default behavior as possible 2 | The values closely match ./gmond/metric.h definitions in 2.5.x */ 3 | globals { 4 | daemonize = yes 5 | setuid = yes 6 | user = ganglia 7 | debug_level = 0 8 | max_udp_msg_len = 1472 9 | mute = no 10 | deaf = yes 11 | host_dmax = 0 /*secs */ 12 | cleanup_threshold = 300 /*secs */ 13 | gexec = no 14 | send_metadata_interval = 30 15 | } 16 | 17 | /* If a cluster attribute is specified, then all gmond hosts are wrapped inside 18 | * of a tag. If you do not specify a cluster tag, then all will 19 | * NOT be wrapped inside of a tag. */ 20 | cluster { 21 | name = "{{ cluster_name }}" 22 | owner = "unspecified" 23 | latlong = "unspecified" 24 | url = "unspecified" 25 | } 26 | 27 | /* The host section describes attributes of the host, like the location */ 28 | host { 29 | location = "unspecified" 30 | } 31 | 32 | /* Feel free to specify as many udp_send_channels as you like. Gmond 33 | used to only support having a single channel */ 34 | udp_send_channel { 35 | host = {{ master_name }} 36 | port = 8649 37 | ttl = 1 38 | } 39 | /* You can specify as many udp_recv_channels as you like as well. */ 40 | 41 | /* You can specify as many tcp_accept_channels as you like to share 42 | an xml description of the state of the cluster */ 43 | tcp_accept_channel { 44 | port = 8649 45 | } 46 | 47 | /* Each metrics module that is referenced by gmond must be specified and 48 | loaded. If the module has been statically linked with gmond, it does not 49 | require a load path. However all dynamically loadable modules must include 50 | a load path. */ 51 | modules { 52 | module { 53 | name = "core_metrics" 54 | } 55 | module { 56 | name = "cpu_module" 57 | path = "/usr/lib/ganglia/modcpu.so" 58 | } 59 | module { 60 | name = "disk_module" 61 | path = "/usr/lib/ganglia/moddisk.so" 62 | } 63 | module { 64 | name = "load_module" 65 | path = "/usr/lib/ganglia/modload.so" 66 | } 67 | module { 68 | name = "mem_module" 69 | path = "/usr/lib/ganglia/modmem.so" 70 | } 71 | module { 72 | name = "net_module" 73 | path = "/usr/lib/ganglia/modnet.so" 74 | } 75 | module { 76 | name = "proc_module" 77 | path = "/usr/lib/ganglia/modproc.so" 78 | } 79 | module { 80 | name = "sys_module" 81 | path = "/usr/lib/ganglia/modsys.so" 82 | } 83 | } 84 | 85 | include ('/etc/ganglia/conf.d/*.conf') 86 | 87 | 88 | /* The old internal 2.5.x metric array has been replaced by the following 89 | collection_group directives. 
What follows is the default behavior for 90 | collecting and sending metrics that is as close to 2.5.x behavior as 91 | possible. */ 92 | 93 | /* This collection group will cause a heartbeat (or beacon) to be sent every 94 | 20 seconds. In the heartbeat is the GMOND_STARTED data which expresses 95 | the age of the running gmond. */ 96 | collection_group { 97 | collect_once = yes 98 | time_threshold = 20 99 | metric { 100 | name = "heartbeat" 101 | } 102 | } 103 | 104 | /* This collection group will send general info about this host every 1200 secs. 105 | This information doesn't change between reboots and is only collected once. */ 106 | collection_group { 107 | collect_once = yes 108 | time_threshold = 1200 109 | metric { 110 | name = "cpu_num" 111 | title = "CPU Count" 112 | } 113 | metric { 114 | name = "cpu_speed" 115 | title = "CPU Speed" 116 | } 117 | metric { 118 | name = "mem_total" 119 | title = "Memory Total" 120 | } 121 | /* Should this be here? Swap can be added/removed between reboots. */ 122 | metric { 123 | name = "swap_total" 124 | title = "Swap Space Total" 125 | } 126 | metric { 127 | name = "boottime" 128 | title = "Last Boot Time" 129 | } 130 | metric { 131 | name = "machine_type" 132 | title = "Machine Type" 133 | } 134 | metric { 135 | name = "os_name" 136 | title = "Operating System" 137 | } 138 | metric { 139 | name = "os_release" 140 | title = "Operating System Release" 141 | } 142 | metric { 143 | name = "location" 144 | title = "Location" 145 | } 146 | } 147 | 148 | /* This collection group will send the status of gexecd for this host every 300 secs */ 149 | /* Unlike 2.5.x the default behavior is to report gexecd OFF. */ 150 | collection_group { 151 | collect_once = yes 152 | time_threshold = 300 153 | metric { 154 | name = "gexec" 155 | title = "Gexec Status" 156 | } 157 | } 158 | 159 | /* This collection group will collect the CPU status info every 20 secs. 160 | The time threshold is set to 90 seconds. In honesty, this time_threshold could be 161 | set significantly higher to reduce unneccessary network chatter. */ 162 | collection_group { 163 | collect_every = 20 164 | time_threshold = 90 165 | /* CPU status */ 166 | metric { 167 | name = "cpu_user" 168 | value_threshold = "1.0" 169 | title = "CPU User" 170 | } 171 | metric { 172 | name = "cpu_system" 173 | value_threshold = "1.0" 174 | title = "CPU System" 175 | } 176 | metric { 177 | name = "cpu_idle" 178 | value_threshold = "5.0" 179 | title = "CPU Idle" 180 | } 181 | metric { 182 | name = "cpu_nice" 183 | value_threshold = "1.0" 184 | title = "CPU Nice" 185 | } 186 | metric { 187 | name = "cpu_aidle" 188 | value_threshold = "5.0" 189 | title = "CPU aidle" 190 | } 191 | metric { 192 | name = "cpu_wio" 193 | value_threshold = "1.0" 194 | title = "CPU wio" 195 | } 196 | /* The next two metrics are optional if you want more detail... 197 | ... since they are accounted for in cpu_system. 
198 | metric { 199 | name = "cpu_intr" 200 | value_threshold = "1.0" 201 | title = "CPU intr" 202 | } 203 | metric { 204 | name = "cpu_sintr" 205 | value_threshold = "1.0" 206 | title = "CPU sintr" 207 | } 208 | */ 209 | } 210 | 211 | collection_group { 212 | collect_every = 20 213 | time_threshold = 90 214 | /* Load Averages */ 215 | metric { 216 | name = "load_one" 217 | value_threshold = "1.0" 218 | title = "One Minute Load Average" 219 | } 220 | metric { 221 | name = "load_five" 222 | value_threshold = "1.0" 223 | title = "Five Minute Load Average" 224 | } 225 | metric { 226 | name = "load_fifteen" 227 | value_threshold = "1.0" 228 | title = "Fifteen Minute Load Average" 229 | } 230 | } 231 | 232 | /* This group collects the number of running and total processes */ 233 | collection_group { 234 | collect_every = 80 235 | time_threshold = 950 236 | metric { 237 | name = "proc_run" 238 | value_threshold = "1.0" 239 | title = "Total Running Processes" 240 | } 241 | metric { 242 | name = "proc_total" 243 | value_threshold = "1.0" 244 | title = "Total Processes" 245 | } 246 | } 247 | 248 | /* This collection group grabs the volatile memory metrics every 40 secs and 249 | sends them at least every 180 secs. This time_threshold can be increased 250 | significantly to reduce unneeded network traffic. */ 251 | collection_group { 252 | collect_every = 40 253 | time_threshold = 180 254 | metric { 255 | name = "mem_free" 256 | value_threshold = "1024.0" 257 | title = "Free Memory" 258 | } 259 | metric { 260 | name = "mem_shared" 261 | value_threshold = "1024.0" 262 | title = "Shared Memory" 263 | } 264 | metric { 265 | name = "mem_buffers" 266 | value_threshold = "1024.0" 267 | title = "Memory Buffers" 268 | } 269 | metric { 270 | name = "mem_cached" 271 | value_threshold = "1024.0" 272 | title = "Cached Memory" 273 | } 274 | metric { 275 | name = "swap_free" 276 | value_threshold = "1024.0" 277 | title = "Free Swap Space" 278 | } 279 | } 280 | 281 | collection_group { 282 | collect_every = 40 283 | time_threshold = 300 284 | metric { 285 | name = "bytes_out" 286 | value_threshold = 4096 287 | title = "Bytes Sent" 288 | } 289 | metric { 290 | name = "bytes_in" 291 | value_threshold = 4096 292 | title = "Bytes Received" 293 | } 294 | metric { 295 | name = "pkts_in" 296 | value_threshold = 256 297 | title = "Packets Received" 298 | } 299 | metric { 300 | name = "pkts_out" 301 | value_threshold = 256 302 | title = "Packets Sent" 303 | } 304 | } 305 | 306 | /* Different than 2.5.x default since the old config made no sense */ 307 | collection_group { 308 | collect_every = 1800 309 | time_threshold = 3600 310 | metric { 311 | name = "disk_total" 312 | value_threshold = 1.0 313 | title = "Total Disk Space" 314 | } 315 | } 316 | 317 | collection_group { 318 | collect_every = 40 319 | time_threshold = 180 320 | metric { 321 | name = "disk_free" 322 | value_threshold = 1.0 323 | title = "Disk Space Available" 324 | } 325 | metric { 326 | name = "part_max_used" 327 | value_threshold = 1.0 328 | title = "Maximum Disk Space Used" 329 | } 330 | } 331 | 332 | -------------------------------------------------------------------------------- /roles/ganglia/templates/gmond.conf: -------------------------------------------------------------------------------- 1 | /* This configuration is as close to 2.5.x default behavior as possible 2 | The values closely match ./gmond/metric.h definitions in 2.5.x */ 3 | globals { 4 | daemonize = yes 5 | setuid = yes 6 | user = ganglia 7 | debug_level = 0 8 | max_udp_msg_len = 1472 
9 | mute = no 10 | deaf = no 11 | host_dmax = 0 /*secs */ 12 | cleanup_threshold = 300 /*secs */ 13 | gexec = no 14 | send_metadata_interval = 30 15 | } 16 | 17 | /* If a cluster attribute is specified, then all gmond hosts are wrapped inside 18 | * of a tag. If you do not specify a cluster tag, then all will 19 | * NOT be wrapped inside of a tag. */ 20 | cluster { 21 | name = "{{ cluster_name }}" 22 | owner = "unspecified" 23 | latlong = "unspecified" 24 | url = "unspecified" 25 | } 26 | 27 | /* The host section describes attributes of the host, like the location */ 28 | host { 29 | location = "unspecified" 30 | } 31 | 32 | /* Feel free to specify as many udp_send_channels as you like. Gmond 33 | used to only support having a single channel */ 34 | udp_send_channel { 35 | host = {{ master_name }} 36 | port = 8649 37 | ttl = 1 38 | } 39 | 40 | 41 | /* You can specify as many udp_recv_channels as you like as well. */ 42 | udp_recv_channel { 43 | port = 8649 44 | } 45 | 46 | /* You can specify as many tcp_accept_channels as you like to share 47 | an xml description of the state of the cluster */ 48 | tcp_accept_channel { 49 | port = 8649 50 | } 51 | 52 | /* Each metrics module that is referenced by gmond must be specified and 53 | loaded. If the module has been statically linked with gmond, it does not 54 | require a load path. However all dynamically loadable modules must include 55 | a load path. */ 56 | modules { 57 | module { 58 | name = "core_metrics" 59 | } 60 | module { 61 | name = "cpu_module" 62 | path = "/usr/lib/ganglia/modcpu.so" 63 | } 64 | module { 65 | name = "disk_module" 66 | path = "/usr/lib/ganglia/moddisk.so" 67 | } 68 | module { 69 | name = "load_module" 70 | path = "/usr/lib/ganglia/modload.so" 71 | } 72 | module { 73 | name = "mem_module" 74 | path = "/usr/lib/ganglia/modmem.so" 75 | } 76 | module { 77 | name = "net_module" 78 | path = "/usr/lib/ganglia/modnet.so" 79 | } 80 | module { 81 | name = "proc_module" 82 | path = "/usr/lib/ganglia/modproc.so" 83 | } 84 | module { 85 | name = "sys_module" 86 | path = "/usr/lib/ganglia/modsys.so" 87 | } 88 | } 89 | 90 | include ('/etc/ganglia/conf.d/*.conf') 91 | 92 | 93 | /* The old internal 2.5.x metric array has been replaced by the following 94 | collection_group directives. What follows is the default behavior for 95 | collecting and sending metrics that is as close to 2.5.x behavior as 96 | possible. */ 97 | 98 | /* This collection group will cause a heartbeat (or beacon) to be sent every 99 | 20 seconds. In the heartbeat is the GMOND_STARTED data which expresses 100 | the age of the running gmond. */ 101 | collection_group { 102 | collect_once = yes 103 | time_threshold = 20 104 | metric { 105 | name = "heartbeat" 106 | } 107 | } 108 | 109 | /* This collection group will send general info about this host every 1200 secs. 110 | This information doesn't change between reboots and is only collected once. */ 111 | collection_group { 112 | collect_once = yes 113 | time_threshold = 1200 114 | metric { 115 | name = "cpu_num" 116 | title = "CPU Count" 117 | } 118 | metric { 119 | name = "cpu_speed" 120 | title = "CPU Speed" 121 | } 122 | metric { 123 | name = "mem_total" 124 | title = "Memory Total" 125 | } 126 | /* Should this be here? Swap can be added/removed between reboots. 
*/ 127 | metric { 128 | name = "swap_total" 129 | title = "Swap Space Total" 130 | } 131 | metric { 132 | name = "boottime" 133 | title = "Last Boot Time" 134 | } 135 | metric { 136 | name = "machine_type" 137 | title = "Machine Type" 138 | } 139 | metric { 140 | name = "os_name" 141 | title = "Operating System" 142 | } 143 | metric { 144 | name = "os_release" 145 | title = "Operating System Release" 146 | } 147 | metric { 148 | name = "location" 149 | title = "Location" 150 | } 151 | } 152 | 153 | /* This collection group will send the status of gexecd for this host every 300 secs */ 154 | /* Unlike 2.5.x the default behavior is to report gexecd OFF. */ 155 | collection_group { 156 | collect_once = yes 157 | time_threshold = 300 158 | metric { 159 | name = "gexec" 160 | title = "Gexec Status" 161 | } 162 | } 163 | 164 | /* This collection group will collect the CPU status info every 20 secs. 165 | The time threshold is set to 90 seconds. In honesty, this time_threshold could be 166 | set significantly higher to reduce unneccessary network chatter. */ 167 | collection_group { 168 | collect_every = 20 169 | time_threshold = 90 170 | /* CPU status */ 171 | metric { 172 | name = "cpu_user" 173 | value_threshold = "1.0" 174 | title = "CPU User" 175 | } 176 | metric { 177 | name = "cpu_system" 178 | value_threshold = "1.0" 179 | title = "CPU System" 180 | } 181 | metric { 182 | name = "cpu_idle" 183 | value_threshold = "5.0" 184 | title = "CPU Idle" 185 | } 186 | metric { 187 | name = "cpu_nice" 188 | value_threshold = "1.0" 189 | title = "CPU Nice" 190 | } 191 | metric { 192 | name = "cpu_aidle" 193 | value_threshold = "5.0" 194 | title = "CPU aidle" 195 | } 196 | metric { 197 | name = "cpu_wio" 198 | value_threshold = "1.0" 199 | title = "CPU wio" 200 | } 201 | /* The next two metrics are optional if you want more detail... 202 | ... since they are accounted for in cpu_system. 203 | metric { 204 | name = "cpu_intr" 205 | value_threshold = "1.0" 206 | title = "CPU intr" 207 | } 208 | metric { 209 | name = "cpu_sintr" 210 | value_threshold = "1.0" 211 | title = "CPU sintr" 212 | } 213 | */ 214 | } 215 | 216 | collection_group { 217 | collect_every = 20 218 | time_threshold = 90 219 | /* Load Averages */ 220 | metric { 221 | name = "load_one" 222 | value_threshold = "1.0" 223 | title = "One Minute Load Average" 224 | } 225 | metric { 226 | name = "load_five" 227 | value_threshold = "1.0" 228 | title = "Five Minute Load Average" 229 | } 230 | metric { 231 | name = "load_fifteen" 232 | value_threshold = "1.0" 233 | title = "Fifteen Minute Load Average" 234 | } 235 | } 236 | 237 | /* This group collects the number of running and total processes */ 238 | collection_group { 239 | collect_every = 80 240 | time_threshold = 950 241 | metric { 242 | name = "proc_run" 243 | value_threshold = "1.0" 244 | title = "Total Running Processes" 245 | } 246 | metric { 247 | name = "proc_total" 248 | value_threshold = "1.0" 249 | title = "Total Processes" 250 | } 251 | } 252 | 253 | /* This collection group grabs the volatile memory metrics every 40 secs and 254 | sends them at least every 180 secs. This time_threshold can be increased 255 | significantly to reduce unneeded network traffic. 
*/ 256 | collection_group { 257 | collect_every = 40 258 | time_threshold = 180 259 | metric { 260 | name = "mem_free" 261 | value_threshold = "1024.0" 262 | title = "Free Memory" 263 | } 264 | metric { 265 | name = "mem_shared" 266 | value_threshold = "1024.0" 267 | title = "Shared Memory" 268 | } 269 | metric { 270 | name = "mem_buffers" 271 | value_threshold = "1024.0" 272 | title = "Memory Buffers" 273 | } 274 | metric { 275 | name = "mem_cached" 276 | value_threshold = "1024.0" 277 | title = "Cached Memory" 278 | } 279 | metric { 280 | name = "swap_free" 281 | value_threshold = "1024.0" 282 | title = "Free Swap Space" 283 | } 284 | } 285 | 286 | collection_group { 287 | collect_every = 40 288 | time_threshold = 300 289 | metric { 290 | name = "bytes_out" 291 | value_threshold = 4096 292 | title = "Bytes Sent" 293 | } 294 | metric { 295 | name = "bytes_in" 296 | value_threshold = 4096 297 | title = "Bytes Received" 298 | } 299 | metric { 300 | name = "pkts_in" 301 | value_threshold = 256 302 | title = "Packets Received" 303 | } 304 | metric { 305 | name = "pkts_out" 306 | value_threshold = 256 307 | title = "Packets Sent" 308 | } 309 | } 310 | 311 | /* Different than 2.5.x default since the old config made no sense */ 312 | collection_group { 313 | collect_every = 1800 314 | time_threshold = 3600 315 | metric { 316 | name = "disk_total" 317 | value_threshold = 1.0 318 | title = "Total Disk Space" 319 | } 320 | } 321 | 322 | collection_group { 323 | collect_every = 40 324 | time_threshold = 180 325 | metric { 326 | name = "disk_free" 327 | value_threshold = 1.0 328 | title = "Disk Space Available" 329 | } 330 | metric { 331 | name = "part_max_used" 332 | value_threshold = 1.0 333 | title = "Maximum Disk Space Used" 334 | } 335 | } 336 | 337 | -------------------------------------------------------------------------------- /roles/mpi/README.md: -------------------------------------------------------------------------------- 1 | Mpi 2 | ========= 3 | 4 | This role is designed to install system wide mpi library. 5 | 6 | 7 | Role Variables 8 | -------------- 9 | 10 | See defaults/main.yml. The default packages is openmpi. You could change to other mpi implementations if you need. -------------------------------------------------------------------------------- /roles/mpi/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for mpi 3 | mpi_packages: 4 | - libopenmpi-dev=2.1.1-8 5 | - openmpi-bin=2.1.1-8 6 | - g++ 7 | # - libboost-all-dev 8 | -------------------------------------------------------------------------------- /roles/mpi/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for mpi 3 | - name: install necessary packages for openmpi 4 | become: yes 5 | apt: 6 | name: "{{ item }}" 7 | state: present 8 | with_items: "{{ mpi_packages }}" 9 | - name: change openmpi config 10 | become: yes 11 | lineinfile: 12 | path: /etc/openmpi/openmpi-mca-params.conf 13 | regexp: "^btl_base_warn_component_unused = 0" 14 | line: "btl_base_warn_component_unused = 0" 15 | -------------------------------------------------------------------------------- /roles/network/README.md: -------------------------------------------------------------------------------- 1 | Network 2 | ========= 3 | 4 | This role is designed to configure the network of the cluster including master as dns and dhcp server in LAN, NAT enable network on compute nodes and some proxy settings. 
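For orientation, the NAT part of this role boils down to enabling IP forwarding and adding an SNAT rule on the login node. A minimal shell sketch of the equivalent manual commands, with placeholder addresses standing in for the real `ip_range`/`mask`/`wan_ip` group variables:

```
# Enable forwarding (the role persists this via /etc/sysctl.conf)
sysctl -w net.ipv4.ip_forward=1
# SNAT traffic leaving the cluster LAN through the login node's WAN address
# (192.168.1.0/24 and 10.0.0.2 are examples only)
iptables -t nat -A POSTROUTING -s 192.168.1.0/24 ! -d 192.168.1.0/24 -j SNAT --to-source 10.0.0.2
```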
5 | 6 | Requirements 7 | ------------ 8 | 9 | No requirement as long as admin user account are consistent across machines. It is actually the first role to run, which make the cluster accessible and network connected. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | See defaults/main.yml. You need to specify set_proxy to no, if your system is not air-gapped and free to connect to the internet. 15 | 16 | Templates and Files 17 | -------------- 18 | sources.list in files dir changes the default apt source to a mirror. You may want to change this behavior depending on your own network conditions. 19 | 20 | hosts in templates dir has some extra host items. You may want to delete or change this depending on your system. -------------------------------------------------------------------------------- /roles/network/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for network 3 | set_proxy: yes 4 | use_tinc: no 5 | tinc_if: tinc 6 | mtu: 8500 # reason for non 9000: see https://www.intel.com/content/www/us/en/design/products-and-solutions/networking-and-io/ethernet-connection-i218/technical-library.html?grouping=rdc%20Content%20Types&sort=title:asc specification update: point 3. 7 | -------------------------------------------------------------------------------- /roles/network/files/20auto-upgrades: -------------------------------------------------------------------------------- 1 | APT::Periodic::Update-Package-Lists "0"; 2 | APT::Periodic::Unattended-Upgrade "0"; 3 | -------------------------------------------------------------------------------- /roles/network/files/sources.list: -------------------------------------------------------------------------------- 1 | # 默认注释了源码镜像以提高 apt update 速度,如有需要可自行取消注释 2 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic main restricted universe multiverse 3 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic main restricted universe multiverse 4 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-updates main restricted universe multiverse 5 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-updates main restricted universe multiverse 6 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-backports main restricted universe multiverse 7 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-backports main restricted universe multiverse 8 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-security main restricted universe multiverse 9 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-security main restricted universe multiverse 10 | 11 | # 预发布软件源,不建议启用 12 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-proposed main restricted universe multiverse 13 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-proposed main restricted universe multiverse 14 | -------------------------------------------------------------------------------- /roles/network/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for network 3 | - name: update host on ln node 4 | become: yes 5 | template: 6 | backup: yes 7 | owner: root 8 | src: hosts 9 | dest: /etc/hosts 10 | when: inventory_hostname in groups['ln'] 11 | - name: gather facts after host is specified 12 | setup: 13 | - name: net plan on ln node 14 | template: 15 | backup: yes 16 | owner: root 17 | src: ../templates/60-config.yaml 18 | dest: /etc/netplan/60-config.yaml 19 | register: 
lnnetplan 20 | become: yes 21 | when: inventory_hostname in groups['ln'] 22 | - name: netplan apply 23 | become: yes 24 | when: inventory_hostname in groups['ln'] and lnnetplan.changed 25 | command: netplan apply 26 | - name: change proxy of apt 27 | become: yes 28 | template: 29 | src: apt.conf 30 | dest: /etc/apt/apt.conf 31 | backup: yes 32 | when: set_proxy 33 | - name: change the apt source mirror to tuna 34 | become: yes 35 | copy: 36 | src: sources.list 37 | dest: /etc/apt/sources.list 38 | owner: root 39 | backup: yes 40 | - name: turn off unattended updates 41 | become: yes 42 | copy: 43 | src: 20auto-upgrades 44 | dest: /etc/apt/apt.conf.d/20auto-upgrades 45 | - name: dnsmasq install 46 | become: yes 47 | apt: 48 | update_cache: yes 49 | name: dnsmasq 50 | state: present 51 | when: inventory_hostname in groups['ln'] 52 | - name: dnsmasq config 53 | become: yes 54 | template: 55 | owner: root 56 | src: ../templates/dnsmasq.conf 57 | dest: /etc/dnsmasq.conf 58 | backup: yes 59 | register: lndnsmasq 60 | when: inventory_hostname in groups['ln'] 61 | - name: dnsmasq host 62 | become: yes 63 | template: 64 | backup: yes 65 | src: map.hosts 66 | dest: /etc/dnsmasq.d/map.hosts 67 | when: inventory_hostname in groups['ln'] 68 | register: maphost 69 | - name: ensure dnsmasq service is started 70 | become: yes 71 | service: 72 | name: dnsmasq 73 | state: started 74 | when: inventory_hostname in groups['ln'] 75 | - name: dnsmasq service restart 76 | become: yes 77 | service: 78 | name: dnsmasq 79 | state: restarted 80 | when: inventory_hostname in groups['ln'] and (lndnsmasq.changed or maphost.changed) 81 | - name: enable ip forward on ln nodes 82 | become: yes 83 | lineinfile: 84 | path: /etc/sysctl.conf 85 | regexp: "net.ipv4.ip_forward" 86 | line: "net.ipv4.ip_forward=1" 87 | backup: yes 88 | register: lnforward 89 | when: inventory_hostname in groups['ln'] 90 | - name: sysctl the ip forward feature 91 | become: yes 92 | command: sysctl -p /etc/sysctl.conf 93 | when: inventory_hostname in groups['ln'] and lnforward.changed 94 | - name: iptables for snat on tinc part 95 | when: inventory_hostname in groups['ln'] and use_tinc 96 | iptables: 97 | table: nat 98 | chain: POSTROUTING 99 | destination: "!{{ ip_range }}/{{ mask }}" 100 | source: "{{ ip_range }}/{{ mask }}" 101 | jump: SNAT 102 | out_interface: "{{ tinc_if }}" 103 | to_source: "{{ tinc_ip }}" 104 | become: yes 105 | - name: iptables for snat on ln nodes 106 | iptables: 107 | table: nat 108 | chain: POSTROUTING 109 | destination: "!{{ ip_range }}/{{ mask }}" 110 | source: "{{ ip_range }}/{{ mask }}" 111 | jump: SNAT 112 | to_source: "{{ wan_ip }}" 113 | become: yes 114 | when: inventory_hostname in groups['ln'] 115 | - name: netplan config on cn nodes 116 | become: yes 117 | when: inventory_hostname in groups['cn'] 118 | template: 119 | backup: yes 120 | owner: root 121 | src: ../templates/70-config.yaml 122 | dest: /etc/netplan/70-config.yaml 123 | register: cnnetplan 124 | - name: net restart on cn nodes 125 | become: yes 126 | when: inventory_hostname in groups['cn'] and cnnetplan.changed 127 | command: netplan apply 128 | - name: ensure the hostname is consistent with name in hosts 129 | become: yes 130 | hostname: 131 | name: "{{ inventory_hostname }}" 132 | when: inventory_hostname in groups['cn'] 133 | - name: disable cloud init reverting hostname when rebooting 134 | become: yes 135 | lineinfile: 136 | path: /etc/cloud/cloud.cfg 137 | regexp: "preserve_hostname:" 138 | line: "preserve_hostname: true" 139 | backup: yes 140 
| - name: ensure the hostname of master 141 | become: yes 142 | hostname: 143 | name: "{{ master_name }}" 144 | when: inventory_hostname in groups['ln'] 145 | - name: refresh the host file 146 | become: yes 147 | template: 148 | src: hosts 149 | dest: /etc/hosts 150 | owner: root 151 | backup: yes 152 | - name: copy proxy-set 153 | template: 154 | src: proxy-set 155 | dest: /etc/proxy-set 156 | become: yes 157 | when: set_proxy 158 | - name: add proxy to profile 159 | lineinfile: 160 | path: /etc/profile 161 | regexp: "#add proxy$" 162 | line: "source /etc/proxy-set #add proxy" 163 | become: yes 164 | when: set_proxy 165 | - name: enable jumbo frame on ln 166 | become: yes 167 | command: "ip link set {{ ln_lan_nic }} mtu {{ mtu }}" 168 | when: inventory_hostname in groups['ln'] 169 | - name: enable jumbo frame on cn 170 | become: yes 171 | command: "ip link set {{ hostvars[inventory_hostname]['nic'] | default(cn_default_nic) }} mtu {{ mtu }}" 172 | when: inventory_hostname in groups['cn'] 173 | -------------------------------------------------------------------------------- /roles/network/templates/60-config.yaml: -------------------------------------------------------------------------------- 1 | network: 2 | version: 2 3 | renderer: networkd 4 | ethernets: 5 | {{ ln_wan_nic }}: 6 | addresses: 7 | - {{ wan_ip|indent(1,true) }}/{{ wan_mask }} 8 | gateway4: {{ wan_gateway }} 9 | dhcp4: false 10 | nameservers: 11 | addresses: {{ dns_server|to_yaml }} 12 | {{ ln_lan_nic }}: 13 | addresses: 14 | - {{ master_ip|indent(1,true) }}/{{ mask }} 15 | mtu: {{ mtu }} 16 | dhcp4: false 17 | match: 18 | macaddress: {{ ansible_facts[ln_lan_nic]['macaddress'] }} 19 | -------------------------------------------------------------------------------- /roles/network/templates/70-config.yaml: -------------------------------------------------------------------------------- 1 | network: 2 | version: 2 3 | renderer: networkd 4 | ethernets: 5 | {{ hostvars[inventory_hostname]['nic'] | default(cn_default_nic) }}: 6 | dhcp4: yes 7 | gateway4: {{ master_ip }} 8 | mtu: {{ mtu }} 9 | -------------------------------------------------------------------------------- /roles/network/templates/apt.conf: -------------------------------------------------------------------------------- 1 | Acquire::http::Proxy "{{ env_vars['http_proxy'] }}"; 2 | Acquire::https::Proxy "{{ env_vars['https_proxy'] }}"; 3 | -------------------------------------------------------------------------------- /roles/network/templates/dnsmasq.conf: -------------------------------------------------------------------------------- 1 | # Tell any system-wide dnsmasq instance to make sure to bind to interfaces 2 | # instead of listening on 0.0.0.0 3 | interface={{ ln_lan_nic }} 4 | bind-interfaces 5 | # except-interface=lxdbr0 6 | dhcp-range={{ dhcp_start_ip }},{{ dhcp_end_ip }} 7 | dhcp-no-override 8 | dhcp-hostsfile=/etc/dnsmasq.d/map.hosts 9 | log-dhcp 10 | domain={{ cluster_domain }} 11 | expand-hosts 12 | -------------------------------------------------------------------------------- /roles/network/templates/hosts: -------------------------------------------------------------------------------- 1 | 127.0.0.1 localhost.localdomain localhost 2 | ::1 localhost6.localdomain6 localhost6 3 | 4 | # The following lines are desirable for IPv6 capable hosts 5 | ::1 localhost ip6-localhost ip6-loopback 6 | fe00::0 ip6-localnet 7 | ff02::1 ip6-allnodes 8 | ff02::2 ip6-allrouters 9 | ff02::3 ip6-allhosts 10 | 11 | # hostnames in the cluster 12 | 13 | {{ master_ip }} {{ 
master_name }} 14 | 15 | {% for host in groups['cn'] %} 16 | {{ hostvars[host]['ip'] }} {{ host }} 17 | {% endfor %} 18 | 19 | # other hostnames 20 | -------------------------------------------------------------------------------- /roles/network/templates/map.hosts: -------------------------------------------------------------------------------- 1 | {% for h in groups['cn'] %} 2 | dhcp-host={{ hostvars[h]['mac'] }},{{ hostvars[h]['ip'] }},{{ h }},infinite 3 | {% endfor %} 4 | {% for h in groups['cn'] %} 5 | {% if 'idrac' in hostvars[h] %} 6 | dhcp-host={{ hostvars[h]['idrac'] }},{{ hostvars[h]['idracip'] }},{{ h+"-idrac" }},infinite 7 | {% endif %} 8 | {% endfor %} 9 | -------------------------------------------------------------------------------- /roles/network/templates/proxy-set: -------------------------------------------------------------------------------- 1 | {% if set_proxy %} 2 | export http_proxy={{ env_vars['http_proxy'] }} 3 | export https_proxy={{ env_vars['https_proxy'] }} 4 | export ftp_proxy={{ env_vars['ftp_proxy'] }} 5 | {% endif %} 6 | -------------------------------------------------------------------------------- /roles/python/README.md: -------------------------------------------------------------------------------- 1 | Python 2 | ========= 3 | 4 | This role is designed to set up a consistent Python environment. 5 | 6 | Requirements 7 | ------------ 8 | 9 | You should first install Intel Parallel Studio with Intel Python if you want to enable the last two tasks in the playbook. The general workflow here is to make Intel Python the default Python for numerical calculations. If that is not what you want, review the tasks carefully and delete the irrelevant ones. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | See defaults/main.yml. You need to specify spack_path if you want Intel Python to automatically pick up pip packages installed under spack. 15 | 16 | Templates and Files 17 | -------------- 18 | pip.conf in the files dir changes the default PyPI URL to a mirror. You may want to change this behavior depending on your network conditions.
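As a quick sanity check after the role runs: the rendered spack.pth is a single path line dropped into Intel Python's site-packages (the prefix below is hypothetical; the real one comes from the registered `spack location -i python@3.6` output), and assuming Intel Python lives under /opt/intel/intelpython3 as the tasks expect, you can confirm it is picked up like this:

```
# Example of a rendered spack.pth (hypothetical spack install prefix):
#   /home/ubuntu/spack/opt/spack/linux-ubuntu18.04-x86_64/gcc-7.4.0/python-3.6.5-abc1234/lib/python3.6/site-packages
# Verify that Intel Python now searches spack's site-packages:
/opt/intel/intelpython3/bin/python3 -c "import sys; print([p for p in sys.path if 'spack' in p])"
```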
-------------------------------------------------------------------------------- /roles/python/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for python 3 | spack_path: "/home/ubuntu/spack" 4 | -------------------------------------------------------------------------------- /roles/python/files/pip.conf: -------------------------------------------------------------------------------- 1 | [global] 2 | index-url=https://pypi.tuna.tsinghua.edu.cn/simple 3 | -------------------------------------------------------------------------------- /roles/python/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for python 3 | - name: create .pip dir 4 | file: 5 | path: "{{ lookup('env', 'HOME')+'/.pip' }}" 6 | state: directory 7 | when: inventory_hostname in groups['ln'] 8 | - name: pip config in etc 9 | copy: 10 | src: pip.conf 11 | dest: /etc/pip.conf 12 | backup: yes 13 | become: yes 14 | when: inventory_hostname in groups['ln'] 15 | - name: pip config in home 16 | copy: 17 | src: pip.conf 18 | dest: "{{ lookup('env', 'HOME')+'/.pip/pip.conf' }}" 19 | backup: yes 20 | when: inventory_hostname in groups['ln'] 21 | ## the following steps are optional 22 | - name: find spack py path 23 | shell: "source {{spack_path}}/share/spack/setup-env.sh&&spack location -i python@3.6" 24 | args: 25 | executable: /bin/bash 26 | when: inventory_hostname in groups['ln'] 27 | register: python_location 28 | - name: pythonpath add for intelpython3 29 | become: yes 30 | template: 31 | src: spack.pth 32 | dest: /opt/intel/intelpython3/lib/python3.6/site-packages/spack.pth 33 | when: inventory_hostname in groups['ln'] 34 | #- name: change the folder permission 35 | # command: chmod -c -R u=rw,go=r,a-x+X "{{item}}" 36 | # with_items: 37 | # - "{{ lookup('env', 'HOME')+'/.local' }}" 38 | # when: inventory_hostname in groups['ln'] 39 | -------------------------------------------------------------------------------- /roles/python/templates/home.pth: -------------------------------------------------------------------------------- 1 | {{ lookup("env", "HOME")+"/.local/lib/python3.6/site-packages" }} 2 | -------------------------------------------------------------------------------- /roles/python/templates/spack.pth: -------------------------------------------------------------------------------- 1 | {{ python_location.stdout }}/lib/python3.6/site-packages 2 | -------------------------------------------------------------------------------- /roles/restic/README.md: -------------------------------------------------------------------------------- 1 | Restic 2 | ========= 3 | 4 | This role is designed to configure restic for backups. 5 | 6 | Requirements 7 | ------------ 8 | 9 | No explicit dependence and requirements, as long as the restic repo path exists. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | See defaults/main.yml. 15 | 16 | Templates and Files 17 | -------------- 18 | 19 | ignorefile is the file containing paths to be excluded when backup the whole system /. 
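Once the role has run, backups can be inspected or restored with the stock restic CLI pointed at the repository from defaults/main.yml, for example:

```
# List snapshots in the repository created by this role
export RESTIC_REPOSITORY=/BACKUP
export RESTIC_PASSWORD='<repo password from defaults/main.yml>'
restic snapshots
# Restore the latest /home snapshot into a scratch directory for inspection
restic restore latest --path /home --target /tmp/restore-check
```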
20 | -------------------------------------------------------------------------------- /roles/restic/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for restic 3 | restic_repos: 4 | - path: "/BACKUP" 5 | pass: "123456notwell" 6 | init: True 7 | default_repo: "/BACKUP" 8 | default_pass: "123456notwell" 9 | backup_dirs: 10 | - path: "/home" 11 | hour: "3" 12 | minute: "0" 13 | ignorefile: yes 14 | -------------------------------------------------------------------------------- /roles/restic/files/ignorefile: -------------------------------------------------------------------------------- 1 | /tmp/* 2 | /dev/* 3 | /DATA* 4 | /BACKUP 5 | /run/* 6 | /proc/* 7 | /swap.img 8 | /lost+found/* 9 | /mnt/* 10 | /home/* 11 | /opt/* 12 | -------------------------------------------------------------------------------- /roles/restic/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for restic 3 | - name: install restic by apt 4 | apt: 5 | name: restic 6 | state: present 7 | when: inventory_hostname in groups['ln'] 8 | - name: init repos 9 | environment: 10 | RESTIC_REPOSITORY: "{{ item.path }}" 11 | RESTIC_PASSWORD: "{{ item.pass }}" 12 | register: re_init 13 | command: "/usr/bin/restic init" 14 | changed_when: "'created restic backend' in re_init.stdout" 15 | failed_when: re_init.rc != 0 and not 'config file already exists' in re_init.stderr 16 | with_items: "{{ restic_repos }}" 17 | when: item.init and inventory_hostname in groups['ln'] 18 | - name: copy ignore files if any 19 | become: yes 20 | copy: 21 | src: ignorefile 22 | dest: /BACKUP/ignorefile 23 | when: inventory_hostname in groups['ln'] and ignorefile 24 | - name: add crontabs for backup 25 | become: yes 26 | cron: 27 | minute: "{{ item.minute }}" 28 | hour: "{{ item.hour }}" 29 | name: "backup {{ item.path }}" 30 | job: "RESTIC_REPOSITORY='{{ item.repo|default(default_repo) }}' RESTIC_PASSWORD='{{ item.pass|default(default_pass) }}' /usr/bin/restic backup {{ item.path }} {{ item.extras | default('') }}" 31 | with_items: "{{ backup_dirs }}" 32 | when: inventory_hostname in groups['ln'] 33 | - name: prune crontab 34 | become: yes 35 | cron: 36 | minute: "10" 37 | hour: "7" 38 | name: "prune backups {{ item.path }}" 39 | job: "RESTIC_REPOSITORY='{{ item.path }}' RESTIC_PASSWORD='{{ item.pass }}' /usr/bin/restic forget --keep-last 1 --keep-daily 7 --keep-weekly 4 --keep-monthly 3 --prune" 40 | with_items: "{{ restic_repos }}" 41 | when: inventory_hostname in groups['ln'] 42 | -------------------------------------------------------------------------------- /roles/slurm/README.md: -------------------------------------------------------------------------------- 1 | Slurm 2 | ========= 3 | 4 | This role is designed to set up the whole slurm service, including slurmctld slurmd and slurmdbd from scratch. It also enable the pam module which deny user ssh to compute node when there is no job of them there. 5 | 6 | Role Variables 7 | -------------- 8 | 9 | See defaults/main.yml. 10 | 11 | Templates and Files 12 | -------------- 13 | 14 | slurm.conf in templates dir may need further review to meet your specifc needs. For example, the default conf only has one partition, you may want to add more partitions or change Weight of nodes as you like. The current conf also includes the master node as a compute node, too. You may also want to exclude it. 
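For instance (the partition name and node list here are purely illustrative, not part of the shipped template), adding another partition is a one-line change in slurm.conf:

```
PartitionName=long Nodes=c[1-4] MaxTime=7-00:00:00 PriorityJobFactor=1000 Default=NO State=UP
```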
15 | 16 | Besides, pay special attention on the config path if your OS is not Ubuntu 18.04, the path may vary for different distributions. And you need to change path relevant confs by hands. 17 | 18 | Notes 19 | ------------ 20 | 21 | After running this playbook, you should add the cluster, account and users by sacctmgr by hand directly. This is designed for the flexibility on user management of slurm users including priority and qos. 22 | -------------------------------------------------------------------------------- /roles/slurm/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for slurm 3 | db_user: slurm 4 | slurm_user: slurm 5 | db_pass: 123456notgood 6 | ctldhost: 7 | - master 8 | - c8 9 | dbdhost: master 10 | slurm_mail: slurm@localhost.domain 11 | slurm_spool_path: /var/spool/slurmd 12 | -------------------------------------------------------------------------------- /roles/slurm/files/access.conf: -------------------------------------------------------------------------------- 1 | # Login access control table. 2 | # 3 | # Comment line must start with "#", no space at front. 4 | # Order of lines is important. 5 | # 6 | # When someone logs in, the table is scanned for the first entry that 7 | # matches the (user, host) combination, or, in case of non-networked 8 | # logins, the first entry that matches the (user, tty) combination. The 9 | # permissions field of that table entry determines whether the login will 10 | # be accepted or refused. 11 | # 12 | # Format of the login access control table is three fields separated by a 13 | # ":" character: 14 | # 15 | # [Note, if you supply a 'fieldsep=|' argument to the pam_access.so 16 | # module, you can change the field separation character to be 17 | # '|'. This is useful for configurations where you are trying to use 18 | # pam_access with X applications that provide PAM_TTY values that are 19 | # the display variable like "host:0".] 20 | # 21 | # permission : users : origins 22 | # 23 | # The first field should be a "+" (access granted) or "-" (access denied) 24 | # character. 25 | # 26 | # The second field should be a list of one or more login names, group 27 | # names, or ALL (always matches). A pattern of the form user@host is 28 | # matched when the login name matches the "user" part, and when the 29 | # "host" part matches the local machine name. 30 | # 31 | # The third field should be a list of one or more tty names (for 32 | # non-networked logins), host names, domain names (begin with "."), host 33 | # addresses, internet network numbers (end with "."), ALL (always 34 | # matches), NONE (matches no tty on non-networked logins) or 35 | # LOCAL (matches any string that does not contain a "." character). 36 | # 37 | # You can use @netgroupname in host or user patterns; this even works 38 | # for @usergroup@@hostgroup patterns. 39 | # 40 | # The EXCEPT operator makes it possible to write very compact rules. 41 | # 42 | # The group file is searched only when a name does not match that of the 43 | # logged-in user. Both the user's primary group is matched, as well as 44 | # groups in which users are explicitly listed. 45 | # To avoid problems with accounts, which have the same name as a group, 46 | # you can use brackets around group names '(group)' to differentiate. 47 | # In this case, you should also set the "nodefgroup" option. 48 | # 49 | # TTY NAMES: Must be in the form returned by ttyname(3) less the initial 50 | # "/dev" (e.g. 
tty1 or vc/1) 51 | # 52 | ############################################################################## 53 | # 54 | # Disallow non-root logins on tty1 55 | # 56 | #-:ALL EXCEPT root:tty1 57 | # 58 | # Disallow console logins to all but a few accounts. 59 | # 60 | #-:ALL EXCEPT wheel shutdown sync:LOCAL 61 | # 62 | # Same, but make sure that really the group wheel and not the user 63 | # wheel is used (use nodefgroup argument, too): 64 | # 65 | #-:ALL EXCEPT (wheel) shutdown sync:LOCAL 66 | # 67 | # Disallow non-local logins to privileged accounts (group wheel). 68 | # 69 | #-:wheel:ALL EXCEPT LOCAL .win.tue.nl 70 | # 71 | # Some accounts are not allowed to login from anywhere: 72 | # 73 | #-:wsbscaro wsbsecr wsbspac wsbsym wscosor wstaiwde:ALL 74 | # 75 | # All other accounts are allowed to login from anywhere. 76 | # 77 | ############################################################################## 78 | # All lines from here up to the end are building a more complex example. 79 | ############################################################################## 80 | # 81 | # User "root" should be allowed to get access via cron .. tty5 tty6. 82 | #+ : root : cron crond :0 tty1 tty2 tty3 tty4 tty5 tty6 83 | # 84 | # User "root" should be allowed to get access from hosts with ip addresses. 85 | #+ : root : 192.168.200.1 192.168.200.4 192.168.200.9 86 | #+ : root : 127.0.0.1 87 | # 88 | # User "root" should get access from network 192.168.201. 89 | # This term will be evaluated by string matching. 90 | # comment: It might be better to use network/netmask instead. 91 | # The same is 192.168.201.0/24 or 192.168.201.0/255.255.255.0 92 | #+ : root : 192.168.201. 93 | # 94 | # User "root" should be able to have access from domain. 95 | # Uses string matching also. 96 | #+ : root : .foo.bar.org 97 | # 98 | # User "root" should be denied to get access from all other sources. 99 | #- : root : ALL 100 | # 101 | # User "foo" and members of netgroup "nis_group" should be 102 | # allowed to get access from all sources. 103 | # This will only work if netgroup service is available. 104 | #+ : @nis_group foo : ALL 105 | # 106 | # User "john" should get access from ipv4 net/mask 107 | #+ : john : 127.0.0.0/24 108 | # 109 | # User "john" should get access from ipv4 as ipv6 net/mask 110 | #+ : john : ::ffff:127.0.0.0/127 111 | # 112 | # User "john" should get access from ipv6 host address 113 | #+ : john : 2001:4ca0:0:101::1 114 | # 115 | # User "john" should get access from ipv6 host address (same as above) 116 | #+ : john : 2001:4ca0:0:101:0:0:0:1 117 | # 118 | # User "john" should get access from ipv6 net/mask 119 | #+ : john : 2001:4ca0:0:101::/64 120 | # 121 | # All other users should be denied to get access from all sources. 
122 | #- : ALL : ALL 123 | 124 | +:sudo:ALL 125 | -:ALL:ALL 126 | -------------------------------------------------------------------------------- /roles/slurm/files/cgroup.conf: -------------------------------------------------------------------------------- 1 | CgroupAutomount=yes 2 | ConstrainCores=yes 3 | -------------------------------------------------------------------------------- /roles/slurm/files/pam-common-session: -------------------------------------------------------------------------------- 1 | # 2 | # /etc/pam.d/common-session - session-related modules common to all services 3 | # 4 | # This file is included from other service-specific PAM config files, 5 | # and should contain a list of modules that define tasks to be performed 6 | # at the start and end of sessions of *any* kind (both interactive and 7 | # non-interactive). 8 | # 9 | # As of pam 1.0.1-6, this file is managed by pam-auth-update by default. 10 | # To take advantage of this, it is recommended that you configure any 11 | # local modules either before or after the default block, and use 12 | # pam-auth-update to manage selection of other modules. See 13 | # pam-auth-update(8) for details. 14 | 15 | # here are the per-package modules (the "Primary" block) 16 | session [default=1] pam_permit.so 17 | # here's the fallback if no module succeeds 18 | session requisite pam_deny.so 19 | # prime the stack with a positive return value if there isn't one already; 20 | # this avoids us returning an error just because nothing sets a success code 21 | # since the modules above will each just jump around 22 | session required pam_permit.so 23 | # The pam_umask module will set the umask according to the system default in 24 | # /etc/login.defs and user settings, solving the problem of different 25 | # umask settings with different shells, display managers, remote sessions etc. 26 | # See "man pam_umask". 27 | session optional pam_umask.so 28 | # and here are more per-package modules (the "Additional" block) 29 | session required pam_unix.so 30 | #session optional pam_systemd.so 31 | # end of pam-auth-update config 32 | -------------------------------------------------------------------------------- /roles/slurm/files/pam-sshd: -------------------------------------------------------------------------------- 1 | # PAM configuration for the Secure Shell service 2 | 3 | 4 | # Standard Un*x authentication. 5 | @include common-auth 6 | 7 | # Disallow non-root logins when /etc/nologin exists. 8 | account required pam_nologin.so 9 | 10 | # Uncomment and edit /etc/security/access.conf if you need to set complex 11 | # access limits that are hard to express in sshd_config. 12 | # account required pam_access.so 13 | 14 | # Standard Un*x authorization. 15 | @include common-account 16 | 17 | account sufficient pam_slurm.so 18 | account required pam_access.so 19 | 20 | # SELinux needs to be the first session rule. This ensures that any 21 | # lingering context has been cleared. Without this it is possible that a 22 | # module could execute code in the wrong domain. 23 | session [success=ok ignore=ignore module_unknown=ignore default=bad] pam_selinux.so close 24 | 25 | # Set the loginuid process attribute. 26 | session required pam_loginuid.so 27 | 28 | # Create a new session keyring. 29 | session optional pam_keyinit.so force revoke 30 | 31 | # Standard Un*x session setup and teardown. 32 | @include common-session 33 | 34 | # Print the message of the day upon successful login. 
35 | # This includes a dynamically generated part from /run/motd.dynamic 36 | # and a static (admin-editable) part from /etc/motd. 37 | session optional pam_motd.so motd=/run/motd.dynamic 38 | session optional pam_motd.so noupdate 39 | 40 | # Print the status of the user's mailbox upon successful login. 41 | session optional pam_mail.so standard noenv # [1] 42 | 43 | # Set up user limits from /etc/security/limits.conf. 44 | session required pam_limits.so 45 | 46 | # Read environment variables from /etc/environment and 47 | # /etc/security/pam_env.conf. 48 | session required pam_env.so # [1] 49 | # In Debian 4.0 (etch), locale-related environment variables were moved to 50 | # /etc/default/locale, so read that as well. 51 | session required pam_env.so user_readenv=1 envfile=/etc/default/locale 52 | 53 | # SELinux needs to intervene at login time to ensure that the process starts 54 | # in the proper default security context. Only sessions which are intended 55 | # to run in the user's context should be run after this. 56 | session [success=ok ignore=ignore module_unknown=ignore default=bad] pam_selinux.so open 57 | 58 | # Standard Un*x password updating. 59 | @include common-password 60 | -------------------------------------------------------------------------------- /roles/slurm/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for slurm 3 | - name: apt install slurm 4 | apt: 5 | name: "{{ item }}" 6 | state: present 7 | become: yes 8 | with_items: 9 | - slurm-wlm=17.11.2-1build1 10 | - libpam-slurm=17.11.2-1build1 11 | - name: copy munge key from ln nodes 12 | copy: 13 | backup: yes 14 | src: /etc/munge/munge.key 15 | dest: "{{role_path}}/files/munge.key" 16 | remote_src: yes 17 | become: yes 18 | when: inventory_hostname == master_name 19 | - name: copy munge key to cn nodes 20 | copy: 21 | backup: yes 22 | src: munge.key 23 | dest: /etc/munge/munge.key 24 | owner: munge 25 | group: munge 26 | mode: "400" 27 | become: yes 28 | register: cnmungekey 29 | - name: ensure munged is started 30 | become: yes 31 | service: 32 | name: munge 33 | state: started 34 | enabled: yes 35 | - name: restart munged 36 | become: yes 37 | service: 38 | name: munge 39 | state: restarted 40 | when: cnmungekey.changed 41 | - name: config slurm 42 | become: yes 43 | template: 44 | src: slurm.conf 45 | dest: /etc/slurm-llnl/slurm.conf 46 | backup: yes 47 | register: lnslurm 48 | - name: config gres 49 | become: yes 50 | template: 51 | src: gres.conf 52 | dest: /etc/slurm-llnl/gres.conf 53 | backup: yes 54 | register: lngres 55 | - name: config cgroup 56 | become: yes 57 | copy: 58 | src: cgroup.conf 59 | dest: /etc/slurm-llnl/cgroup.conf 60 | backup: yes 61 | register: lncgroup 62 | - name: add smail program for mail sending 63 | template: 64 | src: smail.sh 65 | dest: /usr/bin/smail.sh 66 | owner: "{{ slurm_user }}" 67 | mode: 0700 68 | become: yes 69 | - name: add pam module in pamd/sshd 70 | become: yes 71 | copy: 72 | src: pam-sshd 73 | dest: /etc/pam.d/sshd 74 | when: inventory_hostname in groups['cn'] 75 | - name: comment pam_systemd 76 | become: yes 77 | copy: 78 | src: pam-common-session 79 | dest: /etc/pam.d/common-session 80 | when: inventory_hostname in groups['cn'] 81 | - name: add ssh permission to sudo group 82 | become: yes 83 | copy: 84 | src: access.conf 85 | dest: /etc/security/access.conf 86 | backup: yes 87 | when: inventory_hostname in groups['cn'] 88 | - name: install slurmdbd on master 89 | become: yes 90 | when: 
inventory_hostname == dbdhost 91 | apt: 92 | name: "{{ item }}" 93 | state: present 94 | update_cache: yes 95 | with_items: 96 | - slurmdbd=17.11.2-1build1 97 | - mysql-server 98 | - python-mysqldb 99 | - libmysqlclient-dev 100 | - python3-mysqldb 101 | register: lnslurmdbd 102 | - name: ensure mysql is running 103 | become: yes 104 | when: inventory_hostname == dbdhost 105 | service: 106 | name: mysql 107 | state: started 108 | - name: create mysql user 109 | become: yes 110 | when: inventory_hostname == dbdhost 111 | mysql_user: 112 | login_host: localhost 113 | login_user: root 114 | name: "{{ db_user }}" 115 | password: "{{ db_pass }}" 116 | priv: 'slurm_acct_db.*:ALL' 117 | host: localhost 118 | state: present 119 | update_password: on_create 120 | register: mysqluser 121 | - name: restart mysql 122 | become: yes 123 | when: inventory_hostname == dbdhost and mysqluser.changed 124 | service: 125 | name: mysql 126 | state: restarted 127 | - name: config file for slurmdbd 128 | become: yes 129 | when: inventory_hostname == dbdhost 130 | template: 131 | src: slurmdbd.conf 132 | dest: /etc/slurm-llnl/slurmdbd.conf 133 | backup: yes 134 | register: slurmdbdconf 135 | - name: ensure slurmdbd is started 136 | become: yes 137 | service: 138 | name: slurmdbd 139 | state: started 140 | when: inventory_hostname == dbdhost 141 | - name: restart slurmdbd 142 | become: yes 143 | when: inventory_hostname == dbdhost and slurmdbdconf.changed 144 | service: 145 | name: slurmdbd 146 | state: restarted 147 | - name: add cluster to database 148 | command: "sacctmgr add cluster {{cluster_name}} -i" 149 | when: inventory_hostname == dbdhost 150 | failed_when: clusterr.rc != 1 and clusterr.rc != 0 151 | changed_when: not clusterr.stdout.startswith("This cluster") 152 | register: clusterr 153 | - name: ensure slurmctld is started # slurmctld must be started after slurmdbd 154 | become: yes 155 | service: 156 | name: slurmctld 157 | state: started 158 | when: inventory_hostname in ctldhost 159 | - name: start slurmctld 160 | become: yes 161 | service: 162 | name: slurmctld 163 | state: restarted 164 | when: inventory_hostname in ctldhost and (lnslurm.changed or lncgroup.changed or lngres.changed) 165 | - name: ensure slurmd is started 166 | become: yes 167 | service: 168 | name: slurmd 169 | enabled: yes 170 | state: started 171 | when: inventory_hostname in groups['cn'] 172 | - name: restart slurmd 173 | become: yes 174 | service: 175 | name: slurmd 176 | state: restarted 177 | when: (lnslurm.changed or lncgroup.changed or lngres.changed) and inventory_hostname in groups['cn'] 178 | -------------------------------------------------------------------------------- /roles/slurm/templates/gres.conf: -------------------------------------------------------------------------------- 1 | {% for h in groups['cn'] %} 2 | {% if h in groups['gn'] %} 3 | Nodename={{ h }} Name=gpu Type={{ hostvars[h]['gputype']|default('RTX2080TI') }} File={{ "/dev/nvidia0" if hostvars[h]['gpuno'] == 1 else "/dev/nvidia[0-" ~ (hostvars[h]['gpuno']-1) ~ "]" |default("/dev/nvidia[0-1]") }} 4 | {% endif %} 5 | {% endfor %} 6 | -------------------------------------------------------------------------------- /roles/slurm/templates/slurm.conf: -------------------------------------------------------------------------------- 1 | # slurm.conf file generated by configurator.html. 2 | # Put this file on all nodes of your cluster. 3 | # See the slurm.conf man page for more information. 
4 | # 5 | ControlMachine={{ ctldhost[0] }} 6 | #ControlAddr= 7 | BackupController={{ ctldhost[1] }} 8 | #BackupAddr= 9 | # 10 | AuthType=auth/munge 11 | #CheckpointType=checkpoint/none 12 | CryptoType=crypto/munge 13 | #DisableRootJobs=NO 14 | #EnforcePartLimits=NO 15 | #Epilog= 16 | #EpilogSlurmctld= 17 | #FirstJobId=1 18 | #MaxJobId=999999 19 | GresTypes=gpu 20 | #GroupUpdateForce=0 21 | #GroupUpdateTime=600 22 | #JobCheckpointDir=/var/slurm/checkpoint 23 | #JobCredentialPrivateKey= 24 | #JobCredentialPublicCertificate= 25 | #JobFileAppend=0 26 | #JobRequeue=1 27 | #JobSubmitPlugins=1 28 | #KillOnBadExit=0 29 | #LaunchType=launch/slurm 30 | #Licenses=foo*4,bar 31 | MailProg=/usr/bin/smail.sh 32 | MaxJobCount=1000 33 | #MaxStepCount=40000 34 | #MaxTasksPerNode=128 35 | MpiDefault=none 36 | #MpiParams=ports=#-# 37 | #PluginDir= 38 | #PlugStackConfig= 39 | #PrivateData=jobs 40 | ProctrackType=proctrack/cgroup 41 | #Prolog= 42 | PrologFlags=contain # for pam module 43 | #PrologSlurmctld= 44 | #PropagatePrioProcess=0 45 | PropagateResourceLimits=NONE 46 | #PropagateResourceLimitsExcept= 47 | #RebootProgram= 48 | ReturnToService=1 49 | #SallocDefaultCommand= 50 | SlurmctldPidFile=/var/run/slurm-llnl/slurmctld.pid 51 | SlurmctldPort=6817 52 | SlurmdPidFile=/var/run/slurm-llnl/slurmd.pid 53 | SlurmdPort=6818 54 | SlurmdSpoolDir=/tmp/slurmd 55 | SlurmUser={{ slurm_user }} 56 | #SlurmdUser=root 57 | #SrunEpilog= 58 | #SrunProlog= 59 | StateSaveLocation={{ slurm_spool_path }} 60 | SwitchType=switch/none 61 | #TaskEpilog= 62 | TaskPlugin=task/affinity 63 | TaskPluginParam=Sched 64 | #TaskProlog= 65 | #TopologyPlugin=topology/tree 66 | #TmpFS=/tmp 67 | #TrackWCKey=no 68 | #TreeWidth= 69 | #UnkillableStepProgram= 70 | #UsePAM=0 71 | # 72 | # 73 | # TIMERS 74 | #BatchStartTimeout=10 75 | #CompleteWait=0 76 | #EpilogMsgTime=2000 77 | #GetEnvTimeout=2 78 | #HealthCheckInterval=0 79 | #HealthCheckProgram= 80 | InactiveLimit=0 81 | KillWait=30 82 | #MessageTimeout=10 83 | #ResvOverRun=0 84 | MinJobAge=300 85 | #OverTimeLimit=0 86 | SlurmctldTimeout=120 87 | SlurmdTimeout=300 88 | #UnkillableStepTimeout=60 89 | #VSizeFactor=0 90 | Waittime=0 91 | # 92 | # 93 | # SCHEDULING 94 | #DefMemPerCPU=0 95 | FastSchedule=0 96 | #MaxMemPerCPU=0 97 | #SchedulerTimeSlice=30 98 | SchedulerType=sched/backfill 99 | SelectType=select/cons_res 100 | SelectTypeParameters=CR_Core 101 | # 102 | # 103 | # JOB PRIORITY 104 | #PriorityFlags= 105 | PriorityType=priority/multifactor 106 | PriorityDecayHalfLife=0 107 | #PriorityCalcPeriod= 108 | PriorityFavorSmall=YES 109 | #PriorityMaxAge= 110 | PriorityUsageResetPeriod=YEARLY 111 | PriorityWeightAge=1000 112 | PriorityWeightFairshare=300 113 | PriorityWeightJobSize=100 114 | #PriorityWeightPartition= 115 | PriorityWeightQOS=600 116 | # 117 | # 118 | # LOGGING AND ACCOUNTING 119 | AccountingStorageEnforce=limits,qos 120 | #AccountingStorageHost= 121 | #AccountingStorageLoc= 122 | #AccountingStoragePass= 123 | AccountingStoragePort=6819 124 | AccountingStorageType=accounting_storage/slurmdbd 125 | AccountingStorageHost={{ dbdhost }} 126 | #AccountingStorageUser= 127 | AccountingStorageTRES=gres/gpu,gres/gpu:RTX2080TI 128 | AccountingStoreJobComment=YES 129 | ClusterName={{ cluster_name }} 130 | #DebugFlags= 131 | #JobCompHost= 132 | #JobCompLoc= 133 | #JobCompPass= 134 | #JobCompPort= 135 | JobCompType=jobcomp/none 136 | #JobCompUser= 137 | #JobContainerType=job_container/none 138 | JobAcctGatherFrequency=30 139 | JobAcctGatherType=jobacct_gather/none 140 | SlurmctldDebug=3 141 | 
#SlurmctldLogFile= 142 | SlurmdDebug=3 143 | #SlurmdLogFile= 144 | #SlurmSchedLogFile= 145 | #SlurmSchedLogLevel= 146 | # 147 | # 148 | # POWER SAVE SUPPORT FOR IDLE NODES (optional) 149 | #SuspendProgram= 150 | #ResumeProgram= 151 | #SuspendTimeout= 152 | #ResumeTimeout= 153 | #ResumeRate= 154 | #SuspendExcNodes= 155 | #SuspendExcParts= 156 | #SuspendRate= 157 | #SuspendTime= 158 | # 159 | # 160 | # COMPUTE NODES 161 | 162 | {% for h in groups['cn'] %} 163 | {% if h in groups['gn'] %} 164 | NodeName={{ h }} State=UNKNOWN Weight=20 CoresPerSocket={{ hostvars[h]['corespersocket']|default('14') }} Sockets=2 ThreadsPerCore=2 RealMemory={{ hostvars[h]['memory']|default('128000') }} Gres=gpu:{{ hostvars[h]['gputype']|default('RTX2080TI') }}:{{ hostvars[h]['gpuno']|default("2") }} 165 | {% else %} 166 | NodeName={{ h }} State=UNKNOWN Weight=10 CoresPerSocket={{ hostvars[h]['corespersocket']|default('14') }} Sockets=2 ThreadsPerCore=2 RealMemory={{ hostvars[h]['memory']|default('128000') }} 167 | {% endif %} 168 | {% endfor %} 169 | # NodeName={{ master_name }} State=UNKNOWN Weight=30 CoresPerSocket=14 Sockets=2 ThreadsPerCore=2 RealMemory=128000 170 | 171 | PartitionName=general MaxTime=Infinite Nodes={% for h in groups['general'] %}{{h+"," if not loop.last else h}}{% endfor %} PriorityJobFactor=5000 Default=YES State=UP 172 | PartitionName=hyper MaxTime=Infinite Nodes={% for h in groups['hyper'] %}{{h+"," if not loop.last else h}}{% endfor %} PriorityJobFactor=5000 Default=NO State=UP 173 | PartitionName=debug MaxTime=00:30:00 Nodes=ALL PriorityJobFactor=50000 Default=NO State=UP 174 | {% if groups['cn']|intersect(groups['gn']) %} 175 | PartitionName=gpu MaxTime=Infinite Nodes={% for h in groups['cn']|intersect(groups['gn']) %}{{h+"," if not loop.last else h}}{% endfor %} Default=No AllowAccounts=ubuntu TRESBillingWeights="CPU=1.0,GRES/gpu=3.0" 176 | {% endif %} 177 | -------------------------------------------------------------------------------- /roles/slurm/templates/slurmdbd.conf: -------------------------------------------------------------------------------- 1 | ArchiveEvents=yes 2 | ArchiveJobs=yes 3 | AuthType=auth/munge 4 | DbdHost={{ dbdhost }} 5 | DebugLevel=4 6 | PurgeEventAfter=1month 7 | PurgeJobAfter=24month 8 | PurgeStepAfter=1month 9 | PurgeSuspendAfter=1month 10 | LogFile=/var/log/slurmdbd.log 11 | PidFile=/var/run/slurm-llnl/slurmdbd.pid 12 | SlurmUser={{ slurm_user }} 13 | StorageHost=localhost 14 | StoragePass={{ db_pass }} 15 | StorageType=accounting_storage/mysql 16 | StorageUser={{ db_user }} 17 | -------------------------------------------------------------------------------- /roles/slurm/templates/smail.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MAIL=/usr/bin/mail 3 | 4 | echo "$2"|$MAIL -s "$2" $3 -r {{ slurm_mail }} 5 | -------------------------------------------------------------------------------- /roles/spack/README.md: -------------------------------------------------------------------------------- 1 | Spack 2 | ========= 3 | 4 | This role will install spack, a flexible HPC package manager and configure it. 5 | 6 | Requirements 7 | ------------- 8 | 9 | You may want to config git by `git config --global url.https://github.com/.insteadOf git://github.com/` for air-gapped cluster, otherwise `spack install lmod` doesn't work since some of the dependences would be fetched by git which won't go through http proxy by default. 10 | 11 | 12 | Role Variables 13 | -------------- 14 | 15 | See defaults/main.yml. 
Only spack_path is needed, which specify the install path of spack. We highly recommend you install it on some admin user's home directory, which can be available for all users. For clusters, it is important to share /home on master to all nodes via nfs, such that spack is available to all nodes. 16 | 17 | Templates and Files 18 | -------------- 19 | 20 | It is worth noting that spack config yaml files in files dir are very **specific** and not universal at all. It assumes that you would install intel parallel studio on /opt dir. Especially, in packages.yaml, there is specific information on external packages' path which you may want to edit before running the role. 21 | -------------------------------------------------------------------------------- /roles/spack/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for spack 3 | spack_path: "/home/ubuntu/spack" 4 | -------------------------------------------------------------------------------- /roles/spack/files/compilers.yaml: -------------------------------------------------------------------------------- 1 | compilers: 2 | - compiler: 3 | environment: {} 4 | extra_rpaths: [] 5 | flags: {} 6 | modules: [] 7 | operating_system: ubuntu18.04 8 | paths: 9 | cc: /usr/bin/gcc 10 | cxx: /usr/bin/g++ 11 | f77: /usr/bin/gfortran 12 | fc: /usr/bin/gfortran 13 | spec: gcc@7.4.0 14 | target: x86_64 15 | - compiler: 16 | operating_system: ubuntu18.04 17 | modules: [intel-parallel-studio-2019-gcc-7.4.0-xl] 18 | paths: 19 | cc: /opt/intel/compilers_and_libraries/linux/bin/intel64/icc 20 | cxx: /opt/intel/compilers_and_libraries/linux/bin/intel64/icpc 21 | f77: /opt/intel/compilers_and_libraries/linux/bin/intel64/ifort 22 | fc: /opt/intel/compilers_and_libraries/linux/bin/intel64/ifort 23 | spec: intel@2019 24 | target: x86_64 25 | -------------------------------------------------------------------------------- /roles/spack/files/modules.yaml: -------------------------------------------------------------------------------- 1 | modules: 2 | tcl: 3 | hash_length: 2 4 | # naming_scheme: '{name}/{version}-{compiler.name}-{compiler.version}' 5 | all: 6 | suffixes: 7 | ^python@3.6.5: 'py3' 8 | ^python@2.7: 'py2' 9 | ^openblas: 'openblas' 10 | ^openmpi: 'ompi' 11 | ^intelmpi: 'impi' 12 | intel-parallel-studio: 13 | filter: 14 | environment_blacklist: ['PS1'] 15 | environment: 16 | set: 17 | CPATH: "/opt/intel/compilers_and_libraries_2019.4.243/linux/ipp/include:/opt/intel/compilers_and_libraries_2019.4.243/linux/mkl/include:/opt/intel/compilers_and_libraries_2019.4.243/linux/pstl/include:/opt/intel/compilers_and_libraries_2019.4.243/linux/tbb/include:/opt/intel/compilers_and_libraries_2019.4.243/linux/daal/include:/opt/intel/include:/opt/intel/compilers_and_libraries_2019.4.243/linux/mpi/intel64/include" 18 | FI_PROVIDER_PATH: "/opt/intel/compilers_and_libraries_2019.4.243/linux/mpi/intel64/libfabric/lib/prov" 19 | FI_PROVIDER: sockets 20 | I_MPI_ROOT: "/opt/intel/compilers_and_libraries_2019.4.243/linux/mpi" 21 | prepend_path: 22 | LD_LIBRARY_PATH: 
"/opt/intel/itac/2019.4.036/intel64/slib:/opt/intel/compilers_and_libraries_2019.4.243/linux/compiler/lib/intel64_lin:/opt/intel/compilers_and_libraries_2019.4.243/linux/mpi/intel64/libfabric/lib:/opt/intel/compilers_and_libraries_2019.4.243/linux/mpi/intel64/lib/release:/opt/intel/compilers_and_libraries_2019.4.243/linux/mpi/intel64/lib:/opt/intel/compilers_and_libraries_2019.4.243/linux/ipp/lib/intel64:/opt/intel/compilers_and_libraries_2019.4.243/linux/mkl/lib/intel64_lin:/opt/intel/compilers_and_libraries_2019.4.243/linux/tbb/lib/intel64/gcc4.7:/opt/intel/debugger_2019/libipt/intel64/lib:/opt/intel/compilers_and_libraries_2019.4.243/linux/daal/lib/intel64_lin:/opt/intel/compilers_and_libraries_2019.4.243/linux/daal/../tbb/lib/intel64_lin/gcc4.4:/opt/intel/lib" 23 | petsc: 24 | environment: 25 | set: 26 | PETSC_ARCH: ubuntu+intel 27 | slepc: 28 | environment: 29 | set: 30 | SLEPC_DIR: /home/ubuntu/softwares/petsc-slepc/slepc-3.11.1 31 | intel-mkl: 32 | environment: 33 | prepend_path: 34 | LD_LIBRARY_PATH: "/opt/intel/mkl/lib/intel64" 35 | -------------------------------------------------------------------------------- /roles/spack/files/packages.yaml: -------------------------------------------------------------------------------- 1 | packages: 2 | openmpi: 3 | paths: 4 | openmpi@2.1.1%gcc: /usr/lib/x86_64-linux-gnu/openmpi 5 | slurm: 6 | paths: 7 | slurm@17.11.2%gcc: /usr/lib/x86_64-linux-gnu/slurm-wlm 8 | jdk: 9 | paths: 10 | jdk@1.8.0_212 %gcc@7.4.0 arch=linux-ubuntu18.04-x86_64: /usr/lib/jvm/java-8-openjdk-amd64 11 | version: 12 | - 1.8.0_212 13 | mathematica: 14 | paths: 15 | mathematica@11.0.1: /opt/mathematica/11.0.1 16 | matlab: 17 | paths: 18 | matlab@2018b: /opt/matlab/2018b 19 | intel-parallel-studio: 20 | paths: 21 | intel-parallel-studio@2019: /opt/intel 22 | compiler: [intel@2019] 23 | intel-mkl: 24 | paths: 25 | intel-mkl@2019: /opt/intel/mkl 26 | intel-mpi: 27 | paths: 28 | intel-mpi@2019: /opt/intel/impi 29 | petsc: 30 | paths: 31 | petsc@3.11.2%intel: /home/ubuntu/softwares/petsc-slepc/petsc-3.11.2 32 | slepc: 33 | paths: 34 | slepc@3.11.1%intel: /home/ubuntu/softwares/petsc-slepc/slepc-3.11.1 35 | armadillo: 36 | paths: 37 | armadillo@9.300.2%intel: /home/ubuntu/softwares/armadillo/armadillo-9.300.2 38 | #all: 39 | #providers: 40 | # mpi: [openmpi, intel-parallel-studio+mpi] 41 | # blas: [openblas, intel-parallel-studio+mkl] 42 | # lapack: [openblas, intel-parallel-studio+mkl] 43 | # scalapack: [netlib-scalapack, intel-parallel-studio+mkl] 44 | -------------------------------------------------------------------------------- /roles/spack/files/repo.yaml: -------------------------------------------------------------------------------- 1 | repo: 2 | namespace: override 3 | -------------------------------------------------------------------------------- /roles/spack/files/repos.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - $spack/var/spack/repos/override 3 | - $spack/var/spack/repos/builtin 4 | -------------------------------------------------------------------------------- /roles/spack/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for spack 3 | - name: download spack 4 | git: 5 | repo: "https://github.com/spack/spack" 6 | update: no 7 | force: no 8 | dest: "{{ spack_path }}" 9 | when: inventory_hostname in groups['ln'] 10 | - name: spack install lmod for module management 11 | shell: "source {{spack_path}}/share/spack/setup-env.sh&&spack 
install lmod" 12 | args: 13 | executable: /bin/bash 14 | when: inventory_hostname in groups['ln'] 15 | changed_when: not lmodr.stdout.startswith("==> lmod is already installed") 16 | register: lmodr 17 | - name: update etc profile to activate spack when start 18 | become: yes 19 | lineinfile: 20 | path: /etc/profile 21 | regexp: "/share/spack/setup-env.sh$" 22 | line: "{{ 'source '+spack_path+'/share/spack/setup-env.sh' }}" 23 | - name: update etc profile to activate module system 24 | become: yes 25 | lineinfile: 26 | path: /etc/profile 27 | line: "source $(spack location -i lmod)/lmod/lmod/init/bash # load module" 28 | regexp: ".* # load module" 29 | - name: create override repo for spack 30 | when: inventory_hostname in groups['ln'] 31 | file: 32 | path: "{{ spack_path }}/var/spack/repos/override/packages" 33 | state: directory 34 | - name: create the repo.yaml for the new repo 35 | when: inventory_hostname in groups['ln'] 36 | copy: 37 | src: repo.yaml 38 | dest: "{{ spack_path }}/var/spack/repos/override/repo.yaml" 39 | - name: spack config file 40 | when: inventory_hostname in groups['ln'] 41 | copy: 42 | src: "{{ item }}" 43 | dest: "{{ spack_path+'/etc/spack/'+item }}" 44 | backup: yes 45 | with_items: 46 | - packages.yaml 47 | - compilers.yaml 48 | - modules.yaml 49 | - repos.yaml 50 | - name: add shortcut for spack activation 51 | become: yes 52 | template: 53 | src: spack-load 54 | dest: /etc/spack-load 55 | mode: 0755 56 | -------------------------------------------------------------------------------- /roles/spack/templates/pyinstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | VER=3.6.5 3 | source {{spack_path}}/share/spack/setup-env.sh 4 | source $(spack location -i lmod)/lmod/lmod/init/bash 5 | 6 | spack install python@$VER 7 | spack install py-pip ^python@$VER 8 | 9 | # spack install py-numpy ^python@3.6.5 10 | 11 | spack load python@$VER 12 | spack load py-pip ^python@$VER 13 | spack load py-setuptools ^python@$VER 14 | 15 | 16 | -------------------------------------------------------------------------------- /roles/spack/templates/spack-load: -------------------------------------------------------------------------------- 1 | source {{ spack_path }}/share/spack/setup-env.sh 2 | source $(spack location -i lmod)/lmod/lmod/init/bash # load module 3 | -------------------------------------------------------------------------------- /roles/storage/README.md: -------------------------------------------------------------------------------- 1 | Storage 2 | ========= 3 | 4 | This role is designed to configure storage stuff, including local mount, nfs and tmp clean. 5 | 6 | Requirements 7 | ------------ 8 | 9 | This is the last of three roles to build cluster infrastructure, following network and basic. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | See defaults/main.yml. 15 | 16 | `tmp_time` is the time to delete files in tmp folder after the file's last change. 
17 | -------------------------------------------------------------------------------- /roles/storage/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for storage 3 | nfs_dir: 4 | - dir: "/home" 5 | host: "master" 6 | mnt: "/home" 7 | - dir: "/opt" 8 | host: "master" 9 | mnt: "/opt" 10 | - dir: "/DATA" 11 | host: "master" 12 | mnt: "/DATA" 13 | - dir: "/DATA.c8" 14 | host: "c8" 15 | mnt: "/DATA.c8" 16 | local_disk: 17 | - dev: "/dev/sdb1" 18 | host: "master" 19 | mnt: "/DATA" 20 | - dev: "/dev/sdc1" 21 | host: "master" 22 | mnt: "/BACKUP" 23 | - dev: "/dev/sdb1" 24 | host: "c8" 25 | mnt: "/DATA.c8" 26 | - dev: "/dev/sdb1" 27 | host: "c10" 28 | mnt: "/tmp" 29 | - dev: "/dev/sdb1" 30 | host: "c11" 31 | mnt: "/tmp" 32 | - dev: "/dev/sdb1" 33 | host: "c12" 34 | mnt: "/tmp" 35 | - dev: "/dev/sdb1" 36 | host: "c13" 37 | mnt: "/tmp" 38 | - dev: "/dev/sdb1" 39 | host: "c14" 40 | mnt: "/tmp" 41 | tmp_time: "15d" 42 | -------------------------------------------------------------------------------- /roles/storage/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for storage 3 | - name: install nfs server on sn node 4 | become: yes 5 | apt: 6 | name: nfs-kernel-server=1:1.3.4-2.1ubuntu5.2 7 | state: present 8 | when: inventory_hostname in groups['sn'] 9 | - name: install nfs client on all nodes 10 | become: yes 11 | apt: 12 | name: nfs-common 13 | state: present 14 | - name: update nfs config file on ln node 15 | become: yes 16 | template: 17 | src: ../templates/exports 18 | dest: /etc/exports 19 | backup: yes 20 | owner: root 21 | register: lnnfs 22 | when: inventory_hostname in groups['sn'] 23 | - name: ensure nfs service start on ln node 24 | become: yes 25 | service: 26 | name: nfs-kernel-server 27 | state: started 28 | when: inventory_hostname in groups['sn'] 29 | - name: restart nfs service 30 | become: yes 31 | service: 32 | name: nfs-kernel-server 33 | state: restarted 34 | when: lnnfs.changed and inventory_hostname in groups['sn'] 35 | - name: mount localdisk 36 | become: yes 37 | mount: 38 | path: "{{ item.mnt }}" 39 | src: "{{ item.dev }}" 40 | fstype: ext4 41 | state: mounted 42 | when: inventory_hostname == item.host 43 | with_items: "{{ local_disk }}" 44 | - name: make sure the mount dir exist in cn nodes 45 | when: inventory_hostname != item.host 46 | become: yes 47 | file: 48 | path: "{{ item.mnt }}" 49 | state: directory 50 | with_items: "{{ nfs_dir }}" 51 | - name: mount dir on all other nodes 52 | become: yes 53 | mount: 54 | name: "{{ item.mnt }}" 55 | src: "{{ item.host }}:{{ item.dir }}" 56 | fstype: nfs 57 | state: mounted 58 | when: inventory_hostname != item.host 59 | with_items: "{{ nfs_dir }}" 60 | - name: install tmpreaper on all nodes 61 | become: yes 62 | apt: 63 | name: tmpreaper 64 | state: present 65 | - name: change the config of tmpreaper 66 | template: 67 | src: tmpreaper.conf 68 | dest: /etc/tmpreaper.conf 69 | backup: yes 70 | become: yes 71 | -------------------------------------------------------------------------------- /roles/storage/templates/exports: -------------------------------------------------------------------------------- 1 | # /etc/exports: the access control list for filesystems which may be exported 2 | # to NFS clients. See exports(5). 
3 | # 4 | # Example for NFSv2 and NFSv3: 5 | # /srv/homes hostname1(rw,sync,no_subtree_check) hostname2(ro,sync,no_subtree_check) 6 | # 7 | # Example for NFSv4: 8 | # /srv/nfs4 gss/krb5i(rw,sync,fsid=0,crossmnt,no_subtree_check) 9 | # /srv/nfs4/homes gss/krb5i(rw,sync,no_subtree_check) 10 | # 11 | {% for d in nfs_dir %} 12 | {% if d.host == inventory_hostname %} 13 | {{ d.dir }} {{ ip_range }}/{{ mask }}(rw,sync,no_root_squash) 14 | {% endif %} 15 | {% endfor %} 16 | -------------------------------------------------------------------------------- /roles/storage/templates/tmpreaper.conf: -------------------------------------------------------------------------------- 1 | # tmpreaper.conf 2 | # - local configuration for tmpreaper's daily run 3 | # 4 | # This is only used if /etc/cron.daily/tmpreaper was also updated, 5 | # i.e. there's a line ". /etc/tmpreaper.conf" in that file. 6 | # The shell code that used to be here (pre version 1.6.7) is now 7 | # in the cron.daily script. 8 | 9 | # Remove the next line if you understand the possible security implications of 10 | # having tmpreaper run automatically; 11 | # see /usr/share/doc/tmpreaper/README.security.gz 12 | 13 | #SHOWWARNING=true 14 | 15 | # TMPREAPER_TIME 16 | # is the max. age of files before they're removed. 17 | # default: 18 | # the TMPTIME value in /etc/default/rcS if it's there, else 19 | # TMPREAPER_TIME=7d (for 7 days) 20 | # I recommend setting the value in /etc/default/rcS, as 21 | # that is used to clean out /tmp whenever the system is booted. 22 | # 23 | # TMPREAPER_PROTECT_EXTRA 24 | # are extra patterns that you may want to protect. 25 | # Example: 26 | # TMPREAPER_PROTECT_EXTRA='/tmp/isdnctrl* /tmp/important*' 27 | # 28 | # TMPREAPER_DIRS 29 | # are the directories to clean up. 30 | # *never* supply / here! That will wipe most of your system! 31 | # Example: 32 | # TMPREAPER_DIRS='/tmp/. /var/tmp/.' 33 | # 34 | # TMPREAPER_DELAY 35 | # defines the maximum (randomized) delay before starting processing. 36 | # See the manpage entry for --delay. Default is 256. 37 | # Example: 38 | # TMPREAPER_DELAY='256' 39 | # 40 | # TMPREAPER_ADDITIONALOPTIONS 41 | # extra options that are passed to tmpreaper, e.g. --all 42 | 43 | # uncomment and change the next line to overrule the /etc/default/rcS value 44 | # TMPREAPER_TIME=7d 45 | 46 | TMPREAPER_PROTECT_EXTRA='/tmp/slurm*' 47 | TMPREAPER_DIRS='/tmp/.' 48 | TMPREAPER_DELAY='256' 49 | TMPREAPER_ADDITIONALOPTIONS='' 50 | TMPREAPER_TIME='{{ tmp_time }}' 51 | -------------------------------------------------------------------------------- /roles/user/README.md: -------------------------------------------------------------------------------- 1 | User 2 | ========= 3 | 4 | This role is designed to add new users on the cluster as easy as possible. 5 | 6 | Requirements 7 | ------------ 8 | 9 | You must run `slurm` role first and have configured slurm cluster, account, user, qos info by `sacctmgr`. 10 | 11 | If you use quota to limit normal users disk usage, you also need to firstly configure quota. 12 | 13 | Role Variables 14 | -------------- 15 | 16 | See defaults/main.yml. It is worth noting, for every user item, only name, password, and uid is necessary. Others will be the default value if not given. 17 | 18 | Templates and Files 19 | -------------- 20 | memory.conf and nproc.conf in files dir are very **specific**. You may want to change the specific limits based on your needs and your hardware specs. 
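As a concrete sketch (the username, uid, and password below are placeholders), a new account only needs the three required fields; everything else falls back to the role-wide defaults such as sacct_default_account, sacct_default_qos, and the quota defaults.

```yaml
# extra entry appended to the `users` list, e.g. in group_vars -- values are placeholders
users:
  - name: "alice"
    uid: 20001
    password: "change-me-at-first-login"   # hashed with sha512 by the role; only applied on creation
```

Optional fields (quota_soft, quota_hard, home_permission, sacct_account, sacct_qos) can be added per user exactly as in the default entry.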
-------------------------------------------------------------------------------- /roles/user/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for user 3 | set_quota: yes 4 | sacct_default_account: root 5 | sacct_default_qos: normal 6 | quota_default_soft: 0 7 | quota_default_hard: 0 8 | quota_root: "/" 9 | other_user_dir: yes 10 | data_root: 11 | - "/DATA" 12 | users: 13 | - name: "test0" 14 | uid: 20000 15 | password: "123456sobad" 16 | quota_soft: 10G 17 | quota_hard: 20G 18 | home_permission: "0755" 19 | sacct_account: root 20 | sacct_qos: normal 21 | -------------------------------------------------------------------------------- /roles/user/files/memory.conf: -------------------------------------------------------------------------------- 1 | * hard as 224000000 2 | @sudo hard as unlimited 3 | -------------------------------------------------------------------------------- /roles/user/files/nproc.conf: -------------------------------------------------------------------------------- 1 | * soft nproc 8192 2 | root soft nproc unlimited 3 | -------------------------------------------------------------------------------- /roles/user/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for user 3 | - name: create users 4 | become: yes 5 | user: 6 | name: "{{ item.name }}" 7 | uid: "{{ item.uid }}" 8 | shell: /bin/bash 9 | password: "{{ item.password|password_hash('sha512') }}" 10 | update_password: on_create 11 | generate_ssh_key: yes 12 | with_items: "{{ users }}" 13 | when: inventory_hostname in groups['ln'] 14 | - name: permission for home dir 15 | become: yes 16 | file: 17 | mode: "{{ item['home_permission']|default('0700') }}" 18 | state: directory 19 | path: "/home/{{ item.name }}" 20 | owner: "{{ item.name }}" 21 | with_items: "{{ users }}" 22 | when: inventory_hostname in groups['ln'] 23 | - name: create dir in DATA 24 | become: yes 25 | file: 26 | mode: "0755" 27 | state: directory 28 | path: "{{ item[1] }}/{{ item[0].name }}" 29 | owner: "{{ item[0].name }}" 30 | loop: "{{ users |product(data_root)|list }}" 31 | when: inventory_hostname in groups['ln'] and other_user_dir 32 | - name: create users in cn 33 | become: yes 34 | user: 35 | name: "{{ item.name }}" 36 | uid: "{{ item.uid }}" 37 | shell: /bin/bash 38 | password: "{{ item.password|password_hash('sha512') }}" 39 | update_password: on_create 40 | generate_ssh_key: no 41 | create_home: no 42 | with_items: "{{ users }}" 43 | when: inventory_hostname in groups['cn'] 44 | - name: cat ssh pubkey 45 | register: pubkey 46 | become: yes 47 | when: inventory_hostname in groups['ln'] 48 | copy: 49 | src: "{{ '/home/'+item.name+'/.ssh/id_rsa.pub' }}" 50 | dest: "{{ role_path+'/files/'+item.name+'.pub' }}" 51 | remote_src: true 52 | with_items: "{{ users }}" 53 | - name: authorized keys add 54 | become: yes 55 | authorized_key: 56 | exclusive: no 57 | user: "{{ item.name }}" 58 | key: "{{ lookup('file', '../files/'+item.name+'.pub') }}" 59 | with_items: "{{ users }}" 60 | when: inventory_hostname in groups['ln'] 61 | - name: add user into slurm database 62 | command: "sacctmgr -i add user {{ item.name }} account={{ item['sacct_account']|default(sacct_default_account) }} qos={{ item['sacct_qos']|default(sacct_default_qos) }}" 63 | when: inventory_hostname in groups['ln'] 64 | with_items: "{{ users }}" 65 | failed_when: r.rc != 1 and r.rc != 0 66 | changed_when: r.rc == 0 67 | register: r 68 | 
- name: add nproc limit to all nodes avoiding shell fork bombs 69 | become: yes 70 | copy: 71 | src: nproc.conf 72 | dest: /etc/security/limits.d/nproc.conf 73 | - name: add memory limit to ln nodes for normal user 74 | become: yes 75 | copy: 76 | src: memory.conf 77 | dest: /etc/security/limits.d/memory.conf 78 | when: inventory_hostname in groups['ln'] 79 | - name: fs quota limit on ln nodes 80 | become: yes 81 | command: "setquota -u {{ item.name }} {{ item['quota_soft']|default(quota_default_soft) }} {{ item['quota_hard']|default(quota_default_hard) }} 0 0 {{ quota_root }} " 82 | when: inventory_hostname in groups['ln'] and set_quota 83 | with_items: "{{ users }}" 84 | changed_when: r.rc != 0 85 | register: r 86 | -------------------------------------------------------------------------------- /site.yml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | gather_facts: no 3 | environment: "{{env_vars}}" 4 | roles: 5 | # - network 6 | # - basic 7 | # - storage 8 | # - drivers 9 | # - slurm 10 | # - mpi 11 | # - spack 12 | # - python 13 | # - ganglia 14 | # - user 15 | # - elk 16 | # - elastalert 17 | # - cgroup 18 | # - restic 19 | --------------------------------------------------------------------------------
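As a usage sketch, a role is enabled by uncommenting it in site.yml and rerunning the playbook; the command and ordering below assume the inventory is already set up and follow the note in the storage role's README that network and basic come before storage.

```yaml
# site.yml with only the infrastructure roles enabled (sketch)
# run with e.g.:  ansible-playbook site.yml   # add --check --diff for a dry run
- hosts: all
  gather_facts: no
  environment: "{{env_vars}}"
  roles:
    - network
    - basic
    - storage
    # remaining roles (drivers, slurm, mpi, spack, ...) stay commented
    # until the base infrastructure is in place
```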