├── .gitignore ├── LICENSE ├── README.md ├── ansible.cfg ├── group_vars └── all.default ├── host_vars └── c1.default ├── hosts ├── roles ├── basic │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── aliases │ │ └── sshd_config │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── main.cf │ │ ├── ntp-cn.conf │ │ └── ntp.conf ├── cgroup │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── cgconfig.conf │ │ └── cgrules.conf.default │ └── tasks │ │ └── main.yml ├── drivers │ ├── README.md │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── elastalert │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ └── elastalert.conf │ ├── tasks │ │ └── main.yml │ └── templates │ │ └── config.yaml ├── elk │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── apache2.yml │ │ ├── apache2_pipeline.json │ │ ├── jvm.options │ │ ├── nginx.yml │ │ └── system.yml │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── 02-beats-input.conf │ │ ├── 30-elasticsearch-output.conf │ │ ├── elasticsearch.yml │ │ ├── filebeat.yml │ │ ├── kibana.conf │ │ └── kibana.yml ├── ganglia │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── cpu_stats.py │ │ ├── gpu.sh │ │ ├── netstats.py │ │ └── temg.sh │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── avail-monitor.sh │ │ ├── ganglia.conf │ │ ├── gmetad.conf │ │ ├── gmond-cn.conf │ │ └── gmond.conf ├── mpi │ ├── README.md │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── network │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── 20auto-upgrades │ │ └── sources.list │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── 60-config.yaml │ │ ├── 70-config.yaml │ │ ├── apt.conf │ │ ├── dnsmasq.conf │ │ ├── hosts │ │ ├── map.hosts │ │ └── proxy-set ├── python │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ └── pip.conf │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── home.pth │ │ └── spack.pth ├── restic │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ └── ignorefile │ └── tasks │ │ └── main.yml ├── slurm │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── access.conf │ │ ├── cgroup.conf │ │ ├── pam-common-session │ │ └── pam-sshd │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── gres.conf │ │ ├── slurm.conf │ │ ├── slurmdbd.conf │ │ └── smail.sh ├── spack │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── files │ │ ├── compilers.yaml │ │ ├── modules.yaml │ │ ├── packages.yaml │ │ ├── repo.yaml │ │ └── repos.yaml │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── pyinstall.sh │ │ └── spack-load ├── storage │ ├── README.md │ ├── defaults │ │ └── main.yml │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── exports │ │ └── tmpreaper.conf └── user │ ├── README.md │ ├── defaults │ └── main.yml │ ├── files │ ├── memory.conf │ └── nproc.conf │ └── tasks │ └── main.yml └── site.yml /.gitignore: -------------------------------------------------------------------------------- 1 | vars/ 2 | meta/ 3 | handlers/ 4 | tests/ 5 | *_vars/*.yml 6 | *.pub 7 | munge.key 8 | site.retry 9 | ._README.md 10 | *.fuse* 11 | cgrules.conf 12 | hosts_test 13 | site_test.yml 14 | roles/test/ 15 | master.json 16 | c1.json 17 | c4.json 18 | pass.yaml 19 | site_test.retry 20 | roles/elastalert/files/elastalert/rules/ 21 | elastic-certificates.p12 22 | elastic-stack-ca.p12 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 
MIT License
2 | 
3 | Copyright (c) 2019
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # HPC-BUILD-ANSIBLE-PLAYBOOKS
2 | 
3 | *Everyone can build their own supercomputer!*
4 | 
5 | ## Glossary
6 | 
7 | HPC: High-performance computing
8 | 
9 | Cluster: Many machines connected at the hardware level by a switch.
10 | 
11 | Node: A single machine.
12 | 
13 | Login node: A node that can be accessed from the outside network.
14 | 
15 | Master node: A node that runs central services for the cluster. In our setup, login and master are the same node, the [ln] group.
16 | 
17 | Compute nodes: Nodes for running jobs, the [cn] group.
18 | 
19 | GPU nodes: Nodes equipped with Nvidia GPUs, the [gn] group.
20 | 
21 | ## Usage
22 | 
23 | Uncomment the relevant roles you want to run in site.yml. You should review the README and tasks/main.yml carefully for each role you'd like to run.
24 | 
25 | ```bash
26 | $ cd 
27 | $ ansible-playbook -i hosts site.yml -Kvv ## enter the sudo user password at the next prompt
28 | ```
29 | 
30 | ### Some notes
31 | 
32 | * The playbooks should be located in a directory with permission 700, since they contain lots of secret information and normal users shouldn't be able to access them.
33 | 
34 | ### A possible approach to start from scratch
35 | 
36 | * Install the OS on the master node and directly `apt install ansible`. Then git clone this repo somewhere locally. Configure the inventory files and host_vars to include all possible machines with the knowledge of their MAC addresses.
37 | * Possibly do a first Ansible run on the master only, with a complete inventory file (to generate the complete hosts file and the MAC-IP binding DHCP service); the `network` and `basic` roles are suggested. Running `sudo ip addr add <ip>/<mask> dev <nic>` is recommended before applying the network role (see the sketch after this list).
38 | * Install the OS on the compute nodes, either by hand or by some bare-metal provisioning mechanism from the master. (Note that the playbooks here don't cover provisioning setups.)
39 | * Make sure the sudo user with uid 1000 is the same (name, password) on all machines, and ensure the ssh server is running with the admin's pubkey in authorized_keys on all machines.
40 | * Plug all machines together via the switch and run the Ansible playbooks from the beginning again on the master.
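For the first master-only run mentioned in the list above, the sequence could look roughly like the following sketch. It assumes the example values from `group_vars/all.default` (LAN IP 192.168.1.10/24 on NIC eno1) and that only the `network` and `basic` roles are left uncommented in site.yml; the repo path is a placeholder, so adjust everything to your own setup.

```bash
# Give the master a temporary LAN address before the network role takes over
# (192.168.1.10/24 and eno1 are the example values from group_vars/all.default)
sudo ip addr add 192.168.1.10/24 dev eno1

# Run the playbook against the master/login node only (the [ln] group)
cd /path/to/this/repo        # wherever you cloned the playbooks
ansible-playbook -i hosts site.yml --limit ln -Kvv
```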
41 | 
42 | ### Possible workflows beyond these playbooks
43 | 
44 | Ansible cannot do everything, and some flexible and risky jobs you may want to do by hand directly.
45 | 
46 | * Manage local hard disks if there are any: partition, format and mount them on the master node. If any local mounts are needed on compute nodes (which is rare for an HPC-style setup), you may want to add them to the `basic` role to keep things simple. This must be done before running the `basic` role, which makes the NFS mounts possible.
47 | * Initial disk quota configuration, if you want to limit users' disk usage on certain filesystems. This must be done before running the `user` role, so that newly created users are automatically limited by quota.
48 | * Manage slurm accounts, QOS and users with `sacctmgr`. This can only be done after the `slurm` role, when slurm is configured and running. Besides, it should be done before the `user` role, so that new users are automatically added to some account or limited by some QOS policy.
49 | * Install necessary external software for numerical computation; the common ones are Intel Parallel Studio, Mathematica and Matlab. Further manage and install packages with spack and the conda provided by Intel Parallel Studio.
50 | 
51 | ### Limitations
52 | 
53 | The ansible playbooks here are limited to a small cluster setup, with only one master/login node and possibly several dozen homogeneous compute nodes.
54 | 
55 | For a larger cluster setup, there should be more than one login node, and different master nodes may play different roles (some provide disk storage, some the slurm database, some the slurm controller, some backup…). Besides, at such scale the compute nodes are highly likely to be heterogeneous (some with big memory, some with GPU resources...), so more detailed setups and carefully designed slurm configurations are needed. It is in principle OK to generalize our playbooks for such large HPC clusters, but that takes more effort than directly applying the playbooks here.
56 | 
57 | ## Platform information
58 | 
59 | These ansible playbooks are not platform independent; instead, they are strongly tied to the Ubuntu 18.04 server distribution.
60 | 
61 | ### Suggestions on possible changes for different platforms
62 | 
63 | * For a different version of Ubuntu:
64 |     - Some apt package names and config paths might be different, especially the packages related to slurm.
65 |     - Netplan may not work in old versions of Ubuntu.
66 | 
67 | * For a totally different Linux distribution like CentOS:
68 |     - Apt should be replaced with yum or some other package manager.
69 |     - The names of many packages and services might change.
70 |     - Network setup might take a different approach.
71 | 
72 | * For an OS beyond Linux:
73 |     - You must be very experienced with this stuff.
I have no specific suggestion for you :)
74 | 
-------------------------------------------------------------------------------- /ansible.cfg: --------------------------------------------------------------------------------
1 | [defaults]
2 | inventory = ./hosts
3 | host_key_checking = False
4 | 
-------------------------------------------------------------------------------- /group_vars/all.default: --------------------------------------------------------------------------------
1 | ansible_python_interpreter: "/usr/bin/python3"
2 | timezone: "Asia/Shanghai"
3 | admin: ubuntu
4 | ## admin user account
5 | netmask: 255.255.255.0
6 | ## netmask for cluster LAN
7 | mask: 24
8 | ## corresponding netmask bits
9 | ip_range: 192.168.1.0
10 | ntp_server: ntp.tuna.tsinghua.edu.cn
11 | wan_ip: 10.0.0.10
12 | ## WAN ip for login node
13 | wan_gateway: 10.0.0.1
14 | wan_mask: 25
15 | ## netmask bits for WAN
16 | master_ip: 192.168.1.10
17 | ## LAN ip for master/login node
18 | master_name: master
19 | ## hostname of master node
20 | dhcp_start_ip: 192.168.1.40
21 | ## dhcp ip range start
22 | dhcp_end_ip: 192.168.1.127
23 | ## dhcp ip range end
24 | dns_server:
25 |   - 8.8.8.8
26 | ln_lan_nic: eno1
27 | ln_wan_nic: eno2
28 | cluster_domain: hpc.cluster
29 | cluster_name: hpc
30 | env_vars:
31 |   ## possible environment variables that you want to export for ansible roles
32 |   http_proxy: http://
33 |   https_proxy: http://
34 |   ftp_proxy: http://
35 | 
-------------------------------------------------------------------------------- /host_vars/c1.default: --------------------------------------------------------------------------------
1 | ip: 192.168.1.21
2 | mac: 00:00:00:00:00:00
3 | 
-------------------------------------------------------------------------------- /hosts: --------------------------------------------------------------------------------
1 | [cn]
2 | c[1:14]
3 | 
4 | [general]
5 | c[1:8]
6 | 
7 | [hyper]
8 | c[10:14]
9 | 
10 | [ln]
11 | master
12 | 
13 | [gn]
14 | master
15 | c9
16 | 
17 | [sn]
18 | master
19 | c8
20 | 
-------------------------------------------------------------------------------- /roles/basic/README.md: --------------------------------------------------------------------------------
1 | Basic
2 | =========
3 | 
4 | This role is designed to configure all the basic stuff after network settings, such as ntp, locale, timezone, ssh, mail config and some basic package installation.
5 | 
6 | Requirements
7 | ------------
8 | 
9 | This is usually the second role to run. After running the network and basic roles plus the following storage role, you would have a basic cluster infrastructure.
10 | 
11 | Role Variables
12 | --------------
13 | 
14 | `aptpacks` is the list of apt packages installed on all nodes.
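If you want extra packages on top of the role defaults, one option is to override the whole list at run time with `--extra-vars`. A sketch is below; `htop` is just an arbitrary example package, and since the JSON form replaces the list entirely, repeat the defaults you still want.

```bash
# Override aptpacks for a single run; the list below is the role default plus htop
ansible-playbook -i hosts site.yml -K \
  -e '{"aptpacks": ["tree", "ntp", "make", "cmake", "python", "gfortran", "unzip", "openjdk-8-jdk", "pandoc", "postfix", "htop"]}'
```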
15 | -------------------------------------------------------------------------------- /roles/basic/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for basic 3 | aptpacks: 4 | - tree 5 | - ntp 6 | - make 7 | - cmake 8 | - python 9 | - gfortran 10 | - unzip 11 | - openjdk-8-jdk 12 | - pandoc 13 | - postfix 14 | -------------------------------------------------------------------------------- /roles/basic/files/aliases: -------------------------------------------------------------------------------- 1 | # See man 5 aliases for format 2 | postmaster: root 3 | -------------------------------------------------------------------------------- /roles/basic/files/sshd_config: -------------------------------------------------------------------------------- 1 | # $OpenBSD: sshd_config,v 1.101 2017/03/14 07:19:07 djm Exp $ 2 | 3 | # This is the sshd server system-wide configuration file. See 4 | # sshd_config(5) for more information. 5 | 6 | # This sshd was compiled with PATH=/usr/bin:/bin:/usr/sbin:/sbin 7 | 8 | # The strategy used for options in the default sshd_config shipped with 9 | # OpenSSH is to specify options with their default value where 10 | # possible, but leave them commented. Uncommented options override the 11 | # default value. 12 | 13 | #Port 22 14 | #AddressFamily any 15 | #ListenAddress 0.0.0.0 16 | #ListenAddress :: 17 | 18 | #HostKey /etc/ssh/ssh_host_rsa_key 19 | #HostKey /etc/ssh/ssh_host_ecdsa_key 20 | #HostKey /etc/ssh/ssh_host_ed25519_key 21 | 22 | # Ciphers and keying 23 | #RekeyLimit default none 24 | 25 | # Logging 26 | #SyslogFacility AUTH 27 | #LogLevel INFO 28 | 29 | # Authentication: 30 | 31 | #LoginGraceTime 2m 32 | PermitRootLogin no 33 | StrictModes yes 34 | #MaxAuthTries 6 35 | #MaxSessions 10 36 | 37 | #PubkeyAuthentication yes 38 | 39 | # Expect .ssh/authorized_keys2 to be disregarded by default in future. 40 | #AuthorizedKeysFile .ssh/authorized_keys .ssh/authorized_keys2 41 | 42 | #AuthorizedPrincipalsFile none 43 | 44 | #AuthorizedKeysCommand none 45 | #AuthorizedKeysCommandUser nobody 46 | 47 | # For this to work you will also need host keys in /etc/ssh/ssh_known_hosts 48 | #HostbasedAuthentication no 49 | # Change to yes if you don't trust ~/.ssh/known_hosts for 50 | # HostbasedAuthentication 51 | #IgnoreUserKnownHosts no 52 | # Don't read the user's ~/.rhosts and ~/.shosts files 53 | #IgnoreRhosts yes 54 | 55 | # To disable tunneled clear text passwords, change to no here! 56 | #PasswordAuthentication yes 57 | PermitEmptyPasswords no 58 | 59 | # Change to yes to enable challenge-response passwords (beware issues with 60 | # some PAM modules and threads) 61 | ChallengeResponseAuthentication no 62 | 63 | # Kerberos options 64 | #KerberosAuthentication no 65 | #KerberosOrLocalPasswd yes 66 | #KerberosTicketCleanup yes 67 | #KerberosGetAFSToken no 68 | 69 | # GSSAPI options 70 | #GSSAPIAuthentication no 71 | #GSSAPICleanupCredentials yes 72 | #GSSAPIStrictAcceptorCheck yes 73 | #GSSAPIKeyExchange no 74 | 75 | # Set this to 'yes' to enable PAM authentication, account processing, 76 | # and session processing. If this is enabled, PAM authentication will 77 | # be allowed through the ChallengeResponseAuthentication and 78 | # PasswordAuthentication. Depending on your PAM configuration, 79 | # PAM authentication via ChallengeResponseAuthentication may bypass 80 | # the setting of "PermitRootLogin without-password". 
81 | # If you just want the PAM account and session checks to run without 82 | # PAM authentication, then enable this but set PasswordAuthentication 83 | # and ChallengeResponseAuthentication to 'no'. 84 | UsePAM yes 85 | 86 | #AllowAgentForwarding yes 87 | #AllowTcpForwarding yes 88 | #GatewayPorts no 89 | X11Forwarding yes 90 | #X11DisplayOffset 10 91 | #X11UseLocalhost yes 92 | #PermitTTY yes 93 | PrintMotd no 94 | PrintLastLog no 95 | #TCPKeepAlive yes 96 | #UseLogin no 97 | #PermitUserEnvironment no 98 | #Compression delayed 99 | #ClientAliveInterval 0 100 | #ClientAliveCountMax 3 101 | #UseDNS no 102 | #PidFile /var/run/sshd.pid 103 | #MaxStartups 10:30:100 104 | #PermitTunnel no 105 | #ChrootDirectory none 106 | #VersionAddendum none 107 | 108 | # no default banner path 109 | #Banner none 110 | 111 | # Allow client to pass locale environment variables 112 | AcceptEnv LANG LC_* 113 | 114 | # override default of no subsystems 115 | Subsystem sftp /usr/lib/openssh/sftp-server 116 | 117 | # Example of overriding settings on a per-user basis 118 | #Match User anoncvs 119 | # X11Forwarding no 120 | # AllowTcpForwarding no 121 | # PermitTTY no 122 | # ForceCommand cvs server 123 | PasswordAuthentication yes 124 | -------------------------------------------------------------------------------- /roles/basic/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for basic 3 | - name: ensure en locale 4 | become: yes 5 | locale_gen: 6 | name: en_US.UTF-8 7 | state: present 8 | - name: ensure timezone 9 | become: yes 10 | timezone: 11 | name: "{{ timezone }}" 12 | register: tz 13 | - name: sync the timezone of rsyslog 14 | become: yes 15 | when: tz.changed 16 | service: 17 | name: rsyslog 18 | state: restarted 19 | - name: install some apt packs on nodes 20 | become: yes 21 | apt: 22 | name: "{{ aptpacks }}" 23 | state: present 24 | update_cache: yes 25 | - name: ensure ntp sevice is started 26 | become: yes 27 | service: 28 | name: ntp 29 | state: started 30 | enabled: yes 31 | - name: update ntp config file on ln node 32 | become: yes 33 | template: 34 | src: ../templates/ntp.conf 35 | dest: /etc/ntp.conf 36 | owner: root 37 | backup: yes 38 | register: lnntp 39 | when: inventory_hostname in groups['ln'] 40 | - name: restart ntp service on lg node 41 | become: yes 42 | service: 43 | name: ntp 44 | state: restarted 45 | when: inventory_hostname in groups['ln'] and lnntp.changed 46 | - name: update ntp config file on cn nodes 47 | become: yes 48 | template: 49 | src: ntp-cn.conf 50 | dest: /etc/ntp.conf 51 | register: ntpconfig 52 | when: inventory_hostname in groups['cn'] 53 | - name: restart ntp service on cn nodes 54 | become: yes 55 | service: 56 | name: ntp 57 | state: restarted 58 | when: inventory_hostname in groups['cn'] and ntpconfig.changed 59 | - name: update ssh config in all nodes 60 | become: yes 61 | copy: 62 | src: sshd_config 63 | dest: /etc/ssh/sshd_config 64 | backup: yes 65 | register: sshdupdate 66 | - name: ensure ssh server is started 67 | become: yes 68 | service: 69 | name: sshd 70 | state: started 71 | - name: restart ssh 72 | become: yes 73 | service: 74 | name: sshd 75 | state: restarted 76 | when: sshdupdate.changed 77 | - name: remove welcome message 78 | become: yes 79 | file: 80 | path: /etc/update-motd.d/ 81 | mode: '0644' 82 | state: directory 83 | recurse: yes 84 | - name: postfix config 85 | become: yes 86 | template: 87 | src: main.cf 88 | dest: /etc/postfix/main.cf 89 | backup: yes 90 | 
register: postconf 91 | - name: postfix alias file 92 | copy: 93 | src: aliases 94 | dest: /etc/aliases 95 | become: yes 96 | register: alias 97 | - name: ensure postfix running 98 | service: 99 | name: postfix 100 | state: started 101 | enabled: yes 102 | become: yes 103 | - name: new aliases db 104 | become: yes 105 | command: "newaliases" 106 | when: alias.changed 107 | - name: reload postfix 108 | service: 109 | name: postfix 110 | state: reloaded 111 | become: yes 112 | when: postconf.changed 113 | - name: stop snapd 114 | become: yes 115 | service: 116 | name: snapd 117 | state: stopped 118 | enabled: no 119 | -------------------------------------------------------------------------------- /roles/basic/templates/main.cf: -------------------------------------------------------------------------------- 1 | # Debian specific: Specifying a file name will cause the first 2 | # line of that file to be used as the name. The Debian default 3 | # is /etc/mailname. 4 | 5 | smtpd_banner = $myhostname ESMTP $mail_name (Ubuntu) 6 | biff = no 7 | 8 | # appending .domain is the MUA's job. 9 | append_dot_mydomain = no 10 | 11 | # Uncomment the next line to generate "delayed mail" warnings 12 | #delay_warning_time = 4h 13 | 14 | readme_directory = no 15 | 16 | # See http://www.postfix.org/COMPATIBILITY_README.html -- default to 2 on 17 | # fresh installs. 18 | compatibility_level = 2 19 | 20 | # TLS parameters 21 | smtpd_tls_cert_file=/etc/ssl/certs/ssl-cert-snakeoil.pem 22 | smtpd_tls_key_file=/etc/ssl/private/ssl-cert-snakeoil.key 23 | smtpd_use_tls=yes 24 | smtpd_tls_session_cache_database = btree:${data_directory}/smtpd_scache 25 | smtp_tls_session_cache_database = btree:${data_directory}/smtp_scache 26 | 27 | # See /usr/share/doc/postfix/TLS_README.gz in the postfix-doc package for 28 | # information on enabling SSL in the smtp client. 29 | 30 | smtpd_relay_restrictions = permit_mynetworks permit_sasl_authenticated defer_unauth_destination 31 | myhostname = {{ inventory_hostname }}.localdomain 32 | alias_maps = hash:/etc/aliases 33 | alias_database = hash:/etc/aliases 34 | mydestination = {{ inventory_hostname }}.localdomain, {{ inventory_hostname }}, localhost.localdomain, , localhost 35 | relayhost = 36 | mynetworks = 127.0.0.0/8 [::ffff:127.0.0.0]/104 [::1]/128 37 | mailbox_size_limit = 0 38 | recipient_delimiter = + 39 | inet_interfaces = all 40 | inet_protocols = all 41 | myorigin = {{ inventory_hostname }}.localdomain 42 | -------------------------------------------------------------------------------- /roles/basic/templates/ntp-cn.conf: -------------------------------------------------------------------------------- 1 | # /etc/ntp.conf, configuration for ntpd; see ntp.conf(5) for help 2 | 3 | driftfile /var/lib/ntp/ntp.drift 4 | 5 | # Leap seconds definition provided by tzdata 6 | leapfile /usr/share/zoneinfo/leap-seconds.list 7 | 8 | # Enable this if you want statistics to be logged. 9 | #statsdir /var/log/ntpstats/ 10 | 11 | statistics loopstats peerstats clockstats 12 | filegen loopstats file loopstats type day enable 13 | filegen peerstats file peerstats type day enable 14 | filegen clockstats file clockstats type day enable 15 | 16 | # Specify one or more NTP servers. 17 | 18 | # Use servers from the NTP Pool Project. Approved by Ubuntu Technical Board 19 | # on 2011-02-08 (LP: #104525). See http://www.pool.ntp.org/join.html for 20 | # more information. 
21 | # pool 0.ubuntu.pool.ntp.org iburst 22 | # pool 1.ubuntu.pool.ntp.org iburst 23 | # pool 2.ubuntu.pool.ntp.org iburst 24 | # pool 3.ubuntu.pool.ntp.org iburst 25 | 26 | # Use Ubuntu's ntp server as a fallback. 27 | # pool ntp.ubuntu.com 28 | 29 | # Access control configuration; see /usr/share/doc/ntp-doc/html/accopt.html for 30 | # details. The web page 31 | # might also be helpful. 32 | # 33 | # Note that "restrict" applies to both servers and clients, so a configuration 34 | # that might be intended to block requests from certain clients could also end 35 | # up blocking replies from your own upstream servers. 36 | 37 | # By default, exchange time with everybody, but don't allow configuration. 38 | restrict -4 default kod notrap nomodify nopeer noquery limited 39 | restrict -6 default kod notrap nomodify nopeer noquery limited 40 | 41 | # Local users may interrogate the ntp server more closely. 42 | restrict 127.0.0.1 43 | restrict ::1 44 | 45 | # Needed for adding pool entries 46 | restrict source notrap nomodify noquery 47 | 48 | # Clients from this (example!) subnet have unlimited access, but only if 49 | # cryptographically authenticated. 50 | #restrict 192.168.123.0 mask 255.255.255.0 notrust 51 | 52 | 53 | # If you want to provide time to your local subnet, change the next line. 54 | # (Again, the address is an example only.) 55 | #broadcast 192.168.123.255 56 | 57 | # If you want to listen to time broadcasts on your local subnet, de-comment the 58 | # next lines. Please do this only if you trust everybody on the network! 59 | #disable auth 60 | #broadcastclient 61 | 62 | #Changes recquired to use pps synchonisation as explained in documentation: 63 | #http://www.ntp.org/ntpfaq/NTP-s-config-adv.htm#AEN3918 64 | 65 | #server 127.127.8.1 mode 135 prefer # Meinberg GPS167 with PPS 66 | #fudge 127.127.8.1 time1 0.0042 # relative to PPS for my hardware 67 | 68 | #server 127.127.22.1 # ATOM(PPS) 69 | #fudge 127.127.22.1 flag3 1 # enable PPS API 70 | server {{ master_name }} prefer 71 | -------------------------------------------------------------------------------- /roles/basic/templates/ntp.conf: -------------------------------------------------------------------------------- 1 | # /etc/ntp.conf, configuration for ntpd; see ntp.conf(5) for help 2 | 3 | driftfile /var/lib/ntp/ntp.drift 4 | 5 | # Leap seconds definition provided by tzdata 6 | leapfile /usr/share/zoneinfo/leap-seconds.list 7 | 8 | # Enable this if you want statistics to be logged. 9 | #statsdir /var/log/ntpstats/ 10 | 11 | # logconfig =syncstatus +sysstatus 12 | 13 | statistics loopstats peerstats clockstats 14 | filegen loopstats file loopstats type day enable 15 | filegen peerstats file peerstats type day enable 16 | filegen clockstats file clockstats type day enable 17 | 18 | # Specify one or more NTP servers. 19 | 20 | # Use servers from the NTP Pool Project. Approved by Ubuntu Technical Board 21 | # on 2011-02-08 (LP: #104525). See http://www.pool.ntp.org/join.html for 22 | # more information. 23 | # pool 0.ubuntu.pool.ntp.org iburst 24 | # pool 1.ubuntu.pool.ntp.org iburst 25 | # pool 2.ubuntu.pool.ntp.org iburst 26 | # pool 3.ubuntu.pool.ntp.org iburst 27 | 28 | # Use Ubuntu's ntp server as a fallback. 29 | # pool ntp.ubuntu.com 30 | 31 | # Access control configuration; see /usr/share/doc/ntp-doc/html/accopt.html for 32 | # details. The web page 33 | # might also be helpful. 
34 | # 35 | # Note that "restrict" applies to both servers and clients, so a configuration 36 | # that might be intended to block requests from certain clients could also end 37 | # up blocking replies from your own upstream servers. 38 | 39 | # By default, exchange time with everybody, but don't allow configuration. 40 | restrict -4 default kod notrap nomodify nopeer noquery limited 41 | restrict -6 default kod notrap nomodify nopeer noquery limited 42 | 43 | # Local users may interrogate the ntp server more closely. 44 | restrict 127.0.0.1 45 | restrict ::1 46 | 47 | # Needed for adding pool entries 48 | restrict source notrap nomodify noquery 49 | 50 | # Clients from this (example!) subnet have unlimited access, but only if 51 | # cryptographically authenticated. 52 | #restrict 192.168.123.0 mask 255.255.255.0 notrust 53 | 54 | restrict {{ ntp_server }} 55 | restrict {{ ip_range }} mask {{netmask}} nomodify 56 | 57 | # If you want to provide time to your local subnet, change the next line. 58 | # (Again, the address is an example only.) 59 | #broadcast 192.168.123.255 60 | 61 | # If you want to listen to time broadcasts on your local subnet, de-comment the 62 | # next lines. Please do this only if you trust everybody on the network! 63 | #disable auth 64 | #broadcastclient 65 | 66 | #Changes recquired to use pps synchonisation as explained in documentation: 67 | #http://www.ntp.org/ntpfaq/NTP-s-config-adv.htm#AEN3918 68 | 69 | server {{ ntp_server}} prefer 70 | 71 | #server 127.127.8.1 mode 135 prefer # Meinberg GPS167 with PPS 72 | #fudge 127.127.8.1 time1 0.0042 # relative to PPS for my hardware 73 | 74 | #server 127.127.22.1 # ATOM(PPS) 75 | #fudge 127.127.22.1 flag3 1 # enable PPS API 76 | -------------------------------------------------------------------------------- /roles/cgroup/README.md: -------------------------------------------------------------------------------- 1 | CGroup 2 | ========= 3 | 4 | This role is designed to manage cgroup and resource limit by user or app basis. 5 | 6 | Requirements 7 | ------------ 8 | 9 | You must have these user created defined in `cgrules.conf`. For example, ELK stack users restriction must be added after install of ELK stack. 10 | 11 | Templates and Files 12 | -------------- 13 | 14 | cgrules.conf and cgconfig.conf in files dir are very **specific**. You may want to change them based on your needs and your hardware specs. We only provide an example file named after `cgrules.conf.default`, you should rename the file without `.default` before applying the role. 15 | 16 | Distribution related 17 | --------------- 18 | 19 | It is worth noting, the cgroup auto classify system is very different in CentOS. So be careful on this role, if your distribution is not Ubuntu. 
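After the role has run, a quick sanity check is to confirm that the hierarchy from `cgconfig.conf` exists and that `cgrulesengd` is classifying processes of the restricted users. A sketch follows; `test` is the example user from `cgrules.conf.default`, and the group names come from the example `cgconfig.conf`.

```bash
# The groups defined in /etc/cgconfig.conf should show up in the cgroup tree
lscgroup | grep -E 'service|userhard|usersoft'

# The rules daemon started by the role should be running
pgrep -a cgrulesengd

# A process owned by the restricted user "test" should land in usersoft/
cat /proc/$(pgrep -n -u test)/cgroup
```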
20 | -------------------------------------------------------------------------------- /roles/cgroup/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for cgroup -------------------------------------------------------------------------------- /roles/cgroup/files/cgconfig.conf: -------------------------------------------------------------------------------- 1 | group service { 2 | cpuset { 3 | cpuset.cpus="0-13"; 4 | cpuset.mems=0; 5 | } 6 | } 7 | group userhard { 8 | cpuset { 9 | cpuset.cpus="14-27,42-55"; 10 | cpuset.mems=1; 11 | } 12 | } 13 | group usersoft { 14 | cpu { 15 | cpu.shares=500; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /roles/cgroup/files/cgrules.conf.default: -------------------------------------------------------------------------------- 1 | elasticsearch cpuset service/ 2 | kibana cpuset service/ 3 | logstash cpuset service/ 4 | test cpuset usersoft/ 5 | -------------------------------------------------------------------------------- /roles/cgroup/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for cgroup 3 | - name: install cgroup tool 4 | become: yes 5 | apt: 6 | name: cgroup-tools 7 | state: present 8 | when: inventory_hostname in groups['ln'] 9 | - name: copy cgconfig 10 | become: yes 11 | copy: 12 | src: cgconfig.conf 13 | dest: /etc/cgconfig.conf 14 | when: inventory_hostname in groups['ln'] 15 | register: cgconf 16 | - name: update cgroup fs 17 | become: yes 18 | when: inventory_hostname in groups['ln'] and cgconf.changed 19 | command: "/usr/sbin/cgconfigparser -l /etc/cgconfig.conf" 20 | - name: copy cgrules 21 | become: yes 22 | copy: 23 | src: cgrules.conf 24 | dest: /etc/cgrules.conf 25 | when: inventory_hostname in groups['ln'] 26 | register: cgruleconf 27 | - name: restart cgd 28 | become: yes 29 | # command: "kill `ps aux|grep cgrulesengd|head -n 1|awk '{print $2}' && /usr/sbin/cgrulesengd" 30 | shell: "/usr/sbin/cgrulesengd" 31 | when: inventory_hostname in groups['ln'] and cgruleconf.changed 32 | -------------------------------------------------------------------------------- /roles/drivers/README.md: -------------------------------------------------------------------------------- 1 | Drivers 2 | ========= 3 | 4 | Install Nvidia drivers on nodes in [gn] group. 5 | 6 | Role Variables 7 | -------------- 8 | 9 | See defaults/main.yml. One should sepcify the driver versions, and it can be done in a more finer way, such as `driver_name: "nvidia-driver-418=418.56-0ubuntu0~gpu18.04.1"`. 10 | 11 | Notes 12 | -------------- 13 | 14 | Before running this role, you mat need to run `sudo apt-get purge nvidia*` on [gn] to ensure the preinstalled drivers deleted. 15 | 16 | After the installation of GPU drivers, a reboot is necessary. The reboot thing is not controlled by the role, so reboot the machines by hand. 
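Since the role deliberately does not reboot anything, an ad-hoc Ansible run is one convenient way to bounce the GPU nodes and verify the driver afterwards. This is only a sketch: the `reboot` module requires Ansible 2.7+, so on older versions fall back to something like `-m shell -a reboot`.

```bash
# Reboot every node in the [gn] group (prompts for the sudo password)
ansible gn -i hosts -b -K -m reboot

# Once they are back, confirm the driver is loaded
ansible gn -i hosts -m command -a nvidia-smi
```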
17 | -------------------------------------------------------------------------------- /roles/drivers/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for drivers 3 | driver_name: "nvidia-driver-430" 4 | -------------------------------------------------------------------------------- /roles/drivers/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for drivers 3 | - name: install python3 apt for apt repo task 4 | become: yes 5 | apt: 6 | name: python3-apt 7 | state: present 8 | - name: add nvidia repo 9 | become: yes 10 | environment: "{{ env_vars }}" 11 | apt_repository: 12 | repo: "ppa:graphics-drivers/ppa" 13 | state: present 14 | update_cache: yes 15 | when: inventory_hostname in groups['gn'] 16 | - name: install gpu driver 17 | become: yes 18 | apt: 19 | name: "{{ driver_name }}" 20 | state: present 21 | when: inventory_hostname in groups['gn'] 22 | -------------------------------------------------------------------------------- /roles/elastalert/README.md: -------------------------------------------------------------------------------- 1 | Elastalert 2 | ========= 3 | 4 | This role is designed to integrate elastalert by Yelp into ELK stacks. 5 | 6 | Requirements 7 | ------------ 8 | 9 | You must run `elk` role first to set up the ELK stack with elasticsearch database. 10 | Also, you should setup index on elasticsearch by `elastalert-create-index`. 11 | 12 | Templates and Files 13 | -------------- 14 | Files in elastalert/rules should be added by hands, please refer to the doc of elastalert on how to write yaml files for alert rules. 15 | -------------------------------------------------------------------------------- /roles/elastalert/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for elastalert 3 | es_user: elastic 4 | es_pass: 123456notgood 5 | -------------------------------------------------------------------------------- /roles/elastalert/files/elastalert.conf: -------------------------------------------------------------------------------- 1 | [program:elastalert] 2 | command=/usr/bin/elastalert --config /etc/elastalert/config.yaml --verbose 3 | process_name=elastalert 4 | autostart=true 5 | autorestart=true 6 | startsecs=15 7 | stopsignal=INT 8 | stopasgroup=true 9 | killasgroup=true 10 | stderr_logfile=/var/log/elastalert_stderr.log 11 | stderr_logfile_maxbytes=5MB 12 | 13 | -------------------------------------------------------------------------------- /roles/elastalert/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for elastalert 3 | - name: install some necessary apt packages 4 | become: yes 5 | apt: 6 | name: "{{ item }}" 7 | state: present 8 | update_cache: yes 9 | when: inventory_hostname in groups['ln'] 10 | with_items: 11 | - supervisor 12 | - elastalert 13 | - name: ensure supervisord is started 14 | become: yes 15 | service: 16 | name: supervisor 17 | state: started 18 | when: inventory_hostname in groups['ln'] 19 | - name: move elastalert conf to supervisord 20 | become: yes 21 | copy: 22 | src: elastalert.conf 23 | dest: /etc/supervisor/conf.d/elastalert.conf 24 | when: inventory_hostname in groups['ln'] 25 | register: supconf 26 | - name: reload supervisord if conf changed 27 | become: yes 28 | service: 29 | name: supervisor 30 | state: reloaded 31 | when: 
inventory_hostname in groups['ln'] and supconf.changed 32 | - name: create config directory 33 | become: yes 34 | file: 35 | state: directory 36 | path: /etc/elastalert 37 | mode: '700' 38 | when: inventory_hostname in groups['ln'] 39 | - name: copy elastalert configs to etc 40 | become: yes 41 | copy: 42 | src: elastalert/ 43 | dest: /etc/elastalert/ 44 | when: inventory_hostname in groups['ln'] 45 | register: eaconf 46 | - name: render main config to etc 47 | become: yes 48 | template: 49 | src: config.yaml 50 | dest: /etc/elastalert/config.yaml 51 | when: inventory_hostname in groups['ln'] 52 | register: eaconf2 53 | - name: supervisor start elastalert 54 | become: yes 55 | supervisorctl: 56 | name: elastalert 57 | state: started 58 | when: inventory_hostname in groups['ln'] 59 | - name: restart elastalert if conf changed 60 | become: yes 61 | supervisorctl: 62 | name: elastalert 63 | state: restarted 64 | when: inventory_hostname in groups['ln'] and (eaconf.changed or eaconf2.changed) 65 | -------------------------------------------------------------------------------- /roles/elastalert/templates/config.yaml: -------------------------------------------------------------------------------- 1 | # This is the folder that contains the rule yaml files 2 | # Any .yaml file will be loaded as a rule 3 | rules_folder: "/etc/elastalert/rules" 4 | 5 | # How often ElastAlert will query Elasticsearch 6 | # The unit can be anything from weeks to seconds 7 | run_every: 8 | minutes: 10 9 | 10 | # ElastAlert will buffer results from the most recent 11 | # period of time, in case some log sources are not in real time 12 | buffer_time: 13 | minutes: 15 14 | 15 | # The Elasticsearch hostname for metadata writeback 16 | # Note that every rule can have its own Elasticsearch host 17 | es_host: "{{ es_host }}" 18 | 19 | # The Elasticsearch port 20 | es_port: 9200 21 | 22 | es_username: "{{ es_user }}" 23 | 24 | es_password: "{{ es_pass }}" 25 | 26 | # The AWS region to use. Set this when using AWS-managed elasticsearch 27 | #aws_region: us-east-1 28 | 29 | # The AWS profile to use. Use this if you are using an aws-cli profile. 30 | # See http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html 31 | # for details 32 | #profile: test 33 | 34 | # Optional URL prefix for Elasticsearch 35 | #es_url_prefix: elasticsearch 36 | 37 | # Connect with TLS to Elasticsearch 38 | #use_ssl: True 39 | 40 | # Verify TLS certificates 41 | #verify_certs: True 42 | 43 | # GET request with body is the default option for Elasticsearch. 44 | # If it fails for some reason, you can pass 'GET', 'POST' or 'source'. 
45 | # See http://elasticsearch-py.readthedocs.io/en/master/connection.html?highlight=send_get_body_as#transport 46 | # for details 47 | #es_send_get_body_as: GET 48 | 49 | # Option basic-auth username and password for Elasticsearch 50 | #es_username: someusername 51 | #es_password: somepassword 52 | 53 | # Use SSL authentication with client certificates client_cert must be 54 | # a pem file containing both cert and key for client 55 | #verify_certs: True 56 | #ca_certs: /path/to/cacert.pem 57 | #client_cert: /path/to/client_cert.pem 58 | #client_key: /path/to/client_key.key 59 | 60 | # The index on es_host which is used for metadata storage 61 | # This can be a unmapped index, but it is recommended that you run 62 | # elastalert-create-index to set a mapping 63 | writeback_index: elastalert_status 64 | 65 | # If an alert fails for some reason, ElastAlert will retry 66 | # sending the alert until this time period has elapsed 67 | alert_time_limit: 68 | days: 2 69 | 70 | # Custom logging configuration 71 | # If you want to setup your own logging configuration to log into 72 | # files as well or to Logstash and/or modify log levels, use 73 | # the configuration below and adjust to your needs. 74 | # Note: if you run ElastAlert with --verbose/--debug, the log level of 75 | # the "elastalert" logger is changed to INFO, if not already INFO/DEBUG. 76 | #logging: 77 | # version: 1 78 | # incremental: false 79 | # disable_existing_loggers: false 80 | # formatters: 81 | # logline: 82 | # format: '%(asctime)s %(levelname)+8s %(name)+20s %(message)s' 83 | # 84 | # handlers: 85 | # console: 86 | # class: logging.StreamHandler 87 | # formatter: logline 88 | # level: DEBUG 89 | # stream: ext://sys.stderr 90 | # 91 | # file: 92 | # class : logging.FileHandler 93 | # formatter: logline 94 | # level: DEBUG 95 | # filename: elastalert.log 96 | # 97 | # loggers: 98 | # elastalert: 99 | # level: WARN 100 | # handlers: [] 101 | # propagate: true 102 | # 103 | # elasticsearch: 104 | # level: WARN 105 | # handlers: [] 106 | # propagate: true 107 | # 108 | # elasticsearch.trace: 109 | # level: WARN 110 | # handlers: [] 111 | # propagate: true 112 | # 113 | # '': # root logger 114 | # level: WARN 115 | # handlers: 116 | # - console 117 | # - file 118 | # propagate: false 119 | -------------------------------------------------------------------------------- /roles/elk/README.md: -------------------------------------------------------------------------------- 1 | ELK 2 | ========= 3 | 4 | This role is designed to configure a minimal ELK (elasticsearch+logstash+kibana+filebeat) stack for logging system. 5 | It also enables the user authetication of elastisearch. 6 | 7 | Requirements 8 | ------------ 9 | 10 | Java 8 should be installed, which is done by `basic` role. 11 | 12 | python3-passlib should be installed to confige http authetication, which is done by `ganglia` role 13 | 14 | Role Variables 15 | -------------- 16 | 17 | See defaults/main.yml. It is worth noting, when running the role at the first time to configure the whole stack, you should run it with filebeat_init as no and as yes each once. After the first run with filebeat_init as no, you can return to command line set es password by `sudo /usr/share/elasticsearch/bin/elasticsearch-setup-passwords interactive`. After this, run the role with filebeat_init as yes to finish the initial configurations. Afterward, you should keep filebeat_init to no, unless you want to reconfigure modules in filebeats. 
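Concretely, the first-time sequence could look like the following sketch. It assumes the `elk` role is uncommented in site.yml and uses `--extra-vars` to flip `filebeat_init` for the second pass.

```bash
# First pass: install and configure the stack (filebeat_init defaults to no)
ansible-playbook -i hosts site.yml -K

# Set the built-in elasticsearch passwords interactively (on the first es host)
sudo /usr/share/elasticsearch/bin/elasticsearch-setup-passwords interactive

# Second pass: enable filebeat modules and load the ingest pipelines
ansible-playbook -i hosts site.yml -K -e '{"filebeat_init": true}'
```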
18 | 19 | Templates and Files 20 | -------------- 21 | 22 | Notes 23 | -------------- 24 | 25 | The initial configuration is in general referenced on [this post](https://www.digitalocean.com/community/tutorials/how-to-install-elasticsearch-logstash-and-kibana-elastic-stack-on-ubuntu-18-04#step-3-%E2%80%94-installing-and-configuring-logstash) with generalization to multiple distributed filebeats. 26 | 27 | We further add multiple features from the minimal infrastructure: user authetication, multiple modules from filebeat, correct timestamps and no filters in logstash. 28 | 29 | Also note nginx http authentication might be conflict with kibana intrinsic ones, so don't set http auth twice. -------------------------------------------------------------------------------- /roles/elk/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for elk 3 | kibana_web_port: 8080 4 | es_host: 5 | - master 6 | - c9 7 | kb_port: 5601 8 | kb_user: "kibana" 9 | kb_pass: "654321alsobad" 10 | es_user: "elastic" 11 | es_pass: "654321alsobad" 12 | filebeat_init: no 13 | filebeat_ln_modules: 14 | - system 15 | - nginx 16 | - apache2 17 | - mysql 18 | - iptables 19 | filebeat_cn_modules: 20 | - system 21 | -------------------------------------------------------------------------------- /roles/elk/files/apache2.yml: -------------------------------------------------------------------------------- 1 | - module: apache2 2 | # Access logs 3 | access: 4 | enabled: true 5 | 6 | # Set custom paths for the log files. If left empty, 7 | # Filebeat will choose the paths depending on your OS. 8 | #var.paths: 9 | 10 | # Error logs 11 | error: 12 | enabled: true 13 | 14 | # Set custom paths for the log files. If left empty, 15 | # Filebeat will choose the paths depending on your OS. 16 | #var.paths: 17 | var.convert_timezone: true 18 | -------------------------------------------------------------------------------- /roles/elk/files/apache2_pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Pipeline for parsing apache2 error logs", 3 | "processors": [ 4 | { 5 | "grok": { 6 | "field": "message", 7 | "patterns": [ 8 | "\\[%{APACHE_TIME:apache2.error.timestamp}\\] \\[%{LOGLEVEL:apache2.error.level}\\]( \\[client %{IPORHOST:apache2.error.client}\\])? %{GREEDYDATA:apache2.error.message}", 9 | "\\[%{APACHE_TIME:apache2.error.timestamp}\\] \\[%{DATA:apache2.error.module}:%{LOGLEVEL:apache2.error.level}\\] \\[pid %{NUMBER:apache2.error.pid}(:tid %{NUMBER:apache2.error.tid})?\\]( \\[client %{IPORHOST:apache2.error.client}\\])? 
%{GREEDYDATA:apache2.error.message1}" 10 | ], 11 | "pattern_definitions": { 12 | "APACHE_TIME": "%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{YEAR}" 13 | }, 14 | "ignore_missing": true 15 | } 16 | }, 17 | { 18 | "remove":{ 19 | "field": "message" 20 | } 21 | }, 22 | { 23 | "rename": { 24 | "field": "apache2.error.message1", 25 | "target_field": "apache2.error.message", 26 | "ignore_failure": true 27 | } 28 | }, 29 | { 30 | "date": { 31 | "field": "apache2.error.timestamp", 32 | "target_field": "@timestamp", 33 | "formats": ["EEE MMM dd H:m:s yyyy", "EEE MMM dd H:m:s.SSSSSS yyyy"], 34 | {< if .convert_timezone >}"timezone": "{{ beat.timezone }}",{< end >} 35 | "ignore_failure": true 36 | } 37 | }, 38 | { 39 | "remove": { 40 | "field": "apache2.error.timestamp", 41 | "ignore_failure": true 42 | } 43 | } 44 | ], 45 | "on_failure" : [{ 46 | "set" : { 47 | "field" : "error.message", 48 | "value" : "{{ _ingest.on_failure_message }}" 49 | } 50 | }] 51 | } 52 | -------------------------------------------------------------------------------- /roles/elk/files/jvm.options: -------------------------------------------------------------------------------- 1 | -Xmx8g 2 | -Xms8g 3 | -XX:+UseConcMarkSweepGC 4 | -XX:CMSInitiatingOccupancyFraction=75 5 | -XX:+UseCMSInitiatingOccupancyOnly 6 | -Des.networkaddress.cache.ttl=60 7 | -Des.networkaddress.cache.negative.ttl=10 8 | -XX:+AlwaysPreTouch 9 | -Xss1m 10 | -Djava.awt.headless=true 11 | -Dfile.encoding=UTF-8 12 | -Djna.nosys=true 13 | -XX:-OmitStackTraceInFastThrow 14 | -Dio.netty.noUnsafe=true 15 | -Dio.netty.noKeySetOptimization=true 16 | -Dio.netty.recycler.maxCapacityPerThread=0 17 | -Dlog4j.shutdownHookEnabled=false 18 | -Dlog4j2.disable.jmx=true 19 | -Djava.io.tmpdir=${ES_TMPDIR} 20 | -XX:+HeapDumpOnOutOfMemoryError 21 | -XX:HeapDumpPath=/var/lib/elasticsearch 22 | -XX:ErrorFile=/var/log/elasticsearch/hs_err_pid%p.log 23 | 8:-XX:+PrintGCDetails 24 | 8:-XX:+PrintGCDateStamps 25 | 8:-XX:+PrintTenuringDistribution 26 | 8:-XX:+PrintGCApplicationStoppedTime 27 | 8:-Xloggc:/var/log/elasticsearch/gc.log 28 | 8:-XX:+UseGCLogFileRotation 29 | 8:-XX:NumberOfGCLogFiles=32 30 | 8:-XX:GCLogFileSize=64m 31 | 9-:-Xlog:gc*,gc+age=trace,safepoint:file=/var/log/elasticsearch/gc.log:utctime,pid,tags:filecount=32,filesize=64m 32 | 9-:-Djava.locale.providers=COMPAT 33 | 10-:-XX:UseAVX=2 34 | -------------------------------------------------------------------------------- /roles/elk/files/nginx.yml: -------------------------------------------------------------------------------- 1 | - module: nginx 2 | # Access logs 3 | access: 4 | enabled: true 5 | 6 | # Set custom paths for the log files. If left empty, 7 | # Filebeat will choose the paths depending on your OS. 8 | #var.paths: 9 | 10 | # Convert the timestamp to UTC. Requires Elasticsearch >= 6.1. 11 | #var.convert_timezone: true 12 | 13 | # Error logs 14 | error: 15 | enabled: true 16 | 17 | # Set custom paths for the log files. If left empty, 18 | # Filebeat will choose the paths depending on your OS. 19 | #var.paths: 20 | 21 | # Convert the timestamp to UTC. Requires Elasticsearch >= 6.1. 22 | var.convert_timezone: true 23 | -------------------------------------------------------------------------------- /roles/elk/files/system.yml: -------------------------------------------------------------------------------- 1 | - module: system 2 | # Syslog 3 | syslog: 4 | enabled: true 5 | 6 | # Set custom paths for the log files. If left empty, 7 | # Filebeat will choose the paths depending on your OS. 
8 | #var.paths: 9 | 10 | # Convert the timestamp to UTC. Requires Elasticsearch >= 6.1. 11 | var.convert_timezone: true 12 | 13 | # Authorization logs 14 | auth: 15 | enabled: true 16 | 17 | # Set custom paths for the log files. If left empty, 18 | # Filebeat will choose the paths depending on your OS. 19 | #var.paths: 20 | 21 | # Convert the timestamp to UTC. Requires Elasticsearch >= 6.1. 22 | var.convert_timezone: true 23 | -------------------------------------------------------------------------------- /roles/elk/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for elk 3 | - name: install python3 apt for apt repo task 4 | become: yes 5 | apt: 6 | name: python3-apt 7 | state: present 8 | - name: add elastic apt key 9 | become: yes 10 | apt_key: 11 | url: https://artifacts.elastic.co/GPG-KEY-elasticsearch 12 | - name: add elastic repos for apt 13 | become: yes 14 | apt_repository: 15 | repo: "deb https://artifacts.elastic.co/packages/6.x/apt stable main" 16 | state: present 17 | update_cache: yes 18 | - name: install elastic 19 | become: yes 20 | apt: 21 | name: elasticsearch=6.8.0 22 | state: present 23 | when: inventory_hostname in es_host 24 | - name: elastic config 25 | become: yes 26 | template: 27 | src: elasticsearch.yml 28 | dest: /etc/elasticsearch/elasticsearch.yml 29 | owner: root 30 | group: elasticsearch 31 | mode: 0640 32 | when: inventory_hostname in es_host 33 | register: esconf 34 | - name: jvm option 35 | become: yes 36 | copy: 37 | src: jvm.options 38 | dest: /etc/elasticsearch/jvm.options 39 | owner: root 40 | group: elasticsearch 41 | mode: 0640 42 | when: inventory_hostname in es_host 43 | register: jvmconf 44 | - name: ensure cert directory exist 45 | become: yes 46 | file: 47 | path: /etc/elasticsearch/certs 48 | state: directory 49 | when: inventory_hostname in es_host 50 | register: cert 51 | - name: generate ssl ca 52 | when: inventory_hostname == es_host[0] and cert.changed 53 | become: yes 54 | command: '/usr/share/elasticsearch/bin/elasticsearch-certutil ca --pass "" --out /etc/elasticsearch/certs/elastic-stack-ca.p12' 55 | - name: generate ssl cert 56 | when: inventory_hostname == es_host[0] and cert.changed 57 | become: yes 58 | command: '/usr/share/elasticsearch/bin/elasticsearch-certutil cert --ca /etc/elasticsearch/certs/elastic-stack-ca.p12 --pass "" --out {{ role_path }}/files/elastic-certificates.p12 --ca-pass ""' 59 | - name: chown ssl key files 60 | become: yes 61 | when: inventory_hostname == es_host[0] and cert.changed 62 | file: 63 | owner: "{{ admin }}" 64 | path: "{{ role_path }}/files/elastic-certificates.p12" 65 | - name: copy ssl key files 66 | become: yes 67 | copy: 68 | src: elastic-certificates.p12 69 | dest: /etc/elasticsearch/certs/elastic-certificates.p12 70 | when: inventory_hostname in es_host 71 | register: sslconf 72 | - name: ensure es is started 73 | become: yes 74 | service: 75 | name: elasticsearch 76 | state: started 77 | enabled: yes 78 | when: inventory_hostname in es_host 79 | - name: elasticsearch restart 80 | become: yes 81 | service: 82 | name: elasticsearch 83 | state: restarted 84 | enabled: yes 85 | when: inventory_hostname in es_host and (esconf.changed or sslconf.changed or jvmconf.changed) 86 | - name: install kibana 87 | become: yes 88 | apt: 89 | name: kibana=6.8.0 90 | state: present 91 | when: inventory_hostname in groups['ln'] 92 | - name: configure kibana 93 | become: yes 94 | template: 95 | src: kibana.yml 96 | dest: 
/etc/kibana/kibana.yml 97 | mode: 0600 98 | owner: kibana 99 | when: inventory_hostname in groups['ln'] 100 | register: kbconf 101 | - name: ensure kibana is started 102 | become: yes 103 | service: 104 | name: kibana 105 | state: started 106 | enabled: yes 107 | when: inventory_hostname in groups['ln'] 108 | - name: service kibana enable and start 109 | become: yes 110 | service: 111 | name: kibana 112 | state: restarted 113 | enabled: yes 114 | when: inventory_hostname in groups['ln'] and kbconf.changed 115 | - name: install nginx 116 | become: yes 117 | apt: 118 | name: nginx 119 | state: present 120 | when: inventory_hostname in groups['ln'] 121 | - name: delete default nginx page 122 | become: yes 123 | file: 124 | path: /etc/nginx/sites-enabled/default 125 | state: absent 126 | when: inventory_hostname in groups['ln'] 127 | register: rmdefault 128 | - name: nginx kibana server set 129 | template: 130 | src: kibana.conf 131 | dest: /etc/nginx/sites-enabled/kibana.conf 132 | become: yes 133 | when: inventory_hostname in groups['ln'] 134 | register: kibanaserver 135 | - name: ensure nginx is started 136 | become: yes 137 | service: 138 | name: nginx 139 | state: started 140 | enabled: yes 141 | when: inventory_hostname in groups['ln'] 142 | - name: restart nginx 143 | become: yes 144 | service: 145 | name: nginx 146 | state: restarted 147 | enabled: yes 148 | when: inventory_hostname in groups['ln'] and (rmdefault.changed or kibanaserver.changed) 149 | - name: install logstash 150 | become: yes 151 | apt: 152 | name: logstash=1:6.8.0-1 153 | state: present 154 | when: inventory_hostname in groups['ln'] 155 | - name: config logstash 156 | template: 157 | src: "{{ item }}" 158 | dest: "/etc/logstash/conf.d/{{ item }}" 159 | owner: logstash 160 | mode: 0600 161 | become: yes 162 | when: inventory_hostname in groups['ln'] 163 | register: logstashconf 164 | with_items: 165 | - "02-beats-input.conf" 166 | - "30-elasticsearch-output.conf" 167 | - name: ensure logstash is started 168 | service: 169 | name: logstash 170 | state: started 171 | when: inventory_hostname in groups['ln'] 172 | become: yes 173 | - name: restart logstash service 174 | become: yes 175 | service: 176 | name: logstash 177 | state: restarted 178 | enabled: yes 179 | when: inventory_hostname in groups['ln'] and logstashconf.changed 180 | - name: install filebeat 181 | become: yes 182 | apt: 183 | name: filebeat=6.8.0 184 | state: present 185 | - name: config filebeat 186 | become: yes 187 | template: 188 | src: filebeat.yml 189 | dest: /etc/filebeat/filebeat.yml 190 | owner: root 191 | mode: 0600 192 | register: filebeatconf 193 | - name: filebeat modules file 194 | become: yes 195 | copy: 196 | src: "{{ item }}" 197 | dest: "/etc/filebeat/modules.d/{{ item }}" 198 | with_items: 199 | - nginx.yml 200 | - system.yml 201 | - apache2.yml 202 | when: inventory_hostname in groups['ln'] 203 | - name: filebeat modules file on cns 204 | become: yes 205 | copy: 206 | src: "{{ item }}" 207 | dest: "/etc/filebeat/modules.d/{{ item }}" 208 | with_items: 209 | - system.yml 210 | when: inventory_hostname in groups['cn'] 211 | - name: ensure fb is started 212 | become: yes 213 | service: 214 | name: filebeat 215 | state: started 216 | - name: restart filebeat 217 | become: yes 218 | service: 219 | name: filebeat 220 | state: restarted 221 | when: filebeatconf.changed 222 | ## the following tasks is used for initialization of fb 223 | - name: stop filebeat for init 224 | become: yes 225 | service: 226 | name: filebeat 227 | state: stopped 
228 | when: filebeat_init 229 | - name: hack apache2 error pipelines enabling it supporting timezone convert 230 | become: yes 231 | when: inventory_hostname in groups['ln'] and filebeat_init 232 | copy: 233 | dest: '/usr/share/filebeat/module/apache2/error/ingest/pipeline.json' 234 | src: 'apache2_pipeline.json' 235 | - name: delete all exisiting pipelines 236 | shell: "unset http_proxy&&curl -XDELETE -u {{ es_user }}:{{ es_pass }} 'http://{{ es_host[0] }}:9200/_ingest/pipeline/filebeat*'" 237 | when: inventory_hostname in groups['ln'] and filebeat_init 238 | - name: enable filebeat module in login node 239 | become: yes 240 | command: "filebeat modules enable {{item}}" 241 | with_items: 242 | "{{ filebeat_ln_modules }}" 243 | when: inventory_hostname in groups['ln'] and filebeat_init 244 | register: r 245 | changed_when: r.stdout.startswith("Enable") 246 | - name: enable filebeat module in compute nodes 247 | become: yes 248 | command: "filebeat modules enable {{item}}" 249 | with_items: 250 | "{{ filebeat_cn_modules }}" 251 | when: inventory_hostname in groups['cn'] and filebeat_init 252 | register: rcn 253 | changed_when: rcn.stdout.startswith("Enable") 254 | - name: filebeat setup init 255 | become: yes 256 | shell: "unset http_proxy&&filebeat setup -e -E output.logstash.enabled=false -E output.elasticsearch.hosts=[{{ hostvars[es_host[0]]['ip'] }}:9200] -E output.elasticsearch.username={{ es_user }} -E output.elasticsearch.password={{ es_pass }}" 257 | when: inventory_hostname in groups['ln'] and filebeat_init 258 | - name: filebeat add pipelines into ES 259 | become: yes 260 | shell: "unset http_proxy&&filebeat setup --pipelines --modules {{ filebeat_ln_modules|join(',') }} -E output.logstash.enabled=false -E output.elasticsearch.hosts=['{{ es_host[0] }}:9200'] -E output.elasticsearch.username={{ es_user }} -E output.elasticsearch.password={{ es_pass }} -M system.auth.var.convert_timezone=true -M system.syslog.var.convert_timezone=true -M nginx.error.var.convert_timezone=true -M apache2.error.var.convert_timezone=true" 261 | when: inventory_hostname in groups['ln'] and filebeat_init 262 | - name: start filebeat again 263 | become: yes 264 | service: 265 | name: filebeat 266 | state: started 267 | when: filebeat_init 268 | -------------------------------------------------------------------------------- /roles/elk/templates/02-beats-input.conf: -------------------------------------------------------------------------------- 1 | input { 2 | beats { 3 | port => 5044 4 | ssl => false 5 | host => "{{ master_ip }}" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /roles/elk/templates/30-elasticsearch-output.conf: -------------------------------------------------------------------------------- 1 | output { 2 | if [@metadata][pipeline] { 3 | elasticsearch { 4 | hosts => ["{{ es_host[0] }}:9200"] 5 | manage_template => false 6 | index => "%{[@metadata][beat]}-%{[@metadata][version]}-%{+YYYY.MM.dd}" 7 | pipeline => "%{[@metadata][pipeline]}" 8 | user => "{{ es_user }}" 9 | password => "{{ es_pass }}" 10 | } 11 | } else { 12 | elasticsearch { 13 | hosts => ["{{ es_host[0] }}:9200"] 14 | manage_template => false 15 | index => "%{[@metadata][beat]}-%{[@metadata][version]}-%{+YYYY.MM.dd}" 16 | user => "{{ es_user }}" 17 | password => "{{ es_pass }}" 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /roles/elk/templates/elasticsearch.yml: 
-------------------------------------------------------------------------------- 1 | # ======================== Elasticsearch Configuration ========================= 2 | # 3 | # NOTE: Elasticsearch comes with reasonable defaults for most settings. 4 | # Before you set out to tweak and tune the configuration, make sure you 5 | # understand what are you trying to accomplish and the consequences. 6 | # 7 | # The primary way of configuring a node is via this file. This template lists 8 | # the most important settings you may want to configure for a production cluster. 9 | # 10 | # Please consult the documentation for further information on configuration options: 11 | # https://www.elastic.co/guide/en/elasticsearch/reference/index.html 12 | # 13 | # ---------------------------------- Cluster ----------------------------------- 14 | # 15 | # Use a descriptive name for your cluster: 16 | # 17 | cluster.name: hpc2-es 18 | # 19 | # ------------------------------------ Node ------------------------------------ 20 | # 21 | # Use a descriptive name for the node: 22 | # 23 | node.name: {{ inventory_hostname }} 24 | # 25 | # Add custom attributes to the node: 26 | # 27 | #node.attr.rack: r1 28 | # 29 | # ----------------------------------- Paths ------------------------------------ 30 | # 31 | # Path to directory where to store the data (separate multiple locations by comma): 32 | # 33 | path.data: /var/lib/elasticsearch 34 | # 35 | # Path to log files: 36 | # 37 | path.logs: /var/log/elasticsearch 38 | # 39 | # ----------------------------------- Memory ----------------------------------- 40 | # 41 | # Lock the memory on startup: 42 | # 43 | #bootstrap.memory_lock: true 44 | # 45 | # Make sure that the heap size is set to about half the memory available 46 | # on the system and that the owner of the process is allowed to use this 47 | # limit. 48 | # 49 | # Elasticsearch performs poorly when the system is swapping the memory. 50 | # 51 | # ---------------------------------- Network ----------------------------------- 52 | # 53 | # Set the bind address to a specific IP (IPv4 or IPv6): 54 | # 55 | network.host: {{ hostvars[inventory_hostname]['ip'] }} 56 | # 57 | # Set a custom port for HTTP: 58 | # 59 | #http.port: 9200 60 | # 61 | # For more information, consult the network module documentation. 62 | # 63 | # --------------------------------- Discovery ---------------------------------- 64 | # 65 | # Pass an initial list of hosts to perform discovery when new node is started: 66 | # The default list of hosts is ["127.0.0.1", "[::1]"] 67 | # 68 | discovery.zen.ping.unicast.hosts: [{% for h in es_host %} "{{ h }}" {{ "," if not loop.last else "" }} {% endfor %}] 69 | # 70 | # Prevent the "split brain" by configuring the majority of nodes (total number of master-eligible nodes / 2 + 1): 71 | # 72 | discovery.zen.minimum_master_nodes: 2 73 | # 74 | # For more information, consult the zen discovery module documentation. 75 | # 76 | # ---------------------------------- Gateway ----------------------------------- 77 | # 78 | # Block initial recovery after a full cluster restart until N nodes are started: 79 | # 80 | #gateway.recover_after_nodes: 3 81 | # 82 | # For more information, consult the gateway module documentation. 
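# As a concrete illustration (hypothetical host list, not part of this role's defaults): with
# es_host set to [master, c1, c2], the Jinja loop in the Discovery section above renders,
# modulo whitespace, as
#   discovery.zen.ping.unicast.hosts: [ "master", "c1", "c2" ]
# and minimum_master_nodes: 2 then matches the quorum formula quoted above for three
# master-eligible nodes, floor(3 / 2) + 1 = 2.
#
# NOTE: the xpack TLS settings near the end of this template reference a keystore at
# certs/elastic-certificates.p12 (relative to the Elasticsearch config directory), which this
# role does not create. As an illustrative sketch only, assuming the certutil tool bundled with
# the 6.x .deb packages and its default install paths, the keystore could be generated once and
# copied to every node roughly like this:
#   /usr/share/elasticsearch/bin/elasticsearch-certutil ca --out elastic-stack-ca.p12
#   /usr/share/elasticsearch/bin/elasticsearch-certutil cert --ca elastic-stack-ca.p12 --out elastic-certificates.p12
#   mkdir -p /etc/elasticsearch/certs
#   install -o root -g elasticsearch -m 0640 elastic-certificates.p12 /etc/elasticsearch/certs/
# The paths and filenames above are assumptions about the package layout, not part of this role.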
83 | # 84 | # ---------------------------------- Various ----------------------------------- 85 | # 86 | # Require explicit names when deleting indices: 87 | # 88 | #action.destructive_requires_name: true 89 | 90 | xpack.security.enabled: true 91 | xpack.security.transport.ssl.enabled: true 92 | xpack.security.transport.ssl.verification_mode: certificate 93 | xpack.security.transport.ssl.keystore.path: certs/elastic-certificates.p12 94 | xpack.security.transport.ssl.truststore.path: certs/elastic-certificates.p12 95 | 96 | ## miscs 97 | -------------------------------------------------------------------------------- /roles/elk/templates/filebeat.yml: -------------------------------------------------------------------------------- 1 | ###################### Filebeat Configuration Example ######################### 2 | 3 | # This file is an example configuration file highlighting only the most common 4 | # options. The filebeat.reference.yml file from the same directory contains all the 5 | # supported options with more comments. You can use it as a reference. 6 | # 7 | # You can find the full configuration reference here: 8 | # https://www.elastic.co/guide/en/beats/filebeat/index.html 9 | 10 | # For more available modules and options, please see the filebeat.reference.yml sample 11 | # configuration file. 12 | 13 | #=========================== Filebeat inputs ============================= 14 | 15 | filebeat.inputs: 16 | 17 | # Each - is an input. Most options can be set at the input level, so 18 | # you can use different inputs for various configurations. 19 | # Below are the input specific configurations. 20 | 21 | - type: log 22 | 23 | # Change to true to enable this input configuration. 24 | enabled: false 25 | 26 | # Paths that should be crawled and fetched. Glob based paths. 27 | paths: 28 | - /var/log/*.log 29 | #- c:\programdata\elasticsearch\logs\* 30 | 31 | # Exclude lines. A list of regular expressions to match. It drops the lines that are 32 | # matching any regular expression from the list. 33 | #exclude_lines: ['^DBG'] 34 | 35 | # Include lines. A list of regular expressions to match. It exports the lines that are 36 | # matching any regular expression from the list. 37 | #include_lines: ['^ERR', '^WARN'] 38 | 39 | # Exclude files. A list of regular expressions to match. Filebeat drops the files that 40 | # are matching any regular expression from the list. By default, no files are dropped. 41 | #exclude_files: ['.gz$'] 42 | 43 | # Optional additional fields. These fields can be freely picked 44 | # to add additional information to the crawled log files for filtering 45 | #fields: 46 | # level: debug 47 | # review: 1 48 | 49 | ### Multiline options 50 | 51 | # Multiline can be used for log messages spanning multiple lines. This is common 52 | # for Java Stack Traces or C-Line Continuation 53 | 54 | # The regexp Pattern that has to be matched. The example pattern matches all lines starting with [ 55 | #multiline.pattern: ^\[ 56 | 57 | # Defines if the pattern set under pattern should be negated or not. Default is false. 58 | #multiline.negate: false 59 | 60 | # Match can be set to "after" or "before". It is used to define if lines should be append to a pattern 61 | # that was (not) matched before or after or as long as a pattern is not matched based on negate. 
62 | # Note: After is the equivalent to previous and before is the equivalent to to next in Logstash 63 | #multiline.match: after 64 | 65 | 66 | #============================= Filebeat modules =============================== 67 | 68 | filebeat.config.modules: 69 | # Glob pattern for configuration loading 70 | path: ${path.config}/modules.d/*.yml 71 | 72 | # Set to true to enable config reloading 73 | reload.enabled: false 74 | 75 | # Period on which files under path should be checked for changes 76 | #reload.period: 10s 77 | 78 | #==================== Elasticsearch template setting ========================== 79 | 80 | setup.template.settings: 81 | index.number_of_shards: 3 82 | #index.codec: best_compression 83 | #_source.enabled: false 84 | 85 | #================================ General ===================================== 86 | 87 | # The name of the shipper that publishes the network data. It can be used to group 88 | # all the transactions sent by a single shipper in the web interface. 89 | #name: 90 | 91 | # The tags of the shipper are included in their own field with each 92 | # transaction published. 93 | #tags: ["service-X", "web-tier"] 94 | 95 | # Optional fields that you can specify to add additional information to the 96 | # output. 97 | #fields: 98 | # env: staging 99 | 100 | 101 | #============================== Dashboards ===================================== 102 | # These settings control loading the sample dashboards to the Kibana index. Loading 103 | # the dashboards is disabled by default and can be enabled either by setting the 104 | # options here, or by using the `-setup` CLI flag or the `setup` command. 105 | #setup.dashboards.enabled: false 106 | 107 | # The URL from where to download the dashboards archive. By default this URL 108 | # has a value which is computed based on the Beat name and version. For released 109 | # versions, this URL points to the dashboard archive on the artifacts.elastic.co 110 | # website. 111 | #setup.dashboards.url: 112 | 113 | #============================== Kibana ===================================== 114 | 115 | # Starting with Beats version 6.0.0, the dashboards are loaded via the Kibana API. 116 | # This requires a Kibana endpoint configuration. 117 | setup.kibana: 118 | 119 | # Kibana Host 120 | # Scheme and port can be left out and will be set to the default (http and 5601) 121 | # In case you specify and additional path, the scheme is required: http://localhost:5601/path 122 | # IPv6 addresses should always be defined as: https://[2001:db8::1]:5601 123 | #host: "localhost:5601" 124 | 125 | # Kibana Space ID 126 | # ID of the Kibana Space into which the dashboards should be loaded. By default, 127 | # the Default Space will be used. 128 | #space.id: 129 | 130 | #============================= Elastic Cloud ================================== 131 | 132 | # These settings simplify using filebeat with the Elastic Cloud (https://cloud.elastic.co/). 133 | 134 | # The cloud.id setting overwrites the `output.elasticsearch.hosts` and 135 | # `setup.kibana.host` options. 136 | # You can find the `cloud.id` in the Elastic Cloud web UI. 137 | #cloud.id: 138 | 139 | # The cloud.auth setting overwrites the `output.elasticsearch.username` and 140 | # `output.elasticsearch.password` settings. The format is `:`. 141 | #cloud.auth: 142 | 143 | #================================ Outputs ===================================== 144 | 145 | # Configure what output to use when sending the data collected by the beat. 
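# In this role the Elasticsearch output below stays commented out and only the Logstash output
# is active. With a hypothetical master_ip of 10.0.0.1, the template renders that section as
#   output.logstash:
#     hosts: ["10.0.0.1:5044"]
# i.e. events are shipped (without SSL) to the beats input on port 5044 defined in
# 02-beats-input.conf.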
146 | 147 | #-------------------------- Elasticsearch output ------------------------------ 148 | #output.elasticsearch: 149 | # Array of hosts to connect to. 150 | # hosts: ["localhost:9200"] 151 | 152 | # Enabled ilm (beta) to use index lifecycle management instead daily indices. 153 | #ilm.enabled: false 154 | 155 | # Optional protocol and basic auth credentials. 156 | #protocol: "https" 157 | #username: "elastic" 158 | #password: "changeme" 159 | 160 | #----------------------------- Logstash output -------------------------------- 161 | output.logstash: 162 | # The Logstash hosts 163 | hosts: ["{{ master_ip }}:5044"] 164 | ssl: 165 | enabled: false 166 | # Optional SSL. By default is off. 167 | # List of root certificates for HTTPS server verifications 168 | #ssl.certificate_authorities: ["/etc/pki/root/ca.pem"] 169 | 170 | # Certificate for SSL client authentication 171 | #ssl.certificate: "/etc/pki/client/cert.pem" 172 | 173 | # Client Certificate Key 174 | #ssl.key: "/etc/pki/client/cert.key" 175 | 176 | #================================ Processors ===================================== 177 | 178 | # Configure processors to enhance or manipulate events generated by the beat. 179 | 180 | processors: 181 | - add_host_metadata: ~ 182 | - add_cloud_metadata: ~ 183 | 184 | #================================ Logging ===================================== 185 | 186 | # Sets log level. The default log level is info. 187 | # Available log levels are: error, warning, info, debug 188 | #logging.level: debug 189 | 190 | # At debug level, you can selectively enable logging only for some components. 191 | # To enable all selectors use ["*"]. Examples of other selectors are "beat", 192 | # "publish", "service". 193 | #logging.selectors: ["*"] 194 | 195 | #============================== Xpack Monitoring =============================== 196 | # filebeat can export internal metrics to a central Elasticsearch monitoring 197 | # cluster. This requires xpack monitoring to be enabled in Elasticsearch. The 198 | # reporting is disabled by default. 199 | 200 | # Set to true to enable the monitoring reporter. 201 | #xpack.monitoring.enabled: false 202 | 203 | # Uncomment to send the metrics to Elasticsearch. Most settings from the 204 | # Elasticsearch output are accepted here as well. Any setting that is not set is 205 | # automatically inherited from the Elasticsearch output configuration, so if you 206 | # have the Elasticsearch output configured, you can simply uncomment the 207 | # following line. 208 | #xpack.monitoring.elasticsearch: 209 | -------------------------------------------------------------------------------- /roles/elk/templates/kibana.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen {{ kibana_web_port }}; 3 | 4 | location / { 5 | proxy_pass http://localhost:{{ kb_port }}; 6 | proxy_http_version 1.1; 7 | proxy_set_header Upgrade $http_upgrade; 8 | proxy_set_header Connection 'upgrade'; 9 | proxy_set_header Host $host; 10 | proxy_cache_bypass $http_upgrade; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /roles/elk/templates/kibana.yml: -------------------------------------------------------------------------------- 1 | # Set es user and password for kibana to connect 2 | elasticsearch.username: "{{ kb_user }}" 3 | elasticsearch.password: "{{ kb_pass }}" 4 | 5 | # Kibana is served by a back end server. This setting specifies the port to use. 
6 | server.port: {{ kb_port }} 7 | 8 | # Specifies the address to which the Kibana server will bind. IP addresses and host names are both valid values. 9 | # The default is 'localhost', which usually means remote machines will not be able to connect. 10 | # To allow connections from remote users, set this parameter to a non-loopback address. 11 | #server.host: "localhost" 12 | 13 | # Enables you to specify a path to mount Kibana at if you are running behind a proxy. 14 | # Use the `server.rewriteBasePath` setting to tell Kibana if it should remove the basePath 15 | # from requests it receives, and to prevent a deprecation warning at startup. 16 | # This setting cannot end in a slash. 17 | #server.basePath: "" 18 | 19 | # Specifies whether Kibana should rewrite requests that are prefixed with 20 | # `server.basePath` or require that they are rewritten by your reverse proxy. 21 | # This setting was effectively always `false` before Kibana 6.3 and will 22 | # default to `true` starting in Kibana 7.0. 23 | #server.rewriteBasePath: false 24 | 25 | # The maximum payload size in bytes for incoming server requests. 26 | #server.maxPayloadBytes: 1048576 27 | 28 | # The Kibana server's name. This is used for display purposes. 29 | #server.name: "your-hostname" 30 | 31 | # The URLs of the Elasticsearch instances to use for all your queries. 32 | elasticsearch.hosts: ["http://{{ es_host[0] }}:9200"] 33 | 34 | # When this setting's value is true Kibana uses the hostname specified in the server.host 35 | # setting. When the value of this setting is false, Kibana uses the hostname of the host 36 | # that connects to this Kibana instance. 37 | #elasticsearch.preserveHost: true 38 | 39 | # Kibana uses an index in Elasticsearch to store saved searches, visualizations and 40 | # dashboards. Kibana creates a new index if the index doesn't already exist. 41 | #kibana.index: ".kibana" 42 | 43 | # The default application to load. 44 | #kibana.defaultAppId: "home" 45 | 46 | # If your Elasticsearch is protected with basic authentication, these settings provide 47 | # the username and password that the Kibana server uses to perform maintenance on the Kibana 48 | # index at startup. Your Kibana users still need to authenticate with Elasticsearch, which 49 | # is proxied through the Kibana server. 50 | #elasticsearch.username: "user" 51 | #elasticsearch.password: "pass" 52 | 53 | # Enables SSL and paths to the PEM-format SSL certificate and SSL key files, respectively. 54 | # These settings enable SSL for outgoing requests from the Kibana server to the browser. 55 | #server.ssl.enabled: false 56 | #server.ssl.certificate: /path/to/your/server.crt 57 | #server.ssl.key: /path/to/your/server.key 58 | 59 | # Optional settings that provide the paths to the PEM-format SSL certificate and key files. 60 | # These files validate that your Elasticsearch backend uses the same key files. 61 | #elasticsearch.ssl.certificate: /path/to/your/client.crt 62 | #elasticsearch.ssl.key: /path/to/your/client.key 63 | 64 | # Optional setting that enables you to specify a path to the PEM file for the certificate 65 | # authority for your Elasticsearch instance. 66 | #elasticsearch.ssl.certificateAuthorities: [ "/path/to/your/CA.pem" ] 67 | 68 | # To disregard the validity of SSL certificates, change this setting's value to 'none'. 69 | #elasticsearch.ssl.verificationMode: full 70 | 71 | # Time in milliseconds to wait for Elasticsearch to respond to pings. Defaults to the value of 72 | # the elasticsearch.requestTimeout setting. 
73 | #elasticsearch.pingTimeout: 1500 74 | 75 | # Time in milliseconds to wait for responses from the back end or Elasticsearch. This value 76 | # must be a positive integer. 77 | #elasticsearch.requestTimeout: 30000 78 | 79 | # List of Kibana client-side headers to send to Elasticsearch. To send *no* client-side 80 | # headers, set this value to [] (an empty list). 81 | #elasticsearch.requestHeadersWhitelist: [ authorization ] 82 | 83 | # Header names and values that are sent to Elasticsearch. Any custom headers cannot be overwritten 84 | # by client-side headers, regardless of the elasticsearch.requestHeadersWhitelist configuration. 85 | #elasticsearch.customHeaders: {} 86 | 87 | # Time in milliseconds for Elasticsearch to wait for responses from shards. Set to 0 to disable. 88 | #elasticsearch.shardTimeout: 30000 89 | 90 | # Time in milliseconds to wait for Elasticsearch at Kibana startup before retrying. 91 | #elasticsearch.startupTimeout: 5000 92 | 93 | # Logs queries sent to Elasticsearch. Requires logging.verbose set to true. 94 | #elasticsearch.logQueries: false 95 | 96 | # Specifies the path where Kibana creates the process ID file. 97 | #pid.file: /var/run/kibana.pid 98 | 99 | # Enables you specify a file where Kibana stores log output. 100 | #logging.dest: stdout 101 | 102 | # Set the value of this setting to true to suppress all logging output. 103 | #logging.silent: false 104 | 105 | # Set the value of this setting to true to suppress all logging output other than error messages. 106 | #logging.quiet: false 107 | 108 | # Set the value of this setting to true to log all events, including system usage information 109 | # and all requests. 110 | #logging.verbose: false 111 | 112 | # Set the interval in milliseconds to sample system and process performance 113 | # metrics. Minimum is 100ms. Defaults to 5000. 114 | #ops.interval: 5000 115 | 116 | # Specifies locale to be used for all localizable strings, dates and number formats. 117 | #i18n.locale: "en" 118 | -------------------------------------------------------------------------------- /roles/ganglia/README.md: -------------------------------------------------------------------------------- 1 | Ganglia 2 | ========= 3 | 4 | This role is designed to configure ganglia monitoring tools on the cluster. 5 | 6 | 7 | Role Variables 8 | -------------- 9 | 10 | See defaults/main.yml. ganglia_url is the url to access ganglia webfrontend. For example, you can visit the web interface by http:///. 
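With the default ganglia_url of ganglia and a login node reachable as, say, master, the web frontend would be served at http://master/ganglia/ (the hostname here is only an example). The Apache site protects it with HTTP basic auth, so a quick smoke test could look like:

    curl -u admin:123456notgood http://master/ganglia/

where admin / 123456notgood are merely the role defaults (ganglia_http_user and ganglia_http_pass in defaults/main.yml) and should be overridden for any real deployment.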
-------------------------------------------------------------------------------- /roles/ganglia/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for ganglia 3 | ganglia_url: ganglia 4 | ganglia_http_user: admin 5 | ganglia_http_pass: 123456notgood 6 | # the following one is used to configure the general avail monitoring scripts 7 | num_nfs_cn: 4 8 | num_ext_ln: 3 9 | disk_warning: 350 10 | master_nic_no: 5 11 | memory_avail_warning: 5000000 12 | -------------------------------------------------------------------------------- /roles/ganglia/files/cpu_stats.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import traceback 3 | import os 4 | import re 5 | import time 6 | import copy 7 | 8 | METRICS = { 9 | 'time': 0, 10 | 'data': {} 11 | } 12 | 13 | # Got these from /proc/softirqs 14 | softirq_pos = { 15 | 'hi': 1, 16 | 'timer': 2, 17 | 'nettx': 3, 18 | 'netrx': 4, 19 | 'block': 5, 20 | 'blockiopoll': 6, 21 | 'tasklet': 7, 22 | 'sched': 8, 23 | 'hrtimer': 9, 24 | 'rcu': 10 25 | } 26 | 27 | LAST_METRICS = copy.deepcopy(METRICS) 28 | METRICS_CACHE_MAX = 5 29 | 30 | 31 | stat_file = "/proc/stat" 32 | 33 | ############################################################################### 34 | # 35 | ############################################################################### 36 | 37 | 38 | def get_metrics(): 39 | """Return all metrics""" 40 | 41 | global METRICS, LAST_METRICS 42 | 43 | if (time.time() - METRICS['time']) > METRICS_CACHE_MAX: 44 | 45 | try: 46 | file = open(stat_file, 'r') 47 | 48 | except IOError: 49 | return 0 50 | 51 | # convert to dict 52 | metrics = {} 53 | for line in file: 54 | parts = re.split("\s+", line) 55 | metrics[parts[0]] = list(parts[1:]) 56 | 57 | # update cache 58 | LAST_METRICS = copy.deepcopy(METRICS) 59 | METRICS = { 60 | 'time': time.time(), 61 | 'data': metrics 62 | } 63 | 64 | return [METRICS, LAST_METRICS] 65 | 66 | 67 | def get_value(name): 68 | """Return a value for the requested metric""" 69 | 70 | metrics = get_metrics()[0] 71 | 72 | NAME_PREFIX = "cpu_" 73 | 74 | name = name.replace(NAME_PREFIX, "") # remove prefix from name 75 | 76 | try: 77 | result = metrics['data'][name][0] 78 | except StandardError: 79 | result = 0 80 | 81 | return result 82 | 83 | 84 | def get_delta(name): 85 | """Return change over time for the requested metric""" 86 | 87 | # get metrics 88 | [curr_metrics, last_metrics] = get_metrics() 89 | 90 | NAME_PREFIX = "cpu_" 91 | 92 | name = name.replace(NAME_PREFIX, "") # remove prefix from name 93 | 94 | if name == "procs_created": 95 | name = "processes" 96 | 97 | try: 98 | delta = (float(curr_metrics['data'][name][0]) - float(last_metrics['data'][name][0])) / (curr_metrics['time'] - last_metrics['time']) 99 | if delta < 0: 100 | print name + " is less 0" 101 | delta = 0 102 | except KeyError: 103 | delta = 0.0 104 | 105 | return delta 106 | 107 | ############################################################################## 108 | # SoftIRQ has multiple values which are defined in a dictionary at the top 109 | ############################################################################## 110 | 111 | 112 | def get_softirq_delta(name): 113 | """Return change over time for the requested metric""" 114 | 115 | # get metrics 116 | [curr_metrics, last_metrics] = get_metrics() 117 | 118 | NAME_PREFIX = "softirq_" 119 | 120 | name = name[len(NAME_PREFIX):] # remove prefix from name 121 | 122 | index = 
softirq_pos[name] 123 | 124 | try: 125 | delta = (float(curr_metrics['data']['softirq'][index]) - float(last_metrics['data']['softirq'][index])) / (curr_metrics['time'] - last_metrics['time']) 126 | if delta < 0: 127 | print name + " is less 0" 128 | delta = 0 129 | except KeyError: 130 | delta = 0.0 131 | 132 | return delta 133 | 134 | 135 | def create_desc(skel, prop): 136 | d = skel.copy() 137 | for k, v in prop.iteritems(): 138 | d[k] = v 139 | return d 140 | 141 | 142 | def metric_init(params): 143 | global descriptors, metric_map, Desc_Skel 144 | 145 | descriptors = [] 146 | 147 | Desc_Skel = { 148 | 'name' : 'XXX', 149 | 'orig_name' : 'XXX', 150 | 'call_back' : get_delta, 151 | 'time_max' : 60, 152 | 'value_type' : 'float', 153 | 'format' : '%.0f', 154 | 'units' : 'XXX', 155 | 'slope' : 'both', # zero|positive|negative|both 156 | 'description' : '', 157 | 'groups' : 'cpu', 158 | } 159 | 160 | descriptors.append(create_desc(Desc_Skel, { 161 | "name" : "cpu_ctxt", 162 | "units" : "ctxs/sec", 163 | "description": "Context Switches", 164 | })) 165 | 166 | descriptors.append(create_desc(Desc_Skel, { 167 | "name" : "procs_created", 168 | "units" : "proc/sec", 169 | "description": "Number of processes and threads created", 170 | })) 171 | 172 | descriptors.append(create_desc(Desc_Skel, { 173 | "name" : "cpu_intr", 174 | "units" : "intr/sec", 175 | "description": "Interrupts serviced", 176 | })) 177 | 178 | descriptors.append(create_desc(Desc_Skel, { 179 | "name" : "procs_blocked", 180 | "units" : "processes", 181 | "call_back" : get_value, 182 | "description": "Processes blocked", 183 | })) 184 | 185 | descriptors.append(create_desc(Desc_Skel, { 186 | "name" : "softirq", 187 | "units" : "ops/s", 188 | "description": "Soft IRQs", 189 | })) 190 | 191 | descriptors.append(create_desc(Desc_Skel, { 192 | "name" : "softirq_hi", 193 | "units" : "ops/s", 194 | 'groups' : 'softirq', 195 | "call_back" : get_softirq_delta 196 | })) 197 | 198 | descriptors.append(create_desc(Desc_Skel, { 199 | "name" : "softirq_timer", 200 | "units" : "ops/s", 201 | 'groups' : 'softirq', 202 | "call_back" : get_softirq_delta 203 | })) 204 | 205 | descriptors.append(create_desc(Desc_Skel, { 206 | "name" : "softirq_nettx", 207 | "units" : "ops/s", 208 | 'groups' : 'softirq', 209 | "call_back" : get_softirq_delta 210 | })) 211 | 212 | descriptors.append(create_desc(Desc_Skel, { 213 | "name" : "softirq_netrx", 214 | "units" : "ops/s", 215 | 'groups' : 'softirq', 216 | "call_back" : get_softirq_delta 217 | })) 218 | 219 | descriptors.append(create_desc(Desc_Skel, { 220 | "name" : "softirq_block", 221 | "units" : "ops/s", 222 | 'groups' : 'softirq', 223 | "call_back" : get_softirq_delta 224 | })) 225 | 226 | descriptors.append(create_desc(Desc_Skel, { 227 | "name" : "softirq_blockiopoll", 228 | "units" : "ops/s", 229 | 'groups' : 'softirq', 230 | "call_back" : get_softirq_delta 231 | })) 232 | 233 | descriptors.append(create_desc(Desc_Skel, { 234 | "name" : "softirq_tasklet", 235 | "units" : "ops/s", 236 | 'groups' : 'softirq', 237 | "call_back" : get_softirq_delta 238 | })) 239 | 240 | descriptors.append(create_desc(Desc_Skel, { 241 | "name" : "softirq_sched", 242 | "units" : "ops/s", 243 | 'groups' : 'softirq', 244 | "call_back" : get_softirq_delta 245 | })) 246 | 247 | descriptors.append(create_desc(Desc_Skel, { 248 | "name" : "softirq_hrtimer", 249 | "units" : "ops/s", 250 | 'groups' : 'softirq', 251 | "call_back" : get_softirq_delta 252 | })) 253 | 254 | descriptors.append(create_desc(Desc_Skel, { 255 | "name" : 
"softirq_rcu", 256 | "units" : "ops/s", 257 | 'groups' : 'softirq', 258 | "call_back" : get_softirq_delta 259 | })) 260 | 261 | # We need a metric_map that maps metric_name to the index in /proc/meminfo 262 | metric_map = {} 263 | 264 | for d in descriptors: 265 | metric_name = d['name'] 266 | metric_map[metric_name] = {"name": d['orig_name'], "units": d['units']} 267 | 268 | return descriptors 269 | 270 | 271 | def metric_cleanup(): 272 | '''Clean up the metric module.''' 273 | pass 274 | 275 | 276 | #This code is for debugging and unit testing 277 | if __name__ == '__main__': 278 | metric_init({}) 279 | while True: 280 | for d in descriptors: 281 | v = d['call_back'](d['name']) 282 | print '%s = %s' % (d['name'], v) 283 | print 'Sleeping 15 seconds' 284 | time.sleep(5) 285 | -------------------------------------------------------------------------------- /roles/ganglia/files/gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # written by refraction-ray@Aug 2019 3 | 4 | GMETRIC=/usr/bin/gmetric 5 | LOG=/tmp/.gpu.log 6 | 7 | nvidia-smi -q > $LOG 8 | 9 | a=( $(cat $LOG |grep "Fan Speed"|awk '{print $4}') ) 10 | j=1 11 | for i in "${a[@]}"; do 12 | $GMETRIC -t float -n "gpu_${j}_fan_speed" -g "GPU" -u "percent" -v $i 13 | j=$[$j+1] 14 | done 15 | a=( $(cat $LOG |grep "GPU Current Temp"|awk '{print $5}') ) 16 | j=1 17 | for i in "${a[@]}"; do 18 | $GMETRIC -t float -n "gpu_${j}_temp" -g "GPU" -u "Celcius" -v $i 19 | j=$[$j+1] 20 | done 21 | a=( $(cat $LOG |grep "Power Draw"|awk '{print $4}') ) 22 | j=1 23 | for i in "${a[@]}"; do 24 | $GMETRIC -t float -n "gpu_${j}_power_draw" -g "GPU" -u "Watt" -v $i 25 | j=$[$j+1] 26 | done 27 | a=( $(cat $LOG |grep "FB Memory Usage" -A 3|grep "Used"|awk '{print $3}') ) 28 | j=1 29 | for i in "${a[@]}"; do 30 | $GMETRIC -t float -n "gpu_${j}_mem_used" -g "GPU" -u "MiB" -v $i 31 | j=$[$j+1] 32 | done 33 | a=( $(cat $LOG |grep "Utilization" -A 3|grep "Gpu"|awk '{print $3}') ) 34 | j=1 35 | for i in "${a[@]}"; do 36 | $GMETRIC -t float -n "gpu_${j}_utilization" -g "GPU" -u "persent" -v $i 37 | j=$[$j+1] 38 | done 39 | -------------------------------------------------------------------------------- /roles/ganglia/files/netstats.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import time 4 | import copy 5 | import string 6 | 7 | PARAMS = {} 8 | 9 | METRICS = { 10 | 'time' : 0, 11 | 'data' : {} 12 | } 13 | 14 | stats_files = [ "/proc/net/netstat", "/proc/net/snmp" ] 15 | 16 | LAST_METRICS = copy.deepcopy(METRICS) 17 | METRICS_CACHE_MAX = 5 18 | 19 | stats_pos = {} 20 | 21 | def get_metrics(): 22 | """Return all metrics""" 23 | 24 | global METRICS, LAST_METRICS 25 | 26 | if (time.time() - METRICS['time']) > METRICS_CACHE_MAX: 27 | 28 | new_metrics = {} 29 | 30 | for file in stats_files: 31 | try: 32 | file = open(file, 'r') 33 | 34 | except IOError: 35 | return 0 36 | 37 | # convert to dict 38 | metrics = {} 39 | for line in file: 40 | if re.match("(.*): [0-9]", line): 41 | count = 0 42 | metrics = re.split("\s+", line) 43 | metric_group = metrics[0].replace(":", "").lower() 44 | new_metrics[metric_group] = dict() 45 | for value in metrics: 46 | # Skip first 47 | if count > 0 and value >= 0 and count in stats_pos[metric_group]: 48 | metric_name = stats_pos[metric_group][count] 49 | new_metrics[metric_group][metric_name] = value 50 | count += 1 51 | 52 | file.close() 53 | 54 | # update cache 55 | LAST_METRICS = copy.deepcopy(METRICS) 56 | 
METRICS = { 57 | 'time': time.time(), 58 | 'data': new_metrics 59 | } 60 | 61 | return [METRICS, LAST_METRICS] 62 | 63 | 64 | def get_value(name): 65 | """Return a value for the requested metric""" 66 | 67 | metrics = get_metrics()[0] 68 | 69 | name = name[len(NAME_PREFIX):] # remove prefix from name 70 | 71 | try: 72 | result = metrics['data'][name] 73 | except StandardError: 74 | result = 0 75 | 76 | return result 77 | 78 | 79 | def get_delta(name): 80 | """Return change over time for the requested metric""" 81 | 82 | # get metrics 83 | [curr_metrics, last_metrics] = get_metrics() 84 | 85 | parts = name.split("_") 86 | group = parts[0] 87 | metric = "_".join(parts[1:]) 88 | 89 | try: 90 | delta = (float(curr_metrics['data'][group][metric]) - float(last_metrics['data'][group][metric])) /(curr_metrics['time'] - last_metrics['time']) 91 | if delta < 0: 92 | print name + " is less 0" 93 | delta = 0 94 | except KeyError: 95 | delta = 0.0 96 | 97 | return delta 98 | 99 | 100 | def get_tcploss_percentage(name): 101 | 102 | # get metrics 103 | [curr_metrics, last_metrics] = get_metrics() 104 | 105 | try: 106 | pct = 100 * (float(curr_metrics['data']['tcpext']["tcploss"]) - float(last_metrics["data"]['tcpext']["tcploss"])) / (float(curr_metrics['data']['tcp']['outsegs']) + float(curr_metrics['data']['tcp']['insegs']) - float(last_metrics['data']['tcp']['insegs']) - float(last_metrics['data']['tcp']['outsegs'])) 107 | if pct < 0: 108 | print name + " is less 0" 109 | pct = 0 110 | except KeyError: 111 | pct = 0.0 112 | except ZeroDivisionError: 113 | pct = 0.0 114 | 115 | return pct 116 | 117 | def get_retrans_percentage(name): 118 | 119 | # get metrics 120 | [curr_metrics, last_metrics] = get_metrics() 121 | 122 | try: 123 | pct = 100 * (float(curr_metrics['data']['tcp']["retranssegs"]) - float(last_metrics['data']['tcp']["retranssegs"])) / (float(curr_metrics['data']['tcp']['outsegs']) + float(curr_metrics['data']['tcp']['insegs']) - float(last_metrics['data']['tcp']['insegs']) - float(last_metrics['data']['tcp']['outsegs'])) 124 | if pct < 0: 125 | print name + " is less 0" 126 | pct = 0 127 | except KeyError: 128 | pct = 0.0 129 | except ZeroDivisionError: 130 | pct = 0.0 131 | return pct 132 | 133 | 134 | def create_desc(skel, prop): 135 | d = skel.copy() 136 | for k,v in prop.iteritems(): 137 | d[k] = v 138 | return d 139 | 140 | def metric_init(params): 141 | global descriptors, metric_map, Desc_Skel 142 | 143 | descriptors = [] 144 | 145 | Desc_Skel = { 146 | 'name' : 'XXX', 147 | 'call_back' : get_delta, 148 | 'time_max' : 60, 149 | 'value_type' : 'float', 150 | 'format' : '%.5f', 151 | 'units' : 'count/s', 152 | 'slope' : 'both', # zero|positive|negative|both 153 | 'description' : 'XXX', 154 | 'groups' : 'XXX', 155 | } 156 | 157 | #################################################################################### 158 | # Let's figure out what metrics are available 159 | # 160 | # Read /proc/net/netstat 161 | #################################################################################### 162 | for file in stats_files: 163 | try: 164 | file = open(file, 'r') 165 | 166 | except IOError: 167 | return 0 168 | 169 | # Find mapping 170 | for line in file: 171 | # Lines with 172 | if not re.match("(.*): [0-9]", line): 173 | count = 0 174 | mapping = re.split("\s+", line) 175 | metric_group = mapping[0].replace(":", "").lower() 176 | stats_pos[metric_group] = dict() 177 | for metric in mapping: 178 | # Skip first 179 | if count > 0 and metric != "": 180 | lowercase_metric = metric.lower() 
181 | stats_pos[metric_group][count] = lowercase_metric 182 | count += 1 183 | 184 | file.close() 185 | 186 | for group in stats_pos: 187 | for item in stats_pos[group]: 188 | descriptors.append(create_desc(Desc_Skel, { 189 | "name" : group + "_" + stats_pos[group][item], 190 | "description": stats_pos[group][item], 191 | 'groups' : group 192 | })) 193 | 194 | descriptors.append(create_desc(Desc_Skel, { 195 | "name" : "tcpext_" + "tcploss_percentage", 196 | "call_back" : get_tcploss_percentage, 197 | "description": "TCP percentage loss, tcploss / insegs + outsegs", 198 | "units" : "pct", 199 | 'groups' : 'tcpext' 200 | })) 201 | 202 | descriptors.append(create_desc(Desc_Skel, { 203 | "name" : "tcp_" + "retrans_percentage", 204 | "call_back" : get_retrans_percentage, 205 | "description": "TCP retrans percentage, retranssegs / insegs + outsegs", 206 | "units" : "pct", 207 | 'groups' : 'tcp' 208 | })) 209 | 210 | return descriptors 211 | 212 | def metric_cleanup(): 213 | '''Clean up the metric module.''' 214 | pass 215 | 216 | #This code is for debugging and unit testing 217 | if __name__ == '__main__': 218 | descriptors = metric_init(PARAMS) 219 | while True: 220 | for d in descriptors: 221 | v = d['call_back'](d['name']) 222 | print '%s = %s' % (d['name'], v) 223 | print 'Sleeping 15 seconds' 224 | time.sleep(15) 225 | -------------------------------------------------------------------------------- /roles/ganglia/files/temg.sh: -------------------------------------------------------------------------------- 1 | SENSORS=/usr/bin/sensors 2 | GMETRIC=/usr/bin/gmetric 3 | 4 | let count=0 5 | sum=0.0 6 | for temp in $($SENSORS | grep "^Core" | grep -e '+.*C' | cut -f 2 -d '+' | cut -f 1 -d ' ' | sed 's/°C//'); do 7 | sum=$(echo $sum+$temp | bc) 8 | # echo $temp, $sum 9 | let count+=1 10 | done 11 | temp=$(echo "$sum/$count" | bc) 12 | 13 | $GMETRIC -t float -n "cpu_temp" -u "Celcius" -v $temp 14 | 15 | if [ $temp -gt 89 ]; then 16 | logger -t gangalia-monitor temperature_too_high: $temp 17 | fi 18 | 19 | -------------------------------------------------------------------------------- /roles/ganglia/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for ganglia 3 | - name: install necessary package for ganglia on ln node 4 | become: yes 5 | apt: 6 | name: "{{ item }}" 7 | state: present 8 | when: inventory_hostname in groups['ln'] 9 | with_items: 10 | - ganglia-monitor=3.6.0-7ubuntu2 11 | - ganglia-webfrontend 12 | - gmetad 13 | - ganglia-monitor-python=3.6.0-7ubuntu2 14 | - lm-sensors 15 | - name: install package for ganglia on cn nodes 16 | become: yes 17 | apt: 18 | name: "{{ item }}" 19 | state: present 20 | with_items: 21 | - ganglia-monitor=3.6.0-7ubuntu2 22 | - ganglia-monitor-python=3.6.0-7ubuntu2 23 | - lm-sensors 24 | when: inventory_hostname in groups['cn'] 25 | - name: hack on netstats.py 26 | become: yes 27 | copy: 28 | src: "{{ item }}" 29 | dest: "/usr/lib/ganglia/python_modules/{{ item }}" 30 | register: pymd 31 | with_items: 32 | - netstats.py 33 | - cpu_stats.py 34 | - name: config gmond in cn nodes 35 | become: yes 36 | template: 37 | src: gmond-cn.conf 38 | dest: /etc/ganglia/gmond.conf 39 | backup: yes 40 | when: inventory_hostname in groups['cn'] 41 | register: gmondc 42 | - name: ensure gmond is started 43 | become: yes 44 | service: 45 | name: ganglia-monitor 46 | state: started 47 | when: inventory_hostname in groups['cn'] 48 | - name: restart gmond in cn nodes 49 | become: yes 50 | service: 51 | name: 
ganglia-monitor 52 | state: restarted 53 | when: inventory_hostname in groups['cn'] and (gmondc.changed or pymd.changed) 54 | - name: config gmond in ln nodes 55 | become: yes 56 | template: 57 | src: gmond.conf 58 | dest: /etc/ganglia/gmond.conf 59 | backup: yes 60 | when: inventory_hostname in groups['ln'] 61 | register: gmondl 62 | - name: ensure gmond started in ln 63 | become: yes 64 | service: 65 | name: ganglia-monitor 66 | state: started 67 | enabled: yes 68 | when: inventory_hostname in groups['ln'] 69 | - name: restart gmond in ln node 70 | become: yes 71 | service: 72 | name: ganglia-monitor 73 | state: restarted 74 | when: inventory_hostname in groups['ln'] and (gmondl.changed or pymd.changed) 75 | - name: config gmetad in ln nodes 76 | become: yes 77 | template: 78 | src: gmetad.conf 79 | dest: /etc/ganglia/gmetad.conf 80 | when: inventory_hostname in groups['ln'] 81 | register: gmetadc 82 | - name: ensure gmetad started in ln 83 | become: yes 84 | service: 85 | name: gmetad 86 | state: started 87 | enabled: yes 88 | when: inventory_hostname in groups['ln'] 89 | - name: restart gmetad in ln node 90 | become: yes 91 | service: 92 | name: gmetad 93 | state: restarted 94 | when: inventory_hostname in groups['ln'] and gmetadc.changed 95 | - name: config web interface of ganglia 96 | become: yes 97 | template: 98 | src: ganglia.conf 99 | dest: /etc/apache2/sites-enabled/ganglia.conf 100 | when: inventory_hostname in groups['ln'] 101 | register: gweb 102 | - name: patch on ganglia webfront cluster view 103 | lineinfile: 104 | path: /usr/share/ganglia-webfrontend/cluster_view.php 105 | regexp: "context_metrics =" 106 | line: " $context_metrics = array();" 107 | become: yes 108 | when: inventory_hostname in groups['ln'] 109 | - name: install passlib for password protection on apache 110 | become: yes 111 | apt: 112 | name: python3-passlib 113 | state: present 114 | when: inventory_hostname in groups['ln'] 115 | - name: setup apache passwd 116 | htpasswd: 117 | path: /etc/apache2/.htpasswd 118 | name: "{{ ganglia_http_user }}" 119 | password: "{{ ganglia_http_pass }}" 120 | owner: root 121 | group: www-data 122 | mode: 0640 123 | become: yes 124 | when: inventory_hostname in groups['ln'] 125 | register: passwd 126 | - name: ensure apache is started 127 | become: yes 128 | service: 129 | name: apache2 130 | state: started 131 | enabled: yes 132 | when: inventory_hostname in groups['ln'] 133 | - name: restart apache2 134 | become: yes 135 | service: 136 | name: apache2 137 | state: restarted 138 | when: inventory_hostname in groups['ln'] and (gweb.changed or passwd.changed) 139 | - name: copy the temperature sensor script 140 | copy: 141 | src: temg.sh 142 | dest: "{{ lookup('env', 'HOME') }}/.temg.sh" 143 | when: inventory_hostname in groups['ln'] 144 | - name: add cpu temperature to crontab 145 | cron: 146 | job: "/bin/bash {{ lookup('env', 'HOME') }}/.temg.sh" 147 | name: "temperature monitoring" 148 | become: yes 149 | - name: copy gpu monitoring script 150 | copy: 151 | src: gpu.sh 152 | dest: "{{ lookup('env', 'HOME') }}/.gpu.sh" 153 | when: inventory_hostname in groups['ln'] 154 | - name: add gpu script to crontab 155 | become: yes 156 | when: inventory_hostname in groups['gn'] 157 | cron: 158 | minute: "*/2" 159 | job: "/bin/bash {{ lookup('env', 'HOME') }}/.gpu.sh" 160 | name: "gpu monitoring" 161 | - name: copy avail monitoring script 162 | template: 163 | src: avail-monitor.sh 164 | dest: "{{ lookup('env', 'HOME') }}/.avail-monitor.sh" 165 | when: inventory_hostname in 
groups['ln'] 166 | - name: add avail script to crontab on all nodes 167 | become: yes 168 | cron: 169 | minute: "*/3" 170 | job: "/bin/bash {{ lookup('env', 'HOME') }}/.avail-monitor.sh" 171 | name: "avail monitoring" 172 | - name: no mail from crontab 173 | become: yes 174 | cronvar: 175 | name: "MAILTO" 176 | value: '""' 177 | -------------------------------------------------------------------------------- /roles/ganglia/templates/avail-monitor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # refraction-ray @ AUG 2019 3 | # Work in Progress 4 | ############################ 5 | ## general variables 6 | hostname=$(hostname) 7 | tag="avail-monitor-warning" 8 | 9 | ## slurm nodes availability 10 | errornum=$(sinfo -N|grep "down\|alloc\*\|idle\*\|mix\*"|wc -l) 11 | if [ $errornum -ne 0 ]; then 12 | logger -t $tag "${errornum} nodes are down in slurm" 13 | fi 14 | 15 | ## wanip fixed check 16 | if [ "$hostname" == "{{ master_name }}" ]; then 17 | wanip=$(/sbin/ifconfig|grep {{ ln_wan_nic }} -A1|grep inet|awk '{print $2}') 18 | if [ "$wanip" != "{{ wan_ip }}" ]; then 19 | logger -t $tag "wan ip for master has changed to ${wanip}" 20 | fi 21 | fi 22 | 23 | ## filesystem mount check 24 | if [ "$hostname" == "{{ master_name}}" ]; then 25 | extnum=$(df -T|grep ext4|wc -l) 26 | if [ $extnum -ne {{ num_ext_ln }} ]; then 27 | logger -t $tag "ext4 mount is missing" 28 | fi 29 | elif [ "$hostname" == "c8" ]; then 30 | nfsnum=$(df -T|grep nfs4|wc -l) 31 | if [ $nfsnum -ne 3 ]; then 32 | logger -t $tag "nfs4 mount is missing" 33 | fi 34 | else 35 | nfsnum=$(df -T|grep nfs4|wc -l) 36 | if [ $nfsnum -ne {{ num_nfs_cn }} ]; then 37 | logger -t $tag "nfs4 mount is missing" 38 | fi 39 | fi 40 | 41 | ## load check on master 42 | if [ "$hostname" == "{{ master_name}}" ]; then 43 | l=$(uptime|awk '{print $12}') 44 | if (( $(echo "$l > 80" |bc -l) )); then 45 | logger -t $tag "the load on master cpu is too high: ${l}" 46 | fi 47 | fi 48 | 49 | ## check the disk usage 50 | st=$(df -HT|grep "/dev/sda2"|awk '{print $4}') 51 | stn=${st%?} 52 | if [ $stn -gt {{ disk_warning }} ]; then 53 | logger -t $tag "the disk usage is too much" 54 | fi 55 | 56 | ## check the memory usage on master 57 | if [ "$hostname" == "{{ master_name}}" ]; then 58 | mem=$(free|grep Mem|awk '{print $7}') 59 | if [ $mem -lt {{ memory_avail_warning }} ]; then 60 | logger -t $tag "available memory is draining on master!" 61 | fi 62 | fi 63 | 64 | ## check nics 65 | if [ "$hostname" == "{{ master_name }}" ]; then 66 | nonic=$(/sbin/ifconfig|grep flags|wc -l) 67 | if [ $nonic -ne {{ master_nic_no }} ]; then 68 | logger -t $tag "nics seem to be missing on master!" 69 | fi 70 | fi 71 | 72 | ## check zombie processes 73 | nozo=$(ps axo pid=,stat=|grep Z|wc -l) 74 | if [ $nozo -gt 1 ]; then 75 | logger -t $tag "there are several zombie processes! 
on ${hostname}" 76 | fi 77 | -------------------------------------------------------------------------------- /roles/ganglia/templates/ganglia.conf: -------------------------------------------------------------------------------- 1 | Alias /{{ ganglia_url }} /usr/share/ganglia-webfrontend 2 | 3 | 4 | AllowOverride All 5 | Order allow,deny 6 | Allow from all 7 | Deny from none 8 | AuthType Basic 9 | AuthName "Restricted Content" 10 | AuthUserFile /etc/apache2/.htpasswd 11 | Require valid-user 12 | 13 | 14 | -------------------------------------------------------------------------------- /roles/ganglia/templates/gmetad.conf: -------------------------------------------------------------------------------- 1 | # This is an example of a Ganglia Meta Daemon configuration file 2 | # http://ganglia.sourceforge.net/ 3 | # 4 | # 5 | #------------------------------------------------------------------------------- 6 | # Setting the debug_level to 1 will keep daemon in the forground and 7 | # show only error messages. Setting this value higher than 1 will make 8 | # gmetad output debugging information and stay in the foreground. 9 | # default: 0 10 | # debug_level 10 11 | # 12 | #------------------------------------------------------------------------------- 13 | # What to monitor. The most important section of this file. 14 | # 15 | # The data_source tag specifies either a cluster or a grid to 16 | # monitor. If we detect the source is a cluster, we will maintain a complete 17 | # set of RRD databases for it, which can be used to create historical 18 | # graphs of the metrics. If the source is a grid (it comes from another gmetad), 19 | # we will only maintain summary RRDs for it. 20 | # 21 | # Format: 22 | data_source "{{ cluster_name }}" 60 {{ master_name }} 23 | # 24 | # The keyword 'data_source' must immediately be followed by a unique 25 | # string which identifies the source, then an optional polling interval in 26 | # seconds. The source will be polled at this interval on average. 27 | # If the polling interval is omitted, 15sec is asssumed. 28 | # 29 | # If you choose to set the polling interval to something other than the default, 30 | # note that the web frontend determines a host as down if its TN value is less 31 | # than 4 * TMAX (20sec by default). Therefore, if you set the polling interval 32 | # to something around or greater than 80sec, this will cause the frontend to 33 | # incorrectly display hosts as down even though they are not. 34 | # 35 | # A list of machines which service the data source follows, in the 36 | # format ip:port, or name:port. If a port is not specified then 8649 37 | # (the default gmond port) is assumed. 38 | # default: There is no default value 39 | # 40 | # data_source "my cluster" 10 localhost my.machine.edu:8649 1.2.3.5:8655 41 | # data_source "my grid" 50 1.3.4.7:8655 grid.org:8651 grid-backup.org:8651 42 | # data_source "another source" 1.3.4.7:8655 1.3.4.8 43 | 44 | # data_source "my cluster" localhost 45 | 46 | # 47 | # Round-Robin Archives 48 | # You can specify custom Round-Robin archives here (defaults are listed below) 49 | # 50 | # Old Default RRA: Keep 1 hour of metrics at 15 second resolution. 1 day at 6 minute 51 | # RRAs "RRA:AVERAGE:0.5:1:244" "RRA:AVERAGE:0.5:24:244" "RRA:AVERAGE:0.5:168:244" "RRA:AVERAGE:0.5:672:244" \ 52 | # "RRA:AVERAGE:0.5:5760:374" 53 | # New Default RRA 54 | # Keep 5856 data points at 15 second resolution assuming 15 second (default) polling. 
That's 1 day 55 | # Two weeks of data points at 1 minute resolution (average) 56 | #RRAs "RRA:AVERAGE:0.5:1:5856" "RRA:AVERAGE:0.5:4:20160" "RRA:AVERAGE:0.5:40:52704" 57 | 58 | # 59 | #------------------------------------------------------------------------------- 60 | # Scalability mode. If on, we summarize over downstream grids, and respect 61 | # authority tags. If off, we take on 2.5.0-era behavior: we do not wrap our output 62 | # in tags, we ignore all tags we see, and always assume 63 | # we are the "authority" on data source feeds. This approach does not scale to 64 | # large groups of clusters, but is provided for backwards compatibility. 65 | # default: on 66 | # scalable off 67 | # 68 | #------------------------------------------------------------------------------- 69 | # The name of this Grid. All the data sources above will be wrapped in a GRID 70 | # tag with this name. 71 | # default: unspecified 72 | # gridname "MyGrid" 73 | # 74 | #------------------------------------------------------------------------------- 75 | # The authority URL for this grid. Used by other gmetads to locate graphs 76 | # for our data sources. Generally points to a ganglia/ 77 | # website on this machine. 78 | # default: "http://hostname/ganglia/", 79 | # where hostname is the name of this machine, as defined by gethostname(). 80 | # authority "http://mycluster.org/newprefix/" 81 | # 82 | #------------------------------------------------------------------------------- 83 | # List of machines this gmetad will share XML with. Localhost 84 | # is always trusted. 85 | # default: There is no default value 86 | # trusted_hosts 127.0.0.1 169.229.50.165 my.gmetad.org 87 | # 88 | #------------------------------------------------------------------------------- 89 | # If you want any host which connects to the gmetad XML to receive 90 | # data, then set this value to "on" 91 | # default: off 92 | # all_trusted on 93 | # 94 | #------------------------------------------------------------------------------- 95 | # If you don't want gmetad to setuid then set this to off 96 | # default: on 97 | # setuid off 98 | # 99 | #------------------------------------------------------------------------------- 100 | # User gmetad will setuid to (defaults to "nobody") 101 | # default: "nobody" 102 | # setuid_username "nobody" 103 | # 104 | #------------------------------------------------------------------------------- 105 | # Umask to apply to created rrd files and grid directory structure 106 | # default: 0 (files are public) 107 | # umask 022 108 | # 109 | #------------------------------------------------------------------------------- 110 | # The port gmetad will answer requests for XML 111 | # default: 8651 112 | # xml_port 8651 113 | # 114 | #------------------------------------------------------------------------------- 115 | # The port gmetad will answer queries for XML. This facility allows 116 | # simple subtree and summation views of the XML tree. 
117 | # default: 8652 118 | # interactive_port 8652 119 | # 120 | #------------------------------------------------------------------------------- 121 | # The number of threads answering XML requests 122 | # default: 4 123 | # server_threads 10 124 | # 125 | #------------------------------------------------------------------------------- 126 | # Where gmetad stores its round-robin databases 127 | # default: "/var/lib/ganglia/rrds" 128 | # rrd_rootdir "/some/other/place" 129 | # 130 | #------------------------------------------------------------------------------- 131 | # List of metric prefixes this gmetad will not summarize at cluster or grid level. 132 | # default: There is no default value 133 | # unsummarized_metrics diskstat CPU 134 | # 135 | #------------------------------------------------------------------------------- 136 | # In earlier versions of gmetad, hostnames were handled in a case 137 | # sensitive manner 138 | # If your hostname directories have been renamed to lower case, 139 | # set this option to 0 to disable backward compatibility. 140 | # From version 3.2, backwards compatibility will be disabled by default. 141 | # default: 1 (for gmetad < 3.2) 142 | # default: 0 (for gmetad >= 3.2) 143 | case_sensitive_hostnames 0 144 | 145 | #------------------------------------------------------------------------------- 146 | # It is now possible to export all the metrics collected by gmetad directly to 147 | # graphite by setting the following attributes. 148 | # 149 | # The hostname or IP address of the Graphite server 150 | # default: unspecified 151 | # carbon_server "my.graphite.box" 152 | # 153 | # The port and protocol on which Graphite is listening 154 | # default: 2003 155 | # carbon_port 2003 156 | # 157 | # default: tcp 158 | # carbon_protocol udp 159 | # 160 | # **Deprecated in favor of graphite_path** A prefix to prepend to the 161 | # metric names exported by gmetad. Graphite uses dot- 162 | # separated paths to organize and refer to metrics. 163 | # default: unspecified 164 | # graphite_prefix "datacenter1.gmetad" 165 | # 166 | # A user-definable graphite path. Graphite uses dot- 167 | # separated paths to organize and refer to metrics. 168 | # For reverse compatibility graphite_prefix will be prepended to this 169 | # path, but this behavior should be considered deprecated. 
170 | # This path may include 3 variables that will be replaced accordingly: 171 | # %s -> source (cluster name) 172 | # %h -> host (host name) 173 | # %m -> metric (metric name) 174 | # default: graphite_prefix.%s.%h.%m 175 | # graphite_path "datacenter1.gmetad.%s.%h.%m 176 | 177 | # Number of milliseconds gmetad will wait for a response from the graphite server 178 | # default: 500 179 | # carbon_timeout 500 180 | 181 | #------------------------------------------------------------------------------- 182 | # Memcached configuration (if it has been compiled in) 183 | # Format documentation at http://docs.libmemcached.org/libmemcached_configuration.html 184 | # default: "" 185 | # memcached_parameters "--SERVER=127.0.0.1" 186 | # 187 | 188 | -------------------------------------------------------------------------------- /roles/ganglia/templates/gmond-cn.conf: -------------------------------------------------------------------------------- 1 | /* This configuration is as close to 2.5.x default behavior as possible 2 | The values closely match ./gmond/metric.h definitions in 2.5.x */ 3 | globals { 4 | daemonize = yes 5 | setuid = yes 6 | user = ganglia 7 | debug_level = 0 8 | max_udp_msg_len = 1472 9 | mute = no 10 | deaf = yes 11 | host_dmax = 0 /*secs */ 12 | cleanup_threshold = 300 /*secs */ 13 | gexec = no 14 | send_metadata_interval = 30 15 | } 16 | 17 | /* If a cluster attribute is specified, then all gmond hosts are wrapped inside 18 | * of a tag. If you do not specify a cluster tag, then all will 19 | * NOT be wrapped inside of a tag. */ 20 | cluster { 21 | name = "{{ cluster_name }}" 22 | owner = "unspecified" 23 | latlong = "unspecified" 24 | url = "unspecified" 25 | } 26 | 27 | /* The host section describes attributes of the host, like the location */ 28 | host { 29 | location = "unspecified" 30 | } 31 | 32 | /* Feel free to specify as many udp_send_channels as you like. Gmond 33 | used to only support having a single channel */ 34 | udp_send_channel { 35 | host = {{ master_name }} 36 | port = 8649 37 | ttl = 1 38 | } 39 | /* You can specify as many udp_recv_channels as you like as well. */ 40 | 41 | /* You can specify as many tcp_accept_channels as you like to share 42 | an xml description of the state of the cluster */ 43 | tcp_accept_channel { 44 | port = 8649 45 | } 46 | 47 | /* Each metrics module that is referenced by gmond must be specified and 48 | loaded. If the module has been statically linked with gmond, it does not 49 | require a load path. However all dynamically loadable modules must include 50 | a load path. */ 51 | modules { 52 | module { 53 | name = "core_metrics" 54 | } 55 | module { 56 | name = "cpu_module" 57 | path = "/usr/lib/ganglia/modcpu.so" 58 | } 59 | module { 60 | name = "disk_module" 61 | path = "/usr/lib/ganglia/moddisk.so" 62 | } 63 | module { 64 | name = "load_module" 65 | path = "/usr/lib/ganglia/modload.so" 66 | } 67 | module { 68 | name = "mem_module" 69 | path = "/usr/lib/ganglia/modmem.so" 70 | } 71 | module { 72 | name = "net_module" 73 | path = "/usr/lib/ganglia/modnet.so" 74 | } 75 | module { 76 | name = "proc_module" 77 | path = "/usr/lib/ganglia/modproc.so" 78 | } 79 | module { 80 | name = "sys_module" 81 | path = "/usr/lib/ganglia/modsys.so" 82 | } 83 | } 84 | 85 | include ('/etc/ganglia/conf.d/*.conf') 86 | 87 | 88 | /* The old internal 2.5.x metric array has been replaced by the following 89 | collection_group directives. 
What follows is the default behavior for 90 | collecting and sending metrics that is as close to 2.5.x behavior as 91 | possible. */ 92 | 93 | /* This collection group will cause a heartbeat (or beacon) to be sent every 94 | 20 seconds. In the heartbeat is the GMOND_STARTED data which expresses 95 | the age of the running gmond. */ 96 | collection_group { 97 | collect_once = yes 98 | time_threshold = 20 99 | metric { 100 | name = "heartbeat" 101 | } 102 | } 103 | 104 | /* This collection group will send general info about this host every 1200 secs. 105 | This information doesn't change between reboots and is only collected once. */ 106 | collection_group { 107 | collect_once = yes 108 | time_threshold = 1200 109 | metric { 110 | name = "cpu_num" 111 | title = "CPU Count" 112 | } 113 | metric { 114 | name = "cpu_speed" 115 | title = "CPU Speed" 116 | } 117 | metric { 118 | name = "mem_total" 119 | title = "Memory Total" 120 | } 121 | /* Should this be here? Swap can be added/removed between reboots. */ 122 | metric { 123 | name = "swap_total" 124 | title = "Swap Space Total" 125 | } 126 | metric { 127 | name = "boottime" 128 | title = "Last Boot Time" 129 | } 130 | metric { 131 | name = "machine_type" 132 | title = "Machine Type" 133 | } 134 | metric { 135 | name = "os_name" 136 | title = "Operating System" 137 | } 138 | metric { 139 | name = "os_release" 140 | title = "Operating System Release" 141 | } 142 | metric { 143 | name = "location" 144 | title = "Location" 145 | } 146 | } 147 | 148 | /* This collection group will send the status of gexecd for this host every 300 secs */ 149 | /* Unlike 2.5.x the default behavior is to report gexecd OFF. */ 150 | collection_group { 151 | collect_once = yes 152 | time_threshold = 300 153 | metric { 154 | name = "gexec" 155 | title = "Gexec Status" 156 | } 157 | } 158 | 159 | /* This collection group will collect the CPU status info every 20 secs. 160 | The time threshold is set to 90 seconds. In honesty, this time_threshold could be 161 | set significantly higher to reduce unneccessary network chatter. */ 162 | collection_group { 163 | collect_every = 20 164 | time_threshold = 90 165 | /* CPU status */ 166 | metric { 167 | name = "cpu_user" 168 | value_threshold = "1.0" 169 | title = "CPU User" 170 | } 171 | metric { 172 | name = "cpu_system" 173 | value_threshold = "1.0" 174 | title = "CPU System" 175 | } 176 | metric { 177 | name = "cpu_idle" 178 | value_threshold = "5.0" 179 | title = "CPU Idle" 180 | } 181 | metric { 182 | name = "cpu_nice" 183 | value_threshold = "1.0" 184 | title = "CPU Nice" 185 | } 186 | metric { 187 | name = "cpu_aidle" 188 | value_threshold = "5.0" 189 | title = "CPU aidle" 190 | } 191 | metric { 192 | name = "cpu_wio" 193 | value_threshold = "1.0" 194 | title = "CPU wio" 195 | } 196 | /* The next two metrics are optional if you want more detail... 197 | ... since they are accounted for in cpu_system. 
198 | metric { 199 | name = "cpu_intr" 200 | value_threshold = "1.0" 201 | title = "CPU intr" 202 | } 203 | metric { 204 | name = "cpu_sintr" 205 | value_threshold = "1.0" 206 | title = "CPU sintr" 207 | } 208 | */ 209 | } 210 | 211 | collection_group { 212 | collect_every = 20 213 | time_threshold = 90 214 | /* Load Averages */ 215 | metric { 216 | name = "load_one" 217 | value_threshold = "1.0" 218 | title = "One Minute Load Average" 219 | } 220 | metric { 221 | name = "load_five" 222 | value_threshold = "1.0" 223 | title = "Five Minute Load Average" 224 | } 225 | metric { 226 | name = "load_fifteen" 227 | value_threshold = "1.0" 228 | title = "Fifteen Minute Load Average" 229 | } 230 | } 231 | 232 | /* This group collects the number of running and total processes */ 233 | collection_group { 234 | collect_every = 80 235 | time_threshold = 950 236 | metric { 237 | name = "proc_run" 238 | value_threshold = "1.0" 239 | title = "Total Running Processes" 240 | } 241 | metric { 242 | name = "proc_total" 243 | value_threshold = "1.0" 244 | title = "Total Processes" 245 | } 246 | } 247 | 248 | /* This collection group grabs the volatile memory metrics every 40 secs and 249 | sends them at least every 180 secs. This time_threshold can be increased 250 | significantly to reduce unneeded network traffic. */ 251 | collection_group { 252 | collect_every = 40 253 | time_threshold = 180 254 | metric { 255 | name = "mem_free" 256 | value_threshold = "1024.0" 257 | title = "Free Memory" 258 | } 259 | metric { 260 | name = "mem_shared" 261 | value_threshold = "1024.0" 262 | title = "Shared Memory" 263 | } 264 | metric { 265 | name = "mem_buffers" 266 | value_threshold = "1024.0" 267 | title = "Memory Buffers" 268 | } 269 | metric { 270 | name = "mem_cached" 271 | value_threshold = "1024.0" 272 | title = "Cached Memory" 273 | } 274 | metric { 275 | name = "swap_free" 276 | value_threshold = "1024.0" 277 | title = "Free Swap Space" 278 | } 279 | } 280 | 281 | collection_group { 282 | collect_every = 40 283 | time_threshold = 300 284 | metric { 285 | name = "bytes_out" 286 | value_threshold = 4096 287 | title = "Bytes Sent" 288 | } 289 | metric { 290 | name = "bytes_in" 291 | value_threshold = 4096 292 | title = "Bytes Received" 293 | } 294 | metric { 295 | name = "pkts_in" 296 | value_threshold = 256 297 | title = "Packets Received" 298 | } 299 | metric { 300 | name = "pkts_out" 301 | value_threshold = 256 302 | title = "Packets Sent" 303 | } 304 | } 305 | 306 | /* Different than 2.5.x default since the old config made no sense */ 307 | collection_group { 308 | collect_every = 1800 309 | time_threshold = 3600 310 | metric { 311 | name = "disk_total" 312 | value_threshold = 1.0 313 | title = "Total Disk Space" 314 | } 315 | } 316 | 317 | collection_group { 318 | collect_every = 40 319 | time_threshold = 180 320 | metric { 321 | name = "disk_free" 322 | value_threshold = 1.0 323 | title = "Disk Space Available" 324 | } 325 | metric { 326 | name = "part_max_used" 327 | value_threshold = 1.0 328 | title = "Maximum Disk Space Used" 329 | } 330 | } 331 | 332 | -------------------------------------------------------------------------------- /roles/ganglia/templates/gmond.conf: -------------------------------------------------------------------------------- 1 | /* This configuration is as close to 2.5.x default behavior as possible 2 | The values closely match ./gmond/metric.h definitions in 2.5.x */ 3 | globals { 4 | daemonize = yes 5 | setuid = yes 6 | user = ganglia 7 | debug_level = 0 8 | max_udp_msg_len = 1472 
9 | mute = no 10 | deaf = no 11 | host_dmax = 0 /*secs */ 12 | cleanup_threshold = 300 /*secs */ 13 | gexec = no 14 | send_metadata_interval = 30 15 | } 16 | 17 | /* If a cluster attribute is specified, then all gmond hosts are wrapped inside 18 | * of a tag. If you do not specify a cluster tag, then all will 19 | * NOT be wrapped inside of a tag. */ 20 | cluster { 21 | name = "{{ cluster_name }}" 22 | owner = "unspecified" 23 | latlong = "unspecified" 24 | url = "unspecified" 25 | } 26 | 27 | /* The host section describes attributes of the host, like the location */ 28 | host { 29 | location = "unspecified" 30 | } 31 | 32 | /* Feel free to specify as many udp_send_channels as you like. Gmond 33 | used to only support having a single channel */ 34 | udp_send_channel { 35 | host = {{ master_name }} 36 | port = 8649 37 | ttl = 1 38 | } 39 | 40 | 41 | /* You can specify as many udp_recv_channels as you like as well. */ 42 | udp_recv_channel { 43 | port = 8649 44 | } 45 | 46 | /* You can specify as many tcp_accept_channels as you like to share 47 | an xml description of the state of the cluster */ 48 | tcp_accept_channel { 49 | port = 8649 50 | } 51 | 52 | /* Each metrics module that is referenced by gmond must be specified and 53 | loaded. If the module has been statically linked with gmond, it does not 54 | require a load path. However all dynamically loadable modules must include 55 | a load path. */ 56 | modules { 57 | module { 58 | name = "core_metrics" 59 | } 60 | module { 61 | name = "cpu_module" 62 | path = "/usr/lib/ganglia/modcpu.so" 63 | } 64 | module { 65 | name = "disk_module" 66 | path = "/usr/lib/ganglia/moddisk.so" 67 | } 68 | module { 69 | name = "load_module" 70 | path = "/usr/lib/ganglia/modload.so" 71 | } 72 | module { 73 | name = "mem_module" 74 | path = "/usr/lib/ganglia/modmem.so" 75 | } 76 | module { 77 | name = "net_module" 78 | path = "/usr/lib/ganglia/modnet.so" 79 | } 80 | module { 81 | name = "proc_module" 82 | path = "/usr/lib/ganglia/modproc.so" 83 | } 84 | module { 85 | name = "sys_module" 86 | path = "/usr/lib/ganglia/modsys.so" 87 | } 88 | } 89 | 90 | include ('/etc/ganglia/conf.d/*.conf') 91 | 92 | 93 | /* The old internal 2.5.x metric array has been replaced by the following 94 | collection_group directives. What follows is the default behavior for 95 | collecting and sending metrics that is as close to 2.5.x behavior as 96 | possible. */ 97 | 98 | /* This collection group will cause a heartbeat (or beacon) to be sent every 99 | 20 seconds. In the heartbeat is the GMOND_STARTED data which expresses 100 | the age of the running gmond. */ 101 | collection_group { 102 | collect_once = yes 103 | time_threshold = 20 104 | metric { 105 | name = "heartbeat" 106 | } 107 | } 108 | 109 | /* This collection group will send general info about this host every 1200 secs. 110 | This information doesn't change between reboots and is only collected once. */ 111 | collection_group { 112 | collect_once = yes 113 | time_threshold = 1200 114 | metric { 115 | name = "cpu_num" 116 | title = "CPU Count" 117 | } 118 | metric { 119 | name = "cpu_speed" 120 | title = "CPU Speed" 121 | } 122 | metric { 123 | name = "mem_total" 124 | title = "Memory Total" 125 | } 126 | /* Should this be here? Swap can be added/removed between reboots. 
*/ 127 | metric { 128 | name = "swap_total" 129 | title = "Swap Space Total" 130 | } 131 | metric { 132 | name = "boottime" 133 | title = "Last Boot Time" 134 | } 135 | metric { 136 | name = "machine_type" 137 | title = "Machine Type" 138 | } 139 | metric { 140 | name = "os_name" 141 | title = "Operating System" 142 | } 143 | metric { 144 | name = "os_release" 145 | title = "Operating System Release" 146 | } 147 | metric { 148 | name = "location" 149 | title = "Location" 150 | } 151 | } 152 | 153 | /* This collection group will send the status of gexecd for this host every 300 secs */ 154 | /* Unlike 2.5.x the default behavior is to report gexecd OFF. */ 155 | collection_group { 156 | collect_once = yes 157 | time_threshold = 300 158 | metric { 159 | name = "gexec" 160 | title = "Gexec Status" 161 | } 162 | } 163 | 164 | /* This collection group will collect the CPU status info every 20 secs. 165 | The time threshold is set to 90 seconds. In honesty, this time_threshold could be 166 | set significantly higher to reduce unneccessary network chatter. */ 167 | collection_group { 168 | collect_every = 20 169 | time_threshold = 90 170 | /* CPU status */ 171 | metric { 172 | name = "cpu_user" 173 | value_threshold = "1.0" 174 | title = "CPU User" 175 | } 176 | metric { 177 | name = "cpu_system" 178 | value_threshold = "1.0" 179 | title = "CPU System" 180 | } 181 | metric { 182 | name = "cpu_idle" 183 | value_threshold = "5.0" 184 | title = "CPU Idle" 185 | } 186 | metric { 187 | name = "cpu_nice" 188 | value_threshold = "1.0" 189 | title = "CPU Nice" 190 | } 191 | metric { 192 | name = "cpu_aidle" 193 | value_threshold = "5.0" 194 | title = "CPU aidle" 195 | } 196 | metric { 197 | name = "cpu_wio" 198 | value_threshold = "1.0" 199 | title = "CPU wio" 200 | } 201 | /* The next two metrics are optional if you want more detail... 202 | ... since they are accounted for in cpu_system. 203 | metric { 204 | name = "cpu_intr" 205 | value_threshold = "1.0" 206 | title = "CPU intr" 207 | } 208 | metric { 209 | name = "cpu_sintr" 210 | value_threshold = "1.0" 211 | title = "CPU sintr" 212 | } 213 | */ 214 | } 215 | 216 | collection_group { 217 | collect_every = 20 218 | time_threshold = 90 219 | /* Load Averages */ 220 | metric { 221 | name = "load_one" 222 | value_threshold = "1.0" 223 | title = "One Minute Load Average" 224 | } 225 | metric { 226 | name = "load_five" 227 | value_threshold = "1.0" 228 | title = "Five Minute Load Average" 229 | } 230 | metric { 231 | name = "load_fifteen" 232 | value_threshold = "1.0" 233 | title = "Fifteen Minute Load Average" 234 | } 235 | } 236 | 237 | /* This group collects the number of running and total processes */ 238 | collection_group { 239 | collect_every = 80 240 | time_threshold = 950 241 | metric { 242 | name = "proc_run" 243 | value_threshold = "1.0" 244 | title = "Total Running Processes" 245 | } 246 | metric { 247 | name = "proc_total" 248 | value_threshold = "1.0" 249 | title = "Total Processes" 250 | } 251 | } 252 | 253 | /* This collection group grabs the volatile memory metrics every 40 secs and 254 | sends them at least every 180 secs. This time_threshold can be increased 255 | significantly to reduce unneeded network traffic. 
*/ 256 | collection_group { 257 | collect_every = 40 258 | time_threshold = 180 259 | metric { 260 | name = "mem_free" 261 | value_threshold = "1024.0" 262 | title = "Free Memory" 263 | } 264 | metric { 265 | name = "mem_shared" 266 | value_threshold = "1024.0" 267 | title = "Shared Memory" 268 | } 269 | metric { 270 | name = "mem_buffers" 271 | value_threshold = "1024.0" 272 | title = "Memory Buffers" 273 | } 274 | metric { 275 | name = "mem_cached" 276 | value_threshold = "1024.0" 277 | title = "Cached Memory" 278 | } 279 | metric { 280 | name = "swap_free" 281 | value_threshold = "1024.0" 282 | title = "Free Swap Space" 283 | } 284 | } 285 | 286 | collection_group { 287 | collect_every = 40 288 | time_threshold = 300 289 | metric { 290 | name = "bytes_out" 291 | value_threshold = 4096 292 | title = "Bytes Sent" 293 | } 294 | metric { 295 | name = "bytes_in" 296 | value_threshold = 4096 297 | title = "Bytes Received" 298 | } 299 | metric { 300 | name = "pkts_in" 301 | value_threshold = 256 302 | title = "Packets Received" 303 | } 304 | metric { 305 | name = "pkts_out" 306 | value_threshold = 256 307 | title = "Packets Sent" 308 | } 309 | } 310 | 311 | /* Different than 2.5.x default since the old config made no sense */ 312 | collection_group { 313 | collect_every = 1800 314 | time_threshold = 3600 315 | metric { 316 | name = "disk_total" 317 | value_threshold = 1.0 318 | title = "Total Disk Space" 319 | } 320 | } 321 | 322 | collection_group { 323 | collect_every = 40 324 | time_threshold = 180 325 | metric { 326 | name = "disk_free" 327 | value_threshold = 1.0 328 | title = "Disk Space Available" 329 | } 330 | metric { 331 | name = "part_max_used" 332 | value_threshold = 1.0 333 | title = "Maximum Disk Space Used" 334 | } 335 | } 336 | 337 | -------------------------------------------------------------------------------- /roles/mpi/README.md: -------------------------------------------------------------------------------- 1 | Mpi 2 | ========= 3 | 4 | This role is designed to install system wide mpi library. 5 | 6 | 7 | Role Variables 8 | -------------- 9 | 10 | See defaults/main.yml. The default packages is openmpi. You could change to other mpi implementations if you need. -------------------------------------------------------------------------------- /roles/mpi/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for mpi 3 | mpi_packages: 4 | - libopenmpi-dev=2.1.1-8 5 | - openmpi-bin=2.1.1-8 6 | - g++ 7 | # - libboost-all-dev 8 | -------------------------------------------------------------------------------- /roles/mpi/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for mpi 3 | - name: install necessary packages for openmpi 4 | become: yes 5 | apt: 6 | name: "{{ item }}" 7 | state: present 8 | with_items: "{{ mpi_packages }}" 9 | - name: change openmpi config 10 | become: yes 11 | lineinfile: 12 | path: /etc/openmpi/openmpi-mca-params.conf 13 | regexp: "^btl_base_warn_component_unused = 0" 14 | line: "btl_base_warn_component_unused = 0" 15 | -------------------------------------------------------------------------------- /roles/network/README.md: -------------------------------------------------------------------------------- 1 | Network 2 | ========= 3 | 4 | This role is designed to configure the network of the cluster including master as dns and dhcp server in LAN, NAT enable network on compute nodes and some proxy settings. 
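For orientation, the NAT part of this role boils down to enabling IP forwarding and adding an SNAT rule on the login node. A minimal shell sketch of the equivalent manual commands, with placeholder addresses standing in for the real `ip_range`/`mask`/`wan_ip` group variables:

```
# Enable forwarding (the role persists this via /etc/sysctl.conf)
sysctl -w net.ipv4.ip_forward=1
# SNAT traffic leaving the cluster LAN through the login node's WAN address
# (192.168.1.0/24 and 10.0.0.2 are examples only)
iptables -t nat -A POSTROUTING -s 192.168.1.0/24 ! -d 192.168.1.0/24 -j SNAT --to-source 10.0.0.2
```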
5 | 6 | Requirements 7 | ------------ 8 | 9 | No requirement as long as admin user account are consistent across machines. It is actually the first role to run, which make the cluster accessible and network connected. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | See defaults/main.yml. You need to specify set_proxy to no, if your system is not air-gapped and free to connect to the internet. 15 | 16 | Templates and Files 17 | -------------- 18 | sources.list in files dir changes the default apt source to a mirror. You may want to change this behavior depending on your own network conditions. 19 | 20 | hosts in templates dir has some extra host items. You may want to delete or change this depending on your system. -------------------------------------------------------------------------------- /roles/network/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for network 3 | set_proxy: yes 4 | use_tinc: no 5 | tinc_if: tinc 6 | mtu: 8500 # reason for non 9000: see https://www.intel.com/content/www/us/en/design/products-and-solutions/networking-and-io/ethernet-connection-i218/technical-library.html?grouping=rdc%20Content%20Types&sort=title:asc specification update: point 3. 7 | -------------------------------------------------------------------------------- /roles/network/files/20auto-upgrades: -------------------------------------------------------------------------------- 1 | APT::Periodic::Update-Package-Lists "0"; 2 | APT::Periodic::Unattended-Upgrade "0"; 3 | -------------------------------------------------------------------------------- /roles/network/files/sources.list: -------------------------------------------------------------------------------- 1 | # 默认注释了源码镜像以提高 apt update 速度,如有需要可自行取消注释 2 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic main restricted universe multiverse 3 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic main restricted universe multiverse 4 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-updates main restricted universe multiverse 5 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-updates main restricted universe multiverse 6 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-backports main restricted universe multiverse 7 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-backports main restricted universe multiverse 8 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-security main restricted universe multiverse 9 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-security main restricted universe multiverse 10 | 11 | # 预发布软件源,不建议启用 12 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-proposed main restricted universe multiverse 13 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-proposed main restricted universe multiverse 14 | -------------------------------------------------------------------------------- /roles/network/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for network 3 | - name: update host on ln node 4 | become: yes 5 | template: 6 | backup: yes 7 | owner: root 8 | src: hosts 9 | dest: /etc/hosts 10 | when: inventory_hostname in groups['ln'] 11 | - name: gather facts after host is specified 12 | setup: 13 | - name: net plan on ln node 14 | template: 15 | backup: yes 16 | owner: root 17 | src: ../templates/60-config.yaml 18 | dest: /etc/netplan/60-config.yaml 19 | register: 
lnnetplan 20 | become: yes 21 | when: inventory_hostname in groups['ln'] 22 | - name: netplan apply 23 | become: yes 24 | when: inventory_hostname in groups['ln'] and lnnetplan.changed 25 | command: netplan apply 26 | - name: change proxy of apt 27 | become: yes 28 | template: 29 | src: apt.conf 30 | dest: /etc/apt/apt.conf 31 | backup: yes 32 | when: set_proxy 33 | - name: change the apt source mirror to tuna 34 | become: yes 35 | copy: 36 | src: sources.list 37 | dest: /etc/apt/sources.list 38 | owner: root 39 | backup: yes 40 | - name: turn off unattended updates 41 | become: yes 42 | copy: 43 | src: 20auto-upgrades 44 | dest: /etc/apt/apt.conf.d/20auto-upgrades 45 | - name: dnsmasq install 46 | become: yes 47 | apt: 48 | update_cache: yes 49 | name: dnsmasq 50 | state: present 51 | when: inventory_hostname in groups['ln'] 52 | - name: dnsmasq config 53 | become: yes 54 | template: 55 | owner: root 56 | src: ../templates/dnsmasq.conf 57 | dest: /etc/dnsmasq.conf 58 | backup: yes 59 | register: lndnsmasq 60 | when: inventory_hostname in groups['ln'] 61 | - name: dnsmasq host 62 | become: yes 63 | template: 64 | backup: yes 65 | src: map.hosts 66 | dest: /etc/dnsmasq.d/map.hosts 67 | when: inventory_hostname in groups['ln'] 68 | register: maphost 69 | - name: ensure dnsmasq service is started 70 | become: yes 71 | service: 72 | name: dnsmasq 73 | state: started 74 | when: inventory_hostname in groups['ln'] 75 | - name: dnsmasq service restart 76 | become: yes 77 | service: 78 | name: dnsmasq 79 | state: restarted 80 | when: inventory_hostname in groups['ln'] and (lndnsmasq.changed or maphost.changed) 81 | - name: enable ip forward on ln nodes 82 | become: yes 83 | lineinfile: 84 | path: /etc/sysctl.conf 85 | regexp: "net.ipv4.ip_forward" 86 | line: "net.ipv4.ip_forward=1" 87 | backup: yes 88 | register: lnforward 89 | when: inventory_hostname in groups['ln'] 90 | - name: sysctl the ip forward feature 91 | become: yes 92 | command: sysctl -p /etc/sysctl.conf 93 | when: inventory_hostname in groups['ln'] and lnforward.changed 94 | - name: iptables for snat on tinc part 95 | when: inventory_hostname in groups['ln'] and use_tinc 96 | iptables: 97 | table: nat 98 | chain: POSTROUTING 99 | destination: "!{{ ip_range }}/{{ mask }}" 100 | source: "{{ ip_range }}/{{ mask }}" 101 | jump: SNAT 102 | out_interface: "{{ tinc_if }}" 103 | to_source: "{{ tinc_ip }}" 104 | become: yes 105 | - name: iptables for snat on ln nodes 106 | iptables: 107 | table: nat 108 | chain: POSTROUTING 109 | destination: "!{{ ip_range }}/{{ mask }}" 110 | source: "{{ ip_range }}/{{ mask }}" 111 | jump: SNAT 112 | to_source: "{{ wan_ip }}" 113 | become: yes 114 | when: inventory_hostname in groups['ln'] 115 | - name: netplan config on cn nodes 116 | become: yes 117 | when: inventory_hostname in groups['cn'] 118 | template: 119 | backup: yes 120 | owner: root 121 | src: ../templates/70-config.yaml 122 | dest: /etc/netplan/70-config.yaml 123 | register: cnnetplan 124 | - name: net restart on cn nodes 125 | become: yes 126 | when: inventory_hostname in groups['cn'] and cnnetplan.changed 127 | command: netplan apply 128 | - name: ensure the hostname is consistent with name in hosts 129 | become: yes 130 | hostname: 131 | name: "{{ inventory_hostname }}" 132 | when: inventory_hostname in groups['cn'] 133 | - name: disable cloud init reverting hostname when rebooting 134 | become: yes 135 | lineinfile: 136 | path: /etc/cloud/cloud.cfg 137 | regexp: "preserve_hostname:" 138 | line: "preserve_hostname: true" 139 | backup: yes 140 
| - name: ensure the hostname of master 141 | become: yes 142 | hostname: 143 | name: "{{ master_name }}" 144 | when: inventory_hostname in groups['ln'] 145 | - name: refresh the host file 146 | become: yes 147 | template: 148 | src: hosts 149 | dest: /etc/hosts 150 | owner: root 151 | backup: yes 152 | - name: copy proxy-set 153 | template: 154 | src: proxy-set 155 | dest: /etc/proxy-set 156 | become: yes 157 | when: set_proxy 158 | - name: add proxy to profile 159 | lineinfile: 160 | path: /etc/profile 161 | regexp: "#add proxy$" 162 | line: "source /etc/proxy-set #add proxy" 163 | become: yes 164 | when: set_proxy 165 | - name: enable jumbo frame on ln 166 | become: yes 167 | command: "ip link set {{ ln_lan_nic }} mtu {{ mtu }}" 168 | when: inventory_hostname in groups['ln'] 169 | - name: enable jumbo frame on cn 170 | become: yes 171 | command: "ip link set {{ hostvars[inventory_hostname]['nic'] | default(cn_default_nic) }} mtu {{ mtu }}" 172 | when: inventory_hostname in groups['cn'] 173 | -------------------------------------------------------------------------------- /roles/network/templates/60-config.yaml: -------------------------------------------------------------------------------- 1 | network: 2 | version: 2 3 | renderer: networkd 4 | ethernets: 5 | {{ ln_wan_nic }}: 6 | addresses: 7 | - {{ wan_ip|indent(1,true) }}/{{ wan_mask }} 8 | gateway4: {{ wan_gateway }} 9 | dhcp4: false 10 | nameservers: 11 | addresses: {{ dns_server|to_yaml }} 12 | {{ ln_lan_nic }}: 13 | addresses: 14 | - {{ master_ip|indent(1,true) }}/{{ mask }} 15 | mtu: {{ mtu }} 16 | dhcp4: false 17 | match: 18 | macaddress: {{ ansible_facts[ln_lan_nic]['macaddress'] }} 19 | -------------------------------------------------------------------------------- /roles/network/templates/70-config.yaml: -------------------------------------------------------------------------------- 1 | network: 2 | version: 2 3 | renderer: networkd 4 | ethernets: 5 | {{ hostvars[inventory_hostname]['nic'] | default(cn_default_nic) }}: 6 | dhcp4: yes 7 | gateway4: {{ master_ip }} 8 | mtu: {{ mtu }} 9 | -------------------------------------------------------------------------------- /roles/network/templates/apt.conf: -------------------------------------------------------------------------------- 1 | Acquire::http::Proxy "{{ env_vars['http_proxy'] }}"; 2 | Acquire::https::Proxy "{{ env_vars['https_proxy'] }}"; 3 | -------------------------------------------------------------------------------- /roles/network/templates/dnsmasq.conf: -------------------------------------------------------------------------------- 1 | # Tell any system-wide dnsmasq instance to make sure to bind to interfaces 2 | # instead of listening on 0.0.0.0 3 | interface={{ ln_lan_nic }} 4 | bind-interfaces 5 | # except-interface=lxdbr0 6 | dhcp-range={{ dhcp_start_ip }},{{ dhcp_end_ip }} 7 | dhcp-no-override 8 | dhcp-hostsfile=/etc/dnsmasq.d/map.hosts 9 | log-dhcp 10 | domain={{ cluster_domain }} 11 | expand-hosts 12 | -------------------------------------------------------------------------------- /roles/network/templates/hosts: -------------------------------------------------------------------------------- 1 | 127.0.0.1 localhost.localdomain localhost 2 | ::1 localhost6.localdomain6 localhost6 3 | 4 | # The following lines are desirable for IPv6 capable hosts 5 | ::1 localhost ip6-localhost ip6-loopback 6 | fe00::0 ip6-localnet 7 | ff02::1 ip6-allnodes 8 | ff02::2 ip6-allrouters 9 | ff02::3 ip6-allhosts 10 | 11 | # hostnames in the cluster 12 | 13 | {{ master_ip }} {{ 
master_name }} 14 | 15 | {% for host in groups['cn'] %} 16 | {{ hostvars[host]['ip'] }} {{ host }} 17 | {% endfor %} 18 | 19 | # other hostnames 20 | -------------------------------------------------------------------------------- /roles/network/templates/map.hosts: -------------------------------------------------------------------------------- 1 | {% for h in groups['cn'] %} 2 | dhcp-host={{ hostvars[h]['mac'] }},{{ hostvars[h]['ip'] }},{{ h }},infinite 3 | {% endfor %} 4 | {% for h in groups['cn'] %} 5 | {% if 'idrac' in hostvars[h] %} 6 | dhcp-host={{ hostvars[h]['idrac'] }},{{ hostvars[h]['idracip'] }},{{ h+"-idrac" }},infinite 7 | {% endif %} 8 | {% endfor %} 9 | -------------------------------------------------------------------------------- /roles/network/templates/proxy-set: -------------------------------------------------------------------------------- 1 | {% if set_proxy %} 2 | export http_proxy={{ env_vars['http_proxy'] }} 3 | export https_proxy={{ env_vars['https_proxy'] }} 4 | export ftp_proxy={{ env_vars['ftp_proxy'] }} 5 | {% endif %} 6 | -------------------------------------------------------------------------------- /roles/python/README.md: -------------------------------------------------------------------------------- 1 | Python 2 | ========= 3 | 4 | This role is designed to set up a consistent Python environment. 5 | 6 | Requirements 7 | ------------ 8 | 9 | You should first install Intel Parallel Studio with Intel Python if you want to enable the last two tasks in the playbook. The general workflow here is to make Intel Python the default Python for numerical calculations. If that is not what you want, review the tasks carefully and delete the irrelevant ones. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | See defaults/main.yml. You need to specify spack_path if you want Intel Python to automatically pick up pip packages installed under spack. 15 | 16 | Templates and Files 17 | -------------- 18 | pip.conf in the files dir changes the default PyPI URL to a mirror. You may want to change this behavior depending on your network conditions.
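As a quick sanity check after the role runs: the rendered spack.pth is a single path line dropped into Intel Python's site-packages (the prefix below is hypothetical; the real one comes from the registered `spack location -i python@3.6` output), and assuming Intel Python lives under /opt/intel/intelpython3 as the tasks expect, you can confirm it is picked up like this:

```
# Example of a rendered spack.pth (hypothetical spack install prefix):
#   /home/ubuntu/spack/opt/spack/linux-ubuntu18.04-x86_64/gcc-7.4.0/python-3.6.5-abc1234/lib/python3.6/site-packages
# Verify that Intel Python now searches spack's site-packages:
/opt/intel/intelpython3/bin/python3 -c "import sys; print([p for p in sys.path if 'spack' in p])"
```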
-------------------------------------------------------------------------------- /roles/python/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for python 3 | spack_path: "/home/ubuntu/spack" 4 | -------------------------------------------------------------------------------- /roles/python/files/pip.conf: -------------------------------------------------------------------------------- 1 | [global] 2 | index-url=https://pypi.tuna.tsinghua.edu.cn/simple 3 | -------------------------------------------------------------------------------- /roles/python/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for python 3 | - name: create .pip dir 4 | file: 5 | path: "{{ lookup('env', 'HOME')+'/.pip' }}" 6 | state: directory 7 | when: inventory_hostname in groups['ln'] 8 | - name: pip config in etc 9 | copy: 10 | src: pip.conf 11 | dest: /etc/pip.conf 12 | backup: yes 13 | become: yes 14 | when: inventory_hostname in groups['ln'] 15 | - name: pip config in home 16 | copy: 17 | src: pip.conf 18 | dest: "{{ lookup('env', 'HOME')+'/.pip/pip.conf' }}" 19 | backup: yes 20 | when: inventory_hostname in groups['ln'] 21 | ## the following steps are optional 22 | - name: find spack py path 23 | shell: "source {{spack_path}}/share/spack/setup-env.sh&&spack location -i python@3.6" 24 | args: 25 | executable: /bin/bash 26 | when: inventory_hostname in groups['ln'] 27 | register: python_location 28 | - name: pythonpath add for intelpython3 29 | become: yes 30 | template: 31 | src: spack.pth 32 | dest: /opt/intel/intelpython3/lib/python3.6/site-packages/spack.pth 33 | when: inventory_hostname in groups['ln'] 34 | #- name: change the folder permission 35 | # command: chmod -c -R u=rw,go=r,a-x+X "{{item}}" 36 | # with_items: 37 | # - "{{ lookup('env', 'HOME')+'/.local' }}" 38 | # when: inventory_hostname in groups['ln'] 39 | -------------------------------------------------------------------------------- /roles/python/templates/home.pth: -------------------------------------------------------------------------------- 1 | {{ lookup("env", "HOME")+"/.local/lib/python3.6/site-packages" }} 2 | -------------------------------------------------------------------------------- /roles/python/templates/spack.pth: -------------------------------------------------------------------------------- 1 | {{ python_location.stdout }}/lib/python3.6/site-packages 2 | -------------------------------------------------------------------------------- /roles/restic/README.md: -------------------------------------------------------------------------------- 1 | Restic 2 | ========= 3 | 4 | This role is designed to configure restic for backups. 5 | 6 | Requirements 7 | ------------ 8 | 9 | No explicit dependence and requirements, as long as the restic repo path exists. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | See defaults/main.yml. 15 | 16 | Templates and Files 17 | -------------- 18 | 19 | ignorefile is the file containing paths to be excluded when backup the whole system /. 
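Once the role has run, backups can be inspected or restored with the stock restic CLI pointed at the repository from defaults/main.yml, for example:

```
# List snapshots in the repository created by this role
export RESTIC_REPOSITORY=/BACKUP
export RESTIC_PASSWORD='<repo password from defaults/main.yml>'
restic snapshots
# Restore the latest /home snapshot into a scratch directory for inspection
restic restore latest --path /home --target /tmp/restore-check
```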
20 | -------------------------------------------------------------------------------- /roles/restic/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for restic 3 | restic_repos: 4 | - path: "/BACKUP" 5 | pass: "123456notwell" 6 | init: True 7 | default_repo: "/BACKUP" 8 | default_pass: "123456notwell" 9 | backup_dirs: 10 | - path: "/home" 11 | hour: "3" 12 | minute: "0" 13 | ignorefile: yes 14 | -------------------------------------------------------------------------------- /roles/restic/files/ignorefile: -------------------------------------------------------------------------------- 1 | /tmp/* 2 | /dev/* 3 | /DATA* 4 | /BACKUP 5 | /run/* 6 | /proc/* 7 | /swap.img 8 | /lost+found/* 9 | /mnt/* 10 | /home/* 11 | /opt/* 12 | -------------------------------------------------------------------------------- /roles/restic/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for restic 3 | - name: install restic by apt 4 | apt: 5 | name: restic 6 | state: present 7 | when: inventory_hostname in groups['ln'] 8 | - name: init repos 9 | environment: 10 | RESTIC_REPOSITORY: "{{ item.path }}" 11 | RESTIC_PASSWORD: "{{ item.pass }}" 12 | register: re_init 13 | command: "/usr/bin/restic init" 14 | changed_when: "'created restic backend' in re_init.stdout" 15 | failed_when: re_init.rc != 0 and not 'config file already exists' in re_init.stderr 16 | with_items: "{{ restic_repos }}" 17 | when: item.init and inventory_hostname in groups['ln'] 18 | - name: copy ignore files if any 19 | become: yes 20 | copy: 21 | src: ignorefile 22 | dest: /BACKUP/ignorefile 23 | when: inventory_hostname in groups['ln'] and ignorefile 24 | - name: add crontabs for backup 25 | become: yes 26 | cron: 27 | minute: "{{ item.minute }}" 28 | hour: "{{ item.hour }}" 29 | name: "backup {{ item.path }}" 30 | job: "RESTIC_REPOSITORY='{{ item.repo|default(default_repo) }}' RESTIC_PASSWORD='{{ item.pass|default(default_pass) }}' /usr/bin/restic backup {{ item.path }} {{ item.extras | default('') }}" 31 | with_items: "{{ backup_dirs }}" 32 | when: inventory_hostname in groups['ln'] 33 | - name: prune crontab 34 | become: yes 35 | cron: 36 | minute: "10" 37 | hour: "7" 38 | name: "prune backups {{ item.path }}" 39 | job: "RESTIC_REPOSITORY='{{ item.path }}' RESTIC_PASSWORD='{{ item.pass }}' /usr/bin/restic forget --keep-last 1 --keep-daily 7 --keep-weekly 4 --keep-monthly 3 --prune" 40 | with_items: "{{ restic_repos }}" 41 | when: inventory_hostname in groups['ln'] 42 | -------------------------------------------------------------------------------- /roles/slurm/README.md: -------------------------------------------------------------------------------- 1 | Slurm 2 | ========= 3 | 4 | This role is designed to set up the whole slurm service, including slurmctld slurmd and slurmdbd from scratch. It also enable the pam module which deny user ssh to compute node when there is no job of them there. 5 | 6 | Role Variables 7 | -------------- 8 | 9 | See defaults/main.yml. 10 | 11 | Templates and Files 12 | -------------- 13 | 14 | slurm.conf in templates dir may need further review to meet your specifc needs. For example, the default conf only has one partition, you may want to add more partitions or change Weight of nodes as you like. The current conf also includes the master node as a compute node, too. You may also want to exclude it. 
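For instance (the partition name and node list here are purely illustrative, not part of the shipped template), adding another partition is a one-line change in slurm.conf:

```
PartitionName=long Nodes=c[1-4] MaxTime=7-00:00:00 PriorityJobFactor=1000 Default=NO State=UP
```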
15 | 16 | Besides, pay special attention on the config path if your OS is not Ubuntu 18.04, the path may vary for different distributions. And you need to change path relevant confs by hands. 17 | 18 | Notes 19 | ------------ 20 | 21 | After running this playbook, you should add the cluster, account and users by sacctmgr by hand directly. This is designed for the flexibility on user management of slurm users including priority and qos. 22 | -------------------------------------------------------------------------------- /roles/slurm/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for slurm 3 | db_user: slurm 4 | slurm_user: slurm 5 | db_pass: 123456notgood 6 | ctldhost: 7 | - master 8 | - c8 9 | dbdhost: master 10 | slurm_mail: slurm@localhost.domain 11 | slurm_spool_path: /var/spool/slurmd 12 | -------------------------------------------------------------------------------- /roles/slurm/files/access.conf: -------------------------------------------------------------------------------- 1 | # Login access control table. 2 | # 3 | # Comment line must start with "#", no space at front. 4 | # Order of lines is important. 5 | # 6 | # When someone logs in, the table is scanned for the first entry that 7 | # matches the (user, host) combination, or, in case of non-networked 8 | # logins, the first entry that matches the (user, tty) combination. The 9 | # permissions field of that table entry determines whether the login will 10 | # be accepted or refused. 11 | # 12 | # Format of the login access control table is three fields separated by a 13 | # ":" character: 14 | # 15 | # [Note, if you supply a 'fieldsep=|' argument to the pam_access.so 16 | # module, you can change the field separation character to be 17 | # '|'. This is useful for configurations where you are trying to use 18 | # pam_access with X applications that provide PAM_TTY values that are 19 | # the display variable like "host:0".] 20 | # 21 | # permission : users : origins 22 | # 23 | # The first field should be a "+" (access granted) or "-" (access denied) 24 | # character. 25 | # 26 | # The second field should be a list of one or more login names, group 27 | # names, or ALL (always matches). A pattern of the form user@host is 28 | # matched when the login name matches the "user" part, and when the 29 | # "host" part matches the local machine name. 30 | # 31 | # The third field should be a list of one or more tty names (for 32 | # non-networked logins), host names, domain names (begin with "."), host 33 | # addresses, internet network numbers (end with "."), ALL (always 34 | # matches), NONE (matches no tty on non-networked logins) or 35 | # LOCAL (matches any string that does not contain a "." character). 36 | # 37 | # You can use @netgroupname in host or user patterns; this even works 38 | # for @usergroup@@hostgroup patterns. 39 | # 40 | # The EXCEPT operator makes it possible to write very compact rules. 41 | # 42 | # The group file is searched only when a name does not match that of the 43 | # logged-in user. Both the user's primary group is matched, as well as 44 | # groups in which users are explicitly listed. 45 | # To avoid problems with accounts, which have the same name as a group, 46 | # you can use brackets around group names '(group)' to differentiate. 47 | # In this case, you should also set the "nodefgroup" option. 48 | # 49 | # TTY NAMES: Must be in the form returned by ttyname(3) less the initial 50 | # "/dev" (e.g. 
tty1 or vc/1) 51 | # 52 | ############################################################################## 53 | # 54 | # Disallow non-root logins on tty1 55 | # 56 | #-:ALL EXCEPT root:tty1 57 | # 58 | # Disallow console logins to all but a few accounts. 59 | # 60 | #-:ALL EXCEPT wheel shutdown sync:LOCAL 61 | # 62 | # Same, but make sure that really the group wheel and not the user 63 | # wheel is used (use nodefgroup argument, too): 64 | # 65 | #-:ALL EXCEPT (wheel) shutdown sync:LOCAL 66 | # 67 | # Disallow non-local logins to privileged accounts (group wheel). 68 | # 69 | #-:wheel:ALL EXCEPT LOCAL .win.tue.nl 70 | # 71 | # Some accounts are not allowed to login from anywhere: 72 | # 73 | #-:wsbscaro wsbsecr wsbspac wsbsym wscosor wstaiwde:ALL 74 | # 75 | # All other accounts are allowed to login from anywhere. 76 | # 77 | ############################################################################## 78 | # All lines from here up to the end are building a more complex example. 79 | ############################################################################## 80 | # 81 | # User "root" should be allowed to get access via cron .. tty5 tty6. 82 | #+ : root : cron crond :0 tty1 tty2 tty3 tty4 tty5 tty6 83 | # 84 | # User "root" should be allowed to get access from hosts with ip addresses. 85 | #+ : root : 192.168.200.1 192.168.200.4 192.168.200.9 86 | #+ : root : 127.0.0.1 87 | # 88 | # User "root" should get access from network 192.168.201. 89 | # This term will be evaluated by string matching. 90 | # comment: It might be better to use network/netmask instead. 91 | # The same is 192.168.201.0/24 or 192.168.201.0/255.255.255.0 92 | #+ : root : 192.168.201. 93 | # 94 | # User "root" should be able to have access from domain. 95 | # Uses string matching also. 96 | #+ : root : .foo.bar.org 97 | # 98 | # User "root" should be denied to get access from all other sources. 99 | #- : root : ALL 100 | # 101 | # User "foo" and members of netgroup "nis_group" should be 102 | # allowed to get access from all sources. 103 | # This will only work if netgroup service is available. 104 | #+ : @nis_group foo : ALL 105 | # 106 | # User "john" should get access from ipv4 net/mask 107 | #+ : john : 127.0.0.0/24 108 | # 109 | # User "john" should get access from ipv4 as ipv6 net/mask 110 | #+ : john : ::ffff:127.0.0.0/127 111 | # 112 | # User "john" should get access from ipv6 host address 113 | #+ : john : 2001:4ca0:0:101::1 114 | # 115 | # User "john" should get access from ipv6 host address (same as above) 116 | #+ : john : 2001:4ca0:0:101:0:0:0:1 117 | # 118 | # User "john" should get access from ipv6 net/mask 119 | #+ : john : 2001:4ca0:0:101::/64 120 | # 121 | # All other users should be denied to get access from all sources. 
122 | #- : ALL : ALL 123 | 124 | +:sudo:ALL 125 | -:ALL:ALL 126 | -------------------------------------------------------------------------------- /roles/slurm/files/cgroup.conf: -------------------------------------------------------------------------------- 1 | CgroupAutomount=yes 2 | ConstrainCores=yes 3 | -------------------------------------------------------------------------------- /roles/slurm/files/pam-common-session: -------------------------------------------------------------------------------- 1 | # 2 | # /etc/pam.d/common-session - session-related modules common to all services 3 | # 4 | # This file is included from other service-specific PAM config files, 5 | # and should contain a list of modules that define tasks to be performed 6 | # at the start and end of sessions of *any* kind (both interactive and 7 | # non-interactive). 8 | # 9 | # As of pam 1.0.1-6, this file is managed by pam-auth-update by default. 10 | # To take advantage of this, it is recommended that you configure any 11 | # local modules either before or after the default block, and use 12 | # pam-auth-update to manage selection of other modules. See 13 | # pam-auth-update(8) for details. 14 | 15 | # here are the per-package modules (the "Primary" block) 16 | session [default=1] pam_permit.so 17 | # here's the fallback if no module succeeds 18 | session requisite pam_deny.so 19 | # prime the stack with a positive return value if there isn't one already; 20 | # this avoids us returning an error just because nothing sets a success code 21 | # since the modules above will each just jump around 22 | session required pam_permit.so 23 | # The pam_umask module will set the umask according to the system default in 24 | # /etc/login.defs and user settings, solving the problem of different 25 | # umask settings with different shells, display managers, remote sessions etc. 26 | # See "man pam_umask". 27 | session optional pam_umask.so 28 | # and here are more per-package modules (the "Additional" block) 29 | session required pam_unix.so 30 | #session optional pam_systemd.so 31 | # end of pam-auth-update config 32 | -------------------------------------------------------------------------------- /roles/slurm/files/pam-sshd: -------------------------------------------------------------------------------- 1 | # PAM configuration for the Secure Shell service 2 | 3 | 4 | # Standard Un*x authentication. 5 | @include common-auth 6 | 7 | # Disallow non-root logins when /etc/nologin exists. 8 | account required pam_nologin.so 9 | 10 | # Uncomment and edit /etc/security/access.conf if you need to set complex 11 | # access limits that are hard to express in sshd_config. 12 | # account required pam_access.so 13 | 14 | # Standard Un*x authorization. 15 | @include common-account 16 | 17 | account sufficient pam_slurm.so 18 | account required pam_access.so 19 | 20 | # SELinux needs to be the first session rule. This ensures that any 21 | # lingering context has been cleared. Without this it is possible that a 22 | # module could execute code in the wrong domain. 23 | session [success=ok ignore=ignore module_unknown=ignore default=bad] pam_selinux.so close 24 | 25 | # Set the loginuid process attribute. 26 | session required pam_loginuid.so 27 | 28 | # Create a new session keyring. 29 | session optional pam_keyinit.so force revoke 30 | 31 | # Standard Un*x session setup and teardown. 32 | @include common-session 33 | 34 | # Print the message of the day upon successful login. 
35 | # This includes a dynamically generated part from /run/motd.dynamic 36 | # and a static (admin-editable) part from /etc/motd. 37 | session optional pam_motd.so motd=/run/motd.dynamic 38 | session optional pam_motd.so noupdate 39 | 40 | # Print the status of the user's mailbox upon successful login. 41 | session optional pam_mail.so standard noenv # [1] 42 | 43 | # Set up user limits from /etc/security/limits.conf. 44 | session required pam_limits.so 45 | 46 | # Read environment variables from /etc/environment and 47 | # /etc/security/pam_env.conf. 48 | session required pam_env.so # [1] 49 | # In Debian 4.0 (etch), locale-related environment variables were moved to 50 | # /etc/default/locale, so read that as well. 51 | session required pam_env.so user_readenv=1 envfile=/etc/default/locale 52 | 53 | # SELinux needs to intervene at login time to ensure that the process starts 54 | # in the proper default security context. Only sessions which are intended 55 | # to run in the user's context should be run after this. 56 | session [success=ok ignore=ignore module_unknown=ignore default=bad] pam_selinux.so open 57 | 58 | # Standard Un*x password updating. 59 | @include common-password 60 | -------------------------------------------------------------------------------- /roles/slurm/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for slurm 3 | - name: apt install slurm 4 | apt: 5 | name: "{{ item }}" 6 | state: present 7 | become: yes 8 | with_items: 9 | - slurm-wlm=17.11.2-1build1 10 | - libpam-slurm=17.11.2-1build1 11 | - name: copy munge key from ln nodes 12 | copy: 13 | backup: yes 14 | src: /etc/munge/munge.key 15 | dest: "{{role_path}}/files/munge.key" 16 | remote_src: yes 17 | become: yes 18 | when: inventory_hostname == master_name 19 | - name: copy munge key to cn nodes 20 | copy: 21 | backup: yes 22 | src: munge.key 23 | dest: /etc/munge/munge.key 24 | owner: munge 25 | group: munge 26 | mode: "400" 27 | become: yes 28 | register: cnmungekey 29 | - name: ensure munged is started 30 | become: yes 31 | service: 32 | name: munge 33 | state: started 34 | enabled: yes 35 | - name: restart munged 36 | become: yes 37 | service: 38 | name: munge 39 | state: restarted 40 | when: cnmungekey.changed 41 | - name: config slurm 42 | become: yes 43 | template: 44 | src: slurm.conf 45 | dest: /etc/slurm-llnl/slurm.conf 46 | backup: yes 47 | register: lnslurm 48 | - name: config gres 49 | become: yes 50 | template: 51 | src: gres.conf 52 | dest: /etc/slurm-llnl/gres.conf 53 | backup: yes 54 | register: lngres 55 | - name: config cgroup 56 | become: yes 57 | copy: 58 | src: cgroup.conf 59 | dest: /etc/slurm-llnl/cgroup.conf 60 | backup: yes 61 | register: lncgroup 62 | - name: add smail program for mail sending 63 | template: 64 | src: smail.sh 65 | dest: /usr/bin/smail.sh 66 | owner: "{{ slurm_user }}" 67 | mode: 0700 68 | become: yes 69 | - name: add pam module in pamd/sshd 70 | become: yes 71 | copy: 72 | src: pam-sshd 73 | dest: /etc/pam.d/sshd 74 | when: inventory_hostname in groups['cn'] 75 | - name: comment pam_systemd 76 | become: yes 77 | copy: 78 | src: pam-common-session 79 | dest: /etc/pam.d/common-session 80 | when: inventory_hostname in groups['cn'] 81 | - name: add ssh permission to sudo group 82 | become: yes 83 | copy: 84 | src: access.conf 85 | dest: /etc/security/access.conf 86 | backup: yes 87 | when: inventory_hostname in groups['cn'] 88 | - name: install slurmdbd on master 89 | become: yes 90 | when: 
inventory_hostname == dbdhost 91 | apt: 92 | name: "{{ item }}" 93 | state: present 94 | update_cache: yes 95 | with_items: 96 | - slurmdbd=17.11.2-1build1 97 | - mysql-server 98 | - python-mysqldb 99 | - libmysqlclient-dev 100 | - python3-mysqldb 101 | register: lnslurmdbd 102 | - name: ensure mysql is running 103 | become: yes 104 | when: inventory_hostname == dbdhost 105 | service: 106 | name: mysql 107 | state: started 108 | - name: create mysql user 109 | become: yes 110 | when: inventory_hostname == dbdhost 111 | mysql_user: 112 | login_host: localhost 113 | login_user: root 114 | name: "{{ db_user }}" 115 | password: "{{ db_pass }}" 116 | priv: 'slurm_acct_db.*:ALL' 117 | host: localhost 118 | state: present 119 | update_password: on_create 120 | register: mysqluser 121 | - name: restart mysql 122 | become: yes 123 | when: inventory_hostname == dbdhost and mysqluser.changed 124 | service: 125 | name: mysql 126 | state: restarted 127 | - name: config file for slurmdbd 128 | become: yes 129 | when: inventory_hostname == dbdhost 130 | template: 131 | src: slurmdbd.conf 132 | dest: /etc/slurm-llnl/slurmdbd.conf 133 | backup: yes 134 | register: slurmdbdconf 135 | - name: ensure slurmdbd is started 136 | become: yes 137 | service: 138 | name: slurmdbd 139 | state: started 140 | when: inventory_hostname == dbdhost 141 | - name: restart slurmdbd 142 | become: yes 143 | when: inventory_hostname == dbdhost and slurmdbdconf.changed 144 | service: 145 | name: slurmdbd 146 | state: restarted 147 | - name: add cluster to database 148 | command: "sacctmgr add cluster {{cluster_name}} -i" 149 | when: inventory_hostname == dbdhost 150 | failed_when: clusterr.rc != 1 and clusterr.rc != 0 151 | changed_when: not clusterr.stdout.startswith("This cluster") 152 | register: clusterr 153 | - name: ensure slurmctld is started # slurmctld must be started after slurmdbd 154 | become: yes 155 | service: 156 | name: slurmctld 157 | state: started 158 | when: inventory_hostname in ctldhost 159 | - name: start slurmctld 160 | become: yes 161 | service: 162 | name: slurmctld 163 | state: restarted 164 | when: inventory_hostname in ctldhost and (lnslurm.changed or lncgroup.changed or lngres.changed) 165 | - name: ensure slurmd is started 166 | become: yes 167 | service: 168 | name: slurmd 169 | enabled: yes 170 | state: started 171 | when: inventory_hostname in groups['cn'] 172 | - name: restart slurmd 173 | become: yes 174 | service: 175 | name: slurmd 176 | state: restarted 177 | when: (lnslurm.changed or lncgroup.changed or lngres.changed) and inventory_hostname in groups['cn'] 178 | -------------------------------------------------------------------------------- /roles/slurm/templates/gres.conf: -------------------------------------------------------------------------------- 1 | {% for h in groups['cn'] %} 2 | {% if h in groups['gn'] %} 3 | Nodename={{ h }} Name=gpu Type={{ hostvars[h]['gputype']|default('RTX2080TI') }} File={{ "/dev/nvidia0" if hostvars[h]['gpuno'] == 1 else "/dev/nvidia[0-" ~ (hostvars[h]['gpuno']-1) ~ "]" |default("/dev/nvidia[0-1]") }} 4 | {% endif %} 5 | {% endfor %} 6 | -------------------------------------------------------------------------------- /roles/slurm/templates/slurm.conf: -------------------------------------------------------------------------------- 1 | # slurm.conf file generated by configurator.html. 2 | # Put this file on all nodes of your cluster. 3 | # See the slurm.conf man page for more information. 
4 | # 5 | ControlMachine={{ ctldhost[0] }} 6 | #ControlAddr= 7 | BackupController={{ ctldhost[1] }} 8 | #BackupAddr= 9 | # 10 | AuthType=auth/munge 11 | #CheckpointType=checkpoint/none 12 | CryptoType=crypto/munge 13 | #DisableRootJobs=NO 14 | #EnforcePartLimits=NO 15 | #Epilog= 16 | #EpilogSlurmctld= 17 | #FirstJobId=1 18 | #MaxJobId=999999 19 | GresTypes=gpu 20 | #GroupUpdateForce=0 21 | #GroupUpdateTime=600 22 | #JobCheckpointDir=/var/slurm/checkpoint 23 | #JobCredentialPrivateKey= 24 | #JobCredentialPublicCertificate= 25 | #JobFileAppend=0 26 | #JobRequeue=1 27 | #JobSubmitPlugins=1 28 | #KillOnBadExit=0 29 | #LaunchType=launch/slurm 30 | #Licenses=foo*4,bar 31 | MailProg=/usr/bin/smail.sh 32 | MaxJobCount=1000 33 | #MaxStepCount=40000 34 | #MaxTasksPerNode=128 35 | MpiDefault=none 36 | #MpiParams=ports=#-# 37 | #PluginDir= 38 | #PlugStackConfig= 39 | #PrivateData=jobs 40 | ProctrackType=proctrack/cgroup 41 | #Prolog= 42 | PrologFlags=contain # for pam module 43 | #PrologSlurmctld= 44 | #PropagatePrioProcess=0 45 | PropagateResourceLimits=NONE 46 | #PropagateResourceLimitsExcept= 47 | #RebootProgram= 48 | ReturnToService=1 49 | #SallocDefaultCommand= 50 | SlurmctldPidFile=/var/run/slurm-llnl/slurmctld.pid 51 | SlurmctldPort=6817 52 | SlurmdPidFile=/var/run/slurm-llnl/slurmd.pid 53 | SlurmdPort=6818 54 | SlurmdSpoolDir=/tmp/slurmd 55 | SlurmUser={{ slurm_user }} 56 | #SlurmdUser=root 57 | #SrunEpilog= 58 | #SrunProlog= 59 | StateSaveLocation={{ slurm_spool_path }} 60 | SwitchType=switch/none 61 | #TaskEpilog= 62 | TaskPlugin=task/affinity 63 | TaskPluginParam=Sched 64 | #TaskProlog= 65 | #TopologyPlugin=topology/tree 66 | #TmpFS=/tmp 67 | #TrackWCKey=no 68 | #TreeWidth= 69 | #UnkillableStepProgram= 70 | #UsePAM=0 71 | # 72 | # 73 | # TIMERS 74 | #BatchStartTimeout=10 75 | #CompleteWait=0 76 | #EpilogMsgTime=2000 77 | #GetEnvTimeout=2 78 | #HealthCheckInterval=0 79 | #HealthCheckProgram= 80 | InactiveLimit=0 81 | KillWait=30 82 | #MessageTimeout=10 83 | #ResvOverRun=0 84 | MinJobAge=300 85 | #OverTimeLimit=0 86 | SlurmctldTimeout=120 87 | SlurmdTimeout=300 88 | #UnkillableStepTimeout=60 89 | #VSizeFactor=0 90 | Waittime=0 91 | # 92 | # 93 | # SCHEDULING 94 | #DefMemPerCPU=0 95 | FastSchedule=0 96 | #MaxMemPerCPU=0 97 | #SchedulerTimeSlice=30 98 | SchedulerType=sched/backfill 99 | SelectType=select/cons_res 100 | SelectTypeParameters=CR_Core 101 | # 102 | # 103 | # JOB PRIORITY 104 | #PriorityFlags= 105 | PriorityType=priority/multifactor 106 | PriorityDecayHalfLife=0 107 | #PriorityCalcPeriod= 108 | PriorityFavorSmall=YES 109 | #PriorityMaxAge= 110 | PriorityUsageResetPeriod=YEARLY 111 | PriorityWeightAge=1000 112 | PriorityWeightFairshare=300 113 | PriorityWeightJobSize=100 114 | #PriorityWeightPartition= 115 | PriorityWeightQOS=600 116 | # 117 | # 118 | # LOGGING AND ACCOUNTING 119 | AccountingStorageEnforce=limits,qos 120 | #AccountingStorageHost= 121 | #AccountingStorageLoc= 122 | #AccountingStoragePass= 123 | AccountingStoragePort=6819 124 | AccountingStorageType=accounting_storage/slurmdbd 125 | AccountingStorageHost={{ dbdhost }} 126 | #AccountingStorageUser= 127 | AccountingStorageTRES=gres/gpu,gres/gpu:RTX2080TI 128 | AccountingStoreJobComment=YES 129 | ClusterName={{ cluster_name }} 130 | #DebugFlags= 131 | #JobCompHost= 132 | #JobCompLoc= 133 | #JobCompPass= 134 | #JobCompPort= 135 | JobCompType=jobcomp/none 136 | #JobCompUser= 137 | #JobContainerType=job_container/none 138 | JobAcctGatherFrequency=30 139 | JobAcctGatherType=jobacct_gather/none 140 | SlurmctldDebug=3 141 | 
#SlurmctldLogFile= 142 | SlurmdDebug=3 143 | #SlurmdLogFile= 144 | #SlurmSchedLogFile= 145 | #SlurmSchedLogLevel= 146 | # 147 | # 148 | # POWER SAVE SUPPORT FOR IDLE NODES (optional) 149 | #SuspendProgram= 150 | #ResumeProgram= 151 | #SuspendTimeout= 152 | #ResumeTimeout= 153 | #ResumeRate= 154 | #SuspendExcNodes= 155 | #SuspendExcParts= 156 | #SuspendRate= 157 | #SuspendTime= 158 | # 159 | # 160 | # COMPUTE NODES 161 | 162 | {% for h in groups['cn'] %} 163 | {% if h in groups['gn'] %} 164 | NodeName={{ h }} State=UNKNOWN Weight=20 CoresPerSocket={{ hostvars[h]['corespersocket']|default('14') }} Sockets=2 ThreadsPerCore=2 RealMemory={{ hostvars[h]['memory']|default('128000') }} Gres=gpu:{{ hostvars[h]['gputype']|default('RTX2080TI') }}:{{ hostvars[h]['gpuno']|default("2") }} 165 | {% else %} 166 | NodeName={{ h }} State=UNKNOWN Weight=10 CoresPerSocket={{ hostvars[h]['corespersocket']|default('14') }} Sockets=2 ThreadsPerCore=2 RealMemory={{ hostvars[h]['memory']|default('128000') }} 167 | {% endif %} 168 | {% endfor %} 169 | # NodeName={{ master_name }} State=UNKNOWN Weight=30 CoresPerSocket=14 Sockets=2 ThreadsPerCore=2 RealMemory=128000 170 | 171 | PartitionName=general MaxTime=Infinite Nodes={% for h in groups['general'] %}{{h+"," if not loop.last else h}}{% endfor %} PriorityJobFactor=5000 Default=YES State=UP 172 | PartitionName=hyper MaxTime=Infinite Nodes={% for h in groups['hyper'] %}{{h+"," if not loop.last else h}}{% endfor %} PriorityJobFactor=5000 Default=NO State=UP 173 | PartitionName=debug MaxTime=00:30:00 Nodes=ALL PriorityJobFactor=50000 Default=NO State=UP 174 | {% if groups['cn']|intersect(groups['gn']) %} 175 | PartitionName=gpu MaxTime=Infinite Nodes={% for h in groups['cn']|intersect(groups['gn']) %}{{h+"," if not loop.last else h}}{% endfor %} Default=No AllowAccounts=ubuntu TRESBillingWeights="CPU=1.0,GRES/gpu=3.0" 176 | {% endif %} 177 | -------------------------------------------------------------------------------- /roles/slurm/templates/slurmdbd.conf: -------------------------------------------------------------------------------- 1 | ArchiveEvents=yes 2 | ArchiveJobs=yes 3 | AuthType=auth/munge 4 | DbdHost={{ dbdhost }} 5 | DebugLevel=4 6 | PurgeEventAfter=1month 7 | PurgeJobAfter=24month 8 | PurgeStepAfter=1month 9 | PurgeSuspendAfter=1month 10 | LogFile=/var/log/slurmdbd.log 11 | PidFile=/var/run/slurm-llnl/slurmdbd.pid 12 | SlurmUser={{ slurm_user }} 13 | StorageHost=localhost 14 | StoragePass={{ db_pass }} 15 | StorageType=accounting_storage/mysql 16 | StorageUser={{ db_user }} 17 | -------------------------------------------------------------------------------- /roles/slurm/templates/smail.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MAIL=/usr/bin/mail 3 | 4 | echo "$2"|$MAIL -s "$2" $3 -r {{ slurm_mail }} 5 | -------------------------------------------------------------------------------- /roles/spack/README.md: -------------------------------------------------------------------------------- 1 | Spack 2 | ========= 3 | 4 | This role will install spack, a flexible HPC package manager and configure it. 5 | 6 | Requirements 7 | ------------- 8 | 9 | You may want to config git by `git config --global url.https://github.com/.insteadOf git://github.com/` for air-gapped cluster, otherwise `spack install lmod` doesn't work since some of the dependences would be fetched by git which won't go through http proxy by default. 10 | 11 | 12 | Role Variables 13 | -------------- 14 | 15 | See defaults/main.yml. 
Only spack_path is needed, which specify the install path of spack. We highly recommend you install it on some admin user's home directory, which can be available for all users. For clusters, it is important to share /home on master to all nodes via nfs, such that spack is available to all nodes. 16 | 17 | Templates and Files 18 | -------------- 19 | 20 | It is worth noting that spack config yaml files in files dir are very **specific** and not universal at all. It assumes that you would install intel parallel studio on /opt dir. Especially, in packages.yaml, there is specific information on external packages' path which you may want to edit before running the role. 21 | -------------------------------------------------------------------------------- /roles/spack/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for spack 3 | spack_path: "/home/ubuntu/spack" 4 | -------------------------------------------------------------------------------- /roles/spack/files/compilers.yaml: -------------------------------------------------------------------------------- 1 | compilers: 2 | - compiler: 3 | environment: {} 4 | extra_rpaths: [] 5 | flags: {} 6 | modules: [] 7 | operating_system: ubuntu18.04 8 | paths: 9 | cc: /usr/bin/gcc 10 | cxx: /usr/bin/g++ 11 | f77: /usr/bin/gfortran 12 | fc: /usr/bin/gfortran 13 | spec: gcc@7.4.0 14 | target: x86_64 15 | - compiler: 16 | operating_system: ubuntu18.04 17 | modules: [intel-parallel-studio-2019-gcc-7.4.0-xl] 18 | paths: 19 | cc: /opt/intel/compilers_and_libraries/linux/bin/intel64/icc 20 | cxx: /opt/intel/compilers_and_libraries/linux/bin/intel64/icpc 21 | f77: /opt/intel/compilers_and_libraries/linux/bin/intel64/ifort 22 | fc: /opt/intel/compilers_and_libraries/linux/bin/intel64/ifort 23 | spec: intel@2019 24 | target: x86_64 25 | -------------------------------------------------------------------------------- /roles/spack/files/modules.yaml: -------------------------------------------------------------------------------- 1 | modules: 2 | tcl: 3 | hash_length: 2 4 | # naming_scheme: '{name}/{version}-{compiler.name}-{compiler.version}' 5 | all: 6 | suffixes: 7 | ^python@3.6.5: 'py3' 8 | ^python@2.7: 'py2' 9 | ^openblas: 'openblas' 10 | ^openmpi: 'ompi' 11 | ^intelmpi: 'impi' 12 | intel-parallel-studio: 13 | filter: 14 | environment_blacklist: ['PS1'] 15 | environment: 16 | set: 17 | CPATH: "/opt/intel/compilers_and_libraries_2019.4.243/linux/ipp/include:/opt/intel/compilers_and_libraries_2019.4.243/linux/mkl/include:/opt/intel/compilers_and_libraries_2019.4.243/linux/pstl/include:/opt/intel/compilers_and_libraries_2019.4.243/linux/tbb/include:/opt/intel/compilers_and_libraries_2019.4.243/linux/daal/include:/opt/intel/include:/opt/intel/compilers_and_libraries_2019.4.243/linux/mpi/intel64/include" 18 | FI_PROVIDER_PATH: "/opt/intel/compilers_and_libraries_2019.4.243/linux/mpi/intel64/libfabric/lib/prov" 19 | FI_PROVIDER: sockets 20 | I_MPI_ROOT: "/opt/intel/compilers_and_libraries_2019.4.243/linux/mpi" 21 | prepend_path: 22 | LD_LIBRARY_PATH: 
"/opt/intel/itac/2019.4.036/intel64/slib:/opt/intel/compilers_and_libraries_2019.4.243/linux/compiler/lib/intel64_lin:/opt/intel/compilers_and_libraries_2019.4.243/linux/mpi/intel64/libfabric/lib:/opt/intel/compilers_and_libraries_2019.4.243/linux/mpi/intel64/lib/release:/opt/intel/compilers_and_libraries_2019.4.243/linux/mpi/intel64/lib:/opt/intel/compilers_and_libraries_2019.4.243/linux/ipp/lib/intel64:/opt/intel/compilers_and_libraries_2019.4.243/linux/mkl/lib/intel64_lin:/opt/intel/compilers_and_libraries_2019.4.243/linux/tbb/lib/intel64/gcc4.7:/opt/intel/debugger_2019/libipt/intel64/lib:/opt/intel/compilers_and_libraries_2019.4.243/linux/daal/lib/intel64_lin:/opt/intel/compilers_and_libraries_2019.4.243/linux/daal/../tbb/lib/intel64_lin/gcc4.4:/opt/intel/lib" 23 | petsc: 24 | environment: 25 | set: 26 | PETSC_ARCH: ubuntu+intel 27 | slepc: 28 | environment: 29 | set: 30 | SLEPC_DIR: /home/ubuntu/softwares/petsc-slepc/slepc-3.11.1 31 | intel-mkl: 32 | environment: 33 | prepend_path: 34 | LD_LIBRARY_PATH: "/opt/intel/mkl/lib/intel64" 35 | -------------------------------------------------------------------------------- /roles/spack/files/packages.yaml: -------------------------------------------------------------------------------- 1 | packages: 2 | openmpi: 3 | paths: 4 | openmpi@2.1.1%gcc: /usr/lib/x86_64-linux-gnu/openmpi 5 | slurm: 6 | paths: 7 | slurm@17.11.2%gcc: /usr/lib/x86_64-linux-gnu/slurm-wlm 8 | jdk: 9 | paths: 10 | jdk@1.8.0_212 %gcc@7.4.0 arch=linux-ubuntu18.04-x86_64: /usr/lib/jvm/java-8-openjdk-amd64 11 | version: 12 | - 1.8.0_212 13 | mathematica: 14 | paths: 15 | mathematica@11.0.1: /opt/mathematica/11.0.1 16 | matlab: 17 | paths: 18 | matlab@2018b: /opt/matlab/2018b 19 | intel-parallel-studio: 20 | paths: 21 | intel-parallel-studio@2019: /opt/intel 22 | compiler: [intel@2019] 23 | intel-mkl: 24 | paths: 25 | intel-mkl@2019: /opt/intel/mkl 26 | intel-mpi: 27 | paths: 28 | intel-mpi@2019: /opt/intel/impi 29 | petsc: 30 | paths: 31 | petsc@3.11.2%intel: /home/ubuntu/softwares/petsc-slepc/petsc-3.11.2 32 | slepc: 33 | paths: 34 | slepc@3.11.1%intel: /home/ubuntu/softwares/petsc-slepc/slepc-3.11.1 35 | armadillo: 36 | paths: 37 | armadillo@9.300.2%intel: /home/ubuntu/softwares/armadillo/armadillo-9.300.2 38 | #all: 39 | #providers: 40 | # mpi: [openmpi, intel-parallel-studio+mpi] 41 | # blas: [openblas, intel-parallel-studio+mkl] 42 | # lapack: [openblas, intel-parallel-studio+mkl] 43 | # scalapack: [netlib-scalapack, intel-parallel-studio+mkl] 44 | -------------------------------------------------------------------------------- /roles/spack/files/repo.yaml: -------------------------------------------------------------------------------- 1 | repo: 2 | namespace: override 3 | -------------------------------------------------------------------------------- /roles/spack/files/repos.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - $spack/var/spack/repos/override 3 | - $spack/var/spack/repos/builtin 4 | -------------------------------------------------------------------------------- /roles/spack/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for spack 3 | - name: download spack 4 | git: 5 | repo: "https://github.com/spack/spack" 6 | update: no 7 | force: no 8 | dest: "{{ spack_path }}" 9 | when: inventory_hostname in groups['ln'] 10 | - name: spack install lmod for module management 11 | shell: "source {{spack_path}}/share/spack/setup-env.sh&&spack 
install lmod" 12 | args: 13 | executable: /bin/bash 14 | when: inventory_hostname in groups['ln'] 15 | changed_when: not lmodr.stdout.startswith("==> lmod is already installed") 16 | register: lmodr 17 | - name: update etc profile to activate spack when start 18 | become: yes 19 | lineinfile: 20 | path: /etc/profile 21 | regexp: "/share/spack/setup-env.sh$" 22 | line: "{{ 'source '+spack_path+'/share/spack/setup-env.sh' }}" 23 | - name: update etc profile to activate module system 24 | become: yes 25 | lineinfile: 26 | path: /etc/profile 27 | line: "source $(spack location -i lmod)/lmod/lmod/init/bash # load module" 28 | regexp: ".* # load module" 29 | - name: create override repo for spack 30 | when: inventory_hostname in groups['ln'] 31 | file: 32 | path: "{{ spack_path }}/var/spack/repos/override/packages" 33 | state: directory 34 | - name: create the repo.yaml for the new repo 35 | when: inventory_hostname in groups['ln'] 36 | copy: 37 | src: repo.yaml 38 | dest: "{{ spack_path }}/var/spack/repos/override/repo.yaml" 39 | - name: spack config file 40 | when: inventory_hostname in groups['ln'] 41 | copy: 42 | src: "{{ item }}" 43 | dest: "{{ spack_path+'/etc/spack/'+item }}" 44 | backup: yes 45 | with_items: 46 | - packages.yaml 47 | - compilers.yaml 48 | - modules.yaml 49 | - repos.yaml 50 | - name: add shortcut for spack activation 51 | become: yes 52 | template: 53 | src: spack-load 54 | dest: /etc/spack-load 55 | mode: 0755 56 | -------------------------------------------------------------------------------- /roles/spack/templates/pyinstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | VER=3.6.5 3 | source {{spack_path}}/share/spack/setup-env.sh 4 | source $(spack location -i lmod)/lmod/lmod/init/bash 5 | 6 | spack install python@$VER 7 | spack install py-pip ^python@$VER 8 | 9 | # spack install py-numpy ^python@3.6.5 10 | 11 | spack load python@$VER 12 | spack load py-pip ^python@$VER 13 | spack load py-setuptools ^python@$VER 14 | 15 | 16 | -------------------------------------------------------------------------------- /roles/spack/templates/spack-load: -------------------------------------------------------------------------------- 1 | source {{ spack_path }}/share/spack/setup-env.sh 2 | source $(spack location -i lmod)/lmod/lmod/init/bash # load module 3 | -------------------------------------------------------------------------------- /roles/storage/README.md: -------------------------------------------------------------------------------- 1 | Storage 2 | ========= 3 | 4 | This role is designed to configure storage stuff, including local mount, nfs and tmp clean. 5 | 6 | Requirements 7 | ------------ 8 | 9 | This is the last of three roles to build cluster infrastructure, following network and basic. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | See defaults/main.yml. 15 | 16 | `tmp_time` is the time to delete files in tmp folder after the file's last change. 
17 | -------------------------------------------------------------------------------- /roles/storage/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for storage 3 | nfs_dir: 4 | - dir: "/home" 5 | host: "master" 6 | mnt: "/home" 7 | - dir: "/opt" 8 | host: "master" 9 | mnt: "/opt" 10 | - dir: "/DATA" 11 | host: "master" 12 | mnt: "/DATA" 13 | - dir: "/DATA.c8" 14 | host: "c8" 15 | mnt: "/DATA.c8" 16 | local_disk: 17 | - dev: "/dev/sdb1" 18 | host: "master" 19 | mnt: "/DATA" 20 | - dev: "/dev/sdc1" 21 | host: "master" 22 | mnt: "/BACKUP" 23 | - dev: "/dev/sdb1" 24 | host: "c8" 25 | mnt: "/DATA.c8" 26 | - dev: "/dev/sdb1" 27 | host: "c10" 28 | mnt: "/tmp" 29 | - dev: "/dev/sdb1" 30 | host: "c11" 31 | mnt: "/tmp" 32 | - dev: "/dev/sdb1" 33 | host: "c12" 34 | mnt: "/tmp" 35 | - dev: "/dev/sdb1" 36 | host: "c13" 37 | mnt: "/tmp" 38 | - dev: "/dev/sdb1" 39 | host: "c14" 40 | mnt: "/tmp" 41 | tmp_time: "15d" 42 | -------------------------------------------------------------------------------- /roles/storage/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for storage 3 | - name: install nfs server on sn node 4 | become: yes 5 | apt: 6 | name: nfs-kernel-server=1:1.3.4-2.1ubuntu5.2 7 | state: present 8 | when: inventory_hostname in groups['sn'] 9 | - name: install nfs client on all nodes 10 | become: yes 11 | apt: 12 | name: nfs-common 13 | state: present 14 | - name: update nfs config file on ln node 15 | become: yes 16 | template: 17 | src: ../templates/exports 18 | dest: /etc/exports 19 | backup: yes 20 | owner: root 21 | register: lnnfs 22 | when: inventory_hostname in groups['sn'] 23 | - name: ensure nfs service start on ln node 24 | become: yes 25 | service: 26 | name: nfs-kernel-server 27 | state: started 28 | when: inventory_hostname in groups['sn'] 29 | - name: restart nfs service 30 | become: yes 31 | service: 32 | name: nfs-kernel-server 33 | state: restarted 34 | when: lnnfs.changed and inventory_hostname in groups['sn'] 35 | - name: mount localdisk 36 | become: yes 37 | mount: 38 | path: "{{ item.mnt }}" 39 | src: "{{ item.dev }}" 40 | fstype: ext4 41 | state: mounted 42 | when: inventory_hostname == item.host 43 | with_items: "{{ local_disk }}" 44 | - name: make sure the mount dir exist in cn nodes 45 | when: inventory_hostname != item.host 46 | become: yes 47 | file: 48 | path: "{{ item.mnt }}" 49 | state: directory 50 | with_items: "{{ nfs_dir }}" 51 | - name: mount dir on all other nodes 52 | become: yes 53 | mount: 54 | name: "{{ item.mnt }}" 55 | src: "{{ item.host }}:{{ item.dir }}" 56 | fstype: nfs 57 | state: mounted 58 | when: inventory_hostname != item.host 59 | with_items: "{{ nfs_dir }}" 60 | - name: install tmpreaper on all nodes 61 | become: yes 62 | apt: 63 | name: tmpreaper 64 | state: present 65 | - name: change the config of tmpreaper 66 | template: 67 | src: tmpreaper.conf 68 | dest: /etc/tmpreaper.conf 69 | backup: yes 70 | become: yes 71 | -------------------------------------------------------------------------------- /roles/storage/templates/exports: -------------------------------------------------------------------------------- 1 | # /etc/exports: the access control list for filesystems which may be exported 2 | # to NFS clients. See exports(5). 
3 | # 4 | # Example for NFSv2 and NFSv3: 5 | # /srv/homes hostname1(rw,sync,no_subtree_check) hostname2(ro,sync,no_subtree_check) 6 | # 7 | # Example for NFSv4: 8 | # /srv/nfs4 gss/krb5i(rw,sync,fsid=0,crossmnt,no_subtree_check) 9 | # /srv/nfs4/homes gss/krb5i(rw,sync,no_subtree_check) 10 | # 11 | {% for d in nfs_dir %} 12 | {% if d.host == inventory_hostname %} 13 | {{ d.dir }} {{ ip_range }}/{{ mask }}(rw,sync,no_root_squash) 14 | {% endif %} 15 | {% endfor %} 16 | -------------------------------------------------------------------------------- /roles/storage/templates/tmpreaper.conf: -------------------------------------------------------------------------------- 1 | # tmpreaper.conf 2 | # - local configuration for tmpreaper's daily run 3 | # 4 | # This is only used if /etc/cron.daily/tmpreaper was also updated, 5 | # i.e. there's a line ". /etc/tmpreaper.conf" in that file. 6 | # The shell code that used to be here (pre version 1.6.7) is now 7 | # in the cron.daily script. 8 | 9 | # Remove the next line if you understand the possible security implications of 10 | # having tmpreaper run automatically; 11 | # see /usr/share/doc/tmpreaper/README.security.gz 12 | 13 | #SHOWWARNING=true 14 | 15 | # TMPREAPER_TIME 16 | # is the max. age of files before they're removed. 17 | # default: 18 | # the TMPTIME value in /etc/default/rcS if it's there, else 19 | # TMPREAPER_TIME=7d (for 7 days) 20 | # I recommend setting the value in /etc/default/rcS, as 21 | # that is used to clean out /tmp whenever the system is booted. 22 | # 23 | # TMPREAPER_PROTECT_EXTRA 24 | # are extra patterns that you may want to protect. 25 | # Example: 26 | # TMPREAPER_PROTECT_EXTRA='/tmp/isdnctrl* /tmp/important*' 27 | # 28 | # TMPREAPER_DIRS 29 | # are the directories to clean up. 30 | # *never* supply / here! That will wipe most of your system! 31 | # Example: 32 | # TMPREAPER_DIRS='/tmp/. /var/tmp/.' 33 | # 34 | # TMPREAPER_DELAY 35 | # defines the maximum (randomized) delay before starting processing. 36 | # See the manpage entry for --delay. Default is 256. 37 | # Example: 38 | # TMPREAPER_DELAY='256' 39 | # 40 | # TMPREAPER_ADDITIONALOPTIONS 41 | # extra options that are passed to tmpreaper, e.g. --all 42 | 43 | # uncomment and change the next line to overrule the /etc/default/rcS value 44 | # TMPREAPER_TIME=7d 45 | 46 | TMPREAPER_PROTECT_EXTRA='/tmp/slurm*' 47 | TMPREAPER_DIRS='/tmp/.' 48 | TMPREAPER_DELAY='256' 49 | TMPREAPER_ADDITIONALOPTIONS='' 50 | TMPREAPER_TIME='{{ tmp_time }}' 51 | -------------------------------------------------------------------------------- /roles/user/README.md: -------------------------------------------------------------------------------- 1 | User 2 | ========= 3 | 4 | This role is designed to add new users on the cluster as easy as possible. 5 | 6 | Requirements 7 | ------------ 8 | 9 | You must run `slurm` role first and have configured slurm cluster, account, user, qos info by `sacctmgr`. 10 | 11 | If you use quota to limit normal users disk usage, you also need to firstly configure quota. 12 | 13 | Role Variables 14 | -------------- 15 | 16 | See defaults/main.yml. It is worth noting, for every user item, only name, password, and uid is necessary. Others will be the default value if not given. 17 | 18 | Templates and Files 19 | -------------- 20 | memory.conf and nproc.conf in files dir are very **specific**. You may want to change the specific limits based on your needs and your hardware specs. 
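As a concrete sketch (the username, uid, and password below are placeholders), a new account only needs the three required fields; everything else falls back to the role-wide defaults such as sacct_default_account, sacct_default_qos, and the quota defaults.

```yaml
# extra entry appended to the `users` list, e.g. in group_vars -- values are placeholders
users:
  - name: "alice"
    uid: 20001
    password: "change-me-at-first-login"   # hashed with sha512 by the role; only applied on creation
```

Optional fields (quota_soft, quota_hard, home_permission, sacct_account, sacct_qos) can be added per user exactly as in the default entry.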
-------------------------------------------------------------------------------- /roles/user/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # defaults file for user 3 | set_quota: yes 4 | sacct_default_account: root 5 | sacct_default_qos: normal 6 | quota_default_soft: 0 7 | quota_default_hard: 0 8 | quota_root: "/" 9 | other_user_dir: yes 10 | data_root: 11 | - "/DATA" 12 | users: 13 | - name: "test0" 14 | uid: 20000 15 | password: "123456sobad" 16 | quota_soft: 10G 17 | quota_hard: 20G 18 | home_permission: "0755" 19 | sacct_account: root 20 | sacct_qos: normal 21 | -------------------------------------------------------------------------------- /roles/user/files/memory.conf: -------------------------------------------------------------------------------- 1 | * hard as 224000000 2 | @sudo hard as unlimited 3 | -------------------------------------------------------------------------------- /roles/user/files/nproc.conf: -------------------------------------------------------------------------------- 1 | * soft nproc 8192 2 | root soft nproc unlimited 3 | -------------------------------------------------------------------------------- /roles/user/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # tasks file for user 3 | - name: create users 4 | become: yes 5 | user: 6 | name: "{{ item.name }}" 7 | uid: "{{ item.uid }}" 8 | shell: /bin/bash 9 | password: "{{ item.password|password_hash('sha512') }}" 10 | update_password: on_create 11 | generate_ssh_key: yes 12 | with_items: "{{ users }}" 13 | when: inventory_hostname in groups['ln'] 14 | - name: permission for home dir 15 | become: yes 16 | file: 17 | mode: "{{ item['home_permission']|default('0700') }}" 18 | state: directory 19 | path: "/home/{{ item.name }}" 20 | owner: "{{ item.name }}" 21 | with_items: "{{ users }}" 22 | when: inventory_hostname in groups['ln'] 23 | - name: create dir in DATA 24 | become: yes 25 | file: 26 | mode: "0755" 27 | state: directory 28 | path: "{{ item[1] }}/{{ item[0].name }}" 29 | owner: "{{ item[0].name }}" 30 | loop: "{{ users |product(data_root)|list }}" 31 | when: inventory_hostname in groups['ln'] and other_user_dir 32 | - name: create users in cn 33 | become: yes 34 | user: 35 | name: "{{ item.name }}" 36 | uid: "{{ item.uid }}" 37 | shell: /bin/bash 38 | password: "{{ item.password|password_hash('sha512') }}" 39 | update_password: on_create 40 | generate_ssh_key: no 41 | create_home: no 42 | with_items: "{{ users }}" 43 | when: inventory_hostname in groups['cn'] 44 | - name: cat ssh pubkey 45 | register: pubkey 46 | become: yes 47 | when: inventory_hostname in groups['ln'] 48 | copy: 49 | src: "{{ '/home/'+item.name+'/.ssh/id_rsa.pub' }}" 50 | dest: "{{ role_path+'/files/'+item.name+'.pub' }}" 51 | remote_src: true 52 | with_items: "{{ users }}" 53 | - name: authorized keys add 54 | become: yes 55 | authorized_key: 56 | exclusive: no 57 | user: "{{ item.name }}" 58 | key: "{{ lookup('file', '../files/'+item.name+'.pub') }}" 59 | with_items: "{{ users }}" 60 | when: inventory_hostname in groups['ln'] 61 | - name: add user into slurm database 62 | command: "sacctmgr -i add user {{ item.name }} account={{ item['sacct_account']|default(sacct_default_account) }} qos={{ item['sacct_qos']|default(sacct_default_qos) }}" 63 | when: inventory_hostname in groups['ln'] 64 | with_items: "{{ users }}" 65 | failed_when: r.rc != 1 and r.rc != 0 66 | changed_when: r.rc == 0 67 | register: r 68 | 
- name: add nproc limit to all nodes avoiding shell fork bombs 69 | become: yes 70 | copy: 71 | src: nproc.conf 72 | dest: /etc/security/limits.d/nproc.conf 73 | - name: add memory limit to ln nodes for normal user 74 | become: yes 75 | copy: 76 | src: memory.conf 77 | dest: /etc/security/limits.d/memory.conf 78 | when: inventory_hostname in groups['ln'] 79 | - name: fs quota limit on ln nodes 80 | become: yes 81 | command: "setquota -u {{ item.name }} {{ item['quota_soft']|default(quota_default_soft) }} {{ item['quota_hard']|default(quota_default_hard) }} 0 0 {{ quota_root }} " 82 | when: inventory_hostname in groups['ln'] and set_quota 83 | with_items: "{{ users }}" 84 | changed_when: r.rc != 0 85 | register: r 86 | -------------------------------------------------------------------------------- /site.yml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | gather_facts: no 3 | environment: "{{env_vars}}" 4 | roles: 5 | # - network 6 | # - basic 7 | # - storage 8 | # - drivers 9 | # - slurm 10 | # - mpi 11 | # - spack 12 | # - python 13 | # - ganglia 14 | # - user 15 | # - elk 16 | # - elastalert 17 | # - cgroup 18 | # - restic 19 | --------------------------------------------------------------------------------
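As a usage sketch, a role is enabled by uncommenting it in site.yml and rerunning the playbook; the command and ordering below assume the inventory is already set up and follow the note in the storage role's README that network and basic come before storage.

```yaml
# site.yml with only the infrastructure roles enabled (sketch)
# run with e.g.:  ansible-playbook site.yml   # add --check --diff for a dry run
- hosts: all
  gather_facts: no
  environment: "{{env_vars}}"
  roles:
    - network
    - basic
    - storage
    # remaining roles (drivers, slurm, mpi, spack, ...) stay commented
    # until the base infrastructure is in place
```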