├── roles ├── rsyslog-collector │ ├── defaults │ │ └── main.yml │ ├── handlers │ │ └── main.yml │ ├── templates │ │ └── collect.conf │ └── tasks │ │ └── main.yml ├── etchosts │ ├── defaults │ │ └── main.yml │ ├── tasks │ │ └── main.yml │ ├── templates │ │ └── hosts.j2 │ └── README ├── slurm │ ├── templates │ │ ├── cgroup.conf │ │ └── slurm.conf │ ├── handlers │ │ └── main.yml │ ├── files │ │ ├── munge.key │ │ └── slurm_import.conf │ ├── defaults │ │ └── main.yml │ ├── README.md │ └── tasks │ │ └── main.yml ├── elk-log-server │ ├── defaults │ │ └── main.yml │ ├── README │ ├── templates │ │ ├── 01-elasticsearch-storage.conf │ │ ├── 10-syslog-input.conf │ │ ├── kibana.yml │ │ ├── supervisord.conf │ │ └── elasticsearch.yml │ ├── files │ │ └── elasticsearch-1.5.repo │ ├── handlers │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── nfs-server │ ├── handlers │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ ├── templates │ │ └── exports │ └── tasks │ │ └── main.yml ├── rsyslog-forwarder │ ├── templates │ │ └── forward.conf │ ├── handlers │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── ajdecon-repo │ ├── defaults │ │ └── main.yml │ ├── files │ │ └── yum-s3-0.2.4-1.noarch.rpm │ ├── tasks │ │ └── main.yml │ └── templates │ │ └── ajdecon-repo.el6.repo.j2 ├── ganglia-gmond │ ├── handlers │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ ├── tasks │ │ └── main.yml │ └── templates │ │ └── gmond.conf ├── ganglia-gmetad │ ├── handlers │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ ├── tasks │ │ └── main.yml │ └── templates │ │ └── gmetad.conf ├── slurmdbd │ ├── handlers │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ ├── README.md │ ├── templates │ │ └── slurmdbd.conf │ └── tasks │ │ └── main.yml ├── epel6 │ ├── tasks │ │ └── main.yml │ └── files │ │ └── epel.repo ├── nfs-client │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── common-cluster-pkgs │ ├── files │ │ └── osu-micro-benchmarks-4.4.1.tar.gz │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml └── sethostname │ └── tasks │ └── main.yml ├── config.yml ├── s3.yml ├── hosts.real ├── hosts.test ├── LICENSE ├── cluster.yml └── README.md /roles/rsyslog-collector/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | syslog_port: 514 3 | -------------------------------------------------------------------------------- /roles/etchosts/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | hosts_groupname: cluster 3 | 4 | -------------------------------------------------------------------------------- /roles/slurm/templates/cgroup.conf: -------------------------------------------------------------------------------- 1 | CgroupAutomount=yes 2 | ConstrainCores=yes 3 | -------------------------------------------------------------------------------- /roles/elk-log-server/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | logstash_syslog_listen_port: 5200 3 | -------------------------------------------------------------------------------- /roles/nfs-server/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: exportfs 3 | command: exportfs -a 4 | -------------------------------------------------------------------------------- /roles/rsyslog-forwarder/templates/forward.conf: -------------------------------------------------------------------------------- 1 | *.* 
@{{syslog_target}}:{{syslog_target_port}} 2 | -------------------------------------------------------------------------------- /roles/ajdecon-repo/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | s3_key_id: 'NO DEFAULT VALUE' 3 | s3_secret_key: 'NO DEFAULT VALUE' 4 | -------------------------------------------------------------------------------- /roles/ganglia-gmond/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: gmond is restarted 3 | service: name=gmond state=restarted 4 | -------------------------------------------------------------------------------- /roles/ganglia-gmetad/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: gmetad is restarted 3 | service: name=gmetad state=restarted 4 | -------------------------------------------------------------------------------- /roles/rsyslog-collector/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: rsyslog is restarted 3 | service: name=rsyslog state=restarted 4 | -------------------------------------------------------------------------------- /roles/rsyslog-forwarder/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: rsyslog is restarted 3 | service: name=rsyslog state=restarted 4 | -------------------------------------------------------------------------------- /roles/slurmdbd/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: slurmdbd is restarted 3 | action: service name=slurmdbd state=restarted 4 | -------------------------------------------------------------------------------- /roles/rsyslog-forwarder/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | syslog_target: head 3 | syslog_target_port: 514 4 | syslog_fwd_file: "forward.conf" 5 | -------------------------------------------------------------------------------- /roles/elk-log-server/README: -------------------------------------------------------------------------------- 1 | An extremely simple (read: stupid) role that sets up an ELK-stack log server 2 | with a single syslog input. 3 | -------------------------------------------------------------------------------- /roles/elk-log-server/templates/01-elasticsearch-storage.conf: -------------------------------------------------------------------------------- 1 | output { 2 | elasticsearch { 3 | host => localhost 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /roles/epel6/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: yum.repos.d includes epel.repo 3 | action: copy src=epel.repo dest=/etc/yum.repos.d/epel.repo 4 | -------------------------------------------------------------------------------- /roles/slurmdbd/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | slurmdbd_password: "$LurM42" 3 | mysql_host: localhost 4 | mysql_admin: root 5 | mysql_pw: "@ns1bl3!"
6 | -------------------------------------------------------------------------------- /roles/elk-log-server/templates/10-syslog-input.conf: -------------------------------------------------------------------------------- 1 | input { 2 | syslog { 3 | port => {{ logstash_syslog_listen_port }} 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /roles/ganglia-gmetad/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | grid_name: "SlurmCluster" 3 | xml_port: 8651 4 | interactive_port: 8652 5 | enable_web_interface: true 6 | -------------------------------------------------------------------------------- /roles/ganglia-gmond/defaults/main.yml: -------------------------------------------------------------------------------- 1 | cluster_name: defaultcluster 2 | cluster_owner: unspecified 3 | target_host: head 4 | target_port: 8649 5 | recv_port: 8649 6 | -------------------------------------------------------------------------------- /roles/nfs-client/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nfs_mounts: 3 | - path: /home 4 | mountpoint: /home 5 | server: head 6 | options: "defaults,vers=3" 7 | -------------------------------------------------------------------------------- /roles/nfs-server/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nfs_exports: 3 | - path: /home 4 | options: "rw,no_root_squash" 5 | 6 | nfs_allowed_groups: 7 | - cluster 8 | -------------------------------------------------------------------------------- /roles/ajdecon-repo/files/yum-s3-0.2.4-1.noarch.rpm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajdecon/ansible-simple-slurm-cluster/HEAD/roles/ajdecon-repo/files/yum-s3-0.2.4-1.noarch.rpm -------------------------------------------------------------------------------- /roles/etchosts/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: regather data 3 | action: setup 4 | 5 | - name: auto-generate /etc/hosts 6 | action: template src=hosts.j2 dest=/etc/hosts 7 | 8 | 9 | -------------------------------------------------------------------------------- /roles/common-cluster-pkgs/files/osu-micro-benchmarks-4.4.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajdecon/ansible-simple-slurm-cluster/HEAD/roles/common-cluster-pkgs/files/osu-micro-benchmarks-4.4.1.tar.gz -------------------------------------------------------------------------------- /roles/slurmdbd/README.md: -------------------------------------------------------------------------------- 1 | slurmdbd 2 | ======== 3 | 4 | Set up a MySQL db and slurmdbd for accounting storage. 5 | 6 | Requirements: 7 | - Inject the variable `slurmdbd_password`; the value in defaults/main.yml is only a placeholder and should be overridden for any real deployment.
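For example, the password could be supplied from a separate vars file when the role is applied (a sketch only; `secrets.yml` is an illustrative file name, not part of this repo):

    - hosts: head
      user: root
      vars_files:
        - secrets.yml   # illustrative: defines slurmdbd_password (and mysql_pw)
      roles:
        - "slurmdbd"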
8 | -------------------------------------------------------------------------------- /roles/common-cluster-pkgs/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | openblas_enabled: true 3 | openmpi_enabled: false 4 | mpich2_enabled: true 5 | python_enabled: false 6 | openib_enabled: true 7 | upload_osu_benchmarks: false 8 | -------------------------------------------------------------------------------- /roles/etchosts/templates/hosts.j2: -------------------------------------------------------------------------------- 1 | 127.0.0.1 localhost localhost.localdomain 2 | {% for host in groups[hosts_groupname] %} 3 | {{ hostvars[host]['ansible_eth0']['ipv4']['address'] }} {{ hostvars[host]['ansible_hostname'] }} 4 | {% endfor %} 5 | -------------------------------------------------------------------------------- /roles/slurm/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: munge is restarted 3 | service: name=munge state=restarted 4 | 5 | - name: slurm is restarted 6 | service: name=slurm state=restarted 7 | 8 | - name: rsyslog is restarted 9 | service: name=rsyslog state=restarted 10 | -------------------------------------------------------------------------------- /roles/slurm/files/munge.key: -------------------------------------------------------------------------------- 1 | You should replace this file with random data, i.e. by using 2 | 3 | dd if=/dev/random of=munge.key bs=8 count=128 4 | 5 | Munge should work if you just use a file with text in it (even this file!), 6 | but the security is less likely to be good. 7 | -------------------------------------------------------------------------------- /roles/elk-log-server/files/elasticsearch-1.5.repo: -------------------------------------------------------------------------------- 1 | [elasticsearch-1.5] 2 | name=Elasticsearch repository for 1.5.x packages 3 | baseurl=http://packages.elasticsearch.org/elasticsearch/1.5/centos 4 | gpgcheck=1 5 | gpgkey=http://packages.elasticsearch.org/GPG-KEY-elasticsearch 6 | enabled=1 7 | -------------------------------------------------------------------------------- /roles/etchosts/README: -------------------------------------------------------------------------------- 1 | etchosts 2 | ======== 3 | 4 | A simple role for setting up an /etc/hosts file based on an Ansible group. 5 | 6 | Variables: 7 | ---------- 8 | 9 | * hosts_groupname: Ansible group to iterate through when generating file. 10 | Default value is "cluster". 
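For illustration, a play that builds /etc/hosts from a different group might look something like this (the group name "gpunodes" is made up):

    - hosts: gpunodes
      user: root
      roles:
        - role: "etchosts"
          hosts_groupname: gpunodes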
11 | -------------------------------------------------------------------------------- /roles/elk-log-server/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart elasticsearch 3 | service: name=elasticsearch state=restarted 4 | 5 | - name: restart supervisord 6 | service: name=supervisord state=restarted 7 | 8 | - name: restart logstash 9 | service: name=logstash state=restarted 10 | -------------------------------------------------------------------------------- /roles/nfs-server/templates/exports: -------------------------------------------------------------------------------- 1 | {% for export in nfs_exports -%} 2 | {% for g in nfs_allowed_groups -%} 3 | {% for host in groups[g] -%} 4 | {{export.path}} {{hostvars[host]['ansible_eth0']['ipv4']['address']}}/255.255.255.255({{export.options}}) 5 | {% endfor %} 6 | {% endfor %} 7 | {% endfor %} 8 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # Several of the roles (including the slurm role, nfs-client, and 4 | # rsyslog-forwarder roles) assume your head node will be named 5 | # "head". See the roles' defaults/main.yml files to see where this 6 | # might need to be changed. 7 | 8 | slurmctld_host: head 9 | slurm_cpus_per_node: 2 10 | upload_osu_benchmarks: true 11 | -------------------------------------------------------------------------------- /roles/rsyslog-collector/templates/collect.conf: -------------------------------------------------------------------------------- 1 | # Provides UDP syslog reception 2 | $ModLoad imudp 3 | $UDPServerRun 514 4 | 5 | # Provides TCP syslog reception 6 | $ModLoad imtcp 7 | $InputTCPServerRun 514 8 | 9 | # Collect cluster logs per-host 10 | $template ClusterFile,"/var/log/cluster/%HOSTNAME%" 11 | :source, !isequal, "localhost" ?ClusterFile 12 | -------------------------------------------------------------------------------- /roles/slurm/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | slurmctld_host: head 3 | 4 | slurmdbd_enabled: false 5 | slurmdbd_host: head 6 | 7 | slurm_compute_group: compute 8 | slurm_cluster_name: defaultcluster 9 | slurm_partition_name: debug 10 | slurm_cpus_per_node: 1 11 | 12 | slurm_state_dir: /var/lib/slurm 13 | slurm_acct_file: /var/log/slurm_jobacct.log 14 | 15 | slurm_log_to_syslog: false 16 | -------------------------------------------------------------------------------- /roles/rsyslog-forwarder/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: ensure rsyslog is installed 3 | yum: name=rsyslog state=present 4 | 5 | - name: add forward.conf to syslog config 6 | template: src="forward.conf" dest="/etc/rsyslog.d/{{ syslog_fwd_file }}" 7 | notify: 8 | - rsyslog is restarted 9 | 10 | - name: ensure rsyslog is running and enabled 11 | service: name=rsyslog state=started enabled=yes 12 | -------------------------------------------------------------------------------- /roles/ajdecon-repo/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: upload yum-s3 rpm 3 | action: copy src=yum-s3-0.2.4-1.noarch.rpm dest=/tmp/yum-s3-0.2.4-1.noarch.rpm 4 | 5 | - name: install yum-s3 6 | action: yum name=/tmp/yum-s3-0.2.4-1.noarch.rpm state=present 7 | 8 | - name: yum.repos.d includes 
ajdecon-repo.el6.repo 9 | action: template src=ajdecon-repo.el6.repo.j2 dest=/etc/yum.repos.d/ajdecon-repo.el6.repo 10 | -------------------------------------------------------------------------------- /roles/ganglia-gmond/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install gmond packages 3 | yum: name={{ item }} state=present 4 | with_items: 5 | - ganglia-gmond 6 | - ganglia-gmond-python 7 | 8 | - name: gmond is configured 9 | template: src="gmond.conf" dest="/etc/ganglia/gmond.conf" 10 | notify: 11 | - gmond is restarted 12 | 13 | - name: gmond is enabled and running 14 | service: name=gmond state=started enabled=yes 15 | 16 | -------------------------------------------------------------------------------- /s3.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # These keys are used for authorizing access to my RPM repo on S3. 4 | # You should build your own RPM repo for installing any software not 5 | # in a public repo. 6 | # 7 | # See http://www.carrollops.com/blog/2012/09/11/s3-yum-repos-with-iam-authorization/ 8 | # and https://github.com/jbraeuer/yum-s3-plugin 9 | # to see how I set this up for myself. 10 | 11 | s3_key_id: 'NOTHING HERE' 12 | s3_secret_key: 'NOT REAL' 13 | -------------------------------------------------------------------------------- /roles/slurmdbd/templates/slurmdbd.conf: -------------------------------------------------------------------------------- 1 | ArchiveEvents=yes 2 | ArchiveJobs=yes 3 | AuthType=auth/munge 4 | DbdHost=localhost 5 | DebugLevel=4 6 | PurgeEventAfter=1month 7 | PurgeJobAfter=24month 8 | PurgeStepAfter=1month 9 | PurgeSuspendAfter=1month 10 | LogFile=/var/log/slurmdbd.log 11 | PidFile=/var/tmp/slurmdbd.pid 12 | SlurmUser=slurm 13 | StorageHost={{mysql_host}} 14 | StoragePass={{slurmdbd_password}} 15 | StorageType=accounting_storage/mysql 16 | StorageUser=slurm 17 | -------------------------------------------------------------------------------- /roles/ajdecon-repo/templates/ajdecon-repo.el6.repo.j2: -------------------------------------------------------------------------------- 1 | [ajdecon-repo.el6.x86_64] 2 | name = ajdecon-repo.el6.x86_64 3 | baseurl = https://ajdecon-repo.s3.amazonaws.com/el6/x86_64/ 4 | gpgcheck=0 5 | enabled=1 6 | s3_enabled=1 7 | key_id={{s3_key_id}} 8 | secret_key={{s3_secret_key}} 9 | 10 | [ajdecon-repo.el6.noarch] 11 | name = ajdecon-repo.el6.noarch 12 | baseurl = https://ajdecon-repo.s3.amazonaws.com/el6/noarch/ 13 | gpgcheck=0 14 | enabled=1 15 | s3_enabled=1 16 | key_id={{s3_key_id}} 17 | secret_key={{s3_secret_key}} 18 | 19 | -------------------------------------------------------------------------------- /roles/rsyslog-collector/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: ensure rsyslog is installed 3 | yum: name=rsyslog state=present 4 | 5 | - name: add collect.conf to syslog config 6 | template: src="collect.conf" dest="/etc/rsyslog.d/forward.conf" 7 | notify: 8 | - rsyslog is restarted 9 | 10 | - name: ensure /var/log/cluster exists 11 | file: path=/var/log/cluster owner=root group=root mode=0700 state=directory 12 | 13 | - name: ensure rsyslog is running and enabled 14 | service: name=rsyslog state=started enabled=yes 15 | -------------------------------------------------------------------------------- /roles/nfs-server/tasks/main.yml: 
-------------------------------------------------------------------------------- 1 | --- 2 | - name: ensure nfs-related packages are installed 3 | yum: name={{item}} state=present 4 | with_items: 5 | - nfs-utils 6 | - nfs-utils-lib 7 | - nfs-utils-lib-devel 8 | - nfs4-acl-tools 9 | 10 | - name: configure exports 11 | template: src=exports dest=/etc/exports 12 | notify: 13 | - exportfs 14 | 15 | - name: ensure rpcbind is running 16 | service: name=rpcbind state=started enabled=yes 17 | 18 | - name: ensure nfs is running 19 | service: name=nfs state=started enabled=yes 20 | -------------------------------------------------------------------------------- /roles/sethostname/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: live hostname set to setname value 3 | action: command hostname {{setname}} 4 | 5 | - name: re-gather facts to update ansible_hostname 6 | action: setup 7 | when: not ansible_hostname == "{{ setname }}" 8 | 9 | - name: persistent hostname set to setname value 10 | action: lineinfile dest=/etc/sysconfig/network regexp=^HOSTNAME= line=HOSTNAME={{setname}} 11 | 12 | - name: /etc/hosts reflects setname value 13 | action: lineinfile dest=/etc/hosts regexp=^{{ansible_eth0.ipv4.address}} line="{{ansible_eth0.ipv4.address}} {{setname}}" 14 | -------------------------------------------------------------------------------- /roles/slurm/README.md: -------------------------------------------------------------------------------- 1 | slurm 2 | ===== 3 | 4 | This role installs SLURM, the HPC cluster resource manager. 5 | 6 | Things you should change: 7 | 8 | - This role is dependent on the presence of a yum repo 9 | which contains the slurm packages. The easiest way to 10 | do this (for me) is to have another role which sets up 11 | the appropriate repo, and include it as a role dependency 12 | in meta/main.yml 13 | 14 | - You need to have a file munge.key for cluster node 15 | authentication in files/munge.key. 
Run, 16 | 17 | dd bs=1 if=/dev/urandom count=1024 of=files/munge.key 18 | -------------------------------------------------------------------------------- /roles/nfs-client/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: ensure nfs packages are installed 3 | yum: name={{item}} state=present 4 | with_items: 5 | - nfs-utils 6 | - nfs4-acl-tools 7 | 8 | - name: ensure mountpoints exist 9 | file: path={{item.mountpoint}} state=directory 10 | with_items: nfs_mounts 11 | 12 | - name: ensure rpcbind is running 13 | service: name=rpcbind state=started enabled=yes 14 | 15 | - name: ensure filesystems are mounted 16 | mount: name={{item.mountpoint}} src="{{item.server}}:{{item.path}}" fstype=nfs opts={{item.options}} state=mounted 17 | with_items: nfs_mounts 18 | 19 | -------------------------------------------------------------------------------- /roles/slurm/files/slurm_import.conf: -------------------------------------------------------------------------------- 1 | $ModLoad imfile 2 | 3 | $InputFileName /var/log/slurmd.log 4 | $InputFileStateFile stat-slurmd 5 | $InputFileTag slurmd: 6 | $InputFileSeverity info 7 | $InputFileFacility local7 8 | $InputFilePollInterval 60 9 | $InputRunFileMonitor 10 | 11 | $InputFileName /var/log/slurmctld.log 12 | $InputFileStateFile stat-slurmctld 13 | $InputFileTag slurmctld: 14 | $InputFileSeverity info 15 | $InputFileFacility local7 16 | $InputFilePollInterval 60 17 | $InputRunFileMonitor 18 | 19 | $InputFileName /var/log/slurmsched.log 20 | $InputFileStateFile stat-slurmsched 21 | $InputFileTag slurmsched: 22 | $InputFileSeverity info 23 | $InputFileFacility local7 24 | $InputFilePollInterval 60 25 | $InputRunFileMonitor 26 | 27 | -------------------------------------------------------------------------------- /roles/slurmdbd/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: slurmdbd packages are installed 3 | yum: name={{item}} state=present 4 | with_items: 5 | - slurm-munge 6 | - slurm-plugins 7 | - munge 8 | - slurm-slurmdbd 9 | - slurm-sql 10 | - mysql-server 11 | - MySQL-python 12 | 13 | - name: mysqld is running 14 | service: name=mysqld state=started 15 | 16 | - name: slurm user exists 17 | user: name=slurm state=present 18 | 19 | - name: slurm mysqld user is configured 20 | mysql_user: 21 | name=slurm 22 | password={{slurmdbd_password}} 23 | priv="slurm_acct_db.*:ALL" 24 | login_host={{mysql_host}} 25 | login_user={{mysql_admin}} 26 | login_password={{mysql_pw}} 27 | 28 | - name: slurmdbd service is configured 29 | template: src=slurmdbd.conf dest=/etc/slurm/slurmdbd.conf 30 | notify: 31 | - slurmdbd is restarted 32 | 33 | - name: slurmdbd is started and enabled 34 | service: name=slurmdbd state=started enabled=yes 35 | -------------------------------------------------------------------------------- /hosts.real: -------------------------------------------------------------------------------- 1 | # This is an example inventory file for using the playbooks in this repo. 2 | # Note that I'm using a role called "sethostname" to change the hostname 3 | # of each host, according to the per-host value "setname" defined below. 4 | # An "etchosts" role then sets up /etc/hosts according to the system 5 | # hostnames. 6 | 7 | # Several of the roles assume the name of the head node will be "head". 8 | # Their defaults/main.yml files should make it clear where this is needs 9 | # to be changed if you don't want that name. 
10 | 11 | # Other roles assume the groups "compute" and "cluster" are being used to 12 | # group the compute nodes and the entire cluster, respectively. Again, this 13 | # group name is usually defined as a variable in defaults/main.yml so you 14 | # can change it. 15 | 16 | [head] 17 | 52.5.44.27 setname=head 18 | 19 | [compute] 20 | 52.6.164.63 setname=compute0 21 | 52.6.176.90 setname=compute1 22 | 52.6.179.184 setname=compute2 23 | 24 | [cluster:children] 25 | head 26 | compute 27 | -------------------------------------------------------------------------------- /hosts.test: -------------------------------------------------------------------------------- 1 | # This is an example inventory file for using the playbooks in this repo. 2 | # Note that I'm using a role called "sethostname" to change the hostname 3 | # of each host, according to the per-host value "setname" defined below. 4 | # An "etchosts" role then sets up /etc/hosts according to the system 5 | # hostnames. 6 | 7 | # Several of the roles assume the name of the head node will be "head". 8 | # Their defaults/main.yml files should make it clear where this is needs 9 | # to be changed if you don't want that name. 10 | 11 | # Other roles assume the groups "compute" and "cluster" are being used to 12 | # group the compute nodes and the entire cluster, respectively. Again, this 13 | # group name is usually defined as a variable in defaults/main.yml so you 14 | # can change it. 15 | 16 | [head] 17 | XX.XX.XX.XX setname=head 18 | 19 | [compute] 20 | XX.XX.XX.XX setname=compute0 21 | XX.XX.XX.XX setname=compute1 22 | XX.XX.XX.XX setname=compute2 23 | XX.XX.XX.XX setname=compute3 24 | 25 | [cluster:children] 26 | head 27 | compute 28 | -------------------------------------------------------------------------------- /roles/epel6/files/epel.repo: -------------------------------------------------------------------------------- 1 | [epel6] 2 | name=Extra Packages for Enterprise Linux 6 - $basearch 3 | #baseurl=http://download.fedoraproject.org/pub/epel/6/$basearch 4 | mirrorlist=https://mirrors.fedoraproject.org/metalink?repo=epel-6&arch=$basearch 5 | failovermethod=priority 6 | enabled=1 7 | gpgcheck=1 8 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-6 9 | 10 | [epel6-debuginfo] 11 | name=Extra Packages for Enterprise Linux 6 - $basearch - Debug 12 | #baseurl=http://download.fedoraproject.org/pub/epel/6/$basearch/debug 13 | mirrorlist=https://mirrors.fedoraproject.org/metalink?repo=epel-debug-6&arch=$basearch 14 | failovermethod=priority 15 | enabled=0 16 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-6 17 | gpgcheck=1 18 | 19 | [epel6-source] 20 | name=Extra Packages for Enterprise Linux 6 - $basearch - Source 21 | #baseurl=http://download.fedoraproject.org/pub/epel/6/SRPMS 22 | mirrorlist=https://mirrors.fedoraproject.org/metalink?repo=epel-source-6&arch=$basearch 23 | failovermethod=priority 24 | enabled=0 25 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-6 26 | gpgcheck=1 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Adam DeConinck 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the 
Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /roles/ganglia-gmetad/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install ganglia packages 3 | yum: name={{ item }} state=present 4 | with_items: 5 | - ganglia-devel 6 | - ganglia-gmetad 7 | - ganglia-web 8 | 9 | - name: gmetad is configured 10 | template: src="gmetad.conf" dest="/etc/ganglia/gmetad.conf" 11 | notify: 12 | - gmetad is restarted 13 | 14 | - name: gmetad is enabled and running 15 | service: name=gmetad state=started enabled=yes 16 | 17 | - name: httpd is enabled and running for the web interface 18 | when: enable_web_interface 19 | service: name=httpd state=started enabled=yes 20 | 21 | - name: ensure /var/lib/ganglia-web/dwoo/ is writable by web interface 22 | when: enable_web_interface 23 | file: path=/var/lib/ganglia-web/dwoo group=apache mode=0775 24 | 25 | - name: ensure /var/lib/ganglia-web/dwoo/cache is writable by web interface 26 | when: enable_web_interface 27 | file: path=/var/lib/ganglia-web/dwoo/cache group=apache mode=0775 28 | 29 | - name: ensure /var/lib/ganglia-web/dwoo/compiled is writable by web interface 30 | when: enable_web_interface 31 | file: path=/var/lib/ganglia-web/dwoo/compiled group=apache mode=0775 32 | -------------------------------------------------------------------------------- /cluster.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This Ansible playbook sets up a relatively basic HPC-style compute cluster 3 | # using the SLURM resource manager. It also sets up a basic NFS share of /home 4 | # to the compute nodes, configures Ganglia and syslog forwarding to make it 5 | # easier to monitor, and installs some common packages for HPC. 6 | # 7 | # Note that the file config.yml is used to define some variables which 8 | # configure the roles' behavior. The roles should be written in such a way that 9 | # you can run with all the defaults, and leave config.yml empty. 
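# As a sketch, a minimal config.yml override might contain nothing more than
# (variable names taken from roles/slurm/defaults/main.yml; values here are
# illustrative):
#
#   ---
#   slurm_cpus_per_node: 4
#   slurm_cluster_name: testcluster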
10 | 11 | 12 | # Run roles common to all hosts in cluster 13 | - hosts: cluster 14 | user: root 15 | vars_files: 16 | - config.yml 17 | - s3.yml 18 | roles: 19 | - "epel6" 20 | - "ajdecon-repo" 21 | - "sethostname" 22 | - "etchosts" 23 | - "slurm" 24 | - "ganglia-gmond" 25 | - "common-cluster-pkgs" 26 | 27 | # Run roles specific to the cluster headnode 28 | - hosts: head 29 | user: root 30 | vars_files: 31 | - config.yml 32 | roles: 33 | - "nfs-server" 34 | - "rsyslog-collector" 35 | - "ganglia-gmetad" 36 | - "elk-log-server" 37 | - role: "rsyslog-forwarder" 38 | syslog_target: "localhost" 39 | syslog_target_port: 5200 40 | syslog_fwd_file: "elk-fwd.conf" 41 | 42 | # Run roles specific to the cluster compute nodes 43 | - hosts: compute 44 | user: root 45 | vars_files: 46 | - config.yml 47 | roles: 48 | - "nfs-client" 49 | - "rsyslog-forwarder" 50 | -------------------------------------------------------------------------------- /roles/slurm/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: munge and slurm are installed 3 | yum: name={{item}} state=present 4 | with_items: 5 | - munge 6 | - slurm 7 | - slurm-munge 8 | - slurm-plugins 9 | - slurm-pam_slurm 10 | - slurm-perlapi 11 | - libcgroup 12 | 13 | - name: cgroups are started and enabled 14 | service: name=cgconfig state=started enabled=yes 15 | 16 | - name: munge key is present 17 | copy: src=munge.key dest=/etc/munge/munge.key owner=munge group=munge mode=0600 force=yes 18 | notify: 19 | - munge is restarted 20 | 21 | - name: munge is started and enabled 22 | service: name=munge state=started enabled=yes 23 | 24 | - name: slurm group exists 25 | group: name=slurm state=present 26 | 27 | - name: slurm user exists 28 | user: name=slurm group=slurm state=present 29 | 30 | - name: slurm_state_dir exists 31 | file: path={{ slurm_state_dir }} owner=slurm group=slurm mode=0755 state=directory 32 | 33 | - name: accounting storage file exists 34 | file: path={{ slurm_acct_file }} owner=slurm group=slurm mode=0644 state=touch 35 | 36 | - name: slurm is configured 37 | template: src={{item}} dest=/etc/slurm/{{item}} owner=slurm mode=0644 38 | with_items: 39 | - slurm.conf 40 | - cgroup.conf 41 | notify: 42 | - slurm is restarted 43 | 44 | - name: slurm is started and enabled 45 | service: name=slurm state=started enabled=yes 46 | 47 | - name: check if rsyslog.d exists 48 | stat: path="/etc/rsyslog.d" 49 | register: check_rsyslogd 50 | 51 | - when: check_rsyslogd.stat.exists 52 | name: drop rsyslog.d config file 53 | copy: src="slurm_import.conf" dest="/etc/rsyslog.d/slurm_import.conf" 54 | owner=root group=root mode=0444 55 | notify: 56 | - rsyslog is restarted 57 | 58 | -------------------------------------------------------------------------------- /roles/common-cluster-pkgs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install common packages 3 | yum: name={{ item }} state=present 4 | with_items: 5 | - "environment-modules" 6 | - "pdsh" 7 | - "pdsh-rcmd-ssh" 8 | - "@development" 9 | - "gcc-gfortran" 10 | - hdf5 11 | - hdf5-devel 12 | - hdf5-static 13 | - netcdf 14 | - netcdf-devel 15 | 16 | - name: install openib support 17 | when: openib_enabled 18 | yum: name={{ item }} state=present 19 | with_items: 20 | - "@infiniband" 21 | - libibcm-devel 22 | - librdmacm-devel 23 | - libibverbs-devel 24 | - libibmad-devel 25 | - libibumad-devel 26 | 27 | - name: install openblas 28 | when: openblas_enabled 29 | yum: 
name={{ item }} state=present 30 | with_items: 31 | - openblas 32 | - openblas-devel 33 | - openblas-openmp 34 | 35 | - name: install OpenMPI packages 36 | when: openmpi_enabled 37 | yum: name={{ item }} state=present 38 | with_items: 39 | - openmpi 40 | - openmpi-devel 41 | - mpi4py-openmpi 42 | - mpitests-openmpi 43 | - boost-openmpi 44 | - boost-openmpi-devel 45 | - paraview-openmpi 46 | - hdf5-openmpi 47 | - scalapack-openmpi 48 | 49 | - name: install mpich2 packages 50 | when: mpich2_enabled 51 | yum: name={{ item }} state=present 52 | with_items: 53 | - mpich2 54 | - mpich2-devel 55 | - mpi4py-mpich 56 | - mpitests-mpich 57 | - hdf5-mpich 58 | - paraview-mpich 59 | - scalapack-mpich 60 | 61 | - name: install python packages 62 | when: python_enabled 63 | yum: name={{ item }} state=present 64 | with_items: 65 | - numpy 66 | - numpy-f2py 67 | - scipy 68 | - h5py 69 | 70 | - name: upload osu mpi micro-benchmarks 71 | when: upload_osu_benchmarks 72 | copy: src="osu-micro-benchmarks-4.4.1.tar.gz" dest="/opt/osu-micro-benchmarks-4.4.1.tar.gz" mode=0444 73 | -------------------------------------------------------------------------------- /roles/elk-log-server/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: import and trust elasticsearch rpm gpg key 3 | rpm_key: key="https://packages.elasticsearch.org/GPG-KEY-elasticsearch" 4 | 5 | - name: configure elasticsearch repo 6 | copy: src="elasticsearch-1.5.repo" dest="/etc/yum.repos.d/elasticsearch-1.5.repo" 7 | owner=root group=root mode=0444 8 | 9 | - name: upload logstash and kibana (not in repos) 10 | get_url: url="{{ item }}" dest="/tmp/" 11 | with_items: 12 | - "https://download.elastic.co/kibana/kibana/kibana-4.0.2-linux-x64.tar.gz" 13 | - "https://download.elastic.co/logstash/logstash/packages/centos/logstash-1.4.2-1_2c0f5a1.noarch.rpm" 14 | 15 | - name: install elasticsearch and logstash 16 | yum: name="{{ item }}" state=present 17 | with_items: 18 | - java-1.7.0-openjdk 19 | - elasticsearch 20 | - "/tmp/logstash-1.4.2-1_2c0f5a1.noarch.rpm" 21 | - supervisor 22 | 23 | - name: configure elasticsearch 24 | template: src="elasticsearch.yml" dest="/etc/elasticsearch/elasticsearch.yml" 25 | owner=root group=root mode=0444 26 | notify: 27 | - restart elasticsearch 28 | 29 | - name: elasticsearch started and enabled 30 | service: name=elasticsearch state=started enabled=yes 31 | 32 | - name: extract kibana 33 | unarchive: src="/tmp/kibana-4.0.2-linux-x64.tar.gz" dest="/opt" copy=no 34 | 35 | - name: configure kibana 36 | template: src="kibana.yml" dest="/opt/kibana-4.0.2-linux-x64/kibana.yml" 37 | owner=root group=root mode=0444 38 | 39 | - name: configure supervisord (run kibana) 40 | template: src="supervisord.conf" dest="/etc/supervisord.conf" backup=yes 41 | notify: 42 | - restart supervisord 43 | 44 | - name: supervisord started and enabled 45 | service: name=supervisord state=started enabled=yes 46 | 47 | - name: configure logstash 48 | template: src="{{ item }}" dest="/etc/logstash/conf.d/" 49 | with_items: 50 | - "01-elasticsearch-storage.conf" 51 | - "10-syslog-input.conf" 52 | notify: 53 | - restart logstash 54 | 55 | - name: logstash started and enabled 56 | service: name=logstash state=started enabled=yes 57 | -------------------------------------------------------------------------------- /roles/elk-log-server/templates/kibana.yml: -------------------------------------------------------------------------------- 1 | # Kibana is served by a back end server. 
This controls which port to use. 2 | port: 5601 3 | 4 | # The host to bind the server to. 5 | host: "0.0.0.0" 6 | 7 | # The Elasticsearch instance to use for all your queries. 8 | elasticsearch_url: "http://localhost:9200" 9 | 10 | # preserve_elasticsearch_host true will send the hostname specified in `elasticsearch`. If you set it to false, 11 | # then the host you use to connect to *this* Kibana instance will be sent. 12 | elasticsearch_preserve_host: true 13 | 14 | # Kibana uses an index in Elasticsearch to store saved searches, visualizations 15 | # and dashboards. It will create a new index if it doesn't already exist. 16 | kibana_index: ".kibana" 17 | 18 | # If your Elasticsearch is protected with basic auth, these are the user credentials 19 | # used by the Kibana server to perform maintenance on the kibana_index at startup. Your Kibana 20 | # users will still need to authenticate with Elasticsearch (which is proxied through 21 | # the Kibana server) 22 | # kibana_elasticsearch_username: user 23 | # kibana_elasticsearch_password: pass 24 | 25 | # If your Elasticsearch requires client certificate and key 26 | # kibana_elasticsearch_client_crt: /path/to/your/client.crt 27 | # kibana_elasticsearch_client_key: /path/to/your/client.key 28 | 29 | # If you need to provide a CA certificate for your Elasticsearch instance, put 30 | # the path of the pem file here. 31 | # ca: /path/to/your/CA.pem 32 | 33 | # The default application to load. 34 | default_app_id: "discover" 35 | 36 | # Time in milliseconds to wait for responses from the back end or elasticsearch. 37 | # This must be > 0 38 | request_timeout: 300000 39 | 40 | # Time in milliseconds for Elasticsearch to wait for responses from shards. 41 | # Set to 0 to disable. 42 | shard_timeout: 0 43 | 44 | # Set to false to have a complete disregard for the validity of the SSL 45 | # certificate. 46 | verify_ssl: true 47 | 48 | # SSL for outgoing requests from the Kibana Server (PEM formatted) 49 | # ssl_key_file: /path/to/your/server.key 50 | # ssl_cert_file: /path/to/your/server.crt 51 | 52 | # Set the path to where you would like the process id file to be created. 53 | # pid_file: /var/run/kibana.pid 54 | 55 | # Plugins that are included in the build, and no longer found in the plugins/ folder 56 | bundled_plugin_ids: 57 | - plugins/dashboard/index 58 | - plugins/discover/index 59 | - plugins/doc/index 60 | - plugins/kibana/index 61 | - plugins/markdown_vis/index 62 | - plugins/metric_vis/index 63 | - plugins/settings/index 64 | - plugins/table_vis/index 65 | - plugins/vis_types/index 66 | - plugins/visualize/index 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ansible-simple-slurm-cluster 2 | ============================ 3 | 4 | This repo contains a set of Ansible roles for setting up a relatively basic 5 | HPC-style compute cluster, along with an example playbook for using them. 6 | 7 | **These scripts should not be considered production-quality!** (Though 8 | you can always use them or get inspiration from them if you like.) I use 9 | the roles in this repo to set up ephemeral clusters while playing with new 10 | ideas or writing software. I am rarely concerned with stability or user 11 | experience, and more often with seeing what I can break. :) YMMV! 12 | 13 | They have mostly been tested on EC2 but there's 14 | nothing cloud-specific in them, so they should work on normal hardware. 
15 | 16 | 17 | So what does it do? 18 | ------------------- 19 | 20 | - Sets host names and builds an /etc/hosts file 21 | - Configures [SLURM](http://slurm.schedmd.com) as the resource manager 22 | - Exports /home over NFS from the head node to the compute nodes 23 | - Forwards syslog to the head node 24 | - Sets up the [Ganglia](http://ganglia.sourceforge.net/) monitoring system (not using multicast b/c mostly used on EC2) 25 | - Installs some common HPC dev tools 26 | 27 | This is usually enough to get me started on whatever other project I'm working 28 | on. 29 | 30 | 31 | Prerequisites 32 | ------------- 33 | 34 | - All the roles assume you're using EL6 (i.e. CentOS, RHEL, Scientific Linux) 35 | 36 | - [SLURM](http://slurm.schedmd.com) and [Munge](https://code.google.com/p/munge/) 37 | are not distributed as RPMs, so I built those RPMs and stuck 38 | them in a repository on S3. The "ajdecon-repo" role configures each node of 39 | the cluster to include this repo when installing software. 40 | 41 | The S3 bucket is not public because I like low bandwidth bills. :) The RPMs 42 | themselves are very easy to build, so I suggest just setting up your own 43 | YUM repo to install them from. 44 | 45 | 46 | How do I use the playbook? 47 | -------------------------- 48 | 49 | 1. Set up an Ansible [inventory file](http://docs.ansible.com/intro_inventory.html) 50 | similar to the included hosts.test. Each host can have a "setname=" 51 | parameter included next to it, and the "sethostname" role will use that 52 | to configure the hostname. 53 | 54 | Note that many of the roles assume your head node will be named "head", and 55 | that the "compute" and "cluster" groups exist. However, this can generally be 56 | changed for each role: see the variables in `defaults/main.yml` for each one. 57 | 58 | 2. Run `ansible-playbook -i <your-inventory-file> cluster.yml` and wait for your 59 | cluster to be ready! 60 | 61 | 62 | Other notes 63 | ----------- 64 | 65 | Many of the roles have variables you can set to control their behavior. (For 66 | example, there are a few knobs to turn on the "slurm" role.) You can change 67 | their values by setting them in "config.yml" and they'll get picked up 68 | in "cluster.yml". See the `defaults/main.yml` file in each role to see what 69 | variables are available to change. 70 | 71 | 72 | -------------------------------------------------------------------------------- /roles/elk-log-server/templates/supervisord.conf: -------------------------------------------------------------------------------- 1 | 2 | [supervisord] 3 | http_port=/var/tmp/supervisor.sock ; (default is to run a UNIX domain socket server) 4 | ;http_port=127.0.0.1:9001 ; (alternately, ip_address:port specifies AF_INET) 5 | ;sockchmod=0700 ; AF_UNIX socketmode (AF_INET ignore, default 0700) 6 | ;sockchown=nobody.nogroup ; AF_UNIX socket uid.gid owner (AF_INET ignores) 7 | ;umask=022 ; (process file creation umask;default 022) 8 | logfile=/var/log/supervisor/supervisord.log ; (main log file;default $CWD/supervisord.log) 9 | logfile_maxbytes=50MB ; (max main logfile bytes b4 rotation;default 50MB) 10 | logfile_backups=10 ; (num of main logfile rotation backups;default 10) 11 | loglevel=info ; (logging level;default info; others: debug,warn) 12 | pidfile=/var/run/supervisord.pid ; (supervisord pidfile;default supervisord.pid) 13 | nodaemon=false ; (start in foreground if true;default false) 14 | minfds=1024 ; (min. avail startup file descriptors;default 1024) 15 | minprocs=200 ; (min. 
avail process descriptors;default 200) 16 | 17 | ;nocleanup=true ; (don't clean up tempfiles at start;default false) 18 | ;http_username=user ; (default is no username (open system)) 19 | ;http_password=123 ; (default is no password (open system)) 20 | ;childlogdir=/tmp ; ('AUTO' child log dir, default $TEMP) 21 | ;user=chrism ; (default is current user, required if root) 22 | ;directory=/tmp ; (default is not to cd during start) 23 | ;environment=KEY=value ; (key value pairs to add to environment) 24 | 25 | [supervisorctl] 26 | serverurl=unix:///var/tmp/supervisor.sock ; use a unix:// URL for a unix socket 27 | ;serverurl=http://127.0.0.1:9001 ; use an http:// url to specify an inet socket 28 | ;username=chris ; should be same as http_username if set 29 | ;password=123 ; should be same as http_password if set 30 | ;prompt=mysupervisor ; cmd line prompt (default "supervisor") 31 | 32 | ; The below sample program section shows all possible program subsection values, 33 | ; create one or more 'real' program: sections to be able to control them under 34 | ; supervisor. 35 | 36 | [program:kibana] 37 | command=/opt/kibana-4.0.2-linux-x64/bin/kibana 38 | autostart=true 39 | autorestart=true 40 | logfile=syslog 41 | 42 | ;[program:theprogramname] 43 | ;command=/bin/cat ; the program (relative uses PATH, can take args) 44 | ;priority=999 ; the relative start priority (default 999) 45 | ;autostart=true ; start at supervisord start (default: true) 46 | ;autorestart=true ; retstart at unexpected quit (default: true) 47 | ;startsecs=10 ; number of secs prog must stay running (def. 10) 48 | ;startretries=3 ; max # of serial start failures (default 3) 49 | ;exitcodes=0,2 ; 'expected' exit codes for process (default 0,2) 50 | ;stopsignal=QUIT ; signal used to kill process (default TERM) 51 | ;stopwaitsecs=10 ; max num secs to wait before SIGKILL (default 10) 52 | ;user=chrism ; setuid to this UNIX account to run the program 53 | ;log_stdout=true ; if true, log program stdout (default true) 54 | ;log_stderr=true ; if true, log program stderr (def false) 55 | ;logfile=/var/log/cat.log ; child log path, use NONE for none; default AUTO 56 | ;logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) 57 | ;logfile_backups=10 ; # of logfile backups (default 10) 58 | 59 | 60 | -------------------------------------------------------------------------------- /roles/slurm/templates/slurm.conf: -------------------------------------------------------------------------------- 1 | # slurm.conf file generated by configurator.html. 2 | # Put this file on all nodes of your cluster. 3 | # See the slurm.conf man page for more information. 
4 | # 5 | ControlMachine={{slurmctld_host}} 6 | #ControlAddr= 7 | #BackupController= 8 | #BackupAddr= 9 | # 10 | AuthType=auth/munge 11 | CacheGroups=0 12 | #CheckpointType=checkpoint/none 13 | CryptoType=crypto/munge 14 | #DisableRootJobs=NO 15 | #EnforcePartLimits=NO 16 | #Epilog= 17 | #EpilogSlurmctld= 18 | #FirstJobId=1 19 | #MaxJobId=999999 20 | #GresTypes= 21 | #GroupUpdateForce=0 22 | #GroupUpdateTime=600 23 | #JobCheckpointDir=/var/slurm/checkpoint 24 | #JobCredentialPrivateKey= 25 | #JobCredentialPublicCertificate= 26 | #JobFileAppend=0 27 | #JobRequeue=1 28 | #JobSubmitPlugins=1 29 | #KillOnBadExit=0 30 | #Licenses=foo*4,bar 31 | #MailProg=/bin/mail 32 | #MaxJobCount=5000 33 | #MaxStepCount=40000 34 | #MaxTasksPerNode=128 35 | MpiDefault=none 36 | MpiParams=ports=12000-12999 37 | #PluginDir= 38 | #PlugStackConfig= 39 | #PrivateData=jobs 40 | ProctrackType=proctrack/pgid 41 | #Prolog= 42 | #PrologSlurmctld= 43 | #PropagatePrioProcess=0 44 | #PropagateResourceLimits= 45 | #PropagateResourceLimitsExcept= 46 | ReturnToService=1 47 | #SallocDefaultCommand= 48 | SlurmctldPidFile=/var/run/slurmctld.pid 49 | SlurmctldPort=6817 50 | SlurmdPidFile=/var/run/slurmd.pid 51 | SlurmdPort=6818 52 | SlurmdSpoolDir=/tmp/slurmd 53 | SlurmUser=slurm 54 | #SlurmdUser=root 55 | #SrunEpilog= 56 | #SrunProlog= 57 | StateSaveLocation={{ slurm_state_dir }} 58 | SwitchType=switch/none 59 | #TaskEpilog= 60 | TaskPlugin=task/none 61 | #TaskPluginParam= 62 | #TaskProlog= 63 | #TopologyPlugin=topology/tree 64 | #TmpFs=/tmp 65 | #TrackWCKey=no 66 | #TreeWidth= 67 | #UnkillableStepProgram= 68 | #UsePAM=0 69 | # 70 | # 71 | # TIMERS 72 | #BatchStartTimeout=10 73 | #CompleteWait=0 74 | #EpilogMsgTime=2000 75 | #GetEnvTimeout=2 76 | #HealthCheckInterval=0 77 | #HealthCheckProgram= 78 | InactiveLimit=0 79 | KillWait=30 80 | #MessageTimeout=10 81 | #ResvOverRun=0 82 | MinJobAge=300 83 | #OverTimeLimit=0 84 | SlurmctldTimeout=120 85 | SlurmdTimeout=300 86 | #UnkillableStepTimeout=60 87 | #VSizeFactor=0 88 | Waittime=0 89 | # 90 | # 91 | # SCHEDULING 92 | #DefMemPerCPU=0 93 | FastSchedule=1 94 | #MaxMemPerCPU=0 95 | #SchedulerRootFilter=1 96 | #SchedulerTimeSlice=30 97 | SchedulerType=sched/backfill 98 | SchedulerPort=7321 99 | SelectType=select/linear 100 | #SelectTypeParameters= 101 | # 102 | # 103 | # JOB PRIORITY 104 | #PriorityType=priority/basic 105 | #PriorityDecayHalfLife= 106 | #PriorityCalcPeriod= 107 | #PriorityFavorSmall= 108 | #PriorityMaxAge= 109 | #PriorityUsageResetPeriod= 110 | #PriorityWeightAge= 111 | #PriorityWeightFairshare= 112 | #PriorityWeightJobSize= 113 | #PriorityWeightPartition= 114 | #PriorityWeightQOS= 115 | # 116 | # 117 | # LOGGING AND ACCOUNTING 118 | #AccountingStorageEnforce=0 119 | #AccountingStorageHost= 120 | #AccountingStorageLoc= 121 | #AccountingStoragePass= 122 | #AccountingStoragePort= 123 | {% if slurmdbd_enabled %} 124 | AccountingStorageType=accounting_storage/slurmdbd 125 | AccountingStorageHost={{slurmdbd_host}} 126 | {% else %} 127 | AccountingStorageType=accounting_storage/filetxt 128 | AccountingStorageLoc={{ slurm_acct_file }} 129 | {% endif %} 130 | #AccountingStorageUser= 131 | AccountingStoreJobComment=YES 132 | ClusterName={{slurm_cluster_name}} 133 | #DebugFlags= 134 | #JobCompHost= 135 | #JobCompLoc= 136 | #JobCompPass= 137 | #JobCompPort= 138 | JobCompType=jobcomp/none 139 | #JobCompUser= 140 | JobAcctGatherFrequency=30 141 | JobAcctGatherType=jobacct_gather/none 142 | SlurmctldDebug=3 143 | {% if slurm_log_to_syslog %} 144 | SlurmctldLogFile= 145 | 
SlurmdLogFile= 146 | SlurmSchedLogFile= 147 | SlurmSchedLogLevel=1 148 | {% else %} 149 | SlurmctldLogFile=/var/log/slurmctld.log 150 | SlurmdLogFile=/var/log/slurmd.log 151 | SlurmSchedLogFile=/var/log/slurmsched.log 152 | SlurmSchedLogLevel=1 153 | {% endif %} 154 | SlurmdDebug=3 155 | # 156 | # 157 | # POWER SAVE SUPPORT FOR IDLE NODES (optional) 158 | #SuspendProgram= 159 | #ResumeProgram= 160 | #SuspendTimeout= 161 | #ResumeTimeout= 162 | #ResumeRate= 163 | #SuspendExcNodes= 164 | #SuspendExcParts= 165 | #SuspendRate= 166 | #SuspendTime= 167 | # 168 | # 169 | # COMPUTE NODES 170 | {% for cn in groups[slurm_compute_group] %} 171 | NodeName={{hostvars[cn]['ansible_hostname']}} CPUs={{slurm_cpus_per_node}} State=UNKNOWN 172 | {% endfor %} 173 | 174 | PartitionName={{slurm_partition_name}} Nodes={% for cn in groups[slurm_compute_group] -%}{{hostvars[cn]['ansible_hostname']}}{% if not loop.last -%} 175 | ,{% endif %}{% endfor %} Default=YES MaxTime=INFINITE State=UP 176 | 177 | -------------------------------------------------------------------------------- /roles/ganglia-gmetad/templates/gmetad.conf: -------------------------------------------------------------------------------- 1 | # This is an example of a Ganglia Meta Daemon configuration file 2 | # http://ganglia.sourceforge.net/ 3 | # 4 | # $Id: gmetad.conf.in 2014 2009-08-10 10:44:09Z d_pocock $ 5 | # 6 | #------------------------------------------------------------------------------- 7 | # Setting the debug_level to 1 will keep daemon in the forground and 8 | # show only error messages. Setting this value higher than 1 will make 9 | # gmetad output debugging information and stay in the foreground. 10 | # default: 0 11 | # debug_level 10 12 | # 13 | #------------------------------------------------------------------------------- 14 | # What to monitor. The most important section of this file. 15 | # 16 | # The data_source tag specifies either a cluster or a grid to 17 | # monitor. If we detect the source is a cluster, we will maintain a complete 18 | # set of RRD databases for it, which can be used to create historical 19 | # graphs of the metrics. If the source is a grid (it comes from another gmetad), 20 | # we will only maintain summary RRDs for it. 21 | # 22 | # Format: 23 | # data_source "my cluster" [polling interval] address1:port addreses2:port ... 24 | # 25 | # The keyword 'data_source' must immediately be followed by a unique 26 | # string which identifies the source, then an optional polling interval in 27 | # seconds. The source will be polled at this interval on average. 28 | # If the polling interval is omitted, 15sec is asssumed. 29 | # 30 | # A list of machines which service the data source follows, in the 31 | # format ip:port, or name:port. If a port is not specified then 8649 32 | # (the default gmond port) is assumed. 
33 | # default: There is no default value 34 | # 35 | # data_source "my cluster" 10 localhost my.machine.edu:8649 1.2.3.5:8655 36 | # data_source "my grid" 50 1.3.4.7:8655 grid.org:8651 grid-backup.org:8651 37 | # data_source "another source" 1.3.4.7:8655 1.3.4.8 38 | 39 | data_source "my cluster" localhost 40 | 41 | # 42 | # Round-Robin Archives 43 | # You can specify custom Round-Robin archives here (defaults are listed below) 44 | # 45 | # RRAs "RRA:AVERAGE:0.5:1:244" "RRA:AVERAGE:0.5:24:244" "RRA:AVERAGE:0.5:168:244" "RRA:AVERAGE:0.5:672:244" \ 46 | # "RRA:AVERAGE:0.5:5760:374" 47 | # 48 | 49 | # 50 | #------------------------------------------------------------------------------- 51 | # Scalability mode. If on, we summarize over downstream grids, and respect 52 | # authority tags. If off, we take on 2.5.0-era behavior: we do not wrap our output 53 | # in tags, we ignore all tags we see, and always assume 54 | # we are the "authority" on data source feeds. This approach does not scale to 55 | # large groups of clusters, but is provided for backwards compatibility. 56 | # default: on 57 | # scalable off 58 | # 59 | #------------------------------------------------------------------------------- 60 | # The name of this Grid. All the data sources above will be wrapped in a GRID 61 | # tag with this name. 62 | # default: unspecified 63 | gridname "{{ grid_name }}" 64 | # 65 | #------------------------------------------------------------------------------- 66 | # The authority URL for this grid. Used by other gmetads to locate graphs 67 | # for our data sources. Generally points to a ganglia/ 68 | # website on this machine. 69 | # default: "http://hostname/ganglia/", 70 | # where hostname is the name of this machine, as defined by gethostname(). 71 | # authority "http://mycluster.org/newprefix/" 72 | # 73 | #------------------------------------------------------------------------------- 74 | # List of machines this gmetad will share XML with. Localhost 75 | # is always trusted. 76 | # default: There is no default value 77 | # trusted_hosts 127.0.0.1 169.229.50.165 my.gmetad.org 78 | trusted_hosts 127.0.0.1 79 | # 80 | #------------------------------------------------------------------------------- 81 | # If you want any host which connects to the gmetad XML to receive 82 | # data, then set this value to "on" 83 | # default: off 84 | # all_trusted on 85 | # 86 | #------------------------------------------------------------------------------- 87 | # If you don't want gmetad to setuid then set this to off 88 | # default: on 89 | # setuid off 90 | # 91 | #------------------------------------------------------------------------------- 92 | # User gmetad will setuid to (defaults to "ganglia") 93 | # default: "ganglia" 94 | # setuid_username "ganglia" 95 | # 96 | #------------------------------------------------------------------------------- 97 | # The port gmetad will answer requests for XML 98 | # default: 8651 99 | xml_port {{ xml_port }} 100 | # 101 | #------------------------------------------------------------------------------- 102 | # The port gmetad will answer queries for XML. This facility allows 103 | # simple subtree and summation views of the XML tree. 
104 | # default: 8652 105 | interactive_port {{ interactive_port }} 106 | # 107 | #------------------------------------------------------------------------------- 108 | # The number of threads answering XML requests 109 | # default: 4 110 | # server_threads 10 111 | # 112 | #------------------------------------------------------------------------------- 113 | # Where gmetad stores its round-robin databases 114 | # default: "/var/lib/ganglia/rrds" 115 | # rrd_rootdir "/some/other/place" 116 | # 117 | #------------------------------------------------------------------------------- 118 | # In earlier versions of gmetad, hostnames were handled in a case 119 | # sensitive manner 120 | # If your hostname directories have been renamed to lower case, 121 | # set this option to 0 to disable backward compatibility. 122 | # From version 3.2, backwards compatibility will be disabled by default. 123 | # default: 1 (for gmetad < 3.2) 124 | # default: 0 (for gmetad >= 3.2) 125 | case_sensitive_hostnames 1 126 | 127 | -------------------------------------------------------------------------------- /roles/ganglia-gmond/templates/gmond.conf: -------------------------------------------------------------------------------- 1 | /* This configuration is as close to 2.5.x default behavior as possible 2 | The values closely match ./gmond/metric.h definitions in 2.5.x */ 3 | globals { 4 | daemonize = yes 5 | setuid = yes 6 | user = ganglia 7 | debug_level = 0 8 | max_udp_msg_len = 1472 9 | mute = no 10 | deaf = no 11 | allow_extra_data = yes 12 | host_dmax = 0 /*secs */ 13 | cleanup_threshold = 300 /*secs */ 14 | gexec = no 15 | send_metadata_interval = 0 /*secs */ 16 | } 17 | 18 | /* 19 | * The cluster attributes specified will be used as part of the 20 | * tag that will wrap all hosts collected by this instance. 21 | */ 22 | cluster { 23 | name = "{{ cluster_name }}" 24 | owner = "{{ cluster_owner }}" 25 | latlong = "unspecified" 26 | url = "unspecified" 27 | } 28 | 29 | /* The host section describes attributes of the host, like the location */ 30 | host { 31 | location = "unspecified" 32 | } 33 | 34 | /* Feel free to specify as many udp_send_channels as you like. Gmond 35 | used to only support having a single channel */ 36 | udp_send_channel { 37 | bind_hostname = yes # Highly recommended, soon to be default. 38 | # This option tells gmond to use a source address 39 | # that resolves to the machine's hostname. Without 40 | # this, the metrics may appear to come from any 41 | # interface and the DNS names associated with 42 | # those IPs will be used to create the RRDs. 43 | #mcast_join = 239.2.11.71 44 | host={{ target_host }} 45 | port = {{ target_port }} 46 | ttl = 1 47 | } 48 | 49 | /* You can specify as many udp_recv_channels as you like as well. */ 50 | udp_recv_channel { 51 | #mcast_join = 239.2.11.71 52 | port = {{ recv_port }} 53 | #bind = 239.2.11.71 54 | } 55 | 56 | /* You can specify as many tcp_accept_channels as you like to share 57 | an xml description of the state of the cluster */ 58 | tcp_accept_channel { 59 | port = {{ recv_port }} 60 | } 61 | 62 | /* Each metrics module that is referenced by gmond must be specified and 63 | loaded. If the module has been statically linked with gmond, it does 64 | not require a load path. However all dynamically loadable modules must 65 | include a load path. 
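   A hedged illustration, not part of the stock file: a dynamically loaded
   module that takes parameters could be declared roughly as below; the module
   name, path and parameter string are made up rather than modules shipped
   with gmond.
     module {
       name = "example_module"
       path = "modexample.so"
       params = "an_illustrative_parameter"
     }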
*/ 66 | modules { 67 | module { 68 | name = "core_metrics" 69 | } 70 | module { 71 | name = "cpu_module" 72 | path = "modcpu.so" 73 | } 74 | module { 75 | name = "disk_module" 76 | path = "moddisk.so" 77 | } 78 | module { 79 | name = "load_module" 80 | path = "modload.so" 81 | } 82 | module { 83 | name = "mem_module" 84 | path = "modmem.so" 85 | } 86 | module { 87 | name = "net_module" 88 | path = "modnet.so" 89 | } 90 | module { 91 | name = "proc_module" 92 | path = "modproc.so" 93 | } 94 | module { 95 | name = "sys_module" 96 | path = "modsys.so" 97 | } 98 | } 99 | 100 | include ('/etc/ganglia/conf.d/*.conf') 101 | 102 | /* The old internal 2.5.x metric array has been replaced by the following 103 | collection_group directives. What follows is the default behavior for 104 | collecting and sending metrics that is as close to 2.5.x behavior as 105 | possible. */ 106 | 107 | /* This collection group will cause a heartbeat (or beacon) to be sent every 108 | 20 seconds. In the heartbeat is the GMOND_STARTED data which expresses 109 | the age of the running gmond. */ 110 | collection_group { 111 | collect_once = yes 112 | time_threshold = 20 113 | metric { 114 | name = "heartbeat" 115 | } 116 | } 117 | 118 | /* This collection group will send general info about this host every 119 | 1200 secs. 120 | This information doesn't change between reboots and is only collected 121 | once. */ 122 | collection_group { 123 | collect_once = yes 124 | time_threshold = 1200 125 | metric { 126 | name = "cpu_num" 127 | title = "CPU Count" 128 | } 129 | metric { 130 | name = "cpu_speed" 131 | title = "CPU Speed" 132 | } 133 | metric { 134 | name = "mem_total" 135 | title = "Memory Total" 136 | } 137 | /* Should this be here? Swap can be added/removed between reboots. */ 138 | metric { 139 | name = "swap_total" 140 | title = "Swap Space Total" 141 | } 142 | metric { 143 | name = "boottime" 144 | title = "Last Boot Time" 145 | } 146 | metric { 147 | name = "machine_type" 148 | title = "Machine Type" 149 | } 150 | metric { 151 | name = "os_name" 152 | title = "Operating System" 153 | } 154 | metric { 155 | name = "os_release" 156 | title = "Operating System Release" 157 | } 158 | metric { 159 | name = "location" 160 | title = "Location" 161 | } 162 | } 163 | 164 | /* This collection group will send the status of gexecd for this host 165 | every 300 secs.*/ 166 | /* Unlike 2.5.x the default behavior is to report gexecd OFF. */ 167 | collection_group { 168 | collect_once = yes 169 | time_threshold = 300 170 | metric { 171 | name = "gexec" 172 | title = "Gexec Status" 173 | } 174 | } 175 | 176 | /* This collection group will collect the CPU status info every 20 secs. 177 | The time threshold is set to 90 seconds. In honesty, this 178 | time_threshold could be set significantly higher to reduce 179 | unneccessary network chatter. 
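   A hedged illustration of that tuning, not part of the stock file: the group
   below still samples every 20 seconds, reports whenever a reading moves by
   more than its value_threshold, and otherwise waits up to 300 seconds
   between reports. The numbers are illustrative, not recommendations.
     collection_group {
       collect_every = 20
       time_threshold = 300
       metric {
         name = "cpu_user"
         value_threshold = "5.0"
         title = "CPU User"
       }
     }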
*/ 180 | collection_group { 181 | collect_every = 20 182 | time_threshold = 90 183 | /* CPU status */ 184 | metric { 185 | name = "cpu_user" 186 | value_threshold = "1.0" 187 | title = "CPU User" 188 | } 189 | metric { 190 | name = "cpu_system" 191 | value_threshold = "1.0" 192 | title = "CPU System" 193 | } 194 | metric { 195 | name = "cpu_idle" 196 | value_threshold = "5.0" 197 | title = "CPU Idle" 198 | } 199 | metric { 200 | name = "cpu_nice" 201 | value_threshold = "1.0" 202 | title = "CPU Nice" 203 | } 204 | metric { 205 | name = "cpu_aidle" 206 | value_threshold = "5.0" 207 | title = "CPU aidle" 208 | } 209 | metric { 210 | name = "cpu_wio" 211 | value_threshold = "1.0" 212 | title = "CPU wio" 213 | } 214 | /* The next two metrics are optional if you want more detail... 215 | ... since they are accounted for in cpu_system. 216 | metric { 217 | name = "cpu_intr" 218 | value_threshold = "1.0" 219 | title = "CPU intr" 220 | } 221 | metric { 222 | name = "cpu_sintr" 223 | value_threshold = "1.0" 224 | title = "CPU sintr" 225 | } 226 | */ 227 | } 228 | 229 | collection_group { 230 | collect_every = 20 231 | time_threshold = 90 232 | /* Load Averages */ 233 | metric { 234 | name = "load_one" 235 | value_threshold = "1.0" 236 | title = "One Minute Load Average" 237 | } 238 | metric { 239 | name = "load_five" 240 | value_threshold = "1.0" 241 | title = "Five Minute Load Average" 242 | } 243 | metric { 244 | name = "load_fifteen" 245 | value_threshold = "1.0" 246 | title = "Fifteen Minute Load Average" 247 | } 248 | } 249 | 250 | /* This group collects the number of running and total processes */ 251 | collection_group { 252 | collect_every = 80 253 | time_threshold = 950 254 | metric { 255 | name = "proc_run" 256 | value_threshold = "1.0" 257 | title = "Total Running Processes" 258 | } 259 | metric { 260 | name = "proc_total" 261 | value_threshold = "1.0" 262 | title = "Total Processes" 263 | } 264 | } 265 | 266 | /* This collection group grabs the volatile memory metrics every 40 secs and 267 | sends them at least every 180 secs. This time_threshold can be increased 268 | significantly to reduce unneeded network traffic. 
*/ 269 | collection_group { 270 | collect_every = 40 271 | time_threshold = 180 272 | metric { 273 | name = "mem_free" 274 | value_threshold = "1024.0" 275 | title = "Free Memory" 276 | } 277 | metric { 278 | name = "mem_shared" 279 | value_threshold = "1024.0" 280 | title = "Shared Memory" 281 | } 282 | metric { 283 | name = "mem_buffers" 284 | value_threshold = "1024.0" 285 | title = "Memory Buffers" 286 | } 287 | metric { 288 | name = "mem_cached" 289 | value_threshold = "1024.0" 290 | title = "Cached Memory" 291 | } 292 | metric { 293 | name = "swap_free" 294 | value_threshold = "1024.0" 295 | title = "Free Swap Space" 296 | } 297 | } 298 | 299 | collection_group { 300 | collect_every = 40 301 | time_threshold = 300 302 | metric { 303 | name = "bytes_out" 304 | value_threshold = 4096 305 | title = "Bytes Sent" 306 | } 307 | metric { 308 | name = "bytes_in" 309 | value_threshold = 4096 310 | title = "Bytes Received" 311 | } 312 | metric { 313 | name = "pkts_in" 314 | value_threshold = 256 315 | title = "Packets Received" 316 | } 317 | metric { 318 | name = "pkts_out" 319 | value_threshold = 256 320 | title = "Packets Sent" 321 | } 322 | } 323 | 324 | /* Different than 2.5.x default since the old config made no sense */ 325 | collection_group { 326 | collect_every = 1800 327 | time_threshold = 3600 328 | metric { 329 | name = "disk_total" 330 | value_threshold = 1.0 331 | title = "Total Disk Space" 332 | } 333 | } 334 | 335 | collection_group { 336 | collect_every = 40 337 | time_threshold = 180 338 | metric { 339 | name = "disk_free" 340 | value_threshold = 1.0 341 | title = "Disk Space Available" 342 | } 343 | metric { 344 | name = "part_max_used" 345 | value_threshold = 1.0 346 | title = "Maximum Disk Space Used" 347 | } 348 | } 349 | 350 | -------------------------------------------------------------------------------- /roles/elk-log-server/templates/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | ##################### Elasticsearch Configuration Example ##################### 2 | 3 | # This file contains an overview of various configuration settings, 4 | # targeted at operations staff. Application developers should 5 | # consult the guide at . 6 | # 7 | # The installation procedure is covered at 8 | # . 9 | # 10 | # Elasticsearch comes with reasonable defaults for most settings, 11 | # so you can try it out without bothering with configuration. 12 | # 13 | # Most of the time, these defaults are just fine for running a production 14 | # cluster. If you're fine-tuning your cluster, or wondering about the 15 | # effect of certain configuration option, please _do ask_ on the 16 | # mailing list or IRC channel [http://elasticsearch.org/community]. 17 | 18 | # Any element in the configuration can be replaced with environment variables 19 | # by placing them in ${...} notation. 
For example: 20 | # 21 | #node.rack: ${RACK_ENV_VAR} 22 | 23 | # For information on supported formats and syntax for the config file, see 24 | # 25 | 26 | 27 | ################################### Added by user ############################# 28 | # Based on tutorial in 29 | # https://www.digitalocean.com/community/tutorials/how-to-use-logstash-and-kibana-to-centralize-logs-on-centos-6 30 | # 31 | 32 | script.disable_dynamic: true 33 | network.host: localhost 34 | discovery.zen.ping.multicast.enabled: false 35 | 36 | 37 | ################################### Cluster ################################### 38 | 39 | # Cluster name identifies your cluster for auto-discovery. If you're running 40 | # multiple clusters on the same network, make sure you're using unique names. 41 | # 42 | #cluster.name: elasticsearch 43 | 44 | 45 | #################################### Node ##################################### 46 | 47 | # Node names are generated dynamically on startup, so you're relieved 48 | # from configuring them manually. You can tie this node to a specific name: 49 | # 50 | #node.name: "Franz Kafka" 51 | 52 | # Every node can be configured to allow or deny being eligible as the master, 53 | # and to allow or deny to store the data. 54 | # 55 | # Allow this node to be eligible as a master node (enabled by default): 56 | # 57 | #node.master: true 58 | # 59 | # Allow this node to store data (enabled by default): 60 | # 61 | #node.data: true 62 | 63 | # You can exploit these settings to design advanced cluster topologies. 64 | # 65 | # 1. You want this node to never become a master node, only to hold data. 66 | # This will be the "workhorse" of your cluster. 67 | # 68 | #node.master: false 69 | #node.data: true 70 | # 71 | # 2. You want this node to only serve as a master: to not store any data and 72 | # to have free resources. This will be the "coordinator" of your cluster. 73 | # 74 | #node.master: true 75 | #node.data: false 76 | # 77 | # 3. You want this node to be neither master nor data node, but 78 | # to act as a "search load balancer" (fetching data from nodes, 79 | # aggregating results, etc.) 80 | # 81 | #node.master: false 82 | #node.data: false 83 | 84 | # Use the Cluster Health API [http://localhost:9200/_cluster/health], the 85 | # Node Info API [http://localhost:9200/_nodes] or GUI tools 86 | # such as , 87 | # , 88 | # and 89 | # to inspect the cluster state. 90 | 91 | # A node can have generic attributes associated with it, which can later be used 92 | # for customized shard allocation filtering, or allocation awareness. An attribute 93 | # is a simple key value pair, similar to node.key: value, here is an example: 94 | # 95 | #node.rack: rack314 96 | 97 | # By default, multiple nodes are allowed to start from the same installation location 98 | # to disable it, set the following: 99 | #node.max_local_storage_nodes: 1 100 | 101 | 102 | #################################### Index #################################### 103 | 104 | # You can set a number of options (such as shard/replica options, mapping 105 | # or analyzer definitions, translog settings, ...) for indices globally, 106 | # in this file. 107 | # 108 | # Note, that it makes more sense to configure index settings specifically for 109 | # a certain index, either when creating it or by using the index templates API. 110 | # 111 | # See and 112 | # 113 | # for more information. 
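# A hedged sketch of that per-index alternative on the 1.x REST API, not part
# of the stock file; the index name is made up:
#
#   curl -XPUT 'http://localhost:9200/example_index/' -d '
#   { "settings": { "number_of_shards": 1, "number_of_replicas": 0 } }'
#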
114 | 115 | # Set the number of shards (splits) of an index (5 by default): 116 | # 117 | #index.number_of_shards: 5 118 | 119 | # Set the number of replicas (additional copies) of an index (1 by default): 120 | # 121 | #index.number_of_replicas: 1 122 | 123 | # Note, that for development on a local machine, with small indices, it usually 124 | # makes sense to "disable" the distributed features: 125 | # 126 | #index.number_of_shards: 1 127 | #index.number_of_replicas: 0 128 | 129 | # These settings directly affect the performance of index and search operations 130 | # in your cluster. Assuming you have enough machines to hold shards and 131 | # replicas, the rule of thumb is: 132 | # 133 | # 1. Having more *shards* enhances the _indexing_ performance and allows to 134 | # _distribute_ a big index across machines. 135 | # 2. Having more *replicas* enhances the _search_ performance and improves the 136 | # cluster _availability_. 137 | # 138 | # The "number_of_shards" is a one-time setting for an index. 139 | # 140 | # The "number_of_replicas" can be increased or decreased anytime, 141 | # by using the Index Update Settings API. 142 | # 143 | # Elasticsearch takes care about load balancing, relocating, gathering the 144 | # results from nodes, etc. Experiment with different settings to fine-tune 145 | # your setup. 146 | 147 | # Use the Index Status API () to inspect 148 | # the index status. 149 | 150 | 151 | #################################### Paths #################################### 152 | 153 | # Path to directory containing configuration (this file and logging.yml): 154 | # 155 | #path.conf: /path/to/conf 156 | 157 | # Path to directory where to store index data allocated for this node. 158 | # 159 | #path.data: /path/to/data 160 | # 161 | # Can optionally include more than one location, causing data to be striped across 162 | # the locations (a la RAID 0) on a file level, favouring locations with most free 163 | # space on creation. For example: 164 | # 165 | #path.data: /path/to/data1,/path/to/data2 166 | 167 | # Path to temporary files: 168 | # 169 | #path.work: /path/to/work 170 | 171 | # Path to log files: 172 | # 173 | #path.logs: /path/to/logs 174 | 175 | # Path to where plugins are installed: 176 | # 177 | #path.plugins: /path/to/plugins 178 | 179 | 180 | #################################### Plugin ################################### 181 | 182 | # If a plugin listed here is not installed for current node, the node will not start. 183 | # 184 | #plugin.mandatory: mapper-attachments,lang-groovy 185 | 186 | 187 | ################################### Memory #################################### 188 | 189 | # Elasticsearch performs poorly when JVM starts swapping: you should ensure that 190 | # it _never_ swaps. 191 | # 192 | # Set this property to true to lock the memory: 193 | # 194 | #bootstrap.mlockall: true 195 | 196 | # Make sure that the ES_MIN_MEM and ES_MAX_MEM environment variables are set 197 | # to the same value, and that the machine has enough memory to allocate 198 | # for Elasticsearch, leaving enough memory for the operating system itself. 199 | # 200 | # You should also make sure that the Elasticsearch process is allowed to lock 201 | # the memory, eg. by using `ulimit -l unlimited`. 
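# A hedged sketch for an RPM-based install, not part of the stock file:
# enabling bootstrap.mlockall above usually also requires letting the service
# account lock memory, e.g. in /etc/security/limits.conf:
#
#   elasticsearch soft memlock unlimited
#   elasticsearch hard memlock unlimited
#
# or by setting MAX_LOCKED_MEMORY=unlimited in /etc/sysconfig/elasticsearch.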
202 | 203 | 204 | ############################## Network And HTTP ############################### 205 | 206 | # Elasticsearch, by default, binds itself to the 0.0.0.0 address, and listens 207 | # on port [9200-9300] for HTTP traffic and on port [9300-9400] for node-to-node 208 | # communication. (the range means that if the port is busy, it will automatically 209 | # try the next port). 210 | 211 | # Set the bind address specifically (IPv4 or IPv6): 212 | # 213 | #network.bind_host: 192.168.0.1 214 | 215 | # Set the address other nodes will use to communicate with this node. If not 216 | # set, it is automatically derived. It must point to an actual IP address. 217 | # 218 | #network.publish_host: 192.168.0.1 219 | 220 | # Set both 'bind_host' and 'publish_host': 221 | # 222 | #network.host: 192.168.0.1 223 | 224 | # Set a custom port for the node to node communication (9300 by default): 225 | # 226 | #transport.tcp.port: 9300 227 | 228 | # Enable compression for all communication between nodes (disabled by default): 229 | # 230 | #transport.tcp.compress: true 231 | 232 | # Set a custom port to listen for HTTP traffic: 233 | # 234 | #http.port: 9200 235 | 236 | # Set a custom allowed content length: 237 | # 238 | #http.max_content_length: 100mb 239 | 240 | # Disable HTTP completely: 241 | # 242 | #http.enabled: false 243 | 244 | 245 | ################################### Gateway ################################### 246 | 247 | # The gateway allows for persisting the cluster state between full cluster 248 | # restarts. Every change to the state (such as adding an index) will be stored 249 | # in the gateway, and when the cluster starts up for the first time, 250 | # it will read its state from the gateway. 251 | 252 | # There are several types of gateway implementations. For more information, see 253 | # . 254 | 255 | # The default gateway type is the "local" gateway (recommended): 256 | # 257 | #gateway.type: local 258 | 259 | # Settings below control how and when to start the initial recovery process on 260 | # a full cluster restart (to reuse as much local data as possible when using shared 261 | # gateway). 262 | 263 | # Allow recovery process after N nodes in a cluster are up: 264 | # 265 | #gateway.recover_after_nodes: 1 266 | 267 | # Set the timeout to initiate the recovery process, once the N nodes 268 | # from previous setting are up (accepts time value): 269 | # 270 | #gateway.recover_after_time: 5m 271 | 272 | # Set how many nodes are expected in this cluster. Once these N nodes 273 | # are up (and recover_after_nodes is met), begin recovery process immediately 274 | # (without waiting for recover_after_time to expire): 275 | # 276 | #gateway.expected_nodes: 2 277 | 278 | 279 | ############################# Recovery Throttling ############################# 280 | 281 | # These settings allow to control the process of shards allocation between 282 | # nodes during initial recovery, replica allocation, rebalancing, 283 | # or when adding and removing nodes. 284 | 285 | # Set the number of concurrent recoveries happening on a node: 286 | # 287 | # 1. During the initial recovery 288 | # 289 | #cluster.routing.allocation.node_initial_primaries_recoveries: 4 290 | # 291 | # 2. During adding/removing nodes, rebalancing, etc 292 | # 293 | #cluster.routing.allocation.node_concurrent_recoveries: 2 294 | 295 | # Set to throttle throughput when recovering (eg. 
100mb, by default 20mb): 296 | # 297 | #indices.recovery.max_bytes_per_sec: 20mb 298 | 299 | # Set to limit the number of open concurrent streams when 300 | # recovering a shard from a peer: 301 | # 302 | #indices.recovery.concurrent_streams: 5 303 | 304 | 305 | ################################## Discovery ################################## 306 | 307 | # Discovery infrastructure ensures nodes can be found within a cluster 308 | # and master node is elected. Multicast discovery is the default. 309 | 310 | # Set to ensure a node sees N other master eligible nodes to be considered 311 | # operational within the cluster. This should be set to a quorum/majority of 312 | # the master-eligible nodes in the cluster. 313 | # 314 | #discovery.zen.minimum_master_nodes: 1 315 | 316 | # Set the time to wait for ping responses from other nodes when discovering. 317 | # Set this option to a higher value on a slow or congested network 318 | # to minimize discovery failures: 319 | # 320 | #discovery.zen.ping.timeout: 3s 321 | 322 | # For more information, see 323 | # 324 | 325 | # Unicast discovery allows to explicitly control which nodes will be used 326 | # to discover the cluster. It can be used when multicast is not present, 327 | # or to restrict the cluster communication-wise. 328 | # 329 | # 1. Disable multicast discovery (enabled by default): 330 | # 331 | #discovery.zen.ping.multicast.enabled: false 332 | # 333 | # 2. Configure an initial list of master nodes in the cluster 334 | # to perform discovery when new nodes (master or data) are started: 335 | # 336 | #discovery.zen.ping.unicast.hosts: ["host1", "host2:port"] 337 | 338 | # EC2 discovery allows to use AWS EC2 API in order to perform discovery. 339 | # 340 | # You have to install the cloud-aws plugin for enabling the EC2 discovery. 341 | # 342 | # For more information, see 343 | # 344 | # 345 | # See 346 | # for a step-by-step tutorial. 347 | 348 | # GCE discovery allows to use Google Compute Engine API in order to perform discovery. 349 | # 350 | # You have to install the cloud-gce plugin for enabling the GCE discovery. 351 | # 352 | # For more information, see . 353 | 354 | # Azure discovery allows to use Azure API in order to perform discovery. 355 | # 356 | # You have to install the cloud-azure plugin for enabling the Azure discovery. 357 | # 358 | # For more information, see . 359 | 360 | ################################## Slow Log ################################## 361 | 362 | # Shard level query and fetch threshold logging. 
363 | 364 | #index.search.slowlog.threshold.query.warn: 10s 365 | #index.search.slowlog.threshold.query.info: 5s 366 | #index.search.slowlog.threshold.query.debug: 2s 367 | #index.search.slowlog.threshold.query.trace: 500ms 368 | 369 | #index.search.slowlog.threshold.fetch.warn: 1s 370 | #index.search.slowlog.threshold.fetch.info: 800ms 371 | #index.search.slowlog.threshold.fetch.debug: 500ms 372 | #index.search.slowlog.threshold.fetch.trace: 200ms 373 | 374 | #index.indexing.slowlog.threshold.index.warn: 10s 375 | #index.indexing.slowlog.threshold.index.info: 5s 376 | #index.indexing.slowlog.threshold.index.debug: 2s 377 | #index.indexing.slowlog.threshold.index.trace: 500ms 378 | 379 | ################################## GC Logging ################################ 380 | 381 | #monitor.jvm.gc.young.warn: 1000ms 382 | #monitor.jvm.gc.young.info: 700ms 383 | #monitor.jvm.gc.young.debug: 400ms 384 | 385 | #monitor.jvm.gc.old.warn: 10s 386 | #monitor.jvm.gc.old.info: 5s 387 | #monitor.jvm.gc.old.debug: 2s 388 | 389 | ################################## Security ################################ 390 | 391 | # Uncomment if you want to enable JSONP as a valid return transport on the 392 | # http server. With this enabled, it may pose a security risk, so disabling 393 | # it unless you need it is recommended (it is disabled by default). 394 | # 395 | #http.jsonp.enable: true 396 | --------------------------------------------------------------------------------
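A hedged usage note, not part of the repository: once the elk-log-server role has rendered the file above, the Cluster Health API it mentions gives a quick sanity check when run on the log server itself, since network.host is bound to localhost:

    curl 'http://localhost:9200/_cluster/health?pretty'

The exact port and any overrides depend on the play, so treat this as a sketch rather than as part of the configuration.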