├── roles ├── rsyslog-collector │ ├── defaults │ │ └── main.yml │ ├── handlers │ │ └── main.yml │ ├── templates │ │ └── collect.conf │ └── tasks │ │ └── main.yml ├── etchosts │ ├── defaults │ │ └── main.yml │ ├── tasks │ │ └── main.yml │ ├── templates │ │ └── hosts.j2 │ └── README ├── slurm │ ├── templates │ │ ├── cgroup.conf │ │ └── slurm.conf │ ├── handlers │ │ └── main.yml │ ├── files │ │ ├── munge.key │ │ └── slurm_import.conf │ ├── defaults │ │ └── main.yml │ ├── README.md │ └── tasks │ │ └── main.yml ├── elk-log-server │ ├── defaults │ │ └── main.yml │ ├── README │ ├── templates │ │ ├── 01-elasticsearch-storage.conf │ │ ├── 10-syslog-input.conf │ │ ├── kibana.yml │ │ ├── supervisord.conf │ │ └── elasticsearch.yml │ ├── files │ │ └── elasticsearch-1.5.repo │ ├── handlers │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── nfs-server │ ├── handlers │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ ├── templates │ │ └── exports │ └── tasks │ │ └── main.yml ├── rsyslog-forwarder │ ├── templates │ │ └── forward.conf │ ├── handlers │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── ajdecon-repo │ ├── defaults │ │ └── main.yml │ ├── files │ │ └── yum-s3-0.2.4-1.noarch.rpm │ ├── tasks │ │ └── main.yml │ └── templates │ │ └── ajdecon-repo.el6.repo.j2 ├── ganglia-gmond │ ├── handlers │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ ├── tasks │ │ └── main.yml │ └── templates │ │ └── gmond.conf ├── ganglia-gmetad │ ├── handlers │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ ├── tasks │ │ └── main.yml │ └── templates │ │ └── gmetad.conf ├── slurmdbd │ ├── handlers │ │ └── main.yml │ ├── defaults │ │ └── main.yml │ ├── README.md │ ├── templates │ │ └── slurmdbd.conf │ └── tasks │ │ └── main.yml ├── epel6 │ ├── tasks │ │ └── main.yml │ └── files │ │ └── epel.repo ├── nfs-client │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml ├── common-cluster-pkgs │ ├── files │ │ └── osu-micro-benchmarks-4.4.1.tar.gz │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml └── sethostname │ └── tasks │ └── main.yml ├── config.yml ├── s3.yml ├── hosts.real ├── hosts.test ├── LICENSE ├── cluster.yml └── README.md /roles/rsyslog-collector/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | syslog_port: 514 3 | -------------------------------------------------------------------------------- /roles/etchosts/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | hosts_groupname: cluster 3 | 4 | -------------------------------------------------------------------------------- /roles/slurm/templates/cgroup.conf: -------------------------------------------------------------------------------- 1 | CgroupAutomount=yes 2 | ConstrainCores=yes 3 | -------------------------------------------------------------------------------- /roles/elk-log-server/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | logstash_syslog_listen_port: 5200 3 | -------------------------------------------------------------------------------- /roles/nfs-server/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: exportfs 3 | command: exportfs -a 4 | -------------------------------------------------------------------------------- /roles/rsyslog-forwarder/templates/forward.conf: -------------------------------------------------------------------------------- 1 | *.* 
@{{syslog_target}}:{{syslog_target_port}} 2 | -------------------------------------------------------------------------------- /roles/ajdecon-repo/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | s3_key_id: 'NO DEFAULT VALUE' 3 | s3_secret_key: 'NO DEFAULT VALUE' 4 | -------------------------------------------------------------------------------- /roles/ganglia-gmond/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: gmond is restarted 3 | service: name=gmond state=restarted 4 | -------------------------------------------------------------------------------- /roles/ganglia-gmetad/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: gmetad is restarted 3 | service: name=gmetad state=restarted 4 | -------------------------------------------------------------------------------- /roles/rsyslog-collector/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: rsyslog is restarted 3 | service: name=rsyslog state=restarted 4 | -------------------------------------------------------------------------------- /roles/rsyslog-forwarder/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: rsyslog is restarted 3 | service: name=rsyslog state=restarted 4 | -------------------------------------------------------------------------------- /roles/slurmdbd/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: slurmdbd is restarted 3 | action: service name=slurmdbd state=restarted 4 | -------------------------------------------------------------------------------- /roles/rsyslog-forwarder/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | syslog_target: head 3 | syslog_target_port: 514 4 | syslog_fwd_file: "forward.conf" 5 | -------------------------------------------------------------------------------- /roles/elk-log-server/README: -------------------------------------------------------------------------------- 1 | An extremely simple (read: stupid) role that sets up an ELK-stack log server 2 | with a single syslog input. 3 | -------------------------------------------------------------------------------- /roles/elk-log-server/templates/01-elasticsearch-storage.conf: -------------------------------------------------------------------------------- 1 | output { 2 | elasticsearch { 3 | host => localhost 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /roles/epel6/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: yum.repos.d includes epel.repo 3 | action: copy src=epel.repo dest=/etc/yum.repos.d/epel.repo 4 | -------------------------------------------------------------------------------- /roles/slurmdbd/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | slurmdbd_password: "$LurM42" 3 | mysql_host: localhost 4 | mysql_admin: root 5 | mysql_pw: "@ns1bl3!"
6 | -------------------------------------------------------------------------------- /roles/elk-log-server/templates/10-syslog-input.conf: -------------------------------------------------------------------------------- 1 | input { 2 | syslog { 3 | port => {{ logstash_syslog_listen_port }} 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /roles/ganglia-gmetad/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | grid_name: "SlurmCluster" 3 | xml_port: 8651 4 | interactive_port: 8652 5 | enable_web_interface: true 6 | -------------------------------------------------------------------------------- /roles/ganglia-gmond/defaults/main.yml: -------------------------------------------------------------------------------- 1 | cluster_name: defaultcluster 2 | cluster_owner: unspecified 3 | target_host: head 4 | target_port: 8649 5 | recv_port: 8649 6 | -------------------------------------------------------------------------------- /roles/nfs-client/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nfs_mounts: 3 | - path: /home 4 | mountpoint: /home 5 | server: head 6 | options: "defaults,vers=3" 7 | -------------------------------------------------------------------------------- /roles/nfs-server/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | nfs_exports: 3 | - path: /home 4 | options: "rw,no_root_squash" 5 | 6 | nfs_allowed_groups: 7 | - cluster 8 | -------------------------------------------------------------------------------- /roles/ajdecon-repo/files/yum-s3-0.2.4-1.noarch.rpm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajdecon/ansible-simple-slurm-cluster/HEAD/roles/ajdecon-repo/files/yum-s3-0.2.4-1.noarch.rpm -------------------------------------------------------------------------------- /roles/etchosts/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: regather data 3 | action: setup 4 | 5 | - name: auto-generate /etc/hosts 6 | action: template src=hosts.j2 dest=/etc/hosts 7 | 8 | 9 | -------------------------------------------------------------------------------- /roles/common-cluster-pkgs/files/osu-micro-benchmarks-4.4.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajdecon/ansible-simple-slurm-cluster/HEAD/roles/common-cluster-pkgs/files/osu-micro-benchmarks-4.4.1.tar.gz -------------------------------------------------------------------------------- /roles/slurmdbd/README.md: -------------------------------------------------------------------------------- 1 | slurmdbd 2 | ======== 3 | 4 | Set up a MySQL db and slurmdbd for accounting storage. 5 | 6 | Requirements: 7 | - Inject the variable `slurmdbd_password`; the value in defaults/main.yml is only a placeholder and should be overridden for any real deployment.
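For example, the password could be supplied from a separate vars file when the role is applied (a sketch only; `secrets.yml` is an illustrative file name, not part of this repo):

    - hosts: head
      user: root
      vars_files:
        - secrets.yml   # illustrative: defines slurmdbd_password (and mysql_pw)
      roles:
        - "slurmdbd"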
8 | -------------------------------------------------------------------------------- /roles/common-cluster-pkgs/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | openblas_enabled: true 3 | openmpi_enabled: false 4 | mpich2_enabled: true 5 | python_enabled: false 6 | openib_enabled: true 7 | upload_osu_benchmarks: false 8 | -------------------------------------------------------------------------------- /roles/etchosts/templates/hosts.j2: -------------------------------------------------------------------------------- 1 | 127.0.0.1 localhost localhost.localdomain 2 | {% for host in groups[hosts_groupname] %} 3 | {{ hostvars[host]['ansible_eth0']['ipv4']['address'] }} {{ hostvars[host]['ansible_hostname'] }} 4 | {% endfor %} 5 | -------------------------------------------------------------------------------- /roles/slurm/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: munge is restarted 3 | service: name=munge state=restarted 4 | 5 | - name: slurm is restarted 6 | service: name=slurm state=restarted 7 | 8 | - name: rsyslog is restarted 9 | service: name=rsyslog state=restarted 10 | -------------------------------------------------------------------------------- /roles/slurm/files/munge.key: -------------------------------------------------------------------------------- 1 | You should replace this file with random data, i.e. by using 2 | 3 | dd if=/dev/random of=munge.key bs=8 count=128 4 | 5 | Munge should work if you just use a file with text in it (even this file!), 6 | but the security is less likely to be good. 7 | -------------------------------------------------------------------------------- /roles/elk-log-server/files/elasticsearch-1.5.repo: -------------------------------------------------------------------------------- 1 | [elasticsearch-1.5] 2 | name=Elasticsearch repository for 1.5.x packages 3 | baseurl=http://packages.elasticsearch.org/elasticsearch/1.5/centos 4 | gpgcheck=1 5 | gpgkey=http://packages.elasticsearch.org/GPG-KEY-elasticsearch 6 | enabled=1 7 | -------------------------------------------------------------------------------- /roles/etchosts/README: -------------------------------------------------------------------------------- 1 | etchosts 2 | ======== 3 | 4 | A simple role for setting up an /etc/hosts file based on an Ansible group. 5 | 6 | Variables: 7 | ---------- 8 | 9 | * hosts_groupname: Ansible group to iterate through when generating file. 10 | Default value is "cluster". 
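For illustration, a play that builds /etc/hosts from a different group might look something like this (the group name "gpunodes" is made up):

    - hosts: gpunodes
      user: root
      roles:
        - role: "etchosts"
          hosts_groupname: gpunodes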
11 | -------------------------------------------------------------------------------- /roles/elk-log-server/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart elasticsearch 3 | service: name=elasticsearch state=restarted 4 | 5 | - name: restart supervisord 6 | service: name=supervisord state=restarted 7 | 8 | - name: restart logstash 9 | service: name=logstash state=restarted 10 | -------------------------------------------------------------------------------- /roles/nfs-server/templates/exports: -------------------------------------------------------------------------------- 1 | {% for export in nfs_exports -%} 2 | {% for g in nfs_allowed_groups -%} 3 | {% for host in groups[g] -%} 4 | {{export.path}} {{hostvars[host]['ansible_eth0']['ipv4']['address']}}/255.255.255.255({{export.options}}) 5 | {% endfor %} 6 | {% endfor %} 7 | {% endfor %} 8 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # Several of the roles (including the slurm role, nfs-client, and 4 | # rsyslog-forwarder roles) assume your head node will be named 5 | # "head". See the roles' defaults/main.yml files to see where this 6 | # might need to be changed. 7 | 8 | slurmctld_host: head 9 | slurm_cpus_per_node: 2 10 | upload_osu_benchmarks: true 11 | -------------------------------------------------------------------------------- /roles/rsyslog-collector/templates/collect.conf: -------------------------------------------------------------------------------- 1 | # Provides UDP syslog reception 2 | $ModLoad imudp 3 | $UDPServerRun 514 4 | 5 | # Provides TCP syslog reception 6 | $ModLoad imtcp 7 | $InputTCPServerRun 514 8 | 9 | # Collect cluster logs per-host 10 | $template ClusterFile,"/var/log/cluster/%HOSTNAME%" 11 | :source, !isequal, "localhost" ?ClusterFile 12 | -------------------------------------------------------------------------------- /roles/slurm/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | slurmctld_host: head 3 | 4 | slurmdbd_enabled: false 5 | slurmdbd_host: head 6 | 7 | slurm_compute_group: compute 8 | slurm_cluster_name: defaultcluster 9 | slurm_partition_name: debug 10 | slurm_cpus_per_node: 1 11 | 12 | slurm_state_dir: /var/lib/slurm 13 | slurm_acct_file: /var/log/slurm_jobacct.log 14 | 15 | slurm_log_to_syslog: false 16 | -------------------------------------------------------------------------------- /roles/rsyslog-forwarder/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: ensure rsyslog is installed 3 | yum: name=rsyslog state=present 4 | 5 | - name: add forward.conf to syslog config 6 | template: src="forward.conf" dest="/etc/rsyslog.d/{{ syslog_fwd_file }}" 7 | notify: 8 | - rsyslog is restarted 9 | 10 | - name: ensure rsyslog is running and enabled 11 | service: name=rsyslog state=started enabled=yes 12 | -------------------------------------------------------------------------------- /roles/ajdecon-repo/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: upload yum-s3 rpm 3 | action: copy src=yum-s3-0.2.4-1.noarch.rpm dest=/tmp/yum-s3-0.2.4-1.noarch.rpm 4 | 5 | - name: install yum-s3 6 | action: yum name=/tmp/yum-s3-0.2.4-1.noarch.rpm state=present 7 | 8 | - name: yum.repos.d includes 
ajdecon-repo.el6.repo 9 | action: template src=ajdecon-repo.el6.repo.j2 dest=/etc/yum.repos.d/ajdecon-repo.el6.repo 10 | -------------------------------------------------------------------------------- /roles/ganglia-gmond/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install gmond packages 3 | yum: name={{ item }} state=present 4 | with_items: 5 | - ganglia-gmond 6 | - ganglia-gmond-python 7 | 8 | - name: gmond is configured 9 | template: src="gmond.conf" dest="/etc/ganglia/gmond.conf" 10 | notify: 11 | - gmond is restarted 12 | 13 | - name: gmond is enabled and running 14 | service: name=gmond state=started enabled=yes 15 | 16 | -------------------------------------------------------------------------------- /s3.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # These keys are used for authorizing access to my RPM repo on S3. 4 | # You should build your own RPM repo for installing any software not 5 | # in a public repo. 6 | # 7 | # See http://www.carrollops.com/blog/2012/09/11/s3-yum-repos-with-iam-authorization/ 8 | # and https://github.com/jbraeuer/yum-s3-plugin 9 | # to see how I set this up for myself. 10 | 11 | s3_key_id: 'NOTHING HERE' 12 | s3_secret_key: 'NOT REAL' 13 | -------------------------------------------------------------------------------- /roles/slurmdbd/templates/slurmdbd.conf: -------------------------------------------------------------------------------- 1 | ArchiveEvents=yes 2 | ArchiveJobs=yes 3 | AuthType=auth/munge 4 | DbdHost=localhost 5 | DebugLevel=4 6 | PurgeEventAfter=1month 7 | PurgeJobAfter=24month 8 | PurgeStepAfter=1month 9 | PurgeSuspendAfter=1month 10 | LogFile=/var/log/slurmdbd.log 11 | PidFile=/var/tmp/slurmdbd.pid 12 | SlurmUser=slurm 13 | StorageHost={{mysql_host}} 14 | StoragePass={{slurmdbd_password}} 15 | StorageType=accounting_storage/mysql 16 | StorageUser=slurm 17 | -------------------------------------------------------------------------------- /roles/ajdecon-repo/templates/ajdecon-repo.el6.repo.j2: -------------------------------------------------------------------------------- 1 | [ajdecon-repo.el6.x86_64] 2 | name = ajdecon-repo.el6.x86_64 3 | baseurl = https://ajdecon-repo.s3.amazonaws.com/el6/x86_64/ 4 | gpgcheck=0 5 | enabled=1 6 | s3_enabled=1 7 | key_id={{s3_key_id}} 8 | secret_key={{s3_secret_key}} 9 | 10 | [ajdecon-repo.el6.noarch] 11 | name = ajdecon-repo.el6.noarch 12 | baseurl = https://ajdecon-repo.s3.amazonaws.com/el6/noarch/ 13 | gpgcheck=0 14 | enabled=1 15 | s3_enabled=1 16 | key_id={{s3_key_id}} 17 | secret_key={{s3_secret_key}} 18 | 19 | -------------------------------------------------------------------------------- /roles/rsyslog-collector/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: ensure rsyslog is installed 3 | yum: name=rsyslog state=present 4 | 5 | - name: add collect.conf to syslog config 6 | template: src="collect.conf" dest="/etc/rsyslog.d/forward.conf" 7 | notify: 8 | - rsyslog is restarted 9 | 10 | - name: ensure /var/log/cluster exists 11 | file: path=/var/log/cluster owner=root group=root mode=0700 state=directory 12 | 13 | - name: ensure rsyslog is running and enabled 14 | service: name=rsyslog state=started enabled=yes 15 | -------------------------------------------------------------------------------- /roles/nfs-server/tasks/main.yml: 
-------------------------------------------------------------------------------- 1 | --- 2 | - name: ensure nfs-related packages are installed 3 | yum: name={{item}} state=present 4 | with_items: 5 | - nfs-utils 6 | - nfs-utils-lib 7 | - nfs-utils-lib-devel 8 | - nfs4-acl-tools 9 | 10 | - name: configure exports 11 | template: src=exports dest=/etc/exports 12 | notify: 13 | - exportfs 14 | 15 | - name: ensure rpcbind is running 16 | service: name=rpcbind state=started enabled=yes 17 | 18 | - name: ensure nfs is running 19 | service: name=nfs state=started enabled=yes 20 | -------------------------------------------------------------------------------- /roles/sethostname/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: live hostname set to setname value 3 | action: command hostname {{setname}} 4 | 5 | - name: re-gather facts to update ansible_hostname 6 | action: setup 7 | when: not ansible_hostname == "{{ setname }}" 8 | 9 | - name: persistent hostname set to setname value 10 | action: lineinfile dest=/etc/sysconfig/network regexp=^HOSTNAME= line=HOSTNAME={{setname}} 11 | 12 | - name: /etc/hosts reflects setname value 13 | action: lineinfile dest=/etc/hosts regexp=^{{ansible_eth0.ipv4.address}} line="{{ansible_eth0.ipv4.address}} {{setname}}" 14 | -------------------------------------------------------------------------------- /roles/slurm/README.md: -------------------------------------------------------------------------------- 1 | slurm 2 | ===== 3 | 4 | This role installs SLURM, the HPC cluster resource manager. 5 | 6 | Things you should change: 7 | 8 | - This role is dependent on the presence of a yum repo 9 | which contains the slurm packages. The easiest way to 10 | do this (for me) is to have another role which sets up 11 | the appropriate repo, and include it as a role dependency 12 | in meta/main.yml 13 | 14 | - You need to have a file munge.key for cluster node 15 | authentication in files/munge.key. 
Run, 16 | 17 | dd bs=1 if=/dev/urandom count=1024 of=files/munge.key 18 | -------------------------------------------------------------------------------- /roles/nfs-client/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: ensure nfs packages are installed 3 | yum: name={{item}} state=present 4 | with_items: 5 | - nfs-utils 6 | - nfs4-acl-tools 7 | 8 | - name: ensure mountpoints exist 9 | file: path={{item.mountpoint}} state=directory 10 | with_items: nfs_mounts 11 | 12 | - name: ensure rpcbind is running 13 | service: name=rpcbind state=started enabled=yes 14 | 15 | - name: ensure filesystems are mounted 16 | mount: name={{item.mountpoint}} src="{{item.server}}:{{item.path}}" fstype=nfs opts={{item.options}} state=mounted 17 | with_items: nfs_mounts 18 | 19 | -------------------------------------------------------------------------------- /roles/slurm/files/slurm_import.conf: -------------------------------------------------------------------------------- 1 | $ModLoad imfile 2 | 3 | $InputFileName /var/log/slurmd.log 4 | $InputFileStateFile stat-slurmd 5 | $InputFileTag slurmd: 6 | $InputFileSeverity info 7 | $InputFileFacility local7 8 | $InputFilePollInterval 60 9 | $InputRunFileMonitor 10 | 11 | $InputFileName /var/log/slurmctld.log 12 | $InputFileStateFile stat-slurmctld 13 | $InputFileTag slurmctld: 14 | $InputFileSeverity info 15 | $InputFileFacility local7 16 | $InputFilePollInterval 60 17 | $InputRunFileMonitor 18 | 19 | $InputFileName /var/log/slurmsched.log 20 | $InputFileStateFile stat-slurmsched 21 | $InputFileTag slurmsched: 22 | $InputFileSeverity info 23 | $InputFileFacility local7 24 | $InputFilePollInterval 60 25 | $InputRunFileMonitor 26 | 27 | -------------------------------------------------------------------------------- /roles/slurmdbd/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: slurmdbd packages are installed 3 | yum: name={{item}} state=present 4 | with_items: 5 | - slurm-munge 6 | - slurm-plugins 7 | - munge 8 | - slurm-slurmdbd 9 | - slurm-sql 10 | - mysql-server 11 | - MySQL-python 12 | 13 | - name: mysqld is running 14 | service: name=mysqld state=started 15 | 16 | - name: slurm user exists 17 | user: name=slurm state=present 18 | 19 | - name: slurm mysqld user is configured 20 | mysql_user: 21 | name=slurm 22 | password={{slurmdbd_password}} 23 | priv="slurm_acct_db.*:ALL" 24 | login_host={{mysql_host}} 25 | login_user={{mysql_admin}} 26 | login_password={{mysql_pw}} 27 | 28 | - name: slurmdbd service is configured 29 | template: src=slurmdbd.conf dest=/etc/slurm/slurmdbd.conf 30 | notify: 31 | - slurmdbd is restarted 32 | 33 | - name: slurmdbd is started and enabled 34 | service: name=slurmdbd state=started enabled=yes 35 | -------------------------------------------------------------------------------- /hosts.real: -------------------------------------------------------------------------------- 1 | # This is an example inventory file for using the playbooks in this repo. 2 | # Note that I'm using a role called "sethostname" to change the hostname 3 | # of each host, according to the per-host value "setname" defined below. 4 | # An "etchosts" role then sets up /etc/hosts according to the system 5 | # hostnames. 6 | 7 | # Several of the roles assume the name of the head node will be "head". 8 | # Their defaults/main.yml files should make it clear where this is needs 9 | # to be changed if you don't want that name. 
10 | 11 | # Other roles assume the groups "compute" and "cluster" are being used to 12 | # group the compute nodes and the entire cluster, respectively. Again, this 13 | # group name is usually defined as a variable in defaults/main.yml so you 14 | # can change it. 15 | 16 | [head] 17 | 52.5.44.27 setname=head 18 | 19 | [compute] 20 | 52.6.164.63 setname=compute0 21 | 52.6.176.90 setname=compute1 22 | 52.6.179.184 setname=compute2 23 | 24 | [cluster:children] 25 | head 26 | compute 27 | -------------------------------------------------------------------------------- /hosts.test: -------------------------------------------------------------------------------- 1 | # This is an example inventory file for using the playbooks in this repo. 2 | # Note that I'm using a role called "sethostname" to change the hostname 3 | # of each host, according to the per-host value "setname" defined below. 4 | # An "etchosts" role then sets up /etc/hosts according to the system 5 | # hostnames. 6 | 7 | # Several of the roles assume the name of the head node will be "head". 8 | # Their defaults/main.yml files should make it clear where this is needs 9 | # to be changed if you don't want that name. 10 | 11 | # Other roles assume the groups "compute" and "cluster" are being used to 12 | # group the compute nodes and the entire cluster, respectively. Again, this 13 | # group name is usually defined as a variable in defaults/main.yml so you 14 | # can change it. 15 | 16 | [head] 17 | XX.XX.XX.XX setname=head 18 | 19 | [compute] 20 | XX.XX.XX.XX setname=compute0 21 | XX.XX.XX.XX setname=compute1 22 | XX.XX.XX.XX setname=compute2 23 | XX.XX.XX.XX setname=compute3 24 | 25 | [cluster:children] 26 | head 27 | compute 28 | -------------------------------------------------------------------------------- /roles/epel6/files/epel.repo: -------------------------------------------------------------------------------- 1 | [epel6] 2 | name=Extra Packages for Enterprise Linux 6 - $basearch 3 | #baseurl=http://download.fedoraproject.org/pub/epel/6/$basearch 4 | mirrorlist=https://mirrors.fedoraproject.org/metalink?repo=epel-6&arch=$basearch 5 | failovermethod=priority 6 | enabled=1 7 | gpgcheck=1 8 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-6 9 | 10 | [epel6-debuginfo] 11 | name=Extra Packages for Enterprise Linux 6 - $basearch - Debug 12 | #baseurl=http://download.fedoraproject.org/pub/epel/6/$basearch/debug 13 | mirrorlist=https://mirrors.fedoraproject.org/metalink?repo=epel-debug-6&arch=$basearch 14 | failovermethod=priority 15 | enabled=0 16 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-6 17 | gpgcheck=1 18 | 19 | [epel6-source] 20 | name=Extra Packages for Enterprise Linux 6 - $basearch - Source 21 | #baseurl=http://download.fedoraproject.org/pub/epel/6/SRPMS 22 | mirrorlist=https://mirrors.fedoraproject.org/metalink?repo=epel-source-6&arch=$basearch 23 | failovermethod=priority 24 | enabled=0 25 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-6 26 | gpgcheck=1 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Adam DeConinck 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the 
Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /roles/ganglia-gmetad/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install ganglia packages 3 | yum: name={{ item }} state=present 4 | with_items: 5 | - ganglia-devel 6 | - ganglia-gmetad 7 | - ganglia-web 8 | 9 | - name: gmetad is configured 10 | template: src="gmetad.conf" dest="/etc/ganglia/gmetad.conf" 11 | notify: 12 | - gmetad is restarted 13 | 14 | - name: gmetad is enabled and running 15 | service: name=gmetad state=started enabled=yes 16 | 17 | - name: httpd is enabled and running for the web interface 18 | when: enable_web_interface 19 | service: name=httpd state=started enabled=yes 20 | 21 | - name: ensure /var/lib/ganglia-web/dwoo/ is writable by web interface 22 | when: enable_web_interface 23 | file: path=/var/lib/ganglia-web/dwoo group=apache mode=0775 24 | 25 | - name: ensure /var/lib/ganglia-web/dwoo/cache is writable by web interface 26 | when: enable_web_interface 27 | file: path=/var/lib/ganglia-web/dwoo/cache group=apache mode=0775 28 | 29 | - name: ensure /var/lib/ganglia-web/dwoo/compiled is writable by web interface 30 | when: enable_web_interface 31 | file: path=/var/lib/ganglia-web/dwoo/compiled group=apache mode=0775 32 | -------------------------------------------------------------------------------- /cluster.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # This Ansible playbook sets up a relatively basic HPC-style compute cluster 3 | # using the SLURM resource manager. It also sets up a basic NFS share of /home 4 | # to the compute nodes, configures Ganglia and syslog forwarding to make it 5 | # easier to monitor, and installs some common packages for HPC. 6 | # 7 | # Note that the file config.yml is used to define some variables which 8 | # configure the roles' behavior. The roles should be written in such a way that 9 | # you can run with all the defaults, and leave config.yml empty. 
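# As a sketch, a minimal config.yml override might contain nothing more than
# (variable names taken from roles/slurm/defaults/main.yml; values here are
# illustrative):
#
#   ---
#   slurm_cpus_per_node: 4
#   slurm_cluster_name: testcluster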
10 | 11 | 12 | # Run roles common to all hosts in cluster 13 | - hosts: cluster 14 | user: root 15 | vars_files: 16 | - config.yml 17 | - s3.yml 18 | roles: 19 | - "epel6" 20 | - "ajdecon-repo" 21 | - "sethostname" 22 | - "etchosts" 23 | - "slurm" 24 | - "ganglia-gmond" 25 | - "common-cluster-pkgs" 26 | 27 | # Run roles specific to the cluster headnode 28 | - hosts: head 29 | user: root 30 | vars_files: 31 | - config.yml 32 | roles: 33 | - "nfs-server" 34 | - "rsyslog-collector" 35 | - "ganglia-gmetad" 36 | - "elk-log-server" 37 | - role: "rsyslog-forwarder" 38 | syslog_target: "localhost" 39 | syslog_target_port: 5200 40 | syslog_fwd_file: "elk-fwd.conf" 41 | 42 | # Run roles specific to the cluster compute nodes 43 | - hosts: compute 44 | user: root 45 | vars_files: 46 | - config.yml 47 | roles: 48 | - "nfs-client" 49 | - "rsyslog-forwarder" 50 | -------------------------------------------------------------------------------- /roles/slurm/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: munge and slurm are installed 3 | yum: name={{item}} state=present 4 | with_items: 5 | - munge 6 | - slurm 7 | - slurm-munge 8 | - slurm-plugins 9 | - slurm-pam_slurm 10 | - slurm-perlapi 11 | - libcgroup 12 | 13 | - name: cgroups are started and enabled 14 | service: name=cgconfig state=started enabled=yes 15 | 16 | - name: munge key is present 17 | copy: src=munge.key dest=/etc/munge/munge.key owner=munge group=munge mode=0600 force=yes 18 | notify: 19 | - munge is restarted 20 | 21 | - name: munge is started and enabled 22 | service: name=munge state=started enabled=yes 23 | 24 | - name: slurm group exists 25 | group: name=slurm state=present 26 | 27 | - name: slurm user exists 28 | user: name=slurm group=slurm state=present 29 | 30 | - name: slurm_state_dir exists 31 | file: path={{ slurm_state_dir }} owner=slurm group=slurm mode=0755 state=directory 32 | 33 | - name: accounting storage file exists 34 | file: path={{ slurm_acct_file }} owner=slurm group=slurm mode=0644 state=touch 35 | 36 | - name: slurm is configured 37 | template: src={{item}} dest=/etc/slurm/{{item}} owner=slurm mode=0644 38 | with_items: 39 | - slurm.conf 40 | - cgroup.conf 41 | notify: 42 | - slurm is restarted 43 | 44 | - name: slurm is started and enabled 45 | service: name=slurm state=started enabled=yes 46 | 47 | - name: check if rsyslog.d exists 48 | stat: path="/etc/rsyslog.d" 49 | register: check_rsyslogd 50 | 51 | - when: check_rsyslogd.stat.exists 52 | name: drop rsyslog.d config file 53 | copy: src="slurm_import.conf" dest="/etc/rsyslog.d/slurm_import.conf" 54 | owner=root group=root mode=0444 55 | notify: 56 | - rsyslog is restarted 57 | 58 | -------------------------------------------------------------------------------- /roles/common-cluster-pkgs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: install common packages 3 | yum: name={{ item }} state=present 4 | with_items: 5 | - "environment-modules" 6 | - "pdsh" 7 | - "pdsh-rcmd-ssh" 8 | - "@development" 9 | - "gcc-gfortran" 10 | - hdf5 11 | - hdf5-devel 12 | - hdf5-static 13 | - netcdf 14 | - netcdf-devel 15 | 16 | - name: install openib support 17 | when: openib_enabled 18 | yum: name={{ item }} state=present 19 | with_items: 20 | - "@infiniband" 21 | - libibcm-devel 22 | - librdmacm-devel 23 | - libibverbs-devel 24 | - libibmad-devel 25 | - libibumad-devel 26 | 27 | - name: install openblas 28 | when: openblas_enabled 29 | yum: 
name={{ item }} state=present 30 | with_items: 31 | - openblas 32 | - openblas-devel 33 | - openblas-openmp 34 | 35 | - name: install OpenMPI packages 36 | when: openmpi_enabled 37 | yum: name={{ item }} state=present 38 | with_items: 39 | - openmpi 40 | - openmpi-devel 41 | - mpi4py-openmpi 42 | - mpitests-openmpi 43 | - boost-openmpi 44 | - boost-openmpi-devel 45 | - paraview-openmpi 46 | - hdf5-openmpi 47 | - scalapack-openmpi 48 | 49 | - name: install mpich2 packages 50 | when: mpich2_enabled 51 | yum: name={{ item }} state=present 52 | with_items: 53 | - mpich2 54 | - mpich2-devel 55 | - mpi4py-mpich 56 | - mpitests-mpich 57 | - hdf5-mpich 58 | - paraview-mpich 59 | - scalapack-mpich 60 | 61 | - name: install python packages 62 | when: python_enabled 63 | yum: name={{ item }} state=present 64 | with_items: 65 | - numpy 66 | - numpy-f2py 67 | - scipy 68 | - h5py 69 | 70 | - name: upload osu mpi micro-benchmarks 71 | when: upload_osu_benchmarks 72 | copy: src="osu-micro-benchmarks-4.4.1.tar.gz" dest="/opt/osu-micro-benchmarks-4.4.1.tar.gz" mode=0444 73 | -------------------------------------------------------------------------------- /roles/elk-log-server/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: import and trust elasticsearch rpm gpg key 3 | rpm_key: key="https://packages.elasticsearch.org/GPG-KEY-elasticsearch" 4 | 5 | - name: configure elasticsearch repo 6 | copy: src="elasticsearch-1.5.repo" dest="/etc/yum.repos.d/elasticsearch-1.5.repo" 7 | owner=root group=root mode=0444 8 | 9 | - name: upload logstash and kibana (not in repos) 10 | get_url: url="{{ item }}" dest="/tmp/" 11 | with_items: 12 | - "https://download.elastic.co/kibana/kibana/kibana-4.0.2-linux-x64.tar.gz" 13 | - "https://download.elastic.co/logstash/logstash/packages/centos/logstash-1.4.2-1_2c0f5a1.noarch.rpm" 14 | 15 | - name: install elasticsearch and logstash 16 | yum: name="{{ item }}" state=present 17 | with_items: 18 | - java-1.7.0-openjdk 19 | - elasticsearch 20 | - "/tmp/logstash-1.4.2-1_2c0f5a1.noarch.rpm" 21 | - supervisor 22 | 23 | - name: configure elasticsearch 24 | template: src="elasticsearch.yml" dest="/etc/elasticsearch/elasticsearch.yml" 25 | owner=root group=root mode=0444 26 | notify: 27 | - restart elasticsearch 28 | 29 | - name: elasticsearch started and enabled 30 | service: name=elasticsearch state=started enabled=yes 31 | 32 | - name: extract kibana 33 | unarchive: src="/tmp/kibana-4.0.2-linux-x64.tar.gz" dest="/opt" copy=no 34 | 35 | - name: configure kibana 36 | template: src="kibana.yml" dest="/opt/kibana-4.0.2-linux-x64/kibana.yml" 37 | owner=root group=root mode=0444 38 | 39 | - name: configure supervisord (run kibana) 40 | template: src="supervisord.conf" dest="/etc/supervisord.conf" backup=yes 41 | notify: 42 | - restart supervisord 43 | 44 | - name: supervisord started and enabled 45 | service: name=supervisord state=started enabled=yes 46 | 47 | - name: configure logstash 48 | template: src="{{ item }}" dest="/etc/logstash/conf.d/" 49 | with_items: 50 | - "01-elasticsearch-storage.conf" 51 | - "10-syslog-input.conf" 52 | notify: 53 | - restart logstash 54 | 55 | - name: logstash started and enabled 56 | service: name=logstash state=started enabled=yes 57 | -------------------------------------------------------------------------------- /roles/elk-log-server/templates/kibana.yml: -------------------------------------------------------------------------------- 1 | # Kibana is served by a back end server. 
This controls which port to use. 2 | port: 5601 3 | 4 | # The host to bind the server to. 5 | host: "0.0.0.0" 6 | 7 | # The Elasticsearch instance to use for all your queries. 8 | elasticsearch_url: "http://localhost:9200" 9 | 10 | # preserve_elasticsearch_host true will send the hostname specified in `elasticsearch`. If you set it to false, 11 | # then the host you use to connect to *this* Kibana instance will be sent. 12 | elasticsearch_preserve_host: true 13 | 14 | # Kibana uses an index in Elasticsearch to store saved searches, visualizations 15 | # and dashboards. It will create a new index if it doesn't already exist. 16 | kibana_index: ".kibana" 17 | 18 | # If your Elasticsearch is protected with basic auth, these are the user credentials 19 | # used by the Kibana server to perform maintenance on the kibana_index at startup. Your Kibana 20 | # users will still need to authenticate with Elasticsearch (which is proxied through 21 | # the Kibana server) 22 | # kibana_elasticsearch_username: user 23 | # kibana_elasticsearch_password: pass 24 | 25 | # If your Elasticsearch requires client certificate and key 26 | # kibana_elasticsearch_client_crt: /path/to/your/client.crt 27 | # kibana_elasticsearch_client_key: /path/to/your/client.key 28 | 29 | # If you need to provide a CA certificate for your Elasticsearch instance, put 30 | # the path of the pem file here. 31 | # ca: /path/to/your/CA.pem 32 | 33 | # The default application to load. 34 | default_app_id: "discover" 35 | 36 | # Time in milliseconds to wait for responses from the back end or elasticsearch. 37 | # This must be > 0 38 | request_timeout: 300000 39 | 40 | # Time in milliseconds for Elasticsearch to wait for responses from shards. 41 | # Set to 0 to disable. 42 | shard_timeout: 0 43 | 44 | # Set to false to have a complete disregard for the validity of the SSL 45 | # certificate. 46 | verify_ssl: true 47 | 48 | # SSL for outgoing requests from the Kibana Server (PEM formatted) 49 | # ssl_key_file: /path/to/your/server.key 50 | # ssl_cert_file: /path/to/your/server.crt 51 | 52 | # Set the path to where you would like the process id file to be created. 53 | # pid_file: /var/run/kibana.pid 54 | 55 | # Plugins that are included in the build, and no longer found in the plugins/ folder 56 | bundled_plugin_ids: 57 | - plugins/dashboard/index 58 | - plugins/discover/index 59 | - plugins/doc/index 60 | - plugins/kibana/index 61 | - plugins/markdown_vis/index 62 | - plugins/metric_vis/index 63 | - plugins/settings/index 64 | - plugins/table_vis/index 65 | - plugins/vis_types/index 66 | - plugins/visualize/index 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ansible-simple-slurm-cluster 2 | ============================ 3 | 4 | This repo contains a set of Ansible roles for setting up a relatively basic 5 | HPC-style compute cluster, along with an example playbook for using them. 6 | 7 | **These scripts should not be considered production-quality!** (Though 8 | you can always use them or get inspiration from them if you like.) I use 9 | the roles in this repo to set up ephemeral clusters while playing with new 10 | ideas or writing software. I am rarely concerned with stability or user 11 | experience, and more often with seeing what I can break. :) YMMV! 12 | 13 | They have mostly been tested on EC2 but there's 14 | nothing cloud-specific in them, so they should work on normal hardware. 
15 | 16 | 17 | So what does it do? 18 | ------------------- 19 | 20 | - Sets host names and builds an /etc/hosts file 21 | - Configures [SLURM](http://slurm.schedmd.com) as the resource manager 22 | - Exports /home over NFS from the head node to the compute nodes 23 | - Forwards syslog to the head node 24 | - Sets up the [Ganglia](http://ganglia.sourceforge.net/) monitoring system (not using multicast b/c mostly used on EC2) 25 | - Installs some common HPC dev tools 26 | 27 | This is usually enough to get me started on whatever other project I'm working 28 | on. 29 | 30 | 31 | Prerequisites 32 | ------------- 33 | 34 | - All the roles assume you're using EL6 (i.e. CentOS, RHEL, Scientific Linux) 35 | 36 | - [SLURM](http://slurm.schedmd.com) and [Munge](https://code.google.com/p/munge/) 37 | are not distributed as RPMs, so I built those RPMs and stuck 38 | them in a repository on S3. The "ajdecon-repo" role configures each node of 39 | the cluster to include this repo when installing software. 40 | 41 | The S3 bucket is not public because I like low bandwidth bills. :) The RPMs 42 | themselves are very easy to build, so I suggest just setting up your own 43 | YUM repo to install them from. 44 | 45 | 46 | How do I use the playbook? 47 | -------------------------- 48 | 49 | 1. Set up an Ansible [inventory file](http://docs.ansible.com/intro_inventory.html) 50 | similar to the included hosts.test. Each host can have a "setname=" 51 | parameter included next to it, and the "sethostname" role will use that 52 | to configure the hostname. 53 | 54 | Note that many of the roles assume your head node will be named "head", and 55 | that the "compute" and "cluster" groups exist. However, this can generally be 56 | changed for each role: see the variables in `defaults/main.yml` for each one. 57 | 58 | 2. Run `ansible-playbook -i <your-inventory-file> cluster.yml` and wait for your 59 | cluster to be ready! 60 | 61 | 62 | Other notes 63 | ----------- 64 | 65 | Many of the roles have variables you can set to control their behavior. (For 66 | example, there are a few knobs to turn on the "slurm" role.) You can change 67 | their values by setting them in "config.yml" and they'll get picked up 68 | in "cluster.yml". See the `defaults/main.yml` file in each role to see what 69 | variables are available to change. 70 | 71 | 72 | -------------------------------------------------------------------------------- /roles/elk-log-server/templates/supervisord.conf: -------------------------------------------------------------------------------- 1 | 2 | [supervisord] 3 | http_port=/var/tmp/supervisor.sock ; (default is to run a UNIX domain socket server) 4 | ;http_port=127.0.0.1:9001 ; (alternately, ip_address:port specifies AF_INET) 5 | ;sockchmod=0700 ; AF_UNIX socketmode (AF_INET ignore, default 0700) 6 | ;sockchown=nobody.nogroup ; AF_UNIX socket uid.gid owner (AF_INET ignores) 7 | ;umask=022 ; (process file creation umask;default 022) 8 | logfile=/var/log/supervisor/supervisord.log ; (main log file;default $CWD/supervisord.log) 9 | logfile_maxbytes=50MB ; (max main logfile bytes b4 rotation;default 50MB) 10 | logfile_backups=10 ; (num of main logfile rotation backups;default 10) 11 | loglevel=info ; (logging level;default info; others: debug,warn) 12 | pidfile=/var/run/supervisord.pid ; (supervisord pidfile;default supervisord.pid) 13 | nodaemon=false ; (start in foreground if true;default false) 14 | minfds=1024 ; (min. avail startup file descriptors;default 1024) 15 | minprocs=200 ; (min. 
avail process descriptors;default 200) 16 | 17 | ;nocleanup=true ; (don't clean up tempfiles at start;default false) 18 | ;http_username=user ; (default is no username (open system)) 19 | ;http_password=123 ; (default is no password (open system)) 20 | ;childlogdir=/tmp ; ('AUTO' child log dir, default $TEMP) 21 | ;user=chrism ; (default is current user, required if root) 22 | ;directory=/tmp ; (default is not to cd during start) 23 | ;environment=KEY=value ; (key value pairs to add to environment) 24 | 25 | [supervisorctl] 26 | serverurl=unix:///var/tmp/supervisor.sock ; use a unix:// URL for a unix socket 27 | ;serverurl=http://127.0.0.1:9001 ; use an http:// url to specify an inet socket 28 | ;username=chris ; should be same as http_username if set 29 | ;password=123 ; should be same as http_password if set 30 | ;prompt=mysupervisor ; cmd line prompt (default "supervisor") 31 | 32 | ; The below sample program section shows all possible program subsection values, 33 | ; create one or more 'real' program: sections to be able to control them under 34 | ; supervisor. 35 | 36 | [program:kibana] 37 | command=/opt/kibana-4.0.2-linux-x64/bin/kibana 38 | autostart=true 39 | autorestart=true 40 | logfile=syslog 41 | 42 | ;[program:theprogramname] 43 | ;command=/bin/cat ; the program (relative uses PATH, can take args) 44 | ;priority=999 ; the relative start priority (default 999) 45 | ;autostart=true ; start at supervisord start (default: true) 46 | ;autorestart=true ; retstart at unexpected quit (default: true) 47 | ;startsecs=10 ; number of secs prog must stay running (def. 10) 48 | ;startretries=3 ; max # of serial start failures (default 3) 49 | ;exitcodes=0,2 ; 'expected' exit codes for process (default 0,2) 50 | ;stopsignal=QUIT ; signal used to kill process (default TERM) 51 | ;stopwaitsecs=10 ; max num secs to wait before SIGKILL (default 10) 52 | ;user=chrism ; setuid to this UNIX account to run the program 53 | ;log_stdout=true ; if true, log program stdout (default true) 54 | ;log_stderr=true ; if true, log program stderr (def false) 55 | ;logfile=/var/log/cat.log ; child log path, use NONE for none; default AUTO 56 | ;logfile_maxbytes=1MB ; max # logfile bytes b4 rotation (default 50MB) 57 | ;logfile_backups=10 ; # of logfile backups (default 10) 58 | 59 | 60 | -------------------------------------------------------------------------------- /roles/slurm/templates/slurm.conf: -------------------------------------------------------------------------------- 1 | # slurm.conf file generated by configurator.html. 2 | # Put this file on all nodes of your cluster. 3 | # See the slurm.conf man page for more information. 
4 | # 5 | ControlMachine={{slurmctld_host}} 6 | #ControlAddr= 7 | #BackupController= 8 | #BackupAddr= 9 | # 10 | AuthType=auth/munge 11 | CacheGroups=0 12 | #CheckpointType=checkpoint/none 13 | CryptoType=crypto/munge 14 | #DisableRootJobs=NO 15 | #EnforcePartLimits=NO 16 | #Epilog= 17 | #EpilogSlurmctld= 18 | #FirstJobId=1 19 | #MaxJobId=999999 20 | #GresTypes= 21 | #GroupUpdateForce=0 22 | #GroupUpdateTime=600 23 | #JobCheckpointDir=/var/slurm/checkpoint 24 | #JobCredentialPrivateKey= 25 | #JobCredentialPublicCertificate= 26 | #JobFileAppend=0 27 | #JobRequeue=1 28 | #JobSubmitPlugins=1 29 | #KillOnBadExit=0 30 | #Licenses=foo*4,bar 31 | #MailProg=/bin/mail 32 | #MaxJobCount=5000 33 | #MaxStepCount=40000 34 | #MaxTasksPerNode=128 35 | MpiDefault=none 36 | MpiParams=ports=12000-12999 37 | #PluginDir= 38 | #PlugStackConfig= 39 | #PrivateData=jobs 40 | ProctrackType=proctrack/pgid 41 | #Prolog= 42 | #PrologSlurmctld= 43 | #PropagatePrioProcess=0 44 | #PropagateResourceLimits= 45 | #PropagateResourceLimitsExcept= 46 | ReturnToService=1 47 | #SallocDefaultCommand= 48 | SlurmctldPidFile=/var/run/slurmctld.pid 49 | SlurmctldPort=6817 50 | SlurmdPidFile=/var/run/slurmd.pid 51 | SlurmdPort=6818 52 | SlurmdSpoolDir=/tmp/slurmd 53 | SlurmUser=slurm 54 | #SlurmdUser=root 55 | #SrunEpilog= 56 | #SrunProlog= 57 | StateSaveLocation={{ slurm_state_dir }} 58 | SwitchType=switch/none 59 | #TaskEpilog= 60 | TaskPlugin=task/none 61 | #TaskPluginParam= 62 | #TaskProlog= 63 | #TopologyPlugin=topology/tree 64 | #TmpFs=/tmp 65 | #TrackWCKey=no 66 | #TreeWidth= 67 | #UnkillableStepProgram= 68 | #UsePAM=0 69 | # 70 | # 71 | # TIMERS 72 | #BatchStartTimeout=10 73 | #CompleteWait=0 74 | #EpilogMsgTime=2000 75 | #GetEnvTimeout=2 76 | #HealthCheckInterval=0 77 | #HealthCheckProgram= 78 | InactiveLimit=0 79 | KillWait=30 80 | #MessageTimeout=10 81 | #ResvOverRun=0 82 | MinJobAge=300 83 | #OverTimeLimit=0 84 | SlurmctldTimeout=120 85 | SlurmdTimeout=300 86 | #UnkillableStepTimeout=60 87 | #VSizeFactor=0 88 | Waittime=0 89 | # 90 | # 91 | # SCHEDULING 92 | #DefMemPerCPU=0 93 | FastSchedule=1 94 | #MaxMemPerCPU=0 95 | #SchedulerRootFilter=1 96 | #SchedulerTimeSlice=30 97 | SchedulerType=sched/backfill 98 | SchedulerPort=7321 99 | SelectType=select/linear 100 | #SelectTypeParameters= 101 | # 102 | # 103 | # JOB PRIORITY 104 | #PriorityType=priority/basic 105 | #PriorityDecayHalfLife= 106 | #PriorityCalcPeriod= 107 | #PriorityFavorSmall= 108 | #PriorityMaxAge= 109 | #PriorityUsageResetPeriod= 110 | #PriorityWeightAge= 111 | #PriorityWeightFairshare= 112 | #PriorityWeightJobSize= 113 | #PriorityWeightPartition= 114 | #PriorityWeightQOS= 115 | # 116 | # 117 | # LOGGING AND ACCOUNTING 118 | #AccountingStorageEnforce=0 119 | #AccountingStorageHost= 120 | #AccountingStorageLoc= 121 | #AccountingStoragePass= 122 | #AccountingStoragePort= 123 | {% if slurmdbd_enabled %} 124 | AccountingStorageType=accounting_storage/slurmdbd 125 | AccountingStorageHost={{slurmdbd_host}} 126 | {% else %} 127 | AccountingStorageType=accounting_storage/filetxt 128 | AccountingStorageLoc={{ slurm_acct_file }} 129 | {% endif %} 130 | #AccountingStorageUser= 131 | AccountingStoreJobComment=YES 132 | ClusterName={{slurm_cluster_name}} 133 | #DebugFlags= 134 | #JobCompHost= 135 | #JobCompLoc= 136 | #JobCompPass= 137 | #JobCompPort= 138 | JobCompType=jobcomp/none 139 | #JobCompUser= 140 | JobAcctGatherFrequency=30 141 | JobAcctGatherType=jobacct_gather/none 142 | SlurmctldDebug=3 143 | {% if slurm_log_to_syslog %} 144 | SlurmctldLogFile= 145 | 
SlurmdLogFile= 146 | SlurmSchedLogFile= 147 | SlurmSchedLogLevel=1 148 | {% else %} 149 | SlurmctldLogFile=/var/log/slurmctld.log 150 | SlurmdLogFile=/var/log/slurmd.log 151 | SlurmSchedLogFile=/var/log/slurmsched.log 152 | SlurmSchedLogLevel=1 153 | {% endif %} 154 | SlurmdDebug=3 155 | # 156 | # 157 | # POWER SAVE SUPPORT FOR IDLE NODES (optional) 158 | #SuspendProgram= 159 | #ResumeProgram= 160 | #SuspendTimeout= 161 | #ResumeTimeout= 162 | #ResumeRate= 163 | #SuspendExcNodes= 164 | #SuspendExcParts= 165 | #SuspendRate= 166 | #SuspendTime= 167 | # 168 | # 169 | # COMPUTE NODES 170 | {% for cn in groups[slurm_compute_group] %} 171 | NodeName={{hostvars[cn]['ansible_hostname']}} CPUs={{slurm_cpus_per_node}} State=UNKNOWN 172 | {% endfor %} 173 | 174 | PartitionName={{slurm_partition_name}} Nodes={% for cn in groups[slurm_compute_group] -%}{{hostvars[cn]['ansible_hostname']}}{% if not loop.last -%} 175 | ,{% endif %}{% endfor %} Default=YES MaxTime=INFINITE State=UP 176 | 177 | -------------------------------------------------------------------------------- /roles/ganglia-gmetad/templates/gmetad.conf: -------------------------------------------------------------------------------- 1 | # This is an example of a Ganglia Meta Daemon configuration file 2 | # http://ganglia.sourceforge.net/ 3 | # 4 | # $Id: gmetad.conf.in 2014 2009-08-10 10:44:09Z d_pocock $ 5 | # 6 | #------------------------------------------------------------------------------- 7 | # Setting the debug_level to 1 will keep daemon in the forground and 8 | # show only error messages. Setting this value higher than 1 will make 9 | # gmetad output debugging information and stay in the foreground. 10 | # default: 0 11 | # debug_level 10 12 | # 13 | #------------------------------------------------------------------------------- 14 | # What to monitor. The most important section of this file. 15 | # 16 | # The data_source tag specifies either a cluster or a grid to 17 | # monitor. If we detect the source is a cluster, we will maintain a complete 18 | # set of RRD databases for it, which can be used to create historical 19 | # graphs of the metrics. If the source is a grid (it comes from another gmetad), 20 | # we will only maintain summary RRDs for it. 21 | # 22 | # Format: 23 | # data_source "my cluster" [polling interval] address1:port addreses2:port ... 24 | # 25 | # The keyword 'data_source' must immediately be followed by a unique 26 | # string which identifies the source, then an optional polling interval in 27 | # seconds. The source will be polled at this interval on average. 28 | # If the polling interval is omitted, 15sec is asssumed. 29 | # 30 | # A list of machines which service the data source follows, in the 31 | # format ip:port, or name:port. If a port is not specified then 8649 32 | # (the default gmond port) is assumed. 
33 | # default: There is no default value 34 | # 35 | # data_source "my cluster" 10 localhost my.machine.edu:8649 1.2.3.5:8655 36 | # data_source "my grid" 50 1.3.4.7:8655 grid.org:8651 grid-backup.org:8651 37 | # data_source "another source" 1.3.4.7:8655 1.3.4.8 38 | 39 | data_source "my cluster" localhost 40 | 41 | # 42 | # Round-Robin Archives 43 | # You can specify custom Round-Robin archives here (defaults are listed below) 44 | # 45 | # RRAs "RRA:AVERAGE:0.5:1:244" "RRA:AVERAGE:0.5:24:244" "RRA:AVERAGE:0.5:168:244" "RRA:AVERAGE:0.5:672:244" \ 46 | # "RRA:AVERAGE:0.5:5760:374" 47 | # 48 | 49 | # 50 | #------------------------------------------------------------------------------- 51 | # Scalability mode. If on, we summarize over downstream grids, and respect 52 | # authority tags. If off, we take on 2.5.0-era behavior: we do not wrap our output 53 | # in tags, we ignore all tags we see, and always assume 54 | # we are the "authority" on data source feeds. This approach does not scale to 55 | # large groups of clusters, but is provided for backwards compatibility. 56 | # default: on 57 | # scalable off 58 | # 59 | #------------------------------------------------------------------------------- 60 | # The name of this Grid. All the data sources above will be wrapped in a GRID 61 | # tag with this name. 62 | # default: unspecified 63 | gridname "{{ grid_name }}" 64 | # 65 | #------------------------------------------------------------------------------- 66 | # The authority URL for this grid. Used by other gmetads to locate graphs 67 | # for our data sources. Generally points to a ganglia/ 68 | # website on this machine. 69 | # default: "http://hostname/ganglia/", 70 | # where hostname is the name of this machine, as defined by gethostname(). 71 | # authority "http://mycluster.org/newprefix/" 72 | # 73 | #------------------------------------------------------------------------------- 74 | # List of machines this gmetad will share XML with. Localhost 75 | # is always trusted. 76 | # default: There is no default value 77 | # trusted_hosts 127.0.0.1 169.229.50.165 my.gmetad.org 78 | trusted_hosts 127.0.0.1 79 | # 80 | #------------------------------------------------------------------------------- 81 | # If you want any host which connects to the gmetad XML to receive 82 | # data, then set this value to "on" 83 | # default: off 84 | # all_trusted on 85 | # 86 | #------------------------------------------------------------------------------- 87 | # If you don't want gmetad to setuid then set this to off 88 | # default: on 89 | # setuid off 90 | # 91 | #------------------------------------------------------------------------------- 92 | # User gmetad will setuid to (defaults to "ganglia") 93 | # default: "ganglia" 94 | # setuid_username "ganglia" 95 | # 96 | #------------------------------------------------------------------------------- 97 | # The port gmetad will answer requests for XML 98 | # default: 8651 99 | xml_port {{ xml_port }} 100 | # 101 | #------------------------------------------------------------------------------- 102 | # The port gmetad will answer queries for XML. This facility allows 103 | # simple subtree and summation views of the XML tree. 
104 | # default: 8652 105 | interactive_port {{ interactive_port }} 106 | # 107 | #------------------------------------------------------------------------------- 108 | # The number of threads answering XML requests 109 | # default: 4 110 | # server_threads 10 111 | # 112 | #------------------------------------------------------------------------------- 113 | # Where gmetad stores its round-robin databases 114 | # default: "/var/lib/ganglia/rrds" 115 | # rrd_rootdir "/some/other/place" 116 | # 117 | #------------------------------------------------------------------------------- 118 | # In earlier versions of gmetad, hostnames were handled in a case 119 | # sensitive manner 120 | # If your hostname directories have been renamed to lower case, 121 | # set this option to 0 to disable backward compatibility. 122 | # From version 3.2, backwards compatibility will be disabled by default. 123 | # default: 1 (for gmetad < 3.2) 124 | # default: 0 (for gmetad >= 3.2) 125 | case_sensitive_hostnames 1 126 | 127 | -------------------------------------------------------------------------------- /roles/ganglia-gmond/templates/gmond.conf: -------------------------------------------------------------------------------- 1 | /* This configuration is as close to 2.5.x default behavior as possible 2 | The values closely match ./gmond/metric.h definitions in 2.5.x */ 3 | globals { 4 | daemonize = yes 5 | setuid = yes 6 | user = ganglia 7 | debug_level = 0 8 | max_udp_msg_len = 1472 9 | mute = no 10 | deaf = no 11 | allow_extra_data = yes 12 | host_dmax = 0 /*secs */ 13 | cleanup_threshold = 300 /*secs */ 14 | gexec = no 15 | send_metadata_interval = 0 /*secs */ 16 | } 17 | 18 | /* 19 | * The cluster attributes specified will be used as part of the 20 | * tag that will wrap all hosts collected by this instance. 21 | */ 22 | cluster { 23 | name = "{{ cluster_name }}" 24 | owner = "{{ cluster_owner }}" 25 | latlong = "unspecified" 26 | url = "unspecified" 27 | } 28 | 29 | /* The host section describes attributes of the host, like the location */ 30 | host { 31 | location = "unspecified" 32 | } 33 | 34 | /* Feel free to specify as many udp_send_channels as you like. Gmond 35 | used to only support having a single channel */ 36 | udp_send_channel { 37 | bind_hostname = yes # Highly recommended, soon to be default. 38 | # This option tells gmond to use a source address 39 | # that resolves to the machine's hostname. Without 40 | # this, the metrics may appear to come from any 41 | # interface and the DNS names associated with 42 | # those IPs will be used to create the RRDs. 43 | #mcast_join = 239.2.11.71 44 | host={{ target_host }} 45 | port = {{ target_port }} 46 | ttl = 1 47 | } 48 | 49 | /* You can specify as many udp_recv_channels as you like as well. */ 50 | udp_recv_channel { 51 | #mcast_join = 239.2.11.71 52 | port = {{ recv_port }} 53 | #bind = 239.2.11.71 54 | } 55 | 56 | /* You can specify as many tcp_accept_channels as you like to share 57 | an xml description of the state of the cluster */ 58 | tcp_accept_channel { 59 | port = {{ recv_port }} 60 | } 61 | 62 | /* Each metrics module that is referenced by gmond must be specified and 63 | loaded. If the module has been statically linked with gmond, it does 64 | not require a load path. However all dynamically loadable modules must 65 | include a load path. 
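   A hedged illustration, not part of the stock file: a dynamically loaded
   module that takes parameters could be declared roughly as below; the module
   name, path and parameter string are made up rather than modules shipped
   with gmond.
     module {
       name = "example_module"
       path = "modexample.so"
       params = "an_illustrative_parameter"
     }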
*/ 66 | modules { 67 | module { 68 | name = "core_metrics" 69 | } 70 | module { 71 | name = "cpu_module" 72 | path = "modcpu.so" 73 | } 74 | module { 75 | name = "disk_module" 76 | path = "moddisk.so" 77 | } 78 | module { 79 | name = "load_module" 80 | path = "modload.so" 81 | } 82 | module { 83 | name = "mem_module" 84 | path = "modmem.so" 85 | } 86 | module { 87 | name = "net_module" 88 | path = "modnet.so" 89 | } 90 | module { 91 | name = "proc_module" 92 | path = "modproc.so" 93 | } 94 | module { 95 | name = "sys_module" 96 | path = "modsys.so" 97 | } 98 | } 99 | 100 | include ('/etc/ganglia/conf.d/*.conf') 101 | 102 | /* The old internal 2.5.x metric array has been replaced by the following 103 | collection_group directives. What follows is the default behavior for 104 | collecting and sending metrics that is as close to 2.5.x behavior as 105 | possible. */ 106 | 107 | /* This collection group will cause a heartbeat (or beacon) to be sent every 108 | 20 seconds. In the heartbeat is the GMOND_STARTED data which expresses 109 | the age of the running gmond. */ 110 | collection_group { 111 | collect_once = yes 112 | time_threshold = 20 113 | metric { 114 | name = "heartbeat" 115 | } 116 | } 117 | 118 | /* This collection group will send general info about this host every 119 | 1200 secs. 120 | This information doesn't change between reboots and is only collected 121 | once. */ 122 | collection_group { 123 | collect_once = yes 124 | time_threshold = 1200 125 | metric { 126 | name = "cpu_num" 127 | title = "CPU Count" 128 | } 129 | metric { 130 | name = "cpu_speed" 131 | title = "CPU Speed" 132 | } 133 | metric { 134 | name = "mem_total" 135 | title = "Memory Total" 136 | } 137 | /* Should this be here? Swap can be added/removed between reboots. */ 138 | metric { 139 | name = "swap_total" 140 | title = "Swap Space Total" 141 | } 142 | metric { 143 | name = "boottime" 144 | title = "Last Boot Time" 145 | } 146 | metric { 147 | name = "machine_type" 148 | title = "Machine Type" 149 | } 150 | metric { 151 | name = "os_name" 152 | title = "Operating System" 153 | } 154 | metric { 155 | name = "os_release" 156 | title = "Operating System Release" 157 | } 158 | metric { 159 | name = "location" 160 | title = "Location" 161 | } 162 | } 163 | 164 | /* This collection group will send the status of gexecd for this host 165 | every 300 secs.*/ 166 | /* Unlike 2.5.x the default behavior is to report gexecd OFF. */ 167 | collection_group { 168 | collect_once = yes 169 | time_threshold = 300 170 | metric { 171 | name = "gexec" 172 | title = "Gexec Status" 173 | } 174 | } 175 | 176 | /* This collection group will collect the CPU status info every 20 secs. 177 | The time threshold is set to 90 seconds. In honesty, this 178 | time_threshold could be set significantly higher to reduce 179 | unneccessary network chatter. 
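   A hedged illustration of that tuning, not part of the stock file: the group
   below still samples every 20 seconds, reports whenever a reading moves by
   more than its value_threshold, and otherwise waits up to 300 seconds
   between reports. The numbers are illustrative, not recommendations.
     collection_group {
       collect_every = 20
       time_threshold = 300
       metric {
         name = "cpu_user"
         value_threshold = "5.0"
         title = "CPU User"
       }
     }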
*/ 180 | collection_group { 181 | collect_every = 20 182 | time_threshold = 90 183 | /* CPU status */ 184 | metric { 185 | name = "cpu_user" 186 | value_threshold = "1.0" 187 | title = "CPU User" 188 | } 189 | metric { 190 | name = "cpu_system" 191 | value_threshold = "1.0" 192 | title = "CPU System" 193 | } 194 | metric { 195 | name = "cpu_idle" 196 | value_threshold = "5.0" 197 | title = "CPU Idle" 198 | } 199 | metric { 200 | name = "cpu_nice" 201 | value_threshold = "1.0" 202 | title = "CPU Nice" 203 | } 204 | metric { 205 | name = "cpu_aidle" 206 | value_threshold = "5.0" 207 | title = "CPU aidle" 208 | } 209 | metric { 210 | name = "cpu_wio" 211 | value_threshold = "1.0" 212 | title = "CPU wio" 213 | } 214 | /* The next two metrics are optional if you want more detail... 215 | ... since they are accounted for in cpu_system. 216 | metric { 217 | name = "cpu_intr" 218 | value_threshold = "1.0" 219 | title = "CPU intr" 220 | } 221 | metric { 222 | name = "cpu_sintr" 223 | value_threshold = "1.0" 224 | title = "CPU sintr" 225 | } 226 | */ 227 | } 228 | 229 | collection_group { 230 | collect_every = 20 231 | time_threshold = 90 232 | /* Load Averages */ 233 | metric { 234 | name = "load_one" 235 | value_threshold = "1.0" 236 | title = "One Minute Load Average" 237 | } 238 | metric { 239 | name = "load_five" 240 | value_threshold = "1.0" 241 | title = "Five Minute Load Average" 242 | } 243 | metric { 244 | name = "load_fifteen" 245 | value_threshold = "1.0" 246 | title = "Fifteen Minute Load Average" 247 | } 248 | } 249 | 250 | /* This group collects the number of running and total processes */ 251 | collection_group { 252 | collect_every = 80 253 | time_threshold = 950 254 | metric { 255 | name = "proc_run" 256 | value_threshold = "1.0" 257 | title = "Total Running Processes" 258 | } 259 | metric { 260 | name = "proc_total" 261 | value_threshold = "1.0" 262 | title = "Total Processes" 263 | } 264 | } 265 | 266 | /* This collection group grabs the volatile memory metrics every 40 secs and 267 | sends them at least every 180 secs. This time_threshold can be increased 268 | significantly to reduce unneeded network traffic. 
*/ 269 | collection_group { 270 | collect_every = 40 271 | time_threshold = 180 272 | metric { 273 | name = "mem_free" 274 | value_threshold = "1024.0" 275 | title = "Free Memory" 276 | } 277 | metric { 278 | name = "mem_shared" 279 | value_threshold = "1024.0" 280 | title = "Shared Memory" 281 | } 282 | metric { 283 | name = "mem_buffers" 284 | value_threshold = "1024.0" 285 | title = "Memory Buffers" 286 | } 287 | metric { 288 | name = "mem_cached" 289 | value_threshold = "1024.0" 290 | title = "Cached Memory" 291 | } 292 | metric { 293 | name = "swap_free" 294 | value_threshold = "1024.0" 295 | title = "Free Swap Space" 296 | } 297 | } 298 | 299 | collection_group { 300 | collect_every = 40 301 | time_threshold = 300 302 | metric { 303 | name = "bytes_out" 304 | value_threshold = 4096 305 | title = "Bytes Sent" 306 | } 307 | metric { 308 | name = "bytes_in" 309 | value_threshold = 4096 310 | title = "Bytes Received" 311 | } 312 | metric { 313 | name = "pkts_in" 314 | value_threshold = 256 315 | title = "Packets Received" 316 | } 317 | metric { 318 | name = "pkts_out" 319 | value_threshold = 256 320 | title = "Packets Sent" 321 | } 322 | } 323 | 324 | /* Different than 2.5.x default since the old config made no sense */ 325 | collection_group { 326 | collect_every = 1800 327 | time_threshold = 3600 328 | metric { 329 | name = "disk_total" 330 | value_threshold = 1.0 331 | title = "Total Disk Space" 332 | } 333 | } 334 | 335 | collection_group { 336 | collect_every = 40 337 | time_threshold = 180 338 | metric { 339 | name = "disk_free" 340 | value_threshold = 1.0 341 | title = "Disk Space Available" 342 | } 343 | metric { 344 | name = "part_max_used" 345 | value_threshold = 1.0 346 | title = "Maximum Disk Space Used" 347 | } 348 | } 349 | 350 | -------------------------------------------------------------------------------- /roles/elk-log-server/templates/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | ##################### Elasticsearch Configuration Example ##################### 2 | 3 | # This file contains an overview of various configuration settings, 4 | # targeted at operations staff. Application developers should 5 | # consult the guide at . 6 | # 7 | # The installation procedure is covered at 8 | # . 9 | # 10 | # Elasticsearch comes with reasonable defaults for most settings, 11 | # so you can try it out without bothering with configuration. 12 | # 13 | # Most of the time, these defaults are just fine for running a production 14 | # cluster. If you're fine-tuning your cluster, or wondering about the 15 | # effect of certain configuration option, please _do ask_ on the 16 | # mailing list or IRC channel [http://elasticsearch.org/community]. 17 | 18 | # Any element in the configuration can be replaced with environment variables 19 | # by placing them in ${...} notation. 
For example: 20 | # 21 | #node.rack: ${RACK_ENV_VAR} 22 | 23 | # For information on supported formats and syntax for the config file, see 24 | # 25 | 26 | 27 | ################################### Added by user ############################# 28 | # Based on tutorial in 29 | # https://www.digitalocean.com/community/tutorials/how-to-use-logstash-and-kibana-to-centralize-logs-on-centos-6 30 | # 31 | 32 | script.disable_dynamic: true 33 | network.host: localhost 34 | discovery.zen.ping.multicast.enabled: false 35 | 36 | 37 | ################################### Cluster ################################### 38 | 39 | # Cluster name identifies your cluster for auto-discovery. If you're running 40 | # multiple clusters on the same network, make sure you're using unique names. 41 | # 42 | #cluster.name: elasticsearch 43 | 44 | 45 | #################################### Node ##################################### 46 | 47 | # Node names are generated dynamically on startup, so you're relieved 48 | # from configuring them manually. You can tie this node to a specific name: 49 | # 50 | #node.name: "Franz Kafka" 51 | 52 | # Every node can be configured to allow or deny being eligible as the master, 53 | # and to allow or deny to store the data. 54 | # 55 | # Allow this node to be eligible as a master node (enabled by default): 56 | # 57 | #node.master: true 58 | # 59 | # Allow this node to store data (enabled by default): 60 | # 61 | #node.data: true 62 | 63 | # You can exploit these settings to design advanced cluster topologies. 64 | # 65 | # 1. You want this node to never become a master node, only to hold data. 66 | # This will be the "workhorse" of your cluster. 67 | # 68 | #node.master: false 69 | #node.data: true 70 | # 71 | # 2. You want this node to only serve as a master: to not store any data and 72 | # to have free resources. This will be the "coordinator" of your cluster. 73 | # 74 | #node.master: true 75 | #node.data: false 76 | # 77 | # 3. You want this node to be neither master nor data node, but 78 | # to act as a "search load balancer" (fetching data from nodes, 79 | # aggregating results, etc.) 80 | # 81 | #node.master: false 82 | #node.data: false 83 | 84 | # Use the Cluster Health API [http://localhost:9200/_cluster/health], the 85 | # Node Info API [http://localhost:9200/_nodes] or GUI tools 86 | # such as , 87 | # , 88 | # and 89 | # to inspect the cluster state. 90 | 91 | # A node can have generic attributes associated with it, which can later be used 92 | # for customized shard allocation filtering, or allocation awareness. An attribute 93 | # is a simple key value pair, similar to node.key: value, here is an example: 94 | # 95 | #node.rack: rack314 96 | 97 | # By default, multiple nodes are allowed to start from the same installation location 98 | # to disable it, set the following: 99 | #node.max_local_storage_nodes: 1 100 | 101 | 102 | #################################### Index #################################### 103 | 104 | # You can set a number of options (such as shard/replica options, mapping 105 | # or analyzer definitions, translog settings, ...) for indices globally, 106 | # in this file. 107 | # 108 | # Note, that it makes more sense to configure index settings specifically for 109 | # a certain index, either when creating it or by using the index templates API. 110 | # 111 | # See and 112 | # 113 | # for more information. 
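# A hedged sketch of that per-index alternative on the 1.x REST API, not part
# of the stock file; the index name is made up:
#
#   curl -XPUT 'http://localhost:9200/example_index/' -d '
#   { "settings": { "number_of_shards": 1, "number_of_replicas": 0 } }'
#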
114 | 115 | # Set the number of shards (splits) of an index (5 by default): 116 | # 117 | #index.number_of_shards: 5 118 | 119 | # Set the number of replicas (additional copies) of an index (1 by default): 120 | # 121 | #index.number_of_replicas: 1 122 | 123 | # Note, that for development on a local machine, with small indices, it usually 124 | # makes sense to "disable" the distributed features: 125 | # 126 | #index.number_of_shards: 1 127 | #index.number_of_replicas: 0 128 | 129 | # These settings directly affect the performance of index and search operations 130 | # in your cluster. Assuming you have enough machines to hold shards and 131 | # replicas, the rule of thumb is: 132 | # 133 | # 1. Having more *shards* enhances the _indexing_ performance and allows to 134 | # _distribute_ a big index across machines. 135 | # 2. Having more *replicas* enhances the _search_ performance and improves the 136 | # cluster _availability_. 137 | # 138 | # The "number_of_shards" is a one-time setting for an index. 139 | # 140 | # The "number_of_replicas" can be increased or decreased anytime, 141 | # by using the Index Update Settings API. 142 | # 143 | # Elasticsearch takes care about load balancing, relocating, gathering the 144 | # results from nodes, etc. Experiment with different settings to fine-tune 145 | # your setup. 146 | 147 | # Use the Index Status API () to inspect 148 | # the index status. 149 | 150 | 151 | #################################### Paths #################################### 152 | 153 | # Path to directory containing configuration (this file and logging.yml): 154 | # 155 | #path.conf: /path/to/conf 156 | 157 | # Path to directory where to store index data allocated for this node. 158 | # 159 | #path.data: /path/to/data 160 | # 161 | # Can optionally include more than one location, causing data to be striped across 162 | # the locations (a la RAID 0) on a file level, favouring locations with most free 163 | # space on creation. For example: 164 | # 165 | #path.data: /path/to/data1,/path/to/data2 166 | 167 | # Path to temporary files: 168 | # 169 | #path.work: /path/to/work 170 | 171 | # Path to log files: 172 | # 173 | #path.logs: /path/to/logs 174 | 175 | # Path to where plugins are installed: 176 | # 177 | #path.plugins: /path/to/plugins 178 | 179 | 180 | #################################### Plugin ################################### 181 | 182 | # If a plugin listed here is not installed for current node, the node will not start. 183 | # 184 | #plugin.mandatory: mapper-attachments,lang-groovy 185 | 186 | 187 | ################################### Memory #################################### 188 | 189 | # Elasticsearch performs poorly when JVM starts swapping: you should ensure that 190 | # it _never_ swaps. 191 | # 192 | # Set this property to true to lock the memory: 193 | # 194 | #bootstrap.mlockall: true 195 | 196 | # Make sure that the ES_MIN_MEM and ES_MAX_MEM environment variables are set 197 | # to the same value, and that the machine has enough memory to allocate 198 | # for Elasticsearch, leaving enough memory for the operating system itself. 199 | # 200 | # You should also make sure that the Elasticsearch process is allowed to lock 201 | # the memory, eg. by using `ulimit -l unlimited`. 
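# A hedged sketch for an RPM-based install, not part of the stock file:
# enabling bootstrap.mlockall above usually also requires letting the service
# account lock memory, e.g. in /etc/security/limits.conf:
#
#   elasticsearch soft memlock unlimited
#   elasticsearch hard memlock unlimited
#
# or by setting MAX_LOCKED_MEMORY=unlimited in /etc/sysconfig/elasticsearch.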
202 | 203 | 204 | ############################## Network And HTTP ############################### 205 | 206 | # Elasticsearch, by default, binds itself to the 0.0.0.0 address, and listens 207 | # on port [9200-9300] for HTTP traffic and on port [9300-9400] for node-to-node 208 | # communication. (the range means that if the port is busy, it will automatically 209 | # try the next port). 210 | 211 | # Set the bind address specifically (IPv4 or IPv6): 212 | # 213 | #network.bind_host: 192.168.0.1 214 | 215 | # Set the address other nodes will use to communicate with this node. If not 216 | # set, it is automatically derived. It must point to an actual IP address. 217 | # 218 | #network.publish_host: 192.168.0.1 219 | 220 | # Set both 'bind_host' and 'publish_host': 221 | # 222 | #network.host: 192.168.0.1 223 | 224 | # Set a custom port for the node to node communication (9300 by default): 225 | # 226 | #transport.tcp.port: 9300 227 | 228 | # Enable compression for all communication between nodes (disabled by default): 229 | # 230 | #transport.tcp.compress: true 231 | 232 | # Set a custom port to listen for HTTP traffic: 233 | # 234 | #http.port: 9200 235 | 236 | # Set a custom allowed content length: 237 | # 238 | #http.max_content_length: 100mb 239 | 240 | # Disable HTTP completely: 241 | # 242 | #http.enabled: false 243 | 244 | 245 | ################################### Gateway ################################### 246 | 247 | # The gateway allows for persisting the cluster state between full cluster 248 | # restarts. Every change to the state (such as adding an index) will be stored 249 | # in the gateway, and when the cluster starts up for the first time, 250 | # it will read its state from the gateway. 251 | 252 | # There are several types of gateway implementations. For more information, see 253 | # . 254 | 255 | # The default gateway type is the "local" gateway (recommended): 256 | # 257 | #gateway.type: local 258 | 259 | # Settings below control how and when to start the initial recovery process on 260 | # a full cluster restart (to reuse as much local data as possible when using shared 261 | # gateway). 262 | 263 | # Allow recovery process after N nodes in a cluster are up: 264 | # 265 | #gateway.recover_after_nodes: 1 266 | 267 | # Set the timeout to initiate the recovery process, once the N nodes 268 | # from previous setting are up (accepts time value): 269 | # 270 | #gateway.recover_after_time: 5m 271 | 272 | # Set how many nodes are expected in this cluster. Once these N nodes 273 | # are up (and recover_after_nodes is met), begin recovery process immediately 274 | # (without waiting for recover_after_time to expire): 275 | # 276 | #gateway.expected_nodes: 2 277 | 278 | 279 | ############################# Recovery Throttling ############################# 280 | 281 | # These settings allow to control the process of shards allocation between 282 | # nodes during initial recovery, replica allocation, rebalancing, 283 | # or when adding and removing nodes. 284 | 285 | # Set the number of concurrent recoveries happening on a node: 286 | # 287 | # 1. During the initial recovery 288 | # 289 | #cluster.routing.allocation.node_initial_primaries_recoveries: 4 290 | # 291 | # 2. During adding/removing nodes, rebalancing, etc 292 | # 293 | #cluster.routing.allocation.node_concurrent_recoveries: 2 294 | 295 | # Set to throttle throughput when recovering (eg. 
100mb, by default 20mb): 296 | # 297 | #indices.recovery.max_bytes_per_sec: 20mb 298 | 299 | # Set to limit the number of open concurrent streams when 300 | # recovering a shard from a peer: 301 | # 302 | #indices.recovery.concurrent_streams: 5 303 | 304 | 305 | ################################## Discovery ################################## 306 | 307 | # Discovery infrastructure ensures nodes can be found within a cluster 308 | # and master node is elected. Multicast discovery is the default. 309 | 310 | # Set to ensure a node sees N other master eligible nodes to be considered 311 | # operational within the cluster. This should be set to a quorum/majority of 312 | # the master-eligible nodes in the cluster. 313 | # 314 | #discovery.zen.minimum_master_nodes: 1 315 | 316 | # Set the time to wait for ping responses from other nodes when discovering. 317 | # Set this option to a higher value on a slow or congested network 318 | # to minimize discovery failures: 319 | # 320 | #discovery.zen.ping.timeout: 3s 321 | 322 | # For more information, see 323 | # 324 | 325 | # Unicast discovery allows to explicitly control which nodes will be used 326 | # to discover the cluster. It can be used when multicast is not present, 327 | # or to restrict the cluster communication-wise. 328 | # 329 | # 1. Disable multicast discovery (enabled by default): 330 | # 331 | #discovery.zen.ping.multicast.enabled: false 332 | # 333 | # 2. Configure an initial list of master nodes in the cluster 334 | # to perform discovery when new nodes (master or data) are started: 335 | # 336 | #discovery.zen.ping.unicast.hosts: ["host1", "host2:port"] 337 | 338 | # EC2 discovery allows to use AWS EC2 API in order to perform discovery. 339 | # 340 | # You have to install the cloud-aws plugin for enabling the EC2 discovery. 341 | # 342 | # For more information, see 343 | # 344 | # 345 | # See 346 | # for a step-by-step tutorial. 347 | 348 | # GCE discovery allows to use Google Compute Engine API in order to perform discovery. 349 | # 350 | # You have to install the cloud-gce plugin for enabling the GCE discovery. 351 | # 352 | # For more information, see . 353 | 354 | # Azure discovery allows to use Azure API in order to perform discovery. 355 | # 356 | # You have to install the cloud-azure plugin for enabling the Azure discovery. 357 | # 358 | # For more information, see . 359 | 360 | ################################## Slow Log ################################## 361 | 362 | # Shard level query and fetch threshold logging. 
363 | 364 | #index.search.slowlog.threshold.query.warn: 10s 365 | #index.search.slowlog.threshold.query.info: 5s 366 | #index.search.slowlog.threshold.query.debug: 2s 367 | #index.search.slowlog.threshold.query.trace: 500ms 368 | 369 | #index.search.slowlog.threshold.fetch.warn: 1s 370 | #index.search.slowlog.threshold.fetch.info: 800ms 371 | #index.search.slowlog.threshold.fetch.debug: 500ms 372 | #index.search.slowlog.threshold.fetch.trace: 200ms 373 | 374 | #index.indexing.slowlog.threshold.index.warn: 10s 375 | #index.indexing.slowlog.threshold.index.info: 5s 376 | #index.indexing.slowlog.threshold.index.debug: 2s 377 | #index.indexing.slowlog.threshold.index.trace: 500ms 378 | 379 | ################################## GC Logging ################################ 380 | 381 | #monitor.jvm.gc.young.warn: 1000ms 382 | #monitor.jvm.gc.young.info: 700ms 383 | #monitor.jvm.gc.young.debug: 400ms 384 | 385 | #monitor.jvm.gc.old.warn: 10s 386 | #monitor.jvm.gc.old.info: 5s 387 | #monitor.jvm.gc.old.debug: 2s 388 | 389 | ################################## Security ################################ 390 | 391 | # Uncomment if you want to enable JSONP as a valid return transport on the 392 | # http server. With this enabled, it may pose a security risk, so disabling 393 | # it unless you need it is recommended (it is disabled by default). 394 | # 395 | #http.jsonp.enable: true 396 | --------------------------------------------------------------------------------
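A hedged usage note, not part of the repository: once the elk-log-server role has rendered the file above, the Cluster Health API it mentions gives a quick sanity check when run on the log server itself, since network.host is bound to localhost:

    curl 'http://localhost:9200/_cluster/health?pretty'

The exact port and any overrides depend on the play, so treat this as a sketch rather than as part of the configuration.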