├── .gitignore ├── README.md ├── Vagrantfile ├── ansible ├── ansible.cfg ├── cluster.yml ├── inventories │ └── vbox │ │ ├── group_vars │ │ ├── all │ │ ├── cassandra │ │ ├── flink │ │ ├── hadoop-master │ │ ├── hadoop-slave │ │ ├── kafka │ │ └── spark │ │ └── hosts ├── network.yml └── roles │ ├── cassandra │ ├── defaults │ │ └── main.yml │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── cassandra.service.j2 │ │ └── cassandra.yaml.j2 │ ├── common │ ├── tasks │ │ └── main.yml │ └── templates │ │ └── hosts.j2 │ ├── flink │ ├── defaults │ │ └── main.yml │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── flink.service.j2 │ │ ├── status-flink-yarn.sh.j2 │ │ └── stop-flink-yarn.sh.j2 │ ├── hadoop │ ├── defaults │ │ └── main.yml │ ├── tasks │ │ └── main.yml │ └── templates │ │ ├── capacity-scheduler.xml.j2 │ │ ├── core-site.xml.j2 │ │ ├── hadoop-env.sh.j2 │ │ ├── hdfs-datanode.service.j2 │ │ ├── hdfs-namenode.service.j2 │ │ ├── yarn-env.sh.j2 │ │ ├── yarn-nodemanager.service.j2 │ │ ├── yarn-resourcemanager.service.j2 │ │ └── yarn-site.xml.j2 │ ├── java │ ├── defaults │ │ └── main.yml │ └── tasks │ │ └── main.yml │ ├── kafka │ ├── defaults │ │ └── main.yml │ ├── handlers │ │ └── main.yml │ ├── tasks │ │ ├── kafka.yml │ │ ├── main.yml │ │ └── zookeeper.yml │ └── templates │ │ ├── kafka.environment.j2 │ │ ├── kafka.service.j2 │ │ ├── server.properties.j2 │ │ ├── zookeeper.environment.j2 │ │ ├── zookeeper.properties.j2 │ │ └── zookeeper.service.j2 │ └── spark │ ├── defaults │ └── main.yml │ └── tasks │ └── main.yml ├── doc ├── fastdata-cluster.png ├── flink.png ├── spark-streaming.png └── yarn.png └── exchange └── spark-playground.jar /.gitignore: -------------------------------------------------------------------------------- 1 | .vagrant 2 | download 3 | exchange 4 | *.retry 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fast Data Cluster 2 | 3 | > **Warning** 4 | > Because this repo is based upon VirtualBox, which isn't available for Apple Silicon based Macs, I have to deprecate this repo. 5 | > 6 | > 2023: there are [test builds](https://www.virtualbox.org/wiki/Testbuilds) of VirtualBox for Apple Silicon, but so far they are not stable enough. 7 | 8 | ## Content 9 | 10 | In case you need a local cluster providing Kafka, Cassandra and Spark, you're in the right place. 11 | 12 | * [Apache Kafka 2.7.0](http://kafka.apache.org/27/documentation.html) 13 | * [Apache Spark 3.0.2](http://spark.apache.org/releases/spark-release-3-0-2.html) 14 | * [Apache Cassandra 4.0-beta4](http://cassandra.apache.org) 15 | * [Apache Hadoop 3.3.0](https://hadoop.apache.org/docs/r3.3.0/) 16 | * [Apache Flink 1.12.1](https://ci.apache.org/projects/flink/flink-docs-release-1.12) 17 | 18 | ## Prerequisites 19 | 20 | * [Vagrant](https://www.vagrantup.com) (tested with 2.2.14) 21 | * [VirtualBox](http://virtualbox.org) (tested with 6.1.18) 22 | * [Ansible](http://docs.ansible.com/ansible/index.html) (tested with 2.10.5) 23 | * The VMs take approx. 18 GB of RAM, so you should have more than that. 24 | 25 | 26 | :warning: Vagrant might ask you for your admin password. The reason is that `vagrant-hostsupdater` is used to make the VMs reachable by their hostnames in your network.
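Before bringing the cluster up, it can help to confirm that the host tooling roughly matches the versions tested above. A quick check, assuming `vagrant`, `VBoxManage` and `ansible` are already on your `PATH`:

```bash
vagrant --version      # tested with 2.2.14
VBoxManage --version   # tested with 6.1.18
ansible --version      # tested with 2.10.5
```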
27 | 28 | ## Init 29 | 30 | ```bash 31 | git clone https://github.com/markush81/fastdata-cluster.git 32 | vagrant up 33 | ``` 34 | 35 | ## Cluster 36 | 37 | If everything went fine, the result should be 38 | 39 | ![FastData Cluster](doc/fastdata-cluster.png) 40 | 41 | 42 | ## Coordinates 43 | 44 | #### Servers 45 | 46 | | IP | Hostname | Description | Settings | 47 | |:--- |:-- |:-- |:-- | 48 | |192.168.10.2|kafka-1|running a kafka broker| 1024 MB RAM | 49 | |192.168.10.3|kafka-2|running a kafka broker| 1024 MB RAM | 50 | |192.168.10.4|kafka-3|running a kafka broker| 1024 MB RAM | 51 | |192.168.10.5|cassandra-1|running a cassandra node| 1024 MB RAM | 52 | |192.168.10.6|cassandra-2|running a cassandra node| 1024 MB RAM | 53 | |192.168.10.7|cassandra-3|running a cassandra node| 1024 MB RAM | 54 | |192.168.10.8|hadoop-1|running a yarn resourcemanager and nodemanager, hdfs namenode, spark distribution, flink distribution| 4096 MB RAM | 55 | |192.168.10.9|hadoop-2|running a yarn nodemanager, hdfs datanode | 4096 MB RAM | 56 | |192.168.10.10|hadoop-3|running a yarn nodemanager, hdfs datanode | 4096 MB RAM | 57 | 58 | ### Connections 59 | 60 | | Name | Connection | 61 | |:-- |:-- | 62 | |Zookeeper|kafka-1:2181,kafka-2:2181,kafka-3:2181| 63 | |Kafka Brokers|kafka-1:9092,kafka-2:9092,kafka-3:9092| 64 | |Cassandra Hosts|cassandra-1,cassandra-2,cassandra-3| 65 | |YARN Resource Manager|[http://hadoop-1:8088](http://hadoop-1:8088)| 66 | |HDFS Namenode UI|[http://hadoop-1:9870](http://hadoop-1:9870)| 67 | 68 | # Usage 69 | 70 | 71 | ## Cassandra 72 | 73 | ```bash 74 | lucky:~ markus$ vagrant ssh cassandra-1 75 | [vagrant@cassandra-1 ~]$ cqlsh 76 | Connected to analytics at 127.0.0.1:9042. 77 | [cqlsh 5.0.1 | Cassandra 4.0-beta4 | CQL spec 3.4.5 | Native protocol v4] 78 | Use HELP for help. 79 | cqlsh> 80 | ``` 81 | 82 | ``` 83 | cqlsh> CREATE KEYSPACE example WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 2 }; 84 | cqlsh> USE example; 85 | cqlsh:example> CREATE TABLE users (id UUID PRIMARY KEY, lastname text, firstname text ); 86 | cqlsh:example> INSERT INTO users (id, lastname, firstname) VALUES (6ab09bec-e68e-48d9-a5f8-97e6fb4c9b47, 'Mustermann','Max') USING TTL 86400 AND TIMESTAMP 123456789; 87 | cqlsh:example> SELECT * FROM users; 88 | 89 | id | firstname | lastname 90 | --------------------------------------+-----------+------------ 91 | 6ab09bec-e68e-48d9-a5f8-97e6fb4c9b47 | Max | Mustermann 92 | 93 | (1 rows) 94 | ``` 95 | 96 | Check Cluster Status: 97 | 98 | ```bash 99 | [vagrant@cassandra-1 ~]$ nodetool status 100 | Datacenter: datacenter1 101 | ======================= 102 | Status=Up/Down 103 | |/ State=Normal/Leaving/Joining/Moving 104 | -- Address Load Tokens Owns Host ID Rack 105 | UN 192.168.10.5 105.69 KiB 16 ? 74e6aff4-3561-4f48-bdbb-d030a9da0c01 rack1 106 | UN 192.168.10.7 100.65 KiB 16 ? 3b428824-a9f2-4a49-ae1d-3639fc584e92 rack1 107 | UN 192.168.10.6 100.66 KiB 16 ? 4418963f-5e94-4046-9cc1-f9614c6eae6e rack1 108 | 109 | Note: Non-system keyspaces don't have the same replication settings, effective ownership information is meaningless 110 | ``` 111 | 112 | ## Zookeeper 113 | 114 | ```bash 115 | [vagrant@kafka-1 ~]$ zookeeper-shell.sh kafka-1:2181/ 116 | Connecting to kafka-1:2181/ 117 | Welcome to ZooKeeper!
118 | JLine support is disabled 119 | 120 | WATCHER:: 121 | 122 | WatchedEvent state:SyncConnected type:None path:null 123 | ls / 124 | [admin, brokers, cluster, config, consumers, controller, controller_epoch, isr_change_notification, latest_producer_id_block, log_dir_event_notification, zookeeper] 125 | ls /brokers/ids 126 | [0, 1, 2] 127 | 128 | ``` 129 | 130 | ## Kafka 131 | 132 | ### Topic Creation 133 | 134 | ```bash 135 | lucky:~ markus$ vagrant ssh kafka-1 136 | [vagrant@kafka-1 ~]$ kafka-topics.sh --create --zookeeper kafka-1:2181 --replication-factor 2 --partitions 6 --topic sample 137 | Created topic "sample". 138 | [vagrant@kafka-1 ~]$ kafka-topics.sh --zookeeper kafka-1 --topic sample --describe 139 | Topic:sample PartitionCount:6 ReplicationFactor:2 Configs: 140 | Topic: sample Partition: 0 Leader: 1 Replicas: 1,2 Isr: 1,2 141 | Topic: sample Partition: 1 Leader: 2 Replicas: 2,3 Isr: 2,3 142 | Topic: sample Partition: 2 Leader: 3 Replicas: 3,1 Isr: 3,1 143 | Topic: sample Partition: 3 Leader: 1 Replicas: 1,3 Isr: 1,3 144 | Topic: sample Partition: 4 Leader: 2 Replicas: 2,1 Isr: 2,1 145 | Topic: sample Partition: 5 Leader: 3 Replicas: 3,2 Isr: 3,2 146 | [vagrant@kafka-1 ~]$ 147 | ``` 148 | ### Producer 149 | 150 | ```bash 151 | [vagrant@kafka-1 ~]$ kafka-console-producer.sh --broker-list kafka-1:9092,kafka-3:9092 --topic sample 152 | Hey, is Kafka up and running? 153 | ``` 154 | 155 | ### Consumer 156 | 157 | ```bash 158 | [vagrant@kafka-1 ~]$ kafka-console-consumer.sh --bootstrap-server kafka-1:9092,kafka-3:9092 --topic sample --from-beginning 159 | Hey, is Kafka up and running? 160 | ``` 161 | 162 | ## YARN 163 | 164 | The YARN ResourceManager UI can be accessed at [http://hadoop-1:8088](http://hadoop-1:8088); from there you can navigate to your application. 165 | 166 | ![YARN](doc/yarn.png) 167 | 168 | ## Spark 169 | 170 | ### Spark Examples 171 | 172 | ```bash 173 | lucky:~ markus$ vagrant ssh hadoop-1 174 | [vagrant@hadoop-1 ~]$ spark-submit --master yarn --class org.apache.spark.examples.SparkPi --deploy-mode cluster --driver-memory 512M --executor-memory 512M --num-executors 2 /usr/local/spark-3.0.2-bin-without-hadoop/examples/jars/spark-examples_2.12-3.0.2.jar 1000 175 | ``` 176 | 177 | ## Flink 178 | 179 | ### Flink Example Run 180 | 181 | #### Access Flink UI: 182 | 183 | Open [http://hadoop-1:8088/cluster](http://hadoop-1:8088/cluster), click the ID link of the "Flink session cluster" application and then follow "Tracking URL: ApplicationMaster". 184 | 185 | #### Submit a job: 186 | 187 | ```bash 188 | [vagrant@hadoop-1 ~]$ HADOOP_CLASSPATH=$(hadoop classpath) flink run /usr/local/flink-1.12.1/examples/streaming/WordCount.jar 189 | ``` 190 | 191 | 192 | ![Flink](doc/flink.png) 193 | 194 | ## Further Links 195 | 196 | - [yarn-default.xml](https://hadoop.apache.org/docs/r3.3.0/hadoop-yarn/hadoop-yarn-common/yarn-default.xml) 197 | - [core-default.xml](https://hadoop.apache.org/docs/r3.2.0/hadoop-project-dist/hadoop-common/core-default.xml) 198 | - [hdfs-default.xml](https://hadoop.apache.org/docs/r3.3.0/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml) 199 | - [Spark Documentation](https://spark.apache.org/docs/latest/) 200 | - [Apache Cassandra Documentation](http://cassandra.apache.org/doc/latest/) 201 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | ENV["LC_ALL"] = "en_US.UTF-8" 2 | 3 | # if something
gets changed here, also adapt /ansible/inventories/vbox/hosts 4 | KAFKA = 3 5 | CASSANDRA = 3 6 | HADOOP = 3 7 | 8 | Vagrant.configure("2") do |config| 9 | 10 | required_plugins = %w( vagrant-hostsupdater ) 11 | required_plugins.each do |plugin| 12 | system "vagrant plugin install #{plugin}" unless Vagrant.has_plugin? plugin 13 | end 14 | 15 | config.vm.box = "markush81/centos8-vbox-guestadditions" 16 | config.vm.box_check_update = true 17 | 18 | config.vm.synced_folder "exchange", "/home/vagrant/exchange", create: true 19 | config.vm.synced_folder "ansible", "/home/vagrant/ansible", create: true 20 | 21 | config.vm.provision :shell, inline: "ifup eth1", run: "always" 22 | 23 | (1..KAFKA).each do |i| 24 | config.vm.define "kafka-#{i}" do |kafka| 25 | kafka.vm.hostname = "kafka-#{i}" 26 | kafka.vm.provider "virtualbox" do |vb| 27 | vb.memory = "1024" 28 | vb.cpus = "1" 29 | end 30 | kafka.vm.network :private_network, ip: "192.168.10.#{1 + i}", auto_config: true 31 | 32 | if i == KAFKA 33 | kafka.vm.provision :ansible do |ansible| 34 | ansible.compatibility_mode = "2.0" 35 | ansible.limit = "kafka,cassandra,hadoop-master,hadoop-slave" 36 | ansible.playbook = "ansible/network.yml" 37 | ansible.inventory_path = "ansible/inventories/vbox" 38 | ansible.raw_arguments = [ 39 | "-vv" 40 | ] 41 | end 42 | 43 | kafka.vm.provision :ansible do |ansible| 44 | ansible.compatibility_mode = "2.0" 45 | ansible.limit = "kafka" 46 | ansible.playbook = "ansible/cluster.yml" 47 | ansible.inventory_path = "ansible/inventories/vbox" 48 | ansible.raw_arguments = [ 49 | "-vv" 50 | ] 51 | end 52 | end 53 | end 54 | end 55 | 56 | (1..CASSANDRA).each do |i| 57 | config.vm.define "cassandra-#{i}" do |cassandra| 58 | cassandra.vm.hostname = "cassandra-#{i}" 59 | cassandra.vm.provider "virtualbox" do |vb| 60 | vb.memory = "1024" 61 | vb.cpus = "1" 62 | end 63 | cassandra.vm.network :private_network, ip: "192.168.10.#{KAFKA + 1 + i }", auto_config: true 64 | 65 | if i == CASSANDRA 66 | cassandra.vm.provision :ansible do |ansible| 67 | ansible.compatibility_mode = "2.0" 68 | ansible.limit = "kafka,cassandra,hadoop-master,hadoop-slave" 69 | ansible.playbook = "ansible/network.yml" 70 | ansible.inventory_path = "ansible/inventories/vbox" 71 | ansible.raw_arguments = [ 72 | "-vv" 73 | ] 74 | end 75 | 76 | cassandra.vm.provision :ansible do |ansible| 77 | ansible.compatibility_mode = "2.0" 78 | ansible.limit = "cassandra" 79 | ansible.playbook = "ansible/cluster.yml" 80 | ansible.inventory_path = "ansible/inventories/vbox" 81 | ansible.raw_arguments = [ 82 | "-vv" 83 | ] 84 | end 85 | end 86 | end 87 | end 88 | 89 | (HADOOP).downto(1).each do |i| 90 | config.vm.define "hadoop-#{i}" do |hadoop| 91 | hadoop.vm.hostname = "hadoop-#{i}" 92 | hadoop.vm.provider "virtualbox" do |vb| 93 | vb.memory = "4096" 94 | vb.cpus = "2" 95 | end 96 | hadoop.vm.network :private_network, ip: "192.168.10.#{KAFKA + CASSANDRA + 1 + i}", auto_config: true 97 | 98 | if i == 1 99 | hadoop.vm.provision :ansible do |ansible| 100 | ansible.compatibility_mode = "2.0" 101 | ansible.limit = "kafka,cassandra,hadoop-master,hadoop-slave" 102 | ansible.playbook = "ansible/network.yml" 103 | ansible.inventory_path = "ansible/inventories/vbox" 104 | ansible.raw_arguments = [ 105 | "-vv" 106 | ] 107 | end 108 | 109 | hadoop.vm.provision :ansible do |ansible| 110 | ansible.compatibility_mode = "2.0" 111 | ansible.limit = "hadoop-master,hadoop-slave" 112 | ansible.playbook = "ansible/cluster.yml" 113 | ansible.inventory_path = "ansible/inventories/vbox" 114 | 
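        # The Ansible provisioners in this Vagrantfile are attached only to the machine that
        # is brought up last in each group (kafka-3, cassandra-3 and hadoop-1, which comes
        # last here because the hadoop loop counts down). That way all VMs of a group already
        # exist when network.yml and cluster.yml run once across the whole group via ansible.limit.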
ansible.raw_arguments = [ 115 | "-vv" 116 | ] 117 | end 118 | end 119 | end 120 | end 121 | end 122 | -------------------------------------------------------------------------------- /ansible/ansible.cfg: -------------------------------------------------------------------------------- 1 | inventory = inventories/vbox 2 | -------------------------------------------------------------------------------- /ansible/cluster.yml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | remote_user: vagrant 3 | serial: 100% 4 | roles: 5 | - java 6 | 7 | - hosts: kafka 8 | remote_user: vagrant 9 | serial: 66% 10 | roles: 11 | - kafka 12 | 13 | - hosts: cassandra 14 | remote_user: vagrant 15 | serial: 100% 16 | roles: 17 | - cassandra 18 | 19 | - hosts: hadoop-master 20 | remote_user: vagrant 21 | serial: 100% 22 | roles: 23 | - hadoop 24 | 25 | - hosts: hadoop-slave 26 | remote_user: vagrant 27 | serial: 100% 28 | roles: 29 | - hadoop 30 | 31 | - hosts: hadoop-master 32 | remote_user: vagrant 33 | serial: 100% 34 | roles: 35 | - spark 36 | - flink 37 | -------------------------------------------------------------------------------- /ansible/inventories/vbox/group_vars/all: -------------------------------------------------------------------------------- 1 | usr_local: /usr/local 2 | etc_profiles: /etc/profile.d 3 | system_units: /etc/systemd/system 4 | java_package: ["java-11-openjdk", "java-11-openjdk-devel"] -------------------------------------------------------------------------------- /ansible/inventories/vbox/group_vars/cassandra: -------------------------------------------------------------------------------- 1 | cassandra_home: "{{ usr_local }}/{{ cassandra }}" 2 | cassandra_log_dir: "{{ cassandra_home }}/logs" 3 | cassandra_cluster: "analytics" 4 | cassandra_seeds: "cassandra-1, cassandra-3" 5 | cassandra_pid: "{{ cassandra_home }}/cassandra.pid" -------------------------------------------------------------------------------- /ansible/inventories/vbox/group_vars/flink: -------------------------------------------------------------------------------- 1 | flink_home: "{{ usr_local }}/{{ flink }}" 2 | -------------------------------------------------------------------------------- /ansible/inventories/vbox/group_vars/hadoop-master: -------------------------------------------------------------------------------- 1 | hadoop_home: "{{ usr_local }}/{{ hadoop }}" 2 | hadoop_conf: "{{ hadoop_home }}/etc/hadoop" 3 | hadoop_log_dir: "{{ hadoop_home }}/logs" -------------------------------------------------------------------------------- /ansible/inventories/vbox/group_vars/hadoop-slave: -------------------------------------------------------------------------------- 1 | hadoop_home: "{{ usr_local }}/{{ hadoop }}" 2 | hadoop_conf: "{{ hadoop_home }}/etc/hadoop" 3 | hadoop_log_dir: "{{ hadoop_home }}/logs" -------------------------------------------------------------------------------- /ansible/inventories/vbox/group_vars/kafka: -------------------------------------------------------------------------------- 1 | kafka_home: /usr/local/{{ kafka }} 2 | kafka_log_dir: "{{ kafka_home }}/logs" 3 | zookeeper_data_dir: "/var/data" -------------------------------------------------------------------------------- /ansible/inventories/vbox/group_vars/spark: -------------------------------------------------------------------------------- 1 | spark_home: "{{ usr_local }}/{{ spark }}" 2 | -------------------------------------------------------------------------------- 
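The playbooks above are normally applied by the Ansible provisioners defined in the Vagrantfile, but they can also be re-applied after a change without recreating the VMs. A minimal sketch, assuming the cluster is already up; `vagrant provision` simply re-runs the provisioners attached to a machine, which target the whole group via `ansible.limit`:

```bash
# Re-apply network.yml and cluster.yml for the kafka group
vagrant provision kafka-3

# Re-apply the playbooks for all groups (these three VMs carry the provisioners)
vagrant provision kafka-3 cassandra-3 hadoop-1
```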
/ansible/inventories/vbox/hosts: -------------------------------------------------------------------------------- 1 | [zookeeper] 2 | kafka-1 3 | kafka-2 4 | kafka-3 5 | 6 | [kafka:children] 7 | zookeeper 8 | 9 | [cassandra] 10 | cassandra-1 11 | cassandra-2 12 | cassandra-3 13 | 14 | [hadoop-master] 15 | hadoop-1 16 | 17 | [hadoop-slave] 18 | hadoop-2 19 | hadoop-3 20 | 21 | [spark] 22 | hadoop-1 23 | 24 | [flink] 25 | hadoop-1 26 | -------------------------------------------------------------------------------- /ansible/network.yml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | remote_user: vagrant 3 | ignore_unreachable: true 4 | serial: 100% 5 | roles: 6 | - common 7 | -------------------------------------------------------------------------------- /ansible/roles/cassandra/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: "4.0-beta4" 3 | cassandra: apache-cassandra-{{ version }} 4 | file: "{{ cassandra }}-bin.tar.gz" 5 | url: ftp://ftp-stud.hs-esslingen.de/pub/Mirrors/ftp.apache.org/dist/cassandra/{{ version }}/{{ file }} 6 | -------------------------------------------------------------------------------- /ansible/roles/cassandra/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: install python2 4 | become: yes 5 | yum: 6 | name: "python2" 7 | state: present 8 | 9 | - name: download {{ url }} 10 | get_url: 11 | url: "{{ url }}" 12 | dest: "/tmp/{{ file }}" 13 | 14 | - name: extract /tmp/{{ file }} 15 | become: yes 16 | become_method: sudo 17 | unarchive: 18 | src: "/tmp/{{ file }}" 19 | dest: "{{ usr_local }}" 20 | copy: no 21 | creates: "{{ cassandra_home }}" 22 | 23 | - name: set ownership on {{ cassandra_home }} 24 | become: yes 25 | become_method: sudo 26 | file: 27 | path: "{{ cassandra_home }}" 28 | owner: vagrant 29 | group: vagrant 30 | recurse: yes 31 | mode: 0755 32 | 33 | - name: set PATH=$PATH:{{ cassandra_home }}/bin 34 | become: yes 35 | become_method: sudo 36 | lineinfile: 37 | dest: "{{ etc_profiles }}/cassandra.sh" 38 | create: yes 39 | state: present 40 | regexp: '^PATH' 41 | line: 'PATH=$PATH:{{ cassandra_home }}/bin' 42 | 43 | - name: copy cassandra.yaml {{ cassandra_home }}/conf/cassandra.yaml 44 | template: 45 | src: templates/cassandra.yaml.j2 46 | dest: "{{ cassandra_home }}/conf/cassandra.yaml" 47 | register: cassandra_config 48 | 49 | - name: create {{ cassandra_log_dir }} 50 | file: 51 | path: "{{ cassandra_log_dir }}" 52 | state: directory 53 | 54 | - name: install cassandra systemd unit file 55 | become: yes 56 | become_method: sudo 57 | template: 58 | src: templates/cassandra.service.j2 59 | dest: "{{ system_units }}/cassandra.service" 60 | register: cassandra_service 61 | 62 | - name: restart cassandra 63 | become: yes 64 | become_method: sudo 65 | when: cassandra_config.changed or cassandra_service.changed 66 | systemd: 67 | enabled: yes 68 | state: restarted 69 | name: cassandra 70 | daemon_reload: yes 71 | -------------------------------------------------------------------------------- /ansible/roles/cassandra/templates/cassandra.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Cassandra Service 3 | After=network.target 4 | 5 | [Service] 6 | Type=forking 7 | User=vagrant 8 | ExecStart={{ cassandra_home }}/bin/cassandra -R -p {{ cassandra_pid }} 9 | ExecStop=/bin/sh -c 'kill $(cat {{ cassandra_pid }})'
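# {{ cassandra_pid }} is written by the ExecStart command above (cassandra -R -p <pidfile>);
# stopping the unit sends a plain TERM signal to that process.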
10 | Restart=on-failure 11 | TimeoutSec=300 12 | 13 | [Install] 14 | WantedBy=multi-user.target 15 | -------------------------------------------------------------------------------- /ansible/roles/cassandra/templates/cassandra.yaml.j2: -------------------------------------------------------------------------------- 1 | # Cassandra storage config YAML 2 | 3 | # NOTE: 4 | # See https://cassandra.apache.org/doc/latest/configuration/ for 5 | # full explanations of configuration directives 6 | # /NOTE 7 | 8 | # The name of the cluster. This is mainly used to prevent machines in 9 | # one logical cluster from joining another. 10 | cluster_name: '{{ cassandra_cluster }}' 11 | 12 | # This defines the number of tokens randomly assigned to this node on the ring 13 | # The more tokens, relative to other nodes, the larger the proportion of data 14 | # that this node will store. You probably want all nodes to have the same number 15 | # of tokens assuming they have equal hardware capability. 16 | # 17 | # If you leave this unspecified, Cassandra will use the default of 1 token for legacy compatibility, 18 | # and will use the initial_token as described below. 19 | # 20 | # Specifying initial_token will override this setting on the node's initial start, 21 | # on subsequent starts, this setting will apply even if initial token is set. 22 | # 23 | # See https://cassandra.apache.org/doc/latest/getting_started/production.html#tokens for 24 | # best practice information about num_tokens. 25 | # 26 | num_tokens: 16 27 | 28 | # Triggers automatic allocation of num_tokens tokens for this node. The allocation 29 | # algorithm attempts to choose tokens in a way that optimizes replicated load over 30 | # the nodes in the datacenter for the replica factor. 31 | # 32 | # The load assigned to each node will be close to proportional to its number of 33 | # vnodes. 34 | # 35 | # Only supported with the Murmur3Partitioner. 36 | 37 | # Replica factor is determined via the replication strategy used by the specified 38 | # keyspace. 39 | # allocate_tokens_for_keyspace: KEYSPACE 40 | 41 | # Replica factor is explicitly set, regardless of keyspace or datacenter. 42 | # This is the replica factor within the datacenter, like NTS. 43 | allocate_tokens_for_local_replication_factor: 3 44 | 45 | # initial_token allows you to specify tokens manually. While you can use it with 46 | # vnodes (num_tokens > 1, above) -- in which case you should provide a 47 | # comma-separated list -- it's primarily used when adding nodes to legacy clusters 48 | # that do not have vnodes enabled. 49 | # initial_token: 50 | 51 | # May either be "true" or "false" to enable globally 52 | hinted_handoff_enabled: true 53 | 54 | # When hinted_handoff_enabled is true, a black list of data centers that will not 55 | # perform hinted handoff 56 | # hinted_handoff_disabled_datacenters: 57 | # - DC1 58 | # - DC2 59 | 60 | # this defines the maximum amount of time a dead host will have hints 61 | # generated. After it has been dead this long, new hints for it will not be 62 | # created until it has been seen alive and gone down again. 63 | max_hint_window_in_ms: 10800000 # 3 hours 64 | 65 | # Maximum throttle in KBs per second, per delivery thread. This will be 66 | # reduced proportionally to the number of nodes in the cluster. (If there 67 | # are two nodes in the cluster, each delivery thread will use the maximum 68 | # rate; if there are three, each will throttle to half of the maximum, 69 | # since we expect two nodes to be delivering hints simultaneously.) 
70 | hinted_handoff_throttle_in_kb: 1024 71 | 72 | # Number of threads with which to deliver hints; 73 | # Consider increasing this number when you have multi-dc deployments, since 74 | # cross-dc handoff tends to be slower 75 | max_hints_delivery_threads: 2 76 | 77 | # Directory where Cassandra should store hints. 78 | # If not set, the default directory is $CASSANDRA_HOME/data/hints. 79 | # hints_directory: /var/lib/cassandra/hints 80 | 81 | # How often hints should be flushed from the internal buffers to disk. 82 | # Will *not* trigger fsync. 83 | hints_flush_period_in_ms: 10000 84 | 85 | # Maximum size for a single hints file, in megabytes. 86 | max_hints_file_size_in_mb: 128 87 | 88 | # Compression to apply to the hint files. If omitted, hints files 89 | # will be written uncompressed. LZ4, Snappy, and Deflate compressors 90 | # are supported. 91 | #hints_compression: 92 | # - class_name: LZ4Compressor 93 | # parameters: 94 | # - 95 | 96 | # Maximum throttle in KBs per second, total. This will be 97 | # reduced proportionally to the number of nodes in the cluster. 98 | batchlog_replay_throttle_in_kb: 1024 99 | 100 | # Authentication backend, implementing IAuthenticator; used to identify users 101 | # Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthenticator, 102 | # PasswordAuthenticator}. 103 | # 104 | # - AllowAllAuthenticator performs no checks - set it to disable authentication. 105 | # - PasswordAuthenticator relies on username/password pairs to authenticate 106 | # users. It keeps usernames and hashed passwords in system_auth.roles table. 107 | # Please increase system_auth keyspace replication factor if you use this authenticator. 108 | # If using PasswordAuthenticator, CassandraRoleManager must also be used (see below) 109 | authenticator: AllowAllAuthenticator 110 | 111 | # Authorization backend, implementing IAuthorizer; used to limit access/provide permissions 112 | # Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthorizer, 113 | # CassandraAuthorizer}. 114 | # 115 | # - AllowAllAuthorizer allows any action to any user - set it to disable authorization. 116 | # - CassandraAuthorizer stores permissions in system_auth.role_permissions table. Please 117 | # increase system_auth keyspace replication factor if you use this authorizer. 118 | authorizer: AllowAllAuthorizer 119 | 120 | # Part of the Authentication & Authorization backend, implementing IRoleManager; used 121 | # to maintain grants and memberships between roles. 122 | # Out of the box, Cassandra provides org.apache.cassandra.auth.CassandraRoleManager, 123 | # which stores role information in the system_auth keyspace. Most functions of the 124 | # IRoleManager require an authenticated login, so unless the configured IAuthenticator 125 | # actually implements authentication, most of this functionality will be unavailable. 126 | # 127 | # - CassandraRoleManager stores role data in the system_auth keyspace. Please 128 | # increase system_auth keyspace replication factor if you use this role manager. 129 | role_manager: CassandraRoleManager 130 | 131 | # Network authorization backend, implementing INetworkAuthorizer; used to restrict user 132 | # access to certain DCs 133 | # Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllNetworkAuthorizer, 134 | # CassandraNetworkAuthorizer}. 135 | # 136 | # - AllowAllNetworkAuthorizer allows access to any DC to any user - set it to disable authorization. 
137 | # - CassandraNetworkAuthorizer stores permissions in system_auth.network_permissions table. Please 138 | # increase system_auth keyspace replication factor if you use this authorizer. 139 | network_authorizer: AllowAllNetworkAuthorizer 140 | 141 | # Validity period for roles cache (fetching granted roles can be an expensive 142 | # operation depending on the role manager, CassandraRoleManager is one example) 143 | # Granted roles are cached for authenticated sessions in AuthenticatedUser and 144 | # after the period specified here, become eligible for (async) reload. 145 | # Defaults to 2000, set to 0 to disable caching entirely. 146 | # Will be disabled automatically for AllowAllAuthenticator. 147 | roles_validity_in_ms: 2000 148 | 149 | # Refresh interval for roles cache (if enabled). 150 | # After this interval, cache entries become eligible for refresh. Upon next 151 | # access, an async reload is scheduled and the old value returned until it 152 | # completes. If roles_validity_in_ms is non-zero, then this must be 153 | # also. 154 | # Defaults to the same value as roles_validity_in_ms. 155 | # roles_update_interval_in_ms: 2000 156 | 157 | # Validity period for permissions cache (fetching permissions can be an 158 | # expensive operation depending on the authorizer, CassandraAuthorizer is 159 | # one example). Defaults to 2000, set to 0 to disable. 160 | # Will be disabled automatically for AllowAllAuthorizer. 161 | permissions_validity_in_ms: 2000 162 | 163 | # Refresh interval for permissions cache (if enabled). 164 | # After this interval, cache entries become eligible for refresh. Upon next 165 | # access, an async reload is scheduled and the old value returned until it 166 | # completes. If permissions_validity_in_ms is non-zero, then this must be 167 | # also. 168 | # Defaults to the same value as permissions_validity_in_ms. 169 | # permissions_update_interval_in_ms: 2000 170 | 171 | # Validity period for credentials cache. This cache is tightly coupled to 172 | # the provided PasswordAuthenticator implementation of IAuthenticator. If 173 | # another IAuthenticator implementation is configured, this cache will not 174 | # be automatically used and so the following settings will have no effect. 175 | # Please note, credentials are cached in their encrypted form, so while 176 | # activating this cache may reduce the number of queries made to the 177 | # underlying table, it may not bring a significant reduction in the 178 | # latency of individual authentication attempts. 179 | # Defaults to 2000, set to 0 to disable credentials caching. 180 | credentials_validity_in_ms: 2000 181 | 182 | # Refresh interval for credentials cache (if enabled). 183 | # After this interval, cache entries become eligible for refresh. Upon next 184 | # access, an async reload is scheduled and the old value returned until it 185 | # completes. If credentials_validity_in_ms is non-zero, then this must be 186 | # also. 187 | # Defaults to the same value as credentials_validity_in_ms. 188 | # credentials_update_interval_in_ms: 2000 189 | 190 | # The partitioner is responsible for distributing groups of rows (by 191 | # partition key) across nodes in the cluster. The partitioner can NOT be 192 | # changed without reloading all data. If you are adding nodes or upgrading, 193 | # you should set this to the same partitioner that you are currently using. 194 | # 195 | # The default partitioner is the Murmur3Partitioner. 
Older partitioners 196 | # such as the RandomPartitioner, ByteOrderedPartitioner, and 197 | # OrderPreservingPartitioner have been included for backward compatibility only. 198 | # For new clusters, you should NOT change this value. 199 | # 200 | partitioner: org.apache.cassandra.dht.Murmur3Partitioner 201 | 202 | # Directories where Cassandra should store data on disk. If multiple 203 | # directories are specified, Cassandra will spread data evenly across 204 | # them by partitioning the token ranges. 205 | # If not set, the default directory is $CASSANDRA_HOME/data/data. 206 | # data_file_directories: 207 | # - /var/lib/cassandra/data 208 | 209 | # commit log. when running on magnetic HDD, this should be a 210 | # separate spindle than the data directories. 211 | # If not set, the default directory is $CASSANDRA_HOME/data/commitlog. 212 | # commitlog_directory: /var/lib/cassandra/commitlog 213 | 214 | # Enable / disable CDC functionality on a per-node basis. This modifies the logic used 215 | # for write path allocation rejection (standard: never reject. cdc: reject Mutation 216 | # containing a CDC-enabled table if at space limit in cdc_raw_directory). 217 | cdc_enabled: false 218 | 219 | # CommitLogSegments are moved to this directory on flush if cdc_enabled: true and the 220 | # segment contains mutations for a CDC-enabled table. This should be placed on a 221 | # separate spindle than the data directories. If not set, the default directory is 222 | # $CASSANDRA_HOME/data/cdc_raw. 223 | # cdc_raw_directory: /var/lib/cassandra/cdc_raw 224 | 225 | # Policy for data disk failures: 226 | # 227 | # die 228 | # shut down gossip and client transports and kill the JVM for any fs errors or 229 | # single-sstable errors, so the node can be replaced. 230 | # 231 | # stop_paranoid 232 | # shut down gossip and client transports even for single-sstable errors, 233 | # kill the JVM for errors during startup. 234 | # 235 | # stop 236 | # shut down gossip and client transports, leaving the node effectively dead, but 237 | # can still be inspected via JMX, kill the JVM for errors during startup. 238 | # 239 | # best_effort 240 | # stop using the failed disk and respond to requests based on 241 | # remaining available sstables. This means you WILL see obsolete 242 | # data at CL.ONE! 243 | # 244 | # ignore 245 | # ignore fatal errors and let requests fail, as in pre-1.2 Cassandra 246 | disk_failure_policy: stop 247 | 248 | # Policy for commit disk failures: 249 | # 250 | # die 251 | # shut down the node and kill the JVM, so the node can be replaced. 252 | # 253 | # stop 254 | # shut down the node, leaving the node effectively dead, but 255 | # can still be inspected via JMX. 256 | # 257 | # stop_commit 258 | # shutdown the commit log, letting writes collect but 259 | # continuing to service reads, as in pre-2.0.5 Cassandra 260 | # 261 | # ignore 262 | # ignore fatal errors and let the batches fail 263 | commit_failure_policy: stop 264 | 265 | # Maximum size of the native protocol prepared statement cache 266 | # 267 | # Valid values are either "auto" (omitting the value) or a value greater 0. 268 | # 269 | # Note that specifying a too large value will result in long running GCs and possbily 270 | # out-of-memory errors. Keep the value at a small fraction of the heap. 
271 | # 272 | # If you constantly see "prepared statements discarded in the last minute because 273 | # cache limit reached" messages, the first step is to investigate the root cause 274 | # of these messages and check whether prepared statements are used correctly - 275 | # i.e. use bind markers for variable parts. 276 | # 277 | # Do only change the default value, if you really have more prepared statements than 278 | # fit in the cache. In most cases it is not neccessary to change this value. 279 | # Constantly re-preparing statements is a performance penalty. 280 | # 281 | # Default value ("auto") is 1/256th of the heap or 10MB, whichever is greater 282 | prepared_statements_cache_size_mb: 283 | 284 | # Maximum size of the key cache in memory. 285 | # 286 | # Each key cache hit saves 1 seek and each row cache hit saves 2 seeks at the 287 | # minimum, sometimes more. The key cache is fairly tiny for the amount of 288 | # time it saves, so it's worthwhile to use it at large numbers. 289 | # The row cache saves even more time, but must contain the entire row, 290 | # so it is extremely space-intensive. It's best to only use the 291 | # row cache if you have hot rows or static rows. 292 | # 293 | # NOTE: if you reduce the size, you may not get you hottest keys loaded on startup. 294 | # 295 | # Default value is empty to make it "auto" (min(5% of Heap (in MB), 100MB)). Set to 0 to disable key cache. 296 | key_cache_size_in_mb: 297 | 298 | # Duration in seconds after which Cassandra should 299 | # save the key cache. Caches are saved to saved_caches_directory as 300 | # specified in this configuration file. 301 | # 302 | # Saved caches greatly improve cold-start speeds, and is relatively cheap in 303 | # terms of I/O for the key cache. Row cache saving is much more expensive and 304 | # has limited use. 305 | # 306 | # Default is 14400 or 4 hours. 307 | key_cache_save_period: 14400 308 | 309 | # Number of keys from the key cache to save 310 | # Disabled by default, meaning all keys are going to be saved 311 | # key_cache_keys_to_save: 100 312 | 313 | # Row cache implementation class name. Available implementations: 314 | # 315 | # org.apache.cassandra.cache.OHCProvider 316 | # Fully off-heap row cache implementation (default). 317 | # 318 | # org.apache.cassandra.cache.SerializingCacheProvider 319 | # This is the row cache implementation availabile 320 | # in previous releases of Cassandra. 321 | # row_cache_class_name: org.apache.cassandra.cache.OHCProvider 322 | 323 | # Maximum size of the row cache in memory. 324 | # Please note that OHC cache implementation requires some additional off-heap memory to manage 325 | # the map structures and some in-flight memory during operations before/after cache entries can be 326 | # accounted against the cache capacity. This overhead is usually small compared to the whole capacity. 327 | # Do not specify more memory that the system can afford in the worst usual situation and leave some 328 | # headroom for OS block level cache. Do never allow your system to swap. 329 | # 330 | # Default value is 0, to disable row caching. 331 | row_cache_size_in_mb: 0 332 | 333 | # Duration in seconds after which Cassandra should save the row cache. 334 | # Caches are saved to saved_caches_directory as specified in this configuration file. 335 | # 336 | # Saved caches greatly improve cold-start speeds, and is relatively cheap in 337 | # terms of I/O for the key cache. Row cache saving is much more expensive and 338 | # has limited use. 
339 | # 340 | # Default is 0 to disable saving the row cache. 341 | row_cache_save_period: 0 342 | 343 | # Number of keys from the row cache to save. 344 | # Specify 0 (which is the default), meaning all keys are going to be saved 345 | # row_cache_keys_to_save: 100 346 | 347 | # Maximum size of the counter cache in memory. 348 | # 349 | # Counter cache helps to reduce counter locks' contention for hot counter cells. 350 | # In case of RF = 1 a counter cache hit will cause Cassandra to skip the read before 351 | # write entirely. With RF > 1 a counter cache hit will still help to reduce the duration 352 | # of the lock hold, helping with hot counter cell updates, but will not allow skipping 353 | # the read entirely. Only the local (clock, count) tuple of a counter cell is kept 354 | # in memory, not the whole counter, so it's relatively cheap. 355 | # 356 | # NOTE: if you reduce the size, you may not get you hottest keys loaded on startup. 357 | # 358 | # Default value is empty to make it "auto" (min(2.5% of Heap (in MB), 50MB)). Set to 0 to disable counter cache. 359 | # NOTE: if you perform counter deletes and rely on low gcgs, you should disable the counter cache. 360 | counter_cache_size_in_mb: 361 | 362 | # Duration in seconds after which Cassandra should 363 | # save the counter cache (keys only). Caches are saved to saved_caches_directory as 364 | # specified in this configuration file. 365 | # 366 | # Default is 7200 or 2 hours. 367 | counter_cache_save_period: 7200 368 | 369 | # Number of keys from the counter cache to save 370 | # Disabled by default, meaning all keys are going to be saved 371 | # counter_cache_keys_to_save: 100 372 | 373 | # saved caches 374 | # If not set, the default directory is $CASSANDRA_HOME/data/saved_caches. 375 | # saved_caches_directory: /var/lib/cassandra/saved_caches 376 | 377 | # commitlog_sync may be either "periodic", "group", or "batch." 378 | # 379 | # When in batch mode, Cassandra won't ack writes until the commit log 380 | # has been flushed to disk. Each incoming write will trigger the flush task. 381 | # commitlog_sync_batch_window_in_ms is a deprecated value. Previously it had 382 | # almost no value, and is being removed. 383 | # 384 | # commitlog_sync_batch_window_in_ms: 2 385 | # 386 | # group mode is similar to batch mode, where Cassandra will not ack writes 387 | # until the commit log has been flushed to disk. The difference is group 388 | # mode will wait up to commitlog_sync_group_window_in_ms between flushes. 389 | # 390 | # commitlog_sync_group_window_in_ms: 1000 391 | # 392 | # the default option is "periodic" where writes may be acked immediately 393 | # and the CommitLog is simply synced every commitlog_sync_period_in_ms 394 | # milliseconds. 395 | commitlog_sync: periodic 396 | commitlog_sync_period_in_ms: 10000 397 | 398 | # When in periodic commitlog mode, the number of milliseconds to block writes 399 | # while waiting for a slow disk flush to complete. 400 | # periodic_commitlog_sync_lag_block_in_ms: 401 | 402 | # The size of the individual commitlog file segments. A commitlog 403 | # segment may be archived, deleted, or recycled once all the data 404 | # in it (potentially from each columnfamily in the system) has been 405 | # flushed to sstables. 406 | # 407 | # The default size is 32, which is almost always fine, but if you are 408 | # archiving commitlog segments (see commitlog_archiving.properties), 409 | # then you probably want a finer granularity of archiving; 8 or 16 MB 410 | # is reasonable. 
411 | # Max mutation size is also configurable via max_mutation_size_in_kb setting in 412 | # cassandra.yaml. The default is half the size commitlog_segment_size_in_mb * 1024. 413 | # This should be positive and less than 2048. 414 | # 415 | # NOTE: If max_mutation_size_in_kb is set explicitly then commitlog_segment_size_in_mb must 416 | # be set to at least twice the size of max_mutation_size_in_kb / 1024 417 | # 418 | commitlog_segment_size_in_mb: 32 419 | 420 | # Compression to apply to the commit log. If omitted, the commit log 421 | # will be written uncompressed. LZ4, Snappy, and Deflate compressors 422 | # are supported. 423 | # commitlog_compression: 424 | # - class_name: LZ4Compressor 425 | # parameters: 426 | # - 427 | 428 | # Compression to apply to SSTables as they flush for compressed tables. 429 | # Note that tables without compression enabled do not respect this flag. 430 | # 431 | # As high ratio compressors like LZ4HC, Zstd, and Deflate can potentially 432 | # block flushes for too long, the default is to flush with a known fast 433 | # compressor in those cases. Options are: 434 | # 435 | # none : Flush without compressing blocks but while still doing checksums. 436 | # fast : Flush with a fast compressor. If the table is already using a 437 | # fast compressor that compressor is used. 438 | # table: Always flush with the same compressor that the table uses. This 439 | # was the pre 4.0 behavior. 440 | # 441 | # flush_compression: fast 442 | 443 | # any class that implements the SeedProvider interface and has a 444 | # constructor that takes a Map of parameters will do. 445 | seed_provider: 446 | # Addresses of hosts that are deemed contact points. 447 | # Cassandra nodes use this list of hosts to find each other and learn 448 | # the topology of the ring. You must change this if you are running 449 | # multiple nodes! 450 | - class_name: org.apache.cassandra.locator.SimpleSeedProvider 451 | parameters: 452 | # seeds is actually a comma-delimited list of addresses. 453 | # Ex: ",," 454 | - seeds: "{{ cassandra_seeds }}" 455 | 456 | # For workloads with more data than can fit in memory, Cassandra's 457 | # bottleneck will be reads that need to fetch data from 458 | # disk. "concurrent_reads" should be set to (16 * number_of_drives) in 459 | # order to allow the operations to enqueue low enough in the stack 460 | # that the OS and drives can reorder them. Same applies to 461 | # "concurrent_counter_writes", since counter writes read the current 462 | # values before incrementing and writing them back. 463 | # 464 | # On the other hand, since writes are almost never IO bound, the ideal 465 | # number of "concurrent_writes" is dependent on the number of cores in 466 | # your system; (8 * number_of_cores) is a good rule of thumb. 467 | concurrent_reads: 32 468 | concurrent_writes: 32 469 | concurrent_counter_writes: 32 470 | 471 | # For materialized view writes, as there is a read involved, so this should 472 | # be limited by the less of concurrent reads or concurrent writes. 473 | concurrent_materialized_view_writes: 32 474 | 475 | # Maximum memory to use for inter-node and client-server networking buffers. 476 | # 477 | # Defaults to the smaller of 1/16 of heap or 128MB. This pool is allocated off-heap, 478 | # so is in addition to the memory allocated for heap. The cache also has on-heap 479 | # overhead which is roughly 128 bytes per chunk (i.e. 0.2% of the reserved size 480 | # if the default 64k chunk size is used). 481 | # Memory is only allocated when needed. 
482 | # networking_cache_size_in_mb: 128 483 | 484 | # Enable the sstable chunk cache. The chunk cache will store recently accessed 485 | # sections of the sstable in-memory as uncompressed buffers. 486 | # file_cache_enabled: false 487 | 488 | # Maximum memory to use for sstable chunk cache and buffer pooling. 489 | # 32MB of this are reserved for pooling buffers, the rest is used for chunk cache 490 | # that holds uncompressed sstable chunks. 491 | # Defaults to the smaller of 1/4 of heap or 512MB. This pool is allocated off-heap, 492 | # so is in addition to the memory allocated for heap. The cache also has on-heap 493 | # overhead which is roughly 128 bytes per chunk (i.e. 0.2% of the reserved size 494 | # if the default 64k chunk size is used). 495 | # Memory is only allocated when needed. 496 | # file_cache_size_in_mb: 512 497 | 498 | # Flag indicating whether to allocate on or off heap when the sstable buffer 499 | # pool is exhausted, that is when it has exceeded the maximum memory 500 | # file_cache_size_in_mb, beyond which it will not cache buffers but allocate on request. 501 | 502 | # buffer_pool_use_heap_if_exhausted: true 503 | 504 | # The strategy for optimizing disk read 505 | # Possible values are: 506 | # ssd (for solid state disks, the default) 507 | # spinning (for spinning disks) 508 | # disk_optimization_strategy: ssd 509 | 510 | # Total permitted memory to use for memtables. Cassandra will stop 511 | # accepting writes when the limit is exceeded until a flush completes, 512 | # and will trigger a flush based on memtable_cleanup_threshold 513 | # If omitted, Cassandra will set both to 1/4 the size of the heap. 514 | # memtable_heap_space_in_mb: 2048 515 | # memtable_offheap_space_in_mb: 2048 516 | 517 | # memtable_cleanup_threshold is deprecated. The default calculation 518 | # is the only reasonable choice. See the comments on memtable_flush_writers 519 | # for more information. 520 | # 521 | # Ratio of occupied non-flushing memtable size to total permitted size 522 | # that will trigger a flush of the largest memtable. Larger mct will 523 | # mean larger flushes and hence less compaction, but also less concurrent 524 | # flush activity which can make it difficult to keep your disks fed 525 | # under heavy write load. 526 | # 527 | # memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1) 528 | # memtable_cleanup_threshold: 0.11 529 | 530 | # Specify the way Cassandra allocates and manages memtable memory. 531 | # Options are: 532 | # 533 | # heap_buffers 534 | # on heap nio buffers 535 | # 536 | # offheap_buffers 537 | # off heap (direct) nio buffers 538 | # 539 | # offheap_objects 540 | # off heap objects 541 | memtable_allocation_type: heap_buffers 542 | 543 | # Limit memory usage for Merkle tree calculations during repairs. The default 544 | # is 1/16th of the available heap. The main tradeoff is that smaller trees 545 | # have less resolution, which can lead to over-streaming data. If you see heap 546 | # pressure during repairs, consider lowering this, but you cannot go below 547 | # one megabyte. If you see lots of over-streaming, consider raising 548 | # this or using subrange repair. 549 | # 550 | # For more details see https://issues.apache.org/jira/browse/CASSANDRA-14096. 551 | # 552 | # repair_session_space_in_mb: 553 | 554 | # Total space to use for commit logs on disk. 555 | # 556 | # If space gets above this value, Cassandra will flush every dirty CF 557 | # in the oldest segment and remove it. 
So a small total commitlog space 558 | # will tend to cause more flush activity on less-active columnfamilies. 559 | # 560 | # The default value is the smaller of 8192, and 1/4 of the total space 561 | # of the commitlog volume. 562 | # 563 | # commitlog_total_space_in_mb: 8192 564 | 565 | # This sets the number of memtable flush writer threads per disk 566 | # as well as the total number of memtables that can be flushed concurrently. 567 | # These are generally a combination of compute and IO bound. 568 | # 569 | # Memtable flushing is more CPU efficient than memtable ingest and a single thread 570 | # can keep up with the ingest rate of a whole server on a single fast disk 571 | # until it temporarily becomes IO bound under contention typically with compaction. 572 | # At that point you need multiple flush threads. At some point in the future 573 | # it may become CPU bound all the time. 574 | # 575 | # You can tell if flushing is falling behind using the MemtablePool.BlockedOnAllocation 576 | # metric which should be 0, but will be non-zero if threads are blocked waiting on flushing 577 | # to free memory. 578 | # 579 | # memtable_flush_writers defaults to two for a single data directory. 580 | # This means that two memtables can be flushed concurrently to the single data directory. 581 | # If you have multiple data directories the default is one memtable flushing at a time 582 | # but the flush will use a thread per data directory so you will get two or more writers. 583 | # 584 | # Two is generally enough to flush on a fast disk [array] mounted as a single data directory. 585 | # Adding more flush writers will result in smaller more frequent flushes that introduce more 586 | # compaction overhead. 587 | # 588 | # There is a direct tradeoff between number of memtables that can be flushed concurrently 589 | # and flush size and frequency. More is not better you just need enough flush writers 590 | # to never stall waiting for flushing to free memory. 591 | # 592 | #memtable_flush_writers: 2 593 | 594 | # Total space to use for change-data-capture logs on disk. 595 | # 596 | # If space gets above this value, Cassandra will throw WriteTimeoutException 597 | # on Mutations including tables with CDC enabled. A CDCCompactor is responsible 598 | # for parsing the raw CDC logs and deleting them when parsing is completed. 599 | # 600 | # The default value is the min of 4096 mb and 1/8th of the total space 601 | # of the drive where cdc_raw_directory resides. 602 | # cdc_total_space_in_mb: 4096 603 | 604 | # When we hit our cdc_raw limit and the CDCCompactor is either running behind 605 | # or experiencing backpressure, we check at the following interval to see if any 606 | # new space for cdc-tracked tables has been made available. Default to 250ms 607 | # cdc_free_space_check_interval_ms: 250 608 | 609 | # A fixed memory pool size in MB for for SSTable index summaries. If left 610 | # empty, this will default to 5% of the heap size. If the memory usage of 611 | # all index summaries exceeds this limit, SSTables with low read rates will 612 | # shrink their index summaries in order to meet this limit. However, this 613 | # is a best-effort process. In extreme conditions Cassandra may need to use 614 | # more than this amount of memory. 615 | index_summary_capacity_in_mb: 616 | 617 | # How frequently index summaries should be resampled. This is done 618 | # periodically to redistribute memory from the fixed-size pool to sstables 619 | # proportional their recent read rates. 
Setting to -1 will disable this 620 | # process, leaving existing index summaries at their current sampling level. 621 | index_summary_resize_interval_in_minutes: 60 622 | 623 | # Whether to, when doing sequential writing, fsync() at intervals in 624 | # order to force the operating system to flush the dirty 625 | # buffers. Enable this to avoid sudden dirty buffer flushing from 626 | # impacting read latencies. Almost always a good idea on SSDs; not 627 | # necessarily on platters. 628 | trickle_fsync: false 629 | trickle_fsync_interval_in_kb: 10240 630 | 631 | # TCP port, for commands and data 632 | # For security reasons, you should not expose this port to the internet. Firewall it if needed. 633 | storage_port: 7000 634 | 635 | # SSL port, for legacy encrypted communication. This property is unused unless enabled in 636 | # server_encryption_options (see below). As of cassandra 4.0, this property is deprecated 637 | # as a single port can be used for either/both secure and insecure connections. 638 | # For security reasons, you should not expose this port to the internet. Firewall it if needed. 639 | ssl_storage_port: 7001 640 | 641 | # Address or interface to bind to and tell other Cassandra nodes to connect to. 642 | # You _must_ change this if you want multiple nodes to be able to communicate! 643 | # 644 | # Set listen_address OR listen_interface, not both. 645 | # 646 | # Leaving it blank leaves it up to InetAddress.getLocalHost(). This 647 | # will always do the Right Thing _if_ the node is properly configured 648 | # (hostname, name resolution, etc), and the Right Thing is to use the 649 | # address associated with the hostname (it might not be). If unresolvable 650 | # it will fall back to InetAddress.getLoopbackAddress(), which is wrong for production systems. 651 | # 652 | # Setting listen_address to 0.0.0.0 is always wrong. 653 | # 654 | listen_address: {{ inventory_hostname }} 655 | 656 | # Set listen_address OR listen_interface, not both. Interfaces must correspond 657 | # to a single address, IP aliasing is not supported. 658 | # listen_interface: eth0 659 | 660 | # If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address 661 | # you can specify which should be chosen using listen_interface_prefer_ipv6. If false the first ipv4 662 | # address will be used. If true the first ipv6 address will be used. Defaults to false preferring 663 | # ipv4. If there is only one address it will be selected regardless of ipv4/ipv6. 664 | # listen_interface_prefer_ipv6: false 665 | 666 | # Address to broadcast to other Cassandra nodes 667 | # Leaving this blank will set it to the same value as listen_address 668 | # broadcast_address: 1.2.3.4 669 | 670 | # When using multiple physical network interfaces, set this 671 | # to true to listen on broadcast_address in addition to 672 | # the listen_address, allowing nodes to communicate in both 673 | # interfaces. 674 | # Ignore this property if the network configuration automatically 675 | # routes between the public and private networks such as EC2. 676 | # listen_on_broadcast_address: false 677 | 678 | # Internode authentication backend, implementing IInternodeAuthenticator; 679 | # used to allow/disallow connections from peer nodes. 680 | # internode_authenticator: org.apache.cassandra.auth.AllowAllInternodeAuthenticator 681 | 682 | # Whether to start the native transport server. 683 | # The address on which the native transport is bound is defined by rpc_address. 
684 | start_native_transport: true 685 | # port for the CQL native transport to listen for clients on 686 | # For security reasons, you should not expose this port to the internet. Firewall it if needed. 687 | native_transport_port: 9042 688 | # Enabling native transport encryption in client_encryption_options allows you to either use 689 | # encryption for the standard port or to use a dedicated, additional port along with the unencrypted 690 | # standard native_transport_port. 691 | # Enabling client encryption and keeping native_transport_port_ssl disabled will use encryption 692 | # for native_transport_port. Setting native_transport_port_ssl to a different value 693 | # from native_transport_port will use encryption for native_transport_port_ssl while 694 | # keeping native_transport_port unencrypted. 695 | # native_transport_port_ssl: 9142 696 | # The maximum threads for handling requests (note that idle threads are stopped 697 | # after 30 seconds so there is not corresponding minimum setting). 698 | # native_transport_max_threads: 128 699 | # 700 | # The maximum size of allowed frame. Frame (requests) larger than this will 701 | # be rejected as invalid. The default is 256MB. If you're changing this parameter, 702 | # you may want to adjust max_value_size_in_mb accordingly. This should be positive and less than 2048. 703 | # native_transport_max_frame_size_in_mb: 256 704 | 705 | # The maximum number of concurrent client connections. 706 | # The default is -1, which means unlimited. 707 | # native_transport_max_concurrent_connections: -1 708 | 709 | # The maximum number of concurrent client connections per source ip. 710 | # The default is -1, which means unlimited. 711 | # native_transport_max_concurrent_connections_per_ip: -1 712 | 713 | # Controls whether Cassandra honors older, yet currently supported, protocol versions. 714 | # The default is true, which means all supported protocols will be honored. 715 | native_transport_allow_older_protocols: true 716 | 717 | # Controls when idle client connections are closed. Idle connections are ones that had neither reads 718 | # nor writes for a time period. 719 | # 720 | # Clients may implement heartbeats by sending OPTIONS native protocol message after a timeout, which 721 | # will reset idle timeout timer on the server side. To close idle client connections, corresponding 722 | # values for heartbeat intervals have to be set on the client side. 723 | # 724 | # Idle connection timeouts are disabled by default. 725 | # native_transport_idle_timeout_in_ms: 60000 726 | 727 | # The address or interface to bind the native transport server to. 728 | # 729 | # Set rpc_address OR rpc_interface, not both. 730 | # 731 | # Leaving rpc_address blank has the same effect as on listen_address 732 | # (i.e. it will be based on the configured hostname of the node). 733 | # 734 | # Note that unlike listen_address, you can specify 0.0.0.0, but you must also 735 | # set broadcast_rpc_address to a value other than 0.0.0.0. 736 | # 737 | # For security reasons, you should not expose this port to the internet. Firewall it if needed. 738 | rpc_address: 0.0.0.0 739 | 740 | # Set rpc_address OR rpc_interface, not both. Interfaces must correspond 741 | # to a single address, IP aliasing is not supported. 742 | # rpc_interface: eth1 743 | 744 | # If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address 745 | # you can specify which should be chosen using rpc_interface_prefer_ipv6. 
If false the first ipv4 746 | # address will be used. If true the first ipv6 address will be used. Defaults to false preferring 747 | # ipv4. If there is only one address it will be selected regardless of ipv4/ipv6. 748 | # rpc_interface_prefer_ipv6: false 749 | 750 | # RPC address to broadcast to drivers and other Cassandra nodes. This cannot 751 | # be set to 0.0.0.0. If left blank, this will be set to the value of 752 | # rpc_address. If rpc_address is set to 0.0.0.0, broadcast_rpc_address must 753 | # be set. 754 | broadcast_rpc_address: {{ inventory_hostname }} 755 | 756 | # enable or disable keepalive on rpc/native connections 757 | rpc_keepalive: true 758 | 759 | # Uncomment to set socket buffer size for internode communication 760 | # Note that when setting this, the buffer size is limited by net.core.wmem_max 761 | # and when not setting it it is defined by net.ipv4.tcp_wmem 762 | # See also: 763 | # /proc/sys/net/core/wmem_max 764 | # /proc/sys/net/core/rmem_max 765 | # /proc/sys/net/ipv4/tcp_wmem 766 | # /proc/sys/net/ipv4/tcp_wmem 767 | # and 'man tcp' 768 | # internode_send_buff_size_in_bytes: 769 | 770 | # Uncomment to set socket buffer size for internode communication 771 | # Note that when setting this, the buffer size is limited by net.core.wmem_max 772 | # and when not setting it it is defined by net.ipv4.tcp_wmem 773 | # internode_recv_buff_size_in_bytes: 774 | 775 | # Set to true to have Cassandra create a hard link to each sstable 776 | # flushed or streamed locally in a backups/ subdirectory of the 777 | # keyspace data. Removing these links is the operator's 778 | # responsibility. 779 | incremental_backups: false 780 | 781 | # Whether or not to take a snapshot before each compaction. Be 782 | # careful using this option, since Cassandra won't clean up the 783 | # snapshots for you. Mostly useful if you're paranoid when there 784 | # is a data format change. 785 | snapshot_before_compaction: false 786 | 787 | # Whether or not a snapshot is taken of the data before keyspace truncation 788 | # or dropping of column families. The STRONGLY advised default of true 789 | # should be used to provide data safety. If you set this flag to false, you will 790 | # lose data on truncation or drop. 791 | auto_snapshot: true 792 | 793 | # The act of creating or clearing a snapshot involves creating or removing 794 | # potentially tens of thousands of links, which can cause significant performance 795 | # impact, especially on consumer grade SSDs. A non-zero value here can 796 | # be used to throttle these links to avoid negative performance impact of 797 | # taking and clearing snapshots 798 | snapshot_links_per_second: 0 799 | 800 | # Granularity of the collation index of rows within a partition. 801 | # Increase if your rows are large, or if you have a very large 802 | # number of rows per partition. The competing goals are these: 803 | # 804 | # - a smaller granularity means more index entries are generated 805 | # and looking up rows withing the partition by collation column 806 | # is faster 807 | # - but, Cassandra will keep the collation index in memory for hot 808 | # rows (as part of the key cache), so a larger granularity means 809 | # you can cache more hot rows 810 | column_index_size_in_kb: 64 811 | 812 | # Per sstable indexed key cache entries (the collation index in memory 813 | # mentioned above) exceeding this size will not be held on heap. 814 | # This means that only partition information is held on heap and the 815 | # index entries are read from disk. 
816 | # 817 | # Note that this size refers to the size of the 818 | # serialized index information and not the size of the partition. 819 | column_index_cache_size_in_kb: 2 820 | 821 | # Number of simultaneous compactions to allow, NOT including 822 | # validation "compactions" for anti-entropy repair. Simultaneous 823 | # compactions can help preserve read performance in a mixed read/write 824 | # workload, by mitigating the tendency of small sstables to accumulate 825 | # during a single long running compactions. The default is usually 826 | # fine and if you experience problems with compaction running too 827 | # slowly or too fast, you should look at 828 | # compaction_throughput_mb_per_sec first. 829 | # 830 | # concurrent_compactors defaults to the smaller of (number of disks, 831 | # number of cores), with a minimum of 2 and a maximum of 8. 832 | # 833 | # If your data directories are backed by SSD, you should increase this 834 | # to the number of cores. 835 | #concurrent_compactors: 1 836 | 837 | # Number of simultaneous repair validations to allow. If not set or set to 838 | # a value less than 1, it defaults to the value of concurrent_compactors. 839 | # To set a value greeater than concurrent_compactors at startup, the system 840 | # property cassandra.allow_unlimited_concurrent_validations must be set to 841 | # true. To dynamically resize to a value > concurrent_compactors on a running 842 | # node, first call the bypassConcurrentValidatorsLimit method on the 843 | # org.apache.cassandra.db:type=StorageService mbean 844 | # concurrent_validations: 0 845 | 846 | # Number of simultaneous materialized view builder tasks to allow. 847 | concurrent_materialized_view_builders: 1 848 | 849 | # Throttles compaction to the given total throughput across the entire 850 | # system. The faster you insert data, the faster you need to compact in 851 | # order to keep the sstable count down, but in general, setting this to 852 | # 16 to 32 times the rate you are inserting data is more than sufficient. 853 | # Setting this to 0 disables throttling. Note that this accounts for all types 854 | # of compaction, including validation compaction (building Merkle trees 855 | # for repairs). 856 | compaction_throughput_mb_per_sec: 64 857 | 858 | # When compacting, the replacement sstable(s) can be opened before they 859 | # are completely written, and used in place of the prior sstables for 860 | # any range that has been written. This helps to smoothly transfer reads 861 | # between the sstables, reducing page cache churn and keeping hot rows hot 862 | sstable_preemptive_open_interval_in_mb: 50 863 | 864 | # When enabled, permits Cassandra to zero-copy stream entire eligible 865 | # SSTables between nodes, including every component. 866 | # This speeds up the network transfer significantly subject to 867 | # throttling specified by stream_throughput_outbound_megabits_per_sec. 868 | # Enabling this will reduce the GC pressure on sending and receiving node. 869 | # When unset, the default is enabled. While this feature tries to keep the 870 | # disks balanced, it cannot guarantee it. This feature will be automatically 871 | # disabled if internode encryption is enabled. Currently this can be used with 872 | # Leveled Compaction. Once CASSANDRA-14586 is fixed other compaction strategies 873 | # will benefit as well when used in combination with CASSANDRA-6696. 
874 | # stream_entire_sstables: true 875 | 876 | # Throttles all outbound streaming file transfers on this node to the 877 | # given total throughput in Mbps. This is necessary because Cassandra does 878 | # mostly sequential IO when streaming data during bootstrap or repair, which 879 | # can lead to saturating the network connection and degrading rpc performance. 880 | # When unset, the default is 200 Mbps or 25 MB/s. 881 | # stream_throughput_outbound_megabits_per_sec: 200 882 | 883 | # Throttles all streaming file transfer between the datacenters, 884 | # this setting allows users to throttle inter dc stream throughput in addition 885 | # to throttling all network stream traffic as configured with 886 | # stream_throughput_outbound_megabits_per_sec 887 | # When unset, the default is 200 Mbps or 25 MB/s 888 | # inter_dc_stream_throughput_outbound_megabits_per_sec: 200 889 | 890 | # How long the coordinator should wait for read operations to complete. 891 | # Lowest acceptable value is 10 ms. 892 | read_request_timeout_in_ms: 5000 893 | # How long the coordinator should wait for seq or index scans to complete. 894 | # Lowest acceptable value is 10 ms. 895 | range_request_timeout_in_ms: 10000 896 | # How long the coordinator should wait for writes to complete. 897 | # Lowest acceptable value is 10 ms. 898 | write_request_timeout_in_ms: 2000 899 | # How long the coordinator should wait for counter writes to complete. 900 | # Lowest acceptable value is 10 ms. 901 | counter_write_request_timeout_in_ms: 5000 902 | # How long a coordinator should continue to retry a CAS operation 903 | # that contends with other proposals for the same row. 904 | # Lowest acceptable value is 10 ms. 905 | cas_contention_timeout_in_ms: 1000 906 | # How long the coordinator should wait for truncates to complete 907 | # (This can be much longer, because unless auto_snapshot is disabled 908 | # we need to flush first so we can snapshot before removing the data.) 909 | # Lowest acceptable value is 10 ms. 910 | truncate_request_timeout_in_ms: 60000 911 | # The default timeout for other, miscellaneous operations. 912 | # Lowest acceptable value is 10 ms. 913 | request_timeout_in_ms: 10000 914 | 915 | # Defensive settings for protecting Cassandra from true network partitions. 916 | # See (CASSANDRA-14358) for details. 917 | # 918 | # The amount of time to wait for internode tcp connections to establish. 919 | # internode_tcp_connect_timeout_in_ms = 2000 920 | # 921 | # The amount of time unacknowledged data is allowed on a connection before we throw out the connection 922 | # Note this is only supported on Linux + epoll, and it appears to behave oddly above a setting of 30000 923 | # (it takes much longer than 30s) as of Linux 4.12. If you want something that high set this to 0 924 | # which picks up the OS default and configure the net.ipv4.tcp_retries2 sysctl to be ~8. 925 | # internode_tcp_user_timeout_in_ms = 30000 926 | 927 | # The amount of time unacknowledged data is allowed on a streaming connection. 928 | # The default is 5 minutes. Increase it or set it to 0 in order to increase the timeout. 929 | # internode_streaming_tcp_user_timeout_in_ms = 300000 930 | 931 | # The maximum continuous period a connection may be unwritable in application space 932 | # internode_application_timeout_in_ms = 30000 933 | 934 | # Global, per-endpoint and per-connection limits imposed on messages queued for delivery to other nodes 935 | # and waiting to be processed on arrival from other nodes in the cluster. 
These limits are applied to the on-wire 936 | # size of the message being sent or received. 937 | # 938 | # The basic per-link limit is consumed in isolation before any endpoint or global limit is imposed. 939 | # Each node-pair has three links: urgent, small and large. So any given node may have a maximum of 940 | # N*3*(internode_application_send_queue_capacity_in_bytes+internode_application_receive_queue_capacity_in_bytes) 941 | # messages queued without any coordination between them although in practice, with token-aware routing, only RF*tokens 942 | # nodes should need to communicate with significant bandwidth. 943 | # 944 | # The per-endpoint limit is imposed on all messages exceeding the per-link limit, simultaneously with the global limit, 945 | # on all links to or from a single node in the cluster. 946 | # The global limit is imposed on all messages exceeding the per-link limit, simultaneously with the per-endpoint limit, 947 | # on all links to or from any node in the cluster. 948 | # 949 | # internode_application_send_queue_capacity_in_bytes: 4194304 #4MiB 950 | # internode_application_send_queue_reserve_endpoint_capacity_in_bytes: 134217728 #128MiB 951 | # internode_application_send_queue_reserve_global_capacity_in_bytes: 536870912 #512MiB 952 | # internode_application_receive_queue_capacity_in_bytes: 4194304 #4MiB 953 | # internode_application_receive_queue_reserve_endpoint_capacity_in_bytes: 134217728 #128MiB 954 | # internode_application_receive_queue_reserve_global_capacity_in_bytes: 536870912 #512MiB 955 | 956 | 957 | # How long before a node logs slow queries. Select queries that take longer than 958 | # this timeout to execute, will generate an aggregated log message, so that slow queries 959 | # can be identified. Set this value to zero to disable slow query logging. 960 | slow_query_log_timeout_in_ms: 500 961 | 962 | # Enable operation timeout information exchange between nodes to accurately 963 | # measure request timeouts. If disabled, replicas will assume that requests 964 | # were forwarded to them instantly by the coordinator, which means that 965 | # under overload conditions we will waste that much extra time processing 966 | # already-timed-out requests. 967 | # 968 | # Warning: It is generally assumed that users have setup NTP on their clusters, and that clocks are modestly in sync, 969 | # since this is a requirement for general correctness of last write wins. 970 | #cross_node_timeout: true 971 | 972 | # Set keep-alive period for streaming 973 | # This node will send a keep-alive message periodically with this period. 974 | # If the node does not receive a keep-alive message from the peer for 975 | # 2 keep-alive cycles the stream session times out and fail 976 | # Default value is 300s (5 minutes), which means stalled stream 977 | # times out in 10 minutes by default 978 | # streaming_keep_alive_period_in_secs: 300 979 | 980 | # Limit number of connections per host for streaming 981 | # Increase this when you notice that joins are CPU-bound rather that network 982 | # bound (for example a few nodes with big files). 983 | # streaming_connections_per_host: 1 984 | 985 | 986 | # phi value that must be reached for a host to be marked down. 987 | # most users should never need to adjust this. 988 | # phi_convict_threshold: 8 989 | 990 | # endpoint_snitch -- Set this to a class that implements 991 | # IEndpointSnitch. 
The snitch has two functions: 992 | # 993 | # - it teaches Cassandra enough about your network topology to route 994 | # requests efficiently 995 | # - it allows Cassandra to spread replicas around your cluster to avoid 996 | # correlated failures. It does this by grouping machines into 997 | # "datacenters" and "racks." Cassandra will do its best not to have 998 | # more than one replica on the same "rack" (which may not actually 999 | # be a physical location) 1000 | # 1001 | # CASSANDRA WILL NOT ALLOW YOU TO SWITCH TO AN INCOMPATIBLE SNITCH 1002 | # ONCE DATA IS INSERTED INTO THE CLUSTER. This would cause data loss. 1003 | # This means that if you start with the default SimpleSnitch, which 1004 | # locates every node on "rack1" in "datacenter1", your only options 1005 | # if you need to add another datacenter are GossipingPropertyFileSnitch 1006 | # (and the older PFS). From there, if you want to migrate to an 1007 | # incompatible snitch like Ec2Snitch you can do it by adding new nodes 1008 | # under Ec2Snitch (which will locate them in a new "datacenter") and 1009 | # decommissioning the old ones. 1010 | # 1011 | # Out of the box, Cassandra provides: 1012 | # 1013 | # SimpleSnitch: 1014 | # Treats Strategy order as proximity. This can improve cache 1015 | # locality when disabling read repair. Only appropriate for 1016 | # single-datacenter deployments. 1017 | # 1018 | # GossipingPropertyFileSnitch 1019 | # This should be your go-to snitch for production use. The rack 1020 | # and datacenter for the local node are defined in 1021 | # cassandra-rackdc.properties and propagated to other nodes via 1022 | # gossip. If cassandra-topology.properties exists, it is used as a 1023 | # fallback, allowing migration from the PropertyFileSnitch. 1024 | # 1025 | # PropertyFileSnitch: 1026 | # Proximity is determined by rack and data center, which are 1027 | # explicitly configured in cassandra-topology.properties. 1028 | # 1029 | # Ec2Snitch: 1030 | # Appropriate for EC2 deployments in a single Region. Loads Region 1031 | # and Availability Zone information from the EC2 API. The Region is 1032 | # treated as the datacenter, and the Availability Zone as the rack. 1033 | # Only private IPs are used, so this will not work across multiple 1034 | # Regions. 1035 | # 1036 | # Ec2MultiRegionSnitch: 1037 | # Uses public IPs as broadcast_address to allow cross-region 1038 | # connectivity. (Thus, you should set seed addresses to the public 1039 | # IP as well.) You will need to open the storage_port or 1040 | # ssl_storage_port on the public IP firewall. (For intra-Region 1041 | # traffic, Cassandra will switch to the private IP after 1042 | # establishing a connection.) 1043 | # 1044 | # RackInferringSnitch: 1045 | # Proximity is determined by rack and data center, which are 1046 | # assumed to correspond to the 3rd and 2nd octet of each node's IP 1047 | # address, respectively. Unless this happens to match your 1048 | # deployment conventions, this is best used as an example of 1049 | # writing a custom Snitch class and is provided in that spirit. 1050 | # 1051 | # You can use a custom Snitch by setting this to the full class name 1052 | # of the snitch, which will be assumed to be on your classpath. 
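# This template keeps the default SimpleSnitch, which (as described above) is only
# appropriate for a single-datacenter setup such as this one. Illustrative sketch
# only, not used by this playbook: moving to GossipingPropertyFileSnitch would mean
# changing the value below and giving each node a cassandra-rackdc.properties along
# the lines of
#
#   dc=dc1
#   rack=rack1
#
# where dc1/rack1 are placeholder names.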
1053 | endpoint_snitch: SimpleSnitch 1054 | 1055 | # controls how often to perform the more expensive part of host score 1056 | # calculation 1057 | dynamic_snitch_update_interval_in_ms: 100 1058 | # controls how often to reset all host scores, allowing a bad host to 1059 | # possibly recover 1060 | dynamic_snitch_reset_interval_in_ms: 600000 1061 | # if set greater than zero, this will allow 1062 | # 'pinning' of replicas to hosts in order to increase cache capacity. 1063 | # The badness threshold will control how much worse the pinned host has to be 1064 | # before the dynamic snitch will prefer other replicas over it. This is 1065 | # expressed as a double which represents a percentage. Thus, a value of 1066 | # 0.2 means Cassandra would continue to prefer the static snitch values 1067 | # until the pinned host was 20% worse than the fastest. 1068 | dynamic_snitch_badness_threshold: 1.0 1069 | 1070 | # Configure server-to-server internode encryption 1071 | # 1072 | # JVM and netty defaults for supported SSL socket protocols and cipher suites can 1073 | # be replaced using custom encryption options. This is not recommended 1074 | # unless you have policies in place that dictate certain settings, or 1075 | # need to disable vulnerable ciphers or protocols in case the JVM cannot 1076 | # be updated. 1077 | # 1078 | # FIPS compliant settings can be configured at JVM level and should not 1079 | # involve changing encryption settings here: 1080 | # https://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/FIPS.html 1081 | # 1082 | # **NOTE** this default configuration is an insecure configuration. If you need to 1083 | # enable server-to-server encryption generate server keystores (and truststores for mutual 1084 | # authentication) per: 1085 | # http://download.oracle.com/javase/8/docs/technotes/guides/security/jsse/JSSERefGuide.html#CreateKeystore 1086 | # Then perform the following configuration changes: 1087 | # 1088 | # Step 1: Set internode_encryption=<dc|rack|all> and explicitly set optional=true. Restart all nodes 1089 | # 1090 | # Step 2: Set optional=false (or remove it) and if you generated truststores and want to use mutual 1091 | # auth set require_client_auth=true. Restart all nodes 1092 | server_encryption_options: 1093 | # On outbound connections, determine which type of peers to securely connect to. 1094 | # The available options are : 1095 | # none : Do not encrypt outgoing connections 1096 | # dc : Encrypt connections to peers in other datacenters but not within datacenters 1097 | # rack : Encrypt connections to peers in other racks but not within racks 1098 | # all : Always use encrypted connections 1099 | internode_encryption: none 1100 | # When set to true, encrypted and unencrypted connections are allowed on the storage_port 1101 | # This should _only be true_ while in unencrypted or transitional operation 1102 | # optional defaults to true if internode_encryption is none 1103 | # optional: true 1104 | # If enabled, will open up an encrypted listening socket on ssl_storage_port. Should only be used 1105 | # during upgrade to 4.0; otherwise, set to false.
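# Illustrative sketch only (not executed by this playbook; the alias, file names and
# passwords below are placeholders): per-node keystores/truststores as referenced in
# the JSSE guide above could be created with keytool, e.g.
#
#   keytool -genkeypair -keyalg RSA -alias node1 -dname "CN=node1" -keystore conf/.keystore -storepass cassandra
#   keytool -exportcert -alias node1 -keystore conf/.keystore -file node1.cer -storepass cassandra
#   keytool -importcert -alias node1 -file node1.cer -keystore conf/.truststore -storepass cassandra -noprompt
#
# so that the keystore/truststore paths and passwords line up with the settings below.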
1106 | enable_legacy_ssl_storage_port: false 1107 | # Set to a valid keystore if internode_encryption is dc, rack or all 1108 | keystore: conf/.keystore 1109 | keystore_password: cassandra 1110 | # Verify peer server certificates 1111 | require_client_auth: false 1112 | # Set to a valid trustore if require_client_auth is true 1113 | truststore: conf/.truststore 1114 | truststore_password: cassandra 1115 | # Verify that the host name in the certificate matches the connected host 1116 | require_endpoint_verification: false 1117 | # More advanced defaults: 1118 | # protocol: TLS 1119 | # store_type: JKS 1120 | # cipher_suites: [ 1121 | # TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, 1122 | # TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, 1123 | # TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, TLS_RSA_WITH_AES_128_GCM_SHA256, TLS_RSA_WITH_AES_128_CBC_SHA, 1124 | # TLS_RSA_WITH_AES_256_CBC_SHA 1125 | # ] 1126 | 1127 | # Configure client-to-server encryption. 1128 | # 1129 | # **NOTE** this default configuration is an insecure configuration. If you need to 1130 | # enable client-to-server encryption generate server keystores (and truststores for mutual 1131 | # authentication) per: 1132 | # http://download.oracle.com/javase/8/docs/technotes/guides/security/jsse/JSSERefGuide.html#CreateKeystore 1133 | # Then perform the following configuration changes: 1134 | # 1135 | # Step 1: Set enabled=true and explicitly set optional=true. Restart all nodes 1136 | # 1137 | # Step 2: Set optional=false (or remove it) and if you generated truststores and want to use mutual 1138 | # auth set require_client_auth=true. Restart all nodes 1139 | client_encryption_options: 1140 | # Enable client-to-server encryption 1141 | enabled: false 1142 | # When set to true, encrypted and unencrypted connections are allowed on the native_transport_port 1143 | # This should _only be true_ while in unencrypted or transitional operation 1144 | # optional defaults to true when enabled is false, and false when enabled is true. 1145 | # optional: true 1146 | # Set keystore and keystore_password to valid keystores if enabled is true 1147 | keystore: conf/.keystore 1148 | keystore_password: cassandra 1149 | # Verify client certificates 1150 | require_client_auth: false 1151 | # Set trustore and truststore_password if require_client_auth is true 1152 | # truststore: conf/.truststore 1153 | # truststore_password: cassandra 1154 | # More advanced defaults: 1155 | # protocol: TLS 1156 | # store_type: JKS 1157 | # cipher_suites: [ 1158 | # TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, 1159 | # TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, 1160 | # TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, TLS_RSA_WITH_AES_128_GCM_SHA256, TLS_RSA_WITH_AES_128_CBC_SHA, 1161 | # TLS_RSA_WITH_AES_256_CBC_SHA 1162 | # ] 1163 | 1164 | # internode_compression controls whether traffic between nodes is 1165 | # compressed. 1166 | # Can be: 1167 | # 1168 | # all 1169 | # all traffic is compressed 1170 | # 1171 | # dc 1172 | # traffic between different datacenters is compressed 1173 | # 1174 | # none 1175 | # nothing is compressed. 1176 | internode_compression: dc 1177 | 1178 | # Enable or disable tcp_nodelay for inter-dc communication. 
1179 | # Disabling it will result in larger (but fewer) network packets being sent, 1180 | # reducing overhead from the TCP protocol itself, at the cost of increasing 1181 | # latency if you block for cross-datacenter responses. 1182 | inter_dc_tcp_nodelay: false 1183 | 1184 | # TTL for different trace types used during logging of the repair process. 1185 | tracetype_query_ttl: 86400 1186 | tracetype_repair_ttl: 604800 1187 | 1188 | # If unset, all GC Pauses greater than gc_log_threshold_in_ms will log at 1189 | # INFO level 1190 | # UDFs (user defined functions) are disabled by default. 1191 | # As of Cassandra 3.0 there is a sandbox in place that should prevent execution of evil code. 1192 | enable_user_defined_functions: false 1193 | 1194 | # Enables scripted UDFs (JavaScript UDFs). 1195 | # Java UDFs are always enabled, if enable_user_defined_functions is true. 1196 | # Enable this option to be able to use UDFs with "language javascript" or any custom JSR-223 provider. 1197 | # This option has no effect, if enable_user_defined_functions is false. 1198 | enable_scripted_user_defined_functions: false 1199 | 1200 | # The default Windows kernel timer and scheduling resolution is 15.6ms for power conservation. 1201 | # Lowering this value on Windows can provide much tighter latency and better throughput, however 1202 | # some virtualized environments may see a negative performance impact from changing this setting 1203 | # below their system default. The sysinternals 'clockres' tool can confirm your system's default 1204 | # setting. 1205 | windows_timer_interval: 1 1206 | 1207 | 1208 | # Enables encrypting data at-rest (on disk). Different key providers can be plugged in, but the default reads from 1209 | # a JCE-style keystore. A single keystore can hold multiple keys, but the one referenced by 1210 | # the "key_alias" is the only key that will be used for encrypt opertaions; previously used keys 1211 | # can still (and should!) be in the keystore and will be used on decrypt operations 1212 | # (to handle the case of key rotation). 1213 | # 1214 | # It is strongly recommended to download and install Java Cryptography Extension (JCE) 1215 | # Unlimited Strength Jurisdiction Policy Files for your version of the JDK. 1216 | # (current link: http://www.oracle.com/technetwork/java/javase/downloads/jce8-download-2133166.html) 1217 | # 1218 | # Currently, only the following file types are supported for transparent data encryption, although 1219 | # more are coming in future cassandra releases: commitlog, hints 1220 | transparent_data_encryption_options: 1221 | enabled: false 1222 | chunk_length_kb: 64 1223 | cipher: AES/CBC/PKCS5Padding 1224 | key_alias: testing:1 1225 | # CBC IV length for AES needs to be 16 bytes (which is also the default size) 1226 | # iv_length: 16 1227 | key_provider: 1228 | - class_name: org.apache.cassandra.security.JKSKeyProvider 1229 | parameters: 1230 | - keystore: conf/.keystore 1231 | keystore_password: cassandra 1232 | store_type: JCEKS 1233 | key_password: cassandra 1234 | 1235 | 1236 | ##################### 1237 | # SAFETY THRESHOLDS # 1238 | ##################### 1239 | 1240 | # When executing a scan, within or across a partition, we need to keep the 1241 | # tombstones seen in memory so we can return them to the coordinator, which 1242 | # will use them to make sure other replicas also know about the deleted rows. 1243 | # With workloads that generate a lot of tombstones, this can cause performance 1244 | # problems and even exaust the server heap. 
1245 | # (http://www.datastax.com/dev/blog/cassandra-anti-patterns-queues-and-queue-like-datasets) 1246 | # Adjust the thresholds here if you understand the dangers and want to 1247 | # scan more tombstones anyway. These thresholds may also be adjusted at runtime 1248 | # using the StorageService mbean. 1249 | tombstone_warn_threshold: 1000 1250 | tombstone_failure_threshold: 100000 1251 | 1252 | # Filtering and secondary index queries at read consistency levels above ONE/LOCAL_ONE use a 1253 | # mechanism called replica filtering protection to ensure that results from stale replicas do 1254 | # not violate consistency. (See CASSANDRA-8272 and CASSANDRA-15907 for more details.) This 1255 | # mechanism materializes replica results by partition on-heap at the coordinator. The more possibly 1256 | # stale results returned by the replicas, the more rows materialized during the query. 1257 | replica_filtering_protection: 1258 | # These thresholds exist to limit the damage severely out-of-date replicas can cause during these 1259 | # queries. They limit the number of rows from all replicas individual index and filtering queries 1260 | # can materialize on-heap to return correct results at the desired read consistency level. 1261 | # 1262 | # "cached_replica_rows_warn_threshold" is the per-query threshold at which a warning will be logged. 1263 | # "cached_replica_rows_fail_threshold" is the per-query threshold at which the query will fail. 1264 | # 1265 | # These thresholds may also be adjusted at runtime using the StorageService mbean. 1266 | # 1267 | # If the failure threshold is breached, it is likely that either the current page/fetch size 1268 | # is too large or one or more replicas is severely out-of-sync and in need of repair. 1269 | cached_rows_warn_threshold: 2000 1270 | cached_rows_fail_threshold: 32000 1271 | 1272 | # Log WARN on any multiple-partition batch size exceeding this value. 5kb per batch by default. 1273 | # Caution should be taken on increasing the size of this threshold as it can lead to node instability. 1274 | batch_size_warn_threshold_in_kb: 5 1275 | 1276 | # Fail any multiple-partition batch exceeding this value. 50kb (10x warn threshold) by default. 1277 | batch_size_fail_threshold_in_kb: 50 1278 | 1279 | # Log WARN on any batches not of type LOGGED than span across more partitions than this limit 1280 | unlogged_batch_across_partitions_warn_threshold: 10 1281 | 1282 | # Log a warning when compacting partitions larger than this value 1283 | compaction_large_partition_warning_threshold_mb: 100 1284 | 1285 | # GC Pauses greater than 200 ms will be logged at INFO level 1286 | # This threshold can be adjusted to minimize logging if necessary 1287 | # gc_log_threshold_in_ms: 200 1288 | 1289 | # GC Pauses greater than gc_warn_threshold_in_ms will be logged at WARN level 1290 | # Adjust the threshold based on your application throughput requirement. Setting to 0 1291 | # will deactivate the feature. 1292 | # gc_warn_threshold_in_ms: 1000 1293 | 1294 | # Maximum size of any value in SSTables. Safety measure to detect SSTable corruption 1295 | # early. Any value size larger than this threshold will result into marking an SSTable 1296 | # as corrupted. This should be positive and less than 2048. 1297 | # max_value_size_in_mb: 256 1298 | 1299 | # Coalescing Strategies # 1300 | # Coalescing multiples messages turns out to significantly boost message processing throughput (think doubling or more). 
1301 | # On bare metal, the floor for packet processing throughput is high enough that many applications won't notice, but in 1302 | # virtualized environments, the point at which an application can be bound by network packet processing can be 1303 | # surprisingly low compared to the throughput of task processing that is possible inside a VM. It's not that bare metal 1304 | # doesn't benefit from coalescing messages, it's that the number of packets a bare metal network interface can process 1305 | # is sufficient for many applications such that no load starvation is experienced even without coalescing. 1306 | # There are other benefits to coalescing network messages that are harder to isolate with a simple metric like messages 1307 | # per second. By coalescing multiple tasks together, a network thread can process multiple messages for the cost of one 1308 | # trip to read from a socket, and all the task submission work can be done at the same time reducing context switching 1309 | # and increasing cache friendliness of network message processing. 1310 | # See CASSANDRA-8692 for details. 1311 | 1312 | # Strategy to use for coalescing messages in OutboundTcpConnection. 1313 | # Can be fixed, movingaverage, timehorizon, disabled (default). 1314 | # You can also specify a subclass of CoalescingStrategies.CoalescingStrategy by name. 1315 | # otc_coalescing_strategy: DISABLED 1316 | 1317 | # How many microseconds to wait for coalescing. For fixed strategy this is the amount of time after the first 1318 | # message is received before it will be sent with any accompanying messages. For moving average this is the 1319 | # maximum amount of time that will be waited as well as the interval at which messages must arrive on average 1320 | # for coalescing to be enabled. 1321 | # otc_coalescing_window_us: 200 1322 | 1323 | # Do not try to coalesce messages if we already got that many messages. This should be more than 2 and less than 128. 1324 | # otc_coalescing_enough_coalesced_messages: 8 1325 | 1326 | # How many milliseconds to wait between two expiration runs on the backlog (queue) of the OutboundTcpConnection. 1327 | # Expiration is done if messages are piling up in the backlog. Droppable messages are expired to free the memory 1328 | # taken by expired messages. The interval should be between 0 and 1000, and in most installations the default value 1329 | # will be appropriate. A smaller value could potentially expire messages slightly sooner at the expense of more CPU 1330 | # time and queue contention while iterating the backlog of messages. 1331 | # An interval of 0 disables any wait time, which is the behavior of former Cassandra versions. 1332 | # 1333 | # otc_backlog_expiration_interval_ms: 200 1334 | 1335 | # Track a metric per keyspace indicating whether replication achieved the ideal consistency 1336 | # level for writes without timing out. This is different from the consistency level requested by 1337 | # each write which may be lower in order to facilitate availability. 1338 | # ideal_consistency_level: EACH_QUORUM 1339 | 1340 | # Automatically upgrade sstables after upgrade - if there is no ordinary compaction to do, the 1341 | # oldest non-upgraded sstable will get upgraded to the latest version 1342 | # automatic_sstable_upgrade: false 1343 | # Limit the number of concurrent sstable upgrades 1344 | # max_concurrent_automatic_sstable_upgrades: 1 1345 | 1346 | # Audit logging - Logs every incoming CQL command request, authentication to a node. 
See the docs 1347 | # on audit_logging for full details about the various configuration options. 1348 | audit_logging_options: 1349 | enabled: false 1350 | logger: 1351 | - class_name: BinAuditLogger 1352 | # audit_logs_dir: 1353 | # included_keyspaces: 1354 | # excluded_keyspaces: system, system_schema, system_virtual_schema 1355 | # included_categories: 1356 | # excluded_categories: 1357 | # included_users: 1358 | # excluded_users: 1359 | # roll_cycle: HOURLY 1360 | # block: true 1361 | # max_queue_weight: 268435456 # 256 MiB 1362 | # max_log_size: 17179869184 # 16 GiB 1363 | ## archive command is "/path/to/script.sh %path" where %path is replaced with the file being rolled: 1364 | # archive_command: 1365 | # max_archive_retries: 10 1366 | 1367 | 1368 | # default options for full query logging - these can be overridden from command line when executing 1369 | # nodetool enablefullquerylog 1370 | #full_query_logging_options: 1371 | # log_dir: 1372 | # roll_cycle: HOURLY 1373 | # block: true 1374 | # max_queue_weight: 268435456 # 256 MiB 1375 | # max_log_size: 17179869184 # 16 GiB 1376 | ## archive command is "/path/to/script.sh %path" where %path is replaced with the file being rolled: 1377 | # archive_command: 1378 | # max_archive_retries: 10 1379 | 1380 | # validate tombstones on reads and compaction 1381 | # can be either "disabled", "warn" or "exception" 1382 | # corrupted_tombstone_strategy: disabled 1383 | 1384 | # Diagnostic Events # 1385 | # If enabled, diagnostic events can be helpful for troubleshooting operational issues. Emitted events contain details 1386 | # on internal state and temporal relationships across events, accessible by clients via JMX. 1387 | diagnostic_events_enabled: false 1388 | 1389 | # Use native transport TCP message coalescing. If on upgrade to 4.0 you found your throughput decreasing, and in 1390 | # particular you run an old kernel or have very fewer client connections, this option might be worth evaluating. 1391 | #native_transport_flush_in_batches_legacy: false 1392 | 1393 | # Enable tracking of repaired state of data during reads and comparison between replicas 1394 | # Mismatches between the repaired sets of replicas can be characterized as either confirmed 1395 | # or unconfirmed. In this context, unconfirmed indicates that the presence of pending repair 1396 | # sessions, unrepaired partition tombstones, or some other condition means that the disparity 1397 | # cannot be considered conclusive. Confirmed mismatches should be a trigger for investigation 1398 | # as they may be indicative of corruption or data loss. 1399 | # There are separate flags for range vs partition reads as single partition reads are only tracked 1400 | # when CL > 1 and a digest mismatch occurs. Currently, range queries don't use digests so if 1401 | # enabled for range reads, all range reads will include repaired data tracking. As this adds 1402 | # some overhead, operators may wish to disable it whilst still enabling it for partition reads 1403 | repaired_data_tracking_for_range_reads_enabled: false 1404 | repaired_data_tracking_for_partition_reads_enabled: false 1405 | # If false, only confirmed mismatches will be reported. If true, a separate metric for unconfirmed 1406 | # mismatches will also be recorded. This is to avoid potential signal:noise issues are unconfirmed 1407 | # mismatches are less actionable than confirmed ones. 
1408 | report_unconfirmed_repaired_data_mismatches: false 1409 | 1410 | # Having many tables and/or keyspaces negatively affects performance of many operations in the 1411 | # cluster. When the number of tables/keyspaces in the cluster exceeds the following thresholds 1412 | # a client warning will be sent back to the user when creating a table or keyspace. 1413 | # table_count_warn_threshold: 150 1414 | # keyspace_count_warn_threshold: 40 1415 | 1416 | ######################### 1417 | # EXPERIMENTAL FEATURES # 1418 | ######################### 1419 | 1420 | # Enables materialized view creation on this node. 1421 | # Materialized views are considered experimental and are not recommended for production use. 1422 | enable_materialized_views: false 1423 | 1424 | # Enables SASI index creation on this node. 1425 | # SASI indexes are considered experimental and are not recommended for production use. 1426 | enable_sasi_indexes: false 1427 | 1428 | # Enables creation of transiently replicated keyspaces on this node. 1429 | # Transient replication is experimental and is not recommended for production use. 1430 | enable_transient_replication: false 1431 | -------------------------------------------------------------------------------- /ansible/roles/common/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: "generate /etc/hosts.ansible file" 3 | become: yes 4 | become_method: sudo 5 | template: 6 | src: templates/hosts.j2 7 | dest: '/etc/hosts' 8 | owner: root 9 | group: root 10 | mode: 0644 11 | -------------------------------------------------------------------------------- /ansible/roles/common/templates/hosts.j2: -------------------------------------------------------------------------------- 1 | 127.0.0.1 localhost 2 | ::1 localhost ip6-localhost ip6-loopback 3 | 4 | {% for item in groups['all'] %} 5 | {% if 'ansible_eth1' in hostvars[item] %} 6 | {{ hostvars[item]['ansible_eth1']['ipv4']['address'] }} {{ item }} 7 | {% endif %} 8 | {% endfor %} 9 | -------------------------------------------------------------------------------- /ansible/roles/flink/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 1.12.1 3 | flink: flink-{{ version }} 4 | file: "{{ flink }}-bin-scala_2.12.tgz" 5 | url: https://ftp-stud.hs-esslingen.de/pub/Mirrors/ftp.apache.org/dist/flink/{{ flink }}/{{ file }} 6 | -------------------------------------------------------------------------------- /ansible/roles/flink/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: get java_home 4 | shell: "java -XshowSettings:properties -version 2>&1 | sed '/^[[:space:]]*java\\.home/!d;s/^[[:space:]]*java\\.home[[:space:]]*=[[:space:]]*//'" 5 | register: java_home_output 6 | 7 | - name: get hadoop_classpath 8 | shell: "{{ usr_local }}/hadoop/bin/hadoop classpath" 9 | register: hadoop_classpath_outout 10 | 11 | - set_fact: 12 | java_home: "{{ java_home_output.stdout }}" 13 | hadoop_classpath: "{{ hadoop_classpath_outout.stdout }}" 14 | 15 | - name: download {{ url }} 16 | get_url: 17 | url: "{{ url }}" 18 | dest: "/tmp/{{ file }}" 19 | 20 | - name: extract /tmp/{{ file }} 21 | become: yes 22 | become_method: sudo 23 | unarchive: 24 | src: "/tmp/{{ file }}" 25 | dest: "{{ usr_local }}" 26 | copy: no 27 | creates: "{{ flink_home }}" 28 | 29 | - name: set ownership on {{ flink_home }} 30 | become: yes 31 | become_method: sudo 32 | file: 33 | 
path: "{{ flink_home }}" 34 | owner: vagrant 35 | group: vagrant 36 | recurse: yes 37 | mode: 0755 38 | 39 | - name: set PATH=$PATH:{{ flink_home }}/bin 40 | become: yes 41 | become_method: sudo 42 | lineinfile: 43 | dest: "{{ etc_profiles }}/flink.sh" 44 | create: yes 45 | state: present 46 | regexp: '^PATH' 47 | line: 'PATH=$PATH:{{ flink_home }}/bin' 48 | 49 | - name: install custom flink stop 50 | template: 51 | src: templates/stop-flink-yarn.sh.j2 52 | dest: "{{ flink_home }}/bin/stop-flink-yarn.sh" 53 | mode: 0755 54 | register: flink_stop 55 | 56 | - name: install custom flink status 57 | template: 58 | src: templates/status-flink-yarn.sh.j2 59 | dest: "{{ flink_home }}/bin/status-flink-yarn.sh" 60 | mode: 0755 61 | 62 | - name: install flink systemd unit file 63 | become: yes 64 | become_method: sudo 65 | template: 66 | src: templates/flink.service.j2 67 | dest: "{{ system_units }}/flink.service" 68 | register: flink_service 69 | 70 | - name: check if flink already started 71 | become: yes 72 | become_user: vagrant 73 | shell: "{{ flink_home }}/bin/status-flink-yarn.sh" 74 | ignore_errors: yes 75 | register: flink_status 76 | 77 | - name: restart flink 78 | become: yes 79 | become_method: sudo 80 | when: flink_service.changed or flink_stop.changed or flink_status.rc != 0 81 | systemd: 82 | enabled: yes 83 | state: restarted 84 | name: flink 85 | daemon_reload: yes 86 | -------------------------------------------------------------------------------- /ansible/roles/flink/templates/flink.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Flink 3 | After=yarn-resourcemanager.service 4 | 5 | [Service] 6 | Environment=JAVA_HOME={{ java_home }} 7 | Environment=HADOOP_HOME={{ usr_local }}/hadoop 8 | Environment=HADOOP_CONF_DIR={{ usr_local }}/hadoop/etc/hadoop 9 | Environment=HADOOP_CLASSPATH={{ hadoop_classpath }} 10 | Type=forking 11 | User=vagrant 12 | ExecStart={{ flink_home }}/bin/yarn-session.sh -d -jm 1024 -tm 1024 -s 2 13 | RemainAfterExit=yes 14 | ExecStop={{ flink_home }}/bin/stop-flink-yarn.sh 15 | Restart=on-failure 16 | 17 | [Install] 18 | WantedBy=multi-user.target 19 | -------------------------------------------------------------------------------- /ansible/roles/flink/templates/status-flink-yarn.sh.j2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | YARN_APPLICATION_IDS=`yarn application -list | grep Flink | awk '{split($0,a," "); print a[1]}'` 4 | 5 | echo $YARN_APPLICATION_IDS 6 | 7 | if [ -n "${YARN_APPLICATION_IDS}" ]; then 8 | exit 0 9 | else 10 | exit 1 11 | fi 12 | -------------------------------------------------------------------------------- /ansible/roles/flink/templates/stop-flink-yarn.sh.j2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | YARN_APPLICATION_IDS=`{{ usr_local }}/hadoop/bin/yarn application -list | grep Flink | awk '{split($0,a," "); print a[1]}'` 4 | 5 | echo $YARN_APPLICATION_IDS 6 | 7 | if [ -n "${YARN_APPLICATION_IDS}" ]; then 8 | for YARN_APPLICATION_ID in $YARN_APPLICATION_IDS 9 | do 10 | echo stop | {{ flink_home }}/bin/yarn-session.sh -id $YARN_APPLICATION_ID 11 | done 12 | fi 13 | -------------------------------------------------------------------------------- /ansible/roles/hadoop/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: "3.3.0" 3 | hadoop: hadoop-{{ version }} 4 | file: "{{ 
hadoop }}.tar.gz" 5 | url: http://apache.mirrors.spacedump.net/hadoop/common/{{ hadoop }}/{{ file }} 6 | -------------------------------------------------------------------------------- /ansible/roles/hadoop/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: get java_home 4 | shell: "java -XshowSettings:properties -version 2>&1 | sed '/^[[:space:]]*java\\.home/!d;s/^[[:space:]]*java\\.home[[:space:]]*=[[:space:]]*//'" 5 | register: java_home_output 6 | 7 | - set_fact: 8 | java_home: "{{ java_home_output.stdout }}" 9 | 10 | - name: download {{ url }} 11 | get_url: 12 | url: "{{ url }}" 13 | dest: "/tmp/{{ file }}" 14 | 15 | - name: extract /tmp/{{ file }} 16 | become: yes 17 | become_method: sudo 18 | unarchive: 19 | src: "/tmp/{{ file }}" 20 | dest: "{{ usr_local }}" 21 | copy: no 22 | creates: "{{ hadoop_home }}" 23 | 24 | - name: set ownership on {{ hadoop_home }} 25 | become: yes 26 | become_method: sudo 27 | file: 28 | path: "{{ hadoop_home }}" 29 | owner: vagrant 30 | group: vagrant 31 | recurse: yes 32 | mode: 0755 33 | 34 | - name: link {{ hadoop_home }} to {{ usr_local }}/hadoop 35 | become: yes 36 | become_method: sudo 37 | file: 38 | src: "{{ hadoop_home }}" 39 | dest: "{{ usr_local }}/hadoop" 40 | owner: vagrant 41 | group: vagrant 42 | state: link 43 | 44 | - name: set PATH=$PATH:{{ hadoop_home }}/bin 45 | become: yes 46 | become_method: sudo 47 | lineinfile: 48 | dest: "{{ etc_profiles }}/hadoop.sh" 49 | create: yes 50 | state: present 51 | regexp: '^PATH' 52 | line: 'PATH=$PATH:{{ hadoop_home }}/bin:{{ hadoop_home }}/sbin' 53 | 54 | - name: set HADOOP_PREFIX={{ hadoop_home }} 55 | become: yes 56 | become_method: sudo 57 | lineinfile: 58 | dest: "{{ etc_profiles }}/hadoop.sh" 59 | create: yes 60 | state: present 61 | regexp: '^HADOOP_PREFIX' 62 | line: 'HADOOP_PREFIX={{ hadoop_home }}' 63 | 64 | - name: set HADOOP_CONF_DIR={{ hadoop_conf }} 65 | become: yes 66 | become_method: sudo 67 | lineinfile: 68 | dest: "{{ etc_profiles }}/hadoop.sh" 69 | create: yes 70 | state: present 71 | regexp: '^export HADOOP_CONF_DIR' 72 | line: 'export HADOOP_CONF_DIR={{ hadoop_conf }}' 73 | 74 | - name: copy core-site.xml {{ hadoop_conf }}/core-site.xml 75 | template: 76 | src: templates/core-site.xml.j2 77 | dest: "{{ hadoop_conf }}/core-site.xml" 78 | register: core_config 79 | 80 | - name: copy yarn-site.xml {{ hadoop_conf }}/core-site.xml 81 | template: 82 | src: templates/yarn-site.xml.j2 83 | dest: "{{ hadoop_conf }}/yarn-site.xml" 84 | register: yarn_config 85 | 86 | - name: copy capacity-scheduler.xml {{ hadoop_conf }}/capacity-scheduler.xml 87 | template: 88 | src: templates/capacity-scheduler.xml.j2 89 | dest: "{{ hadoop_conf }}/capacity-scheduler.xml" 90 | register: capacity_config 91 | 92 | - name: copy hadoop-env.sh {{ hadoop_conf }}/hadoop-env.sh 93 | template: 94 | src: templates/hadoop-env.sh.j2 95 | dest: "{{ hadoop_conf }}/hadoop-env.sh" 96 | register: hadoop_env 97 | 98 | - name: copy yarn-env.sh {{ hadoop_conf }}/yarn-env.sh 99 | template: 100 | src: templates/yarn-env.sh.j2 101 | dest: "{{ hadoop_conf }}/yarn-env.sh" 102 | register: yarn_env 103 | 104 | - name: create {{ hadoop_log_dir }} 105 | file: 106 | path: "{{ hadoop_log_dir }}" 107 | state: directory 108 | 109 | - name: install hdfs-namenode systemd unit file 110 | become: yes 111 | become_method: sudo 112 | when: inventory_hostname in groups['hadoop-master'] 113 | template: 114 | src: templates/hdfs-namenode.service.j2 115 | dest: "{{ system_units 
}}/hdfs-namenode.service" 116 | register: hdfs_namenode_service 117 | 118 | - name: install hdfs-datanode systemd unit file 119 | become: yes 120 | become_method: sudo 121 | template: 122 | src: templates/hdfs-datanode.service.j2 123 | dest: "{{ system_units }}/hdfs-datanode.service" 124 | register: hdfs_datanode_service 125 | 126 | - name: install yarn-resourcemanager systemd unit file 127 | become: yes 128 | become_method: sudo 129 | when: inventory_hostname in groups['hadoop-master'] 130 | template: 131 | src: templates/yarn-resourcemanager.service.j2 132 | dest: "{{ system_units }}/yarn-resourcemanager.service" 133 | 134 | - name: install yarn-nodemanager systemd unit file 135 | become: yes 136 | become_method: sudo 137 | template: 138 | src: templates/yarn-nodemanager.service.j2 139 | dest: "{{ system_units }}/yarn-nodemanager.service" 140 | 141 | - name: format HDFS 142 | when: inventory_hostname in groups['hadoop-master'] 143 | command: "{{ hadoop_home}}/bin/hdfs namenode -format" 144 | args: 145 | creates: /tmp/hadoop-{{ ansible_user }}/dfs/name 146 | 147 | - name: restart hdfs-namenode 148 | become: yes 149 | become_method: sudo 150 | when: inventory_hostname in groups['hadoop-master'] and (hadoop_env.changed or core_config.changed or hdfs_namenode_service.changed) 151 | systemd: 152 | enabled: yes 153 | state: restarted 154 | name: hdfs-namenode 155 | daemon_reload: yes 156 | 157 | - name: restart hdfs-datanode 158 | become: yes 159 | become_method: sudo 160 | when: hadoop_env.changed or core_config.changed or hdfs_datanode_service.changed 161 | systemd: 162 | enabled: yes 163 | state: restarted 164 | name: hdfs-datanode 165 | daemon_reload: yes 166 | 167 | - name: restart yarn-resourcemanager 168 | become: yes 169 | become_method: sudo 170 | when: inventory_hostname in groups['hadoop-master'] and (yarn_env.changed or yarn_config.changed or capacity_config.changed) 171 | systemd: 172 | enabled: yes 173 | state: restarted 174 | name: yarn-resourcemanager 175 | daemon_reload: yes 176 | 177 | - name: restart yarn-nodemanager 178 | become: yes 179 | become_method: sudo 180 | when: yarn_env.changed or yarn_config.changed or capacity_config.changed 181 | systemd: 182 | enabled: yes 183 | state: restarted 184 | name: yarn-nodemanager 185 | daemon_reload: yes 186 |
-------------------------------------------------------------------------------- /ansible/roles/hadoop/templates/capacity-scheduler.xml.j2: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <configuration> 3 | <property> 4 | <name>yarn.scheduler.capacity.maximum-applications</name> 5 | <value>10000</value> 6 | </property> 7 | <property> 8 | <name>yarn.scheduler.capacity.maximum-am-resource-percent</name> 9 | <value>0.3</value> 10 | </property> 11 | <property> 12 | <name>yarn.scheduler.capacity.resource-calculator</name> 13 | <value>org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator</value> 14 | </property> 15 | <property> 16 | <name>yarn.scheduler.capacity.root.queues</name> 17 | <value>default</value> 18 | </property> 19 | <property> 20 | <name>yarn.scheduler.capacity.root.default.capacity</name> 21 | <value>100</value> 22 | </property> 23 | <property> 24 | <name>yarn.scheduler.capacity.root.default.user-limit-factor</name> 25 | <value>1</value> 26 | </property> 27 | <property> 28 | <name>yarn.scheduler.capacity.root.default.maximum-capacity</name> 29 | <value>100</value> 30 | </property> 31 | <property> 32 | <name>yarn.scheduler.capacity.root.default.state</name> 33 | <value>RUNNING</value> 34 | </property> 35 | <property> 36 | <name>yarn.scheduler.capacity.root.default.acl_submit_applications</name> 37 | <value>*</value> 38 | </property> 39 | <property> 40 | <name>yarn.scheduler.capacity.root.default.acl_administer_queue</name> 41 | <value>*</value> 42 | </property> 43 | <property> 44 | <name>yarn.scheduler.capacity.root.default.acl_application_max_priority</name> 45 | <value>*</value> 46 | </property> 47 | <property> 48 | <name>yarn.scheduler.capacity.root.default.maximum-application-lifetime 49 | </name> 50 | <value>-1</value> 51 | </property> 52 | <property> 53 | <name>yarn.scheduler.capacity.root.default.default-application-lifetime 54 | </name> 55 | <value>-1</value> 56 | </property> 57 | <property> 58 | <name>yarn.scheduler.capacity.node-locality-delay</name> 59 | <value>40</value> 60 | </property> 61 | <property> 62 | <name>yarn.scheduler.capacity.rack-locality-additional-delay</name> 63 | <value>-1</value> 64 | </property> 65 | <property> 66 | <name>yarn.scheduler.capacity.queue-mappings</name> 67 | <value></value> 68 | </property> 69 | <property> 70 | <name>yarn.scheduler.capacity.queue-mappings-override.enable</name> 71 | <value>false</value> 72 | </property> 73 | <property> 74 | <name>yarn.scheduler.capacity.per-node-heartbeat.maximum-offswitch-assignments</name> 75 | <value>1</value> 76 | </property> 77 | </configuration> 78 |
-------------------------------------------------------------------------------- /ansible/roles/hadoop/templates/core-site.xml.j2: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <configuration> 3 | <property> 4 | <name>fs.defaultFS</name> 5 | <value>hdfs://{{ groups['hadoop-master'][0] }}</value> 6 | </property> 7 | <property> 8 | <name>dfs.namenode.rpc-bind-host</name> 9 | <value>0.0.0.0</value> 10 | </property> 11 | <property> 12 | <name>dfs.namenode.servicerpc-bind-host</name> 13 | <value>0.0.0.0</value> 14 | </property> 15 | <property> 16 | <name>dfs.namenode.http-bind-host</name> 17 | <value>0.0.0.0</value> 18 | </property> 19 | <property> 20 | <name>dfs.namenode.https-bind-host</name> 21 | <value>0.0.0.0</value> 22 | </property> 23 | <property> 24 | <name>dfs.client.use.datanode.hostname</name> 25 | <value>true</value> 26 | </property> 27 | <property> 28 | <name>dfs.datanode.use.datanode.hostname</name> 29 | <value>true</value> 30 | </property> 31 | </configuration> 32 |
-------------------------------------------------------------------------------- /ansible/roles/hadoop/templates/hadoop-env.sh.j2: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # Set Hadoop-specific environment variables here. 19 | 20 | ## 21 | ## THIS FILE ACTS AS THE MASTER FILE FOR ALL HADOOP PROJECTS. 22 | ## SETTINGS HERE WILL BE READ BY ALL HADOOP COMMANDS. THEREFORE, 23 | ## ONE CAN USE THIS FILE TO SET YARN, HDFS, AND MAPREDUCE 24 | ## CONFIGURATION OPTIONS INSTEAD OF xxx-env.sh. 25 | ## 26 | ## Precedence rules: 27 | ## 28 | ## {yarn-env.sh|hdfs-env.sh} > hadoop-env.sh > hard-coded defaults 29 | ## 30 | ## {YARN_xyz|HDFS_xyz} > HADOOP_xyz > hard-coded defaults 31 | ## 32 | 33 | # Many of the options here are built from the perspective that users 34 | # may want to provide OVERWRITING values on the command line. 35 | # For example: 36 | # 37 | # JAVA_HOME=/usr/java/testing hdfs dfs -ls 38 | # 39 | # Therefore, the vast majority (BUT NOT ALL!) of these defaults 40 | # are configured for substitution and not append. If append 41 | # is preferable, modify this file accordingly. 42 | 43 | ### 44 | # Generic settings for HADOOP 45 | ### 46 | 47 | # Technically, the only required environment variable is JAVA_HOME. 48 | # All others are optional. However, the defaults are probably not 49 | # preferred. Many sites configure these options outside of Hadoop, 50 | # such as in /etc/profile.d 51 | 52 | # The java implementation to use.
By default, this environment 53 | # variable is REQUIRED on ALL platforms except OS X! 54 | # export JAVA_HOME= 55 | 56 | # Location of Hadoop. By default, Hadoop will attempt to determine 57 | # this location based upon its execution path. 58 | # export HADOOP_HOME= 59 | 60 | # Location of Hadoop's configuration information. i.e., where this 61 | # file is living. If this is not defined, Hadoop will attempt to 62 | # locate it based upon its execution path. 63 | # 64 | # NOTE: It is recommend that this variable not be set here but in 65 | # /etc/profile.d or equivalent. Some options (such as 66 | # --config) may react strangely otherwise. 67 | # 68 | # export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop 69 | 70 | # The maximum amount of to use (Java -Xmx). If no unit 71 | # is provided, it will be converted to MB. Daemons will 72 | # prefer any Xmx setting in their respective _OPT variable. 73 | # There is no default; the JVM will autoscale based upon machine 74 | # memory size. 75 | export HADOOP_HEAPSIZE_MAX=384 76 | 77 | # The minimum amount of heap to use (Java -Xms). If no unit 78 | # is provided, it will be converted to MB. Daemons will 79 | # prefer any Xms setting in their respective _OPT variable. 80 | # There is no default; the JVM will autoscale based upon machine 81 | # memory size. 82 | # export HADOOP_HEAPSIZE_MIN= 83 | 84 | # Enable extra debugging of Hadoop's JAAS binding, used to set up 85 | # Kerberos security. 86 | # export HADOOP_JAAS_DEBUG=true 87 | 88 | # Extra Java runtime options for all Hadoop commands. We don't support 89 | # IPv6 yet/still, so by default the preference is set to IPv4. 90 | # export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true" 91 | # For Kerberos debugging, an extended option set logs more invormation 92 | # export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug" 93 | 94 | # Some parts of the shell code may do special things dependent upon 95 | # the operating system. We have to set this here. See the next 96 | # section as to why.... 97 | export HADOOP_OS_TYPE=${HADOOP_OS_TYPE:-$(uname -s)} 98 | 99 | 100 | # Under certain conditions, Java on OS X will throw SCDynamicStore errors 101 | # in the system logs. 102 | # See HADOOP-8719 for more information. If one needs Kerberos 103 | # support on OS X, one will want to change/remove this extra bit. 104 | case ${HADOOP_OS_TYPE} in 105 | Darwin*) 106 | export HADOOP_OPTS="${HADOOP_OPTS} -Djava.security.krb5.realm= " 107 | export HADOOP_OPTS="${HADOOP_OPTS} -Djava.security.krb5.kdc= " 108 | export HADOOP_OPTS="${HADOOP_OPTS} -Djava.security.krb5.conf= " 109 | ;; 110 | esac 111 | 112 | # Extra Java runtime options for some Hadoop commands 113 | # and clients (i.e., hdfs dfs -blah). These get appended to HADOOP_OPTS for 114 | # such commands. In most cases, # this should be left empty and 115 | # let users supply it on the command line. 116 | # export HADOOP_CLIENT_OPTS="" 117 | 118 | # 119 | # A note about classpaths. 120 | # 121 | # By default, Apache Hadoop overrides Java's CLASSPATH 122 | # environment variable. It is configured such 123 | # that it sarts out blank with new entries added after passing 124 | # a series of checks (file/dir exists, not already listed aka 125 | # de-deduplication). During de-depulication, wildcards and/or 126 | # directories are *NOT* expanded to keep it simple. Therefore, 127 | # if the computed classpath has two specific mentions of 128 | # awesome-methods-1.0.jar, only the first one added will be seen. 
129 | # If two directories are in the classpath that both contain 130 | # awesome-methods-1.0.jar, then Java will pick up both versions. 131 | 132 | # An additional, custom CLASSPATH. Site-wide configs should be 133 | # handled via the shellprofile functionality, utilizing the 134 | # hadoop_add_classpath function for greater control and much 135 | # harder for apps/end-users to accidentally override. 136 | # Similarly, end users should utilize ${HOME}/.hadooprc . 137 | # This variable should ideally only be used as a short-cut, 138 | # interactive way for temporary additions on the command line. 139 | # export HADOOP_CLASSPATH="/some/cool/path/on/your/machine" 140 | 141 | # Should HADOOP_CLASSPATH be first in the official CLASSPATH? 142 | # export HADOOP_USER_CLASSPATH_FIRST="yes" 143 | 144 | # If HADOOP_USE_CLIENT_CLASSLOADER is set, the classpath along 145 | # with the main jar are handled by a separate isolated 146 | # client classloader when 'hadoop jar', 'yarn jar', or 'mapred job' 147 | # is utilized. If it is set, HADOOP_CLASSPATH and 148 | # HADOOP_USER_CLASSPATH_FIRST are ignored. 149 | # export HADOOP_USE_CLIENT_CLASSLOADER=true 150 | 151 | # HADOOP_CLIENT_CLASSLOADER_SYSTEM_CLASSES overrides the default definition of 152 | # system classes for the client classloader when HADOOP_USE_CLIENT_CLASSLOADER 153 | # is enabled. Names ending in '.' (period) are treated as package names, and 154 | # names starting with a '-' are treated as negative matches. For example, 155 | # export HADOOP_CLIENT_CLASSLOADER_SYSTEM_CLASSES="-org.apache.hadoop.UserClass,java.,javax.,org.apache.hadoop." 156 | 157 | # Enable optional, bundled Hadoop features 158 | # This is a comma delimited list. It may NOT be overridden via .hadooprc 159 | # Entries may be added/removed as needed. 160 | # export HADOOP_OPTIONAL_TOOLS="hadoop-azure,hadoop-aws,hadoop-aliyun,hadoop-azure-datalake,hadoop-kafka,hadoop-openstack" 161 | 162 | ### 163 | # Options for remote shell connectivity 164 | ### 165 | 166 | # There are some optional components of hadoop that allow for 167 | # command and control of remote hosts. For example, 168 | # start-dfs.sh will attempt to bring up all NNs, DNS, etc. 169 | 170 | # Options to pass to SSH when one of the "log into a host and 171 | # start/stop daemons" scripts is executed 172 | # export HADOOP_SSH_OPTS="-o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=10s" 173 | 174 | # The built-in ssh handler will limit itself to 10 simultaneous connections. 175 | # For pdsh users, this sets the fanout size ( -f ) 176 | # Change this to increase/decrease as necessary. 177 | # export HADOOP_SSH_PARALLEL=10 178 | 179 | # Filename which contains all of the hosts for any remote execution 180 | # helper scripts # such as workers.sh, start-dfs.sh, etc. 181 | # export HADOOP_WORKERS="${HADOOP_CONF_DIR}/workers" 182 | 183 | ### 184 | # Options for all daemons 185 | ### 186 | # 187 | 188 | # 189 | # Many options may also be specified as Java properties. It is 190 | # very common, and in many cases, desirable, to hard-set these 191 | # in daemon _OPTS variables. Where applicable, the appropriate 192 | # Java property is also identified. Note that many are re-used 193 | # or set differently in certain contexts (e.g., secure vs 194 | # non-secure) 195 | # 196 | 197 | # Where (primarily) daemon log files are stored. 198 | # ${HADOOP_HOME}/logs by default. 
199 | # Java property: hadoop.log.dir 200 | # export HADOOP_LOG_DIR=${HADOOP_HOME}/logs 201 | 202 | # A string representing this instance of hadoop. $USER by default. 203 | # This is used in writing log and pid files, so keep that in mind! 204 | # Java property: hadoop.id.str 205 | # export HADOOP_IDENT_STRING=$USER 206 | 207 | # How many seconds to pause after stopping a daemon 208 | # export HADOOP_STOP_TIMEOUT=5 209 | 210 | # Where pid files are stored. /tmp by default. 211 | # export HADOOP_PID_DIR=/tmp 212 | 213 | # Default log4j setting for interactive commands 214 | # Java property: hadoop.root.logger 215 | # export HADOOP_ROOT_LOGGER=INFO,console 216 | 217 | # Default log4j setting for daemons spawned explicitly by 218 | # --daemon option of hadoop, hdfs, mapred and yarn command. 219 | # Java property: hadoop.root.logger 220 | # export HADOOP_DAEMON_ROOT_LOGGER=INFO,RFA 221 | 222 | # Default log level and output location for security-related messages. 223 | # You will almost certainly want to change this on a per-daemon basis via 224 | # the Java property (i.e., -Dhadoop.security.logger=foo). (Note that the 225 | # defaults for the NN and 2NN override this by default.) 226 | # Java property: hadoop.security.logger 227 | # export HADOOP_SECURITY_LOGGER=INFO,NullAppender 228 | 229 | # Default process priority level 230 | # Note that sub-processes will also run at this level! 231 | # export HADOOP_NICENESS=0 232 | 233 | # Default name for the service level authorization file 234 | # Java property: hadoop.policy.file 235 | # export HADOOP_POLICYFILE="hadoop-policy.xml" 236 | 237 | # 238 | # NOTE: this is not used by default! <----- 239 | # You can define variables right here and then re-use them later on. 240 | # For example, it is common to use the same garbage collection settings 241 | # for all the daemons. So one could define: 242 | # 243 | # export HADOOP_GC_SETTINGS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps" 244 | # 245 | # .. and then use it as per the b option under the namenode. 246 | 247 | ### 248 | # Secure/privileged execution 249 | ### 250 | 251 | # 252 | # Out of the box, Hadoop uses jsvc from Apache Commons to launch daemons 253 | # on privileged ports. This functionality can be replaced by providing 254 | # custom functions. See hadoop-functions.sh for more information. 255 | # 256 | 257 | # The jsvc implementation to use. Jsvc is required to run secure datanodes 258 | # that bind to privileged ports to provide authentication of data transfer 259 | # protocol. Jsvc is not required if SASL is configured for authentication of 260 | # data transfer protocol using non-privileged ports. 261 | # export JSVC_HOME=/usr/bin 262 | 263 | # 264 | # This directory contains pids for secure and privileged processes. 265 | #export HADOOP_SECURE_PID_DIR=${HADOOP_PID_DIR} 266 | 267 | # 268 | # This directory contains the logs for secure and privileged processes. 269 | # Java property: hadoop.log.dir 270 | # export HADOOP_SECURE_LOG=${HADOOP_LOG_DIR} 271 | 272 | # 273 | # When running a secure daemon, the default value of HADOOP_IDENT_STRING 274 | # ends up being a bit bogus. Therefore, by default, the code will 275 | # replace HADOOP_IDENT_STRING with HADOOP_xx_SECURE_USER. If one wants 276 | # to keep HADOOP_IDENT_STRING untouched, then uncomment this line. 
277 | # export HADOOP_SECURE_IDENT_PRESERVE="true" 278 | 279 | ### 280 | # NameNode specific parameters 281 | ### 282 | 283 | # Default log level and output location for file system related change 284 | # messages. For non-namenode daemons, the Java property must be set in 285 | # the appropriate _OPTS if one wants something other than INFO,NullAppender 286 | # Java property: hdfs.audit.logger 287 | # export HDFS_AUDIT_LOGGER=INFO,NullAppender 288 | 289 | # Specify the JVM options to be used when starting the NameNode. 290 | # These options will be appended to the options specified as HADOOP_OPTS 291 | # and therefore may override any similar flags set in HADOOP_OPTS 292 | # 293 | # a) Set JMX options 294 | # export HDFS_NAMENODE_OPTS="-Dcom.sun.management.jmxremote=true -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.port=1026" 295 | # 296 | # b) Set garbage collection logs 297 | # export HDFS_NAMENODE_OPTS="${HADOOP_GC_SETTINGS} -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')" 298 | # 299 | # c) ... or set them directly 300 | # export HDFS_NAMENODE_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')" 301 | 302 | # this is the default: 303 | # export HDFS_NAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS" 304 | 305 | ### 306 | # SecondaryNameNode specific parameters 307 | ### 308 | # Specify the JVM options to be used when starting the SecondaryNameNode. 309 | # These options will be appended to the options specified as HADOOP_OPTS 310 | # and therefore may override any similar flags set in HADOOP_OPTS 311 | # 312 | # This is the default: 313 | # export HDFS_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS" 314 | 315 | ### 316 | # DataNode specific parameters 317 | ### 318 | # Specify the JVM options to be used when starting the DataNode. 319 | # These options will be appended to the options specified as HADOOP_OPTS 320 | # and therefore may override any similar flags set in HADOOP_OPTS 321 | # 322 | # This is the default: 323 | # export HDFS_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS" 324 | 325 | # On secure datanodes, user to run the datanode as after dropping privileges. 326 | # This **MUST** be uncommented to enable secure HDFS if using privileged ports 327 | # to provide authentication of data transfer protocol. This **MUST NOT** be 328 | # defined if SASL is configured for authentication of data transfer protocol 329 | # using non-privileged ports. 330 | # This will replace the hadoop.id.str Java property in secure mode. 331 | # export HDFS_DATANODE_SECURE_USER=hdfs 332 | 333 | # Supplemental options for secure datanodes 334 | # By default, Hadoop uses jsvc which needs to know to launch a 335 | # server jvm. 336 | # export HDFS_DATANODE_SECURE_EXTRA_OPTS="-jvm server" 337 | 338 | ### 339 | # NFS3 Gateway specific parameters 340 | ### 341 | # Specify the JVM options to be used when starting the NFS3 Gateway. 342 | # These options will be appended to the options specified as HADOOP_OPTS 343 | # and therefore may override any similar flags set in HADOOP_OPTS 344 | # 345 | # export HDFS_NFS3_OPTS="" 346 | 347 | # Specify the JVM options to be used when starting the Hadoop portmapper. 
348 | # These options will be appended to the options specified as HADOOP_OPTS 349 | # and therefore may override any similar flags set in HADOOP_OPTS 350 | # 351 | # export HDFS_PORTMAP_OPTS="-Xmx512m" 352 | 353 | # Supplemental options for priviliged gateways 354 | # By default, Hadoop uses jsvc which needs to know to launch a 355 | # server jvm. 356 | # export HDFS_NFS3_SECURE_EXTRA_OPTS="-jvm server" 357 | 358 | # On privileged gateways, user to run the gateway as after dropping privileges 359 | # This will replace the hadoop.id.str Java property in secure mode. 360 | # export HDFS_NFS3_SECURE_USER=nfsserver 361 | 362 | ### 363 | # ZKFailoverController specific parameters 364 | ### 365 | # Specify the JVM options to be used when starting the ZKFailoverController. 366 | # These options will be appended to the options specified as HADOOP_OPTS 367 | # and therefore may override any similar flags set in HADOOP_OPTS 368 | # 369 | # export HDFS_ZKFC_OPTS="" 370 | 371 | ### 372 | # QuorumJournalNode specific parameters 373 | ### 374 | # Specify the JVM options to be used when starting the QuorumJournalNode. 375 | # These options will be appended to the options specified as HADOOP_OPTS 376 | # and therefore may override any similar flags set in HADOOP_OPTS 377 | # 378 | # export HDFS_JOURNALNODE_OPTS="" 379 | 380 | ### 381 | # HDFS Balancer specific parameters 382 | ### 383 | # Specify the JVM options to be used when starting the HDFS Balancer. 384 | # These options will be appended to the options specified as HADOOP_OPTS 385 | # and therefore may override any similar flags set in HADOOP_OPTS 386 | # 387 | # export HDFS_BALANCER_OPTS="" 388 | 389 | ### 390 | # HDFS Mover specific parameters 391 | ### 392 | # Specify the JVM options to be used when starting the HDFS Mover. 393 | # These options will be appended to the options specified as HADOOP_OPTS 394 | # and therefore may override any similar flags set in HADOOP_OPTS 395 | # 396 | # export HDFS_MOVER_OPTS="" 397 | 398 | ### 399 | # Advanced Users Only! 400 | ### 401 | 402 | # 403 | # When building Hadoop, one can add the class paths to the commands 404 | # via this special env var: 405 | # export HADOOP_ENABLE_BUILD_PATHS="true" 406 | 407 | # 408 | # To prevent accidents, shell commands be (superficially) locked 409 | # to only allow certain users to execute certain subcommands. 410 | # It uses the format of (command)_(subcommand)_USER. 
411 | # 412 | # For example, to limit who can execute the namenode command, 413 | # export HDFS_NAMENODE_USER=hdfs 414 | -------------------------------------------------------------------------------- /ansible/roles/hadoop/templates/hdfs-datanode.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=HDFS Datanode Service 3 | After=network.target 4 | Wants=hdfs-namenode.service 5 | 6 | [Service] 7 | Type=forking 8 | User=vagrant 9 | Environment=JAVA_HOME={{ java_home }} 10 | Environment=HADOOP_PREFIX={{ hadoop_home }} 11 | Environment=HADOOP_CONF_DIR={{ hadoop_home }}/etc/hadoop 12 | ExecStart={{ hadoop_home }}/bin/hdfs --config $HADOOP_CONF_DIR --daemon start datanode 13 | ExecStop={{ hadoop_home }}/bin/hdfs --config $HADOOP_CONF_DIR --daemon stop datanode 14 | Restart=on-failure 15 | 16 | [Install] 17 | WantedBy=multi-user.target 18 | -------------------------------------------------------------------------------- /ansible/roles/hadoop/templates/hdfs-namenode.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=HDFS Namenode Service 3 | After=network.target 4 | 5 | [Service] 6 | Type=forking 7 | Environment=JAVA_HOME={{ java_home }} 8 | Environment=HADOOP_PREFIX={{ hadoop_home }} 9 | Environment=HADOOP_CONF_DIR={{ hadoop_home }}/etc/hadoop 10 | User=vagrant 11 | ExecStart={{ hadoop_home }}/bin/hdfs --config $HADOOP_CONF_DIR --daemon start namenode 12 | ExecStop={{ hadoop_home }}/bin/hdfs --config $HADOOP_CONF_DIR --daemon stop namenode 13 | Restart=on-failure 14 | 15 | [Install] 16 | WantedBy=multi-user.target 17 | -------------------------------------------------------------------------------- /ansible/roles/hadoop/templates/yarn-env.sh.j2: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | ## 18 | ## THIS FILE ACTS AS AN OVERRIDE FOR hadoop-env.sh FOR ALL 19 | ## WORK DONE BY THE yarn AND RELATED COMMANDS. 20 | ## 21 | ## Precedence rules: 22 | ## 23 | ## yarn-env.sh > hadoop-env.sh > hard-coded defaults 24 | ## 25 | ## YARN_xyz > HADOOP_xyz > hard-coded defaults 26 | ## 27 | 28 | ### 29 | # Resource Manager specific parameters 30 | ### 31 | 32 | # Specify the max heapsize for the ResourceManager. If no units are 33 | # given, it will be assumed to be in MB. 34 | # This value will be overridden by an Xmx setting specified in either 35 | # HADOOP_OPTS and/or YARN_RESOURCEMANAGER_OPTS. 36 | # Default is the same as HADOOP_HEAPSIZE_MAX 37 | export YARN_RESOURCEMANAGER_HEAPSIZE=384 38 | 39 | # Specify the JVM options to be used when starting the ResourceManager.
40 | # These options will be appended to the options specified as HADOOP_OPTS 41 | # and therefore may override any similar flags set in HADOOP_OPTS 42 | # 43 | # Examples for a Sun/Oracle JDK: 44 | # a) override the appsummary log file: 45 | # export YARN_RESOURCEMANAGER_OPTS="-Dyarn.server.resourcemanager.appsummary.log.file=rm-appsummary.log -Dyarn.server.resourcemanager.appsummary.logger=INFO,RMSUMMARY" 46 | # 47 | # b) Set JMX options 48 | # export YARN_RESOURCEMANAGER_OPTS="-Dcom.sun.management.jmxremote=true -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.port=1026" 49 | # 50 | # c) Set garbage collection logs from hadoop-env.sh 51 | # export YARN_RESOURCE_MANAGER_OPTS="${HADOOP_GC_SETTINGS} -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')" 52 | # 53 | # d) ... or set them directly 54 | # export YARN_RESOURCEMANAGER_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')" 55 | # 56 | # 57 | # export YARN_RESOURCEMANAGER_OPTS= 58 | 59 | ### 60 | # Node Manager specific parameters 61 | ### 62 | 63 | # Specify the max heapsize for the NodeManager. If no units are 64 | # given, it will be assumed to be in MB. 65 | # This value will be overridden by an Xmx setting specified in either 66 | # HADOOP_OPTS and/or YARN_NODEMANAGER_OPTS. 67 | # Default is the same as HADOOP_HEAPSIZE_MAX. 68 | export YARN_NODEMANAGER_HEAPSIZE=384 69 | 70 | # Specify the JVM options to be used when starting the NodeManager. 71 | # These options will be appended to the options specified as HADOOP_OPTS 72 | # and therefore may override any similar flags set in HADOOP_OPTS 73 | # 74 | # See ResourceManager for some examples 75 | # 76 | #export YARN_NODEMANAGER_OPTS= 77 | 78 | ### 79 | # TimeLineServer specific parameters 80 | ### 81 | 82 | # Specify the max heapsize for the timelineserver. If no units are 83 | # given, it will be assumed to be in MB. 84 | # This value will be overridden by an Xmx setting specified in either 85 | # HADOOP_OPTS and/or YARN_TIMELINESERVER_OPTS. 86 | # Default is the same as HADOOP_HEAPSIZE_MAX. 87 | #export YARN_TIMELINE_HEAPSIZE= 88 | 89 | # Specify the JVM options to be used when starting the TimeLineServer. 90 | # These options will be appended to the options specified as HADOOP_OPTS 91 | # and therefore may override any similar flags set in HADOOP_OPTS 92 | # 93 | # See ResourceManager for some examples 94 | # 95 | #export YARN_TIMELINESERVER_OPTS= 96 | 97 | ### 98 | # TimeLineReader specific parameters 99 | ### 100 | 101 | # Specify the JVM options to be used when starting the TimeLineReader. 102 | # These options will be appended to the options specified as HADOOP_OPTS 103 | # and therefore may override any similar flags set in HADOOP_OPTS 104 | # 105 | # See ResourceManager for some examples 106 | # 107 | #export YARN_TIMELINEREADER_OPTS= 108 | 109 | ### 110 | # Web App Proxy Server specifc parameters 111 | ### 112 | 113 | # Specify the max heapsize for the web app proxy server. If no units are 114 | # given, it will be assumed to be in MB. 115 | # This value will be overridden by an Xmx setting specified in either 116 | # HADOOP_OPTS and/or YARN_PROXYSERVER_OPTS. 117 | # Default is the same as HADOOP_HEAPSIZE_MAX. 118 | #export YARN_PROXYSERVER_HEAPSIZE= 119 | 120 | # Specify the JVM options to be used when starting the proxy server. 
121 | # These options will be appended to the options specified as HADOOP_OPTS 122 | # and therefore may override any similar flags set in HADOOP_OPTS 123 | # 124 | # See ResourceManager for some examples 125 | # 126 | #export YARN_PROXYSERVER_OPTS= 127 | 128 | ### 129 | # Shared Cache Manager specific parameters 130 | ### 131 | # Specify the JVM options to be used when starting the 132 | # shared cache manager server. 133 | # These options will be appended to the options specified as HADOOP_OPTS 134 | # and therefore may override any similar flags set in HADOOP_OPTS 135 | # 136 | # See ResourceManager for some examples 137 | # 138 | #export YARN_SHAREDCACHEMANAGER_OPTS= 139 | 140 | ### 141 | # Router specific parameters 142 | ### 143 | 144 | # Specify the JVM options to be used when starting the Router. 145 | # These options will be appended to the options specified as HADOOP_OPTS 146 | # and therefore may override any similar flags set in HADOOP_OPTS 147 | # 148 | # See ResourceManager for some examples 149 | # 150 | #export YARN_ROUTER_OPTS= 151 | -------------------------------------------------------------------------------- /ansible/roles/hadoop/templates/yarn-nodemanager.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=YARN service 3 | After=network.target 4 | Wants=yarn-resourcemanager.service 5 | 6 | [Service] 7 | Environment=JAVA_HOME={{ java_home }} 8 | Environment=HADOOP_CONF_DIR={{ hadoop_home }}/etc/hadoop 9 | Type=forking 10 | User=vagrant 11 | ExecStart={{ hadoop_home }}/bin/yarn --config $HADOOP_CONF_DIR --daemon start nodemanager 12 | ExecStop={{ hadoop_home }}/bin/yarn --config $HADOOP_CONF_DIR --daemon stop nodemanager 13 | Restart=on-failure 14 | 15 | [Install] 16 | WantedBy=multi-user.target 17 | -------------------------------------------------------------------------------- /ansible/roles/hadoop/templates/yarn-resourcemanager.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=YARN service 3 | After=network.target 4 | 5 | [Service] 6 | Environment=JAVA_HOME={{ java_home }} 7 | Environment=HADOOP_CONF_DIR={{ hadoop_home }}/etc/hadoop 8 | Type=forking 9 | User=vagrant 10 | ExecStart={{ hadoop_home }}/bin/yarn --config $HADOOP_CONF_DIR --daemon start resourcemanager 11 | ExecStop={{ hadoop_home }}/bin/yarn --config $HADOOP_CONF_DIR --daemon stop resourcemanager 12 | Restart=on-failure 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | -------------------------------------------------------------------------------- /ansible/roles/hadoop/templates/yarn-site.xml.j2: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <configuration> 3 | <property> 4 | <name>yarn.resourcemanager.hostname</name> 5 | <value>{{ groups['hadoop-master'][0] }}</value> 6 | </property> 7 | <property> 8 | <name>yarn.resourcemanager.bind-host</name> 9 | <value>0.0.0.0</value> 10 | </property> 11 | <property> 12 | <name>yarn.nodemanager.hostname</name> 13 | <value>{{ inventory_hostname }}</value> 14 | </property> 15 | <property> 16 | <name>yarn.nodemanager.bind-host</name> 17 | <value>0.0.0.0</value> 18 | </property> 19 | <property> 20 | <name>yarn.nodemanager.resource.detect-hardware-capabilities</name> 21 | <value>true</value> 22 | </property> 23 | <property> 24 | <name>yarn.scheduler.minimum-allocation-mb</name> 25 | <value>128</value> 26 | </property> 27 | <property> 28 | <name>yarn.scheduler.maximum-allocation-mb</name> 29 | <value>3072</value> 30 | </property> 31 | <property> 32 | <name>yarn.nodemanager.vmem-check-enabled</name> 33 | <value>false</value> 34 | </property> 35 | <property> 36 | <name>yarn.nodemanager.pmem-check-enabled</name> 37 | <value>true</value> 38 | </property> 39 | <property> 40 | <name>yarn.log-aggregation-enable</name> 41 | <value>true</value> 42 | </property> 43 | </configuration> 44 | -------------------------------------------------------------------------------- /ansible/roles/java/defaults/main.yml:
-------------------------------------------------------------------------------- 1 | --- 2 | -------------------------------------------------------------------------------- /ansible/roles/java/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: install {{ java_package }} 4 | become: yes 5 | yum: 6 | name: "{{ java_package }}" 7 | state: present 8 | 9 | - name: set JAVA_HOME 10 | become: yes 11 | become_method: sudo 12 | lineinfile: 13 | dest: "{{ etc_profiles }}/java.sh" 14 | create: yes 15 | state: present 16 | regexp: '^export JAVA_HOME' 17 | line: "export JAVA_HOME=`java -XshowSettings:properties -version 2>&1 | sed '/^[[:space:]]*java\\.home/!d;s/^[[:space:]]*java\\.home[[:space:]]*=[[:space:]]*//'`" 18 | -------------------------------------------------------------------------------- /ansible/roles/kafka/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: "2.7.0" 3 | kafka: kafka_2.13-{{ version }} 4 | file: "{{ kafka }}.tgz" 5 | url: http://archive.apache.org/dist/kafka/{{ version }}/{{ file }} 6 | -------------------------------------------------------------------------------- /ansible/roles/kafka/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: restart zookeeper service 4 | become: yes 5 | systemd: 6 | enabled: yes 7 | state: restarted 8 | name: zookeeper 9 | daemon_reload: yes 10 | listen: "restart zookeeper" 11 | 12 | - name: wait until zookeeper has started 13 | wait_for: 14 | host: "{{ inventory_hostname }}" 15 | port: 2181 16 | listen: "restart zookeeper" 17 | 18 | - name: restart kafka service 19 | become: yes 20 | systemd: 21 | enabled: yes 22 | state: restarted 23 | name: kafka 24 | daemon_reload: yes 25 | listen: "restart kafka" 26 | 27 | - name: wait until kafka PLAINTEXT has started 28 | wait_for: 29 | host: "{{ inventory_hostname }}" 30 | port: 9092 31 | listen: "restart kafka" 32 | when: groups.kafka.index(inventory_hostname) != 0 #when initially setup the first broker will not start lonely 33 | -------------------------------------------------------------------------------- /ansible/roles/kafka/tasks/kafka.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: copy server.properties {{ kafka_home }}/config/server.properties 4 | become: yes 5 | template: 6 | src: templates/server.properties.j2 7 | dest: "{{ kafka_home }}/config/server.properties" 8 | notify: "restart kafka" 9 | 10 | - name: copy kafka.environment {{ kafka_home }}/kafka.environment 11 | become: yes 12 | template: 13 | src: templates/kafka.environment.j2 14 | dest: "{{ kafka_home }}/kafka.environment" 15 | notify: "restart kafka" 16 | 17 | - name: create {{ kafka_log_dir }} 18 | become: yes 19 | file: 20 | path: "{{ kafka_log_dir }}" 21 | state: directory 22 | notify: "restart kafka" 23 | 24 | - name: install kafka systemd unit file 25 | become: yes 26 | template: 27 | src: templates/kafka.service.j2 28 | dest: /etc/systemd/system/kafka.service 29 | notify: "restart kafka" -------------------------------------------------------------------------------- /ansible/roles/kafka/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: download {{ url }} 3 | run_once: true 4 | get_url: 5 | url: "{{ url }}" 6 | dest: "/tmp/{{ file }}" 7 | 8 | - name: extract /tmp/{{ file }} 9 | become: yes 
10 | unarchive: 11 | src: "/tmp/{{ file }}" 12 | dest: /usr/local 13 | copy: no 14 | owner: root 15 | group: root 16 | creates: "{{ kafka_home }}" 17 | 18 | - name: link to /usr/local/kafka 19 | become: yes 20 | file: 21 | src: "{{ kafka_home }}" 22 | dest: /usr/local/kafka 23 | state: link 24 | 25 | - name: set PATH=$PATH:{{ kafka_home }}/bin 26 | become: yes 27 | lineinfile: 28 | dest: /etc/profile.d/kafka.sh 29 | create: yes 30 | state: present 31 | regexp: '^PATH' 32 | line: 'PATH=$PATH:{{ kafka_home }}/bin' 33 | 34 | - import_tasks: zookeeper.yml 35 | tags: zookeeper 36 | 37 | - import_tasks: kafka.yml 38 | tags: kafka 39 | -------------------------------------------------------------------------------- /ansible/roles/kafka/tasks/zookeeper.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: copy {{ kafka_home }}/config/zookeeper.properties 4 | become: yes 5 | template: 6 | src: templates/zookeeper.properties.j2 7 | dest: "{{ kafka_home }}/config/zookeeper.properties" 8 | notify: "restart zookeeper" 9 | 10 | - name: create {{ zookeeper_data_dir }} 11 | become: yes 12 | file: 13 | path: "{{ zookeeper_data_dir }}" 14 | state: directory 15 | notify: "restart zookeeper" 16 | 17 | - name: create myid in {{ zookeeper_data_dir }} for {{ groups.kafka.index(inventory_hostname) + 1 }} 18 | become: yes 19 | copy: 20 | content: "{{ groups.kafka.index(inventory_hostname) + 1 }}" 21 | dest: "{{ zookeeper_data_dir }}/myid" 22 | force: yes 23 | notify: "restart zookeeper" 24 | 25 | - name: copy zookeeper.environment {{ kafka_home }}/zookeeper.environment 26 | become: yes 27 | template: 28 | src: templates/zookeeper.environment.j2 29 | dest: "{{ kafka_home }}/zookeeper.environment" 30 | notify: "restart zookeeper" 31 | 32 | - name: install zookeeper systemd unit file 33 | become: yes 34 | template: 35 | src: templates/zookeeper.service.j2 36 | dest: /etc/systemd/system/zookeeper.service 37 | notify: "restart zookeeper" 38 | -------------------------------------------------------------------------------- /ansible/roles/kafka/templates/kafka.environment.j2: -------------------------------------------------------------------------------- 1 | JMX_PORT=5555 2 | -------------------------------------------------------------------------------- /ansible/roles/kafka/templates/kafka.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Kafka Service 3 | After=network-online.target 4 | 5 | [Service] 6 | EnvironmentFile={{ kafka_home }}/kafka.environment 7 | Type=forking 8 | User=root 9 | ExecStart={{ kafka_home }}/bin/kafka-server-start.sh -daemon {{ kafka_home }}/config/server.properties 10 | ExecStop={{ kafka_home }}/bin/kafka-server-stop.sh 11 | Restart=on-failure 12 | 13 | [Install] 14 | WantedBy=multi-user.target 15 | -------------------------------------------------------------------------------- /ansible/roles/kafka/templates/server.properties.j2: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License.
You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # see kafka.server.KafkaConfig for additional details and defaults 17 | 18 | ############################# Server Basics ############################# 19 | 20 | # The id of the broker. This must be set to a unique integer for each broker. 21 | broker.id={{ groups.kafka.index(inventory_hostname) }} 22 | 23 | # Disable auto topic creation, default value is true 24 | auto.create.topics.enable=false 25 | 26 | # Switch to enable topic deletion or not, default value is false 27 | #delete.topic.enable=true 28 | 29 | ############################# Socket Server Settings ############################# 30 | 31 | # The address the socket server listens on. It will get the value returned from 32 | # java.net.InetAddress.getCanonicalHostName() if not configured. 33 | # FORMAT: 34 | # listeners = listener_name://host_name:port 35 | # EXAMPLE: 36 | # listeners = PLAINTEXT://your.host.name:9092 37 | listeners=PLAINTEXT://0.0.0.0:9092 38 | 39 | # Hostname and port the broker will advertise to producers and consumers. If not set, 40 | # it uses the value for "listeners" if configured. Otherwise, it will use the value 41 | # returned from java.net.InetAddress.getCanonicalHostName(). 42 | advertised.listeners=PLAINTEXT://{{ inventory_hostname }}:9092 43 | 44 | # Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details 45 | #listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL 46 | 47 | # The number of threads handling network requests 48 | num.network.threads=3 49 | 50 | # The number of threads doing disk I/O 51 | num.io.threads=8 52 | 53 | # The send buffer (SO_SNDBUF) used by the socket server 54 | socket.send.buffer.bytes=102400 55 | 56 | # The receive buffer (SO_RCVBUF) used by the socket server 57 | socket.receive.buffer.bytes=102400 58 | 59 | # The maximum size of a request that the socket server will accept (protection against OOM) 60 | socket.request.max.bytes=104857600 61 | 62 | 63 | ############################# Log Basics ############################# 64 | 65 | # A comma seperated list of directories under which to store log files 66 | log.dirs={{ kafka_log_dir }} 67 | 68 | # The default number of log partitions per topic. More partitions allow greater 69 | # parallelism for consumption, but this will also result in more files across 70 | # the brokers. 71 | num.partitions=1 72 | 73 | # The number of threads per data directory to be used for log recovery at startup and flushing at shutdown. 74 | # This value is recommended to be increased for installations with data dirs located in RAID array. 75 | num.recovery.threads.per.data.dir=1 76 | 77 | ############################# Log Flush Policy ############################# 78 | 79 | # Messages are immediately written to the filesystem but by default we only fsync() to sync 80 | # the OS cache lazily. The following configurations control the flush of data to disk. 81 | # There are a few important trade-offs here: 82 | # 1. 
Durability: Unflushed data may be lost if you are not using replication. 83 | # 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. 84 | # 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to exceessive seeks. 85 | # The settings below allow one to configure the flush policy to flush data after a period of time or 86 | # every N messages (or both). This can be done globally and overridden on a per-topic basis. 87 | 88 | # The number of messages to accept before forcing a flush of data to disk 89 | #log.flush.interval.messages=10000 90 | 91 | # The maximum amount of time a message can sit in a log before we force a flush 92 | log.flush.interval.ms=1000 93 | 94 | ############################# Log Retention Policy ############################# 95 | 96 | # The following configurations control the disposal of log segments. The policy can 97 | # be set to delete segments after a period of time, or after a given size has accumulated. 98 | # A segment will be deleted whenever *either* of these criteria are met. Deletion always happens 99 | # from the end of the log. 100 | 101 | # The minimum age of a log file to be eligible for deletion due to age 102 | log.retention.hours=168 103 | 104 | # A size-based retention policy for logs. Segments are pruned from the log as long as the remaining 105 | # segments don't drop below log.retention.bytes. Functions independently of log.retention.hours. 106 | #log.retention.bytes=1073741824 107 | 108 | # The maximum size of a log segment file. When this size is reached a new log segment will be created. 109 | log.segment.bytes=1073741824 110 | 111 | # The interval at which log segments are checked to see if they can be deleted according 112 | # to the retention policies 113 | log.retention.check.interval.ms=300000 114 | 115 | ############################# Zookeeper ############################# 116 | 117 | # Zookeeper connection string (see zookeeper docs for details). 118 | # This is a comma separated host:port pairs, each corresponding to a zk 119 | # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002". 120 | # You can also append an optional chroot string to the urls to specify the 121 | # root directory for all kafka znodes. 122 | zookeeper.connect={% for server in groups['zookeeper'] %}{% if loop.index > 1%},{% endif %}{{ server }}:2181{% endfor %} 123 | 124 | # Timeout in ms for connecting to zookeeper 125 | zookeeper.connection.timeout.ms=6000 126 | -------------------------------------------------------------------------------- /ansible/roles/kafka/templates/zookeeper.environment.j2: -------------------------------------------------------------------------------- 1 | JMXPORT=5556 2 | -------------------------------------------------------------------------------- /ansible/roles/kafka/templates/zookeeper.properties.j2: -------------------------------------------------------------------------------- 1 | # The number of milliseconds of each tick 2 | tickTime=2000 3 | # The number of ticks that the initial 4 | # synchronization phase can take 5 | initLimit=10 6 | # The number of ticks that can pass between 7 | # sending a request and getting an acknowledgement 8 | syncLimit=5 9 | # the directory where the snapshot is stored. 10 | # do not use /tmp for storage, /tmp here is just 11 | # example sakes. 
12 | dataDir={{ zookeeper_data_dir }} 13 | # the port at which the clients will connect 14 | clientPort=2181 15 | # the maximum number of client connections. 16 | # increase this if you need to handle more clients 17 | #maxClientCnxns=60 18 | # 19 | # Be sure to read the maintenance section of the 20 | # administrator guide before turning on autopurge. 21 | # 22 | # http://zookeeper.apache.org/doc/current/zookeeperAdmin.html#sc_maintenance 23 | # 24 | # The number of snapshots to retain in dataDir 25 | #autopurge.snapRetainCount=3 26 | # Purge task interval in hours 27 | # Set to "0" to disable auto purge feature 28 | #autopurge.purgeInterval=1 29 | # Zookeeper Nodes 30 | {% for item in groups['kafka'] %} 31 | server.{{ loop.index }}={{ item }}:2888:3888 32 | {% endfor %} -------------------------------------------------------------------------------- /ansible/roles/kafka/templates/zookeeper.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Zookeeper Service 3 | After=network-online.target 4 | 5 | [Service] 6 | EnvironmentFile={{ kafka_home }}/zookeeper.environment 7 | Type=forking 8 | User=root 9 | ExecStart={{ kafka_home }}/bin/zookeeper-server-start.sh -daemon {{ kafka_home }}/config/zookeeper.properties 10 | ExecStop={{ kafka_home }}/bin/zookeeper-server-stop.sh 11 | Restart=on-failure 12 | 13 | [Install] 14 | WantedBy=multi-user.target 15 | -------------------------------------------------------------------------------- /ansible/roles/spark/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 3.0.2 3 | spark: spark-{{ version }}-bin-without-hadoop #-scala-2.12 4 | file: "{{ spark }}.tgz" 5 | url: http://ftp-stud.hs-esslingen.de/pub/Mirrors/ftp.apache.org/dist/spark/spark-{{ version }}/{{ file }} 6 | -------------------------------------------------------------------------------- /ansible/roles/spark/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: download {{ url }} 3 | get_url: 4 | url: "{{ url }}" 5 | dest: "/tmp/{{ file }}" 6 | 7 | - name: extract /tmp/{{ file }} 8 | become: yes 9 | become_method: sudo 10 | unarchive: 11 | src: "/tmp/{{ file }}" 12 | dest: "{{ usr_local }}" 13 | copy: no 14 | creates: "{{ spark_home }}" 15 | 16 | - name: set ownership on {{ spark_home }} 17 | become: yes 18 | become_method: sudo 19 | file: 20 | path: "{{ spark_home }}" 21 | owner: vagrant 22 | group: vagrant 23 | recurse: yes 24 | mode: 0755 25 | 26 | - name: set PATH=$PATH:{{ spark_home }}/bin 27 | become: yes 28 | become_method: sudo 29 | lineinfile: 30 | dest: "{{ etc_profiles }}/spark.sh" 31 | create: yes 32 | state: present 33 | regexp: '^PATH' 34 | line: 'PATH=$PATH:{{ spark_home }}/bin' 35 | 36 | - name: set SPARK_LOCAL_IP={{ hostvars[inventory_hostname]['ansible_all_ipv4_addresses'][0] }} 37 | become: yes 38 | become_method: sudo 39 | lineinfile: 40 | dest: "{{ etc_profiles }}/spark.sh" 41 | create: yes 42 | state: present 43 | regexp: '^SPARK_LOCAL_IP' 44 | line: "SPARK_LOCAL_IP={{ hostvars[inventory_hostname]['ansible_all_ipv4_addresses'][0] }}" 45 | 46 | - name: set SPARK_DIST_CLASSPATH=$(hadoop classpath) 47 | become: yes 48 | become_method: sudo 49 | lineinfile: 50 | dest: "{{ etc_profiles }}/spark.sh" 51 | create: yes 52 | state: present 53 | regexp: '^export SPARK_DIST_CLASSPATH' 54 | line: 'export SPARK_DIST_CLASSPATH=$(hadoop classpath)' 55 | 
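The Spark role above only unpacks a "without-hadoop" build and wires it into the cluster through /etc/profile.d/spark.sh (PATH, SPARK_LOCAL_IP and SPARK_DIST_CLASSPATH=$(hadoop classpath)), so jobs are meant to be submitted to YARN rather than to a standalone Spark master. A minimal sketch of such a submission follows; the main class is a hypothetical placeholder and the in-guest path to the exchange folder depends on the Vagrantfile's synced-folder setup, so adjust both to your environment.

```bash
# Log into the node that carries the Spark distribution
vagrant ssh hadoop-1

# Submit to YARN in cluster mode. PATH and SPARK_DIST_CLASSPATH are provided
# by /etc/profile.d/spark.sh, which the role above writes.
# --class and the jar path are placeholders -- adapt them to your setup.
spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --class com.example.SparkPlayground \
  /home/vagrant/exchange/spark-playground.jar
```

In cluster mode the driver runs inside YARN, so progress and logs show up in the ResourceManager UI rather than on the submitting shell.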
-------------------------------------------------------------------------------- /doc/fastdata-cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markush81/fastdata-cluster/96ff28d400c24d208092dee14b3d08d95b65416b/doc/fastdata-cluster.png -------------------------------------------------------------------------------- /doc/flink.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markush81/fastdata-cluster/96ff28d400c24d208092dee14b3d08d95b65416b/doc/flink.png -------------------------------------------------------------------------------- /doc/spark-streaming.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markush81/fastdata-cluster/96ff28d400c24d208092dee14b3d08d95b65416b/doc/spark-streaming.png -------------------------------------------------------------------------------- /doc/yarn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markush81/fastdata-cluster/96ff28d400c24d208092dee14b3d08d95b65416b/doc/yarn.png -------------------------------------------------------------------------------- /exchange/spark-playground.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markush81/fastdata-cluster/96ff28d400c24d208092dee14b3d08d95b65416b/exchange/spark-playground.jar --------------------------------------------------------------------------------
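Every daemon in this section is wrapped in a systemd unit (zookeeper, kafka, hdfs-namenode, hdfs-datanode, yarn-resourcemanager, yarn-nodemanager), so checking whether a role converged mostly comes down to standard systemctl and journalctl calls inside the VMs. A short sketch, assuming the unit names from the templates above; pick the host that actually runs the daemon in question.

```bash
# On a Kafka node: broker plus the co-located ZooKeeper
vagrant ssh kafka-1
systemctl status zookeeper kafka

# On the Hadoop master: HDFS namenode and YARN resourcemanager
vagrant ssh hadoop-1
systemctl status hdfs-namenode yarn-resourcemanager

# Follow a single unit's log if it does not come up (may require sudo)
journalctl -u kafka -f
```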