├── .gitignore
├── LICENSE
├── README.md
├── Vagrantfile
├── bootstrap-master.sh
├── group_vars
│   └── all
├── heat-datanode.yaml
├── heat-hadoop-cluster.yaml
├── heat-inventory.py
├── hosts-dev
├── nodes-dev
├── playbook.yml
└── roles
    ├── common
    │   ├── tasks
    │   │   └── main.yml
    │   ├── templates
    │   │   ├── core-site.xml
    │   │   ├── hadoop_rsa.pub
    │   │   ├── hdfs-site.xml
    │   │   ├── mapred-site.xml
    │   │   └── yarn-site.xml
    │   └── vars
    │       └── main.yml
    ├── master
    │   ├── tasks
    │   │   └── main.yml
    │   ├── templates
    │   │   ├── hadoop_rsa
    │   │   └── slaves
    │   └── vars
    │       └── main.yml
    └── oraclejava8
        ├── tasks
        │   └── main.yml
        └── templates
            └── oracle-pgp-key
/.gitignore:
--------------------------------------------------------------------------------
1 | .vagrant
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Daniel Watrous
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Hadoop multi-node cluster with Ansible
2 | Multi-server deployment of Hadoop using Ansible
3 |
4 | This repository contains a set of Vagrant and Ansible scripts that make it fast and easy to build a fully functional Hadoop cluster, including HDFS, on a single computer using VirtualBox. In order to run the scripts as they are, you will probably need about 16GB RAM and at least 4 CPUs.
5 |
6 | ## Quick Start (Local)
7 |
8 | - Clone this repository
9 | - (optional) Download a binary release of Hadoop (e.g. http://www.apache.org/dyn/closer.cgi/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz), save it to `roles/common/templates/hadoop-2.7.1.tar.gz`, then update `roles/common/tasks/main.yml` to use the alternative local-archive approach
10 | - Open a command prompt to the directory where you cloned the code
11 | - Run `vagrant up`
12 | - Use the commented lines in `bootstrap-master.sh` to do the following
13 | - Run the ansible playbook: `ansible-playbook -i hosts-dev playbook.yml`
14 | - Format the HDFS namenode
15 | - Start DFS and YARN
16 | - Run an example job: `hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar pi 10 30`
17 |
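For reference, the whole local sequence looks roughly like this (a sketch assembled from the commented lines in `bootstrap-master.sh`):

```bash
# on the host machine
vagrant up

# on the master VM (vagrant ssh master)
cd ~/src
ansible-playbook -i hosts-dev playbook.yml

# switch to the hadoop user for the Hadoop commands
sudo su - hadoop
hdfs namenode -format                  # one-time format of the HDFS namenode
/usr/local/hadoop/sbin/start-dfs.sh    # start HDFS daemons on all nodes
/usr/local/hadoop/sbin/start-yarn.sh   # start YARN daemons on all nodes
hdfs dfsadmin -report                  # confirm the datanodes registered
hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar pi 10 30
```
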
18 | ### Additional Details and Explanation
19 |
20 | I wrote a detailed article explaining how to understand and run these scripts, including the expected output and instructions for adapting the process to proxy environments and low-RAM environments. You can find it here:
21 |
22 | http://software.danielwatrous.com/install-and-configure-a-multi-node-hadoop-cluster-using-ansible/
23 |
24 | ## Quick Start (OpenStack)
25 | - Clone this repository
26 | - Open a command prompt to the directory where you cloned the code
27 | - Edit the Vagrantfile and remove the unused datanodes. You may also reduce the memory for the master, which will become your Ansible host.
28 | - Run `vagrant up` (this sets up the virtualenv for openstack connectivity)
29 | - Use the commented lines in `bootstrap-master.sh` to do the following
30 | - Enter the virtualenv with `source ~/venv/bin/activate`
31 | - Download the openrc file from OpenStack and source it to establish your environment
32 | - Use openstack CLI to gather details and update `heat-hadoop-cluster.yaml`
33 | - Update other files for proxy, usernames, etc. if necessary
34 | - Run `heat stack-create hadoop-stack -f heat-hadoop-cluster.yaml`, then the remaining connectivity commands (`python heat-inventory.py`, `source scan-node-keys.sh`)
35 | - Run the ansible playbook: `ansible-playbook -i hosts-pro playbook.yml`
36 | - Format the HDFS namenode
37 | - Start DFS and YARN
38 | - Run an example job: `hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar pi 10 30`
39 |
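The matching command sequence on the master VM, again assembled from the commented lines in `bootstrap-master.sh` (the openrc filename is whatever your OpenStack project provides):

```bash
source ~/venv/bin/activate
cd ~/src
source hadoop-Project-openrc.sh      # your project's openrc file

# gather the IDs/names needed to fill in heat-hadoop-cluster.yaml
openstack keypair list
openstack image list
openstack flavor list
openstack network list
openstack security group list

heat stack-create hadoop-stack -f heat-hadoop-cluster.yaml
cp hadoop.pem ~/.ssh/ && chmod 600 ~/.ssh/hadoop.pem
python heat-inventory.py             # writes hosts-pro, nodes-pro, scan-node-keys.sh
source scan-node-keys.sh             # pre-populate known_hosts
ansible-playbook -i hosts-pro playbook.yml
```
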
40 |
--------------------------------------------------------------------------------
/Vagrantfile:
--------------------------------------------------------------------------------
1 | # -*- mode: ruby -*-
2 | # vi: set ft=ruby :
3 |
4 | #http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/ClusterSetup.html
5 |
6 | PRIVATE_KEY_SOURCE = 'C:\Users\watrous\.vagrant.d\insecure_private_key'
7 | PRIVATE_KEY_DESTINATION = '/home/vagrant/.ssh/id_rsa'
8 | MASTER_IP = '192.168.51.4'
9 | DATA1_IP = '192.168.51.5'
10 | DATA2_IP = '192.168.51.6'
11 |
12 | Vagrant.configure("2") do |config|
13 |
14 | config.ssh.insert_key = false
15 |
16 | # define Master server
17 | config.vm.define "master" do |master|
18 | master.vm.hostname = "hadoop-master"
19 | master.vm.box = "ubuntu/trusty64"
20 | master.vm.synced_folder ".", "/home/vagrant/src", mount_options: ["dmode=775,fmode=664"]
21 | master.vm.network "private_network", ip: MASTER_IP
22 | master.vm.provider "virtualbox" do |v|
23 | v.name = "master"
24 | v.cpus = 2
25 | v.memory = 3072
26 | end
27 | # copy private key so hosts can ssh using key authentication (the script below sets permissions to 600)
28 | master.vm.provision :file do |file|
29 | file.source = PRIVATE_KEY_SOURCE
30 | file.destination = PRIVATE_KEY_DESTINATION
31 | end
32 | master.vm.provision "shell", path: "bootstrap-master.sh"
33 | end
34 |
35 | # define data1 server
36 | config.vm.define "data1" do |data1|
37 | data1.vm.hostname = "hadoop-data1"
38 | data1.vm.box = "ubuntu/trusty64"
39 | data1.vm.network "private_network", ip: DATA1_IP
40 | data1.vm.provider "virtualbox" do |v|
41 | v.name = "data1"
42 | v.cpus = 2
43 | v.memory = 3072
44 | end
45 | # copy private key so hosts can ssh using key authentication (the script below sets permissions to 600)
46 | data1.vm.provision :file do |file|
47 | file.source = PRIVATE_KEY_SOURCE
48 | file.destination = PRIVATE_KEY_DESTINATION
49 | end
50 | end
51 |
52 | # define data2 server
53 | config.vm.define "data2" do |data2|
54 | data2.vm.hostname = "hadoop-data2"
55 | data2.vm.box = "ubuntu/trusty64"
56 | data2.vm.network "private_network", ip: DATA2_IP
57 | data2.vm.provider "virtualbox" do |v|
58 | v.name = "data2"
59 | v.cpus = 2
60 | v.memory = 3072
61 | end
62 | # copy private key so hosts can ssh using key authentication (the script below sets permissions to 600)
63 | data2.vm.provision :file do |file|
64 | file.source = PRIVATE_KEY_SOURCE
65 | file.destination = PRIVATE_KEY_DESTINATION
66 | end
67 | end
68 |
69 | end
70 |
--------------------------------------------------------------------------------
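Note that `PRIVATE_KEY_SOURCE` above is hard-coded to a Windows path. On Linux or macOS hosts the same Vagrant insecure key normally lives under your home directory (a hedged pointer; verify the path on your machine):

```bash
# confirm the insecure key Vagrant ships for initial provisioning,
# then point PRIVATE_KEY_SOURCE in the Vagrantfile at this path
ls ~/.vagrant.d/insecure_private_key
```
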
/bootstrap-master.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #export https_proxy=https://proxy.company.com:8080
4 | #export http_proxy=http://proxy.company.com:8080
5 |
6 | sudo -E apt-get update
7 | sudo -E apt-get install -y unzip python-pip python-virtualenv python-dev
8 | sudo -E pip install ansible
9 |
10 | chmod 600 /home/vagrant/.ssh/id_rsa
11 |
12 | ### Use heat (an orchestration tool to provision in OpenStack) to prepare servers for Hadoop ###
13 | # create python virtualenv in ~/venv
14 | virtualenv venv
15 | chown -R vagrant:vagrant venv
16 | # install heat client
17 | apt-get install -y libffi-dev libssl-dev
18 | /home/vagrant/venv/bin/pip install python-heatclient python-openstackclient pyopenssl ndg-httpsclient pyasn1
19 | # setup environment to connect to openstack using heat
20 | #source ~/venv/bin/activate
21 | #cd ~/src
22 | #source hadoop-Project-openrc.sh
23 | # update heat-hadoop-cluster.yaml
24 | #openstack keypair list
25 | #openstack image list
26 | #openstack flavor list
27 | #openstack network list
28 | #openstack security group list
29 | # update proxy details in group_vars/all
30 | # update remote user in playbook.yml
31 | # update ansible_ssh_(user|private_key_file) in heat-inventory.py
32 | #heat stack-create hadoop-stack -f heat-hadoop-cluster.yaml
33 | #heat output-show hadoop-stack hadoop_master_public_ip 2>&1 | grep -o '[^"]*'
34 | #cp hadoop.pem ~/.ssh/
35 | #chmod 600 ~/.ssh/hadoop.pem
36 | #python heat-inventory.py
37 | #source scan-node-keys.sh
38 | ### End heat ###
39 |
40 | #ansible-playbook -i hosts-dev playbook.yml
41 | ## --OR-- ##
42 | #ansible-playbook -i hosts-pro playbook.yml
43 |
44 | # for openstack, first login to the master before running the remaining commands
45 | #ssh -i ~/.ssh/hadoop.pem ubuntu@[hadoop_master_public_ip]
46 |
47 | #sudo su - hadoop
48 | #hdfs namenode -format
49 | #/usr/local/hadoop/sbin/start-dfs.sh
50 | #hdfs dfsadmin -report
51 | #/usr/local/hadoop/sbin/start-yarn.sh
52 | #/usr/local/hadoop/sbin/stop-dfs.sh
53 | #/usr/local/hadoop/sbin/stop-yarn.sh
54 | #$HADOOP_HOME/sbin/slaves.sh jps
55 | #hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar pi 10 30
56 |
--------------------------------------------------------------------------------
/group_vars/all:
--------------------------------------------------------------------------------
1 | ---
2 | proxy_env:
3 | #http_proxy: http://proxy.company.com:8080
4 | #https_proxy: https://proxy.company.com:8080
5 | none: false
6 |
--------------------------------------------------------------------------------
/heat-datanode.yaml:
--------------------------------------------------------------------------------
1 | heat_template_version: 2013-05-23
2 |
3 | description: >
4 | Template for additional nodes in a Hadoop cluster
5 |
6 | parameters:
7 | hadoop_security_group:
8 | type: string
9 | description: Security group passed from main template
10 | key_name:
11 | type: string
12 | image_id:
13 | type: string
14 | admin_user:
15 | type: string
16 | flavor:
17 | type: string
18 | public_net_id:
19 | type: string
20 | private_net_id:
21 | type: string
22 | private_subnet_id:
23 | type: string
24 | ssh_security_group:
25 | type: string
26 | server_name:
27 | type: string
28 |
29 | resources:
30 | hadoop_data_server:
31 | type: OS::Nova::Server
32 | properties:
33 | name: { get_param: server_name }
34 | admin_user: { get_param: admin_user }
35 | image: { get_param: image_id }
36 | flavor: { get_param: flavor }
37 | key_name: { get_param: key_name }
38 | networks:
39 | - port: { get_resource: hadoop_data_server_port }
40 |
41 | hadoop_data_server_port:
42 | type: OS::Neutron::Port
43 | properties:
44 | network_id: { get_param: private_net_id }
45 | fixed_ips:
46 | - subnet_id: { get_param: private_subnet_id }
47 | security_groups: [{ get_param: hadoop_security_group }, { get_param: ssh_security_group }]
48 |
49 | hadoop_data_server_floating_ip:
50 | type: OS::Neutron::FloatingIP
51 | properties:
52 | floating_network_id: { get_param: public_net_id }
53 | port_id: { get_resource: hadoop_data_server_port }
54 |
55 | outputs:
56 | hadoop_node_private_ip:
57 | description: IP address of Hadoop Data Node in private network
58 | value: [{ get_attr: [ hadoop_data_server, first_address ] }, { get_attr: [ hadoop_data_server, name ] }]
59 | hadoop_node_public_ip:
60 | description: Floating IP address of Hadoop Data Node in public network
61 | value: [{ get_attr: [ hadoop_data_server_floating_ip, floating_ip_address ] }, { get_attr: [ hadoop_data_server, name ] }]
62 |
--------------------------------------------------------------------------------
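Each output above is deliberately a two-element list, `[address, server name]`; `heat-inventory.py` later unpacks these pairs as `node[0]` (IP) and `node[1]` (hostname) when it builds the inventory. Roughly the shape you should see (addresses illustrative):

```bash
heat output-show hadoop-stack nodes_private_ips
# [["172.16.0.11", "hadoop-data-0"], ["172.16.0.12", "hadoop-data-1"]]
```
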
/heat-hadoop-cluster.yaml:
--------------------------------------------------------------------------------
1 | heat_template_version: 2013-05-23
2 |
3 | description: >
4 | HOT template to deploy a Hadoop cluster (a master server plus a group of
5 | data node servers) and assign floating IP addresses to the servers
6 |
7 | parameters:
8 | key_name:
9 | type: string
10 | description: Name of keypair to assign to servers
11 | default: KEYPAIR
12 | image_id:
13 | type: string
14 | description: ID of image to use for servers
15 | default: IMAGEID
16 | admin_user:
17 | type: string
18 | description: Username of admin user
19 | default: ADMINUSER
20 | flavor:
21 | type: string
22 | description: Flavor to use for servers
23 | default: FLAVOR
24 | public_net_id:
25 | type: string
26 | description: >
27 | ID of public network for which floating IP addresses will be allocated
28 | default: PUBNETID
29 | private_net_id:
30 | type: string
31 | description: ID of private network into which servers get deployed
32 | default: PRIVNETID
33 | private_subnet_id:
34 | type: string
35 | description: ID of private sub network into which servers get deployed
36 | default: PRIVSUBNETID
37 | ssh_security_group:
38 | type: string
39 | description: ID of a security-group that allows SSH traffic
40 | default: SSHSECGRPID
41 |
42 | resources:
43 | hadoop_master_server:
44 | type: OS::Nova::Server
45 | properties:
46 | name: hadoop_master
47 | admin_user: { get_param: admin_user }
48 | image: { get_param: image_id }
49 | flavor: { get_param: flavor }
50 | key_name: { get_param: key_name }
51 | networks:
52 | - port: { get_resource: hadoop_master_server_port }
53 |
54 | hadoop_master_server_port:
55 | type: OS::Neutron::Port
56 | properties:
57 | network_id: { get_param: private_net_id }
58 | fixed_ips:
59 | - subnet_id: { get_param: private_subnet_id }
60 | security_groups: [{ get_resource: hadoop_security_group }, { get_param: ssh_security_group }]
61 |
62 | hadoop_master_server_floating_ip:
63 | type: OS::Neutron::FloatingIP
64 | properties:
65 | floating_network_id: { get_param: public_net_id }
66 | port_id: { get_resource: hadoop_master_server_port }
67 |
68 | hadoop_security_group:
69 | type: OS::Neutron::SecurityGroup
70 | properties:
71 | description: Add security group rules for server
72 | name: hadoop-security-group
73 | rules:
74 | - remote_ip_prefix: 0.0.0.0/0
75 | protocol: tcp
76 | port_range_min: 50070
77 | port_range_max: 50070
78 | - remote_ip_prefix: 0.0.0.0/0
79 | protocol: tcp
80 | port_range_min: 50075
81 | port_range_max: 50075
82 | # update the following remote_ip_prefix to match the subnet of your private network
83 | - remote_ip_prefix: 172.16.0.0/24
84 | protocol: tcp
85 | port_range_min: 1
86 | port_range_max: 65535
87 | - remote_ip_prefix: 0.0.0.0/0
88 | protocol: tcp
89 | port_range_min: 8088
90 | port_range_max: 8088
91 | - remote_ip_prefix: 0.0.0.0/0
92 | protocol: icmp
93 |
94 | nodes:
95 | type: OS::Heat::ResourceGroup
96 | properties:
97 | count: 2
98 | resource_def:
99 | type: heat-datanode.yaml
100 | properties:
101 | server_name: hadoop-data-%index%
102 | hadoop_security_group: { get_resource: hadoop_security_group }
103 | key_name: { get_param: key_name }
104 | image_id: { get_param: image_id }
105 | admin_user: { get_param: admin_user }
106 | flavor: { get_param: flavor }
107 | public_net_id: { get_param: public_net_id }
108 | private_net_id: { get_param: private_net_id }
109 | private_subnet_id: { get_param: private_subnet_id }
110 | ssh_security_group: { get_param: ssh_security_group }
111 |
112 | outputs:
113 | hadoop_master_private_ip:
114 | description: IP address of Hadoop Master in private network
115 | value: { get_attr: [ hadoop_master_server, first_address ] }
116 | hadoop_master_public_ip:
117 | description: Floating IP address of Hadoop Master in public network
118 | value: { get_attr: [ hadoop_master_server_floating_ip, floating_ip_address ] }
119 | nodes_public_ips:
120 | description: Public IP for datanode
121 | value: {get_attr: [nodes, hadoop_node_public_ip]}
122 | nodes_private_ips:
123 | description: Private IP for datanode
124 | value: {get_attr: [nodes, hadoop_node_private_ip]}
125 |
--------------------------------------------------------------------------------
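The security group exposes the HDFS NameNode UI (50070), the DataNode UI (50075) and the YARN ResourceManager UI (8088) publicly, plus all TCP within the private subnet. Once the stack and the playbook have both finished, a hedged spot-check from outside the cloud:

```bash
MASTER_IP=$(heat output-show hadoop-stack hadoop_master_public_ip 2>&1 | grep -o '[^"]*')
curl -sI "http://$MASTER_IP:50070" | head -1   # HDFS NameNode web UI
curl -sI "http://$MASTER_IP:8088"  | head -1   # YARN ResourceManager web UI
```
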
/heat-inventory.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-------------------------------------------------------------------------------
3 | # Name: heat_inventory
4 | # Purpose: Generate the Ansible inventory (hosts-pro), node vars (nodes-pro)
5 | #          and an ssh-keyscan script (scan-node-keys.sh) from heat stack outputs
6 | #
7 | # Author: Daniel Watrous
8 | #
9 | # Created: 10/07/2015
10 | # Copyright: (c) HP 2015
11 | #-------------------------------------------------------------------------------
11 |
12 | import json
13 | from string import Template
14 | from textwrap import dedent
15 | import subprocess
16 |
17 | class heat_inventory:
18 |
19 | # output keys
20 | hadoop_master_public_key = "hadoop_master_public_ip"
21 | hadoop_master_private_key = "hadoop_master_private_ip"
22 | hadoop_datanode_public_key = "nodes_public_ips"
23 | hadoop_datanode_private_key = "nodes_private_ips"
24 |
25 | # template values
26 | ansible_ssh_user = "debian"
27 | ansible_ssh_private_key_file = "~/.ssh/hadoop.pem"
28 |
29 | # templates
30 | host_entry = Template('$ipaddress ansible_connection=ssh ansible_ssh_user=$ssh_user ansible_ssh_private_key_file=$private_key_file')
31 | hosts_output = Template("""[hadoop-master]
32 | $master_host
33 |
34 | [hadoop-data]
35 | $data_hosts
36 |
37 | [hadoop-master:vars]
38 | nodesfile=nodes-pro
39 |
40 | [hadoop-data:vars]
41 | nodesfile=nodes-pro""")
42 |
43 | node_entry = Template(""" - hostname: $hostname
44 | ip: $ipaddress""")
45 | nodes_section = Template("""---
46 | nodes:
47 | $nodes
48 | """)
49 | nodes_sshkeyscan = Template('ssh-keyscan -t rsa $ipaddress >> ~/.ssh/known_hosts')
50 |
51 | def __init__(self):
52 | self.load_heat_output()
53 |
54 | def load_heat_output(self):
55 | self.heat_output = json.loads(subprocess.Popen("heat output-show hadoop-stack --all", shell=True, stdout=subprocess.PIPE).stdout.read())
56 |
57 | def get_master_public_ip(self):
58 | for output_item in self.heat_output:
59 | if self.hadoop_master_public_key == output_item['output_key']:
60 | return output_item['output_value']
61 |
62 | def get_master_private_ip(self):
63 | for output_item in self.heat_output:
64 | if self.hadoop_master_private_key == output_item['output_key']:
65 | return output_item['output_value']
66 |
67 | def get_datanode_public_ips(self):
68 | for output_item in self.heat_output:
69 | if self.hadoop_datanode_public_key == output_item['output_key']:
70 | return output_item['output_value']
71 |
72 | def get_datanode_private_ips(self):
73 | for output_item in self.heat_output:
74 | if self.hadoop_datanode_private_key == output_item['output_key']:
75 | return output_item['output_value']
76 |
77 | # Ansible hosts file
78 | def get_host_entry(self, ipaddress):
79 | return self.host_entry.substitute(ipaddress=ipaddress, ssh_user=self.ansible_ssh_user, private_key_file=self.ansible_ssh_private_key_file)
80 |
81 | def get_datanode_host_entries(self):
82 | datanode_hosts = []
83 | for datanode_host in self.get_datanode_public_ips():
84 | datanode_hosts.append(self.get_host_entry(datanode_host[0]))
85 | return "\n".join(datanode_hosts)
86 |
87 | def get_hosts_output(self):
88 | master_host = self.get_host_entry(self.get_master_public_ip())
89 | datanode_hosts = self.get_datanode_host_entries()
90 | return dedent(self.hosts_output.substitute(master_host=master_host, data_hosts=datanode_hosts))
91 |
92 | # Ansible group_vars nodes
93 | def get_node_entry(self, hostname, ipaddress):
94 | return self.node_entry.substitute(hostname=hostname, ipaddress=ipaddress)
95 |
96 | def get_nodes_entries(self):
97 | nodes = []
98 | nodes.append(self.get_node_entry('hadoop-master', self.get_master_private_ip()))
99 | for node in self.get_datanode_private_ips():
100 | nodes.append(self.get_node_entry(node[1], node[0]))
101 | return "\n".join(nodes)
102 |
103 | def get_nodes_output(self):
104 | return self.nodes_section.substitute(nodes=self.get_nodes_entries())
105 |
106 | def get_node_keyscan_script(self):
107 | nodes = []
108 | nodes.append(self.nodes_sshkeyscan.substitute(ipaddress=self.get_master_public_ip()))
109 | for node in self.get_datanode_public_ips():
110 | nodes.append(self.nodes_sshkeyscan.substitute(ipaddress=node[0]))
111 | return "\n".join(nodes)
112 |
113 | def main():
114 | heat_inv = heat_inventory()
115 | ## print "hadoop master public IP: " + heat_inv.get_master_public_ip()
116 | ## print "hadoop master private IP: " + heat_inv.get_master_private_ip()
117 | ## print "hadoop datanode private IP: " + ', '.join(heat_inv.get_datanode_private_ips())
118 | ## print "hadoop datanode public IP: " + ', '.join(heat_inv.get_datanode_public_ips())
119 | inventory_file = open('hosts-pro', 'w')
120 | nodes_file = open('nodes-pro', 'w')
121 | inventory_file.write(heat_inv.get_hosts_output())
122 | nodes_file.write(heat_inv.get_nodes_output())
123 | inventory_file.close()
124 | nodes_file.close()
125 | keyscan_script_file = open('scan-node-keys.sh', 'w')
126 | keyscan_script_file.write(heat_inv.get_node_keyscan_script())
127 | keyscan_script_file.close()
128 |
129 | if __name__ == '__main__':
130 | main()
131 |
132 |
--------------------------------------------------------------------------------
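Running the script writes three files alongside it; their contents follow directly from the templates above (values illustrative):

```bash
python heat-inventory.py
head -n 2 hosts-pro
# [hadoop-master]
# 15.0.0.10 ansible_connection=ssh ansible_ssh_user=debian ansible_ssh_private_key_file=~/.ssh/hadoop.pem
head -n 1 scan-node-keys.sh
# ssh-keyscan -t rsa 15.0.0.10 >> ~/.ssh/known_hosts
```
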
/hosts-dev:
--------------------------------------------------------------------------------
1 | [hadoop-master]
2 | 192.168.51.4 ansible_connection=local
3 |
4 | [hadoop-data]
5 | 192.168.51.5 ansible_connection=ssh ansible_ssh_user=vagrant ansible_ssh_private_key_file=~/.ssh/id_rsa
6 | 192.168.51.6 ansible_connection=ssh ansible_ssh_user=vagrant ansible_ssh_private_key_file=~/.ssh/id_rsa
7 |
8 | [hadoop-master:vars]
9 | nodesfile=nodes-dev
10 |
11 | [hadoop-data:vars]
12 | nodesfile=nodes-dev
--------------------------------------------------------------------------------
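Before running the playbook, the master's connectivity to the data nodes can be confirmed with a standard Ansible ad-hoc ping (run on the master VM):

```bash
ansible -i hosts-dev hadoop-data -m ping
# both 192.168.51.5 and 192.168.51.6 should answer "pong"
```
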
/nodes-dev:
--------------------------------------------------------------------------------
1 | nodes:
2 | - hostname: hadoop-master
3 | ip: 192.168.51.4
4 | - hostname: hadoop-data1
5 | ip: 192.168.51.5
6 | - hostname: hadoop-data2
7 | ip: 192.168.51.6
--------------------------------------------------------------------------------
/playbook.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - name: Install hadoop master node
3 | hosts: hadoop-master
4 | remote_user: ubuntu
5 | sudo: yes
6 |
7 | roles:
8 | - common
9 | - oraclejava8
10 | - master
11 |
12 | - name: Install hadoop data nodes
13 | hosts: hadoop-data
14 | remote_user: ubuntu
15 | sudo: yes
16 |
17 | roles:
18 | - common
19 | - oraclejava8
20 | # - data
21 |
--------------------------------------------------------------------------------
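To preview what the play will do before touching the nodes, the standard listing flags work here:

```bash
ansible-playbook -i hosts-dev playbook.yml --list-hosts
ansible-playbook -i hosts-dev playbook.yml --list-tasks
```
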
/roles/common/tasks/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - include_vars: "{{ nodesfile }}"
3 |
4 | - group: name={{ hadoop_group }} state=present
5 | - user: name={{ hadoop_user }} comment="Hadoop" group={{ hadoop_group }} shell=/bin/bash
6 |
7 | - authorized_key: user={{ hadoop_user }} key="{{ lookup('file', '../templates/hadoop_rsa.pub') }}"
8 |
9 | # this is a bandwidth heavy task which downloads hadoop binaries to each node
10 | - name: Download hadoop
11 | get_url: url={{ hadoop_download_url }} dest=/home/{{ hadoop_user }}/hadoop-2.7.1.tar.gz
12 | environment: proxy_env
13 |
14 | - name: Extract hadoop archive
15 | unarchive: src=/home/{{ hadoop_user }}/hadoop-2.7.1.tar.gz dest=/usr/local owner={{ hadoop_user }} group={{ hadoop_group }} creates=/usr/local/hadoop copy=no
16 |
17 | # this is an alternative for the local deployment where the hadoop binary can be cached locally
18 | #- name: unpack hadoop
19 | # unarchive: src=/home/vagrant/src/roles/common/templates/hadoop-2.7.1.tar.gz dest=/usr/local owner={{ hadoop_user}} group={{ hadoop_group }} creates=/usr/local/hadoop
20 |
21 | - command: mv /usr/local/hadoop-2.7.1 /usr/local/hadoop creates=/usr/local/hadoop removes=/usr/local/hadoop-2.7.1
22 |
23 | - lineinfile: dest=/home/hadoop/.bashrc regexp="HADOOP_HOME=" line="export HADOOP_HOME=/usr/local/hadoop"
24 | - lineinfile: dest=/home/hadoop/.bashrc regexp="PATH=" line="export PATH=$PATH:$HADOOP_HOME/bin"
25 | - lineinfile: dest=/home/hadoop/.bashrc regexp="HADOOP_SSH_OPTS=" line="export HADOOP_SSH_OPTS=\"-i /home/{{ hadoop_user }}/.ssh/hadoop_rsa\""
26 |
27 | # Idempotent way to build a /etc/hosts file with Ansible using your Ansible hosts inventory for a source.
28 | # Will include all hosts the playbook is run on.
29 | # Inspired from http://xmeblog.blogspot.com/2013/06/ansible-dynamicaly-update-etchosts.html
30 |
31 | - name: "Build hosts file"
32 | lineinfile: dest=/etc/hosts regexp='{{ item.ip }}' line="{{ item.ip }} {{ item.hostname }}" state=present
33 | with_items: "{{ nodes }}"
34 |
35 | - lineinfile: dest=/etc/hosts regexp='127.0.1.1' state=absent
36 |
37 | - file: path=/home/{{ hadoop_user }}/tmp state=directory owner={{ hadoop_user }} group={{ hadoop_group }} mode=750
38 | - file: path=/home/{{ hadoop_user }}/hadoop-data/hdfs/namenode state=directory owner={{ hadoop_user }} group={{ hadoop_group }} mode=750
39 | - file: path=/home/{{ hadoop_user }}/hadoop-data/hdfs/datanode state=directory owner={{ hadoop_user }} group={{ hadoop_group }} mode=750
40 |
41 | - name: Add the service scripts
42 | template: src={{ item.src }} dest={{ item.dest }} owner={{ hadoop_user }} group={{ hadoop_group }}
43 | with_items:
44 | - {src: "core-site.xml", dest: "/usr/local/hadoop/etc/hadoop/core-site.xml"}
45 | - {src: "hdfs-site.xml", dest: "/usr/local/hadoop/etc/hadoop/hdfs-site.xml"}
46 | - {src: "yarn-site.xml", dest: "/usr/local/hadoop/etc/hadoop/yarn-site.xml"}
47 | - {src: "mapred-site.xml", dest: "/usr/local/hadoop/etc/hadoop/mapred-site.xml"}
48 |
49 | - lineinfile: dest=/usr/local/hadoop/etc/hadoop/hadoop-env.sh regexp="^export JAVA_HOME" line="export JAVA_HOME=/usr/lib/jvm/java-8-oracle"
50 |
--------------------------------------------------------------------------------
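After this role has run, each node should look roughly like this (a hedged smoke test, not part of the role):

```bash
sudo su - hadoop
echo $HADOOP_HOME                            # /usr/local/hadoop
grep hadoop /etc/hosts                       # one line per cluster node
ls /usr/local/hadoop/etc/hadoop/*-site.xml   # the four templated config files
```
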
/roles/common/templates/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 |
4 | <!-- Put site-specific property overrides in this file. -->
5 |
6 | <configuration>
7 |
8 | <property>
9 |   <name>hadoop.tmp.dir</name>
10 |   <value>/home/{{ hadoop_user }}/tmp</value>
11 |   <description>A base for other temporary directories.</description>
12 | </property>
13 |
14 | <property>
15 |   <name>fs.default.name</name>
16 |   <value>hdfs://{{ nodes[0]['hostname'] }}:54310</value>
17 |   <description>The name of the default file system. A URI whose
18 |   scheme and authority determine the FileSystem implementation. The
19 |   uri's scheme determines the config property (fs.SCHEME.impl) naming
20 |   the FileSystem implementation class. The uri's authority is used to
21 |   determine the host, port, etc. for a filesystem.</description>
22 | </property>
23 |
24 | </configuration>
--------------------------------------------------------------------------------
/roles/common/templates/hadoop_rsa.pub:
--------------------------------------------------------------------------------
1 | ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDWeJfgWx7hDeZUJOeaIVzcbmYxzMcWfxhgC2975tvGL5BV6unzLz8ZVak6ju++AvnM5mcQp6Ydv73uWyaoQaFZigAzfuenruQkwc7D5YYuba+FgZdQ8VHon29oQA3iaZWG7xTspagrfq3fcqaz2ZIjzqN+E/MtcW08PwfibN2QRWchBCuZ1Q8AmrW7gClzMcgd/uj3TstabspGaaZMCs8aC9JWzZlMMegXKYHvVQs6xH2AmifpKpLoMTdO8jP4jczmGebPzvaXmvVylgwo6bRJ3tyYAmGwx8PHj2EVVQ0XX9ipgixLyAa2c7+/crPpGmKFRrYibCCT6x65px7nWnn3
--------------------------------------------------------------------------------
/roles/common/templates/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 |
4 | <!-- Put site-specific property overrides in this file. -->
5 |
6 | <configuration>
7 |
8 | <property>
9 |   <name>dfs.replication</name>
10 |   <value>2</value>
11 |   <description>Default block replication.
12 |   The actual number of replications can be specified when the file is created.
13 |   The default is used if replication is not specified in create time.
14 |   </description>
15 | </property>
16 |
17 | <property>
18 |   <name>dfs.namenode.name.dir</name>
19 |   <value>/home/{{ hadoop_user }}/hadoop-data/hdfs/namenode</value>
20 |   <description>Determines where on the local filesystem the DFS name node should store the name table(fsimage). If this is a comma-delimited list of directories then the name table is replicated in all of the directories, for redundancy.</description>
21 | </property>
22 |
23 | <property>
24 |   <name>dfs.datanode.data.dir</name>
25 |   <value>/home/{{ hadoop_user }}/hadoop-data/hdfs/datanode</value>
26 |   <description>Determines where on the local filesystem a DFS data node should store its blocks. If this is a comma-delimited list of directories, then data will be stored in all named directories, typically on different devices. Directories that do not exist are ignored.</description>
27 | </property>
28 |
29 | </configuration>
--------------------------------------------------------------------------------
/roles/common/templates/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 |
4 | <!-- Put site-specific property overrides in this file. -->
5 |
6 | <configuration>
7 |
8 | <property>
9 |   <name>mapred.job.tracker</name>
10 |   <value>{{ nodes[0]['hostname'] }}:54311</value>
11 |   <description>The host and port that the MapReduce job tracker runs
12 |   at. If "local", then jobs are run in-process as a single map
13 |   and reduce task.
14 |   </description>
15 | </property>
16 |
17 | <property>
18 |   <name>mapreduce.framework.name</name>
19 |   <value>yarn</value>
20 | </property>
21 |
22 | </configuration>
--------------------------------------------------------------------------------
/roles/common/templates/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 |
4 | <!-- Put site-specific property overrides in this file. -->
5 |
6 | <configuration>
7 |
8 | <property>
9 |   <name>yarn.nodemanager.aux-services</name>
10 |   <value>mapreduce_shuffle</value>
11 | </property>
12 | <property>
13 |   <name>yarn.resourcemanager.scheduler.address</name>
14 |   <value>{{ nodes[0]['hostname'] }}:8030</value>
15 | </property>
16 | <property>
17 |   <name>yarn.resourcemanager.address</name>
18 |   <value>{{ nodes[0]['hostname'] }}:8032</value>
19 | </property>
20 | <property>
21 |   <name>yarn.resourcemanager.webapp.address</name>
22 |   <value>{{ nodes[0]['hostname'] }}:8088</value>
23 | </property>
24 | <property>
25 |   <name>yarn.resourcemanager.resource-tracker.address</name>
26 |   <value>{{ nodes[0]['hostname'] }}:8031</value>
27 | </property>
28 | <property>
29 |   <name>yarn.resourcemanager.admin.address</name>
30 |   <value>{{ nodes[0]['hostname'] }}:8033</value>
31 | </property>
32 |
33 | </configuration>
--------------------------------------------------------------------------------
/roles/common/vars/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | hadoop_user: hadoop
3 | hadoop_group: hadoop
4 | hadoop_download_url: http://apache.cs.utah.edu/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz
--------------------------------------------------------------------------------
/roles/master/tasks/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - include_vars: "{{ nodesfile }}"
3 |
4 | - name: Copy private key into place
5 | template: src=hadoop_rsa dest=/home/{{ hadoop_user }}/.ssh/hadoop_rsa owner={{ hadoop_user }} group={{ hadoop_group }} mode=0600
6 |
7 | - name: Copy slaves into place
8 | template: src=slaves dest=/usr/local/hadoop/etc/hadoop/slaves owner={{ hadoop_user }} group={{ hadoop_group }}
9 |
10 | - name: prepare known_hosts entries
11 | shell: ssh-keyscan -t rsa {{ item.hostname }}
12 | with_items: "{{ nodes }}"
13 | register: keyscans
14 |
15 | - name: prepare known_hosts
16 | lineinfile:
17 | dest=/home/{{ hadoop_user }}/.ssh/known_hosts
18 | create=yes
19 | state=present
20 | line="{{ item.stdout }}"
21 | regexp="^{{ item.item.hostname }}"
22 | owner={{ hadoop_user }}
23 | group={{ hadoop_group }}
24 | with_items: "{{ keyscans.results }}"
25 |
26 | - name: prepare known_hosts entries
27 | shell: ssh-keyscan -t rsa 0.0.0.0
28 | register: keyscan_0_0_0_0
29 |
30 | - name: add 0.0.0.0 to known_hosts for secondary namenode
31 | lineinfile:
32 | dest=/home/{{ hadoop_user }}/.ssh/known_hosts
33 | create=yes
34 | state=present
35 | line="{{ keyscan_0_0_0_0.stdout }}"
36 | regexp="^0.0.0.0"
37 | owner={{ hadoop_user }}
38 | group={{ hadoop_group }}
39 |
--------------------------------------------------------------------------------
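The keyscan tasks exist so the Hadoop start scripts can ssh to every node (and to 0.0.0.0 for the secondary namenode) without an interactive host-key prompt. Per host they capture a single line of this shape (key shortened):

```bash
ssh-keyscan -t rsa hadoop-data1
# hadoop-data1 ssh-rsa AAAAB3Nza...
```
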
/roles/master/templates/hadoop_rsa:
--------------------------------------------------------------------------------
1 | -----BEGIN RSA PRIVATE KEY-----
2 | MIIEogIBAAKCAQEA1niX4Fse4Q3mVCTnmiFc3G5mMczHFn8YYAtve+bbxi+QVerp
3 | 8y8/GVWpOo7vvgL5zOZnEKemHb+97lsmqEGhWYoAM37np67kJMHOw+WGLm2vhYGX
4 | UPFR6J9vaEAN4mmVhu8U7KWoK36t33Kms9mSI86jfhPzLXFtPD8H4mzdkEVnIQQr
5 | mdUPAJq1u4ApczHIHf7o907LWm7KRmmmTArPGgvSVs2ZTDHoFymB71ULOsR9gJon
6 | 6SqS6DE3TvIz+I3M5hnmz872l5r1cpYMKOm0Sd7cmAJhsMfDx49hFVUNF1/YqYIs
7 | S8gGtnO/v3Kz6RpihUa2Imwgk+seuace51p59wIDAQABAoIBAAnbWP+SsISfYuqT
8 | F6HZzq7C1CeNxcie5rvjSqhjH86n+PY56a6niTrc2P47+4C8Gkb0f+AgFF5hpZUC
9 | 03AANC4tTX0QZA0s2Idn8NzH+7a+i8aTAiVSMfWC/lfIOdsiByFAc7G8m1IEbHFM
10 | j6TMZw8GPDVbLV31TJPsQayCXFx4/oEgO2WoIukuHOlWYvNcxhOHe0dWZbI2fG0n
11 | pj9kE5YEkbmV/KBLJm4TnCRzt3DkcwTUNSxPHoVipCEMrMAzi7aNLoZOlx5txXxl
12 | nvSc0DHbh/WO9ZzvRDdICDYMSonunji3SffmrYj+2rTBn9/4RmNXgBE6eIAyGA6H
13 | 8+ZatIECgYEA+0iXpewZsiAs+NvOOoxS2w5EhOsqOJcBFHGpK6uraYot+/SvDCbH
14 | aKin2u/pELjBN83ujGwjoVbQuLd9qMkjNAvP54kWkr9UnfFg0xr1DKGusZn4vafr
15 | 3D/L33I9SWu9sd0M9ehaqi8IXjEhyXFjTepc76y/MSzc/GhqejOpOqcCgYEA2n8e
16 | SKmxHYeby+EdVyACgD6DE5L4SfUMl/GkpueRzLxjzw+HwU7papbKN9I3obkZCcED
17 | 1I9cYFs47coK/7i9jtCxmPzZ7RuDi1YbcJ7N1E51XjxYPrtFMsmyjQT2ZaYYcwJi
18 | oDBgUgR/H/lhLlOqTRLDaJCoXljlUNcvhIxfwDECgYAOEVlUE6y4mS6LXBrmkjaN
19 | RvY1DslVU7bewyI6bKjzBqZeZatrHbyqMk77ZdUBd6ZxjljdiaxQ+wBCZxtk2KXc
20 | 4xBnoPgfjRCaqwnFp8uyQlb6YGlZjM4ajYPHZajitGJxU7PHDJQO1QU96LhAg9WQ
21 | Zh9/14b4pM9WXi4f20aNHwKBgDcF/CJgWXIOTHWzYyglr3uMQoYTZON7CojZGZXO
22 | izD74EasqotLzAxDnPkhv1DCxENokAZfc+vwDQ4U62AqrdHhTFgylM7ZNu9H5/Ec
23 | jMkCWRw6Jh7mGi8IDCbnMQW4JkAQQbYFeLwu1bS/oHGCMEjoiL00cLIRwO3ywsym
24 | bRaBAoGAOMrZunDk0Go4G+OmE4gn9fKsViTm8wSfMGqMKQDNI8BibRfkHnieqW41
25 | qtZ65mc/5xFFvb01Eb1K0e2i+DpdhOlnc7e34jyRH9Ac15V7eDuLm3eYdNjghkVT
26 | /PQeITGQ3KAwki/UC8BHzmTL+dWpNBB4InrBgZ4nk5Atiq6YUFk=
27 | -----END RSA PRIVATE KEY-----
--------------------------------------------------------------------------------
/roles/master/templates/slaves:
--------------------------------------------------------------------------------
1 | {% for host in nodes %}
2 | {{ host['hostname'] }}
3 | {% endfor %}
--------------------------------------------------------------------------------
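Rendered against nodes-dev, the slaves file lists every node, which means the master also runs a DataNode alongside the NameNode:

```bash
cat /usr/local/hadoop/etc/hadoop/slaves
# hadoop-master
# hadoop-data1
# hadoop-data2
```
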
/roles/master/vars/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | ansible_user: vagrant
--------------------------------------------------------------------------------
/roles/oraclejava8/tasks/main.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - apt_repository: repo='deb http://ppa.launchpad.net/webupd8team/java/ubuntu trusty main' state=present
3 | environment: proxy_env
4 | - apt_repository: repo='deb-src http://ppa.launchpad.net/webupd8team/java/ubuntu trusty main' state=present
5 | environment: proxy_env
6 |
7 | - debconf: name='oracle-java8-installer' question='shared/accepted-oracle-license-v1-1' value='true' vtype='select' unseen=false
8 |
9 | - name: Copy PGP key for Oracle into place
10 | template: src=oracle-pgp-key dest=/home/{{ hadoop_user }}/.ssh/oracle-pgp-key owner={{ hadoop_user }} mode=0600
11 |
12 | - apt_key: file=/home/{{ hadoop_user }}/.ssh/oracle-pgp-key
13 |
14 | - name: Install Java
15 | apt: pkg=oracle-java8-installer state=installed update_cache=true
16 | environment: proxy_env
17 |
18 | - lineinfile: dest=/home/hadoop/.bashrc regexp="^export JAVA_HOME" line="export JAVA_HOME=/usr/lib/jvm/java-8-oracle"
19 |
--------------------------------------------------------------------------------
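Once the role completes, the hadoop user should resolve Java as follows (a hedged check; the exact 1.8.0 update number will vary):

```bash
sudo su - hadoop
java -version        # java version "1.8.0_..."
echo $JAVA_HOME      # /usr/lib/jvm/java-8-oracle
```
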
/roles/oraclejava8/templates/oracle-pgp-key:
--------------------------------------------------------------------------------
1 | -----BEGIN PGP PUBLIC KEY BLOCK-----
2 | Version: SKS 1.1.5
3 | Comment: Hostname: keyserver.ubuntu.com
4 |
5 | mI0ES9/P3AEEAPbI+9BwCbJucuC78iUeOPKl/HjAXGV49FGat0PcwfDd69MVp6zUtIMbLgkU
6 | OxIlhiEkDmlYkwWVS8qy276hNg9YKZP37ut5+GPObuS6ZWLpwwNus5PhLvqeGawVJ/obu7d7
7 | gM8mBWTgvk0ErnZDaqaU2OZtHataxbdeW8qH/9FJABEBAAG0DUxhdW5jaHBhZCBWTEOItgQT
8 | AQIAIAUCS9/P3AIbAwYLCQgHAwIEFQIIAwQWAgMBAh4BAheAAAoJEMJRgkjuoUiG5wYEANCd
9 | jhXXEpPUbP7cRGXL6cFvrUFKpHHopSC9NIQ9qxJVlUK2NjkzCCFhTxPSHU8LHapKKvie3e+l
10 | kvWW5bbFN3IuQUKttsgBkQe2aNdGBC7dVRxKSAcx2fjqP/s32q1lRxdDRM6xlQlEA1j94ewG
11 | 9SDVwGbdGcJ43gLxBmuKvUJ4
12 | =0Cp+
13 | -----END PGP PUBLIC KEY BLOCK-----
--------------------------------------------------------------------------------