├── .gitignore
├── LICENSE
├── README.md
├── Vagrantfile
├── bootstrap-master.sh
├── group_vars
│   └── all
├── heat-datanode.yaml
├── heat-hadoop-cluster.yaml
├── heat-inventory.py
├── hosts-dev
├── nodes-dev
├── playbook.yml
└── roles
    ├── common
    │   ├── tasks
    │   │   └── main.yml
    │   ├── templates
    │   │   ├── core-site.xml
    │   │   ├── hadoop_rsa.pub
    │   │   ├── hdfs-site.xml
    │   │   ├── mapred-site.xml
    │   │   └── yarn-site.xml
    │   └── vars
    │       └── main.yml
    ├── master
    │   ├── tasks
    │   │   └── main.yml
    │   ├── templates
    │   │   ├── hadoop_rsa
    │   │   └── slaves
    │   └── vars
    │       └── main.yml
    └── oraclejava8
        ├── tasks
        │   └── main.yml
        └── templates
            └── oracle-pgp-key

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.vagrant

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Daniel Watrous

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Hadoop multi-node cluster with Ansible
Multi-server deployment of Hadoop using Ansible

This repository contains a set of Vagrant and Ansible scripts that make it fast and easy to build a fully functional Hadoop cluster, including HDFS, on a single computer using VirtualBox. In order to run the scripts as they are, you will probably need about 16GB of RAM and at least 4 CPUs.

## Quick Start (Local)

- Clone this repository
- (optional) Download a binary release of hadoop (e.g. http://www.apache.org/dyn/closer.cgi/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz) and save it to `roles/common/templates/hadoop-2.7.1.tar.gz`, then update `roles/common/tasks/main.yml` to use the alternative approach
- Open a command prompt in the directory where you cloned the code
- Run `vagrant up`
- Use the commented lines in `bootstrap-master.sh` to do the following (a condensed session appears at the end of this README):
  - Run the ansible playbook: `ansible-playbook -i hosts-dev playbook.yml`
  - Format the HDFS namenode
  - Start DFS and YARN
  - Run an example job: `hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar pi 10 30`

### Additional Details and Explanation

I wrote up a detailed article about how to understand and run these scripts. It includes the expected output and instructions for adapting the process to proxy environments and low-RAM environments. You can find it here:

http://software.danielwatrous.com/install-and-configure-a-multi-node-hadoop-cluster-using-ansible/

## Quick Start (OpenStack)

- Clone this repository
- Open a command prompt in the directory where you cloned the code
- Edit the Vagrantfile and remove the unused datanodes. You may also reduce the memory for the master, which will become your Ansible host.
- Run `vagrant up` (this sets up the virtualenv for OpenStack connectivity)
- Use the commented lines in `bootstrap-master.sh` to do the following:
  - Enter the virtualenv with `source ~/venv/bin/activate`
  - Download your openrc file from OpenStack and source it to establish your environment
  - Use the openstack CLI to gather details and update `heat-hadoop-cluster.yaml`
  - Update other files for proxy, usernames, etc. if necessary
  - Run `heat stack-create hadoop-stack -f heat-hadoop-cluster.yaml` and the other connectivity commands
  - Run the ansible playbook: `ansible-playbook -i hosts-pro playbook.yml`
  - Format the HDFS namenode
  - Start DFS and YARN
  - Run an example job: `hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar pi 10 30`
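### Condensed local session

For orientation, the whole local flow condenses to a session like the following. This is a sketch assuming the default IPs and Hadoop 2.7.1; every command comes from `bootstrap-master.sh`, and the playbook runs on the master, where this repository is synced to `~/src`:

```bash
vagrant up                      # brings up master, data1 and data2; provisions the master
vagrant ssh master

# on the master VM
cd ~/src
ansible-playbook -i hosts-dev playbook.yml
sudo su - hadoop
hdfs namenode -format
/usr/local/hadoop/sbin/start-dfs.sh
/usr/local/hadoop/sbin/start-yarn.sh
hdfs dfsadmin -report           # both datanodes should report in
hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar pi 10 30
```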
"virtualbox" do |v| 58 | v.name = "data2" 59 | v.cpus = 2 60 | v.memory = 3072 61 | end 62 | # copy private key so hosts can ssh using key authentication (the script below sets permissions to 600) 63 | data2.vm.provision :file do |file| 64 | file.source = PRIVATE_KEY_SOURCE 65 | file.destination = PRIVATE_KEY_DESTINATION 66 | end 67 | end 68 | 69 | end 70 | -------------------------------------------------------------------------------- /bootstrap-master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #export https_proxy=https://proxy.company.com:8080 4 | #export http_proxy=http://proxy.company.com:8080 5 | 6 | sudo -E apt-get update 7 | sudo -E apt-get install -y unzip python-pip python-virtualenv python-dev 8 | sudo -E pip install ansible 9 | 10 | chmod 600 /home/vagrant/.ssh/id_rsa 11 | 12 | ### Use heat (an orchestration tool to provision in OpenStack) to prepare servers for Hadoop ### 13 | # create python virtualenv in ~/venv 14 | virtualenv venv 15 | chown -R vagrant:vagrant venv 16 | # install heat client 17 | apt-get install -y libffi-dev libssl-dev 18 | /home/vagrant/venv/bin/pip install python-heatclient python-openstackclient pyopenssl ndg-httpsclient pyasn1 19 | # setup environment to connect to openstack using heat 20 | #source ~/venv/bin/activate 21 | #cd ~/src 22 | #source hadoop-Project-openrc.sh 23 | # update heat-hadoop-cluster.yaml 24 | #openstack keypair list 25 | #openstack image list 26 | #openstack flavor list 27 | #openstack network list 28 | #openstack security group list 29 | # update proxy details in group_vars/all 30 | # update remote user in playbook.yml 31 | # update ansible_ssh_(user|private_key_file) in heat-inventory.py 32 | #heat stack-create hadoop-stack -f heat-hadoop-cluster.yaml 33 | #heat output-show hadoop-stack hadoop_master_public_ip 2>&1 | grep -o '[^"]*' 34 | #cp hadoop.pem ~/.ssh/ 35 | #chmod 600 ~/.ssh/hadoop.pem 36 | #python heat-inventory.py 37 | #source scan-node-keys.sh 38 | ### End heat ### 39 | 40 | #ansible-playbook -i hosts-dev playbook.yml 41 | ## --OR-- ## 42 | #ansible-playbook -i hosts-pro playbook.yml 43 | 44 | # for openstack, first login to the master before running the remaining commands 45 | #ssh -i ~/.ssh/hadoop.pem ubuntu@[hadoop_master_public_ip] 46 | 47 | #sudo su - hadoop 48 | #hdfs namenode -format 49 | #/usr/local/hadoop/sbin/start-dfs.sh 50 | #hdfs dfsadmin -report 51 | #/usr/local/hadoop/sbin/start-yarn.sh 52 | #/usr/local/hadoop/sbin/stop-dfs.sh 53 | #/usr/local/hadoop/sbin/stop-yarn.sh 54 | #$HADOOP_HOME/sbin/slaves.sh jps 55 | #hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar pi 10 30 56 | -------------------------------------------------------------------------------- /group_vars/all: -------------------------------------------------------------------------------- 1 | --- 2 | proxy_env: 3 | #http_proxy: http://proxy.company.com:8080 4 | #https_proxy: https://proxy.company.com:8080 5 | none: false 6 | -------------------------------------------------------------------------------- /heat-datanode.yaml: -------------------------------------------------------------------------------- 1 | heat_template_version: 2013-05-23 2 | 3 | description: > 4 | Template for additional nodes in a Hadoop cluster 5 | 6 | parameters: 7 | hadoop_security_group: 8 | type: string 9 | description: Security group passed from main template 10 | key_name: 11 | type: string 12 | image_id: 13 | type: string 14 | admin_user: 15 | type: string 16 | 
--------------------------------------------------------------------------------
/bootstrap-master.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#export https_proxy=https://proxy.company.com:8080
#export http_proxy=http://proxy.company.com:8080

sudo -E apt-get update
sudo -E apt-get install -y unzip python-pip python-virtualenv python-dev
sudo -E pip install ansible

chmod 600 /home/vagrant/.ssh/id_rsa

### Use heat (an orchestration tool to provision in OpenStack) to prepare servers for Hadoop ###
# create python virtualenv in ~/venv
virtualenv venv
chown -R vagrant:vagrant venv
# install heat client
apt-get install -y libffi-dev libssl-dev
/home/vagrant/venv/bin/pip install python-heatclient python-openstackclient pyopenssl ndg-httpsclient pyasn1
# set up the environment to connect to openstack using heat
#source ~/venv/bin/activate
#cd ~/src
#source hadoop-Project-openrc.sh
# update heat-hadoop-cluster.yaml
#openstack keypair list
#openstack image list
#openstack flavor list
#openstack network list
#openstack security group list
# update proxy details in group_vars/all
# update remote user in playbook.yml
# update ansible_ssh_(user|private_key_file) in heat-inventory.py
#heat stack-create hadoop-stack -f heat-hadoop-cluster.yaml
#heat output-show hadoop-stack hadoop_master_public_ip 2>&1 | grep -o '[^"]*'
#cp hadoop.pem ~/.ssh/
#chmod 600 ~/.ssh/hadoop.pem
#python heat-inventory.py
#source scan-node-keys.sh
### End heat ###

#ansible-playbook -i hosts-dev playbook.yml
## --OR-- ##
#ansible-playbook -i hosts-pro playbook.yml

# for openstack, first log in to the master before running the remaining commands
#ssh -i ~/.ssh/hadoop.pem ubuntu@[hadoop_master_public_ip]

#sudo su - hadoop
#hdfs namenode -format
#/usr/local/hadoop/sbin/start-dfs.sh
#hdfs dfsadmin -report
#/usr/local/hadoop/sbin/start-yarn.sh
#/usr/local/hadoop/sbin/stop-dfs.sh
#/usr/local/hadoop/sbin/stop-yarn.sh
#$HADOOP_HOME/sbin/slaves.sh jps
#hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar pi 10 30

--------------------------------------------------------------------------------
/group_vars/all:
--------------------------------------------------------------------------------
---
proxy_env:
  #http_proxy: http://proxy.company.com:8080
  #https_proxy: https://proxy.company.com:8080
  none: false
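The `proxy_env` mapping is consumed by the `environment: proxy_env` lines in the roles below; the `none: false` entry appears to exist only so the mapping is never empty. To run behind a proxy, uncomment the two entries with your own values. A sketch, using placeholder proxy hostnames:

```bash
# write a proxied group_vars/all (proxy host/port are placeholders)
cat > group_vars/all <<'EOF'
---
proxy_env:
  http_proxy: http://proxy.company.com:8080
  https_proxy: https://proxy.company.com:8080
EOF
```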
--------------------------------------------------------------------------------
/heat-datanode.yaml:
--------------------------------------------------------------------------------
heat_template_version: 2013-05-23

description: >
  Template for additional nodes in a Hadoop cluster

parameters:
  hadoop_security_group:
    type: string
    description: Security group passed from main template
  key_name:
    type: string
  image_id:
    type: string
  admin_user:
    type: string
  flavor:
    type: string
  public_net_id:
    type: string
  private_net_id:
    type: string
  private_subnet_id:
    type: string
  ssh_security_group:
    type: string
  server_name:
    type: string

resources:
  hadoop_data_server:
    type: OS::Nova::Server
    properties:
      name: { get_param: server_name }
      admin_user: { get_param: admin_user }
      image: { get_param: image_id }
      flavor: { get_param: flavor }
      key_name: { get_param: key_name }
      networks:
        - port: { get_resource: hadoop_data_server_port }

  hadoop_data_server_port:
    type: OS::Neutron::Port
    properties:
      network_id: { get_param: private_net_id }
      fixed_ips:
        - subnet_id: { get_param: private_subnet_id }
      security_groups: [{ get_param: hadoop_security_group }, { get_param: ssh_security_group }]

  hadoop_data_server_floating_ip:
    type: OS::Neutron::FloatingIP
    properties:
      floating_network_id: { get_param: public_net_id }
      port_id: { get_resource: hadoop_data_server_port }

outputs:
  hadoop_node_private_ip:
    description: IP address of Hadoop Data Node in private network
    value: [{ get_attr: [ hadoop_data_server, first_address ] }, { get_attr: [ hadoop_data_server, name ] }]
  hadoop_node_public_ip:
    description: Floating IP address of Hadoop Data Node in public network
    value: [{ get_attr: [ hadoop_data_server_floating_ip, floating_ip_address ] }, { get_attr: [ hadoop_data_server, name ] }]
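This child template is not launched directly; the `OS::Heat::ResourceGroup` in `heat-hadoop-cluster.yaml` stamps it out once per data node, passing `server_name` as `hadoop-data-%index%`. Before creating the stack you can lint the templates from inside the virtualenv (a sketch; the master template references this file by relative path, so run the commands from the repository root, and note that older python-heatclient releases may differ in how they resolve nested templates during validation):

```bash
source ~/venv/bin/activate
heat template-validate -f heat-datanode.yaml
heat template-validate -f heat-hadoop-cluster.yaml
```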
--------------------------------------------------------------------------------
/heat-hadoop-cluster.yaml:
--------------------------------------------------------------------------------
heat_template_version: 2013-05-23

description: >
  HOT template to deploy a Hadoop cluster (a master server plus a resource
  group of data nodes) and assign floating IP addresses to the servers

parameters:
  key_name:
    type: string
    description: Name of keypair to assign to servers
    default: KEYPAIR
  image_id:
    type: string
    description: ID of image to use for servers
    default: IMAGEID
  admin_user:
    type: string
    description: Username of admin user
    default: ADMINUSER
  flavor:
    type: string
    description: Flavor to use for servers
    default: FLAVOR
  public_net_id:
    type: string
    description: >
      ID of public network for which floating IP addresses will be allocated
    default: PUBNETID
  private_net_id:
    type: string
    description: ID of private network into which servers get deployed
    default: PRIVNETID
  private_subnet_id:
    type: string
    description: ID of private sub network into which servers get deployed
    default: PRIVSUBNETID
  ssh_security_group:
    type: string
    description: ID of a security group that allows SSH traffic
    default: SSHSECGRPID

resources:
  hadoop_master_server:
    type: OS::Nova::Server
    properties:
      name: hadoop_master
      admin_user: { get_param: admin_user }
      image: { get_param: image_id }
      flavor: { get_param: flavor }
      key_name: { get_param: key_name }
      networks:
        - port: { get_resource: hadoop_master_server_port }

  hadoop_master_server_port:
    type: OS::Neutron::Port
    properties:
      network_id: { get_param: private_net_id }
      fixed_ips:
        - subnet_id: { get_param: private_subnet_id }
      security_groups: [{ get_resource: hadoop_security_group }, { get_param: ssh_security_group }]

  hadoop_master_server_floating_ip:
    type: OS::Neutron::FloatingIP
    properties:
      floating_network_id: { get_param: public_net_id }
      port_id: { get_resource: hadoop_master_server_port }

  hadoop_security_group:
    type: OS::Neutron::SecurityGroup
    properties:
      description: Add security group rules for server
      name: hadoop-security-group
      rules:
        - remote_ip_prefix: 0.0.0.0/0
          protocol: tcp
          port_range_min: 50070
          port_range_max: 50070
        - remote_ip_prefix: 0.0.0.0/0
          protocol: tcp
          port_range_min: 50075
          port_range_max: 50075
        # update the following remote_ip_prefix to match the subnet of your private network
        - remote_ip_prefix: 172.16.0.0/24
          protocol: tcp
          port_range_min: 1
          port_range_max: 65535
        - remote_ip_prefix: 0.0.0.0/0
          protocol: tcp
          port_range_min: 8088
          port_range_max: 8088
        - remote_ip_prefix: 0.0.0.0/0
          protocol: icmp

  nodes:
    type: OS::Heat::ResourceGroup
    properties:
      count: 2
      resource_def:
        type: heat-datanode.yaml
        properties:
          server_name: hadoop-data-%index%
          hadoop_security_group: { get_resource: hadoop_security_group }
          key_name: { get_param: key_name }
          image_id: { get_param: image_id }
          admin_user: { get_param: admin_user }
          flavor: { get_param: flavor }
          public_net_id: { get_param: public_net_id }
          private_net_id: { get_param: private_net_id }
          private_subnet_id: { get_param: private_subnet_id }
          ssh_security_group: { get_param: ssh_security_group }

outputs:
  hadoop_master_private_ip:
    description: IP address of Hadoop Master in private network
    value: { get_attr: [ hadoop_master_server, first_address ] }
  hadoop_master_public_ip:
    description: Floating IP address of Hadoop Master in public network
    value: { get_attr: [ hadoop_master_server_floating_ip, floating_ip_address ] }
  nodes_public_ips:
    description: Public IPs for the datanodes
    value: { get_attr: [nodes, hadoop_node_public_ip] }
  nodes_private_ips:
    description: Private IPs for the datanodes
    value: { get_attr: [nodes, hadoop_node_private_ip] }
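With the parameter defaults (KEYPAIR, IMAGEID, and so on) replaced by values from your cloud, stack creation and output retrieval follow the commented commands in `bootstrap-master.sh`:

```bash
heat stack-create hadoop-stack -f heat-hadoop-cluster.yaml
heat stack-list                               # wait for CREATE_COMPLETE
heat output-show hadoop-stack hadoop_master_public_ip 2>&1 | grep -o '[^"]*'
heat output-show hadoop-stack --all           # everything heat-inventory.py consumes
```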
--------------------------------------------------------------------------------
/heat-inventory.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
#-------------------------------------------------------------------------------
# Name:        heat_inventory
# Purpose:     Generate Ansible inventory and node variable files from the
#              outputs of the hadoop-stack heat stack
#
# Author:      Daniel Watrous
#
# Created:     10/07/2015
# Copyright:   (c) HP 2015
#-------------------------------------------------------------------------------

import json
import subprocess
from string import Template
from textwrap import dedent

class heat_inventory:

    # output keys
    hadoop_master_public_key = "hadoop_master_public_ip"
    hadoop_master_private_key = "hadoop_master_private_ip"
    hadoop_datanode_public_key = "nodes_public_ips"
    hadoop_datanode_private_key = "nodes_private_ips"

    # template values
    ansible_ssh_user = "debian"
    ansible_ssh_private_key_file = "~/.ssh/hadoop.pem"

    # templates
    host_entry = Template('$ipaddress ansible_connection=ssh ansible_ssh_user=$ssh_user ansible_ssh_private_key_file=$private_key_file')
    hosts_output = Template("""[hadoop-master]
$master_host

[hadoop-data]
$data_hosts

[hadoop-master:vars]
nodesfile=nodes-pro

[hadoop-data:vars]
nodesfile=nodes-pro""")

    node_entry = Template(""" - hostname: $hostname
   ip: $ipaddress""")
    nodes_section = Template("""---
nodes:
$nodes
""")
    nodes_sshkeyscan = Template('ssh-keyscan -t rsa $ipaddress >> ~/.ssh/known_hosts')

    def __init__(self):
        self.load_heat_output()

    def load_heat_output(self):
        # heat returns a JSON list of {output_key: ..., output_value: ...} items
        self.heat_output = json.loads(subprocess.Popen("heat output-show hadoop-stack --all", shell=True, stdout=subprocess.PIPE).stdout.read())

    def get_master_public_ip(self):
        for output_item in self.heat_output:
            if self.hadoop_master_public_key == output_item['output_key']:
                return output_item['output_value']

    def get_master_private_ip(self):
        for output_item in self.heat_output:
            if self.hadoop_master_private_key == output_item['output_key']:
                return output_item['output_value']

    def get_datanode_public_ips(self):
        for output_item in self.heat_output:
            if self.hadoop_datanode_public_key == output_item['output_key']:
                return output_item['output_value']

    def get_datanode_private_ips(self):
        for output_item in self.heat_output:
            if self.hadoop_datanode_private_key == output_item['output_key']:
                return output_item['output_value']

    # Ansible hosts file
    def get_host_entry(self, ipaddress):
        return self.host_entry.substitute(ipaddress=ipaddress, ssh_user=self.ansible_ssh_user, private_key_file=self.ansible_ssh_private_key_file)

    def get_datanode_host_entries(self):
        datanode_hosts = []
        # each datanode output value is an [ip_address, server_name] pair
        for datanode_host in self.get_datanode_public_ips():
            datanode_hosts.append(self.get_host_entry(datanode_host[0]))
        return "\n".join(datanode_hosts)

    def get_hosts_output(self):
        master_host = self.get_host_entry(self.get_master_public_ip())
        datanode_hosts = self.get_datanode_host_entries()
        return dedent(self.hosts_output.substitute(master_host=master_host, data_hosts=datanode_hosts))

    # Ansible group_vars nodes
    def get_node_entry(self, hostname, ipaddress):
        return self.node_entry.substitute(hostname=hostname, ipaddress=ipaddress)

    def get_nodes_entries(self):
        nodes = []
        nodes.append(self.get_node_entry('hadoop-master', self.get_master_private_ip()))
        for node in self.get_datanode_private_ips():
            nodes.append(self.get_node_entry(node[1], node[0]))
        return "\n".join(nodes)

    def get_nodes_output(self):
        return self.nodes_section.substitute(nodes=self.get_nodes_entries())

    def get_node_keyscan_script(self):
        nodes = []
        nodes.append(self.nodes_sshkeyscan.substitute(ipaddress=self.get_master_public_ip()))
        for node in self.get_datanode_public_ips():
            nodes.append(self.nodes_sshkeyscan.substitute(ipaddress=node[0]))
        return "\n".join(nodes)

def main():
    heat_inv = heat_inventory()
##    print "hadoop master public IP: " + heat_inv.get_master_public_ip()
##    print "hadoop master private IP: " + heat_inv.get_master_private_ip()
##    print "hadoop datanode private IP: " + ', '.join(heat_inv.get_datanode_private_ips())
##    print "hadoop datanode public IP: " + ', '.join(heat_inv.get_datanode_public_ips())
    inventory_file = open('hosts-pro', 'w')
    nodes_file = open('nodes-pro', 'w')
    inventory_file.write(heat_inv.get_hosts_output())
    nodes_file.write(heat_inv.get_nodes_output())
    inventory_file.close()
    nodes_file.close()
    keyscan_script_file = open('scan-node-keys.sh', 'w')
    keyscan_script_file.write(heat_inv.get_node_keyscan_script())
    keyscan_script_file.close()

if __name__ == '__main__':
    main()
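Running the script inside the virtualenv produces three files next to the playbook: `hosts-pro` (the Ansible inventory), `nodes-pro` (per-node vars consumed via `nodesfile`), and `scan-node-keys.sh`. A sketch, with illustrative addresses:

```bash
source ~/venv/bin/activate
python heat-inventory.py
source scan-node-keys.sh      # seed ~/.ssh/known_hosts for every node
head hosts-pro nodes-pro
# hosts-pro will look roughly like:
#   [hadoop-master]
#   10.1.2.3 ansible_connection=ssh ansible_ssh_user=debian ansible_ssh_private_key_file=~/.ssh/hadoop.pem
```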
--------------------------------------------------------------------------------
/hosts-dev:
--------------------------------------------------------------------------------
[hadoop-master]
192.168.51.4 ansible_connection=local

[hadoop-data]
192.168.51.5 ansible_connection=ssh ansible_ssh_user=vagrant ansible_ssh_private_key_file=~/.ssh/id_rsa
192.168.51.6 ansible_connection=ssh ansible_ssh_user=vagrant ansible_ssh_private_key_file=~/.ssh/id_rsa

[hadoop-master:vars]
nodesfile=nodes-dev

[hadoop-data:vars]
nodesfile=nodes-dev

--------------------------------------------------------------------------------
/nodes-dev:
--------------------------------------------------------------------------------
nodes:
 - hostname: hadoop-master
   ip: 192.168.51.4
 - hostname: hadoop-data1
   ip: 192.168.51.5
 - hostname: hadoop-data2
   ip: 192.168.51.6

--------------------------------------------------------------------------------
/playbook.yml:
--------------------------------------------------------------------------------
---
- name: Install hadoop master node
  hosts: hadoop-master
  remote_user: ubuntu
  sudo: yes

  roles:
    - common
    - oraclejava8
    - master

- name: Install hadoop data nodes
  hosts: hadoop-data
  remote_user: ubuntu
  sudo: yes

  roles:
    - common
    - oraclejava8
#    - data

--------------------------------------------------------------------------------
/roles/common/tasks/main.yml:
--------------------------------------------------------------------------------
---
- include_vars: "{{ nodesfile }}"

- group: name={{ hadoop_group }} state=present
- user: name={{ hadoop_user }} comment="Hadoop" group={{ hadoop_group }} shell=/bin/bash

- authorized_key: user={{ hadoop_user }} key="{{ lookup('file', '../templates/hadoop_rsa.pub') }}"

# this is a bandwidth-heavy task which downloads the hadoop binaries to each node
- name: Download hadoop
  get_url: url={{ hadoop_download_url }} dest=/home/{{ hadoop_user }}/hadoop-2.7.1.tar.gz
  environment: proxy_env

- name: Extract hadoop archive
  unarchive: src=/home/{{ hadoop_user }}/hadoop-2.7.1.tar.gz dest=/usr/local owner={{ hadoop_user }} group={{ hadoop_group }} creates=/usr/local/hadoop copy=no

# this is an alternative for the local deployment where the hadoop binary can be cached locally
#- name: unpack hadoop
#  unarchive: src=/home/vagrant/src/roles/common/templates/hadoop-2.7.1.tar.gz dest=/usr/local owner={{ hadoop_user }} group={{ hadoop_group }} creates=/usr/local/hadoop

- command: mv /usr/local/hadoop-2.7.1 /usr/local/hadoop creates=/usr/local/hadoop removes=/usr/local/hadoop-2.7.1

- lineinfile: dest=/home/hadoop/.bashrc regexp="HADOOP_HOME=" line="export HADOOP_HOME=/usr/local/hadoop"
- lineinfile: dest=/home/hadoop/.bashrc regexp="PATH=" line="export PATH=$PATH:$HADOOP_HOME/bin"
- lineinfile: dest=/home/hadoop/.bashrc regexp="HADOOP_SSH_OPTS=" line="export HADOOP_SSH_OPTS=\"-i /home/{{ hadoop_user }}/.ssh/hadoop_rsa\""

# Idempotent way to build an /etc/hosts file with Ansible, using your Ansible hosts inventory as a source.
# Will include all hosts the playbook is run on.
# Inspired by http://xmeblog.blogspot.com/2013/06/ansible-dynamicaly-update-etchosts.html
- name: Build hosts file
  lineinfile: dest=/etc/hosts regexp='{{ item.ip }}' line="{{ item.ip }} {{ item.hostname }}" state=present
  with_items: "{{ nodes }}"

- lineinfile: dest=/etc/hosts regexp='127.0.1.1' state=absent

- file: path=/home/{{ hadoop_user }}/tmp state=directory owner={{ hadoop_user }} group={{ hadoop_group }} mode=750
- file: path=/home/{{ hadoop_user }}/hadoop-data/hdfs/namenode state=directory owner={{ hadoop_user }} group={{ hadoop_group }} mode=750
- file: path=/home/{{ hadoop_user }}/hadoop-data/hdfs/datanode state=directory owner={{ hadoop_user }} group={{ hadoop_group }} mode=750

- name: Add the Hadoop configuration files
  template: src={{ item.src }} dest={{ item.dest }} owner={{ hadoop_user }} group={{ hadoop_group }}
  with_items:
    - {src: "core-site.xml", dest: "/usr/local/hadoop/etc/hadoop/core-site.xml"}
    - {src: "hdfs-site.xml", dest: "/usr/local/hadoop/etc/hadoop/hdfs-site.xml"}
    - {src: "yarn-site.xml", dest: "/usr/local/hadoop/etc/hadoop/yarn-site.xml"}
    - {src: "mapred-site.xml", dest: "/usr/local/hadoop/etc/hadoop/mapred-site.xml"}

- lineinfile: dest=/usr/local/hadoop/etc/hadoop/hadoop-env.sh regexp="^export JAVA_HOME" line="export JAVA_HOME=/usr/lib/jvm/java-8-oracle"
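The download task pulls the Hadoop tarball (roughly 200 MB) onto every node. The commented "unpack hadoop" alternative avoids that by serving a locally cached copy from the synced folder; seeding the cache is one command, a sketch using the mirror from `roles/common/vars/main.yml` (mirror availability may vary):

```bash
# run on your workstation, in the repository root, before `vagrant up`
wget -O roles/common/templates/hadoop-2.7.1.tar.gz \
  http://apache.cs.utah.edu/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz
# then comment out the get_url/unarchive pair above and enable the cached variant
```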
--------------------------------------------------------------------------------
/roles/common/templates/core-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>

  <property>
    <name>hadoop.tmp.dir</name>
    <value>/home/{{ hadoop_user }}/tmp</value>
    <description>A base for other temporary directories.</description>
  </property>

  <property>
    <name>fs.default.name</name>
    <value>hdfs://{{ nodes[0]['hostname'] }}:54310</value>
    <description>The name of the default file system. A URI whose
    scheme and authority determine the FileSystem implementation. The
    uri's scheme determines the config property (fs.SCHEME.impl) naming
    the FileSystem implementation class. The uri's authority is used to
    determine the host, port, etc. for a filesystem.</description>
  </property>

</configuration>

--------------------------------------------------------------------------------
/roles/common/templates/hadoop_rsa.pub:
--------------------------------------------------------------------------------
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDWeJfgWx7hDeZUJOeaIVzcbmYxzMcWfxhgC2975tvGL5BV6unzLz8ZVak6ju++AvnM5mcQp6Ydv73uWyaoQaFZigAzfuenruQkwc7D5YYuba+FgZdQ8VHon29oQA3iaZWG7xTspagrfq3fcqaz2ZIjzqN+E/MtcW08PwfibN2QRWchBCuZ1Q8AmrW7gClzMcgd/uj3TstabspGaaZMCs8aC9JWzZlMMegXKYHvVQs6xH2AmifpKpLoMTdO8jP4jczmGebPzvaXmvVylgwo6bRJ3tyYAmGwx8PHj2EVVQ0XX9ipgixLyAa2c7+/crPpGmKFRrYibCCT6x65px7nWnn3

--------------------------------------------------------------------------------
/roles/common/templates/hdfs-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>

  <property>
    <name>dfs.replication</name>
    <value>2</value>
    <description>Default block replication.
    The actual number of replications can be specified when the file is created.
    The default is used if replication is not specified at create time.</description>
  </property>

  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/home/{{ hadoop_user }}/hadoop-data/hdfs/namenode</value>
    <description>Determines where on the local filesystem the DFS name node should store the name table (fsimage). If this is a comma-delimited list of directories then the name table is replicated in all of the directories, for redundancy.</description>
  </property>

  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/home/{{ hadoop_user }}/hadoop-data/hdfs/datanode</value>
    <description>Determines where on the local filesystem a DFS data node should store its blocks. If this is a comma-delimited list of directories, then data will be stored in all named directories, typically on different devices. Directories that do not exist are ignored.</description>
  </property>

</configuration>

--------------------------------------------------------------------------------
/roles/common/templates/mapred-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>

  <property>
    <name>mapred.job.tracker</name>
    <value>{{ nodes[0]['hostname'] }}:54311</value>
    <description>The host and port that the MapReduce job tracker runs
    at. If "local", then jobs are run in-process as a single map
    and reduce task.</description>
  </property>

  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>

</configuration>

--------------------------------------------------------------------------------
/roles/common/templates/yarn-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>

<configuration>

  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address</name>
    <value>{{ nodes[0]['hostname'] }}:8030</value>
  </property>
  <property>
    <name>yarn.resourcemanager.address</name>
    <value>{{ nodes[0]['hostname'] }}:8032</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address</name>
    <value>{{ nodes[0]['hostname'] }}:8088</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address</name>
    <value>{{ nodes[0]['hostname'] }}:8031</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.address</name>
    <value>{{ nodes[0]['hostname'] }}:8033</value>
  </property>

</configuration>

--------------------------------------------------------------------------------
/roles/common/vars/main.yml:
--------------------------------------------------------------------------------
---
hadoop_user: hadoop
hadoop_group: hadoop
hadoop_download_url: http://apache.cs.utah.edu/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz
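These rendered templates put the NameNode RPC endpoint on port 54310 and the web UIs on the usual Hadoop 2.x ports: 50070 for the NameNode and 8088 for the ResourceManager, both of which the heat security group also opens. Once DFS and YARN are running, a spot check from the master (a sketch; `hadoop-master` resolves via the /etc/hosts entries built above):

```bash
curl -sI http://hadoop-master:50070 | head -n 1   # expect HTTP/1.1 200 OK
curl -sI http://hadoop-master:8088  | head -n 1
```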
--------------------------------------------------------------------------------
/roles/master/tasks/main.yml:
--------------------------------------------------------------------------------
---
- include_vars: "{{ nodesfile }}"

- name: Copy private key into place
  template: src=hadoop_rsa dest=/home/{{ hadoop_user }}/.ssh/hadoop_rsa owner={{ hadoop_user }} group={{ hadoop_group }} mode=0600

- name: Copy slaves into place
  template: src=slaves dest=/usr/local/hadoop/etc/hadoop/slaves owner={{ hadoop_user }} group={{ hadoop_group }}

- name: prepare known_hosts entries
  shell: ssh-keyscan -t rsa {{ item.hostname }}
  with_items: "{{ nodes }}"
  register: keyscans

- name: prepare known_hosts
  lineinfile:
    dest=/home/{{ hadoop_user }}/.ssh/known_hosts
    create=yes
    state=present
    line="{{ item.stdout }}"
    regexp="^{{ item.item.hostname }}"
    owner={{ hadoop_user }}
    group={{ hadoop_group }}
  with_items: "{{ keyscans.results }}"

- name: prepare known_hosts entry for 0.0.0.0
  shell: ssh-keyscan -t rsa 0.0.0.0
  register: keyscan_0_0_0_0

- name: add 0.0.0.0 to known_hosts for secondary namenode
  lineinfile:
    dest=/home/{{ hadoop_user }}/.ssh/known_hosts
    create=yes
    state=present
    line="{{ keyscan_0_0_0_0.stdout }}"
    regexp="^0.0.0.0"
    owner={{ hadoop_user }}
    group={{ hadoop_group }}

--------------------------------------------------------------------------------
/roles/master/templates/hadoop_rsa:
--------------------------------------------------------------------------------
-----BEGIN RSA PRIVATE KEY-----
MIIEogIBAAKCAQEA1niX4Fse4Q3mVCTnmiFc3G5mMczHFn8YYAtve+bbxi+QVerp
8y8/GVWpOo7vvgL5zOZnEKemHb+97lsmqEGhWYoAM37np67kJMHOw+WGLm2vhYGX
UPFR6J9vaEAN4mmVhu8U7KWoK36t33Kms9mSI86jfhPzLXFtPD8H4mzdkEVnIQQr
mdUPAJq1u4ApczHIHf7o907LWm7KRmmmTArPGgvSVs2ZTDHoFymB71ULOsR9gJon
6SqS6DE3TvIz+I3M5hnmz872l5r1cpYMKOm0Sd7cmAJhsMfDx49hFVUNF1/YqYIs
S8gGtnO/v3Kz6RpihUa2Imwgk+seuace51p59wIDAQABAoIBAAnbWP+SsISfYuqT
F6HZzq7C1CeNxcie5rvjSqhjH86n+PY56a6niTrc2P47+4C8Gkb0f+AgFF5hpZUC
03AANC4tTX0QZA0s2Idn8NzH+7a+i8aTAiVSMfWC/lfIOdsiByFAc7G8m1IEbHFM
j6TMZw8GPDVbLV31TJPsQayCXFx4/oEgO2WoIukuHOlWYvNcxhOHe0dWZbI2fG0n
pj9kE5YEkbmV/KBLJm4TnCRzt3DkcwTUNSxPHoVipCEMrMAzi7aNLoZOlx5txXxl
nvSc0DHbh/WO9ZzvRDdICDYMSonunji3SffmrYj+2rTBn9/4RmNXgBE6eIAyGA6H
8+ZatIECgYEA+0iXpewZsiAs+NvOOoxS2w5EhOsqOJcBFHGpK6uraYot+/SvDCbH
aKin2u/pELjBN83ujGwjoVbQuLd9qMkjNAvP54kWkr9UnfFg0xr1DKGusZn4vafr
3D/L33I9SWu9sd0M9ehaqi8IXjEhyXFjTepc76y/MSzc/GhqejOpOqcCgYEA2n8e
SKmxHYeby+EdVyACgD6DE5L4SfUMl/GkpueRzLxjzw+HwU7papbKN9I3obkZCcED
1I9cYFs47coK/7i9jtCxmPzZ7RuDi1YbcJ7N1E51XjxYPrtFMsmyjQT2ZaYYcwJi
oDBgUgR/H/lhLlOqTRLDaJCoXljlUNcvhIxfwDECgYAOEVlUE6y4mS6LXBrmkjaN
RvY1DslVU7bewyI6bKjzBqZeZatrHbyqMk77ZdUBd6ZxjljdiaxQ+wBCZxtk2KXc
4xBnoPgfjRCaqwnFp8uyQlb6YGlZjM4ajYPHZajitGJxU7PHDJQO1QU96LhAg9WQ
Zh9/14b4pM9WXi4f20aNHwKBgDcF/CJgWXIOTHWzYyglr3uMQoYTZON7CojZGZXO
izD74EasqotLzAxDnPkhv1DCxENokAZfc+vwDQ4U62AqrdHhTFgylM7ZNu9H5/Ec
jMkCWRw6Jh7mGi8IDCbnMQW4JkAQQbYFeLwu1bS/oHGCMEjoiL00cLIRwO3ywsym
bRaBAoGAOMrZunDk0Go4G+OmE4gn9fKsViTm8wSfMGqMKQDNI8BibRfkHnieqW41
qtZ65mc/5xFFvb01Eb1K0e2i+DpdhOlnc7e34jyRH9Ac15V7eDuLm3eYdNjghkVT
/PQeITGQ3KAwki/UC8BHzmTL+dWpNBB4InrBgZ4nk5Atiq6YUFk=
-----END RSA PRIVATE KEY-----

--------------------------------------------------------------------------------
/roles/master/templates/slaves:
--------------------------------------------------------------------------------
{% for host in nodes %}
{{ host['hostname'] }}
{% endfor %}

--------------------------------------------------------------------------------
/roles/master/vars/main.yml:
--------------------------------------------------------------------------------
---
ansible_user: vagrant
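The `hadoop_rsa` key above pairs with `roles/common/templates/hadoop_rsa.pub`, which the common role authorizes for the hadoop user on every node. Since both halves of the pair are committed to a public repository, it is prudent to mint your own before deploying (a sketch):

```bash
# regenerate the cluster keypair in place, then move the public half
# to where the common role's authorized_key task expects it
ssh-keygen -t rsa -N '' -C hadoop -f roles/master/templates/hadoop_rsa
mv roles/master/templates/hadoop_rsa.pub roles/common/templates/hadoop_rsa.pub
# after the playbook runs, the hadoop user on the master should reach each node:
#   ssh -i ~/.ssh/hadoop_rsa hadoop-data1 hostname
```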
JAVA_HOME" line="export JAVA_HOME=/usr/lib/jvm/java-8-oracle" 19 | -------------------------------------------------------------------------------- /roles/oraclejava8/templates/oracle-pgp-key: -------------------------------------------------------------------------------- 1 | -----BEGIN PGP PUBLIC KEY BLOCK----- 2 | Version: SKS 1.1.5 3 | Comment: Hostname: keyserver.ubuntu.com 4 | 5 | mI0ES9/P3AEEAPbI+9BwCbJucuC78iUeOPKl/HjAXGV49FGat0PcwfDd69MVp6zUtIMbLgkU 6 | OxIlhiEkDmlYkwWVS8qy276hNg9YKZP37ut5+GPObuS6ZWLpwwNus5PhLvqeGawVJ/obu7d7 7 | gM8mBWTgvk0ErnZDaqaU2OZtHataxbdeW8qH/9FJABEBAAG0DUxhdW5jaHBhZCBWTEOItgQT 8 | AQIAIAUCS9/P3AIbAwYLCQgHAwIEFQIIAwQWAgMBAh4BAheAAAoJEMJRgkjuoUiG5wYEANCd 9 | jhXXEpPUbP7cRGXL6cFvrUFKpHHopSC9NIQ9qxJVlUK2NjkzCCFhTxPSHU8LHapKKvie3e+l 10 | kvWW5bbFN3IuQUKttsgBkQe2aNdGBC7dVRxKSAcx2fjqP/s32q1lRxdDRM6xlQlEA1j94ewG 11 | 9SDVwGbdGcJ43gLxBmuKvUJ4 12 | =0Cp+ 13 | -----END PGP PUBLIC KEY BLOCK----- --------------------------------------------------------------------------------