├── Courses_Offered ├── Advanced_Data Lake_and_Streaming.pdf ├── Advanced_Hadoop_TroubleshootingVer2.0.pdf ├── Advanced_Hadoop_adminstrationV2.0.pdf ├── HBase_catalog_ver2.0.pdf └── Hadoop_Admin_catalog.pdf ├── DNS ├── dns_installation.txt ├── named.txt └── zones.txt ├── Flume ├── commands ├── flume_collector.conf └── web_server_source.conf ├── HA_QJM ├── core-site.xml ├── hdfs-site.xml ├── hdfs-site_datanode.xml ├── hdfs-site_namenode.xml ├── steps └── zoo.cfg ├── HA_RM ├── yarn-site.xml.ha ├── yarn-site.xml.spark └── yarn-site_nodemanager_ha.xml ├── HA_hadoop ├── core-site.xml └── hdfs-site.xml ├── HBase ├── Optimizations │ ├── HBase_yscb.txt │ ├── Hbase_create_table.txt │ ├── Hbase_happybase.txt │ ├── Hbase_rand_gen.txt │ └── Netxillon_HBase.pdf ├── README.md ├── backup-masters ├── commands.txt ├── hbase-site.txt ├── hfile ├── hive-mysql.txt ├── hive.txt ├── regions.txt ├── regionservers ├── replication ├── tez-setup └── untitled.txt ├── Hive_performance ├── Jars ├── azure.tar.gz ├── hadoop-azure-3.1.1.3.1.0.0-SNAPSHOT.jar └── jce_policy-8.zip ├── Kafka ├── commands ├── kafka-env.sh ├── kafka_ganglia.txt ├── kafka_ganglia2.txt ├── kakfa_rsyslog.txt └── server.properties ├── Notes ├── Benchmarking.txt ├── Hadoop_lab.doc ├── Hadoop_upgrade.txt ├── Performance.txt ├── backup.txt ├── cassandra2.pdf ├── class3_questions ├── class4_questions ├── cloudera.txt ├── disk_partition ├── hadoop_ports.txt ├── hadoop_ports_firewall.xls ├── install_hadoop.txt ├── installation.txt ├── pig.txt ├── questions.txt ├── quick-links ├── quiz4.txt ├── quiz7.txt ├── quota.txt ├── rack.txt ├── remove_datanode.txt ├── repo_server.txt ├── scoop.txt ├── sqoop.txt ├── sqoop1.txt └── yarn.txt ├── README.md ├── Schedulers ├── capacity-scheduler.xml ├── commands ├── fair-scheduler.xml ├── mapred-site.xml ├── user-mappings.txt ├── yarn-site.xml_capacity └── yarn-site.xml_fair ├── Security ├── README.md ├── SSL_Configs │ ├── CA │ │ ├── README.txt │ │ └── openssl.cnf │ ├── commands_CA_JKS │ ├── gen-certs.sh │ └── hadoop_ssl_configs │ │ ├── core-site.xml │ │ ├── hdfs-site.xml │ │ ├── mapred-site.xml │ │ ├── ssl-client.xml │ │ ├── ssl-server.xml │ │ └── yarn-site.xml ├── kerberos │ ├── JT │ │ ├── core-site.xml │ │ ├── hdfs-site.xml │ │ ├── mapred-site.xml │ │ └── taskcontroller.cfg │ ├── Jsvc_download.txt │ ├── Namenode_Datanode │ │ ├── core-site.xml │ │ ├── hadoop-env.sh │ │ ├── hdfs-site.xml │ │ ├── mapred-site.xml │ │ └── taskcontroller.cfg │ ├── README.md │ ├── kdc.conf │ ├── kerberos_scripts │ │ ├── README.md │ │ ├── add_users.sh │ │ ├── copy_keytabs.sh │ │ ├── create_dn_princs.sh │ │ ├── create_nn_princs.sh │ │ ├── create_partions.sh │ │ ├── create_user_keytab.sh │ │ ├── delete_list │ │ ├── delete_princs.sh │ │ ├── dn_host_list │ │ ├── hosts │ │ ├── install_krb.sh │ │ ├── list_princs.sh │ │ ├── nn_host_list │ │ ├── setup_kerberos.sh │ │ ├── user_host_list │ │ └── user_list │ ├── kerberos_user_mappings.txt │ └── krb5.conf ├── kms │ ├── kms-acl │ └── kms-setup └── ldap │ ├── Installation_steps │ ├── addmembers.ldif │ ├── adduser.ldif │ ├── base.ldif │ ├── base1.ldif │ ├── base2.ldif │ ├── commands │ ├── groupadd.ldif │ ├── slapd.conf.obsolete │ ├── test.ldif │ └── users.ldif ├── Spark ├── Spark_build ├── examples.txt ├── spark-defaults.conf ├── sparkPython │ ├── erfunction.py │ ├── error.py │ ├── error.txt │ ├── logparser.py │ ├── pivot.txt │ ├── square.py │ ├── wordcount.py │ └── wordcount.txt ├── spark_command.txt ├── spark_standalone_cluster.txt └── yarn-site.xml.spark ├── Yarn_tuning ├── Yarn.pdf ├── mapred-site.xml 
└── yarn-site.xml ├── _config.yml ├── hadoop1.0 └── README.md ├── hadoop2.0 ├── bash_profile ├── core-site.xml ├── hdfs-site.xml ├── mapred-site.xml └── yarn-site.xml ├── hadoop_build64bit ├── jumbune ├── logging ├── map_scripts ├── job.txt ├── mapper.py ├── mapper.sh ├── reducer.py └── reducer.sh ├── zookeeper.txt └── zookeeper_oozie ├── oozie-server.txt └── zookeeper.txt /Courses_Offered/Advanced_Data Lake_and_Streaming.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/Advanced_Data Lake_and_Streaming.pdf -------------------------------------------------------------------------------- /Courses_Offered/Advanced_Hadoop_TroubleshootingVer2.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/Advanced_Hadoop_TroubleshootingVer2.0.pdf -------------------------------------------------------------------------------- /Courses_Offered/Advanced_Hadoop_adminstrationV2.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/Advanced_Hadoop_adminstrationV2.0.pdf -------------------------------------------------------------------------------- /Courses_Offered/HBase_catalog_ver2.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/HBase_catalog_ver2.0.pdf -------------------------------------------------------------------------------- /Courses_Offered/Hadoop_Admin_catalog.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/Hadoop_Admin_catalog.pdf -------------------------------------------------------------------------------- /DNS/dns_installation.txt: -------------------------------------------------------------------------------- 1 | DNS Installation 2 | ================ 3 | 4 | # yum install bind -y 5 | 6 | # vi /etc/named.conf 7 | 8 | remove everything and just use the lines below (Change IP's accordingly) 9 | 10 | options { 11 | <<<<<<< HEAD:DNS/dns_installation.txt 12 | listen-on port 53 { 192.168.1.254; }; 13 | directory "/var/named"; 14 | ======= 15 | listen-on port 53 { 192.168.1.254; }; 16 | directory "/var/named"; 17 | >>>>>>> 925912d1565c9ba263d928397d065c93178cb463:hadoop1.0/dns.txt 18 | 19 | allow-query { any; }; 20 | 21 | forwarders { 192.168.1.1; }; 22 | 23 | }; 24 | 25 | zone "cluster1.com" IN { 26 | type master; 27 | file "/var/named/named.hadoop.forw"; 28 | }; 29 | 30 | <<<<<<< HEAD:DNS/dns_installation.txt 31 | zone "1.168.192.in-addr.arpa" IN { 32 | type master; 33 | file "/var/named/named.ha.rev"; 34 | 35 | }; 36 | 37 | ================================================= 38 | 39 | # vi /var/named/named.hadoop.forward.zone 40 | 41 | $TTL 86400 42 | @ IN SOA cluster1.com root ( 43 | 42 ; serial 44 | 3H ; refresh 45 | 15M ; retry 46 | 1W ; expiry 47 | 1D ) ; minimum 48 | 49 | IN NS ns1.cluster1.com 50 | ns1 IN A 192.168.1.70 51 | 52 | nn1 IN A 192.168.1.70 53 | nn2 IN A 192.168.1.77 54 | dn1 IN A 192.168.1.71 55 | dn2 IN A 192.168.1.72 56 | dn3 IN A 192.168.1.73 57 | snn IN A 
192.168.1.68 58 | jt IN A 192.168.1.69 59 | db IN A 192.168.1.99 60 | kdc IN A 192.168.1.40 61 | cm IN A 192.168.1.41 62 | base IN A 192.168.1.10 63 | cm1 IN A 192.168.1.11 64 | node1 IN A 192.168.1.12 65 | filer IN A 192.168.1.222 66 | cloudera IN A 192.168.1.151 67 | datanode IN A 192.168.1.152 68 | hadooplab IN A 192.168.1.33 69 | 70 | =================== 71 | 72 | # vi /var/named/named.ha.rev 73 | ======= 74 | 75 | zone "1.168.192.in-addr.arpa" IN { 76 | type master; 77 | file "/var/named/named.hadoop.rev"; 78 | }; 79 | ============ 80 | zone files (cluster1.com) forward zone 81 | ========= 82 | 83 | $TTL 1D 84 | @ IN SOA @ rname.invalid. ( 85 | 0 ; serial 86 | 1D ; refresh 87 | 1H ; retry 88 | 1W ; expire 89 | 3H ) ; minimum 90 | 91 | 92 | IN NS ns1.cluster1.com. 93 | ns1 IN A 192.168.1.254 94 | repo IN A 192.168.1.254 95 | 96 | ;namenodes 97 | nn1 IN A 192.168.1.70 98 | nn2 IN A 192.168.1.71 99 | 100 | ;Datanodes 101 | dn1 IN A 192.168.1.72 102 | dn2 IN A 192.168.1.73 103 | dn3 IN A 192.168.1.74 104 | dn4 IN A 192.168.1.75 105 | 106 | ;Other Masters 107 | 108 | snn IN A 192.168.1.68 109 | jt IN A 192.168.1.69 110 | client IN A 192.168.1.99 111 | kdc IN A 192.168.1.102 112 | 113 | ;Cloudera 114 | 115 | cm IN A 192.168.1.40 116 | node1 IN A 192.168.1.41 117 | node2 IN A 192.168.1.42 118 | server IN A 192.168.1.44 119 | 120 | ;Storage 121 | 122 | filer IN A 192.168.1.253 123 | 124 | ;Databases; 125 | 126 | mynode1 IN A 192.168.1.81 127 | mynode2 IN A 192.168.1.82 128 | labserver IN A 192.168.1.14 129 | =========== 130 | 131 | Reverse zone for cluster1.com 132 | ============================== 133 | >>>>>>> 925912d1565c9ba263d928397d065c93178cb463:hadoop1.0/dns.txt 134 | 135 | $TTL 86400 136 | @ IN SOA ns1.cluster1.com. root.cluster1.com. ( 137 | 1997022700 ; Serial 138 | 28800 ; Refresh 139 | 14400 ; Retry 140 | 3600000 ; Expire 141 | 86400 ) ; Minimum 142 | 143 | 1.168.192.in-addr.arpa. IN NS ns1.cluster1.com. 144 | 145 | 70 IN PTR nn1.cluster1.com. 146 | 40 IN PTR kdc.cluster1.com. 147 | 41 IN PTR cm.cluster1.com. 148 | 20 IN PTR node1.cluster1.com. 149 | 71 IN PTR dn1.cluster1.com. 150 | 72 IN PTR dn2.cluster1.com. 151 | 73 IN PTR dn3.cluster1.com. 152 | 10 IN PTR base.cluster1.com. 153 | 11 IN PTR cm1.cluster1.com. 154 | 12 IN PTR node1.cluster1.com. 155 | 151 IN PTR cloudera.cluster1.com. 156 | 152 IN PTR datanode.cluster1.com. 
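
Optional: validate the configuration and zone files before restarting named
(a quick sketch; substitute whichever zone file names you used above):

# named-checkconf /etc/named.conf
# named-checkzone cluster1.com /var/named/named.hadoop.forw
# named-checkzone 1.168.192.in-addr.arpa /var/named/named.ha.rev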
157 | 158 | ============================================== 159 | 160 | # chown -R root:named /var/named/ 161 | 162 | 163 | 164 | # service named restart 165 | 166 | =========================== 167 | 168 | On all client machines 169 | 170 | # vi /etc/resolv.conf 171 | 172 | nameserver 192.168.1.254 173 | -------------------------------------------------------------------------------- /DNS/named.txt: -------------------------------------------------------------------------------- 1 | options { 2 | listen-on port 53 { 192.168.1.254; }; 3 | directory "/var/named"; 4 | 5 | allow-query { any; }; 6 | 7 | forwarders { 192.168.1.1; }; 8 | 9 | }; 10 | 11 | zone "cluster1.com" IN { 12 | type master; 13 | file "/var/named/named.hadoop.forward.zone"; 14 | }; 15 | 16 | ;Second Zone 17 | zone "hacluster1.com" IN { 18 | type master; 19 | file "/var/named/named.ha.forward.zone"; 20 | }; 21 | 22 | zone "1.168.192.in-addr.arpa" IN { 23 | type master; 24 | file "/var/named/named.ha.rev"; 25 | # file "/var/named/named.hadoop.rev"; 26 | }; 27 | -------------------------------------------------------------------------------- /DNS/zones.txt: -------------------------------------------------------------------------------- 1 | #Forward lookup zone 2 | ;Make sure you understand that comments could be using # or ; and it might change in future versions 3 | 4 | $TTL 86400 5 | @ IN SOA cluster1.com root ( 6 | 42 ; serial 7 | 3H ; refresh 8 | 15M ; retry 9 | 1W ; expiry 10 | 1D ) ; minimum 11 | 12 | IN NS ns1.cluster1.com 13 | ns1 IN A 192.168.1.70 14 | 15 | nn1 IN A 192.168.1.70 16 | nn2 IN A 192.168.1.77 17 | dn1 IN A 192.168.1.71 18 | dn2 IN A 192.168.1.72 19 | dn3 IN A 192.168.1.73 20 | snn IN A 192.168.1.68 21 | jt IN A 192.168.1.69 22 | db IN A 192.168.1.99 23 | kdc IN A 192.168.1.40 24 | cm IN A 192.168.1.41 25 | base IN A 192.168.1.10 26 | cm1 IN A 192.168.1.11 27 | node1 IN A 192.168.1.12 28 | filer IN A 192.168.1.222 29 | cloudera IN A 192.168.1.151 30 | datanode IN A 192.168.1.152 31 | hadooplab IN A 192.168.1.33 32 | 33 | 34 | ================ 35 | # Reverse lookup Zone 36 | +++++++++++++++++++++++ 37 | 38 | $TTL 86400 39 | @ IN SOA ns1.cluster1.com. root.cluster1.com. ( 40 | 1997022700 ; Serial 41 | 28800 ; Refresh 42 | 14400 ; Retry 43 | 3600000 ; Expire 44 | 86400 ) ; Minimum 45 | 46 | 1.168.192.in-addr.arpa. IN NS ns1.cluster1.com. 47 | 48 | 70 IN PTR nn1.cluster1.com. 49 | 40 IN PTR kdc.cluster1.com. 50 | 41 IN PTR cm.cluster1.com. 51 | 20 IN PTR node1.cluster1.com. 52 | 71 IN PTR dn1.cluster1.com. 53 | 72 IN PTR dn2.cluster1.com. 54 | 73 IN PTR dn3.cluster1.com. 55 | 10 IN PTR base.cluster1.com. 56 | 11 IN PTR cm1.cluster1.com. 57 | 12 IN PTR node1.cluster1.com. 58 | 151 IN PTR cloudera.cluster1.com. 59 | 152 IN PTR datanode.cluster1.com. 60 | -------------------------------------------------------------------------------- /Flume/commands: -------------------------------------------------------------------------------- 1 | # Start Collector first: 2 | 3 | flume-ng agent -c conf -f flume/conf/flume_collector.conf -n collector (Name must match the agent name) 4 | flume-ng agent -c conf -f flume/conf/web_server_source.conf -n source_agent (Name must match agent name) 5 | 6 | 7 | # Change the hostnames/IPs in the config files accordingly. 8 | 9 | 10 | #Note: 11 | - New deployments do not use flume any more and kafka is doing lot of things. 12 | - But, flume is still a good use case for log aggregatio etc and avoid the overhead of kafka cluster. 
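
# Optional check that events are landing (a sketch, assuming the file_roll
# directory from flume_collector.conf and the log_type value set in
# web_server_source.conf):

ls /var/log/flume-ng
hadoop fs -ls /user/hadoop/flume-channel/apache_access_combined/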
13 | -------------------------------------------------------------------------------- /Flume/flume_collector.conf: -------------------------------------------------------------------------------- 1 | #http://flume.apache.org/FlumeUserGuide.html#avro-source 2 | collector.sources = AvroIn 3 | collector.sources.AvroIn.type = avro 4 | collector.sources.AvroIn.bind = 192.168.1.109 5 | collector.sources.AvroIn.port = 4545 6 | collector.sources.AvroIn.channels = mc1 mc2 7 | 8 | ## Channels ## 9 | ## Source writes to 2 channels, one for each sink 10 | collector.channels = mc1 mc2 11 | 12 | #http://flume.apache.org/FlumeUserGuide.html#memory-channel 13 | 14 | collector.channels.mc1.type = memory 15 | collector.channels.mc1.capacity = 100 16 | 17 | collector.channels.mc2.type = memory 18 | collector.channels.mc2.capacity = 100 19 | 20 | ## Sinks ## 21 | collector.sinks = LocalOut HadoopOut 22 | 23 | ## Write copy to Local Filesystem 24 | #http://flume.apache.org/FlumeUserGuide.html#file-roll-sink 25 | collector.sinks.LocalOut.type = file_roll 26 | collector.sinks.LocalOut.sink.directory = /var/log/flume-ng 27 | collector.sinks.LocalOut.sink.rollInterval = 0 28 | collector.sinks.LocalOut.channel = mc1 29 | 30 | ## Write to HDFS 31 | #http://flume.apache.org/FlumeUserGuide.html#hdfs-sink 32 | collector.sinks.HadoopOut.type = hdfs 33 | collector.sinks.HadoopOut.channel = mc2 34 | collector.sinks.HadoopOut.hdfs.path = /user/hadoop/flume-channel/%{log_type}/%y%m%d 35 | collector.sinks.HadoopOut.hdfs.fileType = DataStream 36 | collector.sinks.HadoopOut.hdfs.writeFormat = Text 37 | collector.sinks.HadoopOut.hdfs.rollSize = 0 38 | collector.sinks.HadoopOut.hdfs.rollCount = 10000 39 | collector.sinks.HadoopOut.hdfs.rollInterval = 600 40 | 41 | -------------------------------------------------------------------------------- /Flume/web_server_source.conf: -------------------------------------------------------------------------------- 1 | # Source Config 2 | 3 | source_agent.sources = apache_server 4 | source_agent.sources.apache_server.type = exec 5 | source_agent.sources.apache_server.command = tail -f /var/log/httpd/access_log 6 | source_agent.sources.apache_server.batchSize = 1 7 | source_agent.sources.apache_server.channels = mc1 8 | source_agent.sources.apache_server.interceptors = itime ihost itype 9 | 10 | # http://flume.apache.org/FlumeUserGuide.html#timestamp-interceptor 11 | source_agent.sources.apache_server.interceptors.itime.type = timestamp 12 | 13 | # http://flume.apache.org/FlumeUserGuide.html#host-interceptor 14 | source_agent.sources.apache_server.interceptors.ihost.type = host 15 | source_agent.sources.apache_server.interceptors.ihost.useIP = false 16 | source_agent.sources.apache_server.interceptors.ihost.hostHeader = host 17 | 18 | # http://flume.apache.org/FlumeUserGuide.html#static-interceptor 19 | source_agent.sources.apache_server.interceptors.itype.type = static 20 | source_agent.sources.apache_server.interceptors.itype.key = log_type 21 | source_agent.sources.apache_server.interceptors.itype.value = apache_access_combined 22 | 23 | # http://flume.apache.org/FlumeUserGuide.html#memory-channel 24 | source_agent.channels = mc1 25 | source_agent.channels.mc1.type = memory 26 | source_agent.channels.mc1.capacity = 100 27 | 28 | ## Send to Flume Collector on Hadoop Node 29 | # http://flume.apache.org/FlumeUserGuide.html#avro-sink 30 | source_agent.sinks = avro_sink 31 | source_agent.sinks.avro_sink.type = avro 32 | source_agent.sinks.avro_sink.channel = mc1 33 | 
source_agent.sinks.avro_sink.hostname = 192.168.1.109 34 | source_agent.sinks.avro_sink.port = 4545 35 | 36 | #source_agent.sinks = LocalOut 37 | #source_agent.sinks.LocalOut.type = file_roll 38 | #source_agent.sinks.LocalOut.sink.directory = /tmp/flume 39 | #source_agent.sinks.LocalOut.sink.rollInterval = 0 40 | #source_agent.sinks.LocalOut.channel = mc1 41 | -------------------------------------------------------------------------------- /HA_QJM/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | fs.defaultFS 8 | hdfs://netxillon 9 | 10 | 11 | 12 | ha.zookeeper.quorum 13 | nn1.dilithium.com:2181,nn2.dilithium.com:2181,hbm1.dilithium.com:2181 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /HA_QJM/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.namenode.name.dir 5 | file:/data/n1,file:/data/n2 6 | 7 | 8 | 9 | dfs.replication 10 | 2 11 | 12 | 13 | 14 | dfs.blocksize 15 | 268435456 16 | 17 | 18 | 19 | dfs.nameservices 20 | netxillon 21 | 22 | 23 | 24 | dfs.ha.namenodes.netxillon 25 | nn1,nn2 26 | 27 | 28 | 29 | dfs.namenode.rpc-address.netxillon.nn1 30 | nn1.dilithium.com:9000 31 | 32 | 33 | 34 | dfs.namenode.rpc-address.netxillon.nn2 35 | nn2.dilithium.com:9000 36 | 37 | 38 | 39 | dfs.namenode.http-address.netxillon.nn1 40 | nn1.dilithium.com:50070 41 | 42 | 43 | 44 | dfs.namenode.http-address.netxillon.nn2 45 | nn2.dilithium.com:50070 46 | 47 | 48 | 49 | dfs.ha.automatic-failover.enabled 50 | true 51 | 52 | 53 | 54 | dfs.journalnode.edits.dir 55 | /data/netxillon 56 | 57 | 58 | 59 | dfs.namenode.shared.edits.dir 60 | qjournal://nn1.dilithium.com:8485;nn2.dilithium.com:8485;hbm1.dilithium.com:8485/netxillon 61 | 62 | 63 | 64 | dfs.client.failover.proxy.provider.netxillon 65 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 66 | 67 | 68 | 69 | dfs.ha.fencing.ssh.private-key-files 70 | /home/hadoop/.ssh/id_rsa 71 | 72 | 73 | 74 | dfs.ha.fencing.methods 75 | sshfence 76 | shell(/bin/true) 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /HA_QJM/hdfs-site_datanode.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.datanode.data.dir 5 | file:/space/d1,file:/space/d2 6 | 7 | 8 | 9 | dfs.replication 10 | 2 11 | 12 | 13 | 14 | dfs.blocksize 15 | 268435456 16 | 17 | 18 | 19 | dfs.nameservices 20 | netxillon 21 | 22 | 23 | 24 | dfs.ha.namenodes.netxillon 25 | nn1,nn2 26 | 27 | 28 | 29 | dfs.namenode.rpc-address.netxillon.nn1 30 | nn1.dilithium.com:9000 31 | 32 | 33 | 34 | dfs.namenode.rpc-address.netxillon.nn2 35 | nn2.dilithium.com:9000 36 | 37 | 38 | 39 | dfs.client.failover.proxy.provider.netxillon 40 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /HA_QJM/hdfs-site_namenode.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.namenode.name.dir 5 | file:/data/n1,file:/data/n2 6 | 7 | 8 | 9 | dfs.replication 10 | 2 11 | 12 | 13 | 14 | dfs.blocksize 15 | 268435456 16 | 17 | 18 | 19 | dfs.nameservices 20 | netxillon 21 | 22 | 23 | 24 | dfs.ha.namenodes.netxillon 25 | nn1,nn2 26 | 27 | 28 | 29 | dfs.namenode.rpc-address.netxillon.nn1 30 | nn1.dilithium.com:9000 31 | 32 | 33 | 34 | 
dfs.namenode.rpc-address.netxillon.nn2 35 | nn2.dilithium.com:9000 36 | 37 | 38 | 39 | dfs.namenode.http-address.netxillon.nn1 40 | nn1.dilithium.com:50070 41 | 42 | 43 | 44 | dfs.namenode.http-address.netxillon.nn2 45 | nn2.dilithium.com:50070 46 | 47 | 48 | 49 | dfs.ha.automatic-failover.enabled 50 | true 51 | 52 | 53 | 54 | dfs.journalnode.edits.dir 55 | /data/netxillon 56 | 57 | 58 | 59 | dfs.namenode.shared.edits.dir 60 | qjournal://nn1.dilithium.com:8485;nn2.dilithium.com:8485;hbm1.dilithium.com:8485/netxillon 61 | 62 | 63 | 64 | dfs.client.failover.proxy.provider.netxillon 65 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 66 | 67 | 68 | 69 | dfs.ha.fencing.ssh.private-key-files 70 | /home/hadoop/.ssh/id_rsa 71 | 72 | 73 | 74 | dfs.ha.fencing.methods 75 | sshfence 76 | shell(/bin/true) 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /HA_QJM/steps: -------------------------------------------------------------------------------- 1 | QJM Steps 2 | ========= 3 | 1. Setup zookeeper quorum and make sure that it is healthy 4 | - ./zookeeper-3.4.5/bin/zkServer.sh start 5 | - ./zookeeper-3.4.5/bin/zkCli.sh -server nn1.dilithium.com:2181 6 | 7 | or 8 | 9 | [hdfs@nn2 ~]$ zkServer.sh status 10 | ZooKeeper JMX enabled by default 11 | Using config: /opt/cluster/zoo/bin/../conf/zoo.cfg 12 | Mode: follower 13 | 14 | 2. Setup core and hdfs file on each node as given. 15 | 16 | 3. Start Journalnode on all journal node machines 17 | - hadoop-daemon.sh start journalnode 18 | 19 | 4. Format namenode (Do not run this command, if you already have a NN runing and want to update it to HA) 20 | - hdfs namenode -format 21 | 22 | 5. Initialize shared edits for Journal node to see: 23 | - hdfs namenode -initializeSharedEdits -force 24 | 25 | Note: 26 | - namenode must be stopped for this step; 27 | - Only run this if you have not executed step4. 28 | - Means that there was already a single Namenode and now you need to initialize the shared edits for Journals. 29 | - You must not use initialize command if we are formating the Namenode as it will automatically initiallize the Journals 30 | node directories 31 | 32 | 33 | 6. Format zkFC 34 | - hdfs zkfc -formatZK -force 35 | 36 | 7. Start namenode 37 | - hadoop-daemon.sh start namenode 38 | 39 | 8. Start ZKFC 40 | - hadoop-daemon.sh start zkfc 41 | 42 | 9. Bootstrap StandbyNamenode 43 | - hdfs namenode -bootstrapStandby 44 | 45 | 10. Start Namenode on standby 46 | - hadoop-daemon.sh start namenode 47 | 48 | 11. Start ZKFC on standy 49 | - hadoop-daemon.sh start zkfc 50 | 51 | Now your cluster is HA with one active Namenode 52 | 53 | [hdfs@nn1 ~]$ hdfs haadmin -getServiceState nn1 54 | active 55 | [hdfs@nn1 ~]$ hdfs haadmin -getServiceState nn2 56 | standby 57 | 58 | The "start-dfs.sh" script understands that it is a HA with Journal nodes and automatically starts: 59 | - Both namenodes 60 | - All Journal nodes 61 | - Datanode nodes 62 | - Both ZkFC 63 | 64 | Note: Make sure you start the ZK quorum before hand. 65 | 66 | [hdfs@nn1 ~]$ jps 67 | 7828 Jps 68 | 7336 JournalNode 69 | 7512 DFSZKFailoverController 70 | 7162 NameNode 71 | 72 | -------------------------------------------------------------------------------- /HA_QJM/zoo.cfg: -------------------------------------------------------------------------------- 1 | [hdfs@nn2 ~]$ cat .bash_profile 2 | # .bash_profile 3 | 4 | # Get the aliases and functions 5 | if [ -f ~/.bashrc ]; then 6 | . 
~/.bashrc 7 | fi 8 | 9 | # User specific environment and startup programs 10 | 11 | PATH=$PATH:$HOME/bin 12 | ZOOKEEPER_HOME=/opt/cluster/zoo 13 | 14 | PATH=$ZOOKEEPER_HOME/bin:$PATH 15 | export PATH 16 | 17 | 18 | [hdfs@nn2 ~]$ cat /opt/cluster/zoo/conf/zoo.cfg 19 | 20 | # The number of milliseconds of each tick 21 | tickTime=2000 22 | 23 | # The number of ticks that the initial synchronization phase can take 24 | initLimit=10 25 | 26 | # The number of ticks that can pass between 27 | # sending a request and getting an acknowledgement 28 | syncLimit=5 29 | 30 | # the directory where the snapshot is stored. 31 | # Choose appropriately for your environment 32 | dataDir=/opt/cluster/zookeeper/data 33 | 34 | # the port at which the clients will connect 35 | clientPort=2181 36 | 37 | maxClientCnxns=60 38 | 39 | # the directory where transaction log is stored. 40 | # this parameter provides dedicated log device for ZooKeeper 41 | dataLogDir=/opt/cluster/zookeeper/logs 42 | 43 | # ZooKeeper server and its port no. 44 | # ZooKeeper ensemble should know about every other machine in the ensemble 45 | # specify server id by creating 'myid' file in the dataDir 46 | # use hostname instead of IP address for convenient maintenance 47 | server.1=nn1.dilithium.com:2888:3888 48 | server.2=nn2.dilithium.com:2889:3889 49 | server.3=hbm1.dilithium.com:2890:3890 50 | 51 | -------------------------------------------------------------------------------- /HA_RM/yarn-site.xml.ha: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | yarn.nodemanager.aux-services 8 | mapreduce_shuffle,spark_shuffle,spark2_shuffle 9 | 10 | 11 | 12 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 13 | org.apache.hadoop.mapred.ShuffleHandler 14 | 15 | 16 | 17 | yarn.nodemanager.aux-services.spark_shuffle.class 18 | org.apache.spark.network.yarn.YarnShuffleService 19 | 20 | 21 | 22 | yarn.nodemanager.aux-services.spark2_shuffle.class 23 | org.apache.spark.network.yarn.YarnShuffleService 24 | 25 | 26 | # HA Configuration 27 | 28 | 29 | 30 | 31 | yarn.resourcemanager.ha.enabled 32 | true 33 | 34 | 35 | 36 | yarn.resourcemanager.cluster-id 37 | netxillon 38 | 39 | 40 | 41 | yarn.resourcemanager.ha.rm-ids 42 | rm1,rm2 43 | 44 | 45 | 46 | yarn.resourcemanager.hostname.rm1 47 | rm1.dilithium.com 48 | 49 | 50 | 51 | yarn.resourcemanager.hostname.rm2 52 | rm2.dilithium.com 53 | 54 | 55 | 56 | yarn.resourcemanager.webapp.address.rm1 57 | rm1.dilithium.com:8088 58 | 59 | 60 | 61 | yarn.resourcemanager.webapp.address.rm2 62 | rm2.dilithium.com:8088 63 | 64 | 65 | 66 | yarn.resourcemanager.recovery.enabled 67 | true 68 | 69 | 70 | 71 | yarn.resourcemanager.store.class 72 | org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore 73 | 74 | 75 | 76 | yarn.resourcemanager.zk-address 77 | nn1.dilithium.com:2181,nn2.dilithium.com:2181,hbm1.dilithium.com:2181 78 | 79 | 80 | 81 | yarn.client.failover-proxy-provider 82 | org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider 83 | 84 | 85 | 99 | # End HA Configuration 100 | 101 | 102 | yarn.nodemanager.resource.memory-mb 103 | 3072 104 | 105 | 106 | 107 | yarn.scheduler.minimum-allocation-mb 108 | 256 109 | 110 | 111 | 112 | yarn.scheduler.maximum-allocation-mb 113 | 3072 114 | 115 | 116 | 117 | yarn.scheduler.minimum-allocation-vcores 118 | 1 119 | 120 | 121 | 122 | yarn.scheduler.maximum-allocation-vcores 123 | 12 124 | 125 | 126 | 127 | yarn.nodemanager.resource.cpu-vcores 128 | 12 129 | 130 | 131 | 132 | 133 | 
yarn.nodemanager.vmem-pmem-ratio 134 | 2.1 135 | 136 | 137 | 138 | 139 | yarn.log-aggregation-enable 140 | true 141 | 142 | 143 | 144 | Where to aggregate logs to. 145 | yarn.nodemanager.remote-app-log-dir 146 | /tmp/logs 147 | 148 | 149 | 150 | yarn.log-aggregation.retain-seconds 151 | 259200 152 | 153 | 154 | 155 | yarn.log-aggregation.retain-check-interval-seconds 156 | 3600 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /HA_RM/yarn-site.xml.spark: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | yarn.nodemanager.aux-services 8 | mapreduce_shuffle,spark_shuffle,spark2_shuffle 9 | 10 | 11 | 12 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 13 | org.apache.hadoop.mapred.ShuffleHandler 14 | 15 | 16 | 17 | yarn.nodemanager.aux-services.spark_shuffle.class 18 | org.apache.spark.network.yarn.YarnShuffleService 19 | 20 | 21 | 22 | 23 | yarn.nodemanager.aux-services.spark2_shuffle.class 24 | org.apache.spark.network.yarn.YarnShuffleService 25 | 26 | 27 | 28 | yarn.resourcemanager.resource-tracker.address 29 | rm1.dilithium.com:9001 30 | 31 | 32 | 33 | yarn.resourcemanager.scheduler.address 34 | rm1.dilithium.com:9002 35 | 36 | 37 | 38 | yarn.resourcemanager.address 39 | rm1.dilithium.com:9003 40 | 41 | 42 | # 43 | #yarn.nodemanager.local-dirs 44 | #file:/space/tmp1,file:/space/tmp2 45 | # 46 | 47 | 48 | yarn.nodemanager.resource.memory-mb 49 | 3072 50 | 51 | 52 | 53 | yarn.scheduler.minimum-allocation-mb 54 | 256 55 | 56 | 57 | 58 | yarn.scheduler.maximum-allocation-mb 59 | 3072 60 | 61 | 62 | 63 | yarn.scheduler.minimum-allocation-vcores 64 | 1 65 | 66 | 67 | 68 | yarn.scheduler.maximum-allocation-vcores 69 | 12 70 | 71 | 72 | 73 | yarn.nodemanager.resource.cpu-vcores 74 | 12 75 | 76 | 77 | 78 | 79 | yarn.nodemanager.vmem-pmem-ratio 80 | 2.1 81 | 82 | 83 | # 84 | # yarn.nodemanager.vmem-check-enabled 85 | # false 86 | # Whether virtual memory limits will be enforced for containers 87 | # 88 | 89 | 90 | yarn.log-aggregation-enable 91 | true 92 | 93 | 94 | 95 | Where to aggregate logs to. 
96 | yarn.nodemanager.remote-app-log-dir 97 | /tmp/logs 98 | 99 | 100 | 101 | yarn.log-aggregation.retain-seconds 102 | 259200 103 | 104 | 105 | 106 | yarn.log-aggregation.retain-check-interval-seconds 107 | 3600 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /HA_RM/yarn-site_nodemanager_ha.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | yarn.nodemanager.aux-services 8 | mapreduce_shuffle,spark_shuffle,spark2_shuffle 9 | 10 | 11 | 12 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 13 | org.apache.hadoop.mapred.ShuffleHandler 14 | 15 | 16 | 17 | yarn.nodemanager.aux-services.spark_shuffle.class 18 | org.apache.spark.network.yarn.YarnShuffleService 19 | 20 | 21 | 22 | yarn.nodemanager.aux-services.spark2_shuffle.class 23 | org.apache.spark.network.yarn.YarnShuffleService 24 | 25 | 26 | # HA Configuration 27 | 28 | 29 | 30 | 31 | yarn.resourcemanager.ha.enabled 32 | true 33 | 34 | 35 | 36 | yarn.resourcemanager.cluster-id 37 | netxillon 38 | 39 | 40 | 41 | yarn.resourcemanager.ha.rm-ids 42 | rm1,rm2 43 | 44 | 45 | 46 | yarn.resourcemanager.hostname.rm1 47 | rm1.dilithium.com 48 | 49 | 50 | 51 | yarn.resourcemanager.hostname.rm2 52 | rm2.dilithium.com 53 | 54 | 55 | 56 | yarn.client.failover-proxy-provider 57 | org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider 58 | 59 | 60 | # End HA Configuration 61 | 62 | 63 | yarn.nodemanager.resource.memory-mb 64 | 3072 65 | 66 | 67 | 68 | yarn.scheduler.minimum-allocation-mb 69 | 256 70 | 71 | 72 | 73 | yarn.scheduler.maximum-allocation-mb 74 | 3072 75 | 76 | 77 | 78 | yarn.scheduler.minimum-allocation-vcores 79 | 1 80 | 81 | 82 | 83 | yarn.scheduler.maximum-allocation-vcores 84 | 12 85 | 86 | 87 | 88 | yarn.nodemanager.resource.cpu-vcores 89 | 12 90 | 91 | 92 | 93 | 94 | yarn.nodemanager.vmem-pmem-ratio 95 | 2.1 96 | 97 | 98 | 99 | 100 | yarn.log-aggregation-enable 101 | true 102 | 103 | 104 | 105 | Where to aggregate logs to. 
106 | yarn.nodemanager.remote-app-log-dir 107 | /tmp/logs 108 | 109 | 110 | 111 | yarn.log-aggregation.retain-seconds 112 | 259200 113 | 114 | 115 | 116 | yarn.log-aggregation.retain-check-interval-seconds 117 | 3600 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /HA_hadoop/core-site.xml: -------------------------------------------------------------------------------- 1 | Both Namenodes 2 | ============== 3 | 4 | 5 | 6 | 7 | fs.defaultFS 8 | hdfs://mycluster 9 | 10 | 11 | 12 | dfs.replication 13 | 1 14 | 15 | 16 | 17 | 18 | Data Nodes 19 | ========== 20 | 21 | 22 | 23 | fs.defaultFS 24 | hdfs://mycluster 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /HA_hadoop/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | Namenodes 2 | ========= 3 | 4 | 5 | 6 | 7 | dfs.namenode.name.dir 8 | file:/data/namenode 9 | 10 | 11 | 12 | dfs.nameservices 13 | mycluster 14 | 15 | 16 | 17 | dfs.ha.namenodes.mycluster 18 | nn1,nn2 19 | 20 | 21 | 22 | dfs.namenode.rpc-address.mycluster.nn1 23 | ha-nn1.hacluster1.com:9000 24 | 25 | 26 | 27 | dfs.namenode.rpc-address.mycluster.nn2 28 | ha-nn2.hacluster1.com:9000 29 | 30 | 31 | 32 | dfs.namenode.http-address.mycluster.nn1 33 | ha-nn1.hacluster1.com:50070 34 | 35 | 36 | 37 | dfs.namenode.http-address.mycluster.nn2 38 | ha-nn2.hacluster1.com:50070 39 | 40 | 41 | 42 | dfs.namenode.shared.edits.dir 43 | file:///mnt/filer 44 | 45 | 46 | 47 | dfs.client.failover.proxy.provider.mycluster 48 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 49 | 50 | 51 | 52 | dfs.ha.fencing.methods 53 | sshfence 54 | 55 | 56 | 57 | dfs.ha.fencing.ssh.private-key-files 58 | /home/hadoop/.ssh/id_rsa 59 | 60 | 61 | 62 | dfs.ha.fencing.methods 63 | sshfence 64 | shell(/bin/true) 65 | 66 | 67 | 68 | 69 | ====================== 70 | 71 | Datanodes 72 | ========= 73 | 74 | 75 | 76 | 77 | dfs.datanode.data.dir 78 | file:/data/datanode 79 | 80 | 81 | 82 | dfs.nameservices 83 | mycluster 84 | 85 | 86 | 87 | dfs.ha.namenodes.mycluster 88 | nn1,nn2 89 | 90 | 91 | 92 | dfs.namenode.rpc-address.mycluster.nn1 93 | ha-nn1.hacluster1.com:9000 94 | 95 | 96 | 97 | dfs.namenode.rpc-address.mycluster.nn2 98 | ha-nn2.hacluster1.com:9000 99 | 100 | 101 | 102 | dfs.client.failover.proxy.provider.mycluster 103 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /HBase/Optimizations/HBase_yscb.txt: -------------------------------------------------------------------------------- 1 | Steps: 2 | 3 | 1. tar -xzvf ycsb-0.13.0-SNAPSHOT.tar.gz 4 | 2. cd ycsb-0.13.0-SNAPSHOT 5 | 3. cp /usr/lib/hbase/lib/slf4j-api-1.6.1.jar . 6 | 4. cp /usr/lib/hbase/lib/zookeeper.jar . 7 | 8 | hbase> create 'usertable', {NAME => 'f1', VERSIONS => '1', COMPRESSION => 'SNAPPY'} 9 | 10 | 5. 
cd ycsb-0.13.0-SNAPSHOT/bin 11 | 12 | $ ycsb load hbase12 -P workloads/workloada -p columnfamily=f1 -p recordcount=1000000 -p threadcount=4 -s | tee -a write.txt 13 | $ ycsb load hbase12 -P workloads/workloadb -p columnfamily=f1 -p recordcount=100000 -p operationcount=10000 -p threadcount=4 -s | tee -a workloadread.dat -------------------------------------------------------------------------------- /HBase/Optimizations/Hbase_create_table.txt: -------------------------------------------------------------------------------- 1 | 2 | hbase(main):001:0> create 'emp', 'personal data', 'professional data', {SPLITS => (1..n_splits).map {|i| "user#{1000+i*(9999-1000)/n_splits}"}} 3 | 4 | create 'emp1', 'personal data', 'professional data', {REPLICATION_SCOPE => 1} 5 | 6 | 7 | hbase(main):001:0> n_splits = 200 # HBase recommends (10 * number of regionservers) 8 | hbase(main):002:0> create 'usertable', 'family', {SPLITS => (1..n_splits).map {|i| "user#{1000+i*(9999-1000)/n_splits}"}} 9 | 10 | 11 | scan 'hbase:meta',{FILTER=>"PrefixFilter('emp1')"} 12 | 13 | 14 | Snapshots: 15 | 16 | 17 | hbase snapshot create -n snapshotName -t tableName 18 | 19 | hbase shell 20 | >> delete_snapshot 'snapshotName' 21 | >> restore_snapshot snapshotName 22 | >> list_snapshots 23 | >> clone_snapshot 'snapshotName', 'newTableName' 24 | 25 | hbase snapshot info -snapshot snapshotName 26 | 27 | 28 | -------------------------------------------------------------------------------- /HBase/Optimizations/Hbase_happybase.txt: -------------------------------------------------------------------------------- 1 | table = connection.table('table-name') 2 | 3 | table.put(b'row-key', {b'family:qual1': b'value1', 4 | b'family:qual2': b'value2'}) 5 | 6 | row = table.row(b'row-key') 7 | print(row[b'family:qual1']) # prints 'value1' 8 | 9 | for key, data in table.rows([b'row-key-1', b'row-key-2']): 10 | print(key, data) # prints row key and data for each row 11 | 12 | for key, data in table.scan(row_prefix=b'row'): 13 | print(key, data) # prints 'value1' and 'value2' 14 | 15 | row = table.delete(b'row-key') 16 | 17 | 18 | families = { 19 | 'cf1': dict(max_versions=10), 20 | 'cf2': dict(max_versions=1, block_cache_enabled=False), 21 | 'cf3': dict(), # use defaults 22 | } 23 | 24 | connection.create_table('mytable', families) -------------------------------------------------------------------------------- /HBase/Optimizations/Hbase_rand_gen.txt: -------------------------------------------------------------------------------- 1 | hbase(main):005:0> put 'emp','1','personal data:name','raju’ 2 | hbase(main):006:0> put 'emp','1','personal data:city','hyderabad' 3 | hbase(main):007:0> put 'emp','1','professional data:designation','manager' 4 | hbase(main):007:0> put 'emp','1','professional data:salary','50000’ 5 | 6 | locate_region 'test', '1' 7 | get_splits 'test' 8 | 9 | create 'emp', 'personal data', 'professional data' 10 | #!/bin/bash 11 | 12 | for i in `seq 1 1000000` 13 | do 14 | 15 | echo "put 'emp', '$i', 'personal data:name', 'raju$i'" 16 | echo "put 'emp', '$i', 'personal data:city', 'hyderabad$i'" 17 | echo "put 'emp', '$i', 'professional data:designation', 'manager$i'" 18 | echo "put 'emp', '$i', 'professional data:salary', '20000$i'" 19 | 20 | done 21 | 22 | # Optimized versions 23 | ==================== 24 | 25 | #!/bin/bash 26 | 27 | MIN=0 28 | MAX=1234567890 29 | while 30 | for i in `seq 1 1000000` 31 | do 32 | rand=$(cat /dev/urandom | tr -dc 0-9 | fold -w${#MAX} | head -1 | sed 's/^0*//;') 33 | [ -z $rnd ] && rnd=0 34 | (( $rnd 
< $MIN || $rnd > $MAX )) 35 | 36 | echo "put 'emp', '$rand', 'personal data:name', 'raju$i'" 37 | echo "put 'emp', '$rand', 'personal data:city', 'hyderabad$i'" 38 | echo "put 'emp', '$rand', 'professional data:designation', 'manager$i'" 39 | echo "put 'emp', '$rand', 'professional data:salary', '20000$i'" 40 | done 41 | do : 42 | done 43 | 44 | ============= 45 | 46 | #!/bin/bash 47 | 48 | # create 'emp', 'personal data', 'professional data' 49 | 50 | MIN=0 51 | MAX=1234567890 52 | while 53 | 54 | exp=`shuf -i 2000-65000 -n 1` 55 | #for i in `seq 1 10000000` 56 | #do 57 | rand=$(cat /dev/urandom | tr -dc 0-9 | fold -w${#MAX} | head -1 | sed 's/^0*//;') 58 | [ -z $rand ] && rand=0 59 | (( $rand < $MIN || $rand > $MAX )) 60 | 61 | echo "put 'emp', '$rand', 'personal data:name', 'raju$exp'" 62 | echo "put 'emp', '$rand', 'personal data:city', 'hyderabad$exp'" 63 | echo "put 'emp', '$rand', 'professional data:designation', 'manager$exp'" 64 | echo "put 'emp', '$rand', 'professional data:salary', '20$exp'" 65 | #done 66 | do : 67 | done 68 | -------------------------------------------------------------------------------- /HBase/Optimizations/Netxillon_HBase.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/HBase/Optimizations/Netxillon_HBase.pdf -------------------------------------------------------------------------------- /HBase/README.md: -------------------------------------------------------------------------------- 1 | export HADOOP_ROOT_LOGGER=TRACE,console; export HADOOP_JAAS_DEBUG=true; export HADOOP_OPTS="-Dsun.security.krb5.debug=true" 2 | 3 | export HBASE_ROOT_LOGGER=hbase.root.logger=DEBUG,console 4 | -------------------------------------------------------------------------------- /HBase/backup-masters: -------------------------------------------------------------------------------- 1 | dn2.cluster1.com 2 | -------------------------------------------------------------------------------- /HBase/commands.txt: -------------------------------------------------------------------------------- 1 | start-hbase.sh 2 | stop-hbase.sh 3 | 4 | hbase shell; 5 | 6 | 7 | create 't1', {NAME => 'f1', VERSIONS => 5} 8 | 9 | describe 't1' 10 | 11 | 12 | create 'class', 'cf' 13 | 14 | put 'class', 'row1', 'cf:a', 'value1' 15 | put 'class', 'row2', 'cf:b', 'value2' 16 | put 'class', 'row3', 'cf:c', 'value3' 17 | scan 'test1' 18 | 19 | put 'test', 'row1', 'cf:a', 'value1' 20 | put 'test', 'row3', 'cf:c', 'value3' 21 | -------------------------------------------------------------------------------- /HBase/hbase-site.txt: -------------------------------------------------------------------------------- 1 | 2 | hbase.master 3 | client.cluster1.com:60000 4 | 5 | 6 | 7 | hbase.rootdir 8 | hdfs://nn1.cluster1.com:9000/hbase 9 | 10 | 11 | 12 | hbase.cluster.distributed 13 | true 14 | 15 | 16 | 17 | hbase.zookeeper.quorum 18 | dn1.cluster1.com,dn2.cluster1.com 19 | 20 | 21 | 22 | 23 | hbase.zookeeper.property.clientPort 24 | 2181 25 | 26 | 27 | Standalone Only 28 | =============== 29 | 30 | 31 | 32 | hbase.rootdir 33 | file:///home/hadoop/hdata 34 | 35 | 36 | 37 | hbase.zookeeper.property.dataDir 38 | /home/hadoop/zookeeper 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /HBase/hfile: -------------------------------------------------------------------------------- 1 | [hdfs@edge1 conf]$ hbase org.apache.hadoop.hbase.io.hfile.HFile -v 
-f /hbase/data/default/class/9c4785ae4c37c4679773326224d7f5fe/cf/e90064527dda44d1b09f12b36edb4dd7 2 | SLF4J: Class path contains multiple SLF4J bindings. 3 | SLF4J: Found binding in [jar:file:/opt/cluster/hbase-1.0.1.1/lib/slf4j-log4j12-1.7.7.jar!/org/slf4j/impl/StaticLoggerBinder.class] 4 | SLF4J: Found binding in [jar:file:/opt/cluster/hadoop-2.6.0/share/hadoop/common/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class] 5 | SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation. 6 | SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory] 7 | Scanning -> /hbase/data/default/class/9c4785ae4c37c4679773326224d7f5fe/cf/e90064527dda44d1b09f12b36edb4dd7 8 | 2016-01-26 00:06:50,390 INFO [main] hfile.CacheConfig: CacheConfig:disabled 9 | Scanned kv count -> 5 10 | [hdfs@edge1 conf]$ hbase org.apache.hadoop.hbase.io.hfile.HFile -v -f hdfs://nn1.dilithium.com:9000/hbase/data/default/class/9c4785ae4c37c4679773326224d7f5fe/cf/e90064527dda44d1b09f12b36edb4dd7 11 | SLF4J: Class path contains multiple SLF4J bindings. 12 | SLF4J: Found binding in [jar:file:/opt/cluster/hbase-1.0.1.1/lib/slf4j-log4j12-1.7.7.jar!/org/slf4j/impl/StaticLoggerBinder.class] 13 | SLF4J: Found binding in [jar:file:/opt/cluster/hadoop-2.6.0/share/hadoop/common/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class] 14 | SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation. 15 | SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory] 16 | Scanning -> hdfs://nn1.dilithium.com:9000/hbase/data/default/class/9c4785ae4c37c4679773326224d7f5fe/cf/e90064527dda44d1b09f12b36edb4dd7 17 | 2016-01-26 00:07:16,371 INFO [main] hfile.CacheConfig: CacheConfig:disabled 18 | Scanned kv count -> 5 19 | -------------------------------------------------------------------------------- /HBase/hive-mysql.txt: -------------------------------------------------------------------------------- 1 | mysql> CREATE DATABASE metastore_db; 2 | Query OK, 1 row affected (0.00 sec) 3 | 4 | mysql> CREATE USER 'hadoop'@'%' IDENTIFIED BY 'hivepassword'; 5 | Query OK, 0 rows affected (0.00 sec) 6 | 7 | mysql> GRANT all on *.* to 'hadoop'@client.cluster1.com identified by 'hivepassword'; 8 | Query OK, 0 rows affected (0.00 sec) 9 | 10 | mysql> flush privileges; 11 | Query OK, 0 rows affected (0.00 sec) 12 | 13 | ==================== 14 | 15 | 16 | 17 | 18 | 19 | hive.metastore.local 20 | true 21 | 22 | 23 | 24 | 25 | 26 | 27 | javax.jdo.option.ConnectionURL 28 | 29 | jdbc:mysql://client.cluster1.com:3306/metastore_db?createDatabaseIfNotExist=true 30 | 31 | metadata is stored in a MySQL server 32 | 33 | 34 | 35 | 36 | 37 | javax.jdo.option.ConnectionDriverName 38 | 39 | com.mysql.jdbc.Driver 40 | 41 | MySQL JDBC driver class 42 | 43 | 44 | 45 | 46 | 47 | javax.jdo.option.ConnectionUserName 48 | 49 | hadoop 50 | 51 | user name for connecting to mysql server 52 | 53 | 54 | 55 | 56 | 57 | javax.jdo.option.ConnectionPassword 58 | 59 | hivepassword 60 | 61 | password for connecting to mysql server 62 | 63 | 64 | 65 | 66 | =================== 67 | Start hive Server 68 | =================== 69 | 70 | hive --service hiveserver2& 71 | 72 | Start a Separate Metastore Service 73 | ----------------------------------- 74 | 75 | 76 | hive.metastore.uris 77 | thrift://:9083 78 | IP address (or fully-qualified domain name) and port of the metastore host 79 | 80 | 81 | 82 | hive.metastore.schema.verification 83 | true 84 | 85 | 86 | hive --service metastore& 87 | 88 | mysql> use 
metastore_db; 89 | Reading table information for completion of table and column names 90 | You can turn off this feature to get a quicker startup with -A 91 | 92 | Database changed 93 | mysql> show tables; 94 | +---------------------------+ 95 | | Tables_in_metastore_db | 96 | +---------------------------+ 97 | | BUCKETING_COLS | 98 | | CDS | 99 | | COLUMNS_V2 | 100 | | DATABASE_PARAMS | 101 | | DBS | 102 | | PARTITION_KEYS | 103 | | SDS | 104 | | SD_PARAMS | 105 | | SEQUENCE_TABLE | 106 | | SERDES | 107 | | SERDE_PARAMS | 108 | | SKEWED_COL_NAMES | 109 | | SKEWED_COL_VALUE_LOC_MAP | 110 | | SKEWED_STRING_LIST | 111 | | SKEWED_STRING_LIST_VALUES | 112 | | SKEWED_VALUES | 113 | | SORT_COLS | 114 | | TABLE_PARAMS | 115 | | TBLS | 116 | | VERSION | 117 | +---------------------------+ 118 | 20 rows in set (0.00 sec) 119 | 120 | mysql> show databases; 121 | +--------------------+ 122 | | Database | 123 | +--------------------+ 124 | | information_schema | 125 | | employee | 126 | | metastore_db | 127 | | mysql | 128 | | test | 129 | +--------------------+ 130 | 5 rows in set (0.00 sec) 131 | 132 | mysql> select * from TBLS; 133 | +--------+-------------+-------+------------------+--------+-----------+-------+-------------+---------------+--------------------+--------------------+ 134 | | TBL_ID | CREATE_TIME | DB_ID | LAST_ACCESS_TIME | OWNER | RETENTION | SD_ID | TBL_NAME | TBL_TYPE | VIEW_EXPANDED_TEXT | VIEW_ORIGINAL_TEXT | 135 | +--------+-------------+-------+------------------+--------+-----------+-------+-------------+---------------+--------------------+--------------------+ 136 | | 1 | 1403283170 | 1 | 0 | hadoop | 0 | 1 | hivetesting | MANAGED_TABLE | NULL | NULL | 137 | +--------+-------------+-------+------------------+--------+-----------+-------+-------------+---------------+--------------------+--------------------+ 138 | 1 row in set (0.00 sec) 139 | -------------------------------------------------------------------------------- /HBase/hive.txt: -------------------------------------------------------------------------------- 1 | export JAVA_HOME=/usr/java/jdk1.7.0_25/ 2 | export HIVE_HOME=/home/hadoop/hive/ 3 | export HBASE_HOME=/home/hadoop/hbase/ 4 | 5 | PATH=$PATH:$HOME/bin 6 | PATH=$PATH:/home/hadoop/hadoop/bin:$JAVA_HOME/bin:$HIVE_HOME/bin:/$HBASE_HOME/bin 7 | 8 | export PIG_HOME=/home/hadoop/pig 9 | export PIG_INSTALL=/home/hadoop/pig 10 | 11 | export HIVE_HOME=/home/hadoop/hive 12 | export HBASE_HOME=/home/hadoop/hbase 13 | 14 | ============= 15 | 16 | 17 | $ hadoop fs -mkdir /tmp 18 | $ hadoop fs -mkdir /user/hive/warehouse 19 | $ hadoop fs -chmod g+w /tmp 20 | $ hadoop fs -chmod g+w /user/hive/warehouse 21 | 22 | you must create /tmp and /user/hive/warehouse (aka hive.metastore.warehouse.dir) and set aprpopriate permissions in HDFS 23 | 24 | hive> SET mapred.job.tracker=myhost.mycompany.com:50030; 25 | 26 | 27 | CREATE DATABASE test_hive_db; 28 | 29 | 30 | Creating Hive Tables 31 | ================== 32 | hive> CREATE TABLE pokes (foo INT, bar STRING); 33 | 34 | LOAD DATA LOCAL INPATH './examples/files/kv1.txt' OVERWRITE INTO TABLE pokes; 35 | 36 | 37 | creates a table called pokes with two columns, the first being an integer and the other a string. 
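
A quick sanity check after the load (illustrative queries only):

hive> SELECT * FROM pokes LIMIT 5;
hive> SELECT count(*) FROM pokes;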
38 | 39 | ================= 40 | 41 | 42 | hive> CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING); 43 | 44 | 45 | hive> LOAD DATA LOCAL INPATH './hive/examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15'); 46 | hive> LOAD DATA LOCAL INPATH './examples/files/kv3.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-08'); 47 | 48 | 49 | Loading from hdfs 50 | 51 | hive> LOAD DATA INPATH '/user/myname/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15'); 52 | 53 | 54 | Browsing through Tables 55 | 56 | hive> SHOW TABLES; 57 | 58 | lists all the tables. 59 | 60 | hive> SHOW TABLES '.*s'; 61 | 62 | hive> DESCRIBE invites; 63 | 64 | shows the list of columns. 65 | Altering and Dropping Tables 66 | 67 | Table names can be changed and columns can be added or replaced: 68 | 69 | hive> ALTER TABLE events RENAME TO 3koobecaf; 70 | hive> ALTER TABLE pokes ADD COLUMNS (new_col INT); 71 | hive> ALTER TABLE invites ADD COLUMNS (new_col2 INT COMMENT 'a comment'); 72 | hive> ALTER TABLE invites REPLACE COLUMNS (foo INT, bar STRING, baz INT COMMENT 'baz replaces new_col2'); 73 | 74 | Note that REPLACE COLUMNS replaces all existing columns and only changes the table's schema, not the data. The table must use a native SerDe. REPLACE COLUMNS can also be used to drop columns from the table's schema: 75 | 76 | hive> ALTER TABLE invites REPLACE COLUMNS (foo INT COMMENT 'only keep the first column'); 77 | 78 | Dropping tables: 79 | 80 | hive> DROP TABLE pokes; 81 | 82 | 83 | hive> LOAD DATA LOCAL INPATH './examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15'); 84 | hive> LOAD DATA LOCAL INPATH './examples/files/kv3.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-08'); 85 | ============== 86 | 87 | CREATE TABLE tags (userId INT,movieId INT,tag STRING,time timestamp) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','; 88 | 89 | CREATE TABLE test (userId INT,movieId INT,tag STRING,time timestamp) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'; 90 | 91 | CREATE external TABLE test1 (name STRING,Id INT,roll INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' stored as textfile location '/user/hadoop/dump'; 92 | 93 | Hive VERSION Table 94 | 95 | mysql> CREATE TABLE VERSION ( VER_ID bigint(20) NOT NULL, SCHEMA_VERSION varchar(127) NOT NULL, VERSION_COMMENT varchar(255), PRIMARY KEY (VER_ID)); 96 | Query OK, 0 rows affected (0.00 sec) 97 | 98 | mysql> insert into VERSION (VER_ID,SCHEMA_VERSION,VERSION_COMMENT) values (1,"0.14.0","Hive release version 0.14.0"); 99 | Query OK, 1 row affected (0.00 sec) 100 | 101 | /usr/lib/hive/bin/schematool -dbType mysql -initSchema 102 | 103 | 104 | Performance tune Hive after checking stats on a table 105 | 106 | analyze table store compute statistics noscan; 107 | analyze table store compute statistics; 108 | analyze table store compute statistics for columns; 109 | ANALYZE TABLE Table1 CACHE METADATA; 110 | ANALYZE TABLE Table1 PARTITION(ds='2008-04-09', hr) COMPUTE STATISTICS NOSCAN; 111 | -------------------------------------------------------------------------------- /HBase/regions.txt: -------------------------------------------------------------------------------- 1 | node1 2 | node2 -------------------------------------------------------------------------------- /HBase/regionservers: -------------------------------------------------------------------------------- 1 | dn1.cluster1.com 2 | dn2.cluster1.com 3 | dn3.cluster1.com 4 | 
-------------------------------------------------------------------------------- /HBase/replication: -------------------------------------------------------------------------------- 1 | hbase(main):003:0> add_peer '1', CLUSTER_KEY => 'd1.aus.cloudera.site,d2.aus.cloudera.site,d3.aus.cloudera.site:2181:/hbase' 2 | hbase(main):003:0> disable_peer("1") 3 | hbase(main):003:0> enable_table_replication 'emp' 4 | hbase(main):003:0> enable_table_replication 'emp1' 5 | 6 | hbase$ hbase snapshot create -n emp1_4aug -t emp1 7 | 8 | hbase$ hbase org.apache.hadoop.hbase.snapshot.ExportSnapshot -snapshot emp1_4aug -copy-to hdfs://d2.aus.cloudera.site:8020/hbase -mappers 2 9 | 10 | hbase(main):003:0> enable_peer("1") 11 | 12 | 13 | # The above steps are to be used when the soruce cluster already has data/tables. 14 | # In a new cluster with no data, we do not need to export snapshot and disable_peer (To build backlog for WALs) 15 | 16 | 17 | 18 | hbase(main):003:0> list_peers 19 | PEER_ID CLUSTER_KEY ENDPOINT_CLASSNAME STATE REPLICATE_ALL NAMESPACES TABLE_CFS BANDWIDTH SERIAL 20 | 1 d1.aus.cloudera.site,d2.aus.cloudera.site,d3.aus.cloudera.site:2181:/hbase ENABLED true 0 false 21 | 1 row(s) 22 | Took 0.0125 seconds 23 | => # 24 | 25 | hbase(main):004:0> list_peer_configs 26 | PeerId 1 27 | Cluster Key d1.aus.cloudera.site,d2.aus.cloudera.site,d3.aus.cloudera.site:2181:/hbase 28 | 29 | Took 0.0090 seconds 30 | => {"1"=>#} 31 | -------------------------------------------------------------------------------- /HBase/tez-setup: -------------------------------------------------------------------------------- 1 | Tez Configuration 2 | 3 | 1. Download Tez tar ball: 4 | 5 | $ su - hadoop 6 | $ wget www-us.apache.org/dist/tez/0.8.4/apache-tez-0.8.4-bin.tar.gz 7 | 8 | untar it in any directory and set path to it. Should be readable by the user running hive. 9 | 10 | $ tar xzvf apache-tez-0.8.4-bin.tar.gz 11 | $ ln -s apache-tez-0.8.4-bin tez 12 | 13 | Copy the tez tarball to a path on HDFS. 14 | 15 | $ hadoop fs -mkdir -p /apps/tez 16 | $ hadoop fs -put tez/share/tez.tar.gz /apps/tez 17 | $ hadoop fs -put hive/lib/hive-exec-1.2.2.jar /apps/tez 18 | 19 | $ vi tez/conf/tez-site.xml 20 | 21 | 22 | 23 | 24 | tez.lib.uris 25 | /apps/tez/tez.tar.gz "This path is the HDFS path, can be speficied using the hdfs://path syntax as well" 26 | 27 | 28 | 29 | tez.am.resource.memory.mb 30 | 2048 31 | 32 | 33 | 34 | 35 | Set ENV 36 | 37 | vi /etc/profile.d/hadoopenv.sh or .bash_profile 38 | 39 | export TEZ_CONF_DIR=/home/hadoop/tez/conf 40 | export TEZ_JARS=/home/hadoop/tez/ 41 | 42 | export HADOOP_CLASSPATH=${TEZ_CONF_DIR}:${TEZ_JARS}/*:${TEZ_JARS}/lib/*:$HADOOP_CLASSPATH 43 | 44 | Set the execution mode in the hive configuration: 45 | 46 | $ vi hive/conf/hive-site.xml 47 | 48 | 49 | 50 | hive.execution.engine 51 | tez 52 | 53 | 54 | 55 | This can be done at the hive/beeline prompt as well: 56 | 57 | hive> set hive.execution.engine=tez; 58 | 59 | 60 | Test by running any example: 61 | 62 | hive> select count(*) from pokes; 63 | Query ID = hadoop_20180414105904_37f4b946-30cc-447a-8878-be956d0b222e 64 | Total jobs = 1 65 | Launching Job 1 out of 1 66 | 67 | 68 | Status: Running (Executing on YARN cluster with App id application_1523714759756_0007) 69 | 70 | -------------------------------------------------------------------------------- 71 | VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED 72 | -------------------------------------------------------------------------------- 73 | Map 1 .......... 
SUCCEEDED 1 1 0 0 0 0 74 | Reducer 2 ...... SUCCEEDED 1 1 0 0 0 0 75 | -------------------------------------------------------------------------------- 76 | VERTICES: 02/02 [==========================>>] 100% ELAPSED TIME: 6.88 s 77 | -------------------------------------------------------------------------------- 78 | OK 79 | 500 80 | Time taken: 9.721 seconds, Fetched: 1 row(s) 81 | 82 | 83 | Important thing to keep in mind that the env must be set on the nodes which are edge nodes, i.e hive client nodes. 84 | Hive server and metaserver etc will be as talked previously. 85 | -------------------------------------------------------------------------------- /HBase/untitled.txt: -------------------------------------------------------------------------------- 1 | parted /dev/sdb --script -- mklabel msdos 2 | parted /dev/sdb --script -- mkpart primary 0 -1 3 | mkfs.ext3 /dev/sdb1 4 | mkdir -p /space/disk1 5 | mount /dev/sdb1 /space/disk1 6 | 7 | useradd hadoop; echo hadoop | passwd --stdin hadoop 8 | chown -R hadoop:hadoop /space 9 | 10 | 11 | yum install jdk -y -------------------------------------------------------------------------------- /Hive_performance: -------------------------------------------------------------------------------- 1 | -XX:-UseGCOverheadLimit 2 | 3 | SET mapred.child.java.opts="-server1g -XX:+UseConcMarkSweepGC -XX:-UseGCOverheadLimit"; 4 | 5 | To enable the optimization 6 | 7 | set hive.auto.convert.join = true 8 | set hive.optimize.skewjoin = true 9 | 10 | 11 | When you are working with a large number of small files, Hive uses CombineHiveInputFormat by default. 12 | In terms of MapReduce, it ultimately translates to using CombineFileInputFormat that creates virtual splits over multiple files, 13 | grouped by common node, rack when possible. The size of the combined split is determined by 14 | 15 | mapred.max.split.size 16 | or 17 | mapreduce.input.fileinputformat.split.maxsize ( in yarn/MR2); 18 | 19 | So if you want to have less splits(less mapper) you need to set this parameter higher. 
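
For example (illustrative value only), to produce fewer, larger splits with the
default CombineHiveInputFormat:

set mapreduce.input.fileinputformat.split.maxsize=256000000;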
20 |
21 | http://stackoverflow.com/questions/17852838/what-is-the-default-size-that-each-hadoop-mapper-will-read
22 |
23 | http://www.ericlin.me/how-to-control-the-number-of-mappers-required-for-a-hive-query
24 |
25 |
26 | set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
27 | set mapred.map.tasks = 20;
28 |
29 | Controlling split size:
30 |
31 | set mapreduce.input.fileinputformat.split.minsize=100000000;
32 | reference: https://hadoopjournal.wordpress.com/2015/06/13/set-mappers-in-pig-hive-and-mapreduce/
33 |
-------------------------------------------------------------------------------- /Jars/azure.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Jars/azure.tar.gz
-------------------------------------------------------------------------------- /Jars/hadoop-azure-3.1.1.3.1.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Jars/hadoop-azure-3.1.1.3.1.0.0-SNAPSHOT.jar
-------------------------------------------------------------------------------- /Jars/jce_policy-8.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Jars/jce_policy-8.zip
-------------------------------------------------------------------------------- /Kafka/commands: --------------------------------------------------------------------------------
1 | # Make sure to set up the Kafka environment variables, e.g.:
2 |
3 | export KAFKA_HOME=/home/hadoop/kafka
4 | PATH=$KAFKA_HOME/bin:$PATH
5 |
6 | Commands:
7 |
8 | kafka-server-start.sh kafka/config/server.properties
9 |
10 | Run as daemon:
11 |
12 | kafka-server-start.sh -daemon kafka/config/server.properties
13 |
14 | [hadoop@gw1 ~]$ jps
15 | 4581 Kafka
16 |
17 | # Stop:
18 |
19 | kafka-server-stop.sh
20 |
21 | # Useful commands:
22 |
23 | # Create Topic
24 | kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test
25 |
26 | # List topics
27 | kafka-topics.sh --list --zookeeper localhost:2181
28 | kafka-topics.sh --list --zookeeper n1.dilithium.com:2181
29 |
30 | echo "Hello, Kafka" | kafka-console-producer.sh --broker-list :9092 --topic MyTopic > /dev/null
31 | kafka-console-consumer.sh --zookeeper <> --topic MyTopic --from-beginning
32 |
33 | Examples:
34 |
35 | $ kafka-topics.sh --create --zookeeper n1.dilithium.com:2181 --replication-factor 1 --partitions 1 --topic test
36 | Created topic "test".
37 | $ kafka-topics.sh --list --zookeeper n1.dilithium.com:2181 38 | test 39 | 40 | echo "Hello, Kafka" | kafka-console-producer.sh --broker-list gw1.dilithium.com:9092 --topic test > /dev/null 41 | kafka-console-consumer.sh --bootstrap-server gw1.dilithium.com:9092 --topic test --from-beginning 42 | 43 | $ kafka-log-dirs.sh --describe --bootstrap-server gw1.dilithium.com:9092 44 | Querying brokers for log directories information 45 | Received log directory information from brokers 0 46 | {"version":1,"brokers":[{"broker":0,"logDirs":[{"logDir":"/data/kafka","error":null,"partitions":[]}]}]} 47 | 48 | 49 | Benchmarks 50 | 51 | These are just on my test lab(1 core VM, 2 GB RAM) 52 | 53 | $ kafka-producer-perf-test.sh --topic bench --num-records 1000000 --throughput 150000 --record-size 100 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=67108864 compression.type=none batch.size=8196 54 | 71189 records sent, 13945.0 records/sec (1.33 MB/sec), 1090.8 ms avg latency, 1612.0 max latency. 55 | 170124 records sent, 34018.0 records/sec (3.24 MB/sec), 2294.4 ms avg latency, 3198.0 max latency. 56 | 186553 records sent, 36882.8 records/sec (3.52 MB/sec), 4227.2 ms avg latency, 5537.0 max latency. 57 | 239463 records sent, 47892.6 records/sec (4.57 MB/sec), 7076.9 ms avg latency, 7590.0 max latency. 58 | 1000000 records sent, 39799.410969 records/sec (3.80 MB/sec), 5569.86 ms avg latency, 8151.00 ms max latency, 6986 ms 50th, 8012 ms 95th, 8107 ms 99th, 8143 ms 99.9th 59 | 60 | $ kafka-consumer-perf-test.sh --topic gsd --broker-list gw1.dilithium.com:9092 --messages 1000000 -threads 1 --num-fetch-threads 1 --print-metrics 61 | 62 | $ kafka-consumer-perf-test.sh --topic gsd --broker-list gw1.dilithium.com:9092 --messages 1000000 -threads 1 --num-fetch-threads 1 63 | start.time, end.time, data.consumed.in.MB, MB.sec, data.consumed.in.nMsg, nMsg.sec, rebalance.time.ms, fetch.time.ms, fetch.MB.sec, fetch.nMsg.sec 64 | 2019-02-11 19:23:15:397, 2019-02-11 19:23:20:199, 95.3787, 19.8623, 1000118, 208271.1370, 51, 4751, 20.0755, 210506.8407 65 | 66 | $ kafka-consumer-perf-test.sh --topic gsd --broker-list gw1.dilithium.com:9092 --messages 1000000 -threads 1 --num-fetch-threads 2 67 | start.time, end.time, data.consumed.in.MB, MB.sec, data.consumed.in.nMsg, nMsg.sec, rebalance.time.ms, fetch.time.ms, fetch.MB.sec, fetch.nMsg.sec 68 | 2019-02-11 19:23:49:632, 2019-02-11 19:23:54:701, 95.3787, 18.8161, 1000118, 197300.8483, 135, 4934, 19.3309, 202699.2298 69 | . 
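While benchmarking, it also helps to confirm the topic layout and any active consumer groups first. A couple of commands for that (hostnames assume the same lab setup as above):

kafka-topics.sh --describe --zookeeper n1.dilithium.com:2181 --topic gsd
kafka-consumer-groups.sh --bootstrap-server gw1.dilithium.com:9092 --list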
70 | 71 | Benchmark with various throughtput, message size: 72 | 73 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 150000 --record-size 100 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 74 | 75 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 150000 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 76 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 15 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 77 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 1500 -record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 78 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 150 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 79 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 150000 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 80 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 15000000 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 81 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 15000000 --record-size 1000 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 82 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput -1 --record-size 1000 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 83 | -------------------------------------------------------------------------------- /Kafka/kafka-env.sh: -------------------------------------------------------------------------------- 1 | # Create this file as it is not part of the distro 2 | 3 | #!/bin/bash 4 | 5 | # Set KAFKA specific environment variables here. 6 | 7 | # The java implementation to use. 
8 | export JAVA_HOME=/usr/java/default 9 | export PATH=$PATH:$JAVA_HOME/bin 10 | #export PID_DIR={{kafka_pid_dir}} 11 | #export LOG_DIR={{kafka_log_dir}} 12 | #export JMX_PORT=9093 13 | 14 | export KAFKA_HEAP_OPTS="-Xmx1g -Xms1g" 15 | export KAFKA_JVM_PERFORMANCE_OPTS="-XX:MetaspaceSize=96m -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 -XX:G1HeapRegionSize=16M -XX:MinMetaspaceFreeRatio=50 -XX:MaxMetaspaceFreeRatio=80" 16 | -------------------------------------------------------------------------------- /Kafka/kafka_ganglia2.txt: -------------------------------------------------------------------------------- 1 | { 2 | "servers" : [ { 3 | "port" : "9999", <--- Defined Kafka JMX Port 4 | "host" : "192.168.1.18", <--- Kafka Server 5 | "queries" : [ { 6 | "outputWriters" : [ { 7 | "@class" : 8 | "com.googlecode.jmxtrans.model.output.KeyOutWriter", 9 | "settings" : { 10 | "outputFile" : "/tmp/bufferPool_direct_stats.txt", 11 | "v31" : false 12 | } 13 | } ], 14 | "obj" : "java.nio:type=BufferPool,name=direct", 15 | "resultAlias": "bufferPool.direct", 16 | "attr" : [ "Count", "MemoryUsed", "Name", "ObjectName", "TotalCapacity" ] 17 | }, { 18 | "outputWriters" : [ { 19 | "@class" : 20 | "com.googlecode.jmxtrans.model.output.KeyOutWriter", 21 | "settings" : { 22 | "outputFile" : "/tmp/bufferPool_mapped_stats.txt", 23 | "v31" : false 24 | } 25 | } ], 26 | "obj" : "java.nio:type=BufferPool,name=mapped", 27 | "resultAlias": "bufferPool.mapped", 28 | "attr" : [ "Count", "MemoryUsed", "Name", "ObjectName", "TotalCapacity" ] 29 | }, { 30 | "outputWriters" : [ { 31 | "@class" : 32 | "com.googlecode.jmxtrans.model.output.KeyOutWriter", 33 | "settings" : { 34 | "outputFile" : "/tmp/kafka_log4j_stats.txt", 35 | "v31" : false 36 | } 37 | } ], 38 | "obj" : "kafka:type=kafka.Log4jController", 39 | "resultAlias": "kafka.log4jController", 40 | "attr" : [ "Loggers" ] 41 | }, { 42 | "outputWriters" : [ { 43 | "@class" : 44 | "com.googlecode.jmxtrans.model.output.KeyOutWriter", 45 | "settings" : { 46 | "outputFile" : "/tmp/kafka_socketServer_stats.txt", 47 | "v31" : false 48 | } 49 | } ], 50 | "obj" : "kafka:type=kafka.SocketServerStats", 51 | "resultAlias": "kafka.socketServerStats", 52 | "attr" : [ "AvgFetchRequestMs", "AvgProduceRequestMs", "BytesReadPerSecond", "BytesWrittenPerSecond", "FetchRequestsPerSecond", "MaxFetchRequestMs", "MaxProduceRequestMs" , "NumFetchRequests" , "NumProduceRequests" , "ProduceRequestsPerSecond", "TotalBytesRead", "TotalBytesWritten", "TotalFetchRequestMs", "TotalProduceRequestMs" ] 53 | } ], 54 | "numQueryThreads" : 2 55 | } ] 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /Kafka/kakfa_rsyslog.txt: -------------------------------------------------------------------------------- 1 | rsyslog (base, includes imfile) 2 | rsyslog-kafka 3 | 4 | /etc/rsyslog.conf 5 | ----------------- 6 | 7 | 8 | $WorkDirectory /var/lib/rsyslog # where to place spool files 9 | 10 | $MainMsgQueueType LinkedList 11 | $MainMsgQueueFileName mainmsgq 12 | $MainMsgQueueSaveOnShutdown on 13 | $MainMsgQueueSize 15000 14 | $MainMsgQueueHighWatermark 10000 15 | $MainMsgQueueLowWatermark 1000 16 | $MainMsgQueueMaxDiskSpace 53687091 # 512KB, most containers have 17 | 18 | 19 | /etc/rsyslog.d/kafka.conf 20 | ------------------------- 21 | 22 | module(load="omkafka") # provides omkafka 23 | # Use rainerscript, as below#$ActionQueueSize 1500000 24 | #$ActionQueueType LinkedList 25 | #$ActionQueueFileName omkafkaq 26 | 
#$ActionResumeRetryCount -1 27 | #$ActionQueueSaveOnShutdown on 28 | #$ActionQueueHighWatermark 1000000 29 | #$ActionQueueLowWatermark 100000 30 | #$ActionQueueMaxDiskSpace 536870912 # 512MB, most containers have 31 | #$ActionQueueMaxDiskSpace 536870912 # 512MB, most containers have <8GB of space 32 | #$MainMsgQueueDiscardMark 400000 # Low < Discard < High < DiskSpace 33 | #$MainMsgQueueDiscardSeverity 4 # Discard anything lower than warning 34 | 35 | *.* action(type="omkafka" topic="rsyslog-prod" 36 | broker="kafka1.example.com,kafka2.example.com,kafka3.example.com" 37 | queue.filename="omkafkaq" queue.spoolDirectory="/var/lib/rsyslog" 38 | queue.size="300000" queue.maxdiskspace="536870912" 39 | queue.lowwatermark="20000" queue.highwatermark="200000" 40 | queue.discardmark="250000" queue.type="LinkedList" 41 | queue.discardseverity="4" 42 | queue.saveonshutdown="on" queue.dequeuebatchsize="4" 43 | partitions.auto="on" errorFile="/var/log/rsyslog.err" 44 | confParam=[ "compression.codec=snappy", 45 | "socket.timeout.ms=1000", 46 | "socket.keepalive.enable=true"] 47 | ) 48 | -------------------------------------------------------------------------------- /Kafka/server.properties: -------------------------------------------------------------------------------- 1 | # Only need to change the below for each broker. This is a very basic kafka config 2 | 3 | # The id of the broker. This must be set to a unique integer for each broker. 4 | broker.id=0 5 | 6 | # root directory for all kafka znodes. 7 | zookeeper.connect=n1.dilithium.com:2181,n2.dilithium.com:2181,sn.dilithium.com:2181 8 | -------------------------------------------------------------------------------- /Notes/Benchmarking.txt: -------------------------------------------------------------------------------- 1 | Test Hadoop 2 | ============ 3 | 4 | hadoop jar hadoop/hadoop-test-0.20.205.0.jar TestDFSIO -write -nrFiles 10 -fileSize 1000 5 | hadoop jar hadoop/hadoop-test-0.20.205.0.jar TestDFSIO -read -nrFiles 10 -fileSize 1000 6 | 7 | hadoop jar hadoop/hadoop-test-0.20.205.0.jar TestDFSIO -clean 8 | 9 | 10 | Generate Tera Data 11 | ================== 12 | 13 | hadoop jar hadoop/hadoop-examples-0.20.205.0.jar teragen 1000 /user/hduser/terasort-input 14 | 15 | hadoop jar hadoop/hadoop-examples-0.20.205.0.jar terasort /user/hduser/terasort-input /user/hduser/terasort-output 16 | 17 | hadoop job -history all /user/hduser/terasort-input 18 | 19 | 20 | 21 | hadoop jar hadoop/hadoop-examples-0.20.205.0.jar nnbench -operation create_write \ 22 | -maps 12 -reduces 6 -blockSize 1 -bytesToWrite 0 -numberOfFiles 1000 \ 23 | -replicationFactorPerFile 3 -readFileAfterOpen true \ 24 | -baseDir /benchmarks/NNBench-`hostname -s` 25 | 26 | 27 | hadoop jar hadoop/hadoop-examples-0.20.205.0.jar mrbench -numRuns 50 -------------------------------------------------------------------------------- /Notes/Hadoop_lab.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Notes/Hadoop_lab.doc -------------------------------------------------------------------------------- /Notes/Hadoop_upgrade.txt: -------------------------------------------------------------------------------- 1 | 2 | Hadoop Upgrade 3 | =============== 4 | 5 | 1. hadoop dfsadmin -upgradeProgress status 6 | 7 | 2. Stop all client applications running on the MapReduce cluster. 8 | 9 | 3. 
Perform a filesystem check 10 | hadoop fsck / -files -blocks -locations > dfs-v-old-fsck-1.log 11 | 12 | 4. Save a complete listing of the HDFS namespace to a local file 13 | hadoop dfs -lsr / > dfs-v-old-lsr-1.log 14 | 15 | 5. Create a list of DataNodes participating in the cluster: 16 | hadoop dfsadmin -report > dfs-v-old-report-1.log 17 | 18 | 6. Optionally backup HDFS data 19 | 20 | 7. Upgrade process: 21 | Point to the new directory, update environment variables. 22 | 23 | 8. hadoop-daemon.sh start namenode -upgrade 24 | 25 | 9. hadoop dfsadmin -upgradeProgress status 26 | 27 | 10. Now start the datanode, after pointing to the new hadoop directory 28 | 29 | 11. hadoop dfsadmin -safemode get 30 | 31 | 12. hadoop dfsadmin -finalizeUpgrade 32 | 33 | 34 | -------------------------------------------------------------------------------- /Notes/Performance.txt: -------------------------------------------------------------------------------- 1 | CPU-related parameters:mapred.tasktracker.map and reduce.tasks.maximum 2 | Decide the maximum number of map/reduce tasks that will be run simultaneously by a task tracker. These two parameters are the most relative ones to CPU utilization. The default value of both parameters is 2. Properly increasing their values according to your cluster condition increases the CPU utilization and therefore improves the performance. For example, assume each node of the cluster has 4 CPUs supporting simultaneous multi-threading, and each CPU has 2 cores; then the total number of daemons should be no more than 4x2x2=16. Considering DN and TT would take 2 slots, there are at most 14 slots for map/reduce tasks, so the best value is 7 for both parameters. 3 | 4 | Set this parameter in mapred-site.xml. 5 | 6 | Memory-related parameter:mapred.child.java.opts 7 | This is the main parameter for JVM tuning. The default value is -Xmx200m, which gives each child task thread 200 MB of memory at most. You can increase this value if the job is large, but should make sure it won't cause swap, which significantly reduces performance. 8 | 9 | Let's examine how this parameter can affect the total memory usage. Assume the maximum number of map/reduce tasks is set to 7, and mapred.child.java.opts is left to the default value. Then memory cost of running tasks will be 2x7x200 MB =2800 MB. If each worker node has both DN and TT daemons, and each daemon costs 1 GB memory by default, the total memory allocated would be around 4.8 GB. 10 | 11 | Set this parameter in mapred-site.xml. 12 | 13 | Disk I/O-related parameters:mapred.compress.map.output, mapred.output.compress, and mapred.map.output.compression.codec 14 | These are parameters that control whether to compress the output, in which mapred.compress.map.output is for map output compression, mapred.output.compress is for job output compression, and mapred.map.output.compression.codec is for compression code. All of these options are turned off by default. 15 | 16 | Turning on output compression can speed up disk (local/Hadoop Distributed File System (HDFS)) writes and reduce total time of data transfer (in both shuffle and HDFS writing phase), while on the other hand cost additional overhead during the compression/decompression process. 17 | 18 | According to personal experience, turning on compression is not effective for sequence filing with random keys/values. One suggestion is to turn on compression only when the data you're dealing with is large and organized (especially natural language data). 
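As an illustration, enabling map output compression with the property names discussed above might look like this (Snappy is just one possible codec and assumes the native library is installed):

<property>
  <name>mapred.compress.map.output</name>
  <value>true</value>
</property>
<property>
  <name>mapred.map.output.compression.codec</name>
  <value>org.apache.hadoop.io.compress.SnappyCodec</value>
</property>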
19 | 20 | Set these parameters in mapred-site.xml. 21 | 22 | io.sort.mb parameter: 23 | This parameter sets the buffer size for map-side sorting, in units of MB, 100 by default. The greater the value, the fewer spills to the disk, thus reducing I/O times on the map side. Notice that increasing this value increases memory required by each map task. 24 | 25 | According to experience, when the map output is large, and the map-side I/O is frequent, you should try increasing this value. 26 | 27 | Set this parameter in mapred-site.xml. 28 | 29 | io.sort.factor parameter 30 | This parameter sets the number of input streams (files) to be merged at once in both map and reduce tasks. The greater this value, the fewer spills to the disk, thus reducing I/O times on both the map and reduce sides. Notice that increasing this value might cost more garbage collection activities if memory allocated for each task is not large enough. 31 | 32 | According to experience, when there is a large number of spills to the disk, and I/O times of the sort and shuffle phase is high, you should try increasing this value. 33 | 34 | Set this parameter in mapred-site.xml. 35 | 36 | mapred.job.reduce.input.buffer.percent parameter 37 | This parameter sets the percentage of memory (relative to the maximum heap size) to retain map outputs during the reduce phase. When the shuffle is concluded, any remaining map outputs in memory must consume less than this threshold before the reduce phase can begin, 0 by default. The greater this value is, the less merge on the disk, thus reducing I/O times on the local disk during the reduce phase. Notice that increasing this value might cost more garbage collection activities if memory allocated for each task is not large enough. 38 | 39 | According to experience, when map output is large, and local disk I/O is frequent during the reduce through sort phases, you should try increasing this value. -------------------------------------------------------------------------------- /Notes/backup.txt: -------------------------------------------------------------------------------- 1 | Hadoop Backup and Recovery 2 | ========================== 3 | 4 | 5 | 6 | dfs.secondary.http.address 7 | 192.168.1.68:50090 8 | 9 | 10 | 11 | 1. Secondary namenode checkpointing 12 | 13 | If you want to explicitly specify the file to be used by the namenode 14 | 15 | hadoop-daemons.sh --hosts masters start secondarynamenode 16 | 17 | hdfs secondarynamenode -checkpoint force 18 | 19 | 2. hadoop namenode -importCheckpoint 20 | 21 | 22 | fs.checkpoint.dir 23 | /data/new 24 | 25 | 26 | 3. Save NameSpace 27 | 28 | hadoop dfsadmin -safemode enter 29 | 30 | hadoop dfsadmin -saveNamespace 31 | 32 | Remember it updates under the Namespace directory. 33 | 34 | 4. Metadata Save 35 | 36 | hdfs dfsadmin -metasave filename.txt 37 | 38 | 5. Can do a detailed view of the namespace (above 0.21) 39 | 40 | hdfs oiv -i /data/namenode/current/fsimage -o fsimage.txt 41 | 42 | 43 | 44 | dfs.secondary.http.address 45 | 192.168.1.68:50090 46 | -------------------------------------------------------------------------------- /Notes/cassandra2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Notes/cassandra2.pdf -------------------------------------------------------------------------------- /Notes/class3_questions: -------------------------------------------------------------------------------- 1 | 1. 
when name node started what will be "keyword" to identify from log that at what timestamp namenode started and which log should we see? 2 | 2. Does Secondary Name node starts automatically? 3 | 3. "copyFromLocal" is locally load the data from the same node.. if yes, than if we want to load data from NAS or SAN or any client is there any specific command .. 4 | 4. Is there any command we can use to override if same file is present in HDFS already? or we always have to remove existing first? 5 | 5. If we change path hdfs then we need to reformat? 6 | 6. how different is hadoop fsck from linux fsck 7 | 7. Should we run "hadoop dfsadmin" & "hadoop fsck" (Admin) commands only from name node and "hadoop fs" files related command from any of the name or datanode in cluster? 8 | 8. how to start data node on selected machines instead of all as this earlier command is doing? 9 | 9. what is .meta and .curr files are created and what is difference. what if .meta file is deleted will data also be lost 10 | 10. in what cicumstances it will choose another node in case dn1 near to full capacity 11 | 11. only shows 1 live node, although running it several time, it's changing between the live datanodes, but only shows 1 live at a time. Is it normal? What could be the problem 12 | 12. is it possible to force the data to go into a particular data node? 13 | 13. so at some edits will too huge , does it rotate also? 14 | 14. can we have both NN and SNN on same node? or is it best practice to separate them 15 | 15. how we know which rack machine belongs to? 16 | 16. editing nodes in include and exclude files does not require a reboot? 17 | 17. In what kind of cases we might need to exclude a particular data node? Why would we create a DN and exclude it from the cluster? 18 | 18. Let say I have servers from 2 datacenters one in new ATLANTA and one in NEW YORK. Let say ATLANTA datacenter is down because of FLOOD. How can we recover our cluster from such disaster recovery. In that case can we configure our data file to consider servers of ATLANTA datacenter as ONE RACK and NEWYORK servers as another rack? 19 | 20 | -------------------------------------------------------------------------------- /Notes/class4_questions: -------------------------------------------------------------------------------- 1 | https://www.packtpub.com/books/content/sizing-and-configuring-your-hadoop-cluster 2 | why do we need replication of replication? why do we need to have replication of data on the same server again 3 | what was that sdb1 & sdc1, is that new partition for dn1 & dn2 4 | is this similar to RAID 1(mirroring)? 5 | do we need to setup this only if we do not have raid 1 already? 6 | Isnt its a good practice to create these directories on NFS server? other than local? 7 | after chaning any configuration setting , do we need to run --format everytime to reflect the change? 8 | will thses two disks need to be kept in different racks to avaoid data loss 9 | If the disk io speed is different significantly between the local and NFS one, will this cause adverse effect for namenode? 10 | it will be overhead because instead of rsyncing 1 directory it has to copy 2 dirs to DR right? 11 | so which meta data name node is refering to if both mount points are up? 12 | what is the read policy? will it be only from the first disk specified? 13 | does two disk drives follow propotional fill algoritm? 14 | So in that case we dont have control how end clients are sending jobs? 
they can send any size 15 | So when name node is down and we have to make secondary as primary. I am confused why we need to change the hostname of secondary to primary? Instead of that if we would have configured namenode location by GSLB instead of direct hostname. Then we don’t need to change hostname 16 | you are setting Quota from command line. So it will be flushed once we restart the namenode? 17 | -------------------------------------------------------------------------------- /Notes/cloudera.txt: -------------------------------------------------------------------------------- 1 | Cloudera Manager 2 | ================ 3 | 4 | ./cloudera-manager-installer.bin --skip_repo_package=1 5 | 6 | 7 | 8 | 1. Cloudera Manager - GUI 9 | 2. Cloudera Packages CDH4 - Hadoop packages -------------------------------------------------------------------------------- /Notes/disk_partition: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | hdd="/dev/sdc /dev/sdd /dev/sde /dev/sdf" 3 | 4 | count=0 5 | 6 | for disk in $hdd; do 7 | #echo -e "n\np\n\n\n\nw\n" | fdisk $disk; 8 | 9 | fs="${disk}1" 10 | #mkfs.xfs $fs; 11 | 12 | twoDigitCount=$(printf "%02d" $count) 13 | mount="/data/$twoDigitCount" 14 | mkdir -p $mount; 15 | 16 | mount $fs $mount; 17 | 18 | count=$((count+1)) 19 | done 20 | -------------------------------------------------------------------------------- /Notes/hadoop_ports.txt: -------------------------------------------------------------------------------- 1 | Hadoop nodes communication ports 2 | 3 | No. name protocol port # configuration file parameter name description 4 | 1 ssh tcp *:22 /etc/ssh/sshd_config Port ssh server port for ssh communication 5 | 2 HDFS default port tcp localhost:9000 core-site.xml fs.default.name HDFS port for clients. 6 | 3 secondary name node administration tcp 0.0.0.0:50090 hdfs-site.xml dfs.secondary.http.address The secondary namenode http server address and port. If the port is 0 then the server will start on a free port. 7 | 4 data node communication tcp 0.0.0.0:50010 hdfs-site.xml dfs.datanode.address 8 | 5 data node administration tcp 0.0.0.0:50075 hdfs-site.xml dfs.datanode.http.address 9 | 6 data node IPC communication tcp 0.0.0.0:50020 hdfs-site.xml dfs.datanode.ipc.address 10 | 7 name node administration tcp 0.0.0.0:50070 hdfs-site.xml dfs.http.address 11 | 8 data node administration tcp 0.0.0.0:50475 hdfs-site.xml dfs.datanode.https.address 12 | 9 name node administration tcp 0.0.0.0:50470 hdfs-site.xml dfs.https.address 13 | 10 MapReduce job tracker tcp 0.0.0.0:9001 mapred-site.xml mapred.job.tracker The port of Job Tracker accepting for job request. 
14 | 11 job tracker administration tcp 0.0.0.0:50030 mapred-site.xml mapred.job.tracker.http.address 15 | 12 task tracker administration tcp 0.0.0.0:50060 mapred-site.xml mapred.task.tracker.http.address -------------------------------------------------------------------------------- /Notes/hadoop_ports_firewall.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Notes/hadoop_ports_firewall.xls -------------------------------------------------------------------------------- /Notes/installation.txt: -------------------------------------------------------------------------------- 1 | core-site.xml 2 | 3 | 4 | fs.default.name 5 | hdfs://nn1.cluster1.com:9000 6 | 7 | 8 | hdfs-site.xml 9 | 10 | 11 | dfs.name.dir 12 | /data/namenode 13 | true 14 | 15 | 16 | 17 | dfs.data.dir 18 | /space/disk1,/space/disk2 19 | true 20 | 21 | 22 | 23 | dfs.replication 24 | 1 25 | 26 | 27 | 28 | dfs.block.size 29 | 67108864 30 | 31 | 32 | 33 | dfs.hosts.exclude 34 | /home/hadoop/excludes 35 | true 36 | 37 | 38 | 39 | dfs.hosts 40 | /home/hadoop/include 41 | true 42 | 43 | 44 | mapred-site 45 | 46 | 47 | 48 | mapred.job.tracker 49 | jt.cluster1.com:9001 50 | 51 | 52 | 53 | 54 | export JAVA_HOME=/usr/java/jdk1.7.0_25/ 55 | export HADOOP_HOME=/home/hadoop/hadoop 56 | export HADOOP_PID_DIR=/home/hadoop/pids 57 | export HADOOP_HEAPSIZE=500 58 | 59 | export HADOOP_LOG_DIR=${HADOOP_HOME}/logs 60 | export HADOOP_HOME_WARN_SUPPRESS="TRUE" 61 | 62 | 63 | 64 | export JAVA_HOME=/usr/java/jdk1.7.0_25/ 65 | 66 | PATH=$JAVA_HOME/bin:$PATH:$HOME/bin 67 | PATH=$PATH:/home/hadoop/hadoop/bin 68 | 69 | export PATH 70 | 71 | ================ 72 | 73 | heartbeat.recheck.interval 74 | 15 75 | Determines datanode heartbeat interval in seconds 76 | 77 | 78 | If above doesn't work - try the following (seems to be version-dependent): 79 | 80 | 81 | dfs.heartbeat.recheck.interval 82 | 15 83 | Determines datanode heartbeat interval in seconds. 
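Note: these are the Hadoop 1.x property names; on Hadoop 2.x and later they still work but are deprecated. The usual equivalents are:

fs.default.name  -> fs.defaultFS
dfs.name.dir     -> dfs.namenode.name.dir
dfs.data.dir     -> dfs.datanode.data.dir
dfs.http.address -> dfs.namenode.http-address

mapred.job.tracker has no direct equivalent under YARN; the ResourceManager addresses in yarn-site.xml take its place.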
84 | 85 | -------------------------------------------------------------------------------- /Notes/pig.txt: -------------------------------------------------------------------------------- 1 | export PIG_HOME=/home/hadoop/pig/ 2 | 3 | 4 | A = load 'passwd' using PigStorage(':'); 5 | B = foreach A generate $0 as id; 6 | store B into 'id.out'; 7 | 8 | 9 | 10 | pig -x local id.pig 11 | 12 | pig -x mapreduce id.pig 13 | 14 | 15 | passwd = LOAD '/etc/passwd' USING PigStorage(':') AS (user:chararray, \ 16 | passwd:chararray, uid:int, gid:int, userinfo:chararray, home:chararray, \ 17 | shell:chararray); 18 | grunt> DUMP passwd; 19 | 20 | 21 | grunt> counts = FOREACH grp_shell GENERATE group, COUNT(passwd); 22 | grunt> DUMP counts; 23 | 24 | 25 | 26 | A = load 'test'; 27 | 28 | B = foreach A generate flatten(TOKENIZE((chararray)$0)) as word; 29 | 30 | C = group B by word; 31 | 32 | D = foreach C generate COUNT(B), group; 33 | 34 | store D into 'wordcount'; 35 | 36 | ================== 37 | 38 | A = load 'http_access_2011-07-07.log' using PigStorage('-') as (f0,f1,f2,f3,f4); 39 | B = foreach A generate f0; 40 | C = distinct B; 41 | dump C; 42 | 43 | A = load 'http_access_2011-07-07.log' using PigStorage('"') as (f0,f1,f2,f3,f4,f5); 44 | B = foreach A generate f5; 45 | C = distinct B; 46 | dump C; 47 | 48 | A = load 'http_access_2011-07-07.log' using PigStorage('"') as (f0,f1,f2,f3,f4); 49 | B = foreach A generate f1; 50 | C = distinct B; 51 | dump C; 52 | 53 | ============== 54 | yum install ant* 55 | 56 | For Hadoop-2.0 57 | 58 | ant clean jar-withouthadoop -Dhadoopversion=23 59 | 60 | or 61 | 62 | ant clean jar-all -Dhadoopversion=23 -------------------------------------------------------------------------------- /Notes/questions.txt: -------------------------------------------------------------------------------- 1 | 1) If we add new DataNodes to the cluster will HDFS move the blocks to the newly added nodes in order to balance disk space utilization between the nodes? 2 | 3 | a) yes, it will automatically do balancing 4 | b) no, we have to manually to re-balancing (correct) 5 | 6 | 2) The name-node will stay in safe mode till all under-replicated files are fully replicated? 7 | 8 | a)TRUE b) FALSE (correct) 9 | 10 | 3) How do I set up a hadoop data node to use multiple volumes? 11 | 12 | a) We cannot do that b) We can use comma seperated fields (correct) c) This can only be done with SAN storage 13 | 14 | 4) Can a Hadoop client renames a file or a directory containing a file while another client is still writing into it? 15 | 16 | a) yes, it can (correct) b) No, hadoop does locking 17 | 18 | 5) Will the command bin/hadoop dfs -ls /projects/* list all the files under /projects ? 19 | 20 | a) yes (correct, but better to safeguard it with single quotes) b) no 21 | 22 | 6) Can we have multiple files in HDFS use different block sizes? 23 | 24 | a) yes (correct) b) no 25 | 26 | 7) How do you gracefully stop a running job? 27 | 28 | a) hadoop job -kill jobid(correct) b) kill the task tracker c) it can not be done 29 | 30 | 8) What is the best java version to use for Hadoop? 31 | 32 | a) It does not matter b) Must be greater then java2.6 c) greater then 1.6 (correct) 33 | 34 | 9) What is the command for adding the hosts newly added to the mapred.include file? 35 | 36 | a) hadoop dfsadmin -refreshNodes b) hadoop rmadmin -refreshNodes (correct) 37 | 38 | 10) What will happen, if we set the number of reducers to 0 ? 
39 | 40 | a) job will fail b) the map-tasks r written directly to the disk (correct) 41 | 42 | 11) How many maximum JVM run on the slave node? 43 | 44 | a) only one as there is only one tasktracker b) 2 one each for tasktracker, datanode c) It depends upon task instances (correct) 45 | 46 | 12) Where is the intermidiate mapper output stored? 47 | 48 | a) It is stored in tmp folder on hdfs b) It is stored on local filesystem(correct) c) It is only in Memory 49 | 50 | 13) When does mappers run ? 51 | 52 | a) They start immediately when job is submitted b) They start only after the mapper finish (correct) 53 | 54 | 55 | 14) What action occurs automatically on a cluster when a DataNode is marked as dead? 56 | 57 | A. The NameNode forces re-replication of all the blocks which were stored on the dead DataNode. 58 | B. The next time a client submits job that requires blocks from the dead DataNode, the JobTracker receives no heart beats from the DataNode. The JobTracker tells the NameNode that the DataNode is dead, which triggers block re-replication on the cluster. 59 | C. The replication factor of the files which had blocks stored on the dead DataNode is temporarily reduced, until the dead DataNode is recovered and returned to the cluster. 60 | D. The NameNode informs the client which write the blocks that are no longer available; the client then re-writes the blocks to a different DataNode. 61 | 62 | 15) QUESTION: 5 63 | Which three distcp features can you utilize on a Hadoop cluster? 64 | A. Use distcp to copy files only between two clusters or more. You cannot use distcp to copy data between directories inside the same cluster. 65 | B. Use distcp to copy HBase table files. 66 | C. Use distcp to copy physical blocks from the source to the target destination in your cluster. 67 | D. Use distcp to copy data between directories inside the same cluster. E. Use distcp to run an internal MapReduce job to copy files. 68 | Answer: B, D, E 69 | 70 | 16) What is the recommended disk configuration for slave nodes in your Hadoop cluster with 6 x 2 TB hard drives? 71 | A. RAID 10 B. JBOD 72 | C. RAID 5 D. RAID 1+0 73 | Answer: B 74 | 75 | 17) Your Hadoop cluster has 25 nodes with a total of 100 TB (4 TB per node) of raw disk space allocated HDFS storage. Assuming Hadoop's default configuration, how much data will you be able to store? 76 | A. Approximately 100TB B. Approximately 25TB C. Approximately 10TB D. Approximately 33 TB 77 | Answer: D 78 | 79 | 18) The most important consideration for slave nodes in a Hadoop cluster running production jobs that require short turnaround times is: 80 | A. The ratio between the amount of memory and the number of disk drives. 81 | B. The ratio between the amount of memory and the total storage capacity. 82 | C. The ratio between the number of processor cores and the amount of memory. D. The ratio between the number of processor cores and total storage capacity. E. The ratio between the number of processor cores and number of disk drives. 83 | Answer: D 84 | 85 | 19) Your existing Hadoop cluster has 30 slave nodes, each of which has 4 x 2T hard drives. You plan to add another 10 nodes. How much disk space can your new nodes contain? 86 | A. The new nodes must all contain 8TB of disk space, but it does not matter how the disks are configured 87 | B. The new nodes cannot contain more than 8TB of disk space 88 | C. The new nodes can contain any amount of disk space 89 | D. 
The new nodes must all contain 4 x 2TB hard drives Answer: C 90 | 91 | 20) On a cluster running MapReduce v1 (MRv1), a MapReduce job is given a directory of 10 plain text as its input directory. Each file is made up of 3 HDFS blocks. How many Mappers will run? 92 | A. We cannot say; the number of Mappers is determined by the developer B. 30 93 | C. 10 94 | D. 1 95 | Answer: B 96 | 97 | 21) Which scheduler would you deploy to ensure that your cluster allows short jobs to finish within a reasonable time without starving long-running jobs? 98 | A. FIFO Scheduler 99 | B. Fair Scheduler 100 | C. Capacity Scheduler 101 | D. Completely Fair Scheduler (CFS) 102 | Answer: B 103 | 104 | 22) You are planning a Hadoop duster, and you expect to be receiving just under 1TB of data per week which will be stored on the cluster, using Hadoop's default replication. You decide that your slave nodes will be configured with 4 x 1TB disks. Calculate how many slave nodes you need to deploy at a minimum to store one year's worth of data. 105 | A. 100 slave nodes B. 100 slave nodes C. 10 slave nodes D. 50 slave nodes 106 | Answer: D 107 | 108 | 23) On a cluster running MapReduce v1 (MRv1), a MapReduce job is given a directory of 10 plain text as its input directory. Each file is made up of 3 HDFS blocks. How many Mappers will run? 109 | A. We cannot say; the number of Mappers is determined by the developer B. 30 110 | C. 10 111 | D. 1 112 | Answer: A 113 | 114 | 24) For each job, the Hadoop framework generates task log files. Where are Hadoop's task log files stored? 115 | A. Cached on the local disk of the slave node running the task, then purged immediately upon task completion. 116 | B. Cached on the local disk of the slave node running the task, then copied into HDFS. 117 | C. In HDFS, in the directory of the user who generates the job. 118 | D. On the local disk of the slave node running the task. 119 | 120 | Answer: D 121 | 122 | 123 | -------------------------------------------------------------------------------- /Notes/quick-links: -------------------------------------------------------------------------------- 1 | AMS: https://cwiki.apache.org/confluence/display/AMBARI/Known+Issues 2 | -------------------------------------------------------------------------------- /Notes/quiz4.txt: -------------------------------------------------------------------------------- 1 | 1) How do you gracefully stop a running job? 2 | 3 | a) hadoop job -kill jobid(correct) b) kill the task tracker c) it can not be done 4 | 5 | 2) What will happen, if we set the number of reducers to 0 ? 6 | 7 | a) job will fail b) the map-tasks r written directly to the disk (correct) 8 | 9 | 3) Where is the intermidiate mapper output stored? 10 | 11 | a) It is stored in tmp folder on hdfs b) It is stored on local filesystem(correct) c) It is only in Memory 12 | 13 | 4) When does mappers run ? 14 | 15 | a) They start immediately when job is submitted b) They start only after the mapper finish (correct) 16 | 17 | 5) Which property set the max number of tasktrackers ? (B is correct) 18 | 19 | a) mapred.tasktracker.map.tasks b) mapred.tasktracker.map.tasks.maximum c) map.tasks.maximum 20 | -------------------------------------------------------------------------------- /Notes/quiz7.txt: -------------------------------------------------------------------------------- 1 | 1) What is HBase? 2 | 3 | a) Is an RDMS database b) Hbase is Column-Oriented c) Distributed database d) Both b and c 4 | 5 | 2) Why we use HBase ? 
6 | 7 | a) It is a DB on top of HDFS b) Hbase provide random read and write on large data set. c) HBase is same as MySql 8 | 9 | 3) What is the maximum size of string data type supported by Hive? 10 | 11 | a) 64MB b) It depends upon the HDFS block size c) 2GB (correct) 12 | 13 | 4) In Hadoop ‘Reading‘ is done in parallel and ‘Writing‘ is not in HDFS. 14 | 15 | a) TRUE (correct) 16 | b) FALSE 17 | 18 | 5) Multiple users can use same metastore in 'Embedded metastore Mode'. 19 | 20 | a) TRUE 21 | b) FALSE (Correct) 22 | 23 | 6) Hbase 'CopyTable' utlitiy can be used to: 24 | 25 | a) Copy a partial table b) Full table c) It is not a valid command d) a and b (correct) 26 | -------------------------------------------------------------------------------- /Notes/quota.txt: -------------------------------------------------------------------------------- 1 | Applying Quota 2 | -------------- 3 | 4 | hadoop dfsadmin -setSpaceQuota 1m 5 | 6 | 7 | dfsadmin -setQuota 8 | 9 | dfsadmin -clrQuota 10 | 11 | dfsadmin -setSpaceQuota 12 | 13 | dfsadmin -clrSpaceQuota 14 | 15 | 16 | Distcp 17 | ====== 18 | 19 | hadoop distcp hdfs://nn1:8020/foo/bar hdfs://nn2:8020/bar/foo 20 | 21 | hdfs://nn1:8020/foo/a hdfs://nn1:8020/foo/b 22 | 23 | hadoop distcp hdfs://nn1.cluster1.com:9000/jobtracker hdfs://nn1.cluster1.com:9000/newtracker 24 | 25 | 26 | Trash 27 | ======= 28 | 29 | 30 | fs.trash.interval 31 | 40 32 | 33 | 34 | SetRep 35 | ===== 36 | hadoop dfs -setrep -R -w 3 /chandra -------------------------------------------------------------------------------- /Notes/rack.txt: -------------------------------------------------------------------------------- 1 | while [ $# -gt 0 ] ; do 2 | nodeArg=$1 3 | exec< /home/hadoop/topology.data 4 | result="" 5 | while read line ; do 6 | ar=( $line ) 7 | if [ "${ar[0]}" = "$nodeArg" ] ; then 8 | result="${ar[1]}" 9 | fi 10 | done 11 | shift 12 | if [ -z "$result" ] ; then 13 | echo -n "/default" 14 | else 15 | echo -n "$result " 16 | fi 17 | done 18 | 19 | 20 | 21 | topology.script.file.name 22 | /home/hadoop/hadoop/conf/topology.sh 23 | 24 | ==================== 25 | The Above works very well on Hadoop 1 but for hadoop 2, make sure to have the correct format emiited by the script. It 26 | takes IP addresses instead of DNS name and also there are multiple classes like simpleDNS and Table based. We do not need to do anything if we are using a script as above, but for Java invocations and other tabular formats we need to modify the "topology.node.switch.mapping.impl" 27 | -------------------------------------------------------------------------------- /Notes/remove_datanode.txt: -------------------------------------------------------------------------------- 1 | Add/Remove a Datanode 2 | ===================== 3 | 4 | Decommission a host gracefully 5 | 6 | 7 | dfs.hosts.exclude 8 | /home/hadoop/excludes 9 | true 10 | 11 | 12 | Similarly for Jobtracker. 13 | 14 | 15 | mapred.hosts.exclude 16 | /home/hadoop/excludes 17 | true 18 | 19 | 20 | mapred.hosts.exclude in mapred-site.xml 21 | 22 | Add the FQDN to the exclude file and refresh 23 | 24 | Update for the Namenode 25 | ----------------------- 26 | 27 | hadoop dfsadmin -refreshNodes 28 | 29 | Update for Jobtracker 30 | ---------------------- 31 | 32 | hadoop mradmin -refreshNodes 33 | 34 | 35 | Add Hosts: 36 | 37 | 1. 
dfs.hosts in the hdfs-site.xml, mapred.hosts 38 | 39 | 40 | ================================================ 41 | 42 | Cluster Balancing 43 | ----------------- 44 | 45 | hadoop balancer -threshold 40 46 | 47 | ============================================== 48 | 49 | Add Disk Space to a datanode 50 | ---------------------------- 51 | 52 | How do you add storage to cluster 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | dfs.hosts 61 | /home/hadoop/include 62 | true 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /Notes/repo_server.txt: -------------------------------------------------------------------------------- 1 | Setup Repo Server 2 | ================= 3 | 4 | Mount Centos DVD and install: 5 | 6 | cd /media/Rhel 6 DVD/Packages/ 7 | 8 | # yum install vsftpd* 9 | # yum install createrepo* 10 | 11 | # mkdir /var/ftp/pub/Centos65 12 | 13 | cp -a /media/RHEL_6_DVD/* /var/ftp/pub/Centos65/ 14 | 15 | # createrepo -v /var/ftp/pub/Centos65/ 16 | 17 | 18 | # service vsftpd restart 19 | 20 | ======================== 21 | 22 | On all the nodes 23 | 24 | # rm -rf /etc/yum.repos.d/* 25 | # vi /etc/yum.repos.d/server.repo 26 | 27 | [server] 28 | name=Centos 6.5 repository 29 | baseurl=ftp:///pub/Centos65/ 30 | gpgcheck=0 31 | enable=1 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /Notes/scoop.txt: -------------------------------------------------------------------------------- 1 | export SQOOP_HOME=/usr/lib/sqoop 2 | export PATH=$PATH:$SQOOP_HOME/bin 3 | 4 | 5 | Step 2: Configure the MySQL Service and Connector 6 | 7 | Download mysql-connector-java-5.0.5.jar file and copy it to $SQOOP_HOME/lib directory. 8 | 9 | Step 3: Sqoop Installation 10 | 11 | Sqoop Installation Tutorial for instructions of how to install Sqoop. 12 | 13 | Database and table creation in MySQL 14 | 15 | First connect to MySQL 16 | 17 | $ mysql -u root -p 18 | 19 | Enter password: 20 | 21 | Create database ‘testDb’ and use ‘testDb’ database as a current database. 22 | 23 | mysql> create database testDb; 24 | 25 | mysql> use testDb; 26 | 27 | Create table ‘student’ 28 | 29 | mysql> create table student(id integer,name char(20)); 30 | 31 | Add following 2 records to the table 32 | 33 | mysql> insert into student values(1,'Archana'); 34 | 35 | mysql> insert into student values(2,'XYZ'); 36 | 37 | Exit from MySQL 38 | 39 | mysql> exit; 40 | 41 | Sqoop import 42 | 1. Importing a table into HDFS 43 | 44 | 1st way 45 | 46 | Command for import 47 | 48 | sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --m 1 49 | 50 | Execute the sqoop import 51 | 52 | Here we are using database ‘testDb’ , username ‘root’, password ‘hadoop123′, and table student. 53 | 54 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --m 1 55 | 56 | ——————- NOTE——————– 57 | 58 | If you have not defined primary key for your table then you have to give ‘-m 1′ option for import. 59 | Otherwise it gives error 60 | ERROR tool.ImportTool: Error during import: No primary key could be found for table student1. Please specify one with --split-by or perform a sequential import with '-m 1'. 
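If the table does have a numeric key column (the student table above has an integer id), the import can instead be parallelised by naming the split column explicitly, for example:

sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --split-by id -m 2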
61 | 62 | 2nd Way 63 | 64 | Create a config file $HOME/import.txt add following to the config file 65 | 66 | import.txt 67 | 68 | import 69 | --connect 70 | jdbc:mysql://localhost/testDb 71 | --username 72 | root 73 | --password 74 | hadoop123 75 | 76 | Execute the sqoop import 77 | 78 | sqoop --options-file /home/hduser/import.txt --table student -m 1 79 | 80 | Once import is done you can find student.jar, student.class and student.java at following location /tmp/sqoop-hduser/compile/—-/student.jar 81 | 82 | Files created in HDFS 83 | 84 | $ hadoop dfs -ls -R student 85 | 86 | Found 3 items 87 | 88 | -rw-r--r-- 1 hduser supergroup 0 2013-09-13 15:38 /user/hduser/student/_SUCCESS 89 | 90 | drwxr-xr-x - hduser supergroup 0 2013-09-13 15:38 /user/hduser/student/_logs 91 | 92 | -rw-r--r-- 1 hduser supergroup 16 2013-09-13 15:38 /user/hduser/student/part-m-00000 93 | 94 | Data file contents 95 | 96 | $ hadoop dfs -cat /user/hduser/student/part-m-00000 97 | 98 | 1,Archana 99 | 2,XYZ 100 | 101 | 2 Import all rows of a table in MySQL, but specific columns of the table 102 | 103 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --columns "name" -m 1 104 | 105 | Data file contents 106 | 107 | $ hadoop dfs -cat /user/hduser/student/part-m-00000 108 | 109 | Archana 110 | Xyz 111 | 112 | 3 Import all columns, filter rows using where clause 113 | 114 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --where "id>1" -m 1 --target-dir /user/hduser/ar 115 | 116 | Data file contents 117 | 118 | $ hadoop dfs -cat /user/hduser/ar/part-m-00000 119 | 2,XYZ -------------------------------------------------------------------------------- /Notes/sqoop.txt: -------------------------------------------------------------------------------- 1 | export SQOOP_HOME=/usr/lib/sqoop 2 | export PATH=$PATH:$SQOOP_HOME/bin 3 | 4 | 5 | Step 2: Configure the MySQL Service and Connector 6 | 7 | Download mysql-connector-java-5.0.5.jar file and copy it to $SQOOP_HOME/lib directory. 8 | 9 | Step 3: Sqoop Installation 10 | 11 | Sqoop Installation Tutorial for instructions of how to install Sqoop. 12 | 13 | Database and table creation in MySQL 14 | 15 | First connect to MySQL 16 | 17 | $ mysql -u root -p 18 | 19 | Enter password: 20 | 21 | Create database ‘testDb’ and use ‘testDb’ database as a current database. 22 | 23 | mysql> create database testDb; 24 | 25 | mysql> use testDb; 26 | 27 | Create table ‘student’ 28 | 29 | mysql> create table student(id integer,name char(20)); 30 | 31 | Add following 2 records to the table 32 | 33 | mysql> insert into student values(1,'Archana'); 34 | 35 | mysql> insert into student values(2,'XYZ'); 36 | 37 | Exit from MySQL 38 | 39 | mysql> exit; 40 | 41 | Sqoop import 42 | 1. Importing a table into HDFS 43 | 44 | 1st way 45 | 46 | Command for import 47 | 48 | sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --m 1 49 | 50 | Execute the sqoop import 51 | 52 | Here we are using database ‘testDb’ , username ‘root’, password ‘hadoop123′, and table student. 53 | 54 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --m 1 55 | 56 | ——————- NOTE——————– 57 | 58 | If you have not defined primary key for your table then you have to give ‘-m 1′ option for import. 
59 | Otherwise it gives error 60 | ERROR tool.ImportTool: Error during import: No primary key could be found for table student1. Please specify one with --split-by or perform a sequential import with '-m 1'. 61 | 62 | 2nd Way 63 | 64 | Create a config file $HOME/import.txt add following to the config file 65 | 66 | import.txt 67 | 68 | import 69 | --connect 70 | jdbc:mysql://localhost/testDb 71 | --username 72 | root 73 | --password 74 | hadoop123 75 | 76 | Execute the sqoop import 77 | 78 | sqoop --options-file /home/hduser/import.txt --table student -m 1 79 | 80 | Once import is done you can find student.jar, student.class and student.java at following location /tmp/sqoop-hduser/compile/—-/student.jar 81 | 82 | Files created in HDFS 83 | 84 | $ hadoop dfs -ls -R student 85 | 86 | Found 3 items 87 | 88 | -rw-r--r-- 1 hduser supergroup 0 2013-09-13 15:38 /user/hduser/student/_SUCCESS 89 | 90 | drwxr-xr-x - hduser supergroup 0 2013-09-13 15:38 /user/hduser/student/_logs 91 | 92 | -rw-r--r-- 1 hduser supergroup 16 2013-09-13 15:38 /user/hduser/student/part-m-00000 93 | 94 | Data file contents 95 | 96 | $ hadoop dfs -cat /user/hduser/student/part-m-00000 97 | 98 | 1,Archana 99 | 2,XYZ 100 | 101 | 2 Import all rows of a table in MySQL, but specific columns of the table 102 | 103 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --columns "name" -m 1 104 | 105 | Data file contents 106 | 107 | $ hadoop dfs -cat /user/hduser/student/part-m-00000 108 | 109 | Archana 110 | Xyz 111 | 112 | 3 Import all columns, filter rows using where clause 113 | 114 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --where "id>1" -m 1 --target-dir /user/hduser/ar 115 | 116 | Data file contents 117 | 118 | $ hadoop dfs -cat /user/hduser/ar/part-m-00000 119 | 2,XYZ -------------------------------------------------------------------------------- /Notes/sqoop1.txt: -------------------------------------------------------------------------------- 1 | sqoop list-databases --connect jdbc:mysql://client.cluster1.com/employee --username sqoop --password passwd 2 | 3 | sqoop list-tables --connect jdbc:mysql://client.cluster1.com/employee --username sqoop --password passwd 4 | 5 | 6 | sqoop import --connect jdbc:mysql://client.cluster1.com/employee --username sqoop --password passwd --table student -m 1 --target-dir /user/sqoop/employee 7 | 8 | sqoop import --connect jdbc:mysql://client.cluster1.com/employee --username sqoop --password passwd --table student -m 1 --target-dir /user/sqoop/employee 9 | 10 | 11 | sqoop --options-file SqoopImportOptions.txt \ 12 | --table employees \ 13 | --where "emp_no > 499948" \ 14 | --as-textfile \ 15 | -m 1 \ 16 | --target-dir /user/airawat/sqoop-mysql/employeeGtTest 17 | 18 | 19 | sqoop --options-file SqoopImportOptions.txt \ 20 | --query 'select EMP_NO,FIRST_NAME,LAST_NAME from employees where $CONDITIONS' \ 21 | --fetch-size=50000 \ 22 | --split-by EMP_NO \ 23 | --direct \ 24 | --target-dir /user/airawat/sqoop-mysql/FetchSize 25 | 26 | sqoop --options-file SqoopImportOptions.txt \ 27 | 28 | --query 'select EMP_NO,FIRST_NAME,LAST_NAME from employees where $CONDITIONS' \ 29 | -z \ 30 | --split-by EMP_NO \ 31 | --direct \ 32 | --target-dir /user/airawat/sqoop-mysql/CompressedSampl 33 | 34 | ================= 35 | mysql> create table employee(id varchar(20),name varchar(20),salary varchar(10)); 36 | 37 | hive -> CREATE External TABLE emp_hive (id INT, name STRING, salary 
STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE location '/user/hadoop/table'; 38 | 39 | 40 | sqoop import --connect jdbc:mysql://repo.cluster1.com/test --username hadoop --password hivepassword --table employee --target-dir /user/hadoop/table -m 1 --incremental append -check-column id 41 | 42 | #!/bin/bash 43 | 44 | for i in `seq 1 100` 45 | do 46 | echo "insert into test.employee(id,name,salary) values('${i}','Am${i}','10000');" 47 | done 48 | -------------------------------------------------------------------------------- /Notes/yarn.txt: -------------------------------------------------------------------------------- 1 | hadoop-daemon.sh start namenode 2 | hadoop-daemon.sh start datanode 3 | 4 | yarn-daemon.sh start resourcemanager 5 | yarn-daemon.sh start nodemanager 6 | 7 | 8 | 9 | yarn.resourcemanager.address 10 | ha-nn1.hacluster1.com:8032 11 | the host is the hostname of the ResourceManager and the port is the port on 12 | which the clients can talk to the Resource Manager. 13 | 14 | 15 | 16 | yarn.resourcemanager.scheduler.address 17 | ha-nn1.hacluster1.com:8030 18 | host is the hostname of the resourcemanager and port is the port 19 | on which the Applications in the cluster talk to the Resource Manager. 20 | 21 | 22 | 23 | 24 | yarn.resourcemanager.resource-tracker.address 25 | ha-nn1.hacluster1.com:8031 26 | host is the hostname of the resource manager and 27 | port is the port on which the NodeManagers contact the Resource Manager. 28 | 29 | 30 | 31 | 32 | yarn.nodemanager.address 33 | 0.0.0.0:9004 34 | the nodemanagers bind to this port 35 | 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | @ Netxillon Technologies. You are allowed to use and modify any work here, provided you acknowlege the source back. 3 | Please contact at trainings@netxillon.com for any questions. 4 | 5 | Disclaimer: There is no responsibility for any kind of damage caused, by using this github. Please make sure you understand the things here before implementing them in production. 6 | ``` 7 | ``` 8 | http://www.netxillon.com 9 | For any help you can reach me at: trainings@netxillon.com 10 | ``` 11 | 12 | #### Courses 13 | 14 | Hadoop Cluster Configurations 15 | The config files are from running cluster. Feel free to use them, but please drop an email with your feedback. 16 | 17 | I provide Advanced Hadoop Administration and DevOps trainings: 18 | > Hadoop, HBase, Kafka, Spark 19 | > Ansible automation for Hadoop Stack 20 | > Advanced Linux Optmizations 21 | 22 | Advanced Hadoop Training: I will be covering topics like: detailed kerberos, Encryption, Centerlized caching, Storage policy, Ranger, Knox, Hadoop Performance Tuning and Production Use cases. Contact me for details. 23 | 24 | > "Doing a course is not a guarantee for a job, but having a solid foundation surely is" 25 | 26 | For Details on Courses offered, please refer to the folder **Courses_Offered**. 27 | -------------------------------------------------------------------------------- /Schedulers/capacity-scheduler.xml: -------------------------------------------------------------------------------- 1 | 14 | 15 | 16 | 17 | yarn.scheduler.capacity.maximum-applications 18 | 10000 19 | 20 | Maximum number of applications that can be pending and running. 
21 | 22 | 23 | 24 | 25 | yarn.scheduler.capacity.maximum-am-resource-percent 26 | 0.1 27 | 28 | Maximum percent of resources in the cluster which can be used to run 29 | application masters i.e. controls number of concurrent running 30 | applications. 31 | 32 | 33 | 34 | 35 | yarn.scheduler.capacity.resource-calculator 36 | org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator 37 | 38 | The ResourceCalculator implementation to be used to compare 39 | Resources in the scheduler. 40 | The default i.e. DefaultResourceCalculator only uses Memory while 41 | DominantResourceCalculator uses dominant-resource to compare 42 | multi-dimensional resources such as Memory, CPU etc. 43 | 44 | 45 | 46 | 47 | yarn.scheduler.capacity.root.queues 48 | default,sales,marketing 49 | 50 | The queues at the this level (root is the root queue). 51 | 52 | 53 | 54 | 55 | yarn.scheduler.capacity.root.default.capacity 56 | 50 57 | Default queue target capacity. 58 | 59 | 60 | 61 | yarn.scheduler.capacity.root.default.user-limit-factor 62 | 1 63 | 64 | Default queue user limit a percentage from 0.0 to 1.0. 65 | 66 | 67 | 68 | 69 | yarn.scheduler.capacity.root.default.maximum-capacity 70 | 100 71 | 72 | The maximum capacity of the default queue. 73 | 74 | 75 | 76 | 77 | yarn.scheduler.capacity.root.default.state 78 | RUNNING 79 | 80 | The state of the default queue. State can be one of RUNNING or STOPPED. 81 | 82 | 83 | 84 | 85 | yarn.scheduler.capacity.root.default.acl_submit_applications 86 | * 87 | 88 | The ACL of who can submit jobs to the default queue. 89 | 90 | 91 | 92 | 93 | yarn.scheduler.capacity.root.default.acl_administer_queue 94 | * 95 | 96 | The ACL of who can administer jobs on the default queue. 97 | 98 | 99 | 100 | 101 | yarn.scheduler.capacity.node-locality-delay 102 | 40 103 | 104 | Number of missed scheduling opportunities after which the CapacityScheduler 105 | attempts to schedule rack-local containers. 106 | Typically this should be set to number of nodes in the cluster, By default is setting 107 | approximately number of nodes in one rack which is 40. 
108 | 109 | 110 | 111 | # sales queue 112 | 113 | 114 | yarn.scheduler.capacity.root.sales.capacity 115 | 30 116 | 117 | 118 | 119 | yarn.scheduler.capacity.root.sales.user-limit-factor 120 | 1 121 | 122 | 123 | 124 | yarn.scheduler.capacity.root.sales.maximum-capacity 125 | 100 126 | 127 | 128 | 129 | yarn.scheduler.capacity.root.sales.state 130 | RUNNING 131 | 132 | 133 | 134 | yarn.scheduler.capacity.root.sales.acl_submit_applications 135 | * 136 | 137 | 138 | 139 | yarn.scheduler.capacity.root.sales.acl_administer_queue 140 | * 141 | 142 | 143 | # Marketing Queue 144 | 145 | 146 | yarn.scheduler.capacity.root.marketing.capacity 147 | 20 148 | 149 | 150 | 151 | yarn.scheduler.capacity.root.marketing.user-limit-factor 152 | 1 153 | 154 | 155 | 156 | yarn.scheduler.capacity.root.marketing.maximum-capacity 157 | 100 158 | 159 | 160 | 161 | yarn.scheduler.capacity.root.marketing.state 162 | RUNNING 163 | 164 | 165 | 166 | yarn.scheduler.capacity.root.marketing.acl_submit_applications 167 | * 168 | 169 | 170 | 171 | yarn.scheduler.capacity.root.marketing.acl_administer_queue 172 | * 173 | 174 | 175 | 176 | yarn.scheduler.capacity.queue-mappings 177 | 178 | 179 | A list of mappings that will be used to assign jobs to queues 180 | The syntax for this list is [u|g]:[name]:[queue_name][,next mapping]* 181 | Typically this list will be used to map users to queues, 182 | for example, u:%user:%user maps all users to queues with the same name 183 | as the user. 184 | 185 | 186 | 187 | 188 | yarn.scheduler.capacity.queue-mappings-override.enable 189 | false 190 | 191 | If a queue mapping is present, will it override the value specified 192 | by the user? This can be used by administrators to place jobs in queues 193 | that are different than the one specified by the user. 194 | The default is false. 
195 | 196 | 197 | 198 | 199 | -------------------------------------------------------------------------------- /Schedulers/commands: -------------------------------------------------------------------------------- 1 | Hadoop 1: 2 | hadoop jar hadoop/hadoop-examples-1.2.1.jar wordcount -Dmapred.job.queue.name=high /project/input /output2233231 3 | 4 | Hadoop 2: 5 | yarn jar hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.3.jar wordcount -Dmapred.job.queue.name=sales /test /out 6 | 7 | Useful Commands: 8 | $ yarn rmadmin -refreshQueues 9 | $ mapred queue -list 10 | -------------------------------------------------------------------------------- /Schedulers/fair-scheduler.xml: -------------------------------------------------------------------------------- 1 | Hadoop 1, we used the concept "pool" as well, but later it was standarized to queues 2 | 3 | 4 | 5 | 6 | 10 7 | 5 8 | 9 | 10 | 11 | #Examples 12 | 13 | 14 | 10000 mb,0vcores 15 | 90000 mb,0vcores 16 | 50 17 | 0.1 18 | 2.0 19 | fair 20 | 21 | charlie 22 | 5000 mb,0vcores 23 | 24 | 25 | 26 | 0.5 27 | 28 | 30 | 31 | 3.0 32 | 33 | 34 | 35 | 30 36 | 37 | 5 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Schedulers/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | mapred.job.tracker 3 | jt.cluster1.com:9001 4 | 5 | 6 | 7 | mapred.jobtracker.taskScheduler 8 | org.apache.hadoop.mapred.FairScheduler 9 | 10 | 11 | 12 | mapred.fairscheduler.allocation.file 13 | /home/hadoop/hadoop/conf/fair-scheduler.xml 14 | 15 | 16 | 17 | mapred.fairscheduler.poolnameproperty 18 | mapred.job.queue.name 19 | true 20 | 21 | 22 | 23 | mapred.queue.names 24 | default,high,low 25 | 26 | -------------------------------------------------------------------------------- /Schedulers/user-mappings.txt: -------------------------------------------------------------------------------- 1 | 2 | yarn.scheduler.capacity.queue-mappings 3 | u:hdfs:marketing 4 | 5 | A list of mappings that will be used to assign jobs to queues 6 | The syntax for this list is [u|g]:[name]:[queue_name][,next mapping]* 7 | Typically this list will be used to map users to queues, 8 | for example, u:%user:%user maps all users to queues with the same name 9 | as the user. 10 | 11 | 12 | 13 | u:%user:%primary_group 14 | 15 | 16 | yarn.scheduler.capacity.queue-mappings 17 | u:%user:%primary_group 18 | 19 | 20 | 21 | yarn.scheduler.capacity.queue-mappings 22 | u:maria:engineering,g:webadmins:weblog 23 | 24 | 25 | 26 | yarn.scheduler.capacity.queue-mappings-override.enable 27 | false 28 | 29 | If a queue mapping is present and override is set to true, it will override the queue value specified 30 | by the user. This can be used by administrators to place jobs in queues 31 | that are different than the one specified by the user. 32 | The default is false - user can specify to a non-default queue. 33 | 34 | 35 | -------------------------------------------------------------------------------- /Schedulers/yarn-site.xml_capacity: -------------------------------------------------------------------------------- 1 | # Capacity Scheduler is the default scheduler. 
So, we do not need to configure the below in Hadoop 2.x 2 | 3 | 4 | 5 | yarn.resourcemanager.scheduler.class 6 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler 7 | yarn-default.xml 8 | 9 | -------------------------------------------------------------------------------- /Schedulers/yarn-site.xml_fair: -------------------------------------------------------------------------------- 1 | 2 | yarn.resourcemanager.scheduler.class 3 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler 4 | 5 | 6 | 7 | yarn.scheduler.fair.allocation.file 8 | hadoop/conf/fair-scheduler.xml 9 | 10 | 11 | -------------------------------------------------------------------------------- /Security/README.md: -------------------------------------------------------------------------------- 1 | Important Points before starting with Security: 2 | =============================================== 3 | 1. Ensure NTP is working and all nodes are in sync. 4 | 2. Ensure every system has the right entropy, at least 1000; refer to the installation of rngd under the Kerberos install script. 5 | - This ensures faster cryptographic operations for keys, principals, etc. 6 | 3. For Kerberos, make sure Java is patched with the JCE unlimited-strength policy (unrestricted key length). 7 | 4. If not using SASL for Datanodes, ensure JSVC_HOME points to the jsvc binary. 8 | 9 | This is a very vast topic with a lot to cover: 10 | 11 | - The integrations can be with AD, FreeIPA, OpenLDAP or Kerberos. 12 | - SIEM, Rhino, etc. 13 | 14 | For any specific needs, please contact me at trainings@netxillon.com 15 | -------------------------------------------------------------------------------- /Security/SSL_Configs/CA/README.txt: -------------------------------------------------------------------------------- 1 | This is to set up a CA and get all certs signed by that CA.
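A minimal sketch of the overall flow, assuming the CA layout under /etc/pki/CA already exists; hostnames, key names and passphrases below are illustrative, and the full command set follows in commands_CA_JKS:

# One-time CA setup (on the CA host): create the CA key and a self-signed root cert
openssl genrsa -out private/myca.key -aes128 2048
openssl req -new -x509 -key private/myca.key -days 365 > CA.crt

# Per node (node1 is an example name): generate a key and CSR, sign the CSR with the CA, then verify
openssl req -new -newkey rsa:2048 -nodes -keyout node1.key -out node1.csr
openssl ca -in node1.csr -out node1.crt
openssl verify -CAfile CA.crt node1.crt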
2 | -------------------------------------------------------------------------------- /Security/SSL_Configs/commands_CA_JKS: -------------------------------------------------------------------------------- 1 | # yum install openssl-devel 2 | cd /etc/pki/CA/ 3 | 4 | ls -l crl/ 5 | ls -l newcerts/ 6 | ls -l private/ 7 | vi /etc/pki/tls/openssl.cnf 8 | touch /etc/pki/CA/index.txt 9 | echo 01 > /etc/pki/CA/serial 10 | 11 | openssl genrsa -out private/myca.key -des3 2048 12 | or openssl genrsa -out private/myca.key -aes128 2048 13 | 14 | openssl req -new -x509 -key private/myca.key -days 365 > CA.crt 15 | ---------------- 16 | more refined way: 17 | openssl req -new -sha256 -key private/myca.key -nodes -out rootCA.csr 18 | openssl x509 -req -days 3650 -extensions v3_ca -in rootCA.csr -signkey private/myca.key -out rootCA.pem 19 | ------------------ 20 | 21 | mkdir certs 22 | cd certs/ 23 | openssl req -new -newkey rsa:2048 -nodes -keyout dilithium.key -out dilithium.csr 24 | 25 | openssl ca -in dilithium.csr -out dilithium.crt 26 | openssl req -new -newkey rsa:2048 -nodes -keyout cluster1.key -out cluster1.csr 27 | openssl ca -in cluster1.csr -out cluster1.crt 28 | openssl req -new -newkey rsa:2048 -nodes -keyout cluster1.key -out cluster1.csr 29 | openssl ca -in cluster1.csr -out cluster1.crt 30 | 31 | openssl verify -CAfile /etc/pki/CA/CA.crt certs/dilithium.crt 32 | 33 | 34 | openssl verify cluster1.crt 35 | openssl verify dilithium.crt 36 | 37 | 38 | Hadoop JKS steps: CA signed 39 | --------------------------- 40 | 41 | keytool -genkey -alias `hostname -s` -keyalg RSA -dname "CN=`hostname -f`,OU=Netxillon Technologies,O=Netxillon Technologies,L=Melbourne,ST=Victoria,C=AU" -keypass password -keystore keystore.jks -storepass password 42 | 43 | keytool -certreq -alias `hostname -s` -keyalg RSA -file `hostname -s`.csr -keystore keystore.jks -storepass password 44 | 45 | openssl ca -batch -passin pass:redhat -in `hostname -s`.csr -out `hostname -s`.crt 46 | 47 | keytool -import -keystore keystore.jks -file CA.crt -alias CARoot -storepass password -noprompt 48 | 49 | keytool -import -keystore keystore.jks -file `hostname -s`.crt -alias `hostname -s` -keypass password -storepass password -noprompt 50 | 51 | keytool -importcert -keystore truststore.jks -file CA.crt -alias CARoot -storepass password -noprompt 52 | 53 | Good to do: 54 | =========== 55 | keytool -exportcert -alias caroot -keystore /etc/security/keys/truststore.jks -file /usr/java/default/jre/lib/security/cacerts 56 | 57 | 58 | Verify PEM format or not 59 | ======================== 60 | 61 | openssl x509 -inform PEM -in CA.crt 62 | openssl x509 -inform PEM -in CA.pem 63 | openssl x509 -inform PEM -in cm1.opta.com-server.pem 64 | 65 | Verify cert presented by Server 66 | -------------------------------- 67 | openssl s_client -verify 100 -showcerts -CAfile <($JAVA_HOME/bin/keytool -list -rfc -keystore $JAVA_HOME/jre/lib/security/jssecacerts -storepass changeit) -connect cm1.opta.com:7183 68 | 69 | openssl s_client -connect cm1.opta.com:7183 2>/dev/null 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | fs.default.name 10 | hdfs://nn1.cluster1.com:9000 11 | 12 | 13 | 14 | hadoop.rpc.protection 15 | privacy 16 | 17 | 18 | 19 | hadoop.ssl.require.client.cert 20 | false 21 | 22 | 23 | 24 | hadoop.ssl.hostname.verifier 25 | DEFAULT 26 | 27 | 28 | 29 | hadoop.ssl.keystores.factory.class 30 | org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory 31 | 32 | 33 | 34 | hadoop.ssl.server.conf 35 | ssl-server.xml 36 | 37 | 38 | 39 | hadoop.ssl.client.conf 40 | 
ssl-client.xml 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /Security/SSL_Configs/hadoop_ssl_configs/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | dfs.encrypt.data.transfer 10 | true 11 | 12 | 13 | 14 | dfs.block.access.token.enable 15 | true 16 | 17 | 18 | 19 | dfs.data.transfer.protection 20 | privacy 21 | 22 | 23 | 24 | dfs.namenode.secondary.https-address 25 | nn1.cluster1.com:50091 26 | 27 | 28 | 29 | dfs.namenode.https-address 30 | nn1.cluster1.com:50470 31 | 32 | 33 | 34 | dfs.webhdfs.enabled 35 | true 36 | 37 | 38 | 39 | dfs.https.enable 40 | true 41 | 42 | 43 | 44 | dfs.http.policy 45 | HTTPS_ONLY 46 | 47 | 48 | 49 | dfs.name.dir 50 | /data/nn1,/data/nn2 51 | 52 | 53 | 54 | dfs.data.dir 55 | /data/d1,/data/d2 56 | 57 | 58 | 59 | dfs.replication 60 | 1 61 | 62 | 63 | 64 | dfs.datanode.https.address 65 | 0.0.0.0:50475 66 | 67 | 68 | 69 | dfs.datanode.address 70 | 0.0.0.0:10019 71 | 72 | 73 | 74 | dfs.datanode.http.address 75 | 0.0.0.0:10022 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /Security/SSL_Configs/hadoop_ssl_configs/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | mapreduce.framework.name 10 | yarn 11 | 12 | 13 | 14 | hadoop.ssl.enabled 15 | true 16 | 17 | 18 | 19 | mapreduce.shuffle.ssl.enabled 20 | true 21 | 22 | 23 | 24 | hadoop.ssl.require.client.cert 25 | false 26 | 27 | 28 | 29 | hadoop.ssl.hostname.verifier 30 | DEFAULT 31 | true 32 | 33 | 34 | 35 | hadoop.ssl.keystores.factory.class 36 | org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory 37 | true 38 | 39 | 40 | 41 | hadoop.ssl.server.conf 42 | ssl-server.xml 43 | true 44 | 45 | 46 | 47 | hadoop.ssl.client.conf 48 | ssl-client.xml 49 | true 50 | 51 | 52 | 53 | 54 | mapreduce.jobhistory.http.policy 55 | HTTPS_ONLY 56 | 57 | 58 | 59 | mapreduce.jobhistory.webapp.https.address 60 | rm1.cluster1.com:19889 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /Security/SSL_Configs/hadoop_ssl_configs/ssl-client.xml: -------------------------------------------------------------------------------- 1 | [hadoop@ip-172-31-15-180 ~]$ cat /etc/hadoop/conf/ssl-client.xml 2 | 3 | 4 | 5 | 6 | 7 | 8 | ssl.client.truststore.location 9 | ${user.home}/keystore/final.jks 10 | Truststore to be used by clients like distcp. Must be 11 | specified. 12 | 13 | 14 | 15 | 16 | ssl.client.truststore.password 17 | password 18 | Optional. Default value is "". 19 | 20 | 21 | 22 | 23 | ssl.client.truststore.type 24 | jks 25 | Optional. The keystore file format, default value is "jks". 26 | 27 | 28 | 29 | 30 | ssl.client.truststore.reload.interval 31 | 10000 32 | Truststore reload check interval, in milliseconds. 33 | Default value is 10000 (10 seconds). 34 | 35 | 36 | 37 | 38 | ssl.client.keystore.location 39 | ${user.home}/keystore/keystore.jks 40 | Keystore to be used by clients like distcp. Must be 41 | specified. 42 | 43 | 44 | 45 | 46 | ssl.client.keystore.password 47 | password 48 | Optional. Default value is "". 49 | 50 | 51 | 52 | 53 | ssl.client.keystore.keypassword 54 | password 55 | Optional. Default value is "". 56 | 57 | 58 | 59 | 60 | ssl.client.keystore.type 61 | jks 62 | Optional. The keystore file format, default value is "jks". 
63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /Security/SSL_Configs/hadoop_ssl_configs/ssl-server.xml: -------------------------------------------------------------------------------- 1 | [hadoop@ip-172-31-15-180 ~]$ cat /etc/hadoop/conf/ssl-server.xml 2 | 3 | 4 | 5 | 6 | 7 | 8 | ssl.server.keystore.type 9 | jks 10 | 11 | 12 | ssl.server.keystore.location 13 | /home/hadoop/keystore/keystore.jks 14 | 15 | 16 | ssl.server.keystore.password 17 | password 18 | 19 | 20 | 21 | 22 | ssl.server.truststore.type 23 | jks 24 | 25 | 26 | ssl.server.truststore.location 27 | /home/hadoop/keystore/truststore.jks 28 | 29 | 30 | ssl.server.truststore.password 31 | password 32 | 33 | 34 | ssl.server.truststore.reload.interval 35 | 10000 36 | 37 | 38 | -------------------------------------------------------------------------------- /Security/SSL_Configs/hadoop_ssl_configs/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | yarn.resourcemanager.resource-tracker.address 9 | rm1.cluster1.com:9001 10 | 11 | 12 | 13 | yarn.resourcemanager.scheduler.address 14 | rm1.cluster1.com:9002 15 | 16 | 17 | 18 | yarn.resourcemanager.address 19 | rm1.cluster1.com:9003 20 | 21 | 22 | 23 | yarn.nodemanager.aux-services 24 | mapreduce_shuffle 25 | 26 | 27 | 28 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 29 | org.apache.hadoop.mapred.ShuffleHandler 30 | 31 | 32 | 33 | yarn.http.policy 34 | HTTPS_ONLY 35 | 36 | 37 | 38 | yarn.resourcemanager.webapp.https.address 39 | rm1.cluster1.com:8089 40 | 41 | 42 | 43 | yarn.log.server.url 44 | https://rm1.cluster1.com:19889/jobhistory/logs 45 | 46 | 47 | 48 | yarn.nodemanager.webapp.https.address 49 | 0.0.0.0:8090 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /Security/kerberos/JT/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | fs.default.name 10 | hdfs://nn1.cluster1.com:9000 11 | 12 | 13 | 14 | hadoop.security.authentication 15 | kerberos 16 | 17 | 18 | 19 | hadoop.security.authorization 20 | true 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Security/kerberos/JT/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | dfs.data.dir 10 | /space/d1 11 | true 12 | 13 | 14 | 15 | dfs.replication 16 | 1 17 | 18 | 19 | 20 | dfs.permissions.supergroup 21 | hadoop 22 | 23 | 24 | 25 | dfs.permissions.superusergroup 26 | hadoop 27 | 28 | 29 | 30 | dfs.datanode.data.dir.perm 31 | 700 32 | 33 | 34 | 35 | dfs.datanode.address 36 | 192.168.1.74:1004 37 | 38 | 39 | 40 | dfs.datanode.http.address 41 | 192.168.1.74:1006 42 | 43 | 44 | 45 | dfs.datanode.keytab.file 46 | /home/hadoop/dn.hdfs.keytab 47 | 48 | 49 | 50 | dfs.datanode.kerberos.principal 51 | dn/_HOST@CLUSTER1.COM 52 | 53 | 54 | 55 | dfs.datanode.kerberos.https.principal 56 | host/_HOST@CLUSTER1.COM 57 | 58 | 59 | 60 | dfs.namenode.kerberos.principal 61 | nn/_HOST@CLUSTER1.COM 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /Security/kerberos/JT/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | mapred.job.tracker 10 | jt1.cluster1.com:9001 11 | 12 | 13 | 14 | mapreduce.jobtracker.kerberos.principal 
15 | mapred/_HOST@CLUSTER1.COM 16 | 17 | 18 | 19 | mapreduce.jobtracker.kerberos.https.principal 20 | host/_HOST@CLUSTER1.COM 21 | 22 | 23 | 24 | mapreduce.jobtracker.keytab.file 25 | /home/hadoop/mapred.keytab 26 | 27 | 28 | 29 | mapreduce.tasktracker.kerberos.principal 30 | mapred/_HOST@CLUSTER1.COM 31 | 32 | 33 | 34 | mapreduce.tasktracker.kerberos.https.principal 35 | host/_HOST@CLUSTER1.COM 36 | 37 | 38 | 39 | mapreduce.tasktracker.keytab.file 40 | /home/hadoop/tt.mapred.keytab 41 | 42 | 43 | 44 | mapred.local.dir 45 | /space/tmp 46 | 47 | 48 | 49 | mapreduce.tasktracker.group 50 | mapred 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /Security/kerberos/JT/taskcontroller.cfg: -------------------------------------------------------------------------------- 1 | mapred.local.dir=/space/tmp#configured value of mapred.local.dir. It can be a list of comma separated paths. 2 | hadoop.log.dir=/home/hadoop/log#configured value of hadoop.log.dir. 3 | mapred.tasktracker.tasks.sleeptime-before-sigkill=#sleep time before sig kill is to be sent to process group after sigterm is sent. Should be in seconds 4 | mapreduce.tasktracker.group=#configured value of mapreduce.tasktracker.group. 5 | 6 | 7 | mapred.task.tracker.task-controller 8 | org.apache.hadoop.mapred.LinuxTaskController 9 | 10 | 11 | 12 | mapreduce.tasktracker.group 13 | mapred 14 | 15 | -------------------------------------------------------------------------------- /Security/kerberos/Jsvc_download.txt: -------------------------------------------------------------------------------- 1 | 1. http://commons.apache.org/proper/commons-daemon/download_daemon.cgi 2 | 3 | Downlaod package: commons-daemon-1.1.0-native-src.tar.gz 4 | 5 | $ tar -xzvf commons-daemon-1.1.0-native-src.tar.gz 6 | $ cd commons-daemon-1.1.0-native-src/unix 7 | $ ./configure && make 8 | $ cp jsvc /usr/lib 9 | 10 | 11 | Under hadoop-env.sh 12 | 13 | export JSVC_HOME=/usr/lib 14 | 15 | 2. Directly download the binary: http://archive.apache.org/dist/commons/daemon/binaries/ 16 | -------------------------------------------------------------------------------- /Security/kerberos/Namenode_Datanode/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | fs.default.name 10 | hdfs://nn1.cluster1.com:9000 11 | 12 | 13 | 14 | hadoop.security.authentication 15 | kerberos 16 | 17 | 18 | 19 | hadoop.security.authorization 20 | true 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Security/kerberos/Namenode_Datanode/hadoop-env.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Set Hadoop-specific environment variables here. 18 | 19 | # The only required environment variable is JAVA_HOME. All others are 20 | # optional. When running a distributed configuration it is best to 21 | # set JAVA_HOME in this file, so that it is correctly defined on 22 | # remote nodes. 23 | 24 | export JAVA_HOME=/usr/java/latest 25 | 26 | # The java implementation to use. 27 | export JAVA_HOME=${JAVA_HOME} 28 | 29 | # The jsvc implementation to use. Jsvc is required to run secure datanodes 30 | # that bind to privileged ports to provide authentication of data transfer 31 | # protocol. Jsvc is not required if SASL is configured for authentication of 32 | # data transfer protocol using non-privileged ports. 33 | #export JSVC_HOME=${JSVC_HOME} 34 | 35 | export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"} 36 | 37 | # Extra Java CLASSPATH elements. Automatically insert capacity-scheduler. 38 | for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do 39 | if [ "$HADOOP_CLASSPATH" ]; then 40 | export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f 41 | else 42 | export HADOOP_CLASSPATH=$f 43 | fi 44 | done 45 | 46 | # The maximum amount of heap to use, in MB. Default is 1000. 47 | #export HADOOP_HEAPSIZE= 48 | #export HADOOP_NAMENODE_INIT_HEAPSIZE="" 49 | 50 | # Extra Java runtime options. Empty by default. 51 | export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true -Djavax.net.debug=ssl:handshake" 52 | 53 | # Command specific options appended to HADOOP_OPTS when specified 54 | export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} -Djavax.net.debug=ssl $HADOOP_NAMENODE_OPTS" 55 | export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS" 56 | 57 | export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS" 58 | 59 | export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS" 60 | export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS" 61 | 62 | # The following applies to multiple commands (fs, dfs, fsck, distcp etc) 63 | export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS" 64 | #HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS" 65 | 66 | # On secure datanodes, user to run the datanode as after dropping privileges. 67 | # This **MUST** be uncommented to enable secure HDFS if using privileged ports 68 | # to provide authentication of data transfer protocol. This **MUST NOT** be 69 | # defined if SASL is configured for authentication of data transfer protocol 70 | # using non-privileged ports. 71 | export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER} 72 | 73 | # Where log files are stored. $HADOOP_HOME/logs by default. 74 | #export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER 75 | 76 | # Where log files are stored in the secure data environment. 77 | export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER} 78 | 79 | ### 80 | # HDFS Mover specific parameters 81 | ### 82 | # Specify the JVM options to be used when starting the HDFS Mover. 83 | # These options will be appended to the options specified as HADOOP_OPTS 84 | # and therefore may override any similar flags set in HADOOP_OPTS 85 | # 86 | # export HADOOP_MOVER_OPTS="" 87 | 88 | ### 89 | # Advanced Users Only! 
90 | ### 91 | 92 | # The directory where pid files are stored. /tmp by default. 93 | # NOTE: this should be set to a directory that can only be written to by 94 | # the user that will run the hadoop daemons. Otherwise there is the 95 | # potential for a symlink attack. 96 | export HADOOP_PID_DIR=${HADOOP_PID_DIR} 97 | export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR} 98 | 99 | # A string representing this instance of hadoop. $USER by default. 100 | export HADOOP_IDENT_STRING=$USER 101 | -------------------------------------------------------------------------------- /Security/kerberos/Namenode_Datanode/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.name.dir 5 | /data/nn1,/data/nn2 6 | 7 | 8 | 9 | dfs.data.dir 10 | /data/d1,/data/d2 11 | 12 | 13 | 14 | dfs.replication 15 | 1 16 | 17 | 18 | 19 | dfs.permissions.supergroup 20 | hadoop 21 | 22 | 23 | # Kerberos configuration 24 | 25 | 26 | dfs.block.access.token.enable 27 | true 28 | 29 | 30 | 31 | dfs.namenode.keytab.file 32 | /opt/cluster/security/nn.hdfs.keytab 33 | 34 | 35 | 36 | dfs.namenode.kerberos.principal 37 | hdfs/_HOST@CLUSTER1.COM 38 | 39 | 40 | 41 | dfs.namenode.kerberos.http.principal 42 | host/_HOST@CLUSTER1.COM 43 | 44 | 45 | 46 | dfs.web.authentication.kerberos.principal 47 | HTTP/_HOST@CLUSTER1.COM 48 | 49 | 50 | 51 | dfs.namenode.kerberos.internal.spnego.principal 52 | ${dfs.web.authentication.kerberos.principal} 53 | 54 | 55 | # Datanode configuration 56 | 57 | 58 | dfs.datanode.data.dir.perm 59 | 700 60 | 61 | 62 | 63 | dfs.datanode.address 64 | 0.0.0.0:1004 65 | 66 | 67 | 68 | dfs.datanode.http.address 69 | 0.0.0.0:1006 70 | 71 | 72 | 73 | dfs.datanode.keytab.file 74 | /opt/cluster/security/dn.hdfs.keytab 75 | 76 | 77 | 78 | dfs.datanode.kerberos.principal 79 | hdfs/_HOST@CLUSTER1.COM 80 | 81 | 82 | 83 | dfs.datanode.kerberos.http.principal 84 | host/_HOST@CLUSTER1.COM 85 | 86 | 87 | 88 | dfs.web.authentication.kerberos.principal 89 | HTTP/_HOST@CLUSTER1.COM 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /Security/kerberos/Namenode_Datanode/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | mapred.job.tracker 10 | jt1.cluster1.com:9001 11 | 12 | 13 | 14 | mapreduce.jobtracker.kerberos.principal 15 | mapred/_HOST@CLUSTER1.COM 16 | 17 | 18 | 19 | mapreduce.jobtracker.kerberos.https.principal 20 | host/_HOST@CLUSTER1.COM 21 | 22 | 23 | 24 | mapreduce.jobtracker.keytab.file 25 | /home/hadoop/mapred.keytab 26 | 27 | 28 | 29 | mapreduce.tasktracker.kerberos.principal 30 | mapred/_HOST@CLUSTER1.COM 31 | 32 | 33 | 34 | mapreduce.tasktracker.kerberos.https.principal 35 | host/_HOST@CLUSTER1.COM 36 | 37 | 38 | 39 | mapreduce.tasktracker.keytab.file 40 | /home/hadoop/tt.mapred.keytab 41 | 42 | 43 | 44 | mapred.local.dir 45 | /space/tmp 46 | 47 | 48 | 49 | mapreduce.tasktracker.group 50 | mapred 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /Security/kerberos/Namenode_Datanode/taskcontroller.cfg: -------------------------------------------------------------------------------- 1 | mapred.local.dir=/space/tmp#configured value of mapred.local.dir. It can be a list of comma separated paths. 2 | hadoop.log.dir=/home/hadoop/log#configured value of hadoop.log.dir. 
3 | mapred.tasktracker.tasks.sleeptime-before-sigkill=#sleep time before sig kill is to be sent to process group after sigterm is sent. Should be in seconds 4 | mapreduce.tasktracker.group=#configured value of mapreduce.tasktracker.group. 5 | 6 | 7 | mapred.task.tracker.task-controller 8 | org.apache.hadoop.mapred.LinuxTaskController 9 | 10 | 11 | 12 | mapreduce.tasktracker.group 13 | mapred 14 | 15 | -------------------------------------------------------------------------------- /Security/kerberos/README.md: -------------------------------------------------------------------------------- 1 | In production remove legacy encryption algo's and use only: 2 | 3 | default_tkt_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96 4 | default_tgs_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96 5 | permitted_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96 6 | 7 | 8 | Debugging: 9 | 10 | $ export HADOOP_ROOT_LOGGER=TRACE,console; export HADOOP_JAAS_DEBUG=true; export HADOOP_OPTS="-Dsun.security.krb5.debug=true" 11 | 12 | $ hadoop fs -ls / > >(tee fsls-logfile.txt) 2>&1 13 | 14 | $ export KRB5_TRACE=/tmp/kinit.log 15 | -------------------------------------------------------------------------------- /Security/kerberos/kdc.conf: -------------------------------------------------------------------------------- 1 | # On KDC server /var/kerberos/krb5kdc/kdc.conf 2 | 3 | [kdcdefaults] 4 | kdc_ports = 88 5 | kdc_tcp_ports = 88 6 | 7 | [realms] 8 | CLUSTER1.COM = { 9 | #master_key_type = aes256-cts 10 | max_renewable_life = 7d 0h 0m 0s #Needed for Kerberos auto ticket renewing for long running jobs and Hue KGT renewer 11 | acl_file = /var/kerberos/krb5kdc/kadm5.acl 12 | dict_file = /usr/share/dict/words 13 | admin_keytab = /var/kerberos/krb5kdc/kadm5.keytab 14 | supported_enctypes = aes256-cts:normal aes128-cts:normal 15 | default_principal_flags = +renewable #Needed for Kerberos auto ticket renewing for long running jobs and Hue KGT renewer 16 | } 17 | 18 | 19 | 20 | # Also, we need the below steps: 21 | kadmin.local: modprinc -maxrenewlife 90day krbtgt/NETXILLON.COM 22 | kadmin.local: modprinc -maxrenewlife 90day +allow_renewable hue/edge1.netxillon.com@NETXILLON.COM 23 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/README.md: -------------------------------------------------------------------------------- 1 | @Netxillon Technologies. These scripts I used for hadoop1.0 and 2.0, please update the service principals accordingly. Example tt, is no longer valid in hadoop2.0. 2 | Well, we can use any name for the service principal, but just to be consistent on naming conventions each service has a respective principal. 
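As a hedged illustration of that point, a Hadoop 2.x variant of the create_*_princs.sh loops might look like the sketch below; the host list and keytab names are examples only, and only hdfs, yarn, mapred and HTTP principals are typically needed since tt/jt are gone:

#!/bin/bash
# Illustrative sketch: create Hadoop 2.x service principals per host and export them to keytabs.
for h in `cat dn_host_list`
do
  kadmin.local -q "addprinc -randkey hdfs/$h"
  kadmin.local -q "addprinc -randkey yarn/$h"
  kadmin.local -q "addprinc -randkey HTTP/$h"
  kadmin.local -q "xst -norandkey -k hdfs.keytab hdfs/$h HTTP/$h"
  kadmin.local -q "xst -norandkey -k yarn.keytab yarn/$h HTTP/$h"
done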
3 | 4 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/add_users.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for j in `cat user_list` 4 | do 5 | echo -e "hadoop\nhadoop" | kadmin.local -q "addprinc $j" 6 | done 7 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/copy_keytabs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | clush -g all --copy nn.hdfs.keytab --dest=/opt/cluster/security/ 4 | clush -g all --copy dn.hdfs.keytab --dest=/opt/cluster/security/ 5 | clush -g all --copy user.hdfs.keytab --dest=/opt/cluster/security/ 6 | clush -g all -b "chown -R hdfs:hadoop /opt/cluster/" 7 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/create_dn_princs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Generate Hosts principals 4 | 5 | for i in `cat dn_host_list` 6 | do 7 | kadmin.local -q "addprinc -randkey host/$i" 8 | kadmin.local -q "addprinc -randkey HTTP/$i" 9 | kadmin.local -q "addprinc -randkey hdfs/$i" 10 | kadmin.local -q "xst -norandkey -k dn.hdfs.keytab host/$i" 11 | kadmin.local -q "xst -norandkey -k dn.hdfs.keytab HTTP/$i" 12 | kadmin.local -q "xst -norandkey -k dn.hdfs.keytab hdfs/$i" 13 | done 14 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/create_nn_princs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for k in `cat nn_host_list` 4 | do 5 | kadmin.local -q "addprinc -randkey host/$k" 6 | kadmin.local -q "addprinc -randkey HTTP/$k" 7 | kadmin.local -q "addprinc -randkey hdfs/$k" 8 | 9 | kadmin.local -q "xst -norandkey -k nn.hdfs.keytab host/$k" 10 | kadmin.local -q "xst -norandkey -k nn.hdfs.keytab HTTP/$k" 11 | kadmin.local -q "xst -norandkey -k nn.hdfs.keytab hdfs/$k" 12 | done 13 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/create_partions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in `cat hosts` 4 | do 5 | ssh $i 'echo -e "o\nn\np\n1\n\n\nw"' 6 | done 7 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/create_user_keytab.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for k in `cat user_host_list` 4 | do 5 | kadmin.local -q "xst -norandkey -k user.hdfs.keytab host/$k" 6 | done 7 | 8 | for p in `cat user_list` 9 | do 10 | kadmin.local -q "xst -norandkey -k user.hdfs.keytab $p" 11 | done 12 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/delete_list: -------------------------------------------------------------------------------- 1 | HTTP/dn1.cluster1.com@CLUSTER1.COM 2 | HTTP/dn2.cluster1.com@CLUSTER1.COM 3 | HTTP/dn3.cluster1.com@CLUSTER1.COM 4 | HTTP/dn4.cluster1.com@CLUSTER1.COM 5 | HTTP/nn1.cluster1.com@CLUSTER1.COM 6 | dn/dn1.cluster1.com@CLUSTER1.COM 7 | dn/dn2.cluster1.com@CLUSTER1.COM 8 | dn/dn3.cluster1.com@CLUSTER1.COM 9 | dn/dn4.cluster1.com@CLUSTER1.COM 10 | host/dn1.cluster1.com@CLUSTER1.COM 11 | 
host/dn2.cluster1.com@CLUSTER1.COM 12 | host/dn3.cluster1.com@CLUSTER1.COM 13 | host/dn4.cluster1.com@CLUSTER1.COM 14 | host/nn1.cluster1.com@CLUSTER1.COM 15 | nn/nn1.cluster1.com@CLUSTER1.COM 16 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/delete_princs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source list_princs.sh | egrep "host|nn|http|dn|mapred|jt|tt" > delete_list 4 | 5 | for i in `cat delete_list` 6 | do 7 | kadmin.local -q "delprinc -force $i" 8 | done 9 | 10 | rm -rf *.keytab 11 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/dn_host_list: -------------------------------------------------------------------------------- 1 | dn1.cluster1.com 2 | dn2.cluster1.com 3 | dn3.cluster1.com 4 | dn4.cluster1.com 5 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/hosts: -------------------------------------------------------------------------------- 1 | 192.168.1.10 2 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/install_krb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | yum clean all 4 | yum install -y krb5-server krb5-workstation krb5-devel pam_krb5 krb5-libs 5 | 6 | yum install rng-tools -y 7 | 8 | echo 'EXTRAOPTIONS="-r /dev/urandom"' > /etc/sysconfig/rngd 9 | service rngd restart 10 | yum install ntp -y 11 | 12 | chkconfig ntpd on 13 | chkconfig rngd on 14 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/list_princs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kadmin.local -q "listprincs" 4 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/nn_host_list: -------------------------------------------------------------------------------- 1 | nn1.cluster1.com 2 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/setup_kerberos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: The Kerberos realm does not need to be the same as the domain name. Just update the [domain_realm] section mapping correctly.
4 | echo -e "redhat\nredhat" | kdb5_util create -r NETXILLON.COM -s 5 | 6 | echo -e "redhat\nredhat" | kadmin.local -q "addprinc root/admin" 7 | 8 | kadmin.local -q "ktadd -k /var/kerberos/krb5kdc/kadm5.keytab kadmin/admin" 9 | kadmin.local -q "ktadd -k /var/kerberos/krb5kdc/kadm5.keytab kadmin/changepw" 10 | 11 | /etc/init.d/kadmin restart 12 | /etc/init.d/krb5kdc restart 13 | 14 | chkconfig krb5kdc on 15 | chkconfig kadmin on 16 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/user_host_list: -------------------------------------------------------------------------------- 1 | nn1.cluster1.com 2 | dn1.cluster1.com 3 | dn2.cluster1.com 4 | dn3.cluster1.com 5 | dn4.cluster1.com 6 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/user_list: -------------------------------------------------------------------------------- 1 | hdfs 2 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_user_mappings.txt: -------------------------------------------------------------------------------- 1 | This file talks about mapping the kerberos princial with local users. 2 | 3 | We can have a NN principal as: 4 | nn/_HOST@CLUSTER1.COM or hdfs/_HOST@CLUSTER1.COM 5 | 6 | If it is the first way, when Datanode sends the user (dnUsername), it will be as user "dn", which does not exist anywhere. So, NN will complain that the user "dn" is not part of supergroup. Which is right! 7 | 8 | For this we need to map users as below on all nodes under core-site.xml 9 | 10 | 11 | [hdfs@nn1 ~]$ hadoop org.apache.hadoop.security.HadoopKerberosName host/nn1.cluster1.com@CLUSTER1.COM 12 | Name: host/nn1.cluster1.com@CLUSTER1.COM to host 13 | 14 | [hdfs@nn1 ~]$ hadoop org.apache.hadoop.security.HadoopKerberosName dn/dn1.cluster1.com@CLUSTER1.COM 15 | Name: dn/dn1.cluster1.com@CLUSTER1.COM to dn 16 | 17 | After adding the rule as below in core-site.xml: 18 | 19 | 20 | hadoop.security.auth_to_local 21 | 22 | RULE:[2:$1/$2@$0](dn/.*@.*CLUSTER1.COM)s/.*/hdfs/ 23 | DEFAULT 24 | 25 | 26 | 27 | [hdfs@nn1 hadoop]$ hadoop org.apache.hadoop.security.HadoopKerberosName host/nn1.cluster1.com@CLUSTER1.COM 28 | Name: host/nn1.cluster1.com@CLUSTER1.COM to host 29 | [hdfs@nn1 ~]$ hadoop org.apache.hadoop.security.HadoopKerberosName dn/dn1.cluster1.com@CLUSTER1.COM 30 | Name: dn/dn1.cluster1.com@CLUSTER1.COM to hdfs 31 | 32 | See above that the user "dn" is translated to user "hdfs", which is part of the supergroup. All Good! 
33 | 34 | Good Read: https://www.cloudera.com/documentation/enterprise/5-8-x/topics/cdh_sg_kerbprin_to_sn.html 35 | -------------------------------------------------------------------------------- /Security/kerberos/krb5.conf: -------------------------------------------------------------------------------- 1 | # On all nodes, including KDC: /etc/krb5.conf 2 | 3 | [libdefaults] 4 | default_realm = CLUSTER1.COM 5 | dns_lookup_realm = false 6 | dns_lookup_kdc = false 7 | ticket_lifetime = 24h 8 | renew_lifetime = 7d 9 | forwardable = true 10 | 11 | default_tkt_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96 12 | default_tgs_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96 13 | permitted_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96 14 | 15 | #default_tkt_enctypes = des3-cbc-sha1 des-cbc-crc 16 | #default_tgs_enctypes = des3-cbc-sha1 des-cbc-crc 17 | #permitted_enctypes = des3-cbc-sha1 des-cbc-crc 18 | udp_preference_limit = 1 19 | 20 | [realms] 21 | CLUSTER1.COM = { 22 | kdc = repo.cluster1.com:88 23 | admin_server = repo.cluster1.com:749 24 | default_domain = cluster1.com 25 | } 26 | 27 | [domain_realm] 28 | .cluster1.com = CLUSTER1.COM 29 | cluster1.com = CLUSTER1.COM 30 | 31 | [logging] 32 | kdc = FILE:/var/log/krb5kdc.log 33 | admin_server = FILE:/var/log/kadmin.log 34 | default = FILE:/var/log/krb5lib.log 35 | -------------------------------------------------------------------------------- /Security/kms/kms-setup: -------------------------------------------------------------------------------- 1 | core-site.xml 2 | 3 | 4 | hadoop.security.key.provider.path 5 | kms://http@nn1.cluster1.com:16000/kms 6 | 7 | 8 | hdfs-site.xml file and make the changes shown here: 9 | 10 | dfs.encryption.key.provider.uri 11 | kms://http@nn1.cluster1.com:16000/kms 12 | 13 | 14 | /opt/cluster/hadoop/etc/hadoop/kms-env.sh: 15 | 16 | export KMS_TEMP=${KMS_HOME}/temp 17 | 18 | 19 | kms.sh start 20 | hadoop key list 21 | hadoop key create key1 22 | hadoop fs -mkdir /secure_zone 23 | hdfs crypto -createZone -keyName key1 -path /secure_zone 24 | 25 | hdfs crypto -listZones 26 | 27 | hadoop fs -put wordcount /secure_zone 28 | hadoop fs -cat /secure_zone/wordcount 29 | hadoop fs -mkdir /unsecure 30 | -------------------------------------------------------------------------------- /Security/ldap/Installation_steps: -------------------------------------------------------------------------------- 1 | yum -y install openldap compat-openldap openldap-clients openldap-servers openldap-servers-sql openldap-devel 2 | yum -y install nss-pam-ldapd pam_ldap 3 | 4 | cp /usr/share/openldap-servers/DB_CONFIG.example /var/lib/ldap/DB_CONFIG 5 | cp /usr/share/openldap-servers/slapd.conf.obsolete slapd.conf 6 | 7 | edit "slapd.conf" and add/change the lines as below: 8 | 9 | suffix "dc=cluster1,dc=com" 10 | rootdn "cn=Manager,dc=cluster1,dc=com" 11 | rootpw {SSHA}2F2+4O43lt9jnPLrh6gjJ8tIVksTSSEg 12 | 13 | The password is generated using "slappasswd" 14 | 15 | slaptest -f /etc/openldap/slapd.conf -F /etc/openldap/slapd.d/ 16 | 17 | chown -R ldap:ldap /var/run/openldap/ 18 | chown -R ldap:ldap /var/lib/ldap 19 | chown -R ldap:ldap /etc/openldap/slap.d 20 | 21 | ldapadd -f base.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 22 | ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 23 | ldapadd -f base1.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 24 | ldapadd -f users.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 25 | ldapadd -f adduser.ldif -D 
cn=Manager,dc=cluster1,dc=com -x -w redhat 26 | 27 | #PhpldapAdmin installation; need epel repo 28 | 29 | yum -y install httpd php php-ldap phpldapadmin 30 | 31 | Then change the files as below: 32 | 33 | /etc/phpldapadmin/config.php 34 | 35 | $servers->setValue('server','name','Netxillon LDAP Server'); 36 | $servers->setValue('server','host','192.168.1.254'); 37 | $servers->setValue('login','bind_id','cn=Manager,dc=cluster1,dc=com'); 38 | 39 | comment //$servers->setValue('login','attr','uid'); 40 | uncomment $servers->setValue('login','attr','dn'); 41 | 42 | Change Deny rule to Allow in the http and restart the httpd 43 | 44 | # On Client nodes: 45 | 46 | authconfig --enableldap --enableldapauth --ldapserver=repo.cluster1.com --ldapbasedn="dc=cluster1,dc=com" --enablemkhomedir --update 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /Security/ldap/addmembers.ldif: -------------------------------------------------------------------------------- 1 | dn: cn=hadoop,ou=groups,dc=cluster1,dc=com 2 | changetype: modify 3 | add: memberuid 4 | memberuid: hdfs1 5 | -------------------------------------------------------------------------------- /Security/ldap/adduser.ldif: -------------------------------------------------------------------------------- 1 | dn: uid=hdfs1,ou=users,dc=cluster1,dc=com 2 | objectClass: top 3 | objectClass: account 4 | objectClass: posixAccount 5 | objectClass: shadowAccount 6 | cn: hdfs1 7 | uid: hdfs1 8 | uidNumber: 509 9 | gidNumber: 509 10 | homeDirectory: /home/hdfs1 11 | loginShell: /bin/bash 12 | gecos: adam 13 | userPassword: {crypt}x 14 | shadowLastChange: 0 15 | shadowMax: 0 16 | shadowWarning: 0 17 | -------------------------------------------------------------------------------- /Security/ldap/base.ldif: -------------------------------------------------------------------------------- 1 | dn: dc=cluster1,dc=com 2 | objectClass: dcObject 3 | objectClass: organization 4 | dc: cluster1 5 | o : cluster1 6 | -------------------------------------------------------------------------------- /Security/ldap/base1.ldif: -------------------------------------------------------------------------------- 1 | dn:ou=groups, dc=cluster1, dc=com 2 | objectclass: top 3 | objectclass: organizationalUnit 4 | ou: groups 5 | 6 | dn:ou=people, dc=cluster1, dc=com 7 | objectclass: top 8 | objectclass: organizationalUnit 9 | ou: people 10 | -------------------------------------------------------------------------------- /Security/ldap/base2.ldif: -------------------------------------------------------------------------------- 1 | dn: dc=cluster1,dc=com 2 | objectClass: top 3 | objectclass: organization 4 | o: cluster1 5 | -------------------------------------------------------------------------------- /Security/ldap/commands: -------------------------------------------------------------------------------- 1 | ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/cosine.ldif 2 | 962 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/* 3 | 963 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/collective.ldif 4 | 964 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/core.ldif 5 | 965 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/inetorgperson.ldif 6 | 966 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/nis.ldif 7 | 967 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/ppolicy.ldif 8 | 969 ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 9 | 971 ldapadd -f 
adduser.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 10 | 972 ldappasswd -s welcome123 -W -D "cn=Manager,dc=cluster1,dc=com" -x "uid=hdfs1,ou=users,dc=cluster1,dc=com" 11 | 974 yum install pam_ldap -y 12 | 976 cat /etc/openldap/ldap.conf 13 | 989 vi /etc/openldap/ldap.conf 14 | 997 authconfig --enableldap --enableldapauth --ldapserver=repo.cluster1.com --ldapbasedn="dc=cluster1,dc=com" --enablemkhomedir --update 15 | 998 cat /etc/openldap/ldap.conf 16 | 1004 cd ldap/ 17 | 1006 ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 18 | 1008 ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 19 | 1010 ldapsearch -x -W -D "cn=Manager,dc=cluster1,dc=com" -b "uid=hdfs1,ou=users,dc=cluster1,dc=com" "(objectclass=*)" 20 | 1011 ldapsearch -x -W -D "cn=Manager,dc=cluster1,dc=com" -b "uid=hdfs1,ou=Users,dc=cluster1,dc=com" "(objectclass=*)" 21 | 1012 ldapdelete -W -D "cn=Manager,dc=cluster1,dc=com" "uid=hdfs1,ou=users,dc=cluster1,dc=com" 22 | 1013 ldapsearch -x -W -D "cn=Manager,dc=cluster1,dc=com" -b "uid=hdfs1,ou=Users,dc=cluster1,dc=com" "(objectclass=*)" 23 | 1016 ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 24 | 1017 ldapadd -f adduser.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 25 | 1020 ldapadd -f addmembers.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 26 | 1024 ldapsearch -x -W -D "cn=Manager,dc=cluster1,dc=com" -b "uid=hdfs1,ou=Users,dc=cluster1,dc=com" "(objectclass=*)" 27 | -------------------------------------------------------------------------------- /Security/ldap/groupadd.ldif: -------------------------------------------------------------------------------- 1 | dn: cn=hdfs1,ou=groups,dc=cluster1,dc=com 2 | objectClass: top 3 | objectClass: posixGroup 4 | gidNumber: 509 5 | -------------------------------------------------------------------------------- /Security/ldap/slapd.conf.obsolete: -------------------------------------------------------------------------------- 1 | # 2 | # See slapd.conf(5) for details on configuration options. 3 | # This file should NOT be world readable. 4 | # 5 | 6 | include /etc/openldap/schema/corba.schema 7 | include /etc/openldap/schema/core.schema 8 | include /etc/openldap/schema/cosine.schema 9 | include /etc/openldap/schema/duaconf.schema 10 | include /etc/openldap/schema/dyngroup.schema 11 | include /etc/openldap/schema/inetorgperson.schema 12 | include /etc/openldap/schema/java.schema 13 | include /etc/openldap/schema/misc.schema 14 | include /etc/openldap/schema/nis.schema 15 | include /etc/openldap/schema/openldap.schema 16 | include /etc/openldap/schema/ppolicy.schema 17 | include /etc/openldap/schema/collective.schema 18 | 19 | # Allow LDAPv2 client connections. This is NOT the default. 20 | allow bind_v2 21 | 22 | # Do not enable referrals until AFTER you have a working directory 23 | # service AND an understanding of referrals. 
24 | #referral ldap://root.openldap.org 25 | 26 | pidfile /var/run/openldap/slapd.pid 27 | argsfile /var/run/openldap/slapd.args 28 | 29 | # Load dynamic backend modules 30 | # - modulepath is architecture dependent value (32/64-bit system) 31 | # - back_sql.la overlay requires openldap-server-sql package 32 | # - dyngroup.la and dynlist.la cannot be used at the same time 33 | 34 | # modulepath /usr/lib/openldap 35 | # modulepath /usr/lib64/openldap 36 | 37 | # moduleload accesslog.la 38 | # moduleload auditlog.la 39 | # moduleload back_sql.la 40 | # moduleload chain.la 41 | # moduleload collect.la 42 | # moduleload constraint.la 43 | # moduleload dds.la 44 | # moduleload deref.la 45 | # moduleload dyngroup.la 46 | # moduleload dynlist.la 47 | # moduleload memberof.la 48 | # moduleload pbind.la 49 | # moduleload pcache.la 50 | # moduleload ppolicy.la 51 | # moduleload refint.la 52 | # moduleload retcode.la 53 | # moduleload rwm.la 54 | # moduleload seqmod.la 55 | # moduleload smbk5pwd.la 56 | # moduleload sssvlv.la 57 | # moduleload syncprov.la 58 | # moduleload translucent.la 59 | # moduleload unique.la 60 | # moduleload valsort.la 61 | 62 | # The next three lines allow use of TLS for encrypting connections using a 63 | # dummy test certificate which you can generate by running 64 | # /usr/libexec/openldap/generate-server-cert.sh. Your client software may balk 65 | # at self-signed certificates, however. 66 | TLSCACertificatePath /etc/openldap/certs 67 | TLSCertificateFile "\"OpenLDAP Server\"" 68 | TLSCertificateKeyFile /etc/openldap/certs/password 69 | 70 | # Sample security restrictions 71 | # Require integrity protection (prevent hijacking) 72 | # Require 112-bit (3DES or better) encryption for updates 73 | # Require 63-bit encryption for simple bind 74 | # security ssf=1 update_ssf=112 simple_bind=64 75 | 76 | # Sample access control policy: 77 | # Root DSE: allow anyone to read it 78 | # Subschema (sub)entry DSE: allow anyone to read it 79 | # Other DSEs: 80 | # Allow self write access 81 | # Allow authenticated users read access 82 | # Allow anonymous users to authenticate 83 | # Directives needed to implement policy: 84 | # access to dn.base="" by * read 85 | # access to dn.base="cn=Subschema" by * read 86 | # access to * 87 | # by self write 88 | # by users read 89 | # by anonymous auth 90 | # 91 | # if no access controls are present, the default policy 92 | # allows anyone and everyone to read anything but restricts 93 | # updates to rootdn. (e.g., "access to * by * read") 94 | # 95 | # rootdn can always read and write EVERYTHING! 96 | 97 | # enable on-the-fly configuration (cn=config) 98 | database config 99 | access to * 100 | by dn.exact="gidNumber=0+uidNumber=0,cn=peercred,cn=external,cn=auth" manage 101 | by * none 102 | 103 | # enable server status monitoring (cn=monitor) 104 | database monitor 105 | access to * 106 | by dn.exact="gidNumber=0+uidNumber=0,cn=peercred,cn=external,cn=auth" read 107 | by dn.exact="cn=Manager,dc=my-domain,dc=com" read 108 | by * none 109 | 110 | ####################################################################### 111 | # database definitions 112 | ####################################################################### 113 | 114 | database bdb 115 | suffix "dc=my-domain,dc=com" 116 | checkpoint 1024 15 117 | rootdn "cn=Manager,dc=my-domain,dc=com" 118 | # Cleartext passwords, especially for the rootdn, should 119 | # be avoided. See slappasswd(8) and slapd.conf(5) for details. 120 | # Use of strong authentication encouraged. 
121 | # rootpw secret 122 | # rootpw {crypt}ijFYNcSNctBYg 123 | 124 | # The database directory MUST exist prior to running slapd AND 125 | # should only be accessible by the slapd and slap tools. 126 | # Mode 700 recommended. 127 | directory /var/lib/ldap 128 | 129 | # Indices to maintain for this database 130 | index objectClass eq,pres 131 | index ou,cn,mail,surname,givenname eq,pres,sub 132 | index uidNumber,gidNumber,loginShell eq,pres 133 | index uid,memberUid eq,pres,sub 134 | index nisMapName,nisMapEntry eq,pres,sub 135 | 136 | # Replicas of this database 137 | #replogfile /var/lib/ldap/openldap-master-replog 138 | #replica host=ldap-1.example.com:389 starttls=critical 139 | # bindmethod=sasl saslmech=GSSAPI 140 | # authcId=host/ldap-master.example.com@EXAMPLE.COM 141 | -------------------------------------------------------------------------------- /Security/ldap/test.ldif: -------------------------------------------------------------------------------- 1 | dn: cn=Jim Bob,ou=people,dc=cluster1,dc=com 2 | objectclass: top 3 | objectclass: person 4 | objectclass: organizationalPerson 5 | objectclass: inetOrgPerson 6 | cn: Jim Bob 7 | sn: Bob 8 | mail: jimbob@example.com 9 | ou: sales 10 | -------------------------------------------------------------------------------- /Security/ldap/users.ldif: -------------------------------------------------------------------------------- 1 | dn: ou=Users,dc=cluster1,dc=com 2 | objectClass: organizationalUnit 3 | ou: Users 4 | -------------------------------------------------------------------------------- /Spark/Spark_build: -------------------------------------------------------------------------------- 1 | $ which mvn 2 | /opt/apache-maven-3.3.9/bin/mvn 3 | 4 | $ cat /etc/profile.d/maven.sh 5 | export JAVA_HOME=/usr/java/latest 6 | export M3_HOME=/opt/apache-maven-3.3.9 7 | export PATH=$JAVA_HOME/bin:${M3_HOME}/bin:/home/ec2-user/jruby-9.1.1.0/bin/:$PATH 8 | 9 | 10 | Build: 11 | 12 | $ git clone git://git.apache.org/spark.git 13 | $ cd spark 14 | $ mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -Dscala-2.11 -Phive -Phive-thriftserver -DskipTests clean package 15 | 16 | To build a distribution: 17 | 18 | ./dev/make-distribution.sh --tgz -Phadoop-2.7 -Phive -Phive-thriftserver -Pyarn -DskipTests 19 | -------------------------------------------------------------------------------- /Spark/examples.txt: -------------------------------------------------------------------------------- 1 | # To get rid of the warning on Hadoop native library 2 | export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native 3 | 4 | [hdfs@edge1 ~]$ spark-shell --master yarn --driver-memory 512m --executor-memory 512m 5 | scala> 6 | val file = sc.textFile("/test") 7 | val counts = file.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey(_ + _) 8 | counts.saveAsTextFile("/tmp/wordcount") 9 | counts.count() 10 | 11 | 12 | Examples using Python: 13 | 14 | spark-submit ~/sparkPython/square.py --master yarn --deploy-mode cluster 15 | 16 | spark-submit ~/sparkPython/wordcount.py --master yarn --deploy-mode cluster (Copy the file on which count needs to be done on hdfs path) 17 | -------------------------------------------------------------------------------- /Spark/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.master yarn 2 | spark.eventLog.enabled true 3 | spark.driver.memory 1024m 4 | spark.yarn.am.memory 1024m 5 | 6 | spark.yarn.jars hdfs://nn1.dilithium.com:9000/spark_jars/jars/* 7 | or 8 | spark.yarn.archive 
hdfs://nn1.dilithium.com:9000/spark_jars/spark-libs.jar 9 | 10 | #if using archive: $ jar cv0f spark-libs.jar -C $SPARK_HOME/jars/ . 11 | 12 | spark.serializer org.apache.spark.serializer.KryoSerializer 13 | spark.eventLog.dir hdfs://nn1.dilithium.com:9000/spark_logs 14 | spark.history.fs.logDirectory hdfs://nn1.dilithium.com:9000/spark_logs 15 | spark.history.provider org.apache.spark.deploy.history.FsHistoryProvider 16 | spark.history.fs.update.interval 10s 17 | spark.history.ui.port 18080 18 | 19 | 20 | 21 | yarn-site.xml (Tested on spark 2.2.1) 22 | 23 | 24 | 25 | yarn.nodemanager.aux-services.spark_shuffle.class 26 | org.apache.spark.network.yarn.YarnShuffleService 27 | 28 | 29 | 30 | yarn.nodemanager.aux-services.spark2_shuffle.class 31 | org.apache.spark.network.yarn.YarnShuffleService 32 | 33 | 34 | 35 | yarn.nodemanager.aux-services 36 | mapreduce_shuffle,spark_shuffle,spark2_shuffle 37 | 38 | -------------------------------------------------------------------------------- /Spark/sparkPython/erfunction.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | sc = SparkContext() 3 | log = sc.textFile("/Users/pkuma380/sparkPython/error.txt") 4 | 5 | def errorcontain(s): 6 | return "ERROR" in s 7 | f_log = log.filter(errorcontain) 8 | for line in f_log.take(10): 9 | print "Start output", line 10 | 11 | -------------------------------------------------------------------------------- /Spark/sparkPython/error.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | sc = SparkContext() 3 | log = sc.textFile("/Users/pkuma380/sparkPython/error.txt") 4 | f_log = log.filter(lambda data: "ERROR" in data) 5 | for line in f_log.take(10): 6 | print line 7 | -------------------------------------------------------------------------------- /Spark/sparkPython/error.txt: -------------------------------------------------------------------------------- 1 | Spark Command: /Library/Java/JavaVirtualMachines/jdk1.8.0_60.jdk/Contents/Home//bin/java -cp /usr/local/spark/spark-1.3.1-bin-hadoop2.6/sbin/../conf:/usr/local/spark/spark-1.3.1-bin-hadoop2.6/lib/spark-assembly-1.3.1-hadoop2.6.0.jar:/usr/local/spark/spark-1.3.1-bin-hadoop2.6/lib/datanucleus-api-jdo-3.2.6.jar:/usr/local/spark/spark-1.3.1-bin-hadoop2.6/lib/datanucleus-core-3.2.10.jar:/usr/local/spark/spark-1.3.1-bin-hadoop2.6/lib/datanucleus-rdbms-3.2.9.jar -Dspark.akka.logLifecycleEvents=true -Xms512m -Xmx512m org.apache.spark.deploy.master.Master --ip BGLC02M965AFH01 --port 7077 --webui-port 8080 2 | ======================================== 3 | 4 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 5 | 16/02/17 20:19:40 INFO Master: Registered signal handlers for [TERM, HUP, INT] 6 | 16/02/17 20:20:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable 7 | 16/02/17 20:20:11 INFO SecurityManager: Changing view acls to: pkuma380 8 | 16/02/17 20:20:11 INFO SecurityManager: Changing modify acls to: pkuma380 9 | 16/02/17 20:20:11 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(pkuma380); users with modify permissions: Set(pkuma380) 10 | 16/02/17 20:20:11 INFO Slf4jLogger: Slf4jLogger started 11 | 16/02/17 20:20:11 INFO Remoting: Starting remoting 12 | 16/02/17 20:20:12 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkMaster@BGLC02M965AFH01:7077] 13 | 16/02/17 20:20:12 INFO Remoting: Remoting now listens on addresses: [akka.tcp://sparkMaster@BGLC02M965AFH01:7077] 14 | 16/02/17 20:20:12 INFO Utils: Successfully started service 'sparkMaster' on port 7077. 15 | 16/02/17 20:20:12 INFO Server: jetty-8.y.z-SNAPSHOT 16 | 16/02/17 20:20:12 INFO AbstractConnector: Started SelectChannelConnector@BGLC02M965AFH01:6066 17 | 16/02/17 20:20:12 INFO Utils: Successfully started service on port 6066. 18 | 16/02/17 20:20:12 INFO StandaloneRestServer: Started REST server for submitting applications on port 6066 19 | 16/02/17 20:20:12 INFO Master: Starting Spark master at spark://BGLC02M965AFH01:7077 20 | 16/02/17 20:20:12 INFO Master: Running Spark version 1.3.1 21 | 16/02/17 20:20:13 INFO Server: jetty-8.y.z-SNAPSHOT 22 | 16/02/17 20:20:13 INFO AbstractConnector: Started SelectChannelConnector@0.0.0.0:8080 23 | 16/02/17 20:20:13 INFO Utils: Successfully started service 'MasterUI' on port 8080. 24 | 16/02/17 20:20:13 INFO MasterWebUI: Started MasterWebUI at http://192.168.0.51:8080 25 | 16/02/17 20:20:13 INFO Master: I have been elected leader! New state: ALIVE 26 | 16/02/18 15:14:37 ERROR Master: RECEIVED SIGNAL 15: SIGTERM 27 | 16/02/18 15:14:37 ERROR Master: RECEIVED SIGNAL 15: SIGTERM 28 | -------------------------------------------------------------------------------- /Spark/sparkPython/logparser.py: -------------------------------------------------------------------------------- 1 | import sys 2 | data = "/Users/pkuma380/sparkPython/error.txt" 3 | for line in open(data): 4 | #columns = line.split(" ") 5 | #if len(columns) > 1: 6 | if '16' in line: 7 | date =line.split("(\s+)") 8 | 9 | print date 10 | -------------------------------------------------------------------------------- /Spark/sparkPython/pivot.txt: -------------------------------------------------------------------------------- 1 | userid age country number_of_calls 2 | x01 41 US 3 3 | x01 41 UK 1 4 | x01 41 CA 2 5 | x01 72 US 4 6 | x02 72 UK 6 7 | x02 72 CA 7 8 | x02 72 XX 8 9 | x02 72 XB 8 10 | x02 72 NA 9 11 | -------------------------------------------------------------------------------- /Spark/sparkPython/square.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | sc = SparkContext() 3 | #data = sc.parallelize([1,2,3,4,5]) 4 | 5 | def square(sq): 6 | return sq * sq 7 | data = sc.parallelize([1,2,3,4,5]) 8 | sq= data.map(square) 9 | for line in sq.collect(): 10 | print line 11 | 12 | -------------------------------------------------------------------------------- /Spark/sparkPython/wordcount.py: -------------------------------------------------------------------------------- 1 | from operator import add 2 | from pyspark import SparkContext 3 | sc= SparkContext() 4 | file = sc.textFile("/user/hdfs/sparkPython/wordcount.txt") 5 | word = file.flatMap(lambda x: x.split(" ")) 6 | mapword = 
word.map(lambda x: (x, 1)) 7 | reduceword = mapword.reduceByKey(add) 8 | output = reduceword.collect() 9 | nums = sc.parallelize([output]) 10 | for i in nums.collect(): 11 | print i 12 | -------------------------------------------------------------------------------- /Spark/sparkPython/wordcount.txt: -------------------------------------------------------------------------------- 1 | 49 2 | 2 3 | volumename 4 | ainduk 5 | apps 6 | axp 7 | axp.admin 8 | axp.apptests 9 | axp.hivequerylogs 10 | axp.mirror 11 | axp.mirror.home 12 | bjaya 13 | bmanikya 14 | dprichar 15 | dschexna 16 | gsing140 17 | hyalama 18 | idn 19 | idn.home 20 | kvall3 21 | kvarakan 22 | mapr.cldb.internal 23 | mapr.cluster.root 24 | mapr.hbase 25 | mapr.tmp 26 | mirror-cstonedb-vol2-test 27 | mirror-silver-datameer 28 | mysqlbcp 29 | naveenmirrortest 30 | ngupt131 31 | phari 32 | pigtemp 33 | PlatinumDR_Mysql_Backups 34 | psing141 35 | rsyncappsvrs 36 | rsynces 37 | rsyncmllab 38 | rsyncplatdrm5 39 | rsyncsilverm5 40 | rsyncsilverm7 41 | rsyncskytree 42 | rsyncstorm 43 | smanubo 44 | spark 45 | spool4 46 | spoudel 47 | twilli1 48 | ukris 49 | users 50 | vkomat 51 | zsmit3 52 | zsmit3 53 | -------------------------------------------------------------------------------- /Spark/spark_command.txt: -------------------------------------------------------------------------------- 1 | YARN Node Labels: 2 | 3 | $ spark-submit --class org.apache.spark.examples.SparkPi --queue root.prod --conf spark.yarn.am.nodeLabelExpression=spark --conf spark.yarn.executor.nodeLabelExpression=spark --executor-memory 512m --num-executors 1 --driver-memory 512m --master yarn --deploy-mode cluster /opt/cloudera/parcels/CDH/jars/spark-examples*.jar 10 4 | 5 | # https://www.ibm.com/support/pages/yarn-node-labels-label-based-scheduling-and-resource-isolation-hadoop-dev 6 | # https://docs.cloudera.com/runtime/7.0.2/yarn-allocate-resources/topics/yarn-configuring-node-labels.html 7 | # https://docs.cloudera.com/cdp-private-cloud-base/7.1.5/yarn-allocate-resources/topics/yarn-associate-node-labels-with-queues.html 8 | 9 | Start cluster, after settign ssh-passphrase from master (Only for non Yarn cluster) 10 | 11 | $ /opt/cluster/spark/sbin/stop-all.sh 12 | $ /opt/cluster/spark/sbin/start-all.sh 13 | 14 | 15 | Tip 16 | === 17 | To avoid loading assembly jar every time, set env variable as below, as copying jar to hadoop 18 | 19 | export SPARK_JAR=hdfs://nn1.dilithium.com:9000/user/hdfs/share/lib/spark-assembly-1.4.1-hadoop2.6.0.jar 20 | 21 | Submit jobs in 3 modes 22 | 23 | $ spark-submit --class org.apache.spark.examples.SparkPi /opt/cluster/spark/lib/spark-examples-1.6.1-hadoop2.2.0.jar 10 --master spark://rt1.cyrus.com:7077 24 | $ spark-submit --class org.apache.spark.examples.SparkPi /opt/cluster/spark/lib/spark-examples-1.6.1-hadoop2.2.0.jar 100 --master yarn --deploy-mode cluster 25 | $ spark-submit --class org.apache.spark.examples.SparkPi /opt/cluster/spark/lib/spark-examples-1.6.1-hadoop2.2.0.jar 100 --master yarn --deploy-mode client 26 | 27 | $ spark-submit --class org.apache.spark.examples.SparkPi /usr/lib/spark/lib/spark-examples-1.6.1-hadoop2.7.2-amzn-1.jar 100 --master yarn-master 28 | 29 | Other ways of running it 30 | ------------------------- 31 | $ spark-shell --master yarn 32 | $ spark-submit --verbose ~/sparkPython/square.py --master yarn --deploy-mode cluster 33 | $ spark-submit --verbose ~/sparkPython/square.py --master yarn-cluster --deploy-mode cluster 34 | $ spark-submit --verbose ~/sparkPython/square.py --master yarn-client 
--deploy-mode cluster 35 | 36 | -------------------------------------------------------------------------------- /Spark/spark_standalone_cluster.txt: -------------------------------------------------------------------------------- 1 | Spark Standalone Cluster Setup 2 | ============================== 3 | 4 | # nodes 5 | 6 | edge1.dilithium.com(master) 7 | edge2.dilithium.com(worker) 8 | hbm1.dilithium.com(worker) 9 | ------------------------------------- 10 | [hdfs@edge1 conf]$ cat spark-env.sh 11 | 12 | HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop 13 | 14 | SPARK_LOCAL_IP=192.168.1.18 15 | SPARK_MASTER_IP=edge1.dilithium.com 16 | 17 | export SPARK_WORKER_MEMORY=256m 18 | export SPARK_EXECUTOR_MEMORY=128m 19 | export SPARK_WORKER_INSTANCES=1 20 | export SPARK_WORKER_CORES=1 21 | export SPARK_WORKER_DIR=/home/hdfs/work/sparkdata 22 | ------------------------------------------------- 23 | [hdfs@edge2 conf]$ cat spark-env.sh 24 | 25 | HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop 26 | 27 | SPARK_LOCAL_IP=192.168.1.19 28 | SPARK_MASTER_IP=edge1.dilithium.com 29 | 30 | export SPARK_WORKER_MEMORY=256m 31 | export SPARK_EXECUTOR_MEMORY=128m 32 | export SPARK_WORKER_INSTANCES=2 33 | export SPARK_WORKER_CORES=1 34 | export SPARK_WORKER_DIR=/home/hdfs/work/sparkdata 35 | ------------------------------------------------- 36 | [hdfs@hbm1 conf]$ cat spark-env.sh 37 | 38 | HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop 39 | 40 | SPARK_LOCAL_IP=192.168.1.30 41 | SPARK_MASTER_IP=edge1.dilithium.com 42 | 43 | export SPARK_WORKER_MEMORY=256m 44 | export SPARK_EXECUTOR_MEMORY=128m 45 | export SPARK_WORKER_INSTANCES=2 46 | export SPARK_WORKER_CORES=1 47 | export SPARK_WORKER_DIR=/home/hdfs/work/sparkdata 48 | -------------------------------------------------- 49 | On Master node (edge1) 50 | 51 | [hdfs@edge1 conf]$ cat slaves 52 | # A Spark Worker will be started on each of the machines listed below. 53 | 54 | edge2.dilithium.com 55 | hbm1.dilithium.com 56 | -------------------- 57 | On all nodes in the cluster 58 | 59 | [hdfs@edge1 conf]$ cat spark-defaults.conf 60 | # Default system properties included when running spark-submit. 61 | # This is useful for setting default environmental settings.
62 | 63 | # Example: 64 | spark.master spark://edge1.dilithium.com:7077 65 | spark.eventLog.enabled true 66 | spark.serializer org.apache.spark.serializer.KryoSerializer 67 | spark.eventLog.dir hdfs://nn1.dilithium.com:9000/user/hdfs/spark_logs 68 | 69 | -------------------------------------------------------------------------------- /Spark/yarn-site.xml.spark: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | yarn.nodemanager.aux-services 8 | mapreduce_shuffle,spark_shuffle,spark2_shuffle 9 | 10 | 11 | 12 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 13 | org.apache.hadoop.mapred.ShuffleHandler 14 | 15 | 16 | 17 | yarn.nodemanager.aux-services.spark_shuffle.class 18 | org.apache.spark.network.yarn.YarnShuffleService 19 | 20 | 21 | 22 | 23 | yarn.nodemanager.aux-services.spark2_shuffle.class 24 | org.apache.spark.network.yarn.YarnShuffleService 25 | 26 | 27 | 28 | yarn.resourcemanager.resource-tracker.address 29 | rm1.dilithium.com:9001 30 | 31 | 32 | 33 | yarn.resourcemanager.scheduler.address 34 | rm1.dilithium.com:9002 35 | 36 | 37 | 38 | yarn.resourcemanager.address 39 | rm1.dilithium.com:9003 40 | 41 | 42 | # 43 | #yarn.nodemanager.local-dirs 44 | #file:/space/tmp1,file:/space/tmp2 45 | # 46 | 47 | 48 | yarn.nodemanager.resource.memory-mb 49 | 3072 50 | 51 | 52 | 53 | yarn.scheduler.minimum-allocation-mb 54 | 256 55 | 56 | 57 | 58 | yarn.scheduler.maximum-allocation-mb 59 | 3072 60 | 61 | 62 | 63 | yarn.scheduler.minimum-allocation-vcores 64 | 1 65 | 66 | 67 | 68 | yarn.scheduler.maximum-allocation-vcores 69 | 12 70 | 71 | 72 | 73 | yarn.nodemanager.resource.cpu-vcores 74 | 12 75 | 76 | 77 | 78 | 79 | yarn.nodemanager.vmem-pmem-ratio 80 | 2.1 81 | 82 | 83 | # 84 | # yarn.nodemanager.vmem-check-enabled 85 | # false 86 | # Whether virtual memory limits will be enforced for containers 87 | # 88 | 89 | 90 | yarn.log-aggregation-enable 91 | true 92 | 93 | 94 | 95 | Where to aggregate logs to. 
96 | yarn.nodemanager.remote-app-log-dir 97 | /tmp/logs 98 | 99 | 100 | 101 | yarn.log-aggregation.retain-seconds 102 | 259200 103 | 104 | 105 | 106 | yarn.log-aggregation.retain-check-interval-seconds 107 | 3600 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /Yarn_tuning/Yarn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Yarn_tuning/Yarn.pdf -------------------------------------------------------------------------------- /Yarn_tuning/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | mapreduce.map.memory.mb 3 | 768 4 | 5 | 6 | 7 | mapreduce.reduce.memory.mb 8 | 768 9 | 10 | 11 | 12 | mapreduce.map.java.opts 13 | -Xmx512m 14 | 15 | 16 | 17 | mapreduce.reduce.java.opts 18 | -Xmx512m 19 | 20 | -------------------------------------------------------------------------------- /Yarn_tuning/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | yarn.app.mapreduce.am.resource.mb 3 | 1024 4 | 5 | 6 | 7 | yarn.nodemanager.resource.memory-mb 8 | 2048 9 | 10 | 11 | 12 | yarn.scheduler.minimum-allocation-mb 13 | 512 14 | 15 | 16 | 17 | yarn.scheduler.maximum-allocation-mb 18 | 1024 19 | 20 | 21 | 22 | yarn.scheduler.minimum-allocation-vcores 23 | 1 24 | 25 | 26 | 27 | yarn.scheduler.maximum-allocation-vcores 28 | 2 29 | 30 | 31 | 32 | yarn.nodemanager.pmem-check-enabled 33 | false 34 | 35 | 36 | 37 | yarn.nodemanager.vmem-check-enabled 38 | false 39 | 40 | 41 | 42 | yarn.nodemanager.vmem-pmem-ratio 43 | 2.1 44 | 45 | 46 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /hadoop1.0/README.md: -------------------------------------------------------------------------------- 1 | For all the configurations related to Hadoop 1.x, please use the branch below 2 | 3 | https://github.com/netxillon/hadoop/tree/Hadoop1 4 | -------------------------------------------------------------------------------- /hadoop2.0/bash_profile: -------------------------------------------------------------------------------- 1 | # .bash_profile 2 | 3 | # Get the aliases and functions 4 | if [ -f ~/.bashrc ]; then 5 | .
~/.bashrc 6 | fi 7 | 8 | # User specific environment and startup programs 9 | 10 | # User specific aliases and functions 11 | 12 | #export HADOOP_HOME=/home/hadoop/hadoop-2.2.0 13 | 14 | export HADOOP_HOME=/home/hadoop/hadoop 15 | export HADOOP_MAPRED_HOME=$HADOOP_HOME 16 | export HADOOP_COMMON_HOME=$HADOOP_HOME 17 | export HADOOP_HDFS_HOME=$HADOOP_HOME 18 | export YARN_HOME=$HADOOP_HOME 19 | export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop 20 | export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop 21 | 22 | export JAVA_HOME=/usr/java/default 23 | 24 | 25 | PATH=$HADOOP_HOME/bin/:$HADOOP_HOME/sbin/:$JAVA_HOME/bin/:$PATH 26 | export PATH 27 | -------------------------------------------------------------------------------- /hadoop2.0/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fs.defaultFS 4 | hdfs://ha-nn1.hacluster1.com:9000 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /hadoop2.0/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | Namenode 2 | ======== 3 | 4 | 5 | 6 | 7 | dfs.namenode.name.dir 8 | file:/data/namenode 9 | 10 | 11 | 12 | dfs.replication 13 | 1 14 | 15 | 16 | 17 | dfs.blocksize 18 | 134217728 19 | 20 | 21 | 22 | 23 | Datanode 24 | ======== 25 | 26 | 27 | 28 | 29 | dfs.datanode.data.dir 30 | file:/data/datanode 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /hadoop2.0/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | # The framework can be local, classic or yarn 4 | 5 | 6 | mapreduce.framework.name 7 | yarn 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /hadoop2.0/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | yarn.nodemanager.aux-services 5 | mapreduce_shuffle 6 | 7 | 8 | 9 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 10 | org.apache.hadoop.mapred.ShuffleHandler 11 | 12 | 13 | 14 | yarn.resourcemanager.resource-tracker.address 15 | nn2.cluster1.com:9001 16 | 17 | 18 | 19 | yarn.resourcemanager.scheduler.address 20 | nn2.cluster1.com:9002 21 | 22 | 23 | 24 | yarn.resourcemanager.address 25 | nn2.cluster1.com:9003 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /hadoop_build64bit: -------------------------------------------------------------------------------- 1 | Build 64 bit Hadoop 2 | =================== 3 | 4 | 5 | 1. yum -y install gcc gcc-c++ openssl-devel make cmake zlib* libssl* autoconf automake libtool cyrus-sasl* libgsasl-devel* java-1.8.0-openjdk.x86_64 java-1.8.0-openjdk-devel.x86_64 6 | 7 | 2. Download Maven: wget http://mirrors.gigenet.com/apache/maven/maven-3/3.3.3/binaries/apache-maven-3.3.3-bin.tar.gz 8 | 9 | tar -zxf apache-maven-3.3.3-bin.tar.gz -C /opt/ 10 | 11 | setup maven environment 12 | 13 | [root@repo67 ~]# cat /etc/profile.d/maven.sh 14 | export JAVA_HOME=/usr/java/latest 15 | export M3_HOME=/opt/apache-maven-3.3.3 16 | export PATH=$JAVA_HOME/bin:/opt/apache-maven-3.3.3/bin:$PATH 17 | 18 | 3. Download protobuf: wget https://github.com/google/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.gz 19 | 20 | tar -xzf protobuf-2.5.0.tar.gz -C /opt 21 | 22 | cd /opt/protobuf-2.5.0/ 23 | ./configure 24 | make;make install 25 | 26 | 4. 
Download the latest stable Hadoop source code, for example hadoop-2.7.2-src.tar.gz 27 | 28 | tar -xzf hadoop-2.7.2-src.tar.gz -C /opt/ 29 | cd /opt/hadoop-2.7.2-src 30 | mvn package -Pdist,native -DskipTests -Dtar -Dmaven.skip.test=true -Dmaven.javadoc.skip=true 31 | 32 | You will see a tarball under hadoop-2.7.2-src/hadoop-dist/target/ 33 | 34 | Enjoy !! 35 | 36 | 37 | Updated for maven 3.6.3 and protobuf 3.7.1 38 | ------------------------------------------ 39 | Supported version: Hadoop 3.3.0 40 | 41 | 1. wget http://mirror.intergrid.com.au/apache/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz 42 | 43 | 2. wget https://cmake.org/files/v3.6/cmake-3.6.2.tar.gz 44 | tar -zxvf cmake-3.6.2.tar.gz 45 | cd cmake-3.6.2 46 | ./bootstrap --prefix=/usr/local 47 | make; make install 48 | PATH=/usr/local/bin:$PATH 49 | 50 | 3. wget https://github.com/protocolbuffers/protobuf/releases/download/v3.7.1/protobuf-cpp-3.7.1.tar.gz 51 | 52 | - For all versions prior to Hadoop 3.3.0, protobuf version = 2.5.0. 53 | - For Hadoop 3.x and higher, the cmake version must be greater than 3.2 54 | -------------------------------------------------------------------------------- /jumbune: -------------------------------------------------------------------------------- 1 | hadoop-env.sh 2 | 3 | export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=5677 $HADOOP_NAMENODE_OPTS" 4 | export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=5679 $HADOOP_DATANODE_OPTS" 5 | 6 | yarn-env.sh 7 | 8 | export YARN_NODEMANAGER_OPTS="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=5678 $YARN_NODEMANAGER_OPTS" 9 | export YARN_RESOURCEMANAGER_OPTS="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=5680 $YARN_RESOURCEMANAGER_OPTS" 10 | -------------------------------------------------------------------------------- /logging: -------------------------------------------------------------------------------- 1 | 2 | We can get and set the log level dynamically using the daemonlog command 3 | =================== 4 | 5 | $ hadoop daemonlog -getlevel master1.cyrus.com:50070 org.apache.hadoop.dfs.NameNode 6 | Connecting to http://master1.cyrus.com:50070/logLevel?log=org.apache.hadoop.dfs.NameNode 7 | Submitted Log Name: org.apache.hadoop.dfs.NameNode 8 | Log Class: org.apache.commons.logging.impl.Log4JLogger 9 | Effective level: INFO 10 | 11 | 12 | $ hadoop daemonlog -getlevel master1.cyrus.com:50070 org.apache.hadoop.hdfs.server.namenode.NameNode 13 | Connecting to http://master1.cyrus.com:50070/logLevel?log=org.apache.hadoop.hdfs.server.namenode.NameNode 14 | Submitted Log Name: org.apache.hadoop.hdfs.server.namenode.NameNode 15 | Log Class: org.apache.commons.logging.impl.Log4JLogger 16 | Effective level: INFO 17 | 18 | +++++++++++++ 19 | 20 | The logs are of the format /var/log/hadoop/hadoop-$HADOOP_IDENT_STRING-<daemon>-<hostname>.log 21 | 22 | Thinking of changing $HADOOP_IDENT_STRING ? 23 | 24 | Not a good idea: 25 | 26 | $HADOOP_IDENT_STRING=$USER (Do not change it to any custom value, because the PID files etc. are all tracked by it 27 | and your scripts like hadoop-daemon.sh will fail.)
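To complement the -getlevel examples above, daemonlog can also change the level of a running daemon. A minimal sketch, reusing the same NameNode host and HTTP port as above; the change is transient and reverts to the log4j.properties setting when the daemon restarts:

$ hadoop daemonlog -setlevel master1.cyrus.com:50070 org.apache.hadoop.hdfs.server.namenode.NameNode DEBUG
# confirm the new level, then set it back once done debugging
$ hadoop daemonlog -getlevel master1.cyrus.com:50070 org.apache.hadoop.hdfs.server.namenode.NameNode
$ hadoop daemonlog -setlevel master1.cyrus.com:50070 org.apache.hadoop.hdfs.server.namenode.NameNode INFO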
28 | -------------------------------------------------------------------------------- /map_scripts/job.txt: -------------------------------------------------------------------------------- 1 | $ hadoop jar contrib/streaming/hadoop-*streaming*.jar -file /home/hadoop/mapper.py -mapper /home/hadoop/mapper.py -file /home/hadoop/reducer.py -reducer /home/hadoop/reducer.py -input /input -output /output 2 | 3 | 4 | 5 | hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar -D stream.num.map.output.key.fields=2 -input /input /out -mapper /home/hadoop/mapper.sh -reducer /home/hadoop/reducer.sh -------------------------------------------------------------------------------- /map_scripts/mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | # input comes from STDIN (standard input) 6 | for line in sys.stdin: 7 | # remove leading and trailing whitespace 8 | line = line.strip() 9 | # split the line into words 10 | words = line.split() 11 | # increase counters 12 | for word in words: 13 | # write the results to STDOUT (standard output); 14 | # what we output here will be the input for the 15 | # Reduce step, i.e. the input for reducer.py 16 | # 17 | # tab-delimited; the trivial word count is 1 18 | print '%s\t%s' % (word, 1) 19 | -------------------------------------------------------------------------------- /map_scripts/mapper.sh: -------------------------------------------------------------------------------- 1 | [training@localhost steve]$ cat maptf.sh 2 | #!/bin/bash 3 | 4 | exclude="\.\,?!\-_:;\]\[\#\|\$()\"" 5 | while read split; do 6 | for word in $split; do 7 | term=`echo "${word//[$exclude]/}" | tr [:upper:] [:lower:]` 8 | if [ -n "$term" ]; then 9 | printf "%s\t%s\t%s\n" "$term" "$map_input_file" "1" 10 | fi 11 | done 12 | done -------------------------------------------------------------------------------- /map_scripts/reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from operator import itemgetter 4 | import sys 5 | 6 | current_word = None 7 | current_count = 0 8 | word = None 9 | 10 | # input comes from STDIN 11 | for line in sys.stdin: 12 | # remove leading and trailing whitespace 13 | line = line.strip() 14 | 15 | # parse the input we got from mapper.py 16 | word, count = line.split('\t', 1) 17 | 18 | # convert count (currently a string) to int 19 | try: 20 | count = int(count) 21 | except ValueError: 22 | # count was not a number, so silently 23 | # ignore/discard this line 24 | continue 25 | 26 | # this IF-switch only works because Hadoop sorts map output 27 | # by key (here: word) before it is passed to the reducer 28 | if current_word == word: 29 | current_count += count 30 | else: 31 | if current_word: 32 | # write result to STDOUT 33 | print '%s\t%s' % (current_word, current_count) 34 | current_count = count 35 | current_word = word 36 | 37 | # do not forget to output the last word if needed! 
38 | if current_word == word: 39 | print '%s\t%s' % (current_word, current_count) 40 | -------------------------------------------------------------------------------- /map_scripts/reducer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | read currterm currfile currnum 4 | while read term file num; do 5 | if [[ $term = "$currterm" ]] && [[ $file = "$currfile" ]]; then 6 | currnum=$(( currnum + num )) 7 | else 8 | printf "%s\t%s\t%s\n" "$currterm" "$currfile" "$currnum" 9 | currterm="$term" 10 | currfile="$file" 11 | currnum="$num" 12 | fi 13 | done 14 | printf "%s\t%s\t%s\n" "$currterm" "$currfile" "$currnum" -------------------------------------------------------------------------------- /zookeeper.txt: -------------------------------------------------------------------------------- 1 | Deploying ZooKeeper Cluster (Multi-Server) Setup 2 | 3 | Let’s begin installation and configuration of ZooKeeper. 4 | 5 | Step 1: Directory Structure creation, as decided in the designing section 6 | 7 | mac-book-pro:demo aman$ mkdir -p /Users/aman/zookeeper/zk-server-1 \ 8 | /Users/aman/zookeeper/zk-server-2 \ 9 | /Users/aman/zookeeper/zk-server-3 \ 10 | /Users/aman/zookeeper/zk-server-4 \ 11 | /Users/aman/zookeeper/zk-server-5 12 | 13 | mac-book-pro:demo aman$ mkdir -p /Users/aman/zookeeper/data/zk1 \ 14 | /Users/aman/zookeeper/data/zk2 \ 15 | /Users/aman/zookeeper/data/zk3 \ 16 | /Users/aman/zookeeper/data/zk4 \ 17 | /Users/aman/zookeeper/data/zk5 18 | 19 | mac-book-pro:demo aman$ mkdir -p /Users/aman/zookeeper/log/zk1 \ 20 | /Users/aman/zookeeper/log/zk2 \ 21 | /Users/aman/zookeeper/log/zk3 \ 22 | /Users/aman/zookeeper/log/zk4 \ 23 | /Users/aman/zookeeper/log/zk5 24 | 25 | Let’s take a look above created directory structure- 26 | 27 | mac-book-pro:demo aman$ tree /Users/aman/zookeeper 28 | 29 | /Users/aman/zookeeper 30 | |-data 31 | |---zk1 32 | |---zk2 33 | |---zk3 34 | |---zk4 35 | |---zk5 36 | |-log 37 | |---zk1 38 | |---zk2 39 | |---zk3 40 | |---zk4 41 | |---zk5 42 | |-zk-server-1 43 | |-zk-server-2 44 | |-zk-server-3 45 | |-zk-server-4 46 | |-zk-server-5 47 | 48 | mac-book-pro:demo aman$ 49 | 50 | Okay, looks good! 51 | 52 | Step 2: Creating a ZooKeeper Server ID, basically this file reside in the ZooKeeper data directory. Go on choose your favorite text editor 53 | 54 | # just enter a value '1' in the file. Save the file, do the same for rest of ZooKeeper 55 | mac-book-pro:demo aman$ vi /Users/aman/zookeeper/data/zk1/myid 56 | 57 | # follow the same way to fill server id 58 | vi /Users/aman/zookeeper/data/zk2/myid 59 | vi /Users/aman/zookeeper/data/zk3/myid 60 | vi /Users/aman/zookeeper/data/zk4/myid 61 | vi /Users/aman/zookeeper/data/zk5/myid 62 | 63 | Step 3: Downloading ZooKeeper Release 64 | 65 | Download a ZooKeeper from http://hadoop.apache.org/zookeeper/releases.html; this article utilize the version 3.4.4 of ZooKeeper. However same principle is applied for other version too. 
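The five myid files described in Step 2 can also be written in one pass; a minimal sketch, assuming the data directories created in Step 1:

# write server id 1..5 into each data directory
mac-book-pro:demo aman$ for i in 1 2 3 4 5; do echo "$i" > /Users/aman/zookeeper/data/zk$i/myid; done
# quick check that each file holds its server id
mac-book-pro:demo aman$ cat /Users/aman/zookeeper/data/zk*/myid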
66 | 67 | Step 4: Extract & prepare ZooKeeper for deployment 68 | 69 | mac-book-pro:demo aman$ gzip -dc ~/Downloads/soft/zookeeper-3.4.4.tar.gz | tar -xf - -C /tmp 70 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-1/ 71 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-2/ 72 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-3/ 73 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-4/ 74 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-5/ 75 | 76 | Once done don’t forget to cleanup the ‘/tmp/zookeeper-3.4.4′ 77 | 78 | Step 5: Preparing ZooKeeper configuration called ‘zoo.cfg‘ at ‘{zk-server-1}/conf/zoo.cfg‘. Here I will show you for Server 1 and perform same steps with appropriate values (clientPort, dataDir, dataLogDir) for respective ZooKeeper server. 79 | 80 | mac-book-pro:demo aman$ vi /Users/aman/zookeeper/zk-server-1/conf/zoo.cfg 81 | 82 | Place below configuration into it. 83 | 84 | 85 | # The number of milliseconds of each tick 86 | tickTime=2000 87 | 88 | # The number of ticks that the initial synchronization phase can take 89 | initLimit=10 90 | 91 | # The number of ticks that can pass between 92 | # sending a request and getting an acknowledgement 93 | syncLimit=5 94 | 95 | # the directory where the snapshot is stored. 96 | # Choose appropriately for your environment 97 | dataDir=/Users/aman/zookeeper/data/zk1 98 | 99 | # the port at which the clients will connect 100 | clientPort=2181 101 | 102 | # the directory where transaction log is stored. 103 | # this parameter provides dedicated log device for ZooKeeper 104 | dataLogDir=/Users/aman/zookeeper/log/zk1 105 | 106 | # ZooKeeper server and its port no. 107 | # ZooKeeper ensemble should know about every other machine in the ensemble 108 | # specify server id by creating 'myid' file in the dataDir 109 | # use hostname instead of IP address for convenient maintenance 110 | server.1=localhost:2888:3888 111 | server.2=localhost:2889:3889 112 | server.3=localhost:2890:3890 113 | server.4=localhost:2891:3891 114 | server.5=localhost:2892:3892 115 | -------------------------------------------------------------------------------- /zookeeper_oozie/oozie-server.txt: -------------------------------------------------------------------------------- 1 | Oozie Server Setup 2 | 3 | Copy the built binaries to the home directory as ‘oozie’ 4 | 5 | $ cd ../../ 6 | $ cp -R oozie-3.3.2/distro/target/oozie-3.3.2-distro/oozie-3.3.2/ oozie 7 | 8 | Create the required libext directory 9 | 10 | $ cd oozie 11 | $ mkdir libext 12 | 13 | Copy all the required jars from hadooplibs to the libext directory using the following command: 14 | 15 | $ cp ../oozie-3.3.2/hadooplibs/target/oozie-3.3.2-hadooplibs.tar.gz . 16 | $ tar xzvf oozie-3.3.2-hadooplibs.tar.gz 17 | $ cp oozie-3.3.2/hadooplibs/hadooplib-1.1.1.oozie-3.3.2/* libext/ 18 | 19 | Get Ext2Js – This library is not bundled with Oozie and needs to be downloaded separately. This library is used for the Oozie Web Console: 20 | 21 | $ cd libext 22 | $ wget http://extjs.com/deploy/ext-2.2.zip 23 | $ cd .. 24 | 25 | Update ../hadoop/conf/core-site.xml as follows: 26 | 27 | 28 | hadoop.proxyuser.hduser.hosts 29 | localhost 30 | 31 | 32 | hadoop.proxyuser.hduser.groups 33 | hadoop 34 | 35 | 36 | Here, ‘hduser’ is the username and it belongs to ‘hadoop’ group. 
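The proxyuser properties above are read by the NameNode and JobTracker at startup, so the change to core-site.xml has to be picked up before Oozie can impersonate users. A minimal sketch, assuming the Hadoop 1.1.1 install used here lives at ../hadoop; the refresh commands avoid a full restart, but their availability should be verified on your release:

$ cd ../hadoop
$ bin/stop-all.sh && bin/start-all.sh
# or, without a restart (verify these admin options exist on your version):
$ bin/hadoop dfsadmin -refreshSuperUserGroupsConfiguration
$ bin/hadoop mradmin -refreshSuperUserGroupsConfiguration
$ cd ../oozie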
37 | 38 | Prepare the WAR file 39 | 40 | $ ./bin/oozie-setup.sh prepare-war 41 | 42 | setting CATALINA_OPTS="$CATALINA_OPTS -Xmx1024m" 43 | 44 | INFO: Adding extension: /home/hduser/oozie/libext/commons-beanutils-1.7.0.jar 45 | INFO: Adding extension: /home/hduser/oozie/libext/commons-beanutils-core-1.8.0.jar 46 | INFO: Adding extension: /home/hduser/oozie/libext/commons-codec-1.4.jar 47 | INFO: Adding extension: /home/hduser/oozie/libext/commons-collections-3.2.1.jar 48 | INFO: Adding extension: /home/hduser/oozie/libext/commons-configuration-1.6.jar 49 | INFO: Adding extension: /home/hduser/oozie/libext/commons-digester-1.8.jar 50 | INFO: Adding extension: /home/hduser/oozie/libext/commons-el-1.0.jar 51 | INFO: Adding extension: /home/hduser/oozie/libext/commons-io-2.1.jar 52 | INFO: Adding extension: /home/hduser/oozie/libext/commons-lang-2.4.jar 53 | INFO: Adding extension: /home/hduser/oozie/libext/commons-logging-1.1.jar 54 | INFO: Adding extension: /home/hduser/oozie/libext/commons-math-2.1.jar 55 | INFO: Adding extension: /home/hduser/oozie/libext/commons-net-1.4.1.jar 56 | INFO: Adding extension: /home/hduser/oozie/libext/hadoop-client-1.1.1.jar 57 | INFO: Adding extension: /home/hduser/oozie/libext/hadoop-core-1.1.1.jar 58 | INFO: Adding extension: /home/hduser/oozie/libext/hsqldb-1.8.0.7.jar 59 | INFO: Adding extension: /home/hduser/oozie/libext/jackson-core-asl-1.8.8.jar 60 | INFO: Adding extension: /home/hduser/oozie/libext/jackson-mapper-asl-1.8.8.jar 61 | INFO: Adding extension: /home/hduser/oozie/libext/log4j-1.2.16.jar 62 | INFO: Adding extension: /home/hduser/oozie/libext/oro-2.0.8.jar 63 | INFO: Adding extension: /home/hduser/oozie/libext/xmlenc-0.52.jar 64 | 65 | New Oozie WAR file with added 'ExtJS library, JARs' at /home/hduser/oozie/oozie-server/webapps/oozie.war 66 | 67 | INFO: Oozie is ready to be started 68 | 69 | Create sharelib on HDFS 70 | 71 | $ ./bin/oozie-setup.sh sharelib create -fs hdfs://localhost:54310 72 | setting CATALINA_OPTS="$CATALINA_OPTS -Xmx1024m" 73 | the destination path for sharelib is: /user/hduser/share/lib 74 | 75 | Create the OoozieDB 76 | 77 | $ ./bin/ooziedb.sh create -sqlfile oozie.sql -run 78 | setting CATALINA_OPTS="$CATALINA_OPTS -Xmx1024m" 79 | 80 | Validate DB Connection 81 | DONE 82 | Check DB schema does not exist 83 | DONE 84 | Check OOZIE_SYS table does not exist 85 | DONE 86 | Create SQL schema 87 | DONE 88 | Create OOZIE_SYS table 89 | DONE 90 | 91 | Oozie DB has been created for Oozie version '3.3.2' 92 | 93 | The SQL commands have been written to: oozie.sql 94 | 95 | To start Oozie as a daemon use the following command: 96 | 97 | $ ./bin/oozied.sh start 98 | 99 | Setting OOZIE_HOME: /home/hduser/oozie 100 | Setting OOZIE_CONFIG: /home/hduser/oozie/conf 101 | Sourcing: /home/hduser/oozie/conf/oozie-env.sh 102 | setting CATALINA_OPTS="$CATALINA_OPTS -Xmx1024m" 103 | Setting OOZIE_CONFIG_FILE: oozie-site.xml 104 | Setting OOZIE_DATA: /home/hduser/oozie/data 105 | Setting OOZIE_LOG: /home/hduser/oozie/logs 106 | Setting OOZIE_LOG4J_FILE: oozie-log4j.properties 107 | Setting OOZIE_LOG4J_RELOAD: 10 108 | Setting OOZIE_HTTP_HOSTNAME: rohit-VirtualBox 109 | Setting OOZIE_HTTP_PORT: 11000 110 | Setting OOZIE_ADMIN_PORT: 11001 111 | Setting OOZIE_HTTPS_PORT: 11443 112 | Setting OOZIE_BASE_URL: http://rohit-VirtualBox:11000/oozie 113 | Setting CATALINA_BASE: /home/hduser/oozie/oozie-server 114 | Setting OOZIE_HTTPS_KEYSTORE_FILE: /home/hduser/.keystore 115 | Setting OOZIE_HTTPS_KEYSTORE_PASS: password 116 | Setting CATALINA_OUT: 
/home/hduser/oozie/logs/catalina.out 117 | Setting CATALINA_PID: /home/hduser/oozie/oozie-server/temp/oozie.pid 118 | 119 | Using CATALINA_OPTS: -Xmx1024m -Dderby.stream.error.file=/home/hduser/oozie/logs/derby.log 120 | Adding to CATALINA_OPTS: -Doozie.home.dir=/home/hduser/oozie -Doozie.config.dir=/home/hduser/oozie/conf -Doozie.log.dir=/home/hduser/oozie/logs -Doozie.data.dir=/home/hduser/oozie/data -Doozie.config.file=oozie-site.xml -Doozie.log4j.file=oozie-log4j.properties -Doozie.log4j.reload=10 -Doozie.http.hostname=rohit-VirtualBox -Doozie.admin.port=11001 -Doozie.http.port=11000 -Doozie.https.port=11443 -Doozie.base.url=http://rohit-VirtualBox:11000/oozie -Doozie.https.keystore.file=/home/hduser/.keystore -Doozie.https.keystore.pass=password -Djava.library.path= 121 | 122 | Using CATALINA_BASE: /home/hduser/oozie/oozie-server 123 | Using CATALINA_HOME: /home/hduser/oozie/oozie-server 124 | Using CATALINA_TMPDIR: /home/hduser/oozie/oozie-server/temp 125 | Using JRE_HOME: /usr/lib/jvm/java-6-oracle 126 | Using CLASSPATH: /home/hduser/oozie/oozie-server/bin/bootstrap.jar 127 | Using CATALINA_PID: /home/hduser/oozie/oozie-server/temp/oozie.pid 128 | 129 | To start Oozie as a foreground process use the following command: 130 | 131 | $ ./bin/oozied.sh run 132 | 133 | Check the Oozie log file logs/oozie.log to ensure Oozie started properly. 134 | 135 | Use the following command to check the status of Oozie from command line: 136 | 137 | $ ./bin/oozie admin -oozie http://localhost:11000/oozie -status 138 | System mode: NORMAL 139 | 140 | URL for the Oozie Web Console is http://localhost:11000/oozie 141 | -------------------------------------------------------------------------------- /zookeeper_oozie/zookeeper.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/zookeeper_oozie/zookeeper.txt --------------------------------------------------------------------------------
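Following on from the Oozie server setup above: with the server reporting NORMAL, a quick end-to-end check is to run one of the bundled examples. A minimal sketch, assuming the oozie-examples.tar.gz shipped in the distro directory and a job.properties already edited to point at your NameNode and JobTracker:

$ tar -xzf oozie-examples.tar.gz
$ hadoop fs -put examples examples
$ ./bin/oozie job -oozie http://localhost:11000/oozie -config examples/apps/map-reduce/job.properties -run
$ ./bin/oozie job -oozie http://localhost:11000/oozie -info <job-id printed by -run>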