├── Courses_Offered
│   ├── Advanced_Data Lake_and_Streaming.pdf
│   ├── Advanced_Hadoop_TroubleshootingVer2.0.pdf
│   ├── Advanced_Hadoop_adminstrationV2.0.pdf
│   ├── HBase_catalog_ver2.0.pdf
│   └── Hadoop_Admin_catalog.pdf
├── DNS
│   ├── dns_installation.txt
│   ├── named.txt
│   └── zones.txt
├── Flume
│   ├── commands
│   ├── flume_collector.conf
│   └── web_server_source.conf
├── HA_QJM
│   ├── core-site.xml
│   ├── hdfs-site.xml
│   ├── hdfs-site_datanode.xml
│   ├── hdfs-site_namenode.xml
│   ├── steps
│   └── zoo.cfg
├── HA_RM
│   ├── yarn-site.xml.ha
│   ├── yarn-site.xml.spark
│   └── yarn-site_nodemanager_ha.xml
├── HA_hadoop
│   ├── core-site.xml
│   └── hdfs-site.xml
├── HBase
│   ├── Optimizations
│   │   ├── HBase_yscb.txt
│   │   ├── Hbase_create_table.txt
│   │   ├── Hbase_happybase.txt
│   │   ├── Hbase_rand_gen.txt
│   │   └── Netxillon_HBase.pdf
│   ├── README.md
│   ├── backup-masters
│   ├── commands.txt
│   ├── hbase-site.txt
│   ├── hfile
│   ├── hive-mysql.txt
│   ├── hive.txt
│   ├── regions.txt
│   ├── regionservers
│   ├── replication
│   ├── tez-setup
│   └── untitled.txt
├── Hive_performance
├── Jars
│   ├── azure.tar.gz
│   ├── hadoop-azure-3.1.1.3.1.0.0-SNAPSHOT.jar
│   └── jce_policy-8.zip
├── Kafka
│   ├── commands
│   ├── kafka-env.sh
│   ├── kafka_ganglia.txt
│   ├── kafka_ganglia2.txt
│   ├── kakfa_rsyslog.txt
│   └── server.properties
├── Notes
│   ├── Benchmarking.txt
│   ├── Hadoop_lab.doc
│   ├── Hadoop_upgrade.txt
│   ├── Performance.txt
│   ├── backup.txt
│   ├── cassandra2.pdf
│   ├── class3_questions
│   ├── class4_questions
│   ├── cloudera.txt
│   ├── disk_partition
│   ├── hadoop_ports.txt
│   ├── hadoop_ports_firewall.xls
│   ├── install_hadoop.txt
│   ├── installation.txt
│   ├── pig.txt
│   ├── questions.txt
│   ├── quick-links
│   ├── quiz4.txt
│   ├── quiz7.txt
│   ├── quota.txt
│   ├── rack.txt
│   ├── remove_datanode.txt
│   ├── repo_server.txt
│   ├── scoop.txt
│   ├── sqoop.txt
│   ├── sqoop1.txt
│   └── yarn.txt
├── README.md
├── Schedulers
│   ├── capacity-scheduler.xml
│   ├── commands
│   ├── fair-scheduler.xml
│   ├── mapred-site.xml
│   ├── user-mappings.txt
│   ├── yarn-site.xml_capacity
│   └── yarn-site.xml_fair
├── Security
│   ├── README.md
│   ├── SSL_Configs
│   │   ├── CA
│   │   │   ├── README.txt
│   │   │   └── openssl.cnf
│   │   ├── commands_CA_JKS
│   │   ├── gen-certs.sh
│   │   └── hadoop_ssl_configs
│   │       ├── core-site.xml
│   │       ├── hdfs-site.xml
│   │       ├── mapred-site.xml
│   │       ├── ssl-client.xml
│   │       ├── ssl-server.xml
│   │       └── yarn-site.xml
│   ├── kerberos
│   │   ├── JT
│   │   │   ├── core-site.xml
│   │   │   ├── hdfs-site.xml
│   │   │   ├── mapred-site.xml
│   │   │   └── taskcontroller.cfg
│   │   ├── Jsvc_download.txt
│   │   ├── Namenode_Datanode
│   │   │   ├── core-site.xml
│   │   │   ├── hadoop-env.sh
│   │   │   ├── hdfs-site.xml
│   │   │   ├── mapred-site.xml
│   │   │   └── taskcontroller.cfg
│   │   ├── README.md
│   │   ├── kdc.conf
│   │   ├── kerberos_scripts
│   │   │   ├── README.md
│   │   │   ├── add_users.sh
│   │   │   ├── copy_keytabs.sh
│   │   │   ├── create_dn_princs.sh
│   │   │   ├── create_nn_princs.sh
│   │   │   ├── create_partions.sh
│   │   │   ├── create_user_keytab.sh
│   │   │   ├── delete_list
│   │   │   ├── delete_princs.sh
│   │   │   ├── dn_host_list
│   │   │   ├── hosts
│   │   │   ├── install_krb.sh
│   │   │   ├── list_princs.sh
│   │   │   ├── nn_host_list
│   │   │   ├── setup_kerberos.sh
│   │   │   ├── user_host_list
│   │   │   └── user_list
│   │   ├── kerberos_user_mappings.txt
│   │   └── krb5.conf
│   ├── kms
│   │   ├── kms-acl
│   │   └── kms-setup
│   └── ldap
│       ├── Installation_steps
│       ├── addmembers.ldif
│       ├── adduser.ldif
│       ├── base.ldif
│       ├── base1.ldif
│       ├── base2.ldif
│       ├── commands
│       ├── groupadd.ldif
│       ├── slapd.conf.obsolete
│       ├── test.ldif
│       └── users.ldif
├── Spark
│   ├── Spark_build
│   ├── examples.txt
│   ├── spark-defaults.conf
│   ├── sparkPython
│   │   ├── erfunction.py
│   │   ├── error.py
│   │   ├── error.txt
│   │   ├── logparser.py
│   │   ├── pivot.txt
│   │   ├── square.py
│   │   ├── wordcount.py
│   │   └── wordcount.txt
│   ├── spark_command.txt
│   ├── spark_standalone_cluster.txt
│   └── yarn-site.xml.spark
├── Yarn_tuning
│   ├── Yarn.pdf
│   ├── mapred-site.xml
│   └── yarn-site.xml
├── _config.yml
├── hadoop1.0
│   └── README.md
├── hadoop2.0
│   ├── bash_profile
│   ├── core-site.xml
│   ├── hdfs-site.xml
│   ├── mapred-site.xml
│   └── yarn-site.xml
├── hadoop_build64bit
├── jumbune
├── logging
├── map_scripts
│   ├── job.txt
│   ├── mapper.py
│   ├── mapper.sh
│   ├── reducer.py
│   └── reducer.sh
├── zookeeper.txt
└── zookeeper_oozie
    ├── oozie-server.txt
    └── zookeeper.txt
/Courses_Offered/Advanced_Data Lake_and_Streaming.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/Advanced_Data Lake_and_Streaming.pdf
--------------------------------------------------------------------------------
/Courses_Offered/Advanced_Hadoop_TroubleshootingVer2.0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/Advanced_Hadoop_TroubleshootingVer2.0.pdf
--------------------------------------------------------------------------------
/Courses_Offered/Advanced_Hadoop_adminstrationV2.0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/Advanced_Hadoop_adminstrationV2.0.pdf
--------------------------------------------------------------------------------
/Courses_Offered/HBase_catalog_ver2.0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/HBase_catalog_ver2.0.pdf
--------------------------------------------------------------------------------
/Courses_Offered/Hadoop_Admin_catalog.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/Hadoop_Admin_catalog.pdf
--------------------------------------------------------------------------------
/DNS/dns_installation.txt:
--------------------------------------------------------------------------------
1 | DNS Installation
2 | ================
3 |
4 | # yum install bind -y
5 |
6 | # vi /etc/named.conf
7 |
  8 | Remove everything and just use the lines below (change IPs accordingly)
9 |
10 | options {
 12 |         listen-on port 53 { 192.168.1.254; };
 13 |         directory "/var/named";
18 |
19 | allow-query { any; };
20 |
21 | forwarders { 192.168.1.1; };
22 |
23 | };
24 |
25 | zone "cluster1.com" IN {
26 | type master;
27 | file "/var/named/named.hadoop.forw";
28 | };
29 |
31 | zone "1.168.192.in-addr.arpa" IN {
32 | type master;
33 | file "/var/named/named.ha.rev";
34 |
35 | };
36 |
37 | =================================================
38 |
39 | # vi /var/named/named.hadoop.forward.zone
40 |
41 | $TTL 86400
42 | @ IN SOA cluster1.com root (
43 | 42 ; serial
44 | 3H ; refresh
45 | 15M ; retry
46 | 1W ; expiry
47 | 1D ) ; minimum
48 |
49 | IN NS ns1.cluster1.com
50 | ns1 IN A 192.168.1.70
51 |
52 | nn1 IN A 192.168.1.70
53 | nn2 IN A 192.168.1.77
54 | dn1 IN A 192.168.1.71
55 | dn2 IN A 192.168.1.72
56 | dn3 IN A 192.168.1.73
57 | snn IN A 192.168.1.68
58 | jt IN A 192.168.1.69
59 | db IN A 192.168.1.99
60 | kdc IN A 192.168.1.40
61 | cm IN A 192.168.1.41
62 | base IN A 192.168.1.10
63 | cm1 IN A 192.168.1.11
64 | node1 IN A 192.168.1.12
65 | filer IN A 192.168.1.222
66 | cloudera IN A 192.168.1.151
67 | datanode IN A 192.168.1.152
68 | hadooplab IN A 192.168.1.33
69 |
70 | ===================
71 |
 72 | # Alternative zone configuration (from hadoop1.0/dns.txt)
 73 |
75 | zone "1.168.192.in-addr.arpa" IN {
76 | type master;
77 | file "/var/named/named.hadoop.rev";
78 | };
79 | ============
80 | zone files (cluster1.com) forward zone
81 | =========
82 |
83 | $TTL 1D
84 | @ IN SOA @ rname.invalid. (
85 | 0 ; serial
86 | 1D ; refresh
87 | 1H ; retry
88 | 1W ; expire
89 | 3H ) ; minimum
90 |
91 |
92 | IN NS ns1.cluster1.com.
93 | ns1 IN A 192.168.1.254
94 | repo IN A 192.168.1.254
95 |
96 | ;namenodes
97 | nn1 IN A 192.168.1.70
98 | nn2 IN A 192.168.1.71
99 |
100 | ;Datanodes
101 | dn1 IN A 192.168.1.72
102 | dn2 IN A 192.168.1.73
103 | dn3 IN A 192.168.1.74
104 | dn4 IN A 192.168.1.75
105 |
106 | ;Other Masters
107 |
108 | snn IN A 192.168.1.68
109 | jt IN A 192.168.1.69
110 | client IN A 192.168.1.99
111 | kdc IN A 192.168.1.102
112 |
113 | ;Cloudera
114 |
115 | cm IN A 192.168.1.40
116 | node1 IN A 192.168.1.41
117 | node2 IN A 192.168.1.42
118 | server IN A 192.168.1.44
119 |
120 | ;Storage
121 |
122 | filer IN A 192.168.1.253
123 |
124 | ;Databases;
125 |
126 | mynode1 IN A 192.168.1.81
127 | mynode2 IN A 192.168.1.82
128 | labserver IN A 192.168.1.14
129 | ===========
130 |
131 | # vi /var/named/named.ha.rev   (Reverse lookup zone for cluster1.com)
132 | ==============================
134 |
135 | $TTL 86400
136 | @ IN SOA ns1.cluster1.com. root.cluster1.com. (
137 | 1997022700 ; Serial
138 | 28800 ; Refresh
139 | 14400 ; Retry
140 | 3600000 ; Expire
141 | 86400 ) ; Minimum
142 |
143 | 1.168.192.in-addr.arpa. IN NS ns1.cluster1.com.
144 |
145 | 70 IN PTR nn1.cluster1.com.
146 | 40 IN PTR kdc.cluster1.com.
147 | 41 IN PTR cm.cluster1.com.
148 | 20 IN PTR node1.cluster1.com.
149 | 71 IN PTR dn1.cluster1.com.
150 | 72 IN PTR dn2.cluster1.com.
151 | 73 IN PTR dn3.cluster1.com.
152 | 10 IN PTR base.cluster1.com.
153 | 11 IN PTR cm1.cluster1.com.
154 | 12 IN PTR node1.cluster1.com.
155 | 151 IN PTR cloudera.cluster1.com.
156 | 152 IN PTR datanode.cluster1.com.
157 |
158 | ==============================================
159 |
160 | # chown -R root:named /var/named/
161 |
162 |
163 |
164 | # service named restart
165 |
166 | ===========================
167 |
168 | On all client machines
169 |
170 | # vi /etc/resolv.conf
171 |
172 | nameserver 192.168.1.254
173 |
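174 | A quick resolution check from any client (a sketch; dig comes from the bind-utils package, names/IPs as per the zone files above):
175 |
176 | # yum install bind-utils -y
177 | # dig @192.168.1.254 nn1.cluster1.com +short
178 | # dig @192.168.1.254 -x 192.168.1.70 +short
179 |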
--------------------------------------------------------------------------------
/DNS/named.txt:
--------------------------------------------------------------------------------
1 | options {
2 | listen-on port 53 { 192.168.1.254; };
3 | directory "/var/named";
4 |
5 | allow-query { any; };
6 |
7 | forwarders { 192.168.1.1; };
8 |
9 | };
10 |
11 | zone "cluster1.com" IN {
12 | type master;
13 | file "/var/named/named.hadoop.forward.zone";
14 | };
15 |
16 | ;Second Zone
17 | zone "hacluster1.com" IN {
18 | type master;
19 | file "/var/named/named.ha.forward.zone";
20 | };
21 |
22 | zone "1.168.192.in-addr.arpa" IN {
23 | type master;
24 | file "/var/named/named.ha.rev";
25 | # file "/var/named/named.hadoop.rev";
26 | };
27 |
--------------------------------------------------------------------------------
/DNS/zones.txt:
--------------------------------------------------------------------------------
1 | #Forward lookup zone
  2 | ;Note that comments may use either # or ;, and this may change in future versions
3 |
4 | $TTL 86400
5 | @ IN SOA cluster1.com root (
6 | 42 ; serial
7 | 3H ; refresh
8 | 15M ; retry
9 | 1W ; expiry
10 | 1D ) ; minimum
11 |
12 | IN NS ns1.cluster1.com
13 | ns1 IN A 192.168.1.70
14 |
15 | nn1 IN A 192.168.1.70
16 | nn2 IN A 192.168.1.77
17 | dn1 IN A 192.168.1.71
18 | dn2 IN A 192.168.1.72
19 | dn3 IN A 192.168.1.73
20 | snn IN A 192.168.1.68
21 | jt IN A 192.168.1.69
22 | db IN A 192.168.1.99
23 | kdc IN A 192.168.1.40
24 | cm IN A 192.168.1.41
25 | base IN A 192.168.1.10
26 | cm1 IN A 192.168.1.11
27 | node1 IN A 192.168.1.12
28 | filer IN A 192.168.1.222
29 | cloudera IN A 192.168.1.151
30 | datanode IN A 192.168.1.152
31 | hadooplab IN A 192.168.1.33
32 |
33 |
34 | ================
35 | # Reverse lookup Zone
36 | +++++++++++++++++++++++
37 |
38 | $TTL 86400
39 | @ IN SOA ns1.cluster1.com. root.cluster1.com. (
40 | 1997022700 ; Serial
41 | 28800 ; Refresh
42 | 14400 ; Retry
43 | 3600000 ; Expire
44 | 86400 ) ; Minimum
45 |
46 | 1.168.192.in-addr.arpa. IN NS ns1.cluster1.com.
47 |
48 | 70 IN PTR nn1.cluster1.com.
49 | 40 IN PTR kdc.cluster1.com.
50 | 41 IN PTR cm.cluster1.com.
51 | 20 IN PTR node1.cluster1.com.
52 | 71 IN PTR dn1.cluster1.com.
53 | 72 IN PTR dn2.cluster1.com.
54 | 73 IN PTR dn3.cluster1.com.
55 | 10 IN PTR base.cluster1.com.
56 | 11 IN PTR cm1.cluster1.com.
57 | 12 IN PTR node1.cluster1.com.
58 | 151 IN PTR cloudera.cluster1.com.
59 | 152 IN PTR datanode.cluster1.com.
60 |
--------------------------------------------------------------------------------
/Flume/commands:
--------------------------------------------------------------------------------
1 | # Start Collector first:
2 |
3 | flume-ng agent -c conf -f flume/conf/flume_collector.conf -n collector (Name must match the agent name)
4 | flume-ng agent -c conf -f flume/conf/web_server_source.conf -n source_agent (Name must match agent name)
5 |
6 |
7 | # Change the hostnames/IPs in the config files accordingly.
8 |
9 |
10 | #Note:
 11 | - New deployments do not use Flume much any more; Kafka now handles a lot of these use cases.
 12 | - But Flume is still a good fit for log aggregation etc., and it avoids the overhead of a Kafka cluster.
13 |
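 14 | # To verify events are flowing (a sketch; the paths below are the sink locations from flume_collector.conf):
 15 | #   ls -l /var/log/flume-ng/
 16 | #   hdfs dfs -ls -R /user/hadoop/flume-channel/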
--------------------------------------------------------------------------------
/Flume/flume_collector.conf:
--------------------------------------------------------------------------------
1 | #http://flume.apache.org/FlumeUserGuide.html#avro-source
2 | collector.sources = AvroIn
3 | collector.sources.AvroIn.type = avro
4 | collector.sources.AvroIn.bind = 192.168.1.109
5 | collector.sources.AvroIn.port = 4545
6 | collector.sources.AvroIn.channels = mc1 mc2
7 |
8 | ## Channels ##
9 | ## Source writes to 2 channels, one for each sink
10 | collector.channels = mc1 mc2
11 |
12 | #http://flume.apache.org/FlumeUserGuide.html#memory-channel
13 |
14 | collector.channels.mc1.type = memory
15 | collector.channels.mc1.capacity = 100
16 |
17 | collector.channels.mc2.type = memory
18 | collector.channels.mc2.capacity = 100
19 |
20 | ## Sinks ##
21 | collector.sinks = LocalOut HadoopOut
22 |
23 | ## Write copy to Local Filesystem
24 | #http://flume.apache.org/FlumeUserGuide.html#file-roll-sink
25 | collector.sinks.LocalOut.type = file_roll
26 | collector.sinks.LocalOut.sink.directory = /var/log/flume-ng
27 | collector.sinks.LocalOut.sink.rollInterval = 0
28 | collector.sinks.LocalOut.channel = mc1
29 |
30 | ## Write to HDFS
31 | #http://flume.apache.org/FlumeUserGuide.html#hdfs-sink
32 | collector.sinks.HadoopOut.type = hdfs
33 | collector.sinks.HadoopOut.channel = mc2
34 | collector.sinks.HadoopOut.hdfs.path = /user/hadoop/flume-channel/%{log_type}/%y%m%d
35 | collector.sinks.HadoopOut.hdfs.fileType = DataStream
36 | collector.sinks.HadoopOut.hdfs.writeFormat = Text
37 | collector.sinks.HadoopOut.hdfs.rollSize = 0
38 | collector.sinks.HadoopOut.hdfs.rollCount = 10000
39 | collector.sinks.HadoopOut.hdfs.rollInterval = 600
40 |
41 |
--------------------------------------------------------------------------------
/Flume/web_server_source.conf:
--------------------------------------------------------------------------------
1 | # Source Config
2 |
3 | source_agent.sources = apache_server
4 | source_agent.sources.apache_server.type = exec
5 | source_agent.sources.apache_server.command = tail -f /var/log/httpd/access_log
6 | source_agent.sources.apache_server.batchSize = 1
7 | source_agent.sources.apache_server.channels = mc1
8 | source_agent.sources.apache_server.interceptors = itime ihost itype
9 |
10 | # http://flume.apache.org/FlumeUserGuide.html#timestamp-interceptor
11 | source_agent.sources.apache_server.interceptors.itime.type = timestamp
12 |
13 | # http://flume.apache.org/FlumeUserGuide.html#host-interceptor
14 | source_agent.sources.apache_server.interceptors.ihost.type = host
15 | source_agent.sources.apache_server.interceptors.ihost.useIP = false
16 | source_agent.sources.apache_server.interceptors.ihost.hostHeader = host
17 |
18 | # http://flume.apache.org/FlumeUserGuide.html#static-interceptor
19 | source_agent.sources.apache_server.interceptors.itype.type = static
20 | source_agent.sources.apache_server.interceptors.itype.key = log_type
21 | source_agent.sources.apache_server.interceptors.itype.value = apache_access_combined
22 |
23 | # http://flume.apache.org/FlumeUserGuide.html#memory-channel
24 | source_agent.channels = mc1
25 | source_agent.channels.mc1.type = memory
26 | source_agent.channels.mc1.capacity = 100
27 |
28 | ## Send to Flume Collector on Hadoop Node
29 | # http://flume.apache.org/FlumeUserGuide.html#avro-sink
30 | source_agent.sinks = avro_sink
31 | source_agent.sinks.avro_sink.type = avro
32 | source_agent.sinks.avro_sink.channel = mc1
33 | source_agent.sinks.avro_sink.hostname = 192.168.1.109
34 | source_agent.sinks.avro_sink.port = 4545
35 |
36 | #source_agent.sinks = LocalOut
37 | #source_agent.sinks.LocalOut.type = file_roll
38 | #source_agent.sinks.LocalOut.sink.directory = /tmp/flume
39 | #source_agent.sinks.LocalOut.sink.rollInterval = 0
40 | #source_agent.sinks.LocalOut.channel = mc1
41 |
--------------------------------------------------------------------------------
/HA_QJM/core-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | fs.defaultFS
8 | hdfs://netxillon
9 |
10 |
11 |
12 | ha.zookeeper.quorum
13 | nn1.dilithium.com:2181,nn2.dilithium.com:2181,hbm1.dilithium.com:2181
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/HA_QJM/hdfs-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | dfs.namenode.name.dir
5 | file:/data/n1,file:/data/n2
6 |
7 |
8 |
9 | dfs.replication
10 | 2
11 |
12 |
13 |
14 | dfs.blocksize
15 | 268435456
16 |
17 |
18 |
19 | dfs.nameservices
20 | netxillon
21 |
22 |
23 |
24 | dfs.ha.namenodes.netxillon
25 | nn1,nn2
26 |
27 |
28 |
29 | dfs.namenode.rpc-address.netxillon.nn1
30 | nn1.dilithium.com:9000
31 |
32 |
33 |
34 | dfs.namenode.rpc-address.netxillon.nn2
35 | nn2.dilithium.com:9000
36 |
37 |
38 |
39 | dfs.namenode.http-address.netxillon.nn1
40 | nn1.dilithium.com:50070
41 |
42 |
43 |
44 | dfs.namenode.http-address.netxillon.nn2
45 | nn2.dilithium.com:50070
46 |
47 |
48 |
49 | dfs.ha.automatic-failover.enabled
50 | true
51 |
52 |
53 |
54 | dfs.journalnode.edits.dir
55 | /data/netxillon
56 |
57 |
58 |
59 | dfs.namenode.shared.edits.dir
60 | qjournal://nn1.dilithium.com:8485;nn2.dilithium.com:8485;hbm1.dilithium.com:8485/netxillon
61 |
62 |
63 |
64 | dfs.client.failover.proxy.provider.netxillon
65 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
66 |
67 |
68 |
69 | dfs.ha.fencing.ssh.private-key-files
70 | /home/hadoop/.ssh/id_rsa
71 |
72 |
73 |
74 | dfs.ha.fencing.methods
75 | sshfence
76 | shell(/bin/true)
77 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
/HA_QJM/hdfs-site_datanode.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | dfs.datanode.data.dir
5 | file:/space/d1,file:/space/d2
6 |
7 |
8 |
9 | dfs.replication
10 | 2
11 |
12 |
13 |
14 | dfs.blocksize
15 | 268435456
16 |
17 |
18 |
19 | dfs.nameservices
20 | netxillon
21 |
22 |
23 |
24 | dfs.ha.namenodes.netxillon
25 | nn1,nn2
26 |
27 |
28 |
29 | dfs.namenode.rpc-address.netxillon.nn1
30 | nn1.dilithium.com:9000
31 |
32 |
33 |
34 | dfs.namenode.rpc-address.netxillon.nn2
35 | nn2.dilithium.com:9000
36 |
37 |
38 |
39 | dfs.client.failover.proxy.provider.netxillon
40 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/HA_QJM/hdfs-site_namenode.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | dfs.namenode.name.dir
5 | file:/data/n1,file:/data/n2
6 |
7 |
8 |
9 | dfs.replication
10 | 2
11 |
12 |
13 |
14 | dfs.blocksize
15 | 268435456
16 |
17 |
18 |
19 | dfs.nameservices
20 | netxillon
21 |
22 |
23 |
24 | dfs.ha.namenodes.netxillon
25 | nn1,nn2
26 |
27 |
28 |
29 | dfs.namenode.rpc-address.netxillon.nn1
30 | nn1.dilithium.com:9000
31 |
32 |
33 |
34 | dfs.namenode.rpc-address.netxillon.nn2
35 | nn2.dilithium.com:9000
36 |
37 |
38 |
39 | dfs.namenode.http-address.netxillon.nn1
40 | nn1.dilithium.com:50070
41 |
42 |
43 |
44 | dfs.namenode.http-address.netxillon.nn2
45 | nn2.dilithium.com:50070
46 |
47 |
48 |
49 | dfs.ha.automatic-failover.enabled
50 | true
51 |
52 |
53 |
54 | dfs.journalnode.edits.dir
55 | /data/netxillon
56 |
57 |
58 |
59 | dfs.namenode.shared.edits.dir
60 | qjournal://nn1.dilithium.com:8485;nn2.dilithium.com:8485;hbm1.dilithium.com:8485/netxillon
61 |
62 |
63 |
64 | dfs.client.failover.proxy.provider.netxillon
65 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
66 |
67 |
68 |
69 | dfs.ha.fencing.ssh.private-key-files
70 | /home/hadoop/.ssh/id_rsa
71 |
72 |
73 |
74 | dfs.ha.fencing.methods
75 | sshfence
76 | shell(/bin/true)
77 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
/HA_QJM/steps:
--------------------------------------------------------------------------------
1 | QJM Steps
2 | =========
3 | 1. Setup zookeeper quorum and make sure that it is healthy
4 | - ./zookeeper-3.4.5/bin/zkServer.sh start
5 | - ./zookeeper-3.4.5/bin/zkCli.sh -server nn1.dilithium.com:2181
6 |
7 | or
8 |
9 | [hdfs@nn2 ~]$ zkServer.sh status
10 | ZooKeeper JMX enabled by default
11 | Using config: /opt/cluster/zoo/bin/../conf/zoo.cfg
12 | Mode: follower
13 |
 14 | 2. Set up the core-site and hdfs-site files on each node as given.
15 |
16 | 3. Start Journalnode on all journal node machines
17 | - hadoop-daemon.sh start journalnode
18 |
 19 | 4. Format namenode (Do not run this command if you already have a NN running and want to convert it to HA)
20 | - hdfs namenode -format
21 |
 22 | 5. Initialize the shared edits so that the Journal nodes can see them:
23 | - hdfs namenode -initializeSharedEdits -force
24 |
 25 | Note:
 26 | - The namenode must be stopped for this step.
 27 | - Only run this if you have not executed step 4, i.e. there was already a single Namenode
 28 |   and you now need to initialize the shared edits for the Journal nodes.
 29 | - Do not run the initialize command if you are formatting the Namenode, as formatting
 30 |   automatically initializes the JournalNode directories.
31 |
32 |
33 | 6. Format zkFC
34 | - hdfs zkfc -formatZK -force
35 |
36 | 7. Start namenode
37 | - hadoop-daemon.sh start namenode
38 |
39 | 8. Start ZKFC
40 | - hadoop-daemon.sh start zkfc
41 |
42 | 9. Bootstrap StandbyNamenode
43 | - hdfs namenode -bootstrapStandby
44 |
45 | 10. Start Namenode on standby
46 | - hadoop-daemon.sh start namenode
47 |
 48 | 11. Start ZKFC on standby
49 | - hadoop-daemon.sh start zkfc
50 |
51 | Now your cluster is HA with one active Namenode
52 |
53 | [hdfs@nn1 ~]$ hdfs haadmin -getServiceState nn1
54 | active
55 | [hdfs@nn1 ~]$ hdfs haadmin -getServiceState nn2
56 | standby
57 |
58 | The "start-dfs.sh" script understands that it is a HA with Journal nodes and automatically starts:
59 | - Both namenodes
60 | - All Journal nodes
61 | - Datanode nodes
62 | - Both ZkFC
63 |
64 | Note: Make sure you start the ZK quorum before hand.
65 |
66 | [hdfs@nn1 ~]$ jps
67 | 7828 Jps
68 | 7336 JournalNode
69 | 7512 DFSZKFailoverController
70 | 7162 NameNode
71 |
72 |
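 73 | To test automatic failover (a sketch; PIDs and hostnames will differ in your cluster):
 74 |  - Kill the NameNode process on the active node, e.g. kill -9 7162 on nn1
 75 |  - Within a few seconds ZKFC should promote the other node:
 76 |    hdfs haadmin -getServiceState nn2      (should now report "active")
 77 |  - Restart the killed NameNode and it comes back as standby:
 78 |    hadoop-daemon.sh start namenode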
--------------------------------------------------------------------------------
/HA_QJM/zoo.cfg:
--------------------------------------------------------------------------------
1 | [hdfs@nn2 ~]$ cat .bash_profile
2 | # .bash_profile
3 |
4 | # Get the aliases and functions
5 | if [ -f ~/.bashrc ]; then
6 | . ~/.bashrc
7 | fi
8 |
9 | # User specific environment and startup programs
10 |
11 | PATH=$PATH:$HOME/bin
12 | ZOOKEEPER_HOME=/opt/cluster/zoo
13 |
14 | PATH=$ZOOKEEPER_HOME/bin:$PATH
15 | export PATH
16 |
17 |
18 | [hdfs@nn2 ~]$ cat /opt/cluster/zoo/conf/zoo.cfg
19 |
20 | # The number of milliseconds of each tick
21 | tickTime=2000
22 |
23 | # The number of ticks that the initial synchronization phase can take
24 | initLimit=10
25 |
26 | # The number of ticks that can pass between
27 | # sending a request and getting an acknowledgement
28 | syncLimit=5
29 |
30 | # the directory where the snapshot is stored.
31 | # Choose appropriately for your environment
32 | dataDir=/opt/cluster/zookeeper/data
33 |
34 | # the port at which the clients will connect
35 | clientPort=2181
36 |
37 | maxClientCnxns=60
38 |
39 | # the directory where transaction log is stored.
40 | # this parameter provides dedicated log device for ZooKeeper
41 | dataLogDir=/opt/cluster/zookeeper/logs
42 |
43 | # ZooKeeper server and its port no.
44 | # ZooKeeper ensemble should know about every other machine in the ensemble
45 | # specify server id by creating 'myid' file in the dataDir
46 | # use hostname instead of IP address for convenient maintenance
47 | server.1=nn1.dilithium.com:2888:3888
48 | server.2=nn2.dilithium.com:2889:3889
49 | server.3=hbm1.dilithium.com:2890:3890
50 |
51 |
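 52 | # Each server also needs a myid file in dataDir matching its server.N line above, e.g. (a sketch):
 53 | #   on nn1 : echo 1 > /opt/cluster/zookeeper/data/myid
 54 | #   on nn2 : echo 2 > /opt/cluster/zookeeper/data/myid
 55 | #   on hbm1: echo 3 > /opt/cluster/zookeeper/data/myid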
--------------------------------------------------------------------------------
/HA_RM/yarn-site.xml.ha:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | yarn.nodemanager.aux-services
8 | mapreduce_shuffle,spark_shuffle,spark2_shuffle
9 |
10 |
11 |
12 | yarn.nodemanager.aux-services.mapreduce.shuffle.class
13 | org.apache.hadoop.mapred.ShuffleHandler
14 |
15 |
16 |
17 | yarn.nodemanager.aux-services.spark_shuffle.class
18 | org.apache.spark.network.yarn.YarnShuffleService
19 |
20 |
21 |
22 | yarn.nodemanager.aux-services.spark2_shuffle.class
23 | org.apache.spark.network.yarn.YarnShuffleService
24 |
25 |
26 | # HA Configuration
27 |
28 |
29 |
30 |
31 | yarn.resourcemanager.ha.enabled
32 | true
33 |
34 |
35 |
36 | yarn.resourcemanager.cluster-id
37 | netxillon
38 |
39 |
40 |
41 | yarn.resourcemanager.ha.rm-ids
42 | rm1,rm2
43 |
44 |
45 |
46 | yarn.resourcemanager.hostname.rm1
47 | rm1.dilithium.com
48 |
49 |
50 |
51 | yarn.resourcemanager.hostname.rm2
52 | rm2.dilithium.com
53 |
54 |
55 |
56 | yarn.resourcemanager.webapp.address.rm1
57 | rm1.dilithium.com:8088
58 |
59 |
60 |
61 | yarn.resourcemanager.webapp.address.rm2
62 | rm2.dilithium.com:8088
63 |
64 |
65 |
66 | yarn.resourcemanager.recovery.enabled
67 | true
68 |
69 |
70 |
71 | yarn.resourcemanager.store.class
72 | org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore
73 |
74 |
75 |
76 | yarn.resourcemanager.zk-address
77 | nn1.dilithium.com:2181,nn2.dilithium.com:2181,hbm1.dilithium.com:2181
78 |
79 |
80 |
81 | yarn.client.failover-proxy-provider
82 | org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider
83 |
84 |
85 |
99 | # End HA Configuration
100 |
101 |
102 | yarn.nodemanager.resource.memory-mb
103 | 3072
104 |
105 |
106 |
107 | yarn.scheduler.minimum-allocation-mb
108 | 256
109 |
110 |
111 |
112 | yarn.scheduler.maximum-allocation-mb
113 | 3072
114 |
115 |
116 |
117 | yarn.scheduler.minimum-allocation-vcores
118 | 1
119 |
120 |
121 |
122 | yarn.scheduler.maximum-allocation-vcores
123 | 12
124 |
125 |
126 |
127 | yarn.nodemanager.resource.cpu-vcores
128 | 12
129 |
130 |
131 |
132 |
133 | yarn.nodemanager.vmem-pmem-ratio
134 | 2.1
135 |
136 |
137 |
138 |
139 | yarn.log-aggregation-enable
140 | true
141 |
142 |
143 |
144 | Where to aggregate logs to.
145 | yarn.nodemanager.remote-app-log-dir
146 | /tmp/logs
147 |
148 |
149 |
150 | yarn.log-aggregation.retain-seconds
151 | 259200
152 |
153 |
154 |
155 | yarn.log-aggregation.retain-check-interval-seconds
156 | 3600
157 |
158 |
159 |
160 |
161 |
--------------------------------------------------------------------------------
/HA_RM/yarn-site.xml.spark:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | yarn.nodemanager.aux-services
8 | mapreduce_shuffle,spark_shuffle,spark2_shuffle
9 |
10 |
11 |
12 | yarn.nodemanager.aux-services.mapreduce.shuffle.class
13 | org.apache.hadoop.mapred.ShuffleHandler
14 |
15 |
16 |
17 | yarn.nodemanager.aux-services.spark_shuffle.class
18 | org.apache.spark.network.yarn.YarnShuffleService
19 |
20 |
21 |
22 |
23 | yarn.nodemanager.aux-services.spark2_shuffle.class
24 | org.apache.spark.network.yarn.YarnShuffleService
25 |
26 |
27 |
28 | yarn.resourcemanager.resource-tracker.address
29 | rm1.dilithium.com:9001
30 |
31 |
32 |
33 | yarn.resourcemanager.scheduler.address
34 | rm1.dilithium.com:9002
35 |
36 |
37 |
38 | yarn.resourcemanager.address
39 | rm1.dilithium.com:9003
40 |
41 |
42 | #
43 | #yarn.nodemanager.local-dirs
44 | #file:/space/tmp1,file:/space/tmp2
45 | #
46 |
47 |
48 | yarn.nodemanager.resource.memory-mb
49 | 3072
50 |
51 |
52 |
53 | yarn.scheduler.minimum-allocation-mb
54 | 256
55 |
56 |
57 |
58 | yarn.scheduler.maximum-allocation-mb
59 | 3072
60 |
61 |
62 |
63 | yarn.scheduler.minimum-allocation-vcores
64 | 1
65 |
66 |
67 |
68 | yarn.scheduler.maximum-allocation-vcores
69 | 12
70 |
71 |
72 |
73 | yarn.nodemanager.resource.cpu-vcores
74 | 12
75 |
76 |
77 |
78 |
79 | yarn.nodemanager.vmem-pmem-ratio
80 | 2.1
81 |
82 |
83 | #
84 | # yarn.nodemanager.vmem-check-enabled
85 | # false
86 | # Whether virtual memory limits will be enforced for containers
87 | #
88 |
89 |
90 | yarn.log-aggregation-enable
91 | true
92 |
93 |
94 |
95 | Where to aggregate logs to.
96 | yarn.nodemanager.remote-app-log-dir
97 | /tmp/logs
98 |
99 |
100 |
101 | yarn.log-aggregation.retain-seconds
102 | 259200
103 |
104 |
105 |
106 | yarn.log-aggregation.retain-check-interval-seconds
107 | 3600
108 |
109 |
110 |
111 |
112 |
--------------------------------------------------------------------------------
/HA_RM/yarn-site_nodemanager_ha.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | yarn.nodemanager.aux-services
8 | mapreduce_shuffle,spark_shuffle,spark2_shuffle
9 |
10 |
11 |
12 | yarn.nodemanager.aux-services.mapreduce.shuffle.class
13 | org.apache.hadoop.mapred.ShuffleHandler
14 |
15 |
16 |
17 | yarn.nodemanager.aux-services.spark_shuffle.class
18 | org.apache.spark.network.yarn.YarnShuffleService
19 |
20 |
21 |
22 | yarn.nodemanager.aux-services.spark2_shuffle.class
23 | org.apache.spark.network.yarn.YarnShuffleService
24 |
25 |
26 | # HA Configuration
27 |
28 |
29 |
30 |
31 | yarn.resourcemanager.ha.enabled
32 | true
33 |
34 |
35 |
36 | yarn.resourcemanager.cluster-id
37 | netxillon
38 |
39 |
40 |
41 | yarn.resourcemanager.ha.rm-ids
42 | rm1,rm2
43 |
44 |
45 |
46 | yarn.resourcemanager.hostname.rm1
47 | rm1.dilithium.com
48 |
49 |
50 |
51 | yarn.resourcemanager.hostname.rm2
52 | rm2.dilithium.com
53 |
54 |
55 |
56 | yarn.client.failover-proxy-provider
57 | org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider
58 |
59 |
60 | # End HA Configuration
61 |
62 |
63 | yarn.nodemanager.resource.memory-mb
64 | 3072
65 |
66 |
67 |
68 | yarn.scheduler.minimum-allocation-mb
69 | 256
70 |
71 |
72 |
73 | yarn.scheduler.maximum-allocation-mb
74 | 3072
75 |
76 |
77 |
78 | yarn.scheduler.minimum-allocation-vcores
79 | 1
80 |
81 |
82 |
83 | yarn.scheduler.maximum-allocation-vcores
84 | 12
85 |
86 |
87 |
88 | yarn.nodemanager.resource.cpu-vcores
89 | 12
90 |
91 |
92 |
93 |
94 | yarn.nodemanager.vmem-pmem-ratio
95 | 2.1
96 |
97 |
98 |
99 |
100 | yarn.log-aggregation-enable
101 | true
102 |
103 |
104 |
105 | Where to aggregate logs to.
106 | yarn.nodemanager.remote-app-log-dir
107 | /tmp/logs
108 |
109 |
110 |
111 | yarn.log-aggregation.retain-seconds
112 | 259200
113 |
114 |
115 |
116 | yarn.log-aggregation.retain-check-interval-seconds
117 | 3600
118 |
119 |
120 |
121 |
122 |
--------------------------------------------------------------------------------
/HA_hadoop/core-site.xml:
--------------------------------------------------------------------------------
1 | Both Namenodes
2 | ==============
3 |
4 |
5 |
6 |
7 | fs.defaultFS
8 | hdfs://mycluster
9 |
10 |
11 |
12 | dfs.replication
13 | 1
14 |
15 |
16 |
17 |
18 | Data Nodes
19 | ==========
20 |
21 |
22 |
23 | fs.defaultFS
24 | hdfs://mycluster
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/HA_hadoop/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | Namenodes
2 | =========
3 |
4 |
5 |
6 |
7 | dfs.namenode.name.dir
8 | file:/data/namenode
9 |
10 |
11 |
12 | dfs.nameservices
13 | mycluster
14 |
15 |
16 |
17 | dfs.ha.namenodes.mycluster
18 | nn1,nn2
19 |
20 |
21 |
22 | dfs.namenode.rpc-address.mycluster.nn1
23 | ha-nn1.hacluster1.com:9000
24 |
25 |
26 |
27 | dfs.namenode.rpc-address.mycluster.nn2
28 | ha-nn2.hacluster1.com:9000
29 |
30 |
31 |
32 | dfs.namenode.http-address.mycluster.nn1
33 | ha-nn1.hacluster1.com:50070
34 |
35 |
36 |
37 | dfs.namenode.http-address.mycluster.nn2
38 | ha-nn2.hacluster1.com:50070
39 |
40 |
41 |
42 | dfs.namenode.shared.edits.dir
43 | file:///mnt/filer
44 |
45 |
46 |
47 | dfs.client.failover.proxy.provider.mycluster
48 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
49 |
50 |
51 |
52 | dfs.ha.fencing.methods
53 | sshfence
54 |
55 |
56 |
57 | dfs.ha.fencing.ssh.private-key-files
58 | /home/hadoop/.ssh/id_rsa
59 |
60 |
61 |
62 | dfs.ha.fencing.methods
63 | sshfence
64 | shell(/bin/true)
65 |
66 |
67 |
68 |
69 | ======================
70 |
71 | Datanodes
72 | =========
73 |
74 |
75 |
76 |
77 | dfs.datanode.data.dir
78 | file:/data/datanode
79 |
80 |
81 |
82 | dfs.nameservices
83 | mycluster
84 |
85 |
86 |
87 | dfs.ha.namenodes.mycluster
88 | nn1,nn2
89 |
90 |
91 |
92 | dfs.namenode.rpc-address.mycluster.nn1
93 | ha-nn1.hacluster1.com:9000
94 |
95 |
96 |
97 | dfs.namenode.rpc-address.mycluster.nn2
98 | ha-nn2.hacluster1.com:9000
99 |
100 |
101 |
102 | dfs.client.failover.proxy.provider.mycluster
103 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
104 |
105 |
106 |
107 |
108 |
--------------------------------------------------------------------------------
/HBase/Optimizations/HBase_yscb.txt:
--------------------------------------------------------------------------------
1 | Steps:
2 |
3 | 1. tar -xzvf ycsb-0.13.0-SNAPSHOT.tar.gz
4 | 2. cd ycsb-0.13.0-SNAPSHOT
5 | 3. cp /usr/lib/hbase/lib/slf4j-api-1.6.1.jar .
6 | 4. cp /usr/lib/hbase/lib/zookeeper.jar .
7 |
8 | hbase> create 'usertable', {NAME => 'f1', VERSIONS => '1', COMPRESSION => 'SNAPPY'}
9 |
10 | 5. cd ycsb-0.13.0-SNAPSHOT/bin
11 |
12 | $ ycsb load hbase12 -P workloads/workloada -p columnfamily=f1 -p recordcount=1000000 -p threadcount=4 -s | tee -a write.txt
13 | $ ycsb load hbase12 -P workloads/workloadb -p columnfamily=f1 -p recordcount=100000 -p operationcount=10000 -p threadcount=4 -s | tee -a workloadread.dat
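 14 |
 15 | # After the load phase, the read/mixed workloads are normally driven with "ycsb run" rather than "ycsb load" (a sketch):
 16 | $ ycsb run hbase12 -P workloads/workloadb -p columnfamily=f1 -p recordcount=1000000 -p operationcount=100000 -p threadcount=4 -s | tee -a read.txt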
--------------------------------------------------------------------------------
/HBase/Optimizations/Hbase_create_table.txt:
--------------------------------------------------------------------------------
1 |
2 | hbase(main):001:0> create 'emp', 'personal data', 'professional data', {SPLITS => (1..n_splits).map {|i| "user#{1000+i*(9999-1000)/n_splits}"}}
3 |
4 | create 'emp1', 'personal data', 'professional data', {REPLICATION_SCOPE => 1}
5 |
6 |
7 | hbase(main):001:0> n_splits = 200 # HBase recommends (10 * number of regionservers)
8 | hbase(main):002:0> create 'usertable', 'family', {SPLITS => (1..n_splits).map {|i| "user#{1000+i*(9999-1000)/n_splits}"}}
9 |
10 |
11 | scan 'hbase:meta',{FILTER=>"PrefixFilter('emp1')"}
12 |
13 |
14 | Snapshots:
15 |
16 |
17 | hbase snapshot create -n snapshotName -t tableName
18 |
19 | hbase shell
20 | >> delete_snapshot 'snapshotName'
21 | >> restore_snapshot snapshotName
22 | >> list_snapshots
23 | >> clone_snapshot 'snapshotName', 'newTableName'
24 |
25 | hbase snapshot info -snapshot snapshotName
26 |
27 |
28 |
--------------------------------------------------------------------------------
/HBase/Optimizations/Hbase_happybase.txt:
--------------------------------------------------------------------------------
1 | table = connection.table('table-name')
2 |
3 | table.put(b'row-key', {b'family:qual1': b'value1',
4 | b'family:qual2': b'value2'})
5 |
6 | row = table.row(b'row-key')
7 | print(row[b'family:qual1']) # prints 'value1'
8 |
9 | for key, data in table.rows([b'row-key-1', b'row-key-2']):
10 | print(key, data) # prints row key and data for each row
11 |
12 | for key, data in table.scan(row_prefix=b'row'):
13 | print(key, data) # prints 'value1' and 'value2'
14 |
15 | row = table.delete(b'row-key')
16 |
17 |
18 | families = {
19 | 'cf1': dict(max_versions=10),
20 | 'cf2': dict(max_versions=1, block_cache_enabled=False),
21 | 'cf3': dict(), # use defaults
22 | }
23 |
24 | connection.create_table('mytable', families)
--------------------------------------------------------------------------------
/HBase/Optimizations/Hbase_rand_gen.txt:
--------------------------------------------------------------------------------
  1 | hbase(main):005:0> put 'emp','1','personal data:name','raju'
  2 | hbase(main):006:0> put 'emp','1','personal data:city','hyderabad'
  3 | hbase(main):007:0> put 'emp','1','professional data:designation','manager'
  4 | hbase(main):007:0> put 'emp','1','professional data:salary','50000'
5 |
6 | locate_region 'test', '1'
7 | get_splits 'test'
8 |
9 | create 'emp', 'personal data', 'professional data'
10 | #!/bin/bash
11 |
12 | for i in `seq 1 1000000`
13 | do
14 |
15 | echo "put 'emp', '$i', 'personal data:name', 'raju$i'"
16 | echo "put 'emp', '$i', 'personal data:city', 'hyderabad$i'"
17 | echo "put 'emp', '$i', 'professional data:designation', 'manager$i'"
18 | echo "put 'emp', '$i', 'professional data:salary', '20000$i'"
19 |
20 | done
21 |
22 | # Optimized versions
23 | ====================
24 |
25 | #!/bin/bash
26 |
27 | MIN=0
28 | MAX=1234567890
29 | while
30 | for i in `seq 1 1000000`
31 | do
32 | rand=$(cat /dev/urandom | tr -dc 0-9 | fold -w${#MAX} | head -1 | sed 's/^0*//;')
 33 | [ -z $rand ] && rand=0
 34 | (( $rand < $MIN || $rand > $MAX ))
35 |
36 | echo "put 'emp', '$rand', 'personal data:name', 'raju$i'"
37 | echo "put 'emp', '$rand', 'personal data:city', 'hyderabad$i'"
38 | echo "put 'emp', '$rand', 'professional data:designation', 'manager$i'"
39 | echo "put 'emp', '$rand', 'professional data:salary', '20000$i'"
40 | done
41 | do :
42 | done
43 |
44 | =============
45 |
46 | #!/bin/bash
47 |
48 | # create 'emp', 'personal data', 'professional data'
49 |
50 | MIN=0
51 | MAX=1234567890
52 | while
53 |
54 | exp=`shuf -i 2000-65000 -n 1`
55 | #for i in `seq 1 10000000`
56 | #do
57 | rand=$(cat /dev/urandom | tr -dc 0-9 | fold -w${#MAX} | head -1 | sed 's/^0*//;')
58 | [ -z $rand ] && rand=0
59 | (( $rand < $MIN || $rand > $MAX ))
60 |
61 | echo "put 'emp', '$rand', 'personal data:name', 'raju$exp'"
62 | echo "put 'emp', '$rand', 'personal data:city', 'hyderabad$exp'"
63 | echo "put 'emp', '$rand', 'professional data:designation', 'manager$exp'"
64 | echo "put 'emp', '$rand', 'professional data:salary', '20$exp'"
65 | #done
66 | do :
67 | done
68 |
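 69 | # Usage (a sketch): the scripts above only echo put statements, so pipe their output into the hbase shell,
 70 | # e.g. ./gen_emp.sh | hbase shell   (the script name here is just an example)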
--------------------------------------------------------------------------------
/HBase/Optimizations/Netxillon_HBase.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/HBase/Optimizations/Netxillon_HBase.pdf
--------------------------------------------------------------------------------
/HBase/README.md:
--------------------------------------------------------------------------------
1 | export HADOOP_ROOT_LOGGER=TRACE,console; export HADOOP_JAAS_DEBUG=true; export HADOOP_OPTS="-Dsun.security.krb5.debug=true"
2 |
  3 | export HBASE_ROOT_LOGGER=DEBUG,console
4 |
--------------------------------------------------------------------------------
/HBase/backup-masters:
--------------------------------------------------------------------------------
1 | dn2.cluster1.com
2 |
--------------------------------------------------------------------------------
/HBase/commands.txt:
--------------------------------------------------------------------------------
1 | start-hbase.sh
2 | stop-hbase.sh
3 |
4 | hbase shell;
5 |
6 |
7 | create 't1', {NAME => 'f1', VERSIONS => 5}
8 |
9 | describe 't1'
10 |
11 |
12 | create 'class', 'cf'
13 |
14 | put 'class', 'row1', 'cf:a', 'value1'
15 | put 'class', 'row2', 'cf:b', 'value2'
16 | put 'class', 'row3', 'cf:c', 'value3'
 17 | scan 'class'
18 |
19 | put 'test', 'row1', 'cf:a', 'value1'
20 | put 'test', 'row3', 'cf:c', 'value3'
21 |
--------------------------------------------------------------------------------
/HBase/hbase-site.txt:
--------------------------------------------------------------------------------
1 |
2 | hbase.master
3 | client.cluster1.com:60000
4 |
5 |
6 |
7 | hbase.rootdir
8 | hdfs://nn1.cluster1.com:9000/hbase
9 |
10 |
11 |
12 | hbase.cluster.distributed
13 | true
14 |
15 |
16 |
17 | hbase.zookeeper.quorum
18 | dn1.cluster1.com,dn2.cluster1.com
19 |
20 |
21 |
22 |
23 | hbase.zookeeper.property.clientPort
24 | 2181
25 |
26 |
27 | Standalone Only
28 | ===============
29 |
30 |
31 |
32 | hbase.rootdir
33 | file:///home/hadoop/hdata
34 |
35 |
36 |
37 | hbase.zookeeper.property.dataDir
38 | /home/hadoop/zookeeper
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/HBase/hfile:
--------------------------------------------------------------------------------
1 | [hdfs@edge1 conf]$ hbase org.apache.hadoop.hbase.io.hfile.HFile -v -f /hbase/data/default/class/9c4785ae4c37c4679773326224d7f5fe/cf/e90064527dda44d1b09f12b36edb4dd7
2 | SLF4J: Class path contains multiple SLF4J bindings.
3 | SLF4J: Found binding in [jar:file:/opt/cluster/hbase-1.0.1.1/lib/slf4j-log4j12-1.7.7.jar!/org/slf4j/impl/StaticLoggerBinder.class]
4 | SLF4J: Found binding in [jar:file:/opt/cluster/hadoop-2.6.0/share/hadoop/common/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
5 | SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
6 | SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
7 | Scanning -> /hbase/data/default/class/9c4785ae4c37c4679773326224d7f5fe/cf/e90064527dda44d1b09f12b36edb4dd7
8 | 2016-01-26 00:06:50,390 INFO [main] hfile.CacheConfig: CacheConfig:disabled
9 | Scanned kv count -> 5
10 | [hdfs@edge1 conf]$ hbase org.apache.hadoop.hbase.io.hfile.HFile -v -f hdfs://nn1.dilithium.com:9000/hbase/data/default/class/9c4785ae4c37c4679773326224d7f5fe/cf/e90064527dda44d1b09f12b36edb4dd7
11 | SLF4J: Class path contains multiple SLF4J bindings.
12 | SLF4J: Found binding in [jar:file:/opt/cluster/hbase-1.0.1.1/lib/slf4j-log4j12-1.7.7.jar!/org/slf4j/impl/StaticLoggerBinder.class]
13 | SLF4J: Found binding in [jar:file:/opt/cluster/hadoop-2.6.0/share/hadoop/common/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
14 | SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
15 | SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
16 | Scanning -> hdfs://nn1.dilithium.com:9000/hbase/data/default/class/9c4785ae4c37c4679773326224d7f5fe/cf/e90064527dda44d1b09f12b36edb4dd7
17 | 2016-01-26 00:07:16,371 INFO [main] hfile.CacheConfig: CacheConfig:disabled
18 | Scanned kv count -> 5
19 |
--------------------------------------------------------------------------------
/HBase/hive-mysql.txt:
--------------------------------------------------------------------------------
1 | mysql> CREATE DATABASE metastore_db;
2 | Query OK, 1 row affected (0.00 sec)
3 |
4 | mysql> CREATE USER 'hadoop'@'%' IDENTIFIED BY 'hivepassword';
5 | Query OK, 0 rows affected (0.00 sec)
6 |
7 | mysql> GRANT all on *.* to 'hadoop'@client.cluster1.com identified by 'hivepassword';
8 | Query OK, 0 rows affected (0.00 sec)
9 |
10 | mysql> flush privileges;
11 | Query OK, 0 rows affected (0.00 sec)
12 |
13 | ====================
14 |
15 |
16 |
17 |
18 |
19 | hive.metastore.local
20 | true
21 |
22 |
23 |
24 |
25 |
26 |
27 | javax.jdo.option.ConnectionURL
28 |
29 | jdbc:mysql://client.cluster1.com:3306/metastore_db?createDatabaseIfNotExist=true
30 |
31 | metadata is stored in a MySQL server
32 |
33 |
34 |
35 |
36 |
37 | javax.jdo.option.ConnectionDriverName
38 |
39 | com.mysql.jdbc.Driver
40 |
41 | MySQL JDBC driver class
42 |
43 |
44 |
45 |
46 |
47 | javax.jdo.option.ConnectionUserName
48 |
49 | hadoop
50 |
51 | user name for connecting to mysql server
52 |
53 |
54 |
55 |
56 |
57 | javax.jdo.option.ConnectionPassword
58 |
59 | hivepassword
60 |
61 | password for connecting to mysql server
62 |
63 |
64 |
65 |
66 | ===================
67 | Start hive Server
68 | ===================
69 |
70 | hive --service hiveserver2&
71 |
72 | Start a Separate Metastore Service
73 | -----------------------------------
74 |
75 |
76 | hive.metastore.uris
 77 | thrift://<metastore_host>:9083
78 | IP address (or fully-qualified domain name) and port of the metastore host
79 |
80 |
81 |
82 | hive.metastore.schema.verification
83 | true
84 |
85 |
86 | hive --service metastore&
87 |
88 | mysql> use metastore_db;
89 | Reading table information for completion of table and column names
90 | You can turn off this feature to get a quicker startup with -A
91 |
92 | Database changed
93 | mysql> show tables;
94 | +---------------------------+
95 | | Tables_in_metastore_db |
96 | +---------------------------+
97 | | BUCKETING_COLS |
98 | | CDS |
99 | | COLUMNS_V2 |
100 | | DATABASE_PARAMS |
101 | | DBS |
102 | | PARTITION_KEYS |
103 | | SDS |
104 | | SD_PARAMS |
105 | | SEQUENCE_TABLE |
106 | | SERDES |
107 | | SERDE_PARAMS |
108 | | SKEWED_COL_NAMES |
109 | | SKEWED_COL_VALUE_LOC_MAP |
110 | | SKEWED_STRING_LIST |
111 | | SKEWED_STRING_LIST_VALUES |
112 | | SKEWED_VALUES |
113 | | SORT_COLS |
114 | | TABLE_PARAMS |
115 | | TBLS |
116 | | VERSION |
117 | +---------------------------+
118 | 20 rows in set (0.00 sec)
119 |
120 | mysql> show databases;
121 | +--------------------+
122 | | Database |
123 | +--------------------+
124 | | information_schema |
125 | | employee |
126 | | metastore_db |
127 | | mysql |
128 | | test |
129 | +--------------------+
130 | 5 rows in set (0.00 sec)
131 |
132 | mysql> select * from TBLS;
133 | +--------+-------------+-------+------------------+--------+-----------+-------+-------------+---------------+--------------------+--------------------+
134 | | TBL_ID | CREATE_TIME | DB_ID | LAST_ACCESS_TIME | OWNER | RETENTION | SD_ID | TBL_NAME | TBL_TYPE | VIEW_EXPANDED_TEXT | VIEW_ORIGINAL_TEXT |
135 | +--------+-------------+-------+------------------+--------+-----------+-------+-------------+---------------+--------------------+--------------------+
136 | | 1 | 1403283170 | 1 | 0 | hadoop | 0 | 1 | hivetesting | MANAGED_TABLE | NULL | NULL |
137 | +--------+-------------+-------+------------------+--------+-----------+-------+-------------+---------------+--------------------+--------------------+
138 | 1 row in set (0.00 sec)
139 |
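140 | Quick check that the metastore schema is in place (a sketch; dbType and connection details as configured above):
141 |
142 | /usr/lib/hive/bin/schematool -dbType mysql -info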
--------------------------------------------------------------------------------
/HBase/hive.txt:
--------------------------------------------------------------------------------
1 | export JAVA_HOME=/usr/java/jdk1.7.0_25/
2 | export HIVE_HOME=/home/hadoop/hive/
3 | export HBASE_HOME=/home/hadoop/hbase/
4 |
5 | PATH=$PATH:$HOME/bin
  6 | PATH=$PATH:/home/hadoop/hadoop/bin:$JAVA_HOME/bin:$HIVE_HOME/bin:$HBASE_HOME/bin
7 |
8 | export PIG_HOME=/home/hadoop/pig
9 | export PIG_INSTALL=/home/hadoop/pig
10 |
11 | export HIVE_HOME=/home/hadoop/hive
12 | export HBASE_HOME=/home/hadoop/hbase
13 |
14 | =============
15 |
16 |
17 | $ hadoop fs -mkdir /tmp
18 | $ hadoop fs -mkdir /user/hive/warehouse
19 | $ hadoop fs -chmod g+w /tmp
20 | $ hadoop fs -chmod g+w /user/hive/warehouse
21 |
 22 | You must create /tmp and /user/hive/warehouse (aka hive.metastore.warehouse.dir) and set appropriate permissions in HDFS
23 |
24 | hive> SET mapred.job.tracker=myhost.mycompany.com:50030;
25 |
26 |
27 | CREATE DATABASE test_hive_db;
28 |
29 |
30 | Creating Hive Tables
31 | ==================
32 | hive> CREATE TABLE pokes (foo INT, bar STRING);
33 |
34 | LOAD DATA LOCAL INPATH './examples/files/kv1.txt' OVERWRITE INTO TABLE pokes;
35 |
36 |
37 | creates a table called pokes with two columns, the first being an integer and the other a string.
38 |
39 | =================
40 |
41 |
42 | hive> CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING);
43 |
44 |
45 | hive> LOAD DATA LOCAL INPATH './hive/examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
46 | hive> LOAD DATA LOCAL INPATH './examples/files/kv3.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-08');
47 |
48 |
49 | Loading from hdfs
50 |
51 | hive> LOAD DATA INPATH '/user/myname/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
52 |
53 |
54 | Browsing through Tables
55 |
56 | hive> SHOW TABLES;
57 |
58 | lists all the tables.
59 |
60 | hive> SHOW TABLES '.*s';
61 |
62 | hive> DESCRIBE invites;
63 |
64 | shows the list of columns.
65 | Altering and Dropping Tables
66 |
67 | Table names can be changed and columns can be added or replaced:
68 |
69 | hive> ALTER TABLE events RENAME TO 3koobecaf;
70 | hive> ALTER TABLE pokes ADD COLUMNS (new_col INT);
71 | hive> ALTER TABLE invites ADD COLUMNS (new_col2 INT COMMENT 'a comment');
72 | hive> ALTER TABLE invites REPLACE COLUMNS (foo INT, bar STRING, baz INT COMMENT 'baz replaces new_col2');
73 |
74 | Note that REPLACE COLUMNS replaces all existing columns and only changes the table's schema, not the data. The table must use a native SerDe. REPLACE COLUMNS can also be used to drop columns from the table's schema:
75 |
76 | hive> ALTER TABLE invites REPLACE COLUMNS (foo INT COMMENT 'only keep the first column');
77 |
78 | Dropping tables:
79 |
80 | hive> DROP TABLE pokes;
81 |
82 |
83 | hive> LOAD DATA LOCAL INPATH './examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
84 | hive> LOAD DATA LOCAL INPATH './examples/files/kv3.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-08');
85 | ==============
86 |
87 | CREATE TABLE tags (userId INT,movieId INT,tag STRING,time timestamp) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
88 |
89 | CREATE TABLE test (userId INT,movieId INT,tag STRING,time timestamp) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';
90 |
91 | CREATE external TABLE test1 (name STRING,Id INT,roll INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' stored as textfile location '/user/hadoop/dump';
92 |
93 | Hive VERSION Table
94 |
95 | mysql> CREATE TABLE VERSION ( VER_ID bigint(20) NOT NULL, SCHEMA_VERSION varchar(127) NOT NULL, VERSION_COMMENT varchar(255), PRIMARY KEY (VER_ID));
96 | Query OK, 0 rows affected (0.00 sec)
97 |
98 | mysql> insert into VERSION (VER_ID,SCHEMA_VERSION,VERSION_COMMENT) values (1,"0.14.0","Hive release version 0.14.0");
99 | Query OK, 1 row affected (0.00 sec)
100 |
101 | /usr/lib/hive/bin/schematool -dbType mysql -initSchema
102 |
103 |
104 | Performance tune Hive after checking stats on a table
105 |
106 | analyze table store compute statistics noscan;
107 | analyze table store compute statistics;
108 | analyze table store compute statistics for columns;
109 | ANALYZE TABLE Table1 CACHE METADATA;
110 | ANALYZE TABLE Table1 PARTITION(ds='2008-04-09', hr) COMPUTE STATISTICS NOSCAN;
111 |
--------------------------------------------------------------------------------
/HBase/regions.txt:
--------------------------------------------------------------------------------
1 | node1
2 | node2
--------------------------------------------------------------------------------
/HBase/regionservers:
--------------------------------------------------------------------------------
1 | dn1.cluster1.com
2 | dn2.cluster1.com
3 | dn3.cluster1.com
4 |
--------------------------------------------------------------------------------
/HBase/replication:
--------------------------------------------------------------------------------
1 | hbase(main):003:0> add_peer '1', CLUSTER_KEY => 'd1.aus.cloudera.site,d2.aus.cloudera.site,d3.aus.cloudera.site:2181:/hbase'
2 | hbase(main):003:0> disable_peer("1")
3 | hbase(main):003:0> enable_table_replication 'emp'
4 | hbase(main):003:0> enable_table_replication 'emp1'
5 |
6 | hbase$ hbase snapshot create -n emp1_4aug -t emp1
7 |
8 | hbase$ hbase org.apache.hadoop.hbase.snapshot.ExportSnapshot -snapshot emp1_4aug -copy-to hdfs://d2.aus.cloudera.site:8020/hbase -mappers 2
9 |
10 | hbase(main):003:0> enable_peer("1")
11 |
12 |
 13 | # The above steps are to be used when the source cluster already has data/tables.
 14 | # In a new cluster with no data, we do not need to export a snapshot or disable_peer (which is done to build up a backlog of WALs).
15 |
16 |
17 |
18 | hbase(main):003:0> list_peers
19 | PEER_ID CLUSTER_KEY ENDPOINT_CLASSNAME STATE REPLICATE_ALL NAMESPACES TABLE_CFS BANDWIDTH SERIAL
20 | 1 d1.aus.cloudera.site,d2.aus.cloudera.site,d3.aus.cloudera.site:2181:/hbase ENABLED true 0 false
21 | 1 row(s)
22 | Took 0.0125 seconds
23 | => #
24 |
25 | hbase(main):004:0> list_peer_configs
26 | PeerId 1
27 | Cluster Key d1.aus.cloudera.site,d2.aus.cloudera.site,d3.aus.cloudera.site:2181:/hbase
28 |
29 | Took 0.0090 seconds
30 | => {"1"=>#}
31 |
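 32 | # To monitor replication progress from the hbase shell (a sketch):
 33 | #   status 'replication'
 34 | #   status 'replication', 'source'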
--------------------------------------------------------------------------------
/HBase/tez-setup:
--------------------------------------------------------------------------------
1 | Tez Configuration
2 |
3 | 1. Download Tez tar ball:
4 |
5 | $ su - hadoop
6 | $ wget www-us.apache.org/dist/tez/0.8.4/apache-tez-0.8.4-bin.tar.gz
7 |
  8 | Untar it in any directory and set the path to it. It should be readable by the user running Hive.
9 |
10 | $ tar xzvf apache-tez-0.8.4-bin.tar.gz
11 | $ ln -s apache-tez-0.8.4-bin tez
12 |
13 | Copy the tez tarball to a path on HDFS.
14 |
15 | $ hadoop fs -mkdir -p /apps/tez
16 | $ hadoop fs -put tez/share/tez.tar.gz /apps/tez
17 | $ hadoop fs -put hive/lib/hive-exec-1.2.2.jar /apps/tez
18 |
19 | $ vi tez/conf/tez-site.xml
20 |
21 |
22 |
23 |
24 | tez.lib.uris
 25 | /apps/tez/tez.tar.gz "This path is the HDFS path; it can be specified using the hdfs://path syntax as well"
26 |
27 |
28 |
29 | tez.am.resource.memory.mb
30 | 2048
31 |
32 |
33 |
34 |
35 | Set ENV
36 |
37 | vi /etc/profile.d/hadoopenv.sh or .bash_profile
38 |
39 | export TEZ_CONF_DIR=/home/hadoop/tez/conf
40 | export TEZ_JARS=/home/hadoop/tez/
41 |
42 | export HADOOP_CLASSPATH=${TEZ_CONF_DIR}:${TEZ_JARS}/*:${TEZ_JARS}/lib/*:$HADOOP_CLASSPATH
43 |
44 | Set the execution mode in the hive configuration:
45 |
46 | $ vi hive/conf/hive-site.xml
47 |
48 |
49 |
50 | hive.execution.engine
51 | tez
52 |
53 |
54 |
55 | This can be done at the hive/beeline prompt as well:
56 |
57 | hive> set hive.execution.engine=tez;
58 |
59 |
60 | Test by running any example:
61 |
62 | hive> select count(*) from pokes;
63 | Query ID = hadoop_20180414105904_37f4b946-30cc-447a-8878-be956d0b222e
64 | Total jobs = 1
65 | Launching Job 1 out of 1
66 |
67 |
68 | Status: Running (Executing on YARN cluster with App id application_1523714759756_0007)
69 |
70 | --------------------------------------------------------------------------------
71 | VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED
72 | --------------------------------------------------------------------------------
73 | Map 1 .......... SUCCEEDED 1 1 0 0 0 0
74 | Reducer 2 ...... SUCCEEDED 1 1 0 0 0 0
75 | --------------------------------------------------------------------------------
76 | VERTICES: 02/02 [==========================>>] 100% ELAPSED TIME: 6.88 s
77 | --------------------------------------------------------------------------------
78 | OK
79 | 500
80 | Time taken: 9.721 seconds, Fetched: 1 row(s)
81 |
82 |
 83 | Important thing to keep in mind: the env must be set on the edge nodes, i.e. the hive client nodes.
 84 | Hive server, metastore service etc. are configured as discussed previously.
85 |
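 86 | A quick sanity check that the uploaded Tez libraries are where tez.lib.uris points (a sketch):
 87 |
 88 | $ hadoop fs -ls /apps/tez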
--------------------------------------------------------------------------------
/HBase/untitled.txt:
--------------------------------------------------------------------------------
1 | parted /dev/sdb --script -- mklabel msdos
2 | parted /dev/sdb --script -- mkpart primary 0 -1
3 | mkfs.ext3 /dev/sdb1
4 | mkdir -p /space/disk1
5 | mount /dev/sdb1 /space/disk1
6 |
7 | useradd hadoop; echo hadoop | passwd --stdin hadoop
8 | chown -R hadoop:hadoop /space
9 |
10 |
11 | yum install jdk -y
--------------------------------------------------------------------------------
/Hive_performance:
--------------------------------------------------------------------------------
1 | -XX:-UseGCOverheadLimit
2 |
  3 | SET mapred.child.java.opts="-server -Xmx1g -XX:+UseConcMarkSweepGC -XX:-UseGCOverheadLimit";
4 |
5 | To enable the optimization
6 |
7 | set hive.auto.convert.join = true
8 | set hive.optimize.skewjoin = true
9 |
10 |
11 | When you are working with a large number of small files, Hive uses CombineHiveInputFormat by default.
 12 | In terms of MapReduce, this ultimately translates to using CombineFileInputFormat, which creates virtual splits over multiple files,
 13 | grouped by common node and rack when possible. The size of the combined split is determined by
14 |
15 | mapred.max.split.size
16 | or
17 | mapreduce.input.fileinputformat.split.maxsize (in YARN/MR2)
18 |
19 | So if you want fewer splits (fewer mappers), you need to set this parameter higher, as shown in the example below.
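For example, to aim for larger combined splits (the values here are illustrative only):

set mapreduce.input.fileinputformat.split.maxsize=268435456;   -- roughly 256 MB per split (MR2)
set mapred.max.split.size=268435456;                           -- MR1 equivalent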
20 |
21 | http://stackoverflow.com/questions/17852838/what-is-the-default-size-that-each-hadoop-mapper-will-read
22 |
23 | http://www.ericlin.me/how-to-control-the-number-of-mappers-required-for-a-hive-query
24 |
25 |
26 | set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
27 | set mapred.map.tasks = 20;
28 |
29 | Controlling split size:
30 |
31 | set mapreduce.input.fileinputformat.split.minsize=100000000;
32 | reference: https://hadoopjournal.wordpress.com/2015/06/13/set-mappers-in-pig-hive-and-mapreduce/
33 |
--------------------------------------------------------------------------------
/Jars/azure.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Jars/azure.tar.gz
--------------------------------------------------------------------------------
/Jars/hadoop-azure-3.1.1.3.1.0.0-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Jars/hadoop-azure-3.1.1.3.1.0.0-SNAPSHOT.jar
--------------------------------------------------------------------------------
/Jars/jce_policy-8.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Jars/jce_policy-8.zip
--------------------------------------------------------------------------------
/Kafka/commands:
--------------------------------------------------------------------------------
1 | # Make sure to setup the kafka variables like:
2 |
3 | export KAFKA_HOME=/home/hadoop/kafka
4 | PATH=$KAFKA_HOME/bin:$PATH
5 |
6 | Commands:
7 |
8 | kafka-server-start.sh kafka/config/server.properties
9 |
10 | Run as daemon:
11 |
12 | kafka-server-start.sh -daemon kafka/config/server.properties
13 |
14 | [hadoop@gw1 ~]$ jps
15 | 4581 Kafka
16 |
17 | # Stop:
18 |
19 | kafka-server-stop.sh
20 |
21 | # Useful commands:
22 |
23 | # Create Topic
24 | kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test
25 |
26 | # List topics
27 | kafka-topics.sh --list --zookeeper localhost:2181
28 | kafka-topics.sh --list --zookeeper n1.dilithium.com:2181
29 |
30 | echo "Hello, Kafka" | kafka-console-producer.sh --broker-list <broker>:9092 --topic MyTopic > /dev/null
31 | kafka-console-consumer.sh --zookeeper <> --topic MyTopic --from-beginning
32 |
33 | Examples:
34 |
35 | $ kafka-topics.sh --create --zookeeper n1.dilithium.com:2181 --replication-factor 1 --partitions 1 --topic test
36 | Created topic "test".
37 | $ kafka-topics.sh --list --zookeeper n1.dilithium.com:2181
38 | test
39 |
40 | echo "Hello, Kafka" | kafka-console-producer.sh --broker-list gw1.dilithium.com:9092 --topic test > /dev/null
41 | kafka-console-consumer.sh --bootstrap-server gw1.dilithium.com:9092 --topic test --from-beginning
42 |
43 | $ kafka-log-dirs.sh --describe --bootstrap-server gw1.dilithium.com:9092
44 | Querying brokers for log directories information
45 | Received log directory information from brokers 0
46 | {"version":1,"brokers":[{"broker":0,"logDirs":[{"logDir":"/data/kafka","error":null,"partitions":[]}]}]}
47 |
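A couple of other checks that are often handy (standard Kafka CLI tools; hostnames follow the examples above):

kafka-topics.sh --describe --zookeeper n1.dilithium.com:2181 --topic test
kafka-consumer-groups.sh --bootstrap-server gw1.dilithium.com:9092 --list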
48 |
49 | Benchmarks
50 |
51 | These are just from my test lab (1-core VM, 2 GB RAM).
52 |
53 | $ kafka-producer-perf-test.sh --topic bench --num-records 1000000 --throughput 150000 --record-size 100 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=67108864 compression.type=none batch.size=8196
54 | 71189 records sent, 13945.0 records/sec (1.33 MB/sec), 1090.8 ms avg latency, 1612.0 max latency.
55 | 170124 records sent, 34018.0 records/sec (3.24 MB/sec), 2294.4 ms avg latency, 3198.0 max latency.
56 | 186553 records sent, 36882.8 records/sec (3.52 MB/sec), 4227.2 ms avg latency, 5537.0 max latency.
57 | 239463 records sent, 47892.6 records/sec (4.57 MB/sec), 7076.9 ms avg latency, 7590.0 max latency.
58 | 1000000 records sent, 39799.410969 records/sec (3.80 MB/sec), 5569.86 ms avg latency, 8151.00 ms max latency, 6986 ms 50th, 8012 ms 95th, 8107 ms 99th, 8143 ms 99.9th
59 |
60 | $ kafka-consumer-perf-test.sh --topic gsd --broker-list gw1.dilithium.com:9092 --messages 1000000 --threads 1 --num-fetch-threads 1 --print-metrics
61 |
62 | $ kafka-consumer-perf-test.sh --topic gsd --broker-list gw1.dilithium.com:9092 --messages 1000000 --threads 1 --num-fetch-threads 1
63 | start.time, end.time, data.consumed.in.MB, MB.sec, data.consumed.in.nMsg, nMsg.sec, rebalance.time.ms, fetch.time.ms, fetch.MB.sec, fetch.nMsg.sec
64 | 2019-02-11 19:23:15:397, 2019-02-11 19:23:20:199, 95.3787, 19.8623, 1000118, 208271.1370, 51, 4751, 20.0755, 210506.8407
65 |
66 | $ kafka-consumer-perf-test.sh --topic gsd --broker-list gw1.dilithium.com:9092 --messages 1000000 --threads 1 --num-fetch-threads 2
67 | start.time, end.time, data.consumed.in.MB, MB.sec, data.consumed.in.nMsg, nMsg.sec, rebalance.time.ms, fetch.time.ms, fetch.MB.sec, fetch.nMsg.sec
68 | 2019-02-11 19:23:49:632, 2019-02-11 19:23:54:701, 95.3787, 18.8161, 1000118, 197300.8483, 135, 4934, 19.3309, 202699.2298
69 | .
70 |
71 | Benchmarks with various throughput and message sizes:
72 |
73 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 150000 --record-size 100 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196
74 |
75 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 150000 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196
76 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 15 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196
77 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 1500 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196
78 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 150 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196
79 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 150000 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196
80 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 15000000 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196
81 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 15000000 --record-size 1000 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196
82 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput -1 --record-size 1000 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196
83 |
--------------------------------------------------------------------------------
/Kafka/kafka-env.sh:
--------------------------------------------------------------------------------
1 | # Create this file as it is not part of the distro
2 |
3 | #!/bin/bash
4 |
5 | # Set KAFKA specific environment variables here.
6 |
7 | # The java implementation to use.
8 | export JAVA_HOME=/usr/java/default
9 | export PATH=$PATH:$JAVA_HOME/bin
10 | #export PID_DIR={{kafka_pid_dir}}
11 | #export LOG_DIR={{kafka_log_dir}}
12 | #export JMX_PORT=9093
13 |
14 | export KAFKA_HEAP_OPTS="-Xmx1g -Xms1g"
15 | export KAFKA_JVM_PERFORMANCE_OPTS="-XX:MetaspaceSize=96m -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 -XX:G1HeapRegionSize=16M -XX:MinMetaspaceFreeRatio=50 -XX:MaxMetaspaceFreeRatio=80"
16 |
--------------------------------------------------------------------------------
/Kafka/kafka_ganglia2.txt:
--------------------------------------------------------------------------------
1 | {
2 | "servers" : [ {
3 | "port" : "9999", <--- Defined Kafka JMX Port
4 | "host" : "192.168.1.18", <--- Kafka Server
5 | "queries" : [ {
6 | "outputWriters" : [ {
7 | "@class" :
8 | "com.googlecode.jmxtrans.model.output.KeyOutWriter",
9 | "settings" : {
10 | "outputFile" : "/tmp/bufferPool_direct_stats.txt",
11 | "v31" : false
12 | }
13 | } ],
14 | "obj" : "java.nio:type=BufferPool,name=direct",
15 | "resultAlias": "bufferPool.direct",
16 | "attr" : [ "Count", "MemoryUsed", "Name", "ObjectName", "TotalCapacity" ]
17 | }, {
18 | "outputWriters" : [ {
19 | "@class" :
20 | "com.googlecode.jmxtrans.model.output.KeyOutWriter",
21 | "settings" : {
22 | "outputFile" : "/tmp/bufferPool_mapped_stats.txt",
23 | "v31" : false
24 | }
25 | } ],
26 | "obj" : "java.nio:type=BufferPool,name=mapped",
27 | "resultAlias": "bufferPool.mapped",
28 | "attr" : [ "Count", "MemoryUsed", "Name", "ObjectName", "TotalCapacity" ]
29 | }, {
30 | "outputWriters" : [ {
31 | "@class" :
32 | "com.googlecode.jmxtrans.model.output.KeyOutWriter",
33 | "settings" : {
34 | "outputFile" : "/tmp/kafka_log4j_stats.txt",
35 | "v31" : false
36 | }
37 | } ],
38 | "obj" : "kafka:type=kafka.Log4jController",
39 | "resultAlias": "kafka.log4jController",
40 | "attr" : [ "Loggers" ]
41 | }, {
42 | "outputWriters" : [ {
43 | "@class" :
44 | "com.googlecode.jmxtrans.model.output.KeyOutWriter",
45 | "settings" : {
46 | "outputFile" : "/tmp/kafka_socketServer_stats.txt",
47 | "v31" : false
48 | }
49 | } ],
50 | "obj" : "kafka:type=kafka.SocketServerStats",
51 | "resultAlias": "kafka.socketServerStats",
52 | "attr" : [ "AvgFetchRequestMs", "AvgProduceRequestMs", "BytesReadPerSecond", "BytesWrittenPerSecond", "FetchRequestsPerSecond", "MaxFetchRequestMs", "MaxProduceRequestMs" , "NumFetchRequests" , "NumProduceRequests" , "ProduceRequestsPerSecond", "TotalBytesRead", "TotalBytesWritten", "TotalFetchRequestMs", "TotalProduceRequestMs" ]
53 | } ],
54 | "numQueryThreads" : 2
55 | } ]
56 | }
58 |
59 |
--------------------------------------------------------------------------------
/Kafka/kakfa_rsyslog.txt:
--------------------------------------------------------------------------------
1 | rsyslog (base, includes imfile)
2 | rsyslog-kafka
3 |
4 | /etc/rsyslog.conf
5 | -----------------
6 |
7 |
8 | $WorkDirectory /var/lib/rsyslog # where to place spool files
9 |
10 | $MainMsgQueueType LinkedList
11 | $MainMsgQueueFileName mainmsgq
12 | $MainMsgQueueSaveOnShutdown on
13 | $MainMsgQueueSize 15000
14 | $MainMsgQueueHighWatermark 10000
15 | $MainMsgQueueLowWatermark 1000
16 | $MainMsgQueueMaxDiskSpace 53687091 # 512KB, most containers have
17 |
18 |
19 | /etc/rsyslog.d/kafka.conf
20 | -------------------------
21 |
22 | module(load="omkafka") # provides omkafka
23 | # Use rainerscript, as below. Legacy directives kept for reference: #$ActionQueueSize 1500000
24 | #$ActionQueueType LinkedList
25 | #$ActionQueueFileName omkafkaq
26 | #$ActionResumeRetryCount -1
27 | #$ActionQueueSaveOnShutdown on
28 | #$ActionQueueHighWatermark 1000000
29 | #$ActionQueueLowWatermark 100000
30 | #$ActionQueueMaxDiskSpace 536870912 # 512MB, most containers have
31 | #$ActionQueueMaxDiskSpace 536870912 # 512MB, most containers have <8GB of space
32 | #$MainMsgQueueDiscardMark 400000 # Low < Discard < High < DiskSpace
33 | #$MainMsgQueueDiscardSeverity 4 # Discard anything lower than warning
34 |
35 | *.* action(type="omkafka" topic="rsyslog-prod"
36 | broker="kafka1.example.com,kafka2.example.com,kafka3.example.com"
37 | queue.filename="omkafkaq" queue.spoolDirectory="/var/lib/rsyslog"
38 | queue.size="300000" queue.maxdiskspace="536870912"
39 | queue.lowwatermark="20000" queue.highwatermark="200000"
40 | queue.discardmark="250000" queue.type="LinkedList"
41 | queue.discardseverity="4"
42 | queue.saveonshutdown="on" queue.dequeuebatchsize="4"
43 | partitions.auto="on" errorFile="/var/log/rsyslog.err"
44 | confParam=[ "compression.codec=snappy",
45 | "socket.timeout.ms=1000",
46 | "socket.keepalive.enable=true"]
47 | )
48 |
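A rough end-to-end check, assuming the rsyslog-prod topic exists and one of the brokers above is reachable:

logger "rsyslog to kafka test"
kafka-console-consumer.sh --bootstrap-server kafka1.example.com:9092 --topic rsyslog-prod --from-beginning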
--------------------------------------------------------------------------------
/Kafka/server.properties:
--------------------------------------------------------------------------------
1 | # Only need to change the below for each broker. This is a very basic kafka config
2 |
3 | # The id of the broker. This must be set to a unique integer for each broker.
4 | broker.id=0
5 |
6 | # root directory for all kafka znodes.
7 | zookeeper.connect=n1.dilithium.com:2181,n2.dilithium.com:2181,sn.dilithium.com:2181
8 |
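Other broker settings commonly added to this file (the values below are illustrative, not from the running config; log.dirs matches the path seen in the kafka-log-dirs output in the commands file):

# where the broker stores partition data
log.dirs=/data/kafka
# listener for clients; use this broker's FQDN
listeners=PLAINTEXT://gw1.dilithium.com:9092
# defaults for new topics
num.partitions=1
log.retention.hours=168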
--------------------------------------------------------------------------------
/Notes/Benchmarking.txt:
--------------------------------------------------------------------------------
1 | Test Hadoop
2 | ============
3 |
4 | hadoop jar hadoop/hadoop-test-0.20.205.0.jar TestDFSIO -write -nrFiles 10 -fileSize 1000
5 | hadoop jar hadoop/hadoop-test-0.20.205.0.jar TestDFSIO -read -nrFiles 10 -fileSize 1000
6 |
7 | hadoop jar hadoop/hadoop-test-0.20.205.0.jar TestDFSIO -clean
8 |
9 |
10 | Generate Tera Data
11 | ==================
12 |
13 | hadoop jar hadoop/hadoop-examples-0.20.205.0.jar teragen 1000 /user/hduser/terasort-input
14 |
15 | hadoop jar hadoop/hadoop-examples-0.20.205.0.jar terasort /user/hduser/terasort-input /user/hduser/terasort-output
16 |
17 | hadoop job -history all /user/hduser/terasort-input
18 |
19 |
20 |
21 | hadoop jar hadoop/hadoop-examples-0.20.205.0.jar nnbench -operation create_write \
22 | -maps 12 -reduces 6 -blockSize 1 -bytesToWrite 0 -numberOfFiles 1000 \
23 | -replicationFactorPerFile 3 -readFileAfterOpen true \
24 | -baseDir /benchmarks/NNBench-`hostname -s`
25 |
26 |
27 | hadoop jar hadoop/hadoop-examples-0.20.205.0.jar mrbench -numRuns 50
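The jar names above are for Hadoop 0.20.x. On Hadoop 2.x the same benchmarks live in different jars (paths below assume a standard Apache tarball layout):

hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-*-tests.jar TestDFSIO -write -nrFiles 10 -fileSize 1000
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar teragen 1000 /user/hduser/terasort-input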
--------------------------------------------------------------------------------
/Notes/Hadoop_lab.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Notes/Hadoop_lab.doc
--------------------------------------------------------------------------------
/Notes/Hadoop_upgrade.txt:
--------------------------------------------------------------------------------
1 |
2 | Hadoop Upgrade
3 | ===============
4 |
5 | 1. hadoop dfsadmin -upgradeProgress status
6 |
7 | 2. Stop all client applications running on the MapReduce cluster.
8 |
9 | 3. Perform a filesystem check
10 | hadoop fsck / -files -blocks -locations > dfs-v-old-fsck-1.log
11 |
12 | 4. Save a complete listing of the HDFS namespace to a local file
13 | hadoop dfs -lsr / > dfs-v-old-lsr-1.log
14 |
15 | 5. Create a list of DataNodes participating in the cluster:
16 | hadoop dfsadmin -report > dfs-v-old-report-1.log
17 |
18 | 6. Optionally backup HDFS data
19 |
20 | 7. Upgrade process:
21 | Point to the new directory, update environment variables.
22 |
23 | 8. hadoop-daemon.sh start namenode -upgrade
24 |
25 | 9. hadoop dfsadmin -upgradeProgress status
26 |
27 | 10. Now start the datanode, after pointing to the new hadoop directory
28 |
29 | 11. hadoop dfsadmin -safemode get
30 |
31 | 12. hadoop dfsadmin -finalizeUpgrade
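If the upgraded cluster misbehaves before step 12, it can be rolled back instead of finalized (finalizeUpgrade cannot be undone). A sketch of the rollback path:

hadoop-daemon.sh start namenode -rollback
hadoop-daemon.sh start datanode -rollback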
32 |
33 |
34 |
--------------------------------------------------------------------------------
/Notes/Performance.txt:
--------------------------------------------------------------------------------
1 | CPU-related parameters: mapred.tasktracker.map.tasks.maximum and mapred.tasktracker.reduce.tasks.maximum
2 | Decide the maximum number of map/reduce tasks that will be run simultaneously by a tasktracker. These two parameters are the most relevant to CPU utilization. The default value of both parameters is 2. Properly increasing their values according to your cluster condition increases CPU utilization and therefore improves performance. For example, assume each node of the cluster has 4 CPUs supporting simultaneous multi-threading, and each CPU has 2 cores; then the total number of task slots should be no more than 4x2x2=16. Considering that the DN and TT take 2 slots, there are at most 14 slots for map/reduce tasks, so the best value is 7 for both parameters.
3 |
4 | Set this parameter in mapred-site.xml.
5 |
6 | Memory-related parameter:mapred.child.java.opts
7 | This is the main parameter for JVM tuning. The default value is -Xmx200m, which gives each child task thread 200 MB of memory at most. You can increase this value if the job is large, but should make sure it won't cause swap, which significantly reduces performance.
8 |
9 | Let's examine how this parameter can affect the total memory usage. Assume the maximum number of map/reduce tasks is set to 7, and mapred.child.java.opts is left to the default value. Then memory cost of running tasks will be 2x7x200 MB =2800 MB. If each worker node has both DN and TT daemons, and each daemon costs 1 GB memory by default, the total memory allocated would be around 4.8 GB.
10 |
11 | Set this parameter in mapred-site.xml.
12 |
13 | Disk I/O-related parameters:mapred.compress.map.output, mapred.output.compress, and mapred.map.output.compression.codec
14 | These are parameters that control whether to compress the output, in which mapred.compress.map.output is for map output compression, mapred.output.compress is for job output compression, and mapred.map.output.compression.codec is for compression code. All of these options are turned off by default.
15 |
16 | Turning on output compression can speed up disk (local/Hadoop Distributed File System (HDFS)) writes and reduce total time of data transfer (in both shuffle and HDFS writing phase), while on the other hand cost additional overhead during the compression/decompression process.
17 |
18 | According to personal experience, turning on compression is not effective for sequence filing with random keys/values. One suggestion is to turn on compression only when the data you're dealing with is large and organized (especially natural language data).
19 |
20 | Set these parameters in mapred-site.xml.
21 |
22 | io.sort.mb parameter:
23 | This parameter sets the buffer size for map-side sorting, in units of MB, 100 by default. The greater the value, the fewer spills to the disk, thus reducing I/O times on the map side. Notice that increasing this value increases memory required by each map task.
24 |
25 | According to experience, when the map output is large, and the map-side I/O is frequent, you should try increasing this value.
26 |
27 | Set this parameter in mapred-site.xml.
28 |
29 | io.sort.factor parameter
30 | This parameter sets the number of input streams (files) to be merged at once in both map and reduce tasks. The greater this value, the fewer spills to the disk, thus reducing I/O times on both the map and reduce sides. Notice that increasing this value might cost more garbage collection activities if memory allocated for each task is not large enough.
31 |
32 | According to experience, when there is a large number of spills to the disk, and I/O times of the sort and shuffle phase is high, you should try increasing this value.
33 |
34 | Set this parameter in mapred-site.xml.
35 |
36 | mapred.job.reduce.input.buffer.percent parameter
37 | This parameter sets the percentage of memory (relative to the maximum heap size) to retain map outputs during the reduce phase. When the shuffle is concluded, any remaining map outputs in memory must consume less than this threshold before the reduce phase can begin, 0 by default. The greater this value is, the less merge on the disk, thus reducing I/O times on the local disk during the reduce phase. Notice that increasing this value might cost more garbage collection activities if memory allocated for each task is not large enough.
38 |
39 | According to experience, when map output is large, and local disk I/O is frequent during the reduce through sort phases, you should try increasing this value.
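A mapred-site.xml sketch pulling the above together (values are illustrative only, not recommendations):

<property><name>mapred.tasktracker.map.tasks.maximum</name><value>7</value></property>
<property><name>mapred.tasktracker.reduce.tasks.maximum</name><value>7</value></property>
<property><name>mapred.child.java.opts</name><value>-Xmx512m</value></property>
<property><name>mapred.compress.map.output</name><value>true</value></property>
<property><name>io.sort.mb</name><value>200</value></property>
<property><name>io.sort.factor</name><value>50</value></property>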
--------------------------------------------------------------------------------
/Notes/backup.txt:
--------------------------------------------------------------------------------
1 | Hadoop Backup and Recovery
2 | ==========================
3 |
4 |
5 | <property>
6 |   <name>dfs.secondary.http.address</name>
7 |   <value>192.168.1.68:50090</value>
8 | </property>
9 |
10 |
11 | 1. Secondary namenode checkpointing
12 |
13 | If you want to explicitly specify the file to be used by the namenode
14 |
15 | hadoop-daemons.sh --hosts masters start secondarynamenode
16 |
17 | hdfs secondarynamenode -checkpoint force
18 |
19 | 2. hadoop namenode -importCheckpoint
20 |
21 | <property>
22 |   <name>fs.checkpoint.dir</name>
23 |   <value>/data/new</value>
24 | </property>
25 |
26 | 3. Save NameSpace
27 |
28 | hadoop dfsadmin -safemode enter
29 |
30 | hadoop dfsadmin -saveNamespace
31 |
32 | Remember it updates under the Namespace directory.
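After saving, remember to leave safe mode so clients can write again (a reminder, not from the original notes):

hadoop dfsadmin -safemode leave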
33 |
34 | 4. Metadata Save
35 |
36 | hdfs dfsadmin -metasave filename.txt
37 |
38 | 5. Can do a detailed view of the namespace (above 0.21)
39 |
40 | hdfs oiv -i /data/namenode/current/fsimage -o fsimage.txt
41 |
42 |
43 | <property>
44 |   <name>dfs.secondary.http.address</name>
45 |   <value>192.168.1.68:50090</value>
46 | </property>
--------------------------------------------------------------------------------
/Notes/cassandra2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Notes/cassandra2.pdf
--------------------------------------------------------------------------------
/Notes/class3_questions:
--------------------------------------------------------------------------------
1 | 1. When the namenode is started, what "keyword" in which log identifies the timestamp at which it started, and which log should we check?
2 | 2. Does the Secondary NameNode start automatically?
3 | 3. Does "copyFromLocal" only load data from the local node? If so, is there a specific command to load data from NAS, SAN, or any other client?
4 | 4. Is there a command to overwrite a file that already exists in HDFS, or do we always have to remove the existing file first?
5 | 5. If we change the HDFS path, do we need to reformat?
6 | 6. How different is hadoop fsck from Linux fsck?
7 | 7. Should we run the "hadoop dfsadmin" and "hadoop fsck" (admin) commands only from the namenode, and the "hadoop fs" file-related commands from any of the name or data nodes in the cluster?
8 | 8. How do we start the datanode on selected machines instead of on all of them, as the earlier command does?
9 | 9. What are the .meta and .curr files that get created, and what is the difference? If a .meta file is deleted, is the data also lost?
10 | 10. In what circumstances will it choose another node, e.g. when dn1 is near full capacity?
11 | 11. It only shows 1 live node; running it several times alternates between the live datanodes, but only 1 shows as live at a time. Is this normal? What could be the problem?
12 | 12. Is it possible to force the data to go to a particular datanode?
13 | 13. At some point the edits file will get too huge; does it rotate as well?
14 | 14. Can we have both the NN and SNN on the same node, or is it best practice to separate them?
15 | 15. How do we know which rack a machine belongs to?
16 | 16. Does editing nodes in the include and exclude files not require a restart?
17 | 17. In what kind of cases might we need to exclude a particular datanode? Why would we create a DN and then exclude it from the cluster?
18 | 18. Say we have servers in two datacenters, one in ATLANTA and one in NEW YORK, and the ATLANTA datacenter goes down because of a flood. How can we recover the cluster from such a disaster? Can we configure the topology file to treat the ATLANTA servers as one rack and the NEW YORK servers as another rack?
19 |
20 |
--------------------------------------------------------------------------------
/Notes/class4_questions:
--------------------------------------------------------------------------------
1 | https://www.packtpub.com/books/content/sizing-and-configuring-your-hadoop-cluster
2 | Why do we need replication of replication? Why would we replicate data on the same server again?
3 | What were sdb1 and sdc1? Are those new partitions for dn1 and dn2?
4 | Is this similar to RAID 1 (mirroring)?
5 | Do we need to set this up only if we do not already have RAID 1?
6 | Isn't it good practice to create these directories on an NFS server rather than locally?
7 | After changing any configuration setting, do we need to run --format every time for the change to take effect?
8 | Will these two disks need to be kept in different racks to avoid data loss?
9 | If the disk I/O speed differs significantly between the local and NFS mounts, will this adversely affect the namenode?
10 | Won't it be overhead, because instead of rsyncing 1 directory it has to copy 2 directories to DR?
11 | So which metadata copy is the namenode referring to if both mount points are up?
12 | What is the read policy? Will it read only from the first disk specified?
13 | Do the two disk drives follow a proportional-fill algorithm?
14 | So in that case we don't have control over how end clients send jobs? They can send any size?
15 | When the namenode is down and we have to make the secondary the primary, why do we need to change the hostname of the secondary to the primary's? If we had configured the namenode location via GSLB instead of a direct hostname, we would not need to change the hostname.
16 | You are setting the quota from the command line, so will it be flushed once we restart the namenode?
17 |
--------------------------------------------------------------------------------
/Notes/cloudera.txt:
--------------------------------------------------------------------------------
1 | Cloudera Manager
2 | ================
3 |
4 | ./cloudera-manager-installer.bin --skip_repo_package=1
5 |
6 |
7 |
8 | 1. Cloudera Manager - GUI
9 | 2. Cloudera Packages CDH4 - Hadoop packages
--------------------------------------------------------------------------------
/Notes/disk_partition:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | hdd="/dev/sdc /dev/sdd /dev/sde /dev/sdf"
3 |
4 | count=0
5 |
6 | for disk in $hdd; do
7 | #echo -e "n\np\n\n\n\nw\n" | fdisk $disk;
8 |
9 | fs="${disk}1"
10 | #mkfs.xfs $fs;
11 |
12 | twoDigitCount=$(printf "%02d" $count)
13 | mount="/data/$twoDigitCount"
14 | mkdir -p $mount;
15 |
16 | mount $fs $mount;
17 |
18 | count=$((count+1))
19 | done
20 |
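Optional follow-up, not part of the original script: persist the mounts via /etc/fstab (device names assumed to match the loop above; noatime is a common choice for Hadoop data disks).

count=0
for disk in /dev/sdc /dev/sdd /dev/sde /dev/sdf; do
    echo "${disk}1 /data/$(printf "%02d" $count) xfs defaults,noatime 0 0" >> /etc/fstab
    count=$((count+1))
done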
--------------------------------------------------------------------------------
/Notes/hadoop_ports.txt:
--------------------------------------------------------------------------------
1 | Hadoop nodes communication ports
2 |
3 | No. name protocol port # configuration file parameter name description
4 | 1 ssh tcp *:22 /etc/ssh/sshd_config Port ssh server port for ssh communication
5 | 2 HDFS default port tcp localhost:9000 core-site.xml fs.default.name HDFS port for clients.
6 | 3 secondary name node administration tcp 0.0.0.0:50090 hdfs-site.xml dfs.secondary.http.address The secondary namenode http server address and port. If the port is 0 then the server will start on a free port.
7 | 4 data node communication tcp 0.0.0.0:50010 hdfs-site.xml dfs.datanode.address
8 | 5 data node administration tcp 0.0.0.0:50075 hdfs-site.xml dfs.datanode.http.address
9 | 6 data node IPC communication tcp 0.0.0.0:50020 hdfs-site.xml dfs.datanode.ipc.address
10 | 7 name node administration tcp 0.0.0.0:50070 hdfs-site.xml dfs.http.address
11 | 8 data node administration tcp 0.0.0.0:50475 hdfs-site.xml dfs.datanode.https.address
12 | 9 name node administration tcp 0.0.0.0:50470 hdfs-site.xml dfs.https.address
13 | 10 MapReduce job tracker tcp 0.0.0.0:9001 mapred-site.xml mapred.job.tracker The port of Job Tracker accepting for job request.
14 | 11 job tracker administration tcp 0.0.0.0:50030 mapred-site.xml mapred.job.tracker.http.address
15 | 12 task tracker administration tcp 0.0.0.0:50060 mapred-site.xml mapred.task.tracker.http.address
--------------------------------------------------------------------------------
/Notes/hadoop_ports_firewall.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Notes/hadoop_ports_firewall.xls
--------------------------------------------------------------------------------
/Notes/installation.txt:
--------------------------------------------------------------------------------
1 | core-site.xml
2 |
3 | <property>
4 |   <name>fs.default.name</name>
5 |   <value>hdfs://nn1.cluster1.com:9000</value>
6 | </property>
7 |
8 | hdfs-site.xml
9 |
10 | <property>
11 |   <name>dfs.name.dir</name>
12 |   <value>/data/namenode</value>
13 |   <final>true</final>
14 | </property>
15 |
16 | <property>
17 |   <name>dfs.data.dir</name>
18 |   <value>/space/disk1,/space/disk2</value>
19 |   <final>true</final>
20 | </property>
21 |
22 | <property>
23 |   <name>dfs.replication</name>
24 |   <value>1</value>
25 | </property>
26 |
27 | <property>
28 |   <name>dfs.block.size</name>
29 |   <value>67108864</value>
30 | </property>
31 |
32 | <property>
33 |   <name>dfs.hosts.exclude</name>
34 |   <value>/home/hadoop/excludes</value>
35 |   <final>true</final>
36 | </property>
37 |
38 | <property>
39 |   <name>dfs.hosts</name>
40 |   <value>/home/hadoop/include</value>
41 |   <final>true</final>
42 | </property>
43 |
44 | mapred-site.xml
45 |
46 | <property>
47 |   <name>mapred.job.tracker</name>
48 |   <value>jt.cluster1.com:9001</value>
49 | </property>
50 |
51 |
52 |
53 |
54 | export JAVA_HOME=/usr/java/jdk1.7.0_25/
55 | export HADOOP_HOME=/home/hadoop/hadoop
56 | export HADOOP_PID_DIR=/home/hadoop/pids
57 | export HADOOP_HEAPSIZE=500
58 |
59 | export HADOOP_LOG_DIR=${HADOOP_HOME}/logs
60 | export HADOOP_HOME_WARN_SUPPRESS="TRUE"
61 |
62 |
63 |
64 | export JAVA_HOME=/usr/java/jdk1.7.0_25/
65 |
66 | PATH=$JAVA_HOME/bin:$PATH:$HOME/bin
67 | PATH=$PATH:/home/hadoop/hadoop/bin
68 |
69 | export PATH
70 |
71 | ================
72 | <property>
73 |   <name>heartbeat.recheck.interval</name>
74 |   <value>15</value>
75 |   <description>Determines datanode heartbeat interval in seconds</description>
76 | </property>
77 |
78 | If the above doesn't work, try the following (seems to be version-dependent):
79 |
80 | <property>
81 |   <name>dfs.heartbeat.recheck.interval</name>
82 |   <value>15</value>
83 |   <description>Determines datanode heartbeat interval in seconds.</description>
84 | </property>
85 |
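Once the configuration files are in place, a first start typically looks like this (Hadoop 1.x commands, run as the hadoop user; a sketch, not part of the original notes):

hadoop namenode -format
start-dfs.sh
start-mapred.sh
jps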
--------------------------------------------------------------------------------
/Notes/pig.txt:
--------------------------------------------------------------------------------
1 | export PIG_HOME=/home/hadoop/pig/
2 |
3 |
4 | A = load 'passwd' using PigStorage(':');
5 | B = foreach A generate $0 as id;
6 | store B into 'id.out';
7 |
8 |
9 |
10 | pig -x local id.pig
11 |
12 | pig -x mapreduce id.pig
13 |
14 |
15 | passwd = LOAD '/etc/passwd' USING PigStorage(':') AS (user:chararray, \
16 | passwd:chararray, uid:int, gid:int, userinfo:chararray, home:chararray, \
17 | shell:chararray);
18 | grunt> DUMP passwd;
19 |
20 | grunt> grp_shell = GROUP passwd BY shell;
21 | grunt> counts = FOREACH grp_shell GENERATE group, COUNT(passwd);
22 | grunt> DUMP counts;
23 |
24 |
25 |
26 | A = load 'test';
27 |
28 | B = foreach A generate flatten(TOKENIZE((chararray)$0)) as word;
29 |
30 | C = group B by word;
31 |
32 | D = foreach C generate COUNT(B), group;
33 |
34 | store D into 'wordcount';
35 |
36 | ==================
37 |
38 | A = load 'http_access_2011-07-07.log' using PigStorage('-') as (f0,f1,f2,f3,f4);
39 | B = foreach A generate f0;
40 | C = distinct B;
41 | dump C;
42 |
43 | A = load 'http_access_2011-07-07.log' using PigStorage('"') as (f0,f1,f2,f3,f4,f5);
44 | B = foreach A generate f5;
45 | C = distinct B;
46 | dump C;
47 |
48 | A = load 'http_access_2011-07-07.log' using PigStorage('"') as (f0,f1,f2,f3,f4);
49 | B = foreach A generate f1;
50 | C = distinct B;
51 | dump C;
52 |
53 | ==============
54 | yum install ant*
55 |
56 | For Hadoop-2.0
57 |
58 | ant clean jar-withouthadoop -Dhadoopversion=23
59 |
60 | or
61 |
62 | ant clean jar-all -Dhadoopversion=23
--------------------------------------------------------------------------------
/Notes/questions.txt:
--------------------------------------------------------------------------------
1 | 1) If we add new DataNodes to the cluster will HDFS move the blocks to the newly added nodes in order to balance disk space utilization between the nodes?
2 |
3 | a) yes, it will automatically do balancing
4 | b) no, we have to manually to re-balancing (correct)
5 |
6 | 2) The name-node will stay in safe mode till all under-replicated files are fully replicated?
7 |
8 | a)TRUE b) FALSE (correct)
9 |
10 | 3) How do I set up a hadoop data node to use multiple volumes?
11 |
12 | a) We cannot do that b) We can use comma separated fields (correct) c) This can only be done with SAN storage
13 |
14 | 4) Can a Hadoop client rename a file or a directory containing a file while another client is still writing into it?
15 |
16 | a) yes, it can (correct) b) No, hadoop does locking
17 |
18 | 5) Will the command bin/hadoop dfs -ls /projects/* list all the files under /projects ?
19 |
20 | a) yes (correct, but better to safeguard it with single quotes) b) no
21 |
22 | 6) Can we have multiple files in HDFS use different block sizes?
23 |
24 | a) yes (correct) b) no
25 |
26 | 7) How do you gracefully stop a running job?
27 |
28 | a) hadoop job -kill jobid(correct) b) kill the task tracker c) it can not be done
29 |
30 | 8) What is the best java version to use for Hadoop?
31 |
32 | a) It does not matter b) Must be greater than java2.6 c) greater than 1.6 (correct)
33 |
34 | 9) What is the command for adding the hosts newly added to the mapred.include file?
35 |
36 | a) hadoop dfsadmin -refreshNodes b) hadoop rmadmin -refreshNodes (correct)
37 |
38 | 10) What will happen, if we set the number of reducers to 0 ?
39 |
40 | a) job will fail b) the map output is written directly to the disk (correct)
41 |
42 | 11) How many maximum JVM run on the slave node?
43 |
44 | a) only one as there is only one tasktracker b) 2 one each for tasktracker, datanode c) It depends upon task instances (correct)
45 |
46 | 12) Where is the intermediate mapper output stored?
47 |
48 | a) It is stored in tmp folder on hdfs b) It is stored on local filesystem(correct) c) It is only in Memory
49 |
50 | 13) When do the reducers run?
51 |
52 | a) They start immediately when the job is submitted b) They start only after the mappers finish (correct)
53 |
54 |
55 | 14) What action occurs automatically on a cluster when a DataNode is marked as dead?
56 |
57 | A. The NameNode forces re-replication of all the blocks which were stored on the dead DataNode.
58 | B. The next time a client submits job that requires blocks from the dead DataNode, the JobTracker receives no heart beats from the DataNode. The JobTracker tells the NameNode that the DataNode is dead, which triggers block re-replication on the cluster.
59 | C. The replication factor of the files which had blocks stored on the dead DataNode is temporarily reduced, until the dead DataNode is recovered and returned to the cluster.
60 | D. The NameNode informs the client which write the blocks that are no longer available; the client then re-writes the blocks to a different DataNode.
61 |
62 | 15) QUESTION: 5
63 | Which three distcp features can you utilize on a Hadoop cluster?
64 | A. Use distcp to copy files only between two clusters or more. You cannot use distcp to copy data between directories inside the same cluster.
65 | B. Use distcp to copy HBase table files.
66 | C. Use distcp to copy physical blocks from the source to the target destination in your cluster.
67 | D. Use distcp to copy data between directories inside the same cluster. E. Use distcp to run an internal MapReduce job to copy files.
68 | Answer: B, D, E
69 |
70 | 16) What is the recommended disk configuration for slave nodes in your Hadoop cluster with 6 x 2 TB hard drives?
71 | A. RAID 10 B. JBOD
72 | C. RAID 5 D. RAID 1+0
73 | Answer: B
74 |
75 | 17) Your Hadoop cluster has 25 nodes with a total of 100 TB (4 TB per node) of raw disk space allocated HDFS storage. Assuming Hadoop's default configuration, how much data will you be able to store?
76 | A. Approximately 100TB B. Approximately 25TB C. Approximately 10TB D. Approximately 33 TB
77 | Answer: D
78 |
79 | 18) The most important consideration for slave nodes in a Hadoop cluster running production jobs that require short turnaround times is:
80 | A. The ratio between the amount of memory and the number of disk drives.
81 | B. The ratio between the amount of memory and the total storage capacity.
82 | C. The ratio between the number of processor cores and the amount of memory. D. The ratio between the number of processor cores and total storage capacity. E. The ratio between the number of processor cores and number of disk drives.
83 | Answer: D
84 |
85 | 19) Your existing Hadoop cluster has 30 slave nodes, each of which has 4 x 2T hard drives. You plan to add another 10 nodes. How much disk space can your new nodes contain?
86 | A. The new nodes must all contain 8TB of disk space, but it does not matter how the disks are configured
87 | B. The new nodes cannot contain more than 8TB of disk space
88 | C. The new nodes can contain any amount of disk space
89 | D. The new nodes must all contain 4 x 2TB hard drives Answer: C
90 |
91 | 20) On a cluster running MapReduce v1 (MRv1), a MapReduce job is given a directory of 10 plain text as its input directory. Each file is made up of 3 HDFS blocks. How many Mappers will run?
92 | A. We cannot say; the number of Mappers is determined by the developer B. 30
93 | C. 10
94 | D. 1
95 | Answer: B
96 |
97 | 21) Which scheduler would you deploy to ensure that your cluster allows short jobs to finish within a reasonable time without starving long-running jobs?
98 | A. FIFO Scheduler
99 | B. Fair Scheduler
100 | C. Capacity Scheduler
101 | D. Completely Fair Scheduler (CFS)
102 | Answer: B
103 |
104 | 22) You are planning a Hadoop cluster, and you expect to be receiving just under 1TB of data per week which will be stored on the cluster, using Hadoop's default replication. You decide that your slave nodes will be configured with 4 x 1TB disks. Calculate how many slave nodes you need to deploy at a minimum to store one year's worth of data.
105 | A. 100 slave nodes B. 100 slave nodes C. 10 slave nodes D. 50 slave nodes
106 | Answer: D
107 |
108 | 23) On a cluster running MapReduce v1 (MRv1), a MapReduce job is given a directory of 10 plain text as its input directory. Each file is made up of 3 HDFS blocks. How many Mappers will run?
109 | A. We cannot say; the number of Mappers is determined by the developer B. 30
110 | C. 10
111 | D. 1
112 | Answer: A
113 |
114 | 24) For each job, the Hadoop framework generates task log files. Where are Hadoop's task log files stored?
115 | A. Cached on the local disk of the slave node running the task, then purged immediately upon task completion.
116 | B. Cached on the local disk of the slave node running the task, then copied into HDFS.
117 | C. In HDFS, in the directory of the user who generates the job.
118 | D. On the local disk of the slave node running the task.
119 |
120 | Answer: D
121 |
122 |
123 |
--------------------------------------------------------------------------------
/Notes/quick-links:
--------------------------------------------------------------------------------
1 | AMS: https://cwiki.apache.org/confluence/display/AMBARI/Known+Issues
2 |
--------------------------------------------------------------------------------
/Notes/quiz4.txt:
--------------------------------------------------------------------------------
1 | 1) How do you gracefully stop a running job?
2 |
3 | a) hadoop job -kill jobid(correct) b) kill the task tracker c) it can not be done
4 |
5 | 2) What will happen, if we set the number of reducers to 0 ?
6 |
7 | a) job will fail b) the map output is written directly to the disk (correct)
8 |
9 | 3) Where is the intermediate mapper output stored?
10 |
11 | a) It is stored in tmp folder on hdfs b) It is stored on local filesystem(correct) c) It is only in Memory
12 |
13 | 4) When do the reducers run?
14 |
15 | a) They start immediately when the job is submitted b) They start only after the mappers finish (correct)
16 |
17 | 5) Which property set the max number of tasktrackers ? (B is correct)
18 |
19 | a) mapred.tasktracker.map.tasks b) mapred.tasktracker.map.tasks.maximum c) map.tasks.maximum
20 |
--------------------------------------------------------------------------------
/Notes/quiz7.txt:
--------------------------------------------------------------------------------
1 | 1) What is HBase?
2 |
3 | a) Is an RDBMS database b) HBase is column-oriented c) Distributed database d) Both b and c
4 |
5 | 2) Why we use HBase ?
6 |
7 | a) It is a DB on top of HDFS b) Hbase provide random read and write on large data set. c) HBase is same as MySql
8 |
9 | 3) What is the maximum size of string data type supported by Hive?
10 |
11 | a) 64MB b) It depends upon the HDFS block size c) 2GB (correct)
12 |
13 | 4) In Hadoop ‘Reading‘ is done in parallel and ‘Writing‘ is not in HDFS.
14 |
15 | a) TRUE (correct)
16 | b) FALSE
17 |
18 | 5) Multiple users can use same metastore in 'Embedded metastore Mode'.
19 |
20 | a) TRUE
21 | b) FALSE (Correct)
22 |
23 | 6) HBase 'CopyTable' utility can be used to:
24 |
25 | a) Copy a partial table b) Full table c) It is not a valid command d) a and b (correct)
26 |
--------------------------------------------------------------------------------
/Notes/quota.txt:
--------------------------------------------------------------------------------
1 | Applying Quota
2 | --------------
3 |
4 | hadoop dfsadmin -setSpaceQuota 1m <dirname>
5 |
6 |
7 | dfsadmin -setQuota <quota> <dirname>
8 |
9 | dfsadmin -clrQuota <dirname>
10 |
11 | dfsadmin -setSpaceQuota <quota> <dirname>
12 |
13 | dfsadmin -clrSpaceQuota <dirname>
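To check the quotas currently applied on a directory (standard HDFS command; the output shows the name and space quotas and how much of each remains):

hadoop fs -count -q <dirname>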
14 |
15 |
16 | Distcp
17 | ======
18 |
19 | hadoop distcp hdfs://nn1:8020/foo/bar hdfs://nn2:8020/bar/foo
20 |
21 | hdfs://nn1:8020/foo/a hdfs://nn1:8020/foo/b
22 |
23 | hadoop distcp hdfs://nn1.cluster1.com:9000/jobtracker hdfs://nn1.cluster1.com:9000/newtracker
24 |
25 |
26 | Trash
27 | =======
28 |
29 | <property>
30 |   <name>fs.trash.interval</name>
31 |   <value>40</value>
32 | </property>
33 |
34 | SetRep
35 | =====
36 | hadoop dfs -setrep -R -w 3 /chandra
--------------------------------------------------------------------------------
/Notes/rack.txt:
--------------------------------------------------------------------------------
1 | while [ $# -gt 0 ] ; do
2 | nodeArg=$1
3 | exec< /home/hadoop/topology.data
4 | result=""
5 | while read line ; do
6 | ar=( $line )
7 | if [ "${ar[0]}" = "$nodeArg" ] ; then
8 | result="${ar[1]}"
9 | fi
10 | done
11 | shift
12 | if [ -z "$result" ] ; then
13 | echo -n "/default"
14 | else
15 | echo -n "$result "
16 | fi
17 | done
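The script reads /home/hadoop/topology.data, where each line maps a host (or IP) to a rack. A hypothetical example (addresses are made up):

192.168.1.71   /rack1
192.168.1.72   /rack1
192.168.1.73   /rack2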
18 |
19 |
20 | <property>
21 |   <name>topology.script.file.name</name>
22 |   <value>/home/hadoop/hadoop/conf/topology.sh</value>
23 | </property>
24 | ====================
25 | The above works very well on Hadoop 1, but for Hadoop 2 make sure the script emits the correct format. It
26 | takes IP addresses instead of DNS names, and there are multiple mapping classes, such as simple DNS-based and table-based ones. We do not need to do anything if we are using a script as above, but for Java implementations and other tabular formats we need to modify "topology.node.switch.mapping.impl".
27 |
--------------------------------------------------------------------------------
/Notes/remove_datanode.txt:
--------------------------------------------------------------------------------
1 | Add/Remove a Datanode
2 | =====================
3 |
4 | Decommission a host gracefully
5 |
6 | <property>
7 |   <name>dfs.hosts.exclude</name>
8 |   <value>/home/hadoop/excludes</value>
9 |   <final>true</final>
10 | </property>
11 |
12 | Similarly for Jobtracker.
13 |
14 | <property>
15 |   <name>mapred.hosts.exclude</name>
16 |   <value>/home/hadoop/excludes</value>
17 |   <final>true</final>
18 | </property>
19 |
20 | mapred.hosts.exclude in mapred-site.xml
21 |
22 | Add the FQDN to the exclude file and refresh
23 |
24 | Update for the Namenode
25 | -----------------------
26 |
27 | hadoop dfsadmin -refreshNodes
28 |
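To watch the decommission progress (the per-node status moves from "Decommission in progress" to "Decommissioned"):

hadoop dfsadmin -report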
29 | Update for Jobtracker
30 | ----------------------
31 |
32 | hadoop mradmin -refreshNodes
33 |
34 |
35 | Add Hosts:
36 |
37 | 1. dfs.hosts in the hdfs-site.xml, mapred.hosts
38 |
39 |
40 | ================================================
41 |
42 | Cluster Balancing
43 | -----------------
44 |
45 | hadoop balancer -threshold 40
46 |
47 | ==============================================
48 |
49 | Add Disk Space to a datanode
50 | ----------------------------
51 |
52 | How do you add storage to cluster
53 |
54 |
55 |
56 |
57 |
58 |
59 | <property>
60 |   <name>dfs.hosts</name>
61 |   <value>/home/hadoop/include</value>
62 |   <final>true</final>
63 | </property>
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
--------------------------------------------------------------------------------
/Notes/repo_server.txt:
--------------------------------------------------------------------------------
1 | Setup Repo Server
2 | =================
3 |
4 | Mount Centos DVD and install:
5 |
6 | cd /media/Rhel 6 DVD/Packages/
7 |
8 | # yum install vsftpd*
9 | # yum install createrepo*
10 |
11 | # mkdir /var/ftp/pub/Centos65
12 |
13 | cp -a /media/RHEL_6_DVD/* /var/ftp/pub/Centos65/
14 |
15 | # createrepo -v /var/ftp/pub/Centos65/
16 |
17 |
18 | # service vsftpd restart
19 |
20 | ========================
21 |
22 | On all the nodes
23 |
24 | # rm -rf /etc/yum.repos.d/*
25 | # vi /etc/yum.repos.d/server.repo
26 |
27 | [server]
28 | name=Centos 6.5 repository
29 | baseurl=ftp://<repo_server>/pub/Centos65/
30 | gpgcheck=0
31 | enabled=1
32 |
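Quick check that the repo is usable from a client node (standard yum commands):

# yum clean all
# yum repolist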
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/Notes/scoop.txt:
--------------------------------------------------------------------------------
1 | export SQOOP_HOME=/usr/lib/sqoop
2 | export PATH=$PATH:$SQOOP_HOME/bin
3 |
4 |
5 | Step 2: Configure the MySQL Service and Connector
6 |
7 | Download mysql-connector-java-5.0.5.jar file and copy it to $SQOOP_HOME/lib directory.
8 |
9 | Step 3: Sqoop Installation
10 |
11 | Sqoop Installation Tutorial for instructions of how to install Sqoop.
12 |
13 | Database and table creation in MySQL
14 |
15 | First connect to MySQL
16 |
17 | $ mysql -u root -p
18 |
19 | Enter password:
20 |
21 | Create database ‘testDb’ and use ‘testDb’ database as a current database.
22 |
23 | mysql> create database testDb;
24 |
25 | mysql> use testDb;
26 |
27 | Create table ‘student’
28 |
29 | mysql> create table student(id integer,name char(20));
30 |
31 | Add following 2 records to the table
32 |
33 | mysql> insert into student values(1,'Archana');
34 |
35 | mysql> insert into student values(2,'XYZ');
36 |
37 | Exit from MySQL
38 |
39 | mysql> exit;
40 |
41 | Sqoop import
42 | 1. Importing a table into HDFS
43 |
44 | 1st way
45 |
46 | Command for import
47 |
48 | sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --m 1
49 |
50 | Execute the sqoop import
51 |
52 | Here we are using database ‘testDb’ , username ‘root’, password ‘hadoop123′, and table student.
53 |
54 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --m 1
55 |
56 | ——————- NOTE——————–
57 |
58 | If you have not defined primary key for your table then you have to give ‘-m 1′ option for import.
59 | Otherwise it gives error
60 | ERROR tool.ImportTool: Error during import: No primary key could be found for table student1. Please specify one with --split-by or perform a sequential import with '-m 1'.
61 |
62 | 2nd Way
63 |
64 | Create a config file $HOME/import.txt add following to the config file
65 |
66 | import.txt
67 |
68 | import
69 | --connect
70 | jdbc:mysql://localhost/testDb
71 | --username
72 | root
73 | --password
74 | hadoop123
75 |
76 | Execute the sqoop import
77 |
78 | sqoop --options-file /home/hduser/import.txt --table student -m 1
79 |
80 | Once import is done you can find student.jar, student.class and student.java at following location /tmp/sqoop-hduser/compile/—-/student.jar
81 |
82 | Files created in HDFS
83 |
84 | $ hadoop dfs -ls -R student
85 |
86 | Found 3 items
87 |
88 | -rw-r--r-- 1 hduser supergroup 0 2013-09-13 15:38 /user/hduser/student/_SUCCESS
89 |
90 | drwxr-xr-x - hduser supergroup 0 2013-09-13 15:38 /user/hduser/student/_logs
91 |
92 | -rw-r--r-- 1 hduser supergroup 16 2013-09-13 15:38 /user/hduser/student/part-m-00000
93 |
94 | Data file contents
95 |
96 | $ hadoop dfs -cat /user/hduser/student/part-m-00000
97 |
98 | 1,Archana
99 | 2,XYZ
100 |
101 | 2 Import all rows of a table in MySQL, but specific columns of the table
102 |
103 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --columns "name" -m 1
104 |
105 | Data file contents
106 |
107 | $ hadoop dfs -cat /user/hduser/student/part-m-00000
108 |
109 | Archana
110 | Xyz
111 |
112 | 3 Import all columns, filter rows using where clause
113 |
114 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --where "id>1" -m 1 --target-dir /user/hduser/ar
115 |
116 | Data file contents
117 |
118 | $ hadoop dfs -cat /user/hduser/ar/part-m-00000
119 | 2,XYZ
--------------------------------------------------------------------------------
/Notes/sqoop.txt:
--------------------------------------------------------------------------------
1 | export SQOOP_HOME=/usr/lib/sqoop
2 | export PATH=$PATH:$SQOOP_HOME/bin
3 |
4 |
5 | Step 2: Configure the MySQL Service and Connector
6 |
7 | Download mysql-connector-java-5.0.5.jar file and copy it to $SQOOP_HOME/lib directory.
8 |
9 | Step 3: Sqoop Installation
10 |
11 | Sqoop Installation Tutorial for instructions of how to install Sqoop.
12 |
13 | Database and table creation in MySQL
14 |
15 | First connect to MySQL
16 |
17 | $ mysql -u root -p
18 |
19 | Enter password:
20 |
21 | Create database ‘testDb’ and use ‘testDb’ database as a current database.
22 |
23 | mysql> create database testDb;
24 |
25 | mysql> use testDb;
26 |
27 | Create table ‘student’
28 |
29 | mysql> create table student(id integer,name char(20));
30 |
31 | Add following 2 records to the table
32 |
33 | mysql> insert into student values(1,'Archana');
34 |
35 | mysql> insert into student values(2,'XYZ');
36 |
37 | Exit from MySQL
38 |
39 | mysql> exit;
40 |
41 | Sqoop import
42 | 1. Importing a table into HDFS
43 |
44 | 1st way
45 |
46 | Command for import
47 |
48 | sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --m 1
49 |
50 | Execute the sqoop import
51 |
52 | Here we are using database ‘testDb’ , username ‘root’, password ‘hadoop123′, and table student.
53 |
54 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --m 1
55 |
56 | ——————- NOTE——————–
57 |
58 | If you have not defined primary key for your table then you have to give ‘-m 1′ option for import.
59 | Otherwise it gives error
60 | ERROR tool.ImportTool: Error during import: No primary key could be found for table student1. Please specify one with --split-by or perform a sequential import with '-m 1'.
61 |
62 | 2nd Way
63 |
64 | Create a config file $HOME/import.txt add following to the config file
65 |
66 | import.txt
67 |
68 | import
69 | --connect
70 | jdbc:mysql://localhost/testDb
71 | --username
72 | root
73 | --password
74 | hadoop123
75 |
76 | Execute the sqoop import
77 |
78 | sqoop --options-file /home/hduser/import.txt --table student -m 1
79 |
80 | Once import is done you can find student.jar, student.class and student.java at following location /tmp/sqoop-hduser/compile/—-/student.jar
81 |
82 | Files created in HDFS
83 |
84 | $ hadoop dfs -ls -R student
85 |
86 | Found 3 items
87 |
88 | -rw-r--r-- 1 hduser supergroup 0 2013-09-13 15:38 /user/hduser/student/_SUCCESS
89 |
90 | drwxr-xr-x - hduser supergroup 0 2013-09-13 15:38 /user/hduser/student/_logs
91 |
92 | -rw-r--r-- 1 hduser supergroup 16 2013-09-13 15:38 /user/hduser/student/part-m-00000
93 |
94 | Data file contents
95 |
96 | $ hadoop dfs -cat /user/hduser/student/part-m-00000
97 |
98 | 1,Archana
99 | 2,XYZ
100 |
101 | 2 Import all rows of a table in MySQL, but specific columns of the table
102 |
103 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --columns "name" -m 1
104 |
105 | Data file contents
106 |
107 | $ hadoop dfs -cat /user/hduser/student/part-m-00000
108 |
109 | Archana
110 | Xyz
111 |
112 | 3 Import all columns, filter rows using where clause
113 |
114 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --where "id>1" -m 1 --target-dir /user/hduser/ar
115 |
116 | Data file contents
117 |
118 | $ hadoop dfs -cat /user/hduser/ar/part-m-00000
119 | 2,XYZ
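The notes above cover imports only; going the other way uses sqoop export (a sketch reusing the same database and credentials; the MySQL table must already exist):

$ sqoop export --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --export-dir /user/hduser/student -m 1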
--------------------------------------------------------------------------------
/Notes/sqoop1.txt:
--------------------------------------------------------------------------------
1 | sqoop list-databases --connect jdbc:mysql://client.cluster1.com/employee --username sqoop --password passwd
2 |
3 | sqoop list-tables --connect jdbc:mysql://client.cluster1.com/employee --username sqoop --password passwd
4 |
5 |
6 | sqoop import --connect jdbc:mysql://client.cluster1.com/employee --username sqoop --password passwd --table student -m 1 --target-dir /user/sqoop/employee
7 |
8 | sqoop import --connect jdbc:mysql://client.cluster1.com/employee --username sqoop --password passwd --table student -m 1 --target-dir /user/sqoop/employee
9 |
10 |
11 | sqoop --options-file SqoopImportOptions.txt \
12 | --table employees \
13 | --where "emp_no > 499948" \
14 | --as-textfile \
15 | -m 1 \
16 | --target-dir /user/airawat/sqoop-mysql/employeeGtTest
17 |
18 |
19 | sqoop --options-file SqoopImportOptions.txt \
20 | --query 'select EMP_NO,FIRST_NAME,LAST_NAME from employees where $CONDITIONS' \
21 | --fetch-size=50000 \
22 | --split-by EMP_NO \
23 | --direct \
24 | --target-dir /user/airawat/sqoop-mysql/FetchSize
25 |
26 | sqoop --options-file SqoopImportOptions.txt \
28 | --query 'select EMP_NO,FIRST_NAME,LAST_NAME from employees where $CONDITIONS' \
29 | -z \
30 | --split-by EMP_NO \
31 | --direct \
32 | --target-dir /user/airawat/sqoop-mysql/CompressedSampl
33 |
34 | =================
35 | mysql> create table employee(id varchar(20),name varchar(20),salary varchar(10));
36 |
37 | hive -> CREATE External TABLE emp_hive (id INT, name STRING, salary STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE location '/user/hadoop/table';
38 |
39 |
40 | sqoop import --connect jdbc:mysql://repo.cluster1.com/test --username hadoop --password hivepassword --table employee --target-dir /user/hadoop/table -m 1 --incremental append --check-column id
41 |
42 | #!/bin/bash
43 |
44 | for i in `seq 1 100`
45 | do
46 | echo "insert into test.employee(id,name,salary) values('${i}','Am${i}','10000');"
47 | done
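Usage sketch for the loop above: save it as a script (the name gen_rows.sh is just an example) and pipe the generated statements into MySQL:

sh gen_rows.sh | mysql -u hadoop -p test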
48 |
--------------------------------------------------------------------------------
/Notes/yarn.txt:
--------------------------------------------------------------------------------
1 | hadoop-daemon.sh start namenode
2 | hadoop-daemon.sh start datanode
3 |
4 | yarn-daemon.sh start resourcemanager
5 | yarn-daemon.sh start nodemanager
6 |
7 |
8 | <property>
9 |   <name>yarn.resourcemanager.address</name>
10 |   <value>ha-nn1.hacluster1.com:8032</value>
11 |   <description>The host is the hostname of the ResourceManager and the port is the port on
12 |   which the clients can talk to the ResourceManager.</description>
13 | </property>
14 |
15 | <property>
16 |   <name>yarn.resourcemanager.scheduler.address</name>
17 |   <value>ha-nn1.hacluster1.com:8030</value>
18 |   <description>The host is the hostname of the ResourceManager and the port is the port
19 |   on which the Applications in the cluster talk to the ResourceManager.</description>
20 | </property>
21 |
22 | <property>
23 |   <name>yarn.resourcemanager.resource-tracker.address</name>
24 |   <value>ha-nn1.hacluster1.com:8031</value>
25 |   <description>The host is the hostname of the ResourceManager and
26 |   the port is the port on which the NodeManagers contact the ResourceManager.</description>
27 | </property>
28 |
29 | <property>
30 |   <name>yarn.nodemanager.address</name>
31 |   <value>0.0.0.0:9004</value>
32 |   <description>The NodeManagers bind to this port.</description>
33 | </property>
34 |
35 |
36 |
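Once the ResourceManager and NodeManagers are up, a quick sanity check (standard YARN commands):

yarn node -list
yarn application -list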
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ```
2 | @ Netxillon Technologies. You are allowed to use and modify any work here, provided you acknowledge the source.
3 | Please contact at trainings@netxillon.com for any questions.
4 |
5 | Disclaimer: No responsibility is taken for any kind of damage caused by using this GitHub repository. Please make sure you understand the things here before implementing them in production.
6 | ```
7 | ```
8 | http://www.netxillon.com
9 | For any help you can reach me at: trainings@netxillon.com
10 | ```
11 |
12 | #### Courses
13 |
14 | Hadoop Cluster Configurations
15 | The config files are from running cluster. Feel free to use them, but please drop an email with your feedback.
16 |
17 | I provide Advanced Hadoop Administration and DevOps trainings:
18 | > Hadoop, HBase, Kafka, Spark
19 | > Ansible automation for Hadoop Stack
20 | > Advanced Linux Optimizations
21 |
22 | Advanced Hadoop Training: I will be covering topics like detailed Kerberos, encryption, centralized caching, storage policies, Ranger, Knox, Hadoop performance tuning and production use cases. Contact me for details.
23 |
24 | > "Doing a course is not a guarantee for a job, but having a solid foundation surely is"
25 |
26 | For Details on Courses offered, please refer to the folder **Courses_Offered**.
27 |
--------------------------------------------------------------------------------
/Schedulers/capacity-scheduler.xml:
--------------------------------------------------------------------------------
1 |
14 |
15 |
16 |
17 | yarn.scheduler.capacity.maximum-applications
18 | 10000
19 |
20 | Maximum number of applications that can be pending and running.
21 |
22 |
23 |
24 |
25 | yarn.scheduler.capacity.maximum-am-resource-percent
26 | 0.1
27 |
28 | Maximum percent of resources in the cluster which can be used to run
29 | application masters i.e. controls number of concurrent running
30 | applications.
31 |
32 |
33 |
34 |
35 | yarn.scheduler.capacity.resource-calculator
36 | org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator
37 |
38 | The ResourceCalculator implementation to be used to compare
39 | Resources in the scheduler.
40 | The default i.e. DefaultResourceCalculator only uses Memory while
41 | DominantResourceCalculator uses dominant-resource to compare
42 | multi-dimensional resources such as Memory, CPU etc.
43 |
44 |
45 |
46 |
47 | yarn.scheduler.capacity.root.queues
48 | default,sales,marketing
49 |
50 | The queues at this level (root is the root queue).
51 |
52 |
53 |
54 |
55 | yarn.scheduler.capacity.root.default.capacity
56 | 50
57 | Default queue target capacity.
58 |
59 |
60 |
61 | yarn.scheduler.capacity.root.default.user-limit-factor
62 | 1
63 |
64 | Default queue user limit a percentage from 0.0 to 1.0.
65 |
66 |
67 |
68 |
69 | yarn.scheduler.capacity.root.default.maximum-capacity
70 | 100
71 |
72 | The maximum capacity of the default queue.
73 |
74 |
75 |
76 |
77 | yarn.scheduler.capacity.root.default.state
78 | RUNNING
79 |
80 | The state of the default queue. State can be one of RUNNING or STOPPED.
81 |
82 |
83 |
84 |
85 | yarn.scheduler.capacity.root.default.acl_submit_applications
86 | *
87 |
88 | The ACL of who can submit jobs to the default queue.
89 |
90 |
91 |
92 |
93 | yarn.scheduler.capacity.root.default.acl_administer_queue
94 | *
95 |
96 | The ACL of who can administer jobs on the default queue.
97 |
98 |
99 |
100 |
101 | yarn.scheduler.capacity.node-locality-delay
102 | 40
103 |
104 | Number of missed scheduling opportunities after which the CapacityScheduler
105 | attempts to schedule rack-local containers.
106 | Typically this should be set to the number of nodes in the cluster. By default it is set to 40,
107 | which approximates the number of nodes in one rack.
108 |
109 |
110 |
111 | # sales queue
112 |
113 |
114 | yarn.scheduler.capacity.root.sales.capacity
115 | 30
116 |
117 |
118 |
119 | yarn.scheduler.capacity.root.sales.user-limit-factor
120 | 1
121 |
122 |
123 |
124 | yarn.scheduler.capacity.root.sales.maximum-capacity
125 | 100
126 |
127 |
128 |
129 | yarn.scheduler.capacity.root.sales.state
130 | RUNNING
131 |
132 |
133 |
134 | yarn.scheduler.capacity.root.sales.acl_submit_applications
135 | *
136 |
137 |
138 |
139 | yarn.scheduler.capacity.root.sales.acl_administer_queue
140 | *
141 |
142 |
143 | # Marketing Queue
144 |
145 |
146 | yarn.scheduler.capacity.root.marketing.capacity
147 | 20
148 |
149 |
150 |
151 | yarn.scheduler.capacity.root.marketing.user-limit-factor
152 | 1
153 |
154 |
155 |
156 | yarn.scheduler.capacity.root.marketing.maximum-capacity
157 | 100
158 |
159 |
160 |
161 | yarn.scheduler.capacity.root.marketing.state
162 | RUNNING
163 |
164 |
165 |
166 | yarn.scheduler.capacity.root.marketing.acl_submit_applications
167 | *
168 |
169 |
170 |
171 | yarn.scheduler.capacity.root.marketing.acl_administer_queue
172 | *
173 |
174 |
175 |
176 | yarn.scheduler.capacity.queue-mappings
177 |
178 |
179 | A list of mappings that will be used to assign jobs to queues
180 | The syntax for this list is [u|g]:[name]:[queue_name][,next mapping]*
181 | Typically this list will be used to map users to queues,
182 | for example, u:%user:%user maps all users to queues with the same name
183 | as the user.
184 |
185 |
186 |
187 |
188 | yarn.scheduler.capacity.queue-mappings-override.enable
189 | false
190 |
191 | If a queue mapping is present, will it override the value specified
192 | by the user? This can be used by administrators to place jobs in queues
193 | that are different than the one specified by the user.
194 | The default is false.
195 |
196 |
197 |
198 |
199 |
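A quick usage note: the capacities of the child queues at any one level must add up to 100 — here default 50 + sales 30 + marketing 20 = 100. After editing this file, the change can be applied without restarting the ResourceManager:

$ yarn rmadmin -refreshQueues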
--------------------------------------------------------------------------------
/Schedulers/commands:
--------------------------------------------------------------------------------
1 | Hadoop 1:
2 | hadoop jar hadoop/hadoop-examples-1.2.1.jar wordcount -Dmapred.job.queue.name=high /project/input /output2233231
3 |
4 | Hadoop 2:
5 | yarn jar hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.3.jar wordcount -Dmapred.job.queue.name=sales /test /out
6 |
7 | Useful Commands:
8 | $ yarn rmadmin -refreshQueues
9 | $ mapred queue -list
10 |
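A couple of extra checks that can help when testing the queues (queue name "sales" as defined in capacity-scheduler.xml here):

$ yarn queue -status sales
$ mapred queue -showacls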
--------------------------------------------------------------------------------
/Schedulers/fair-scheduler.xml:
--------------------------------------------------------------------------------
1 | In Hadoop 1, we used the concept of a "pool" as well, but it was later standardized to queues.
2 |
3 |
4 |
5 |
6 | 10
7 | 5
8 |
9 |
10 |
11 | #Examples
12 |
13 |
14 | 10000 mb,0vcores
15 | 90000 mb,0vcores
16 | 50
17 | 0.1
18 | 2.0
19 | fair
20 |
21 | charlie
22 | 5000 mb,0vcores
23 |
24 |
25 |
26 | 0.5
27 |
28 |
30 |
31 | 3.0
32 |
33 |
34 |
35 | 30
36 |
37 | 5
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/Schedulers/mapred-site.xml:
--------------------------------------------------------------------------------
1 |
2 | mapred.job.tracker
3 | jt.cluster1.com:9001
4 |
5 |
6 |
7 | mapred.jobtracker.taskScheduler
8 | org.apache.hadoop.mapred.FairScheduler
9 |
10 |
11 |
12 | mapred.fairscheduler.allocation.file
13 | /home/hadoop/hadoop/conf/fair-scheduler.xml
14 |
15 |
16 |
17 | mapred.fairscheduler.poolnameproperty
18 | mapred.job.queue.name
19 | true
20 |
21 |
22 |
23 | mapred.queue.names
24 | default,high,low
25 |
26 |
--------------------------------------------------------------------------------
/Schedulers/user-mappings.txt:
--------------------------------------------------------------------------------
1 |
2 | yarn.scheduler.capacity.queue-mappings
3 | u:hdfs:marketing
4 |
5 | A list of mappings that will be used to assign jobs to queues
6 | The syntax for this list is [u|g]:[name]:[queue_name][,next mapping]*
7 | Typically this list will be used to map users to queues,
8 | for example, u:%user:%user maps all users to queues with the same name
9 | as the user.
10 |
11 |
12 |
13 | u:%user:%primary_group
14 |
15 |
16 | yarn.scheduler.capacity.queue-mappings
17 | u:%user:%primary_group
18 |
19 |
20 |
21 | yarn.scheduler.capacity.queue-mappings
22 | u:maria:engineering,g:webadmins:weblog
23 |
24 |
25 |
26 | yarn.scheduler.capacity.queue-mappings-override.enable
27 | false
28 |
29 | If a queue mapping is present and override is set to true, it will override the queue value specified
30 | by the user. This can be used by administrators to place jobs in queues
31 | that are different than the one specified by the user.
32 | The default is false - user can specify to a non-default queue.
33 |
34 |
35 |
--------------------------------------------------------------------------------
/Schedulers/yarn-site.xml_capacity:
--------------------------------------------------------------------------------
1 | # Capacity Scheduler is the default scheduler. So, we do not need to configure the below in Hadoop 2.x
2 |
3 |
4 |
5 | yarn.resourcemanager.scheduler.class
6 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
7 | yarn-default.xml
8 |
9 |
--------------------------------------------------------------------------------
/Schedulers/yarn-site.xml_fair:
--------------------------------------------------------------------------------
1 |
2 | yarn.resourcemanager.scheduler.class
3 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler
4 |
5 |
6 |
7 | yarn.scheduler.fair.allocation.file
8 | hadoop/conf/fair-scheduler.xml
9 |
10 |
11 |
--------------------------------------------------------------------------------
/Security/README.md:
--------------------------------------------------------------------------------
1 | Important Points before starting with Security:
2 | ===============================================
3 | 1. Ensure NTP is working and all nodes are in sync.
4 | 2. Ensure every system has sufficient entropy, at least 1000; refer to the installation of rngd in the Kerberos install script (see the quick check below this list).
5 | - This will ensure faster cryptography for keys, principals, etc.
6 | 3. For Kerberos, make sure Java is patched with the unlimited-strength (JCE) policy so unrestricted key lengths are available.
7 | 4. If not using SASL for Datanodes, ensure JSVC_HOME is set to the jsvc binary.
8 |
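A quick entropy check (standard Linux kernel interface; values consistently above ~1000 are comfortable):

$ cat /proc/sys/kernel/random/entropy_avail
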
9 | This is a very vast topic with lots of things to talk about:
10 |
11 | - The integrations can be with AD, FreeIPA, OpenLDAP, Kerberos.
12 | - SIEM or Rhino, etc.
13 |
14 | For any specific needs, please contact me at trainings@netxillon.com
15 |
--------------------------------------------------------------------------------
/Security/SSL_Configs/CA/README.txt:
--------------------------------------------------------------------------------
1 | This is to setup CA and get all certs signed by CA.
2 |
--------------------------------------------------------------------------------
/Security/SSL_Configs/commands_CA_JKS:
--------------------------------------------------------------------------------
1 | # yum install openssl-devel
2 | cd /etc/pki/CA/
3 |
4 | ls -l crl/
5 | ls -l newcerts/
6 | ls -l private/
7 | vi /etc/pki/tls/openssl.cnf
8 | touch /etc/pki/CA/index.txt
9 | echo 01 > /etc/pki/CA/serial
10 |
11 | openssl genrsa -out private/myca.key -des3 2048
12 | or openssl genrsa -out private/myca.key -aes128 2048
13 |
14 | openssl req -new -x509 -key private/myca.key -days 365 > CA.crt
15 | ----------------
16 | more refined way:
17 | openssl req -new -sha256 -key private/myca.key -nodes -out rootCA.csr
18 | openssl x509 -req -days 3650 -extensions v3_ca -in rootCA.csr -signkey private/myca.key -out rootCA.pem
19 | ------------------
20 |
21 | mkdir certs
22 | cd certs/
23 | openssl req -new -newkey rsa:2048 -nodes -keyout dilithium.key -out dilithium.csr
24 |
25 | openssl ca -in dilithium.csr -out dilithium.crt
26 | openssl req -new -newkey rsa:2048 -nodes -keyout cluster1.key -out cluster1.csr
27 | openssl ca -in cluster1.csr -out cluster1.crt
28 | openssl req -new -newkey rsa:2048 -nodes -keyout cluster1.key -out cluster1.csr
29 | openssl ca -in cluster1.csr -out cluster1.crt
30 |
31 | openssl verify -CAfile /etc/pki/CA/CA.crt certs/dilithium.crt
32 |
33 |
34 | openssl verify cluster1.crt
35 | openssl verify dilithium.crt
36 |
37 |
38 | Hadoop JKS steps: CA signed
39 | ---------------------------
40 |
41 | keytool -genkey -alias `hostname -s` -keyalg RSA -dname "CN=`hostname -f`,OU=Netxillon Technologies,O=Netxillon Technologies,L=Melbourne,ST=Victoria,C=AU" -keypass password -keystore keystore.jks -storepass password
42 |
43 | keytool -certreq -alias `hostname -s` -keyalg RSA -file `hostname -s`.csr -keystore keystore.jks -storepass password
44 |
45 | openssl ca -batch -passin pass:redhat -in `hostname -s`.csr -out `hostname -s`.crt
46 |
47 | keytool -import -keystore keystore.jks -file CA.crt -alias CARoot -storepass password -noprompt
48 |
49 | keytool -import -keystore keystore.jks -file `hostname -s`.crt -alias `hostname -s` -keypass password -storepass password -noprompt
50 |
51 | keytool -importcert -keystore truststore.jks -file CA.crt -alias CARoot -storepass password -noprompt
52 |
53 | Good to do:
54 | ===========
55 | keytool -exportcert -alias caroot -keystore /etc/security/keys/truststore.jks -file /usr/java/default/jre/lib/security/cacerts
56 |
57 |
58 | Verify PEM format or not
59 | ========================
60 |
61 | openssl x509 -inform PEM -in CA.crt
62 | openssl x509 -inform PEM -in CA.pem
63 | openssl x509 -inform PEM -in cm1.opta.com-server.pem
64 |
65 | Verify cert presented by Server
66 | --------------------------------
67 | openssl s_client -verify 100 -showcerts -CAfile <($JAVA_HOME/bin/keytool -list -rfc -keystore $JAVA_HOME/jre/lib/security/jssecacerts -storepass changeit) -connect cm1.opta.com:7183
68 |
69 | openssl s_client -connect cm1.opta.com:7183 2>/dev/null
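A quick sanity check on the resulting keystore and truststore contents (standard keytool usage, with the same passwords as above):

keytool -list -v -keystore keystore.jks -storepass password
keytool -list -keystore truststore.jks -storepass password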

--------------------------------------------------------------------------------
/Security/SSL_Configs/hadoop_ssl_configs/core-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | fs.default.name
10 | hdfs://nn1.cluster1.com:9000
11 |
12 |
13 |
14 | hadoop.rpc.protection
15 | privacy
16 |
17 |
18 |
19 | hadoop.ssl.require.client.cert
20 | false
21 |
22 |
23 |
24 | hadoop.ssl.hostname.verifier
25 | DEFAULT
26 |
27 |
28 |
29 | hadoop.ssl.keystores.factory.class
30 | org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory
31 |
32 |
33 |
34 | hadoop.ssl.server.conf
35 | ssl-server.xml
36 |
37 |
38 |
39 | hadoop.ssl.client.conf
40 | ssl-client.xml
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/Security/SSL_Configs/hadoop_ssl_configs/hdfs-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | dfs.encrypt.data.transfer
10 | true
11 |
12 |
13 |
14 | dfs.block.access.token.enable
15 | true
16 |
17 |
18 |
19 | dfs.data.transfer.protection
20 | privacy
21 |
22 |
23 |
24 | dfs.namenode.secondary.https-address
25 | nn1.cluster1.com:50091
26 |
27 |
28 |
29 | dfs.namenode.https-address
30 | nn1.cluster1.com:50470
31 |
32 |
33 |
34 | dfs.webhdfs.enabled
35 | true
36 |
37 |
38 |
39 | dfs.https.enable
40 | true
41 |
42 |
43 |
44 | dfs.http.policy
45 | HTTPS_ONLY
46 |
47 |
48 |
49 | dfs.name.dir
50 | /data/nn1,/data/nn2
51 |
52 |
53 |
54 | dfs.data.dir
55 | /data/d1,/data/d2
56 |
57 |
58 |
59 | dfs.replication
60 | 1
61 |
62 |
63 |
64 | dfs.datanode.https.address
65 | 0.0.0.0:50475
66 |
67 |
68 |
69 | dfs.datanode.address
70 | 0.0.0.0:10019
71 |
72 |
73 |
74 | dfs.datanode.http.address
75 | 0.0.0.0:10022
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/Security/SSL_Configs/hadoop_ssl_configs/mapred-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | mapreduce.framework.name
10 | yarn
11 |
12 |
13 |
14 | hadoop.ssl.enabled
15 | true
16 |
17 |
18 |
19 | mapreduce.shuffle.ssl.enabled
20 | true
21 |
22 |
23 |
24 | hadoop.ssl.require.client.cert
25 | false
26 |
27 |
28 |
29 | hadoop.ssl.hostname.verifier
30 | DEFAULT
31 | true
32 |
33 |
34 |
35 | hadoop.ssl.keystores.factory.class
36 | org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory
37 | true
38 |
39 |
40 |
41 | hadoop.ssl.server.conf
42 | ssl-server.xml
43 | true
44 |
45 |
46 |
47 | hadoop.ssl.client.conf
48 | ssl-client.xml
49 | true
50 |
51 |
52 |
53 |
54 | mapreduce.jobhistory.http.policy
55 | HTTPS_ONLY
56 |
57 |
58 |
59 | mapreduce.jobhistory.webapp.https.address
60 | rm1.cluster1.com:19889
61 |
62 |
63 |
64 |
--------------------------------------------------------------------------------
/Security/SSL_Configs/hadoop_ssl_configs/ssl-client.xml:
--------------------------------------------------------------------------------
1 | [hadoop@ip-172-31-15-180 ~]$ cat /etc/hadoop/conf/ssl-client.xml
2 |
3 |
4 |
5 |
6 |
7 |
8 | ssl.client.truststore.location
9 | ${user.home}/keystore/final.jks
10 | Truststore to be used by clients like distcp. Must be
11 | specified.
12 |
13 |
14 |
15 |
16 | ssl.client.truststore.password
17 | password
18 | Optional. Default value is "".
19 |
20 |
21 |
22 |
23 | ssl.client.truststore.type
24 | jks
25 | Optional. The keystore file format, default value is "jks".
26 |
27 |
28 |
29 |
30 | ssl.client.truststore.reload.interval
31 | 10000
32 | Truststore reload check interval, in milliseconds.
33 | Default value is 10000 (10 seconds).
34 |
35 |
36 |
37 |
38 | ssl.client.keystore.location
39 | ${user.home}/keystore/keystore.jks
40 | Keystore to be used by clients like distcp. Must be
41 | specified.
42 |
43 |
44 |
45 |
46 | ssl.client.keystore.password
47 | password
48 | Optional. Default value is "".
49 |
50 |
51 |
52 |
53 | ssl.client.keystore.keypassword
54 | password
55 | Optional. Default value is "".
56 |
57 |
58 |
59 |
60 | ssl.client.keystore.type
61 | jks
62 | Optional. The keystore file format, default value is "jks".
63 |
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
/Security/SSL_Configs/hadoop_ssl_configs/ssl-server.xml:
--------------------------------------------------------------------------------
1 | [hadoop@ip-172-31-15-180 ~]$ cat /etc/hadoop/conf/ssl-server.xml
2 |
3 |
4 |
5 |
6 |
7 |
8 | ssl.server.keystore.type
9 | jks
10 |
11 |
12 | ssl.server.keystore.location
13 | /home/hadoop/keystore/keystore.jks
14 |
15 |
16 | ssl.server.keystore.password
17 | password
18 |
19 |
20 |
21 |
22 | ssl.server.truststore.type
23 | jks
24 |
25 |
26 | ssl.server.truststore.location
27 | /home/hadoop/keystore/truststore.jks
28 |
29 |
30 | ssl.server.truststore.password
31 | password
32 |
33 |
34 | ssl.server.truststore.reload.interval
35 | 10000
36 |
37 |
38 |
--------------------------------------------------------------------------------
/Security/SSL_Configs/hadoop_ssl_configs/yarn-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | yarn.resourcemanager.resource-tracker.address
9 | rm1.cluster1.com:9001
10 |
11 |
12 |
13 | yarn.resourcemanager.scheduler.address
14 | rm1.cluster1.com:9002
15 |
16 |
17 |
18 | yarn.resourcemanager.address
19 | rm1.cluster1.com:9003
20 |
21 |
22 |
23 | yarn.nodemanager.aux-services
24 | mapreduce_shuffle
25 |
26 |
27 |
28 | yarn.nodemanager.aux-services.mapreduce.shuffle.class
29 | org.apache.hadoop.mapred.ShuffleHandler
30 |
31 |
32 |
33 | yarn.http.policy
34 | HTTPS_ONLY
35 |
36 |
37 |
38 | yarn.resourcemanager.webapp.https.address
39 | rm1.cluster1.com:8089
40 |
41 |
42 |
43 | yarn.log.server.url
44 | https://rm1.cluster1.com:19889/jobhistory/logs
45 |
46 |
47 |
48 | yarn.nodemanager.webapp.https.address
49 | 0.0.0.0:8090
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/Security/kerberos/JT/core-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | fs.default.name
10 | hdfs://nn1.cluster1.com:9000
11 |
12 |
13 |
14 | hadoop.security.authentication
15 | kerberos
16 |
17 |
18 |
19 | hadoop.security.authorization
20 | true
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/Security/kerberos/JT/hdfs-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | dfs.data.dir
10 | /space/d1
11 | true
12 |
13 |
14 |
15 | dfs.replication
16 | 1
17 |
18 |
19 |
20 | dfs.permissions.supergroup
21 | hadoop
22 |
23 |
24 |
25 | dfs.permissions.superusergroup
26 | hadoop
27 |
28 |
29 |
30 | dfs.datanode.data.dir.perm
31 | 700
32 |
33 |
34 |
35 | dfs.datanode.address
36 | 192.168.1.74:1004
37 |
38 |
39 |
40 | dfs.datanode.http.address
41 | 192.168.1.74:1006
42 |
43 |
44 |
45 | dfs.datanode.keytab.file
46 | /home/hadoop/dn.hdfs.keytab
47 |
48 |
49 |
50 | dfs.datanode.kerberos.principal
51 | dn/_HOST@CLUSTER1.COM
52 |
53 |
54 |
55 | dfs.datanode.kerberos.https.principal
56 | host/_HOST@CLUSTER1.COM
57 |
58 |
59 |
60 | dfs.namenode.kerberos.principal
61 | nn/_HOST@CLUSTER1.COM
62 |
63 |
64 |
65 |
--------------------------------------------------------------------------------
/Security/kerberos/JT/mapred-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | mapred.job.tracker
10 | jt1.cluster1.com:9001
11 |
12 |
13 |
14 | mapreduce.jobtracker.kerberos.principal
15 | mapred/_HOST@CLUSTER1.COM
16 |
17 |
18 |
19 | mapreduce.jobtracker.kerberos.https.principal
20 | host/_HOST@CLUSTER1.COM
21 |
22 |
23 |
24 | mapreduce.jobtracker.keytab.file
25 | /home/hadoop/mapred.keytab
26 |
27 |
28 |
29 | mapreduce.tasktracker.kerberos.principal
30 | mapred/_HOST@CLUSTER1.COM
31 |
32 |
33 |
34 | mapreduce.tasktracker.kerberos.https.principal
35 | host/_HOST@CLUSTER1.COM
36 |
37 |
38 |
39 | mapreduce.tasktracker.keytab.file
40 | /home/hadoop/tt.mapred.keytab
41 |
42 |
43 |
44 | mapred.local.dir
45 | /space/tmp
46 |
47 |
48 |
49 | mapreduce.tasktracker.group
50 | mapred
51 |
52 |
53 |
54 |
--------------------------------------------------------------------------------
/Security/kerberos/JT/taskcontroller.cfg:
--------------------------------------------------------------------------------
1 | mapred.local.dir=/space/tmp#configured value of mapred.local.dir. It can be a list of comma separated paths.
2 | hadoop.log.dir=/home/hadoop/log#configured value of hadoop.log.dir.
3 | mapred.tasktracker.tasks.sleeptime-before-sigkill=#sleep time before sig kill is to be sent to process group after sigterm is sent. Should be in seconds
4 | mapreduce.tasktracker.group=#configured value of mapreduce.tasktracker.group.
5 |
6 |
7 | mapred.task.tracker.task-controller
8 | org.apache.hadoop.mapred.LinuxTaskController
9 |
10 |
11 |
12 | mapreduce.tasktracker.group
13 | mapred
14 |
15 |
--------------------------------------------------------------------------------
/Security/kerberos/Jsvc_download.txt:
--------------------------------------------------------------------------------
1 | 1. http://commons.apache.org/proper/commons-daemon/download_daemon.cgi
2 |
3 | Download package: commons-daemon-1.1.0-native-src.tar.gz
4 |
5 | $ tar -xzvf commons-daemon-1.1.0-native-src.tar.gz
6 | $ cd commons-daemon-1.1.0-native-src/unix
7 | $ ./configure && make
8 | $ cp jsvc /usr/lib
9 |
10 |
11 | Under hadoop-env.sh
12 |
13 | export JSVC_HOME=/usr/lib
14 |
15 | 2. Directly download the binary: http://archive.apache.org/dist/commons/daemon/binaries/
16 |
--------------------------------------------------------------------------------
/Security/kerberos/Namenode_Datanode/core-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | fs.default.name
10 | hdfs://nn1.cluster1.com:9000
11 |
12 |
13 |
14 | hadoop.security.authentication
15 | kerberos
16 |
17 |
18 |
19 | hadoop.security.authorization
20 | true
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/Security/kerberos/Namenode_Datanode/hadoop-env.sh:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | # Set Hadoop-specific environment variables here.
18 |
19 | # The only required environment variable is JAVA_HOME. All others are
20 | # optional. When running a distributed configuration it is best to
21 | # set JAVA_HOME in this file, so that it is correctly defined on
22 | # remote nodes.
23 |
24 | export JAVA_HOME=/usr/java/latest
25 |
26 | # The java implementation to use.
27 | export JAVA_HOME=${JAVA_HOME}
28 |
29 | # The jsvc implementation to use. Jsvc is required to run secure datanodes
30 | # that bind to privileged ports to provide authentication of data transfer
31 | # protocol. Jsvc is not required if SASL is configured for authentication of
32 | # data transfer protocol using non-privileged ports.
33 | #export JSVC_HOME=${JSVC_HOME}
34 |
35 | export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}
36 |
37 | # Extra Java CLASSPATH elements. Automatically insert capacity-scheduler.
38 | for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do
39 | if [ "$HADOOP_CLASSPATH" ]; then
40 | export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
41 | else
42 | export HADOOP_CLASSPATH=$f
43 | fi
44 | done
45 |
46 | # The maximum amount of heap to use, in MB. Default is 1000.
47 | #export HADOOP_HEAPSIZE=
48 | #export HADOOP_NAMENODE_INIT_HEAPSIZE=""
49 |
50 | # Extra Java runtime options. Empty by default.
51 | export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true -Djavax.net.debug=ssl:handshake"
52 |
53 | # Command specific options appended to HADOOP_OPTS when specified
54 | export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} -Djavax.net.debug=ssl $HADOOP_NAMENODE_OPTS"
55 | export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"
56 |
57 | export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"
58 |
59 | export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
60 | export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS"
61 |
62 | # The following applies to multiple commands (fs, dfs, fsck, distcp etc)
63 | export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
64 | #HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"
65 |
66 | # On secure datanodes, user to run the datanode as after dropping privileges.
67 | # This **MUST** be uncommented to enable secure HDFS if using privileged ports
68 | # to provide authentication of data transfer protocol. This **MUST NOT** be
69 | # defined if SASL is configured for authentication of data transfer protocol
70 | # using non-privileged ports.
71 | export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}
72 |
73 | # Where log files are stored. $HADOOP_HOME/logs by default.
74 | #export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER
75 |
76 | # Where log files are stored in the secure data environment.
77 | export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
78 |
79 | ###
80 | # HDFS Mover specific parameters
81 | ###
82 | # Specify the JVM options to be used when starting the HDFS Mover.
83 | # These options will be appended to the options specified as HADOOP_OPTS
84 | # and therefore may override any similar flags set in HADOOP_OPTS
85 | #
86 | # export HADOOP_MOVER_OPTS=""
87 |
88 | ###
89 | # Advanced Users Only!
90 | ###
91 |
92 | # The directory where pid files are stored. /tmp by default.
93 | # NOTE: this should be set to a directory that can only be written to by
94 | # the user that will run the hadoop daemons. Otherwise there is the
95 | # potential for a symlink attack.
96 | export HADOOP_PID_DIR=${HADOOP_PID_DIR}
97 | export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}
98 |
99 | # A string representing this instance of hadoop. $USER by default.
100 | export HADOOP_IDENT_STRING=$USER
101 |
--------------------------------------------------------------------------------
/Security/kerberos/Namenode_Datanode/hdfs-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | dfs.name.dir
5 | /data/nn1,/data/nn2
6 |
7 |
8 |
9 | dfs.data.dir
10 | /data/d1,/data/d2
11 |
12 |
13 |
14 | dfs.replication
15 | 1
16 |
17 |
18 |
19 | dfs.permissions.supergroup
20 | hadoop
21 |
22 |
23 | # Kerberos configuration
24 |
25 |
26 | dfs.block.access.token.enable
27 | true
28 |
29 |
30 |
31 | dfs.namenode.keytab.file
32 | /opt/cluster/security/nn.hdfs.keytab
33 |
34 |
35 |
36 | dfs.namenode.kerberos.principal
37 | hdfs/_HOST@CLUSTER1.COM
38 |
39 |
40 |
41 | dfs.namenode.kerberos.http.principal
42 | host/_HOST@CLUSTER1.COM
43 |
44 |
45 |
46 | dfs.web.authentication.kerberos.principal
47 | HTTP/_HOST@CLUSTER1.COM
48 |
49 |
50 |
51 | dfs.namenode.kerberos.internal.spnego.principal
52 | ${dfs.web.authentication.kerberos.principal}
53 |
54 |
55 | # Datanode configuration
56 |
57 |
58 | dfs.datanode.data.dir.perm
59 | 700
60 |
61 |
62 |
63 | dfs.datanode.address
64 | 0.0.0.0:1004
65 |
66 |
67 |
68 | dfs.datanode.http.address
69 | 0.0.0.0:1006
70 |
71 |
72 |
73 | dfs.datanode.keytab.file
74 | /opt/cluster/security/dn.hdfs.keytab
75 |
76 |
77 |
78 | dfs.datanode.kerberos.principal
79 | hdfs/_HOST@CLUSTER1.COM
80 |
81 |
82 |
83 | dfs.datanode.kerberos.http.principal
84 | host/_HOST@CLUSTER1.COM
85 |
86 |
87 |
88 | dfs.web.authentication.kerberos.principal
89 | HTTP/_HOST@CLUSTER1.COM
90 |
91 |
92 |
93 |
--------------------------------------------------------------------------------
/Security/kerberos/Namenode_Datanode/mapred-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | mapred.job.tracker
10 | jt1.cluster1.com:9001
11 |
12 |
13 |
14 | mapreduce.jobtracker.kerberos.principal
15 | mapred/_HOST@CLUSTER1.COM
16 |
17 |
18 |
19 | mapreduce.jobtracker.kerberos.https.principal
20 | host/_HOST@CLUSTER1.COM
21 |
22 |
23 |
24 | mapreduce.jobtracker.keytab.file
25 | /home/hadoop/mapred.keytab
26 |
27 |
28 |
29 | mapreduce.tasktracker.kerberos.principal
30 | mapred/_HOST@CLUSTER1.COM
31 |
32 |
33 |
34 | mapreduce.tasktracker.kerberos.https.principal
35 | host/_HOST@CLUSTER1.COM
36 |
37 |
38 |
39 | mapreduce.tasktracker.keytab.file
40 | /home/hadoop/tt.mapred.keytab
41 |
42 |
43 |
44 | mapred.local.dir
45 | /space/tmp
46 |
47 |
48 |
49 | mapreduce.tasktracker.group
50 | mapred
51 |
52 |
53 |
54 |
--------------------------------------------------------------------------------
/Security/kerberos/Namenode_Datanode/taskcontroller.cfg:
--------------------------------------------------------------------------------
1 | mapred.local.dir=/space/tmp#configured value of mapred.local.dir. It can be a list of comma separated paths.
2 | hadoop.log.dir=/home/hadoop/log#configured value of hadoop.log.dir.
3 | mapred.tasktracker.tasks.sleeptime-before-sigkill=#sleep time before sig kill is to be sent to process group after sigterm is sent. Should be in seconds
4 | mapreduce.tasktracker.group=#configured value of mapreduce.tasktracker.group.
5 |
6 |
7 | mapred.task.tracker.task-controller
8 | org.apache.hadoop.mapred.LinuxTaskController
9 |
10 |
11 |
12 | mapreduce.tasktracker.group
13 | mapred
14 |
15 |
--------------------------------------------------------------------------------
/Security/kerberos/README.md:
--------------------------------------------------------------------------------
1 | In production remove legacy encryption algo's and use only:
2 |
3 | default_tkt_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96
4 | default_tgs_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96
5 | permitted_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96
6 |
7 |
8 | Debugging:
9 |
10 | $ export HADOOP_ROOT_LOGGER=TRACE,console; export HADOOP_JAAS_DEBUG=true; export HADOOP_OPTS="-Dsun.security.krb5.debug=true"
11 |
12 | $ hadoop fs -ls / > >(tee fsls-logfile.txt) 2>&1
13 |
14 | $ export KRB5_TRACE=/tmp/kinit.log
15 |
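Also handy while debugging: confirm which encryption types are actually present in a keytab or in the current ticket cache (standard MIT Kerberos tools; the keytab path is the one used elsewhere in this repo):

$ klist -e -k -t /opt/cluster/security/nn.hdfs.keytab
$ klist -e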
--------------------------------------------------------------------------------
/Security/kerberos/kdc.conf:
--------------------------------------------------------------------------------
1 | # On KDC server /var/kerberos/krb5kdc/kdc.conf
2 |
3 | [kdcdefaults]
4 | kdc_ports = 88
5 | kdc_tcp_ports = 88
6 |
7 | [realms]
8 | CLUSTER1.COM = {
9 | #master_key_type = aes256-cts
10 | max_renewable_life = 7d 0h 0m 0s #Needed for Kerberos auto ticket renewal for long-running jobs and the Hue Kerberos Ticket renewer
11 | acl_file = /var/kerberos/krb5kdc/kadm5.acl
12 | dict_file = /usr/share/dict/words
13 | admin_keytab = /var/kerberos/krb5kdc/kadm5.keytab
14 | supported_enctypes = aes256-cts:normal aes128-cts:normal
15 | default_principal_flags = +renewable #Needed for Kerberos auto ticket renewal for long-running jobs and the Hue Kerberos Ticket renewer
16 | }
17 |
18 |
19 |
20 | # Also, we need the below steps:
21 | kadmin.local: modprinc -maxrenewlife 90day krbtgt/NETXILLON.COM
22 | kadmin.local: modprinc -maxrenewlife 90day +allow_renewable hue/edge1.netxillon.com@NETXILLON.COM
23 |
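To confirm the renewable lifetime actually took effect, query the principal (standard kadmin usage; realm as above):

kadmin.local -q "getprinc krbtgt/NETXILLON.COM"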
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/README.md:
--------------------------------------------------------------------------------
1 | @Netxillon Technologies. I used these scripts for Hadoop 1.0 and 2.0; please update the service principals accordingly. For example, tt is no longer valid in Hadoop 2.0.
2 | We can use any name for a service principal, but to be consistent with naming conventions each service has its own respective principal.
3 |
4 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/add_users.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | for j in `cat user_list`
4 | do
5 | echo -e "hadoop\nhadoop" | kadmin.local -q "addprinc $j"
6 | done
7 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/copy_keytabs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | clush -g all --copy nn.hdfs.keytab --dest=/opt/cluster/security/
4 | clush -g all --copy dn.hdfs.keytab --dest=/opt/cluster/security/
5 | clush -g all --copy user.hdfs.keytab --dest=/opt/cluster/security/
6 | clush -g all -b "chown -R hdfs:hadoop /opt/cluster/"
7 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/create_dn_princs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Generate Hosts principals
4 |
5 | for i in `cat dn_host_list`
6 | do
7 | kadmin.local -q "addprinc -randkey host/$i"
8 | kadmin.local -q "addprinc -randkey HTTP/$i"
9 | kadmin.local -q "addprinc -randkey hdfs/$i"
10 | kadmin.local -q "xst -norandkey -k dn.hdfs.keytab host/$i"
11 | kadmin.local -q "xst -norandkey -k dn.hdfs.keytab HTTP/$i"
12 | kadmin.local -q "xst -norandkey -k dn.hdfs.keytab hdfs/$i"
13 | done
14 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/create_nn_princs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | for k in `cat nn_host_list`
4 | do
5 | kadmin.local -q "addprinc -randkey host/$k"
6 | kadmin.local -q "addprinc -randkey HTTP/$k"
7 | kadmin.local -q "addprinc -randkey hdfs/$k"
8 |
9 | kadmin.local -q "xst -norandkey -k nn.hdfs.keytab host/$k"
10 | kadmin.local -q "xst -norandkey -k nn.hdfs.keytab HTTP/$k"
11 | kadmin.local -q "xst -norandkey -k nn.hdfs.keytab hdfs/$k"
12 | done
13 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/create_partions.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | for i in `cat hosts`
4 | do
5 | ssh $i 'echo -e "o\nn\np\n1\n\n\nw"'
6 | done
7 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/create_user_keytab.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | for k in `cat user_host_list`
4 | do
5 | kadmin.local -q "xst -norandkey -k user.hdfs.keytab host/$k"
6 | done
7 |
8 | for p in `cat user_list`
9 | do
10 | kadmin.local -q "xst -norandkey -k user.hdfs.keytab $p"
11 | done
12 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/delete_list:
--------------------------------------------------------------------------------
1 | HTTP/dn1.cluster1.com@CLUSTER1.COM
2 | HTTP/dn2.cluster1.com@CLUSTER1.COM
3 | HTTP/dn3.cluster1.com@CLUSTER1.COM
4 | HTTP/dn4.cluster1.com@CLUSTER1.COM
5 | HTTP/nn1.cluster1.com@CLUSTER1.COM
6 | dn/dn1.cluster1.com@CLUSTER1.COM
7 | dn/dn2.cluster1.com@CLUSTER1.COM
8 | dn/dn3.cluster1.com@CLUSTER1.COM
9 | dn/dn4.cluster1.com@CLUSTER1.COM
10 | host/dn1.cluster1.com@CLUSTER1.COM
11 | host/dn2.cluster1.com@CLUSTER1.COM
12 | host/dn3.cluster1.com@CLUSTER1.COM
13 | host/dn4.cluster1.com@CLUSTER1.COM
14 | host/nn1.cluster1.com@CLUSTER1.COM
15 | nn/nn1.cluster1.com@CLUSTER1.COM
16 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/delete_princs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | source list_princs.sh | egrep "host|nn|http|dn|mapred|jt|tt" > delete_list
4 |
5 | for i in `cat delete_list`
6 | do
7 | kadmin.local -q "delprinc -force $i"
8 | done
9 |
10 | rm -rf *.keytab
11 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/dn_host_list:
--------------------------------------------------------------------------------
1 | dn1.cluster1.com
2 | dn2.cluster1.com
3 | dn3.cluster1.com
4 | dn4.cluster1.com
5 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/hosts:
--------------------------------------------------------------------------------
1 | 192.168.1.10
2 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/install_krb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | yum clean all
4 | yum install -y krb5-server krb5-workstation krb5-devel pam_krb5 krb5-libs
5 |
6 | yum install rng-tools -y
7 |
8 | echo 'EXTRAOPTIONS="-r /dev/urandom"' > /etc/sysconfig/rngd
9 | service rngd restart
10 | yum install ntp -y
11 |
12 | chkconfig ntpd on
13 | chkconfig rngd on
14 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/list_princs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | kadmin.local -q "listprincs"
4 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/nn_host_list:
--------------------------------------------------------------------------------
1 | nn1.cluster1.com
2 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/setup_kerberos.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Note: The Kerberos realm does not need to be the same as the domain name. Just update the [domain_realm] section mapping correctly.
4 | echo -e "redhat\nredhat" | kdb5_util create -r NETXILLON.COM -s
5 |
6 | echo -e "redhat\nredhat" | kadmin.local -q "addprinc root/admin"
7 |
8 | kadmin.local -q "ktadd -k /var/kerberos/krb5kdc/kadm5.keytab kadmin/admin"
9 | kadmin.local -q "ktadd -k /var/kerberos/krb5kdc/kadm5.keytab kadmin/changepw"
10 |
11 | /etc/init.d/kadmin restart
12 | /etc/init.d/krb5kdc restart
13 |
14 | chkconfig krb5kdc on
15 | chkconfig kadmin on
16 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/user_host_list:
--------------------------------------------------------------------------------
1 | nn1.cluster1.com
2 | dn1.cluster1.com
3 | dn2.cluster1.com
4 | dn3.cluster1.com
5 | dn4.cluster1.com
6 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_scripts/user_list:
--------------------------------------------------------------------------------
1 | hdfs
2 |
--------------------------------------------------------------------------------
/Security/kerberos/kerberos_user_mappings.txt:
--------------------------------------------------------------------------------
1 | This file talks about mapping Kerberos principals to local users.
2 |
3 | We can have a NN principal as:
4 | nn/_HOST@CLUSTER1.COM or hdfs/_HOST@CLUSTER1.COM
5 |
6 | If it is the first way, when the Datanode sends its user (dnUsername), it will come through as user "dn", which does not exist anywhere. So the NN will complain that the user "dn" is not part of the supergroup, which is correct!
7 |
8 | For this we need to map users as below in core-site.xml on all nodes.
9 |
10 |
11 | [hdfs@nn1 ~]$ hadoop org.apache.hadoop.security.HadoopKerberosName host/nn1.cluster1.com@CLUSTER1.COM
12 | Name: host/nn1.cluster1.com@CLUSTER1.COM to host
13 |
14 | [hdfs@nn1 ~]$ hadoop org.apache.hadoop.security.HadoopKerberosName dn/dn1.cluster1.com@CLUSTER1.COM
15 | Name: dn/dn1.cluster1.com@CLUSTER1.COM to dn
16 |
17 | After adding the rule as below in core-site.xml:
18 |
19 |
20 | hadoop.security.auth_to_local
21 |
22 | RULE:[2:$1/$2@$0](dn/.*@.*CLUSTER1.COM)s/.*/hdfs/
23 | DEFAULT
24 |
25 |
26 |
27 | [hdfs@nn1 hadoop]$ hadoop org.apache.hadoop.security.HadoopKerberosName host/nn1.cluster1.com@CLUSTER1.COM
28 | Name: host/nn1.cluster1.com@CLUSTER1.COM to host
29 | [hdfs@nn1 ~]$ hadoop org.apache.hadoop.security.HadoopKerberosName dn/dn1.cluster1.com@CLUSTER1.COM
30 | Name: dn/dn1.cluster1.com@CLUSTER1.COM to hdfs
31 |
32 | See above that the user "dn" is translated to user "hdfs", which is part of the supergroup. All Good!
33 |
34 | Good Read: https://www.cloudera.com/documentation/enterprise/5-8-x/topics/cdh_sg_kerbprin_to_sn.html
35 |
--------------------------------------------------------------------------------
/Security/kerberos/krb5.conf:
--------------------------------------------------------------------------------
1 | # On all nodes, including KDC: /etc/krb5.conf
2 |
3 | [libdefaults]
4 | default_realm = CLUSTER1.COM
5 | dns_lookup_realm = false
6 | dns_lookup_kdc = false
7 | ticket_lifetime = 24h
8 | renew_lifetime = 7d
9 | forwardable = true
10 |
11 | default_tkt_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96
12 | default_tgs_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96
13 | permitted_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96
14 |
15 | #default_tkt_enctypes = des3-cbc-sha1 des-cbc-crc
16 | #default_tgs_enctypes = des3-cbc-sha1 des-cbc-crc
17 | #permitted_enctypes = des3-cbc-sha1 des-cbc-crc
18 | udp_preference_limit = 1
19 |
20 | [realms]
21 | CLUSTER1.COM = {
22 | kdc = repo.cluster1.com:88
23 | admin_server = repo.cluster1.com:749
24 | default_domain = cluster1.com
25 | }
26 |
27 | [domain_realm]
28 | .cluster1.com = CLUSTER1.COM
29 | cluster1.com = CLUSTER1.COM
30 |
31 | [logging]
32 | kdc = FILE:/var/log/krb5kdc.log
33 | admin_server = FILE:/var/log/kadmin.log
34 | default = FILE:/var/log/krb5lib.log
35 |
--------------------------------------------------------------------------------
/Security/kms/kms-setup:
--------------------------------------------------------------------------------
1 | core-site.xml
2 |
3 |
4 | hadoop.security.key.provider.path
5 | kms://http@nn1.cluster1.com:16000/kms
6 |
7 |
8 | hdfs-site.xml file and make the changes shown here:
9 |
10 | dfs.encryption.key.provider.uri
11 | kms://http@nn1.cluster1.com:16000/kms
12 |
13 |
14 | /opt/cluster/hadoop/etc/hadoop/kms-env.sh:
15 |
16 | export KMS_TEMP=${KMS_HOME}/temp
17 |
18 |
19 | kms.sh start
20 | hadoop key list
21 | hadoop key create key1
22 | hadoop fs -mkdir /secure_zone
23 | hdfs crypto -createZone -keyName key1 -path /secure_zone
24 |
25 | hdfs crypto -listZones
26 |
27 | hadoop fs -put wordcount /secure_zone
28 | hadoop fs -cat /secure_zone/wordcount
29 | hadoop fs -mkdir /unsecure
30 |
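A couple of checks that the key and the encryption zone are really in place (standard hadoop key / hdfs crypto commands; paths as above):

hadoop key list -metadata
hdfs crypto -getFileEncryptionInfo -path /secure_zone/wordcount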
--------------------------------------------------------------------------------
/Security/ldap/Installation_steps:
--------------------------------------------------------------------------------
1 | yum -y install openldap compat-openldap openldap-clients openldap-servers openldap-servers-sql openldap-devel
2 | yum -y install nss-pam-ldapd pam_ldap
3 |
4 | cp /usr/share/openldap-servers/DB_CONFIG.example /var/lib/ldap/DB_CONFIG
5 | cp /usr/share/openldap-servers/slapd.conf.obsolete slapd.conf
6 |
7 | edit "slapd.conf" and add/change the lines as below:
8 |
9 | suffix "dc=cluster1,dc=com"
10 | rootdn "cn=Manager,dc=cluster1,dc=com"
11 | rootpw {SSHA}2F2+4O43lt9jnPLrh6gjJ8tIVksTSSEg
12 |
13 | The password is generated using "slappasswd"
14 |
15 | slaptest -f /etc/openldap/slapd.conf -F /etc/openldap/slapd.d/
16 |
17 | chown -R ldap:ldap /var/run/openldap/
18 | chown -R ldap:ldap /var/lib/ldap
19 | chown -R ldap:ldap /etc/openldap/slapd.d
20 |
21 | ldapadd -f base.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat
22 | ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat
23 | ldapadd -f base1.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat
24 | ldapadd -f users.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat
25 | ldapadd -f adduser.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat
26 |
27 | #PhpldapAdmin installation; need epel repo
28 |
29 | yum -y install httpd php php-ldap phpldapadmin
30 |
31 | Then change the files as below:
32 |
33 | /etc/phpldapadmin/config.php
34 |
35 | $servers->setValue('server','name','Netxillon LDAP Server');
36 | $servers->setValue('server','host','192.168.1.254');
37 | $servers->setValue('login','bind_id','cn=Manager,dc=cluster1,dc=com');
38 |
39 | comment //$servers->setValue('login','attr','uid');
40 | uncomment $servers->setValue('login','attr','dn');
41 |
42 | Change the Deny rule to Allow in the httpd config for phpldapadmin and restart httpd
43 |
44 | # On Client nodes:
45 |
46 | authconfig --enableldap --enableldapauth --ldapserver=repo.cluster1.com --ldapbasedn="dc=cluster1,dc=com" --enablemkhomedir --update
47 |
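A quick check that the client is really resolving users from LDAP (standard NSS tools; user hdfs1 as created in adduser.ldif):

getent passwd hdfs1
id hdfs1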
48 |
49 |
50 |
--------------------------------------------------------------------------------
/Security/ldap/addmembers.ldif:
--------------------------------------------------------------------------------
1 | dn: cn=hadoop,ou=groups,dc=cluster1,dc=com
2 | changetype: modify
3 | add: memberuid
4 | memberuid: hdfs1
5 |
--------------------------------------------------------------------------------
/Security/ldap/adduser.ldif:
--------------------------------------------------------------------------------
1 | dn: uid=hdfs1,ou=users,dc=cluster1,dc=com
2 | objectClass: top
3 | objectClass: account
4 | objectClass: posixAccount
5 | objectClass: shadowAccount
6 | cn: hdfs1
7 | uid: hdfs1
8 | uidNumber: 509
9 | gidNumber: 509
10 | homeDirectory: /home/hdfs1
11 | loginShell: /bin/bash
12 | gecos: adam
13 | userPassword: {crypt}x
14 | shadowLastChange: 0
15 | shadowMax: 0
16 | shadowWarning: 0
17 |
--------------------------------------------------------------------------------
/Security/ldap/base.ldif:
--------------------------------------------------------------------------------
1 | dn: dc=cluster1,dc=com
2 | objectClass: dcObject
3 | objectClass: organization
4 | dc: cluster1
5 | o : cluster1
6 |
--------------------------------------------------------------------------------
/Security/ldap/base1.ldif:
--------------------------------------------------------------------------------
1 | dn:ou=groups, dc=cluster1, dc=com
2 | objectclass: top
3 | objectclass: organizationalUnit
4 | ou: groups
5 |
6 | dn:ou=people, dc=cluster1, dc=com
7 | objectclass: top
8 | objectclass: organizationalUnit
9 | ou: people
10 |
--------------------------------------------------------------------------------
/Security/ldap/base2.ldif:
--------------------------------------------------------------------------------
1 | dn: dc=cluster1,dc=com
2 | objectClass: top
3 | objectclass: organization
4 | o: cluster1
5 |
--------------------------------------------------------------------------------
/Security/ldap/commands:
--------------------------------------------------------------------------------
1 | ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/cosine.ldif
2 | 962 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/*
3 | 963 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/collective.ldif
4 | 964 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/core.ldif
5 | 965 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/inetorgperson.ldif
6 | 966 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/nis.ldif
7 | 967 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/ppolicy.ldif
8 | 969 ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat
9 | 971 ldapadd -f adduser.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat
10 | 972 ldappasswd -s welcome123 -W -D "cn=Manager,dc=cluster1,dc=com" -x "uid=hdfs1,ou=users,dc=cluster1,dc=com"
11 | 974 yum install pam_ldap -y
12 | 976 cat /etc/openldap/ldap.conf
13 | 989 vi /etc/openldap/ldap.conf
14 | 997 authconfig --enableldap --enableldapauth --ldapserver=repo.cluster1.com --ldapbasedn="dc=cluster1,dc=com" --enablemkhomedir --update
15 | 998 cat /etc/openldap/ldap.conf
16 | 1004 cd ldap/
17 | 1006 ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat
18 | 1008 ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat
19 | 1010 ldapsearch -x -W -D "cn=Manager,dc=cluster1,dc=com" -b "uid=hdfs1,ou=users,dc=cluster1,dc=com" "(objectclass=*)"
20 | 1011 ldapsearch -x -W -D "cn=Manager,dc=cluster1,dc=com" -b "uid=hdfs1,ou=Users,dc=cluster1,dc=com" "(objectclass=*)"
21 | 1012 ldapdelete -W -D "cn=Manager,dc=cluster1,dc=com" "uid=hdfs1,ou=users,dc=cluster1,dc=com"
22 | 1013 ldapsearch -x -W -D "cn=Manager,dc=cluster1,dc=com" -b "uid=hdfs1,ou=Users,dc=cluster1,dc=com" "(objectclass=*)"
23 | 1016 ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat
24 | 1017 ldapadd -f adduser.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat
25 | 1020 ldapadd -f addmembers.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat
26 | 1024 ldapsearch -x -W -D "cn=Manager,dc=cluster1,dc=com" -b "uid=hdfs1,ou=Users,dc=cluster1,dc=com" "(objectclass=*)"
27 |
--------------------------------------------------------------------------------
/Security/ldap/groupadd.ldif:
--------------------------------------------------------------------------------
1 | dn: cn=hdfs1,ou=groups,dc=cluster1,dc=com
2 | objectClass: top
3 | objectClass: posixGroup
4 | gidNumber: 509
5 |
--------------------------------------------------------------------------------
/Security/ldap/slapd.conf.obsolete:
--------------------------------------------------------------------------------
1 | #
2 | # See slapd.conf(5) for details on configuration options.
3 | # This file should NOT be world readable.
4 | #
5 |
6 | include /etc/openldap/schema/corba.schema
7 | include /etc/openldap/schema/core.schema
8 | include /etc/openldap/schema/cosine.schema
9 | include /etc/openldap/schema/duaconf.schema
10 | include /etc/openldap/schema/dyngroup.schema
11 | include /etc/openldap/schema/inetorgperson.schema
12 | include /etc/openldap/schema/java.schema
13 | include /etc/openldap/schema/misc.schema
14 | include /etc/openldap/schema/nis.schema
15 | include /etc/openldap/schema/openldap.schema
16 | include /etc/openldap/schema/ppolicy.schema
17 | include /etc/openldap/schema/collective.schema
18 |
19 | # Allow LDAPv2 client connections. This is NOT the default.
20 | allow bind_v2
21 |
22 | # Do not enable referrals until AFTER you have a working directory
23 | # service AND an understanding of referrals.
24 | #referral ldap://root.openldap.org
25 |
26 | pidfile /var/run/openldap/slapd.pid
27 | argsfile /var/run/openldap/slapd.args
28 |
29 | # Load dynamic backend modules
30 | # - modulepath is architecture dependent value (32/64-bit system)
31 | # - back_sql.la overlay requires openldap-server-sql package
32 | # - dyngroup.la and dynlist.la cannot be used at the same time
33 |
34 | # modulepath /usr/lib/openldap
35 | # modulepath /usr/lib64/openldap
36 |
37 | # moduleload accesslog.la
38 | # moduleload auditlog.la
39 | # moduleload back_sql.la
40 | # moduleload chain.la
41 | # moduleload collect.la
42 | # moduleload constraint.la
43 | # moduleload dds.la
44 | # moduleload deref.la
45 | # moduleload dyngroup.la
46 | # moduleload dynlist.la
47 | # moduleload memberof.la
48 | # moduleload pbind.la
49 | # moduleload pcache.la
50 | # moduleload ppolicy.la
51 | # moduleload refint.la
52 | # moduleload retcode.la
53 | # moduleload rwm.la
54 | # moduleload seqmod.la
55 | # moduleload smbk5pwd.la
56 | # moduleload sssvlv.la
57 | # moduleload syncprov.la
58 | # moduleload translucent.la
59 | # moduleload unique.la
60 | # moduleload valsort.la
61 |
62 | # The next three lines allow use of TLS for encrypting connections using a
63 | # dummy test certificate which you can generate by running
64 | # /usr/libexec/openldap/generate-server-cert.sh. Your client software may balk
65 | # at self-signed certificates, however.
66 | TLSCACertificatePath /etc/openldap/certs
67 | TLSCertificateFile "\"OpenLDAP Server\""
68 | TLSCertificateKeyFile /etc/openldap/certs/password
69 |
70 | # Sample security restrictions
71 | # Require integrity protection (prevent hijacking)
72 | # Require 112-bit (3DES or better) encryption for updates
73 | # Require 63-bit encryption for simple bind
74 | # security ssf=1 update_ssf=112 simple_bind=64
75 |
76 | # Sample access control policy:
77 | # Root DSE: allow anyone to read it
78 | # Subschema (sub)entry DSE: allow anyone to read it
79 | # Other DSEs:
80 | # Allow self write access
81 | # Allow authenticated users read access
82 | # Allow anonymous users to authenticate
83 | # Directives needed to implement policy:
84 | # access to dn.base="" by * read
85 | # access to dn.base="cn=Subschema" by * read
86 | # access to *
87 | # by self write
88 | # by users read
89 | # by anonymous auth
90 | #
91 | # if no access controls are present, the default policy
92 | # allows anyone and everyone to read anything but restricts
93 | # updates to rootdn. (e.g., "access to * by * read")
94 | #
95 | # rootdn can always read and write EVERYTHING!
96 |
97 | # enable on-the-fly configuration (cn=config)
98 | database config
99 | access to *
100 | by dn.exact="gidNumber=0+uidNumber=0,cn=peercred,cn=external,cn=auth" manage
101 | by * none
102 |
103 | # enable server status monitoring (cn=monitor)
104 | database monitor
105 | access to *
106 | by dn.exact="gidNumber=0+uidNumber=0,cn=peercred,cn=external,cn=auth" read
107 | by dn.exact="cn=Manager,dc=my-domain,dc=com" read
108 | by * none
109 |
110 | #######################################################################
111 | # database definitions
112 | #######################################################################
113 |
114 | database bdb
115 | suffix "dc=my-domain,dc=com"
116 | checkpoint 1024 15
117 | rootdn "cn=Manager,dc=my-domain,dc=com"
118 | # Cleartext passwords, especially for the rootdn, should
119 | # be avoided. See slappasswd(8) and slapd.conf(5) for details.
120 | # Use of strong authentication encouraged.
121 | # rootpw secret
122 | # rootpw {crypt}ijFYNcSNctBYg
123 |
124 | # The database directory MUST exist prior to running slapd AND
125 | # should only be accessible by the slapd and slap tools.
126 | # Mode 700 recommended.
127 | directory /var/lib/ldap
128 |
129 | # Indices to maintain for this database
130 | index objectClass eq,pres
131 | index ou,cn,mail,surname,givenname eq,pres,sub
132 | index uidNumber,gidNumber,loginShell eq,pres
133 | index uid,memberUid eq,pres,sub
134 | index nisMapName,nisMapEntry eq,pres,sub
135 |
136 | # Replicas of this database
137 | #replogfile /var/lib/ldap/openldap-master-replog
138 | #replica host=ldap-1.example.com:389 starttls=critical
139 | # bindmethod=sasl saslmech=GSSAPI
140 | # authcId=host/ldap-master.example.com@EXAMPLE.COM
141 |
--------------------------------------------------------------------------------
/Security/ldap/test.ldif:
--------------------------------------------------------------------------------
1 | dn: cn=Jim Bob,ou=people,dc=cluster1,dc=com
2 | objectclass: top
3 | objectclass: person
4 | objectclass: organizationalPerson
5 | objectclass: inetOrgPerson
6 | cn: Jim Bob
7 | sn: Bob
8 | mail: jimbob@example.com
9 | ou: sales
10 |
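A sketch of loading this entry with ldapadd (the bind DN is assumed from the dc=cluster1,dc=com suffix used above, and the parent ou=people entry must already exist):

$ ldapadd -x -D "cn=Manager,dc=cluster1,dc=com" -W -f test.ldif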
--------------------------------------------------------------------------------
/Security/ldap/users.ldif:
--------------------------------------------------------------------------------
1 | dn: ou=Users,dc=cluster1,dc=com
2 | objectClass: organizationalUnit
3 | ou: Users
4 |
--------------------------------------------------------------------------------
/Spark/Spark_build:
--------------------------------------------------------------------------------
1 | $ which mvn
2 | /opt/apache-maven-3.3.9/bin/mvn
3 |
4 | $ cat /etc/profile.d/maven.sh
5 | export JAVA_HOME=/usr/java/latest
6 | export M3_HOME=/opt/apache-maven-3.3.9
7 | export PATH=$JAVA_HOME/bin:${M3_HOME}/bin:/home/ec2-user/jruby-9.1.1.0/bin/:$PATH
8 |
9 |
10 | Build:
11 |
12 | $ git clone git://git.apache.org/spark.git
13 | $ cd spark
14 | $ mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -Dscala-2.11 -Phive -Phive-thriftserver -DskipTests clean package
15 |
16 | To build a distribution:
17 |
18 | ./dev/make-distribution.sh --tgz -Phadoop-2.7 -Phive -Phive-thriftserver -Pyarn -DskipTests
19 |
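The distribution script drops a tarball at the top of the Spark source tree; a minimal sketch of installing it (directory names are illustrative and depend on the version built):

$ ls spark-*-bin-*.tgz
$ tar -xzf spark-*-bin-*.tgz -C /opt/
$ export SPARK_HOME=/opt/<extracted spark directory>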
--------------------------------------------------------------------------------
/Spark/examples.txt:
--------------------------------------------------------------------------------
1 | # To get rid of the warning on Hadoop native library
2 | export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native
3 |
4 | [hdfs@edge1 ~]$ spark-shell --master yarn --driver-memory 512m --executor-memory 512m
5 | scala>
6 | val file = sc.textFile("/test")
7 | val counts = file.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
8 | counts.saveAsTextFile("/tmp/wordcount")
9 | counts.count()
10 |
11 |
12 | Examples using Python:
13 |
14 | spark-submit --master yarn --deploy-mode cluster ~/sparkPython/square.py
15 |
16 | spark-submit --master yarn --deploy-mode cluster ~/sparkPython/wordcount.py   (first copy the file to be counted to the HDFS path the script reads)
17 |
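A sketch of the HDFS copy mentioned above and of checking the word-count output (paths taken from the examples above):

$ hdfs dfs -mkdir -p /user/hdfs/sparkPython
$ hdfs dfs -put ~/sparkPython/wordcount.txt /user/hdfs/sparkPython/
$ hdfs dfs -cat /tmp/wordcount/part-* | head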
--------------------------------------------------------------------------------
/Spark/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | spark.master yarn
2 | spark.eventLog.enabled true
3 | spark.driver.memory 1024m
4 | spark.yarn.am.memory 1024m
5 |
6 | spark.yarn.jars hdfs://nn1.dilithium.com:9000/spark_jars/jars/*
7 | or
8 | spark.yarn.archive hdfs://nn1.dilithium.com:9000/spark_jars/spark-libs.jar
9 |
10 | #if using archive: $ jar cv0f spark-libs.jar -C $SPARK_HOME/jars/ .
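# a minimal sketch of the one-time upload, assuming the hdfs:// paths used above:
#   $ hdfs dfs -mkdir -p /spark_jars/jars
#   $ hdfs dfs -put $SPARK_HOME/jars/* /spark_jars/jars/     # for spark.yarn.jars
#   $ hdfs dfs -put spark-libs.jar /spark_jars/              # for spark.yarn.archive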
11 |
12 | spark.serializer org.apache.spark.serializer.KryoSerializer
13 | spark.eventLog.dir hdfs://nn1.dilithium.com:9000/spark_logs
14 | spark.history.fs.logDirectory hdfs://nn1.dilithium.com:9000/spark_logs
15 | spark.history.provider org.apache.spark.deploy.history.FsHistoryProvider
16 | spark.history.fs.update.interval 10s
17 | spark.history.ui.port 18080
18 |
19 |
20 |
21 | yarn-site.xml (Tested on spark 2.2.1)
22 |
23 | <property>
24 | <name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
25 | <value>org.apache.spark.network.yarn.YarnShuffleService</value>
26 | </property>
27 |
28 | <property>
29 | <name>yarn.nodemanager.aux-services.spark2_shuffle.class</name>
30 | <value>org.apache.spark.network.yarn.YarnShuffleService</value>
31 | </property>
32 |
33 | <property>
34 | <name>yarn.nodemanager.aux-services</name>
35 | <value>mapreduce_shuffle,spark_shuffle,spark2_shuffle</value>
36 | </property>
37 |
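For the spark_shuffle / spark2_shuffle aux-services above to load, the Spark YARN shuffle jar has to be on the NodeManager classpath; a minimal sketch, assuming a Spark 2.x distribution layout (the jar name and Hadoop lib path may differ in your install):

$ cp $SPARK_HOME/yarn/spark-*-yarn-shuffle.jar $HADOOP_HOME/share/hadoop/yarn/lib/
# restart the NodeManagers afterwards so the service is picked up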
--------------------------------------------------------------------------------
/Spark/sparkPython/erfunction.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkContext
2 | sc = SparkContext()
3 | log = sc.textFile("/Users/pkuma380/sparkPython/error.txt")
4 |
5 | def errorcontain(s):
6 | return "ERROR" in s
7 | f_log = log.filter(errorcontain)
8 | for line in f_log.take(10):
9 | print "Start output", line
10 |
11 |
--------------------------------------------------------------------------------
/Spark/sparkPython/error.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkContext
2 | sc = SparkContext()
3 | log = sc.textFile("/Users/pkuma380/sparkPython/error.txt")
4 | f_log = log.filter(lambda data: "ERROR" in data)
5 | for line in f_log.take(10):
6 | print line
7 |
--------------------------------------------------------------------------------
/Spark/sparkPython/error.txt:
--------------------------------------------------------------------------------
1 | Spark Command: /Library/Java/JavaVirtualMachines/jdk1.8.0_60.jdk/Contents/Home//bin/java -cp /usr/local/spark/spark-1.3.1-bin-hadoop2.6/sbin/../conf:/usr/local/spark/spark-1.3.1-bin-hadoop2.6/lib/spark-assembly-1.3.1-hadoop2.6.0.jar:/usr/local/spark/spark-1.3.1-bin-hadoop2.6/lib/datanucleus-api-jdo-3.2.6.jar:/usr/local/spark/spark-1.3.1-bin-hadoop2.6/lib/datanucleus-core-3.2.10.jar:/usr/local/spark/spark-1.3.1-bin-hadoop2.6/lib/datanucleus-rdbms-3.2.9.jar -Dspark.akka.logLifecycleEvents=true -Xms512m -Xmx512m org.apache.spark.deploy.master.Master --ip BGLC02M965AFH01 --port 7077 --webui-port 8080
2 | ========================================
3 |
4 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
5 | 16/02/17 20:19:40 INFO Master: Registered signal handlers for [TERM, HUP, INT]
6 | 16/02/17 20:20:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
7 | 16/02/17 20:20:11 INFO SecurityManager: Changing view acls to: pkuma380
8 | 16/02/17 20:20:11 INFO SecurityManager: Changing modify acls to: pkuma380
9 | 16/02/17 20:20:11 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(pkuma380); users with modify permissions: Set(pkuma380)
10 | 16/02/17 20:20:11 INFO Slf4jLogger: Slf4jLogger started
11 | 16/02/17 20:20:11 INFO Remoting: Starting remoting
12 | 16/02/17 20:20:12 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkMaster@BGLC02M965AFH01:7077]
13 | 16/02/17 20:20:12 INFO Remoting: Remoting now listens on addresses: [akka.tcp://sparkMaster@BGLC02M965AFH01:7077]
14 | 16/02/17 20:20:12 INFO Utils: Successfully started service 'sparkMaster' on port 7077.
15 | 16/02/17 20:20:12 INFO Server: jetty-8.y.z-SNAPSHOT
16 | 16/02/17 20:20:12 INFO AbstractConnector: Started SelectChannelConnector@BGLC02M965AFH01:6066
17 | 16/02/17 20:20:12 INFO Utils: Successfully started service on port 6066.
18 | 16/02/17 20:20:12 INFO StandaloneRestServer: Started REST server for submitting applications on port 6066
19 | 16/02/17 20:20:12 INFO Master: Starting Spark master at spark://BGLC02M965AFH01:7077
20 | 16/02/17 20:20:12 INFO Master: Running Spark version 1.3.1
21 | 16/02/17 20:20:13 INFO Server: jetty-8.y.z-SNAPSHOT
22 | 16/02/17 20:20:13 INFO AbstractConnector: Started SelectChannelConnector@0.0.0.0:8080
23 | 16/02/17 20:20:13 INFO Utils: Successfully started service 'MasterUI' on port 8080.
24 | 16/02/17 20:20:13 INFO MasterWebUI: Started MasterWebUI at http://192.168.0.51:8080
25 | 16/02/17 20:20:13 INFO Master: I have been elected leader! New state: ALIVE
26 | 16/02/18 15:14:37 ERROR Master: RECEIVED SIGNAL 15: SIGTERM
27 | 16/02/18 15:14:37 ERROR Master: RECEIVED SIGNAL 15: SIGTERM
28 |
--------------------------------------------------------------------------------
/Spark/sparkPython/logparser.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | data = "/Users/pkuma380/sparkPython/error.txt"
4 | for line in open(data):
5 |     # keep only the log lines that carry a timestamp (they start with '16')
6 |     if '16' in line:
7 |         # split on whitespace; the first two fields are the date and time
8 |         date = re.split(r"\s+", line.strip())
9 |         print date
--------------------------------------------------------------------------------
/Spark/sparkPython/pivot.txt:
--------------------------------------------------------------------------------
1 | userid age country number_of_calls
2 | x01 41 US 3
3 | x01 41 UK 1
4 | x01 41 CA 2
5 | x01 72 US 4
6 | x02 72 UK 6
7 | x02 72 CA 7
8 | x02 72 XX 8
9 | x02 72 XB 8
10 | x02 72 NA 9
11 |
--------------------------------------------------------------------------------
/Spark/sparkPython/square.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkContext
2 | sc = SparkContext()
3 | #data = sc.parallelize([1,2,3,4,5])
4 |
5 | def square(sq):
6 | return sq * sq
7 | data = sc.parallelize([1,2,3,4,5])
8 | sq= data.map(square)
9 | for line in sq.collect():
10 | print line
11 |
12 |
--------------------------------------------------------------------------------
/Spark/sparkPython/wordcount.py:
--------------------------------------------------------------------------------
1 | from operator import add
2 | from pyspark import SparkContext
3 | sc= SparkContext()
4 | file = sc.textFile("/user/hdfs/sparkPython/wordcount.txt")
5 | word = file.flatMap(lambda x: x.split(" "))
6 | mapword = word.map(lambda x: (x, 1))
7 | reduceword = mapword.reduceByKey(add)
8 | output = reduceword.collect()
9 | nums = sc.parallelize([output])
10 | for i in nums.collect():
11 | print i
12 |
--------------------------------------------------------------------------------
/Spark/sparkPython/wordcount.txt:
--------------------------------------------------------------------------------
1 | 49
2 | 2
3 | volumename
4 | ainduk
5 | apps
6 | axp
7 | axp.admin
8 | axp.apptests
9 | axp.hivequerylogs
10 | axp.mirror
11 | axp.mirror.home
12 | bjaya
13 | bmanikya
14 | dprichar
15 | dschexna
16 | gsing140
17 | hyalama
18 | idn
19 | idn.home
20 | kvall3
21 | kvarakan
22 | mapr.cldb.internal
23 | mapr.cluster.root
24 | mapr.hbase
25 | mapr.tmp
26 | mirror-cstonedb-vol2-test
27 | mirror-silver-datameer
28 | mysqlbcp
29 | naveenmirrortest
30 | ngupt131
31 | phari
32 | pigtemp
33 | PlatinumDR_Mysql_Backups
34 | psing141
35 | rsyncappsvrs
36 | rsynces
37 | rsyncmllab
38 | rsyncplatdrm5
39 | rsyncsilverm5
40 | rsyncsilverm7
41 | rsyncskytree
42 | rsyncstorm
43 | smanubo
44 | spark
45 | spool4
46 | spoudel
47 | twilli1
48 | ukris
49 | users
50 | vkomat
51 | zsmit3
52 | zsmit3
53 |
--------------------------------------------------------------------------------
/Spark/spark_command.txt:
--------------------------------------------------------------------------------
1 | YARN Node Labels:
2 |
3 | $ spark-submit --class org.apache.spark.examples.SparkPi --queue root.prod --conf spark.yarn.am.nodeLabelExpression=spark --conf spark.yarn.executor.nodeLabelExpression=spark --executor-memory 512m --num-executors 1 --driver-memory 512m --master yarn --deploy-mode cluster /opt/cloudera/parcels/CDH/jars/spark-examples*.jar 10
4 |
5 | # https://www.ibm.com/support/pages/yarn-node-labels-label-based-scheduling-and-resource-isolation-hadoop-dev
6 | # https://docs.cloudera.com/runtime/7.0.2/yarn-allocate-resources/topics/yarn-configuring-node-labels.html
7 | # https://docs.cloudera.com/cdp-private-cloud-base/7.1.5/yarn-allocate-resources/topics/yarn-associate-node-labels-with-queues.html
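A minimal sketch of defining the 'spark' label assumed by the nodeLabelExpression above, once node labels are enabled in yarn-site.xml (hostname is illustrative):

$ yarn rmadmin -addToClusterNodeLabels "spark(exclusive=false)"
$ yarn rmadmin -replaceLabelsOnNode "dn1.dilithium.com=spark"
$ yarn cluster --list-node-labels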
8 |
9 | Start the cluster after setting up passphraseless SSH from the master (only for a standalone, non-YARN cluster)
10 |
11 | $ /opt/cluster/spark/sbin/stop-all.sh
12 | $ /opt/cluster/spark/sbin/start-all.sh
13 |
14 |
15 | Tip
16 | ===
17 | To avoid uploading the assembly jar on every submission, copy it to HDFS once and point the environment variable below at it:
18 |
19 | export SPARK_JAR=hdfs://nn1.dilithium.com:9000/user/hdfs/share/lib/spark-assembly-1.4.1-hadoop2.6.0.jar
20 |
21 | Submit jobs in 3 modes
22 |
23 | $ spark-submit --class org.apache.spark.examples.SparkPi --master spark://rt1.cyrus.com:7077 /opt/cluster/spark/lib/spark-examples-1.6.1-hadoop2.2.0.jar 10
24 | $ spark-submit --class org.apache.spark.examples.SparkPi --master yarn --deploy-mode cluster /opt/cluster/spark/lib/spark-examples-1.6.1-hadoop2.2.0.jar 100
25 | $ spark-submit --class org.apache.spark.examples.SparkPi --master yarn --deploy-mode client /opt/cluster/spark/lib/spark-examples-1.6.1-hadoop2.2.0.jar 100
26 |
27 | $ spark-submit --class org.apache.spark.examples.SparkPi --master yarn /usr/lib/spark/lib/spark-examples-1.6.1-hadoop2.7.2-amzn-1.jar 100
28 |
29 | Other ways of running it
30 | -------------------------
31 | $ spark-shell --master yarn
32 | $ spark-submit --verbose --master yarn --deploy-mode cluster ~/sparkPython/square.py
33 | $ spark-submit --verbose --master yarn-cluster ~/sparkPython/square.py
34 | $ spark-submit --verbose --master yarn-client ~/sparkPython/square.py
35 |
36 |
--------------------------------------------------------------------------------
/Spark/spark_standalone_cluster.txt:
--------------------------------------------------------------------------------
1 | Spark Standalone Cluster Setup
2 | ==============================
3 |
4 | # nodes
5 |
6 | edge1.dilithium.com(master)
7 | edge2.dilithium.com(worker)
8 | hbm1.dilithium.com(worker)
9 | -------------------------------------
10 | [hdfs@edge1 conf]$ cat spark-env.sh
11 |
12 | HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
13 |
14 | SPARK_LOCAL_IP=192.168.1.18
15 | SPARK_MASTER_IP=edge1.dilithium.com
16 |
17 | export SPARK_WORKER_MEMORY=256m
18 | export SPARK_EXECUTOR_MEMORY=128m
19 | export SPARK_WORKER_INSTANCES=1
20 | export SPARK_WORKER_CORES=1
21 | export SPARK_WORKER_DIR=/home/hdfs/work/sparkdata
22 | -------------------------------------------------
23 | [hdfs@edge2 conf]$ cat spark-env.sh
24 |
25 | HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
26 |
27 | SPARK_LOCAL_IP=192.168.1.19
28 | SPARK_MASTER_IP=edge1.dilithium.com
29 |
30 | export SPARK_WORKER_MEMORY=256m
31 | export SPARK_EXECUTOR_MEMORY=128m
32 | export SPARK_WORKER_INSTANCES=2
33 | export SPARK_WORKER_CORES=1
34 | export SPARK_WORKER_DIR=/home/hdfs/work/sparkdata
35 | -------------------------------------------------
36 | [hdfs@hbm1 conf]$ cat spark-env.sh
37 |
38 | HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
39 |
40 | SPARK_LOCAL_IP=192.168.1.30
41 | SPARK_MASTER_IP=edge1.dilithium.com
42 |
43 | export SPARK_WORKER_MEMORY=256m
44 | export SPARK_EXECUTOR_MEMORY=128m
45 | export SPARK_WORKER_INSTANCES=2
46 | export SPARK_WORKER_CORES=1
47 | export SPARK_WORKER_DIR=/home/hdfs/work/sparkdata
48 | --------------------------------------------------
49 | on Master node(edge1)
50 |
51 | [hdfs@edge1 conf]$ cat slaves
52 | # A Spark Worker will be started on each of the machines listed below.
53 |
54 | edge2.dilithium.com
55 | hbm1.dilithium.com
56 | --------------------
57 | On all nodes in the cluster
58 |
59 | [hdfs@edge1 conf]$ cat spark-defaults.conf
60 | # Default system properties included when running spark-submit.
61 | # This is useful for setting default environmental settings.
62 |
63 | # Example:
64 | spark.master spark://edge1.dilithium.com:7077
65 | spark.eventLog.enabled true
66 | spark.serializer org.apache.spark.serializer.KryoSerializer
67 | spark.eventLog.dir hdfs://nn1.dilithium.com:9000/user/hdfs/spark_logs
68 |
69 |
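A minimal sketch of starting and checking the cluster described above (assumes SPARK_HOME points at the Spark install and the default ports are in use):

[hdfs@edge1 ~]$ $SPARK_HOME/sbin/start-all.sh        # starts the Master on edge1 and a Worker on every host in conf/slaves
[hdfs@edge1 ~]$ jps                                  # expect Master here; Worker on edge2 and hbm1
[hdfs@edge1 ~]$ spark-shell --master spark://edge1.dilithium.com:7077
Master web UI: http://edge1.dilithium.com:8080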
--------------------------------------------------------------------------------
/Spark/yarn-site.xml.spark:
--------------------------------------------------------------------------------
1 | <configuration>
2 |
3 | <property>
4 | <name>yarn.nodemanager.aux-services</name>
5 | <value>mapreduce_shuffle,spark_shuffle,spark2_shuffle</value>
6 | </property>
7 |
8 | <property>
9 | <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
10 | <value>org.apache.hadoop.mapred.ShuffleHandler</value>
11 | </property>
12 |
13 | <property>
14 | <name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
15 | <value>org.apache.spark.network.yarn.YarnShuffleService</value>
16 | </property>
17 |
18 | <property>
19 | <name>yarn.nodemanager.aux-services.spark2_shuffle.class</name>
20 | <value>org.apache.spark.network.yarn.YarnShuffleService</value>
21 | </property>
22 |
23 | <property>
24 | <name>yarn.resourcemanager.resource-tracker.address</name>
25 | <value>rm1.dilithium.com:9001</value>
26 | </property>
27 |
28 | <property>
29 | <name>yarn.resourcemanager.scheduler.address</name>
30 | <value>rm1.dilithium.com:9002</value>
31 | </property>
32 |
33 | <property>
34 | <name>yarn.resourcemanager.address</name>
35 | <value>rm1.dilithium.com:9003</value>
36 | </property>
37 |
38 | #<property>
39 | #<name>yarn.nodemanager.local-dirs</name>
40 | #<value>file:/space/tmp1,file:/space/tmp2</value>
41 | #</property>
42 |
43 | <property>
44 | <name>yarn.nodemanager.resource.memory-mb</name>
45 | <value>3072</value>
46 | </property>
47 |
48 | <property>
49 | <name>yarn.scheduler.minimum-allocation-mb</name>
50 | <value>256</value>
51 | </property>
52 |
53 | <property>
54 | <name>yarn.scheduler.maximum-allocation-mb</name>
55 | <value>3072</value>
56 | </property>
57 |
58 | <property>
59 | <name>yarn.scheduler.minimum-allocation-vcores</name>
60 | <value>1</value>
61 | </property>
62 |
63 | <property>
64 | <name>yarn.scheduler.maximum-allocation-vcores</name>
65 | <value>12</value>
66 | </property>
67 |
68 | <property>
69 | <name>yarn.nodemanager.resource.cpu-vcores</name>
70 | <value>12</value>
71 | </property>
72 |
73 | <property>
74 | <name>yarn.nodemanager.vmem-pmem-ratio</name>
75 | <value>2.1</value>
76 | </property>
77 |
78 | #<property>
79 | # <name>yarn.nodemanager.vmem-check-enabled</name>
80 | # <value>false</value>
81 | # <description>Whether virtual memory limits will be enforced for containers</description>
82 | #</property>
83 |
84 | <property>
85 | <name>yarn.log-aggregation-enable</name>
86 | <value>true</value>
87 | </property>
88 |
89 | <property>
90 | <description>Where to aggregate logs to.</description>
91 | <name>yarn.nodemanager.remote-app-log-dir</name>
92 | <value>/tmp/logs</value>
93 | </property>
94 |
95 | <property>
96 | <name>yarn.log-aggregation.retain-seconds</name>
97 | <value>259200</value>
98 | </property>
99 |
100 | <property>
101 | <name>yarn.log-aggregation.retain-check-interval-seconds</name>
102 | <value>3600</value>
103 | </property>
104 |
105 | </configuration>
--------------------------------------------------------------------------------
/Yarn_tuning/Yarn.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Yarn_tuning/Yarn.pdf
--------------------------------------------------------------------------------
/Yarn_tuning/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <property>
2 | <name>mapreduce.map.memory.mb</name>
3 | <value>768</value>
4 | </property>
5 |
6 | <property>
7 | <name>mapreduce.reduce.memory.mb</name>
8 | <value>768</value>
9 | </property>
10 |
11 | <property>
12 | <name>mapreduce.map.java.opts</name>
13 | <value>-Xmx512m</value>
14 | </property>
15 |
16 | <property>
17 | <name>mapreduce.reduce.java.opts</name>
18 | <value>-Xmx512m</value>
19 | </property>
--------------------------------------------------------------------------------
/Yarn_tuning/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <property>
2 | <name>yarn.app.mapreduce.am.resource.mb</name>
3 | <value>1024</value>
4 | </property>
5 |
6 | <property>
7 | <name>yarn.nodemanager.resource.memory-mb</name>
8 | <value>2048</value>
9 | </property>
10 |
11 | <property>
12 | <name>yarn.scheduler.minimum-allocation-mb</name>
13 | <value>512</value>
14 | </property>
15 |
16 | <property>
17 | <name>yarn.scheduler.maximum-allocation-mb</name>
18 | <value>1024</value>
19 | </property>
20 |
21 | <property>
22 | <name>yarn.scheduler.minimum-allocation-vcores</name>
23 | <value>1</value>
24 | </property>
25 |
26 | <property>
27 | <name>yarn.scheduler.maximum-allocation-vcores</name>
28 | <value>2</value>
29 | </property>
30 |
31 | <property>
32 | <name>yarn.nodemanager.pmem-check-enabled</name>
33 | <value>false</value>
34 | </property>
35 |
36 | <property>
37 | <name>yarn.nodemanager.vmem-check-enabled</name>
38 | <value>false</value>
39 | </property>
40 |
41 | <property>
42 | <name>yarn.nodemanager.vmem-pmem-ratio</name>
43 | <value>2.1</value>
44 | </property>
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
--------------------------------------------------------------------------------
/hadoop1.0/README.md:
--------------------------------------------------------------------------------
1 | For all the Configurations related to Hadoop 1.x, please use the below branch
2 |
3 | https://github.com/netxillon/hadoop/tree/Hadoop1
4 |
--------------------------------------------------------------------------------
/hadoop2.0/bash_profile:
--------------------------------------------------------------------------------
1 | # .bash_profile
2 |
3 | # Get the aliases and functions
4 | if [ -f ~/.bashrc ]; then
5 | . ~/.bashrc
6 | fi
7 |
8 | # User specific environment and startup programs
9 |
10 | # User specific aliases and functions
11 |
12 | #export HADOOP_HOME=/home/hadoop/hadoop-2.2.0
13 |
14 | export HADOOP_HOME=/home/hadoop/hadoop
15 | export HADOOP_MAPRED_HOME=$HADOOP_HOME
16 | export HADOOP_COMMON_HOME=$HADOOP_HOME
17 | export HADOOP_HDFS_HOME=$HADOOP_HOME
18 | export YARN_HOME=$HADOOP_HOME
19 | export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
20 | export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop
21 |
22 | export JAVA_HOME=/usr/java/default
23 |
24 |
25 | PATH=$HADOOP_HOME/bin/:$HADOOP_HOME/sbin/:$JAVA_HOME/bin/:$PATH
26 | export PATH
27 |
--------------------------------------------------------------------------------
/hadoop2.0/core-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |
3 | <property>
4 | <name>fs.defaultFS</name>
5 | <value>hdfs://ha-nn1.hacluster1.com:9000</value>
6 | </property>
7 |
8 | </configuration>
--------------------------------------------------------------------------------
/hadoop2.0/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | Namenode
2 | ========
3 |
4 | <configuration>
5 |
6 | <property>
7 | <name>dfs.namenode.name.dir</name>
8 | <value>file:/data/namenode</value>
9 | </property>
10 |
11 | <property>
12 | <name>dfs.replication</name>
13 | <value>1</value>
14 | </property>
15 |
16 | <property>
17 | <name>dfs.blocksize</name>
18 | <value>134217728</value>
19 | </property>
20 |
21 | </configuration>
22 |
23 | Datanode
24 | ========
25 |
26 | <configuration>
27 |
28 | <property>
29 | <name>dfs.datanode.data.dir</name>
30 | <value>file:/data/datanode</value>
31 | </property>
32 |
33 | </configuration>
--------------------------------------------------------------------------------
/hadoop2.0/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |
3 | # The framework can be local, classic or yarn
4 |
5 | <property>
6 | <name>mapreduce.framework.name</name>
7 | <value>yarn</value>
8 | </property>
9 |
10 | </configuration>
--------------------------------------------------------------------------------
/hadoop2.0/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |
3 | <property>
4 | <name>yarn.nodemanager.aux-services</name>
5 | <value>mapreduce_shuffle</value>
6 | </property>
7 |
8 | <property>
9 | <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
10 | <value>org.apache.hadoop.mapred.ShuffleHandler</value>
11 | </property>
12 |
13 | <property>
14 | <name>yarn.resourcemanager.resource-tracker.address</name>
15 | <value>nn2.cluster1.com:9001</value>
16 | </property>
17 |
18 | <property>
19 | <name>yarn.resourcemanager.scheduler.address</name>
20 | <value>nn2.cluster1.com:9002</value>
21 | </property>
22 |
23 | <property>
24 | <name>yarn.resourcemanager.address</name>
25 | <value>nn2.cluster1.com:9003</value>
26 | </property>
27 |
28 | </configuration>
--------------------------------------------------------------------------------
/hadoop_build64bit:
--------------------------------------------------------------------------------
1 | Build 64 bit Hadoop
2 | ===================
3 |
4 |
5 | 1. yum -y install gcc gcc-c++ openssl-devel make cmake zlib* libssl* autoconf automake libtool cyrus-sasl* libgsasl-devel* java-1.8.0-openjdk.x86_64 java-1.8.0-openjdk-devel.x86_64
6 |
7 | 2. Download Maven: wget http://mirrors.gigenet.com/apache/maven/maven-3/3.3.3/binaries/apache-maven-3.3.3-bin.tar.gz
8 |
9 | tar -zxf apache-maven-3.3.3-bin.tar.gz -C /opt/
10 |
11 | setup maven environment
12 |
13 | [root@repo67 ~]# cat /etc/profile.d/maven.sh
14 | export JAVA_HOME=/usr/java/latest
15 | export M3_HOME=/opt/apache-maven-3.3.3
16 | export PATH=$JAVA_HOME/bin:/opt/apache-maven-3.3.3/bin:$PATH
17 |
18 | 3. Download protobuf: wget https://github.com/google/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.gz
19 |
20 | tar -xzf protobuf-2.5.0.tar.gz -C /opt
21 |
22 | cd /opt/protobuf-2.5.0/
23 | ./configure
24 | make;make install
25 |
26 | 4. Download the latest stable Hadoop source code, for example hadoop-2.7.2-src.tar.gz
27 |
28 | tar -xzf hadoop-2.7.2-src.tar.gz -C /opt/
29 | cd /opt/hadoop-2.7.2-src
30 | mvn package -Pdist,native -DskipTests -Dtar -Dmaven.skip.test=true -Dmaven.javadoc.skip=true
31 |
32 | you will see a tar ball under hadoop-2.7.2-src/hadoop-dist/target/
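A quick way to confirm the native libraries actually built; a sketch assuming the tarball is extracted to /opt (adjust paths to your build):

tar -xzf hadoop-dist/target/hadoop-2.7.2.tar.gz -C /opt/
/opt/hadoop-2.7.2/bin/hadoop checknative -a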
33 |
34 | Enjoy !!
35 |
36 |
37 | Updated for maven 3.6.3 and protobuf 3.7.1
38 | ------------------------------------------
39 | Supported version hadoop 3.3.0
40 |
41 | 1. wget http://mirror.intergrid.com.au/apache/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz
42 |
43 | 2. wget https://cmake.org/files/v3.6/cmake-3.6.2.tar.gz
44 | tar -zxvf cmake-3.6.2.tar.gz
45 | cd cmake-3.6.2
46 | ./bootstrap --prefix=/usr/local
47 | make; make install
48 | PATH=/usr/local/bin:$PATH
49 |
50 | 3. wget https://github.com/protocolbuffers/protobuf/releases/download/v3.7.1/protobuf-cpp-3.7.1.tar.gz
51 |
52 | - For all versions prior to Hadoop 3.3.0, use protobuf version 2.5.0.
53 | - For Hadoop 3.x and higher, the cmake version must be greater than 3.2.
54 |
--------------------------------------------------------------------------------
/jumbune:
--------------------------------------------------------------------------------
1 | hadoop-env.sh
2 |
3 | export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=5677 $HADOOP_NAMENODE_OPTS"
4 | export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=5679 $HADOOP_DATANODE_OPTS"
5 |
6 | yarn-env.sh
7 |
8 | export YARN_NODEMANAGER_OPTS="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=5678 $YARN_NODEMANAGER_OPTS"
9 | export YARN_RESOURCEMANAGER_OPTS="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=5680 $YARN_RESOURCEMANAGER_OPTS"
10 |
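After restarting the daemons, a quick (assumed) sanity check that the JMX ports are listening:

$ ss -ltn | egrep '5677|5678|5679|5680'      # or: netstat -ltn | egrep '5677|5678|5679|5680'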
--------------------------------------------------------------------------------
/logging:
--------------------------------------------------------------------------------
1 |
2 | We can get and set the log level dynamically using the daemonlog command
3 | ===================
4 |
5 | $ hadoop daemonlog -getlevel master1.cyrus.com:50070 org.apache.hadoop.dfs.NameNode
6 | Connecting to http://master1.cyrus.com:50070/logLevel?log=org.apache.hadoop.dfs.NameNode
7 | Submitted Log Name: org.apache.hadoop.dfs.NameNode
8 | Log Class: org.apache.commons.logging.impl.Log4JLogger
9 | Effective level: INFO
10 |
11 |
12 | $ hadoop daemonlog -getlevel master1.cyrus.com:50070 org.apache.hadoop.hdfs.server.namenode.NameNode
13 | Connecting to http://master1.cyrus.com:50070/logLevel?log=org.apache.hadoop.hdfs.server.namenode.NameNode
14 | Submitted Log Name: org.apache.hadoop.hdfs.server.namenode.NameNode
15 | Log Class: org.apache.commons.logging.impl.Log4JLogger
16 | Effective level: INFO
17 |
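The level can be changed the same way with -setlevel; DEBUG below is just an example:

$ hadoop daemonlog -setlevel master1.cyrus.com:50070 org.apache.hadoop.hdfs.server.namenode.NameNode DEBUG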
18 | +++++++++++++
19 |
20 | The logs are of the format /var/log/hadoop/hadoop-$HADOOP_IDENT_STRING-<daemon>-<hostname>.log
21 |
22 | Thinking of changing $HADOOP_IDENT_STRING ?
23 |
24 | Not a good idea:
25 |
26 | $HADOOP_IDENT_STRING defaults to $USER. Do not change it to a custom value: the PID files and log names
27 | are derived from it, and scripts such as hadoop-daemon.sh will fail to find the running daemons.
28 |
--------------------------------------------------------------------------------
/map_scripts/job.txt:
--------------------------------------------------------------------------------
1 | $ hadoop jar contrib/streaming/hadoop-*streaming*.jar -file /home/hadoop/mapper.py -mapper /home/hadoop/mapper.py -file /home/hadoop/reducer.py -reducer /home/hadoop/reducer.py -input /input -output /output
2 |
3 |
4 |
5 | hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar -D stream.num.map.output.key.fields=2 -input /input -output /out -mapper /home/hadoop/mapper.sh -reducer /home/hadoop/reducer.sh
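
Since the streaming scripts only read stdin and write stdout, they can be tested locally before submitting (a sketch; input.txt is any sample text file):

$ cat input.txt | /home/hadoop/mapper.py | sort | /home/hadoop/reducer.py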
--------------------------------------------------------------------------------
/map_scripts/mapper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 |
5 | # input comes from STDIN (standard input)
6 | for line in sys.stdin:
7 | # remove leading and trailing whitespace
8 | line = line.strip()
9 | # split the line into words
10 | words = line.split()
11 | # increase counters
12 | for word in words:
13 | # write the results to STDOUT (standard output);
14 | # what we output here will be the input for the
15 | # Reduce step, i.e. the input for reducer.py
16 | #
17 | # tab-delimited; the trivial word count is 1
18 | print '%s\t%s' % (word, 1)
19 |
--------------------------------------------------------------------------------
/map_scripts/mapper.sh:
--------------------------------------------------------------------------------
1 | [training@localhost steve]$ cat maptf.sh
2 | #!/bin/bash
3 |
4 | exclude="\.\,?!\-_:;\]\[\#\|\$()\""
5 | while read split; do
6 | for word in $split; do
7 | term=`echo "${word//[$exclude]/}" | tr [:upper:] [:lower:]`
8 | if [ -n "$term" ]; then
9 | printf "%s\t%s\t%s\n" "$term" "$map_input_file" "1"
10 | fi
11 | done
12 | done
--------------------------------------------------------------------------------
/map_scripts/reducer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from operator import itemgetter
4 | import sys
5 |
6 | current_word = None
7 | current_count = 0
8 | word = None
9 |
10 | # input comes from STDIN
11 | for line in sys.stdin:
12 | # remove leading and trailing whitespace
13 | line = line.strip()
14 |
15 | # parse the input we got from mapper.py
16 | word, count = line.split('\t', 1)
17 |
18 | # convert count (currently a string) to int
19 | try:
20 | count = int(count)
21 | except ValueError:
22 | # count was not a number, so silently
23 | # ignore/discard this line
24 | continue
25 |
26 | # this IF-switch only works because Hadoop sorts map output
27 | # by key (here: word) before it is passed to the reducer
28 | if current_word == word:
29 | current_count += count
30 | else:
31 | if current_word:
32 | # write result to STDOUT
33 | print '%s\t%s' % (current_word, current_count)
34 | current_count = count
35 | current_word = word
36 |
37 | # do not forget to output the last word if needed!
38 | if current_word == word:
39 | print '%s\t%s' % (current_word, current_count)
40 |
--------------------------------------------------------------------------------
/map_scripts/reducer.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | read currterm currfile currnum
4 | while read term file num; do
5 | if [[ $term = "$currterm" ]] && [[ $file = "$currfile" ]]; then
6 | currnum=$(( currnum + num ))
7 | else
8 | printf "%s\t%s\t%s\n" "$currterm" "$currfile" "$currnum"
9 | currterm="$term"
10 | currfile="$file"
11 | currnum="$num"
12 | fi
13 | done
14 | printf "%s\t%s\t%s\n" "$currterm" "$currfile" "$currnum"
--------------------------------------------------------------------------------
/zookeeper.txt:
--------------------------------------------------------------------------------
1 | Deploying ZooKeeper Cluster (Multi-Server) Setup
2 |
3 | Let’s begin installation and configuration of ZooKeeper.
4 |
5 | Step 1: Create the directory structure, as decided in the design section
6 |
7 | mac-book-pro:demo aman$ mkdir -p /Users/aman/zookeeper/zk-server-1 \
8 | /Users/aman/zookeeper/zk-server-2 \
9 | /Users/aman/zookeeper/zk-server-3 \
10 | /Users/aman/zookeeper/zk-server-4 \
11 | /Users/aman/zookeeper/zk-server-5
12 |
13 | mac-book-pro:demo aman$ mkdir -p /Users/aman/zookeeper/data/zk1 \
14 | /Users/aman/zookeeper/data/zk2 \
15 | /Users/aman/zookeeper/data/zk3 \
16 | /Users/aman/zookeeper/data/zk4 \
17 | /Users/aman/zookeeper/data/zk5
18 |
19 | mac-book-pro:demo aman$ mkdir -p /Users/aman/zookeeper/log/zk1 \
20 | /Users/aman/zookeeper/log/zk2 \
21 | /Users/aman/zookeeper/log/zk3 \
22 | /Users/aman/zookeeper/log/zk4 \
23 | /Users/aman/zookeeper/log/zk5
24 |
25 | Let’s take a look at the directory structure created above:
26 |
27 | mac-book-pro:demo aman$ tree /Users/aman/zookeeper
28 |
29 | /Users/aman/zookeeper
30 | |-data
31 | |---zk1
32 | |---zk2
33 | |---zk3
34 | |---zk4
35 | |---zk5
36 | |-log
37 | |---zk1
38 | |---zk2
39 | |---zk3
40 | |---zk4
41 | |---zk5
42 | |-zk-server-1
43 | |-zk-server-2
44 | |-zk-server-3
45 | |-zk-server-4
46 | |-zk-server-5
47 |
48 | mac-book-pro:demo aman$
49 |
50 | Okay, looks good!
51 |
52 | Step 2: Create the ZooKeeper server ID. This 'myid' file resides in each ZooKeeper data directory; open it with your favourite text editor.
53 |
54 | # just enter the value '1' in the file and save it; do the same (2-5) for the rest of the ZooKeeper servers
55 | mac-book-pro:demo aman$ vi /Users/aman/zookeeper/data/zk1/myid
56 |
57 | # follow the same way to fill server id
58 | vi /Users/aman/zookeeper/data/zk2/myid
59 | vi /Users/aman/zookeeper/data/zk3/myid
60 | vi /Users/aman/zookeeper/data/zk4/myid
61 | vi /Users/aman/zookeeper/data/zk5/myid
62 |
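Equivalently, the server ids can be written non-interactively (same paths as above):

mac-book-pro:demo aman$ for i in 1 2 3 4 5; do echo $i > /Users/aman/zookeeper/data/zk$i/myid; done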
63 | Step 3: Downloading ZooKeeper Release
64 |
65 | Download ZooKeeper from http://hadoop.apache.org/zookeeper/releases.html; this article uses ZooKeeper 3.4.4, but the same principle applies to other versions too.
66 |
67 | Step 4: Extract & prepare ZooKeeper for deployment
68 |
69 | mac-book-pro:demo aman$ gzip -dc ~/Downloads/soft/zookeeper-3.4.4.tar.gz | tar -xf - -C /tmp
70 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-1/
71 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-2/
72 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-3/
73 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-4/
74 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-5/
75 |
76 | Once done, don’t forget to clean up ‘/tmp/zookeeper-3.4.4’
77 |
78 | Step 5: Prepare the ZooKeeper configuration file ‘zoo.cfg‘ at ‘{zk-server-1}/conf/zoo.cfg‘. Server 1 is shown here; repeat the same steps with the appropriate values (clientPort, dataDir, dataLogDir) for each of the other ZooKeeper servers.
79 |
80 | mac-book-pro:demo aman$ vi /Users/aman/zookeeper/zk-server-1/conf/zoo.cfg
81 |
82 | Place the configuration below into it.
83 |
84 |
85 | # The number of milliseconds of each tick
86 | tickTime=2000
87 |
88 | # The number of ticks that the initial synchronization phase can take
89 | initLimit=10
90 |
91 | # The number of ticks that can pass between
92 | # sending a request and getting an acknowledgement
93 | syncLimit=5
94 |
95 | # the directory where the snapshot is stored.
96 | # Choose appropriately for your environment
97 | dataDir=/Users/aman/zookeeper/data/zk1
98 |
99 | # the port at which the clients will connect
100 | clientPort=2181
101 |
102 | # the directory where transaction log is stored.
103 | # this parameter provides dedicated log device for ZooKeeper
104 | dataLogDir=/Users/aman/zookeeper/log/zk1
105 |
106 | # ZooKeeper server and its port no.
107 | # ZooKeeper ensemble should know about every other machine in the ensemble
108 | # specify server id by creating 'myid' file in the dataDir
109 | # use hostname instead of IP address for convenient maintenance
110 | server.1=localhost:2888:3888
111 | server.2=localhost:2889:3889
112 | server.3=localhost:2890:3890
113 | server.4=localhost:2891:3891
114 | server.5=localhost:2892:3892
115 |
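Once all five zoo.cfg files are in place, each server can be started and checked; a sketch for server 1 (repeat with the matching directory and clientPort for the others):

mac-book-pro:demo aman$ /Users/aman/zookeeper/zk-server-1/bin/zkServer.sh start
mac-book-pro:demo aman$ /Users/aman/zookeeper/zk-server-1/bin/zkServer.sh status
mac-book-pro:demo aman$ echo stat | nc localhost 2181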
--------------------------------------------------------------------------------
/zookeeper_oozie/oozie-server.txt:
--------------------------------------------------------------------------------
1 | Oozie Server Setup
2 |
3 | Copy the built binaries to the home directory as ‘oozie’
4 |
5 | $ cd ../../
6 | $ cp -R oozie-3.3.2/distro/target/oozie-3.3.2-distro/oozie-3.3.2/ oozie
7 |
8 | Create the required libext directory
9 |
10 | $ cd oozie
11 | $ mkdir libext
12 |
13 | Copy all the required jars from hadooplibs to the libext directory using the following command:
14 |
15 | $ cp ../oozie-3.3.2/hadooplibs/target/oozie-3.3.2-hadooplibs.tar.gz .
16 | $ tar xzvf oozie-3.3.2-hadooplibs.tar.gz
17 | $ cp oozie-3.3.2/hadooplibs/hadooplib-1.1.1.oozie-3.3.2/* libext/
18 |
19 | Get ExtJS – this library is not bundled with Oozie and needs to be downloaded separately; it is used for the Oozie Web Console:
20 |
21 | $ cd libext
22 | $ wget http://extjs.com/deploy/ext-2.2.zip
23 | $ cd ..
24 |
25 | Update ../hadoop/conf/core-site.xml as follows:
26 |
27 | <property>
28 | <name>hadoop.proxyuser.hduser.hosts</name>
29 | <value>localhost</value>
30 | </property>
31 |
32 | <property>
33 | <name>hadoop.proxyuser.hduser.groups</name>
34 | <value>hadoop</value>
35 | </property>
36 | Here, ‘hduser’ is the username and it belongs to ‘hadoop’ group.
37 |
38 | Prepare the WAR file
39 |
40 | $ ./bin/oozie-setup.sh prepare-war
41 |
42 | setting CATALINA_OPTS="$CATALINA_OPTS -Xmx1024m"
43 |
44 | INFO: Adding extension: /home/hduser/oozie/libext/commons-beanutils-1.7.0.jar
45 | INFO: Adding extension: /home/hduser/oozie/libext/commons-beanutils-core-1.8.0.jar
46 | INFO: Adding extension: /home/hduser/oozie/libext/commons-codec-1.4.jar
47 | INFO: Adding extension: /home/hduser/oozie/libext/commons-collections-3.2.1.jar
48 | INFO: Adding extension: /home/hduser/oozie/libext/commons-configuration-1.6.jar
49 | INFO: Adding extension: /home/hduser/oozie/libext/commons-digester-1.8.jar
50 | INFO: Adding extension: /home/hduser/oozie/libext/commons-el-1.0.jar
51 | INFO: Adding extension: /home/hduser/oozie/libext/commons-io-2.1.jar
52 | INFO: Adding extension: /home/hduser/oozie/libext/commons-lang-2.4.jar
53 | INFO: Adding extension: /home/hduser/oozie/libext/commons-logging-1.1.jar
54 | INFO: Adding extension: /home/hduser/oozie/libext/commons-math-2.1.jar
55 | INFO: Adding extension: /home/hduser/oozie/libext/commons-net-1.4.1.jar
56 | INFO: Adding extension: /home/hduser/oozie/libext/hadoop-client-1.1.1.jar
57 | INFO: Adding extension: /home/hduser/oozie/libext/hadoop-core-1.1.1.jar
58 | INFO: Adding extension: /home/hduser/oozie/libext/hsqldb-1.8.0.7.jar
59 | INFO: Adding extension: /home/hduser/oozie/libext/jackson-core-asl-1.8.8.jar
60 | INFO: Adding extension: /home/hduser/oozie/libext/jackson-mapper-asl-1.8.8.jar
61 | INFO: Adding extension: /home/hduser/oozie/libext/log4j-1.2.16.jar
62 | INFO: Adding extension: /home/hduser/oozie/libext/oro-2.0.8.jar
63 | INFO: Adding extension: /home/hduser/oozie/libext/xmlenc-0.52.jar
64 |
65 | New Oozie WAR file with added 'ExtJS library, JARs' at /home/hduser/oozie/oozie-server/webapps/oozie.war
66 |
67 | INFO: Oozie is ready to be started
68 |
69 | Create sharelib on HDFS
70 |
71 | $ ./bin/oozie-setup.sh sharelib create -fs hdfs://localhost:54310
72 | setting CATALINA_OPTS="$CATALINA_OPTS -Xmx1024m"
73 | the destination path for sharelib is: /user/hduser/share/lib
74 |
75 | Create the Oozie DB
76 |
77 | $ ./bin/ooziedb.sh create -sqlfile oozie.sql -run
78 | setting CATALINA_OPTS="$CATALINA_OPTS -Xmx1024m"
79 |
80 | Validate DB Connection
81 | DONE
82 | Check DB schema does not exist
83 | DONE
84 | Check OOZIE_SYS table does not exist
85 | DONE
86 | Create SQL schema
87 | DONE
88 | Create OOZIE_SYS table
89 | DONE
90 |
91 | Oozie DB has been created for Oozie version '3.3.2'
92 |
93 | The SQL commands have been written to: oozie.sql
94 |
95 | To start Oozie as a daemon use the following command:
96 |
97 | $ ./bin/oozied.sh start
98 |
99 | Setting OOZIE_HOME: /home/hduser/oozie
100 | Setting OOZIE_CONFIG: /home/hduser/oozie/conf
101 | Sourcing: /home/hduser/oozie/conf/oozie-env.sh
102 | setting CATALINA_OPTS="$CATALINA_OPTS -Xmx1024m"
103 | Setting OOZIE_CONFIG_FILE: oozie-site.xml
104 | Setting OOZIE_DATA: /home/hduser/oozie/data
105 | Setting OOZIE_LOG: /home/hduser/oozie/logs
106 | Setting OOZIE_LOG4J_FILE: oozie-log4j.properties
107 | Setting OOZIE_LOG4J_RELOAD: 10
108 | Setting OOZIE_HTTP_HOSTNAME: rohit-VirtualBox
109 | Setting OOZIE_HTTP_PORT: 11000
110 | Setting OOZIE_ADMIN_PORT: 11001
111 | Setting OOZIE_HTTPS_PORT: 11443
112 | Setting OOZIE_BASE_URL: http://rohit-VirtualBox:11000/oozie
113 | Setting CATALINA_BASE: /home/hduser/oozie/oozie-server
114 | Setting OOZIE_HTTPS_KEYSTORE_FILE: /home/hduser/.keystore
115 | Setting OOZIE_HTTPS_KEYSTORE_PASS: password
116 | Setting CATALINA_OUT: /home/hduser/oozie/logs/catalina.out
117 | Setting CATALINA_PID: /home/hduser/oozie/oozie-server/temp/oozie.pid
118 |
119 | Using CATALINA_OPTS: -Xmx1024m -Dderby.stream.error.file=/home/hduser/oozie/logs/derby.log
120 | Adding to CATALINA_OPTS: -Doozie.home.dir=/home/hduser/oozie -Doozie.config.dir=/home/hduser/oozie/conf -Doozie.log.dir=/home/hduser/oozie/logs -Doozie.data.dir=/home/hduser/oozie/data -Doozie.config.file=oozie-site.xml -Doozie.log4j.file=oozie-log4j.properties -Doozie.log4j.reload=10 -Doozie.http.hostname=rohit-VirtualBox -Doozie.admin.port=11001 -Doozie.http.port=11000 -Doozie.https.port=11443 -Doozie.base.url=http://rohit-VirtualBox:11000/oozie -Doozie.https.keystore.file=/home/hduser/.keystore -Doozie.https.keystore.pass=password -Djava.library.path=
121 |
122 | Using CATALINA_BASE: /home/hduser/oozie/oozie-server
123 | Using CATALINA_HOME: /home/hduser/oozie/oozie-server
124 | Using CATALINA_TMPDIR: /home/hduser/oozie/oozie-server/temp
125 | Using JRE_HOME: /usr/lib/jvm/java-6-oracle
126 | Using CLASSPATH: /home/hduser/oozie/oozie-server/bin/bootstrap.jar
127 | Using CATALINA_PID: /home/hduser/oozie/oozie-server/temp/oozie.pid
128 |
129 | To start Oozie as a foreground process use the following command:
130 |
131 | $ ./bin/oozied.sh run
132 |
133 | Check the Oozie log file logs/oozie.log to ensure Oozie started properly.
134 |
135 | Use the following command to check the status of Oozie from command line:
136 |
137 | $ ./bin/oozie admin -oozie http://localhost:11000/oozie -status
138 | System mode: NORMAL
139 |
140 | URL for the Oozie Web Console is http://localhost:11000/oozie
141 |
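The same check can be made against the Oozie REST API (part of the v1 web-services API); a sketch:

$ curl http://localhost:11000/oozie/v1/admin/status
# expect {"systemMode":"NORMAL"} when the server is healthy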
--------------------------------------------------------------------------------
/zookeeper_oozie/zookeeper.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/zookeeper_oozie/zookeeper.txt
--------------------------------------------------------------------------------