├── Courses_Offered ├── Advanced_Data Lake_and_Streaming.pdf ├── Advanced_Hadoop_TroubleshootingVer2.0.pdf ├── Advanced_Hadoop_adminstrationV2.0.pdf ├── HBase_catalog_ver2.0.pdf └── Hadoop_Admin_catalog.pdf ├── DNS ├── dns_installation.txt ├── named.txt └── zones.txt ├── Flume ├── commands ├── flume_collector.conf └── web_server_source.conf ├── HA_QJM ├── core-site.xml ├── hdfs-site.xml ├── hdfs-site_datanode.xml ├── hdfs-site_namenode.xml ├── steps └── zoo.cfg ├── HA_RM ├── yarn-site.xml.ha ├── yarn-site.xml.spark └── yarn-site_nodemanager_ha.xml ├── HA_hadoop ├── core-site.xml └── hdfs-site.xml ├── HBase ├── Optimizations │ ├── HBase_yscb.txt │ ├── Hbase_create_table.txt │ ├── Hbase_happybase.txt │ ├── Hbase_rand_gen.txt │ └── Netxillon_HBase.pdf ├── README.md ├── backup-masters ├── commands.txt ├── hbase-site.txt ├── hfile ├── hive-mysql.txt ├── hive.txt ├── regions.txt ├── regionservers ├── replication ├── tez-setup └── untitled.txt ├── Hive_performance ├── Jars ├── azure.tar.gz ├── hadoop-azure-3.1.1.3.1.0.0-SNAPSHOT.jar └── jce_policy-8.zip ├── Kafka ├── commands ├── kafka-env.sh ├── kafka_ganglia.txt ├── kafka_ganglia2.txt ├── kakfa_rsyslog.txt └── server.properties ├── Notes ├── Benchmarking.txt ├── Hadoop_lab.doc ├── Hadoop_upgrade.txt ├── Performance.txt ├── backup.txt ├── cassandra2.pdf ├── class3_questions ├── class4_questions ├── cloudera.txt ├── disk_partition ├── hadoop_ports.txt ├── hadoop_ports_firewall.xls ├── install_hadoop.txt ├── installation.txt ├── pig.txt ├── questions.txt ├── quick-links ├── quiz4.txt ├── quiz7.txt ├── quota.txt ├── rack.txt ├── remove_datanode.txt ├── repo_server.txt ├── scoop.txt ├── sqoop.txt ├── sqoop1.txt └── yarn.txt ├── README.md ├── Schedulers ├── capacity-scheduler.xml ├── commands ├── fair-scheduler.xml ├── mapred-site.xml ├── user-mappings.txt ├── yarn-site.xml_capacity └── yarn-site.xml_fair ├── Security ├── README.md ├── SSL_Configs │ ├── CA │ │ ├── README.txt │ │ └── openssl.cnf │ ├── commands_CA_JKS │ ├── gen-certs.sh │ └── hadoop_ssl_configs │ │ ├── core-site.xml │ │ ├── hdfs-site.xml │ │ ├── mapred-site.xml │ │ ├── ssl-client.xml │ │ ├── ssl-server.xml │ │ └── yarn-site.xml ├── kerberos │ ├── JT │ │ ├── core-site.xml │ │ ├── hdfs-site.xml │ │ ├── mapred-site.xml │ │ └── taskcontroller.cfg │ ├── Jsvc_download.txt │ ├── Namenode_Datanode │ │ ├── core-site.xml │ │ ├── hadoop-env.sh │ │ ├── hdfs-site.xml │ │ ├── mapred-site.xml │ │ └── taskcontroller.cfg │ ├── README.md │ ├── kdc.conf │ ├── kerberos_scripts │ │ ├── README.md │ │ ├── add_users.sh │ │ ├── copy_keytabs.sh │ │ ├── create_dn_princs.sh │ │ ├── create_nn_princs.sh │ │ ├── create_partions.sh │ │ ├── create_user_keytab.sh │ │ ├── delete_list │ │ ├── delete_princs.sh │ │ ├── dn_host_list │ │ ├── hosts │ │ ├── install_krb.sh │ │ ├── list_princs.sh │ │ ├── nn_host_list │ │ ├── setup_kerberos.sh │ │ ├── user_host_list │ │ └── user_list │ ├── kerberos_user_mappings.txt │ └── krb5.conf ├── kms │ ├── kms-acl │ └── kms-setup └── ldap │ ├── Installation_steps │ ├── addmembers.ldif │ ├── adduser.ldif │ ├── base.ldif │ ├── base1.ldif │ ├── base2.ldif │ ├── commands │ ├── groupadd.ldif │ ├── slapd.conf.obsolete │ ├── test.ldif │ └── users.ldif ├── Spark ├── Spark_build ├── examples.txt ├── spark-defaults.conf ├── sparkPython │ ├── erfunction.py │ ├── error.py │ ├── error.txt │ ├── logparser.py │ ├── pivot.txt │ ├── square.py │ ├── wordcount.py │ └── wordcount.txt ├── spark_command.txt ├── spark_standalone_cluster.txt └── yarn-site.xml.spark ├── Yarn_tuning ├── Yarn.pdf ├── mapred-site.xml 
└── yarn-site.xml ├── _config.yml ├── hadoop1.0 └── README.md ├── hadoop2.0 ├── bash_profile ├── core-site.xml ├── hdfs-site.xml ├── mapred-site.xml └── yarn-site.xml ├── hadoop_build64bit ├── jumbune ├── logging ├── map_scripts ├── job.txt ├── mapper.py ├── mapper.sh ├── reducer.py └── reducer.sh ├── zookeeper.txt └── zookeeper_oozie ├── oozie-server.txt └── zookeeper.txt /Courses_Offered/Advanced_Data Lake_and_Streaming.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/Advanced_Data Lake_and_Streaming.pdf -------------------------------------------------------------------------------- /Courses_Offered/Advanced_Hadoop_TroubleshootingVer2.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/Advanced_Hadoop_TroubleshootingVer2.0.pdf -------------------------------------------------------------------------------- /Courses_Offered/Advanced_Hadoop_adminstrationV2.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/Advanced_Hadoop_adminstrationV2.0.pdf -------------------------------------------------------------------------------- /Courses_Offered/HBase_catalog_ver2.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/HBase_catalog_ver2.0.pdf -------------------------------------------------------------------------------- /Courses_Offered/Hadoop_Admin_catalog.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Courses_Offered/Hadoop_Admin_catalog.pdf -------------------------------------------------------------------------------- /DNS/dns_installation.txt: -------------------------------------------------------------------------------- 1 | DNS Installation 2 | ================ 3 | 4 | # yum install bind -y 5 | 6 | # vi /etc/named.conf 7 | 8 | remove everything and just use the lines below (Change IP's accordingly) 9 | 10 | options { 11 | <<<<<<< HEAD:DNS/dns_installation.txt 12 | listen-on port 53 { 192.168.1.254; }; 13 | directory "/var/named"; 14 | ======= 15 | listen-on port 53 { 192.168.1.254; }; 16 | directory "/var/named"; 17 | >>>>>>> 925912d1565c9ba263d928397d065c93178cb463:hadoop1.0/dns.txt 18 | 19 | allow-query { any; }; 20 | 21 | forwarders { 192.168.1.1; }; 22 | 23 | }; 24 | 25 | zone "cluster1.com" IN { 26 | type master; 27 | file "/var/named/named.hadoop.forw"; 28 | }; 29 | 30 | <<<<<<< HEAD:DNS/dns_installation.txt 31 | zone "1.168.192.in-addr.arpa" IN { 32 | type master; 33 | file "/var/named/named.ha.rev"; 34 | 35 | }; 36 | 37 | ================================================= 38 | 39 | # vi /var/named/named.hadoop.forward.zone 40 | 41 | $TTL 86400 42 | @ IN SOA cluster1.com root ( 43 | 42 ; serial 44 | 3H ; refresh 45 | 15M ; retry 46 | 1W ; expiry 47 | 1D ) ; minimum 48 | 49 | IN NS ns1.cluster1.com 50 | ns1 IN A 192.168.1.70 51 | 52 | nn1 IN A 192.168.1.70 53 | nn2 IN A 192.168.1.77 54 | dn1 IN A 192.168.1.71 55 | dn2 IN A 192.168.1.72 56 | dn3 IN A 192.168.1.73 57 | snn IN A 
192.168.1.68 58 | jt IN A 192.168.1.69 59 | db IN A 192.168.1.99 60 | kdc IN A 192.168.1.40 61 | cm IN A 192.168.1.41 62 | base IN A 192.168.1.10 63 | cm1 IN A 192.168.1.11 64 | node1 IN A 192.168.1.12 65 | filer IN A 192.168.1.222 66 | cloudera IN A 192.168.1.151 67 | datanode IN A 192.168.1.152 68 | hadooplab IN A 192.168.1.33 69 | 70 | =================== 71 | 72 | # vi /var/named/named.ha.rev 73 | ======= 74 | 75 | zone "1.168.192.in-addr.arpa" IN { 76 | type master; 77 | file "/var/named/named.hadoop.rev"; 78 | }; 79 | ============ 80 | zone files (cluster1.com) forward zone 81 | ========= 82 | 83 | $TTL 1D 84 | @ IN SOA @ rname.invalid. ( 85 | 0 ; serial 86 | 1D ; refresh 87 | 1H ; retry 88 | 1W ; expire 89 | 3H ) ; minimum 90 | 91 | 92 | IN NS ns1.cluster1.com. 93 | ns1 IN A 192.168.1.254 94 | repo IN A 192.168.1.254 95 | 96 | ;namenodes 97 | nn1 IN A 192.168.1.70 98 | nn2 IN A 192.168.1.71 99 | 100 | ;Datanodes 101 | dn1 IN A 192.168.1.72 102 | dn2 IN A 192.168.1.73 103 | dn3 IN A 192.168.1.74 104 | dn4 IN A 192.168.1.75 105 | 106 | ;Other Masters 107 | 108 | snn IN A 192.168.1.68 109 | jt IN A 192.168.1.69 110 | client IN A 192.168.1.99 111 | kdc IN A 192.168.1.102 112 | 113 | ;Cloudera 114 | 115 | cm IN A 192.168.1.40 116 | node1 IN A 192.168.1.41 117 | node2 IN A 192.168.1.42 118 | server IN A 192.168.1.44 119 | 120 | ;Storage 121 | 122 | filer IN A 192.168.1.253 123 | 124 | ;Databases; 125 | 126 | mynode1 IN A 192.168.1.81 127 | mynode2 IN A 192.168.1.82 128 | labserver IN A 192.168.1.14 129 | =========== 130 | 131 | Reverse zone for cluster1.com 132 | ============================== 133 | >>>>>>> 925912d1565c9ba263d928397d065c93178cb463:hadoop1.0/dns.txt 134 | 135 | $TTL 86400 136 | @ IN SOA ns1.cluster1.com. root.cluster1.com. ( 137 | 1997022700 ; Serial 138 | 28800 ; Refresh 139 | 14400 ; Retry 140 | 3600000 ; Expire 141 | 86400 ) ; Minimum 142 | 143 | 1.168.192.in-addr.arpa. IN NS ns1.cluster1.com. 144 | 145 | 70 IN PTR nn1.cluster1.com. 146 | 40 IN PTR kdc.cluster1.com. 147 | 41 IN PTR cm.cluster1.com. 148 | 20 IN PTR node1.cluster1.com. 149 | 71 IN PTR dn1.cluster1.com. 150 | 72 IN PTR dn2.cluster1.com. 151 | 73 IN PTR dn3.cluster1.com. 152 | 10 IN PTR base.cluster1.com. 153 | 11 IN PTR cm1.cluster1.com. 154 | 12 IN PTR node1.cluster1.com. 155 | 151 IN PTR cloudera.cluster1.com. 156 | 152 IN PTR datanode.cluster1.com. 
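
Optional: validate the configuration and zone files before restarting named
(a quick sketch; substitute whichever zone file names you used above):

# named-checkconf /etc/named.conf
# named-checkzone cluster1.com /var/named/named.hadoop.forw
# named-checkzone 1.168.192.in-addr.arpa /var/named/named.ha.rev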
157 | 158 | ============================================== 159 | 160 | # chown -R root:named /var/named/ 161 | 162 | 163 | 164 | # service named restart 165 | 166 | =========================== 167 | 168 | On all client machines 169 | 170 | # vi /etc/resolv.conf 171 | 172 | nameserver 192.168.1.254 173 | -------------------------------------------------------------------------------- /DNS/named.txt: -------------------------------------------------------------------------------- 1 | options { 2 | listen-on port 53 { 192.168.1.254; }; 3 | directory "/var/named"; 4 | 5 | allow-query { any; }; 6 | 7 | forwarders { 192.168.1.1; }; 8 | 9 | }; 10 | 11 | zone "cluster1.com" IN { 12 | type master; 13 | file "/var/named/named.hadoop.forward.zone"; 14 | }; 15 | 16 | ;Second Zone 17 | zone "hacluster1.com" IN { 18 | type master; 19 | file "/var/named/named.ha.forward.zone"; 20 | }; 21 | 22 | zone "1.168.192.in-addr.arpa" IN { 23 | type master; 24 | file "/var/named/named.ha.rev"; 25 | # file "/var/named/named.hadoop.rev"; 26 | }; 27 | -------------------------------------------------------------------------------- /DNS/zones.txt: -------------------------------------------------------------------------------- 1 | #Forward lookup zone 2 | ;Make sure you understand that comments could be using # or ; and it might change in future versions 3 | 4 | $TTL 86400 5 | @ IN SOA cluster1.com root ( 6 | 42 ; serial 7 | 3H ; refresh 8 | 15M ; retry 9 | 1W ; expiry 10 | 1D ) ; minimum 11 | 12 | IN NS ns1.cluster1.com 13 | ns1 IN A 192.168.1.70 14 | 15 | nn1 IN A 192.168.1.70 16 | nn2 IN A 192.168.1.77 17 | dn1 IN A 192.168.1.71 18 | dn2 IN A 192.168.1.72 19 | dn3 IN A 192.168.1.73 20 | snn IN A 192.168.1.68 21 | jt IN A 192.168.1.69 22 | db IN A 192.168.1.99 23 | kdc IN A 192.168.1.40 24 | cm IN A 192.168.1.41 25 | base IN A 192.168.1.10 26 | cm1 IN A 192.168.1.11 27 | node1 IN A 192.168.1.12 28 | filer IN A 192.168.1.222 29 | cloudera IN A 192.168.1.151 30 | datanode IN A 192.168.1.152 31 | hadooplab IN A 192.168.1.33 32 | 33 | 34 | ================ 35 | # Reverse lookup Zone 36 | +++++++++++++++++++++++ 37 | 38 | $TTL 86400 39 | @ IN SOA ns1.cluster1.com. root.cluster1.com. ( 40 | 1997022700 ; Serial 41 | 28800 ; Refresh 42 | 14400 ; Retry 43 | 3600000 ; Expire 44 | 86400 ) ; Minimum 45 | 46 | 1.168.192.in-addr.arpa. IN NS ns1.cluster1.com. 47 | 48 | 70 IN PTR nn1.cluster1.com. 49 | 40 IN PTR kdc.cluster1.com. 50 | 41 IN PTR cm.cluster1.com. 51 | 20 IN PTR node1.cluster1.com. 52 | 71 IN PTR dn1.cluster1.com. 53 | 72 IN PTR dn2.cluster1.com. 54 | 73 IN PTR dn3.cluster1.com. 55 | 10 IN PTR base.cluster1.com. 56 | 11 IN PTR cm1.cluster1.com. 57 | 12 IN PTR node1.cluster1.com. 58 | 151 IN PTR cloudera.cluster1.com. 59 | 152 IN PTR datanode.cluster1.com. 60 | -------------------------------------------------------------------------------- /Flume/commands: -------------------------------------------------------------------------------- 1 | # Start Collector first: 2 | 3 | flume-ng agent -c conf -f flume/conf/flume_collector.conf -n collector (Name must match the agent name) 4 | flume-ng agent -c conf -f flume/conf/web_server_source.conf -n source_agent (Name must match agent name) 5 | 6 | 7 | # Change the hostnames/IPs in the config files accordingly. 8 | 9 | 10 | #Note: 11 | - New deployments do not use flume any more and kafka is doing lot of things. 12 | - But, flume is still a good use case for log aggregatio etc and avoid the overhead of kafka cluster. 
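
# Optional check that events are landing (a sketch, assuming the file_roll
# directory from flume_collector.conf and the log_type value set in
# web_server_source.conf):

ls /var/log/flume-ng
hadoop fs -ls /user/hadoop/flume-channel/apache_access_combined/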
13 | -------------------------------------------------------------------------------- /Flume/flume_collector.conf: -------------------------------------------------------------------------------- 1 | #http://flume.apache.org/FlumeUserGuide.html#avro-source 2 | collector.sources = AvroIn 3 | collector.sources.AvroIn.type = avro 4 | collector.sources.AvroIn.bind = 192.168.1.109 5 | collector.sources.AvroIn.port = 4545 6 | collector.sources.AvroIn.channels = mc1 mc2 7 | 8 | ## Channels ## 9 | ## Source writes to 2 channels, one for each sink 10 | collector.channels = mc1 mc2 11 | 12 | #http://flume.apache.org/FlumeUserGuide.html#memory-channel 13 | 14 | collector.channels.mc1.type = memory 15 | collector.channels.mc1.capacity = 100 16 | 17 | collector.channels.mc2.type = memory 18 | collector.channels.mc2.capacity = 100 19 | 20 | ## Sinks ## 21 | collector.sinks = LocalOut HadoopOut 22 | 23 | ## Write copy to Local Filesystem 24 | #http://flume.apache.org/FlumeUserGuide.html#file-roll-sink 25 | collector.sinks.LocalOut.type = file_roll 26 | collector.sinks.LocalOut.sink.directory = /var/log/flume-ng 27 | collector.sinks.LocalOut.sink.rollInterval = 0 28 | collector.sinks.LocalOut.channel = mc1 29 | 30 | ## Write to HDFS 31 | #http://flume.apache.org/FlumeUserGuide.html#hdfs-sink 32 | collector.sinks.HadoopOut.type = hdfs 33 | collector.sinks.HadoopOut.channel = mc2 34 | collector.sinks.HadoopOut.hdfs.path = /user/hadoop/flume-channel/%{log_type}/%y%m%d 35 | collector.sinks.HadoopOut.hdfs.fileType = DataStream 36 | collector.sinks.HadoopOut.hdfs.writeFormat = Text 37 | collector.sinks.HadoopOut.hdfs.rollSize = 0 38 | collector.sinks.HadoopOut.hdfs.rollCount = 10000 39 | collector.sinks.HadoopOut.hdfs.rollInterval = 600 40 | 41 | -------------------------------------------------------------------------------- /Flume/web_server_source.conf: -------------------------------------------------------------------------------- 1 | # Source Config 2 | 3 | source_agent.sources = apache_server 4 | source_agent.sources.apache_server.type = exec 5 | source_agent.sources.apache_server.command = tail -f /var/log/httpd/access_log 6 | source_agent.sources.apache_server.batchSize = 1 7 | source_agent.sources.apache_server.channels = mc1 8 | source_agent.sources.apache_server.interceptors = itime ihost itype 9 | 10 | # http://flume.apache.org/FlumeUserGuide.html#timestamp-interceptor 11 | source_agent.sources.apache_server.interceptors.itime.type = timestamp 12 | 13 | # http://flume.apache.org/FlumeUserGuide.html#host-interceptor 14 | source_agent.sources.apache_server.interceptors.ihost.type = host 15 | source_agent.sources.apache_server.interceptors.ihost.useIP = false 16 | source_agent.sources.apache_server.interceptors.ihost.hostHeader = host 17 | 18 | # http://flume.apache.org/FlumeUserGuide.html#static-interceptor 19 | source_agent.sources.apache_server.interceptors.itype.type = static 20 | source_agent.sources.apache_server.interceptors.itype.key = log_type 21 | source_agent.sources.apache_server.interceptors.itype.value = apache_access_combined 22 | 23 | # http://flume.apache.org/FlumeUserGuide.html#memory-channel 24 | source_agent.channels = mc1 25 | source_agent.channels.mc1.type = memory 26 | source_agent.channels.mc1.capacity = 100 27 | 28 | ## Send to Flume Collector on Hadoop Node 29 | # http://flume.apache.org/FlumeUserGuide.html#avro-sink 30 | source_agent.sinks = avro_sink 31 | source_agent.sinks.avro_sink.type = avro 32 | source_agent.sinks.avro_sink.channel = mc1 33 | 
source_agent.sinks.avro_sink.hostname = 192.168.1.109 34 | source_agent.sinks.avro_sink.port = 4545 35 | 36 | #source_agent.sinks = LocalOut 37 | #source_agent.sinks.LocalOut.type = file_roll 38 | #source_agent.sinks.LocalOut.sink.directory = /tmp/flume 39 | #source_agent.sinks.LocalOut.sink.rollInterval = 0 40 | #source_agent.sinks.LocalOut.channel = mc1 41 | -------------------------------------------------------------------------------- /HA_QJM/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | fs.defaultFS 8 | hdfs://netxillon 9 | 10 | 11 | 12 | ha.zookeeper.quorum 13 | nn1.dilithium.com:2181,nn2.dilithium.com:2181,hbm1.dilithium.com:2181 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /HA_QJM/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.namenode.name.dir 5 | file:/data/n1,file:/data/n2 6 | 7 | 8 | 9 | dfs.replication 10 | 2 11 | 12 | 13 | 14 | dfs.blocksize 15 | 268435456 16 | 17 | 18 | 19 | dfs.nameservices 20 | netxillon 21 | 22 | 23 | 24 | dfs.ha.namenodes.netxillon 25 | nn1,nn2 26 | 27 | 28 | 29 | dfs.namenode.rpc-address.netxillon.nn1 30 | nn1.dilithium.com:9000 31 | 32 | 33 | 34 | dfs.namenode.rpc-address.netxillon.nn2 35 | nn2.dilithium.com:9000 36 | 37 | 38 | 39 | dfs.namenode.http-address.netxillon.nn1 40 | nn1.dilithium.com:50070 41 | 42 | 43 | 44 | dfs.namenode.http-address.netxillon.nn2 45 | nn2.dilithium.com:50070 46 | 47 | 48 | 49 | dfs.ha.automatic-failover.enabled 50 | true 51 | 52 | 53 | 54 | dfs.journalnode.edits.dir 55 | /data/netxillon 56 | 57 | 58 | 59 | dfs.namenode.shared.edits.dir 60 | qjournal://nn1.dilithium.com:8485;nn2.dilithium.com:8485;hbm1.dilithium.com:8485/netxillon 61 | 62 | 63 | 64 | dfs.client.failover.proxy.provider.netxillon 65 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 66 | 67 | 68 | 69 | dfs.ha.fencing.ssh.private-key-files 70 | /home/hadoop/.ssh/id_rsa 71 | 72 | 73 | 74 | dfs.ha.fencing.methods 75 | sshfence 76 | shell(/bin/true) 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /HA_QJM/hdfs-site_datanode.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.datanode.data.dir 5 | file:/space/d1,file:/space/d2 6 | 7 | 8 | 9 | dfs.replication 10 | 2 11 | 12 | 13 | 14 | dfs.blocksize 15 | 268435456 16 | 17 | 18 | 19 | dfs.nameservices 20 | netxillon 21 | 22 | 23 | 24 | dfs.ha.namenodes.netxillon 25 | nn1,nn2 26 | 27 | 28 | 29 | dfs.namenode.rpc-address.netxillon.nn1 30 | nn1.dilithium.com:9000 31 | 32 | 33 | 34 | dfs.namenode.rpc-address.netxillon.nn2 35 | nn2.dilithium.com:9000 36 | 37 | 38 | 39 | dfs.client.failover.proxy.provider.netxillon 40 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /HA_QJM/hdfs-site_namenode.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.namenode.name.dir 5 | file:/data/n1,file:/data/n2 6 | 7 | 8 | 9 | dfs.replication 10 | 2 11 | 12 | 13 | 14 | dfs.blocksize 15 | 268435456 16 | 17 | 18 | 19 | dfs.nameservices 20 | netxillon 21 | 22 | 23 | 24 | dfs.ha.namenodes.netxillon 25 | nn1,nn2 26 | 27 | 28 | 29 | dfs.namenode.rpc-address.netxillon.nn1 30 | nn1.dilithium.com:9000 31 | 32 | 33 | 34 | 
dfs.namenode.rpc-address.netxillon.nn2 35 | nn2.dilithium.com:9000 36 | 37 | 38 | 39 | dfs.namenode.http-address.netxillon.nn1 40 | nn1.dilithium.com:50070 41 | 42 | 43 | 44 | dfs.namenode.http-address.netxillon.nn2 45 | nn2.dilithium.com:50070 46 | 47 | 48 | 49 | dfs.ha.automatic-failover.enabled 50 | true 51 | 52 | 53 | 54 | dfs.journalnode.edits.dir 55 | /data/netxillon 56 | 57 | 58 | 59 | dfs.namenode.shared.edits.dir 60 | qjournal://nn1.dilithium.com:8485;nn2.dilithium.com:8485;hbm1.dilithium.com:8485/netxillon 61 | 62 | 63 | 64 | dfs.client.failover.proxy.provider.netxillon 65 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 66 | 67 | 68 | 69 | dfs.ha.fencing.ssh.private-key-files 70 | /home/hadoop/.ssh/id_rsa 71 | 72 | 73 | 74 | dfs.ha.fencing.methods 75 | sshfence 76 | shell(/bin/true) 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /HA_QJM/steps: -------------------------------------------------------------------------------- 1 | QJM Steps 2 | ========= 3 | 1. Setup zookeeper quorum and make sure that it is healthy 4 | - ./zookeeper-3.4.5/bin/zkServer.sh start 5 | - ./zookeeper-3.4.5/bin/zkCli.sh -server nn1.dilithium.com:2181 6 | 7 | or 8 | 9 | [hdfs@nn2 ~]$ zkServer.sh status 10 | ZooKeeper JMX enabled by default 11 | Using config: /opt/cluster/zoo/bin/../conf/zoo.cfg 12 | Mode: follower 13 | 14 | 2. Setup core and hdfs file on each node as given. 15 | 16 | 3. Start Journalnode on all journal node machines 17 | - hadoop-daemon.sh start journalnode 18 | 19 | 4. Format namenode (Do not run this command, if you already have a NN runing and want to update it to HA) 20 | - hdfs namenode -format 21 | 22 | 5. Initialize shared edits for Journal node to see: 23 | - hdfs namenode -initializeSharedEdits -force 24 | 25 | Note: 26 | - namenode must be stopped for this step; 27 | - Only run this if you have not executed step4. 28 | - Means that there was already a single Namenode and now you need to initialize the shared edits for Journals. 29 | - You must not use initialize command if we are formating the Namenode as it will automatically initiallize the Journals 30 | node directories 31 | 32 | 33 | 6. Format zkFC 34 | - hdfs zkfc -formatZK -force 35 | 36 | 7. Start namenode 37 | - hadoop-daemon.sh start namenode 38 | 39 | 8. Start ZKFC 40 | - hadoop-daemon.sh start zkfc 41 | 42 | 9. Bootstrap StandbyNamenode 43 | - hdfs namenode -bootstrapStandby 44 | 45 | 10. Start Namenode on standby 46 | - hadoop-daemon.sh start namenode 47 | 48 | 11. Start ZKFC on standy 49 | - hadoop-daemon.sh start zkfc 50 | 51 | Now your cluster is HA with one active Namenode 52 | 53 | [hdfs@nn1 ~]$ hdfs haadmin -getServiceState nn1 54 | active 55 | [hdfs@nn1 ~]$ hdfs haadmin -getServiceState nn2 56 | standby 57 | 58 | The "start-dfs.sh" script understands that it is a HA with Journal nodes and automatically starts: 59 | - Both namenodes 60 | - All Journal nodes 61 | - Datanode nodes 62 | - Both ZkFC 63 | 64 | Note: Make sure you start the ZK quorum before hand. 65 | 66 | [hdfs@nn1 ~]$ jps 67 | 7828 Jps 68 | 7336 JournalNode 69 | 7512 DFSZKFailoverController 70 | 7162 NameNode 71 | 72 | -------------------------------------------------------------------------------- /HA_QJM/zoo.cfg: -------------------------------------------------------------------------------- 1 | [hdfs@nn2 ~]$ cat .bash_profile 2 | # .bash_profile 3 | 4 | # Get the aliases and functions 5 | if [ -f ~/.bashrc ]; then 6 | . 
~/.bashrc 7 | fi 8 | 9 | # User specific environment and startup programs 10 | 11 | PATH=$PATH:$HOME/bin 12 | ZOOKEEPER_HOME=/opt/cluster/zoo 13 | 14 | PATH=$ZOOKEEPER_HOME/bin:$PATH 15 | export PATH 16 | 17 | 18 | [hdfs@nn2 ~]$ cat /opt/cluster/zoo/conf/zoo.cfg 19 | 20 | # The number of milliseconds of each tick 21 | tickTime=2000 22 | 23 | # The number of ticks that the initial synchronization phase can take 24 | initLimit=10 25 | 26 | # The number of ticks that can pass between 27 | # sending a request and getting an acknowledgement 28 | syncLimit=5 29 | 30 | # the directory where the snapshot is stored. 31 | # Choose appropriately for your environment 32 | dataDir=/opt/cluster/zookeeper/data 33 | 34 | # the port at which the clients will connect 35 | clientPort=2181 36 | 37 | maxClientCnxns=60 38 | 39 | # the directory where transaction log is stored. 40 | # this parameter provides dedicated log device for ZooKeeper 41 | dataLogDir=/opt/cluster/zookeeper/logs 42 | 43 | # ZooKeeper server and its port no. 44 | # ZooKeeper ensemble should know about every other machine in the ensemble 45 | # specify server id by creating 'myid' file in the dataDir 46 | # use hostname instead of IP address for convenient maintenance 47 | server.1=nn1.dilithium.com:2888:3888 48 | server.2=nn2.dilithium.com:2889:3889 49 | server.3=hbm1.dilithium.com:2890:3890 50 | 51 | -------------------------------------------------------------------------------- /HA_RM/yarn-site.xml.ha: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | yarn.nodemanager.aux-services 8 | mapreduce_shuffle,spark_shuffle,spark2_shuffle 9 | 10 | 11 | 12 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 13 | org.apache.hadoop.mapred.ShuffleHandler 14 | 15 | 16 | 17 | yarn.nodemanager.aux-services.spark_shuffle.class 18 | org.apache.spark.network.yarn.YarnShuffleService 19 | 20 | 21 | 22 | yarn.nodemanager.aux-services.spark2_shuffle.class 23 | org.apache.spark.network.yarn.YarnShuffleService 24 | 25 | 26 | # HA Configuration 27 | 28 | 29 | 30 | 31 | yarn.resourcemanager.ha.enabled 32 | true 33 | 34 | 35 | 36 | yarn.resourcemanager.cluster-id 37 | netxillon 38 | 39 | 40 | 41 | yarn.resourcemanager.ha.rm-ids 42 | rm1,rm2 43 | 44 | 45 | 46 | yarn.resourcemanager.hostname.rm1 47 | rm1.dilithium.com 48 | 49 | 50 | 51 | yarn.resourcemanager.hostname.rm2 52 | rm2.dilithium.com 53 | 54 | 55 | 56 | yarn.resourcemanager.webapp.address.rm1 57 | rm1.dilithium.com:8088 58 | 59 | 60 | 61 | yarn.resourcemanager.webapp.address.rm2 62 | rm2.dilithium.com:8088 63 | 64 | 65 | 66 | yarn.resourcemanager.recovery.enabled 67 | true 68 | 69 | 70 | 71 | yarn.resourcemanager.store.class 72 | org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore 73 | 74 | 75 | 76 | yarn.resourcemanager.zk-address 77 | nn1.dilithium.com:2181,nn2.dilithium.com:2181,hbm1.dilithium.com:2181 78 | 79 | 80 | 81 | yarn.client.failover-proxy-provider 82 | org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider 83 | 84 | 85 | 99 | # End HA Configuration 100 | 101 | 102 | yarn.nodemanager.resource.memory-mb 103 | 3072 104 | 105 | 106 | 107 | yarn.scheduler.minimum-allocation-mb 108 | 256 109 | 110 | 111 | 112 | yarn.scheduler.maximum-allocation-mb 113 | 3072 114 | 115 | 116 | 117 | yarn.scheduler.minimum-allocation-vcores 118 | 1 119 | 120 | 121 | 122 | yarn.scheduler.maximum-allocation-vcores 123 | 12 124 | 125 | 126 | 127 | yarn.nodemanager.resource.cpu-vcores 128 | 12 129 | 130 | 131 | 132 | 133 | 
yarn.nodemanager.vmem-pmem-ratio 134 | 2.1 135 | 136 | 137 | 138 | 139 | yarn.log-aggregation-enable 140 | true 141 | 142 | 143 | 144 | Where to aggregate logs to. 145 | yarn.nodemanager.remote-app-log-dir 146 | /tmp/logs 147 | 148 | 149 | 150 | yarn.log-aggregation.retain-seconds 151 | 259200 152 | 153 | 154 | 155 | yarn.log-aggregation.retain-check-interval-seconds 156 | 3600 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /HA_RM/yarn-site.xml.spark: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | yarn.nodemanager.aux-services 8 | mapreduce_shuffle,spark_shuffle,spark2_shuffle 9 | 10 | 11 | 12 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 13 | org.apache.hadoop.mapred.ShuffleHandler 14 | 15 | 16 | 17 | yarn.nodemanager.aux-services.spark_shuffle.class 18 | org.apache.spark.network.yarn.YarnShuffleService 19 | 20 | 21 | 22 | 23 | yarn.nodemanager.aux-services.spark2_shuffle.class 24 | org.apache.spark.network.yarn.YarnShuffleService 25 | 26 | 27 | 28 | yarn.resourcemanager.resource-tracker.address 29 | rm1.dilithium.com:9001 30 | 31 | 32 | 33 | yarn.resourcemanager.scheduler.address 34 | rm1.dilithium.com:9002 35 | 36 | 37 | 38 | yarn.resourcemanager.address 39 | rm1.dilithium.com:9003 40 | 41 | 42 | # 43 | #yarn.nodemanager.local-dirs 44 | #file:/space/tmp1,file:/space/tmp2 45 | # 46 | 47 | 48 | yarn.nodemanager.resource.memory-mb 49 | 3072 50 | 51 | 52 | 53 | yarn.scheduler.minimum-allocation-mb 54 | 256 55 | 56 | 57 | 58 | yarn.scheduler.maximum-allocation-mb 59 | 3072 60 | 61 | 62 | 63 | yarn.scheduler.minimum-allocation-vcores 64 | 1 65 | 66 | 67 | 68 | yarn.scheduler.maximum-allocation-vcores 69 | 12 70 | 71 | 72 | 73 | yarn.nodemanager.resource.cpu-vcores 74 | 12 75 | 76 | 77 | 78 | 79 | yarn.nodemanager.vmem-pmem-ratio 80 | 2.1 81 | 82 | 83 | # 84 | # yarn.nodemanager.vmem-check-enabled 85 | # false 86 | # Whether virtual memory limits will be enforced for containers 87 | # 88 | 89 | 90 | yarn.log-aggregation-enable 91 | true 92 | 93 | 94 | 95 | Where to aggregate logs to. 
96 | yarn.nodemanager.remote-app-log-dir 97 | /tmp/logs 98 | 99 | 100 | 101 | yarn.log-aggregation.retain-seconds 102 | 259200 103 | 104 | 105 | 106 | yarn.log-aggregation.retain-check-interval-seconds 107 | 3600 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /HA_RM/yarn-site_nodemanager_ha.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | yarn.nodemanager.aux-services 8 | mapreduce_shuffle,spark_shuffle,spark2_shuffle 9 | 10 | 11 | 12 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 13 | org.apache.hadoop.mapred.ShuffleHandler 14 | 15 | 16 | 17 | yarn.nodemanager.aux-services.spark_shuffle.class 18 | org.apache.spark.network.yarn.YarnShuffleService 19 | 20 | 21 | 22 | yarn.nodemanager.aux-services.spark2_shuffle.class 23 | org.apache.spark.network.yarn.YarnShuffleService 24 | 25 | 26 | # HA Configuration 27 | 28 | 29 | 30 | 31 | yarn.resourcemanager.ha.enabled 32 | true 33 | 34 | 35 | 36 | yarn.resourcemanager.cluster-id 37 | netxillon 38 | 39 | 40 | 41 | yarn.resourcemanager.ha.rm-ids 42 | rm1,rm2 43 | 44 | 45 | 46 | yarn.resourcemanager.hostname.rm1 47 | rm1.dilithium.com 48 | 49 | 50 | 51 | yarn.resourcemanager.hostname.rm2 52 | rm2.dilithium.com 53 | 54 | 55 | 56 | yarn.client.failover-proxy-provider 57 | org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider 58 | 59 | 60 | # End HA Configuration 61 | 62 | 63 | yarn.nodemanager.resource.memory-mb 64 | 3072 65 | 66 | 67 | 68 | yarn.scheduler.minimum-allocation-mb 69 | 256 70 | 71 | 72 | 73 | yarn.scheduler.maximum-allocation-mb 74 | 3072 75 | 76 | 77 | 78 | yarn.scheduler.minimum-allocation-vcores 79 | 1 80 | 81 | 82 | 83 | yarn.scheduler.maximum-allocation-vcores 84 | 12 85 | 86 | 87 | 88 | yarn.nodemanager.resource.cpu-vcores 89 | 12 90 | 91 | 92 | 93 | 94 | yarn.nodemanager.vmem-pmem-ratio 95 | 2.1 96 | 97 | 98 | 99 | 100 | yarn.log-aggregation-enable 101 | true 102 | 103 | 104 | 105 | Where to aggregate logs to. 
106 | yarn.nodemanager.remote-app-log-dir 107 | /tmp/logs 108 | 109 | 110 | 111 | yarn.log-aggregation.retain-seconds 112 | 259200 113 | 114 | 115 | 116 | yarn.log-aggregation.retain-check-interval-seconds 117 | 3600 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /HA_hadoop/core-site.xml: -------------------------------------------------------------------------------- 1 | Both Namenodes 2 | ============== 3 | 4 | 5 | 6 | 7 | fs.defaultFS 8 | hdfs://mycluster 9 | 10 | 11 | 12 | dfs.replication 13 | 1 14 | 15 | 16 | 17 | 18 | Data Nodes 19 | ========== 20 | 21 | 22 | 23 | fs.defaultFS 24 | hdfs://mycluster 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /HA_hadoop/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | Namenodes 2 | ========= 3 | 4 | 5 | 6 | 7 | dfs.namenode.name.dir 8 | file:/data/namenode 9 | 10 | 11 | 12 | dfs.nameservices 13 | mycluster 14 | 15 | 16 | 17 | dfs.ha.namenodes.mycluster 18 | nn1,nn2 19 | 20 | 21 | 22 | dfs.namenode.rpc-address.mycluster.nn1 23 | ha-nn1.hacluster1.com:9000 24 | 25 | 26 | 27 | dfs.namenode.rpc-address.mycluster.nn2 28 | ha-nn2.hacluster1.com:9000 29 | 30 | 31 | 32 | dfs.namenode.http-address.mycluster.nn1 33 | ha-nn1.hacluster1.com:50070 34 | 35 | 36 | 37 | dfs.namenode.http-address.mycluster.nn2 38 | ha-nn2.hacluster1.com:50070 39 | 40 | 41 | 42 | dfs.namenode.shared.edits.dir 43 | file:///mnt/filer 44 | 45 | 46 | 47 | dfs.client.failover.proxy.provider.mycluster 48 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 49 | 50 | 51 | 52 | dfs.ha.fencing.methods 53 | sshfence 54 | 55 | 56 | 57 | dfs.ha.fencing.ssh.private-key-files 58 | /home/hadoop/.ssh/id_rsa 59 | 60 | 61 | 62 | dfs.ha.fencing.methods 63 | sshfence 64 | shell(/bin/true) 65 | 66 | 67 | 68 | 69 | ====================== 70 | 71 | Datanodes 72 | ========= 73 | 74 | 75 | 76 | 77 | dfs.datanode.data.dir 78 | file:/data/datanode 79 | 80 | 81 | 82 | dfs.nameservices 83 | mycluster 84 | 85 | 86 | 87 | dfs.ha.namenodes.mycluster 88 | nn1,nn2 89 | 90 | 91 | 92 | dfs.namenode.rpc-address.mycluster.nn1 93 | ha-nn1.hacluster1.com:9000 94 | 95 | 96 | 97 | dfs.namenode.rpc-address.mycluster.nn2 98 | ha-nn2.hacluster1.com:9000 99 | 100 | 101 | 102 | dfs.client.failover.proxy.provider.mycluster 103 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /HBase/Optimizations/HBase_yscb.txt: -------------------------------------------------------------------------------- 1 | Steps: 2 | 3 | 1. tar -xzvf ycsb-0.13.0-SNAPSHOT.tar.gz 4 | 2. cd ycsb-0.13.0-SNAPSHOT 5 | 3. cp /usr/lib/hbase/lib/slf4j-api-1.6.1.jar . 6 | 4. cp /usr/lib/hbase/lib/zookeeper.jar . 7 | 8 | hbase> create 'usertable', {NAME => 'f1', VERSIONS => '1', COMPRESSION => 'SNAPPY'} 9 | 10 | 5. 
cd ycsb-0.13.0-SNAPSHOT/bin 11 | 12 | $ ycsb load hbase12 -P workloads/workloada -p columnfamily=f1 -p recordcount=1000000 -p threadcount=4 -s | tee -a write.txt 13 | $ ycsb load hbase12 -P workloads/workloadb -p columnfamily=f1 -p recordcount=100000 -p operationcount=10000 -p threadcount=4 -s | tee -a workloadread.dat -------------------------------------------------------------------------------- /HBase/Optimizations/Hbase_create_table.txt: -------------------------------------------------------------------------------- 1 | 2 | hbase(main):001:0> create 'emp', 'personal data', 'professional data', {SPLITS => (1..n_splits).map {|i| "user#{1000+i*(9999-1000)/n_splits}"}} 3 | 4 | create 'emp1', 'personal data', 'professional data', {REPLICATION_SCOPE => 1} 5 | 6 | 7 | hbase(main):001:0> n_splits = 200 # HBase recommends (10 * number of regionservers) 8 | hbase(main):002:0> create 'usertable', 'family', {SPLITS => (1..n_splits).map {|i| "user#{1000+i*(9999-1000)/n_splits}"}} 9 | 10 | 11 | scan 'hbase:meta',{FILTER=>"PrefixFilter('emp1')"} 12 | 13 | 14 | Snapshots: 15 | 16 | 17 | hbase snapshot create -n snapshotName -t tableName 18 | 19 | hbase shell 20 | >> delete_snapshot 'snapshotName' 21 | >> restore_snapshot snapshotName 22 | >> list_snapshots 23 | >> clone_snapshot 'snapshotName', 'newTableName' 24 | 25 | hbase snapshot info -snapshot snapshotName 26 | 27 | 28 | -------------------------------------------------------------------------------- /HBase/Optimizations/Hbase_happybase.txt: -------------------------------------------------------------------------------- 1 | table = connection.table('table-name') 2 | 3 | table.put(b'row-key', {b'family:qual1': b'value1', 4 | b'family:qual2': b'value2'}) 5 | 6 | row = table.row(b'row-key') 7 | print(row[b'family:qual1']) # prints 'value1' 8 | 9 | for key, data in table.rows([b'row-key-1', b'row-key-2']): 10 | print(key, data) # prints row key and data for each row 11 | 12 | for key, data in table.scan(row_prefix=b'row'): 13 | print(key, data) # prints 'value1' and 'value2' 14 | 15 | row = table.delete(b'row-key') 16 | 17 | 18 | families = { 19 | 'cf1': dict(max_versions=10), 20 | 'cf2': dict(max_versions=1, block_cache_enabled=False), 21 | 'cf3': dict(), # use defaults 22 | } 23 | 24 | connection.create_table('mytable', families) -------------------------------------------------------------------------------- /HBase/Optimizations/Hbase_rand_gen.txt: -------------------------------------------------------------------------------- 1 | hbase(main):005:0> put 'emp','1','personal data:name','raju’ 2 | hbase(main):006:0> put 'emp','1','personal data:city','hyderabad' 3 | hbase(main):007:0> put 'emp','1','professional data:designation','manager' 4 | hbase(main):007:0> put 'emp','1','professional data:salary','50000’ 5 | 6 | locate_region 'test', '1' 7 | get_splits 'test' 8 | 9 | create 'emp', 'personal data', 'professional data' 10 | #!/bin/bash 11 | 12 | for i in `seq 1 1000000` 13 | do 14 | 15 | echo "put 'emp', '$i', 'personal data:name', 'raju$i'" 16 | echo "put 'emp', '$i', 'personal data:city', 'hyderabad$i'" 17 | echo "put 'emp', '$i', 'professional data:designation', 'manager$i'" 18 | echo "put 'emp', '$i', 'professional data:salary', '20000$i'" 19 | 20 | done 21 | 22 | # Optimized versions 23 | ==================== 24 | 25 | #!/bin/bash 26 | 27 | MIN=0 28 | MAX=1234567890 29 | while 30 | for i in `seq 1 1000000` 31 | do 32 | rand=$(cat /dev/urandom | tr -dc 0-9 | fold -w${#MAX} | head -1 | sed 's/^0*//;') 33 | [ -z $rnd ] && rnd=0 34 | (( $rnd 
< $MIN || $rnd > $MAX )) 35 | 36 | echo "put 'emp', '$rand', 'personal data:name', 'raju$i'" 37 | echo "put 'emp', '$rand', 'personal data:city', 'hyderabad$i'" 38 | echo "put 'emp', '$rand', 'professional data:designation', 'manager$i'" 39 | echo "put 'emp', '$rand', 'professional data:salary', '20000$i'" 40 | done 41 | do : 42 | done 43 | 44 | ============= 45 | 46 | #!/bin/bash 47 | 48 | # create 'emp', 'personal data', 'professional data' 49 | 50 | MIN=0 51 | MAX=1234567890 52 | while 53 | 54 | exp=`shuf -i 2000-65000 -n 1` 55 | #for i in `seq 1 10000000` 56 | #do 57 | rand=$(cat /dev/urandom | tr -dc 0-9 | fold -w${#MAX} | head -1 | sed 's/^0*//;') 58 | [ -z $rand ] && rand=0 59 | (( $rand < $MIN || $rand > $MAX )) 60 | 61 | echo "put 'emp', '$rand', 'personal data:name', 'raju$exp'" 62 | echo "put 'emp', '$rand', 'personal data:city', 'hyderabad$exp'" 63 | echo "put 'emp', '$rand', 'professional data:designation', 'manager$exp'" 64 | echo "put 'emp', '$rand', 'professional data:salary', '20$exp'" 65 | #done 66 | do : 67 | done 68 | -------------------------------------------------------------------------------- /HBase/Optimizations/Netxillon_HBase.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/HBase/Optimizations/Netxillon_HBase.pdf -------------------------------------------------------------------------------- /HBase/README.md: -------------------------------------------------------------------------------- 1 | export HADOOP_ROOT_LOGGER=TRACE,console; export HADOOP_JAAS_DEBUG=true; export HADOOP_OPTS="-Dsun.security.krb5.debug=true" 2 | 3 | export HBASE_ROOT_LOGGER=hbase.root.logger=DEBUG,console 4 | -------------------------------------------------------------------------------- /HBase/backup-masters: -------------------------------------------------------------------------------- 1 | dn2.cluster1.com 2 | -------------------------------------------------------------------------------- /HBase/commands.txt: -------------------------------------------------------------------------------- 1 | start-hbase.sh 2 | stop-hbase.sh 3 | 4 | hbase shell; 5 | 6 | 7 | create 't1', {NAME => 'f1', VERSIONS => 5} 8 | 9 | describe 't1' 10 | 11 | 12 | create 'class', 'cf' 13 | 14 | put 'class', 'row1', 'cf:a', 'value1' 15 | put 'class', 'row2', 'cf:b', 'value2' 16 | put 'class', 'row3', 'cf:c', 'value3' 17 | scan 'test1' 18 | 19 | put 'test', 'row1', 'cf:a', 'value1' 20 | put 'test', 'row3', 'cf:c', 'value3' 21 | -------------------------------------------------------------------------------- /HBase/hbase-site.txt: -------------------------------------------------------------------------------- 1 | 2 | hbase.master 3 | client.cluster1.com:60000 4 | 5 | 6 | 7 | hbase.rootdir 8 | hdfs://nn1.cluster1.com:9000/hbase 9 | 10 | 11 | 12 | hbase.cluster.distributed 13 | true 14 | 15 | 16 | 17 | hbase.zookeeper.quorum 18 | dn1.cluster1.com,dn2.cluster1.com 19 | 20 | 21 | 22 | 23 | hbase.zookeeper.property.clientPort 24 | 2181 25 | 26 | 27 | Standalone Only 28 | =============== 29 | 30 | 31 | 32 | hbase.rootdir 33 | file:///home/hadoop/hdata 34 | 35 | 36 | 37 | hbase.zookeeper.property.dataDir 38 | /home/hadoop/zookeeper 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /HBase/hfile: -------------------------------------------------------------------------------- 1 | [hdfs@edge1 conf]$ hbase org.apache.hadoop.hbase.io.hfile.HFile -v 
-f /hbase/data/default/class/9c4785ae4c37c4679773326224d7f5fe/cf/e90064527dda44d1b09f12b36edb4dd7 2 | SLF4J: Class path contains multiple SLF4J bindings. 3 | SLF4J: Found binding in [jar:file:/opt/cluster/hbase-1.0.1.1/lib/slf4j-log4j12-1.7.7.jar!/org/slf4j/impl/StaticLoggerBinder.class] 4 | SLF4J: Found binding in [jar:file:/opt/cluster/hadoop-2.6.0/share/hadoop/common/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class] 5 | SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation. 6 | SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory] 7 | Scanning -> /hbase/data/default/class/9c4785ae4c37c4679773326224d7f5fe/cf/e90064527dda44d1b09f12b36edb4dd7 8 | 2016-01-26 00:06:50,390 INFO [main] hfile.CacheConfig: CacheConfig:disabled 9 | Scanned kv count -> 5 10 | [hdfs@edge1 conf]$ hbase org.apache.hadoop.hbase.io.hfile.HFile -v -f hdfs://nn1.dilithium.com:9000/hbase/data/default/class/9c4785ae4c37c4679773326224d7f5fe/cf/e90064527dda44d1b09f12b36edb4dd7 11 | SLF4J: Class path contains multiple SLF4J bindings. 12 | SLF4J: Found binding in [jar:file:/opt/cluster/hbase-1.0.1.1/lib/slf4j-log4j12-1.7.7.jar!/org/slf4j/impl/StaticLoggerBinder.class] 13 | SLF4J: Found binding in [jar:file:/opt/cluster/hadoop-2.6.0/share/hadoop/common/lib/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class] 14 | SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation. 15 | SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory] 16 | Scanning -> hdfs://nn1.dilithium.com:9000/hbase/data/default/class/9c4785ae4c37c4679773326224d7f5fe/cf/e90064527dda44d1b09f12b36edb4dd7 17 | 2016-01-26 00:07:16,371 INFO [main] hfile.CacheConfig: CacheConfig:disabled 18 | Scanned kv count -> 5 19 | -------------------------------------------------------------------------------- /HBase/hive-mysql.txt: -------------------------------------------------------------------------------- 1 | mysql> CREATE DATABASE metastore_db; 2 | Query OK, 1 row affected (0.00 sec) 3 | 4 | mysql> CREATE USER 'hadoop'@'%' IDENTIFIED BY 'hivepassword'; 5 | Query OK, 0 rows affected (0.00 sec) 6 | 7 | mysql> GRANT all on *.* to 'hadoop'@client.cluster1.com identified by 'hivepassword'; 8 | Query OK, 0 rows affected (0.00 sec) 9 | 10 | mysql> flush privileges; 11 | Query OK, 0 rows affected (0.00 sec) 12 | 13 | ==================== 14 | 15 | 16 | 17 | 18 | 19 | hive.metastore.local 20 | true 21 | 22 | 23 | 24 | 25 | 26 | 27 | javax.jdo.option.ConnectionURL 28 | 29 | jdbc:mysql://client.cluster1.com:3306/metastore_db?createDatabaseIfNotExist=true 30 | 31 | metadata is stored in a MySQL server 32 | 33 | 34 | 35 | 36 | 37 | javax.jdo.option.ConnectionDriverName 38 | 39 | com.mysql.jdbc.Driver 40 | 41 | MySQL JDBC driver class 42 | 43 | 44 | 45 | 46 | 47 | javax.jdo.option.ConnectionUserName 48 | 49 | hadoop 50 | 51 | user name for connecting to mysql server 52 | 53 | 54 | 55 | 56 | 57 | javax.jdo.option.ConnectionPassword 58 | 59 | hivepassword 60 | 61 | password for connecting to mysql server 62 | 63 | 64 | 65 | 66 | =================== 67 | Start hive Server 68 | =================== 69 | 70 | hive --service hiveserver2& 71 | 72 | Start a Separate Metastore Service 73 | ----------------------------------- 74 | 75 | 76 | hive.metastore.uris 77 | thrift://:9083 78 | IP address (or fully-qualified domain name) and port of the metastore host 79 | 80 | 81 | 82 | hive.metastore.schema.verification 83 | true 84 | 85 | 86 | hive --service metastore& 87 | 88 | mysql> use 
metastore_db; 89 | Reading table information for completion of table and column names 90 | You can turn off this feature to get a quicker startup with -A 91 | 92 | Database changed 93 | mysql> show tables; 94 | +---------------------------+ 95 | | Tables_in_metastore_db | 96 | +---------------------------+ 97 | | BUCKETING_COLS | 98 | | CDS | 99 | | COLUMNS_V2 | 100 | | DATABASE_PARAMS | 101 | | DBS | 102 | | PARTITION_KEYS | 103 | | SDS | 104 | | SD_PARAMS | 105 | | SEQUENCE_TABLE | 106 | | SERDES | 107 | | SERDE_PARAMS | 108 | | SKEWED_COL_NAMES | 109 | | SKEWED_COL_VALUE_LOC_MAP | 110 | | SKEWED_STRING_LIST | 111 | | SKEWED_STRING_LIST_VALUES | 112 | | SKEWED_VALUES | 113 | | SORT_COLS | 114 | | TABLE_PARAMS | 115 | | TBLS | 116 | | VERSION | 117 | +---------------------------+ 118 | 20 rows in set (0.00 sec) 119 | 120 | mysql> show databases; 121 | +--------------------+ 122 | | Database | 123 | +--------------------+ 124 | | information_schema | 125 | | employee | 126 | | metastore_db | 127 | | mysql | 128 | | test | 129 | +--------------------+ 130 | 5 rows in set (0.00 sec) 131 | 132 | mysql> select * from TBLS; 133 | +--------+-------------+-------+------------------+--------+-----------+-------+-------------+---------------+--------------------+--------------------+ 134 | | TBL_ID | CREATE_TIME | DB_ID | LAST_ACCESS_TIME | OWNER | RETENTION | SD_ID | TBL_NAME | TBL_TYPE | VIEW_EXPANDED_TEXT | VIEW_ORIGINAL_TEXT | 135 | +--------+-------------+-------+------------------+--------+-----------+-------+-------------+---------------+--------------------+--------------------+ 136 | | 1 | 1403283170 | 1 | 0 | hadoop | 0 | 1 | hivetesting | MANAGED_TABLE | NULL | NULL | 137 | +--------+-------------+-------+------------------+--------+-----------+-------+-------------+---------------+--------------------+--------------------+ 138 | 1 row in set (0.00 sec) 139 | -------------------------------------------------------------------------------- /HBase/hive.txt: -------------------------------------------------------------------------------- 1 | export JAVA_HOME=/usr/java/jdk1.7.0_25/ 2 | export HIVE_HOME=/home/hadoop/hive/ 3 | export HBASE_HOME=/home/hadoop/hbase/ 4 | 5 | PATH=$PATH:$HOME/bin 6 | PATH=$PATH:/home/hadoop/hadoop/bin:$JAVA_HOME/bin:$HIVE_HOME/bin:/$HBASE_HOME/bin 7 | 8 | export PIG_HOME=/home/hadoop/pig 9 | export PIG_INSTALL=/home/hadoop/pig 10 | 11 | export HIVE_HOME=/home/hadoop/hive 12 | export HBASE_HOME=/home/hadoop/hbase 13 | 14 | ============= 15 | 16 | 17 | $ hadoop fs -mkdir /tmp 18 | $ hadoop fs -mkdir /user/hive/warehouse 19 | $ hadoop fs -chmod g+w /tmp 20 | $ hadoop fs -chmod g+w /user/hive/warehouse 21 | 22 | you must create /tmp and /user/hive/warehouse (aka hive.metastore.warehouse.dir) and set aprpopriate permissions in HDFS 23 | 24 | hive> SET mapred.job.tracker=myhost.mycompany.com:50030; 25 | 26 | 27 | CREATE DATABASE test_hive_db; 28 | 29 | 30 | Creating Hive Tables 31 | ================== 32 | hive> CREATE TABLE pokes (foo INT, bar STRING); 33 | 34 | LOAD DATA LOCAL INPATH './examples/files/kv1.txt' OVERWRITE INTO TABLE pokes; 35 | 36 | 37 | creates a table called pokes with two columns, the first being an integer and the other a string. 
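
A quick sanity check after the load (illustrative queries only):

hive> SELECT * FROM pokes LIMIT 5;
hive> SELECT count(*) FROM pokes;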
38 | 39 | ================= 40 | 41 | 42 | hive> CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING); 43 | 44 | 45 | hive> LOAD DATA LOCAL INPATH './hive/examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15'); 46 | hive> LOAD DATA LOCAL INPATH './examples/files/kv3.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-08'); 47 | 48 | 49 | Loading from hdfs 50 | 51 | hive> LOAD DATA INPATH '/user/myname/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15'); 52 | 53 | 54 | Browsing through Tables 55 | 56 | hive> SHOW TABLES; 57 | 58 | lists all the tables. 59 | 60 | hive> SHOW TABLES '.*s'; 61 | 62 | hive> DESCRIBE invites; 63 | 64 | shows the list of columns. 65 | Altering and Dropping Tables 66 | 67 | Table names can be changed and columns can be added or replaced: 68 | 69 | hive> ALTER TABLE events RENAME TO 3koobecaf; 70 | hive> ALTER TABLE pokes ADD COLUMNS (new_col INT); 71 | hive> ALTER TABLE invites ADD COLUMNS (new_col2 INT COMMENT 'a comment'); 72 | hive> ALTER TABLE invites REPLACE COLUMNS (foo INT, bar STRING, baz INT COMMENT 'baz replaces new_col2'); 73 | 74 | Note that REPLACE COLUMNS replaces all existing columns and only changes the table's schema, not the data. The table must use a native SerDe. REPLACE COLUMNS can also be used to drop columns from the table's schema: 75 | 76 | hive> ALTER TABLE invites REPLACE COLUMNS (foo INT COMMENT 'only keep the first column'); 77 | 78 | Dropping tables: 79 | 80 | hive> DROP TABLE pokes; 81 | 82 | 83 | hive> LOAD DATA LOCAL INPATH './examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15'); 84 | hive> LOAD DATA LOCAL INPATH './examples/files/kv3.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-08'); 85 | ============== 86 | 87 | CREATE TABLE tags (userId INT,movieId INT,tag STRING,time timestamp) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','; 88 | 89 | CREATE TABLE test (userId INT,movieId INT,tag STRING,time timestamp) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'; 90 | 91 | CREATE external TABLE test1 (name STRING,Id INT,roll INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' stored as textfile location '/user/hadoop/dump'; 92 | 93 | Hive VERSION Table 94 | 95 | mysql> CREATE TABLE VERSION ( VER_ID bigint(20) NOT NULL, SCHEMA_VERSION varchar(127) NOT NULL, VERSION_COMMENT varchar(255), PRIMARY KEY (VER_ID)); 96 | Query OK, 0 rows affected (0.00 sec) 97 | 98 | mysql> insert into VERSION (VER_ID,SCHEMA_VERSION,VERSION_COMMENT) values (1,"0.14.0","Hive release version 0.14.0"); 99 | Query OK, 1 row affected (0.00 sec) 100 | 101 | /usr/lib/hive/bin/schematool -dbType mysql -initSchema 102 | 103 | 104 | Performance tune Hive after checking stats on a table 105 | 106 | analyze table store compute statistics noscan; 107 | analyze table store compute statistics; 108 | analyze table store compute statistics for columns; 109 | ANALYZE TABLE Table1 CACHE METADATA; 110 | ANALYZE TABLE Table1 PARTITION(ds='2008-04-09', hr) COMPUTE STATISTICS NOSCAN; 111 | -------------------------------------------------------------------------------- /HBase/regions.txt: -------------------------------------------------------------------------------- 1 | node1 2 | node2 -------------------------------------------------------------------------------- /HBase/regionservers: -------------------------------------------------------------------------------- 1 | dn1.cluster1.com 2 | dn2.cluster1.com 3 | dn3.cluster1.com 4 | 
-------------------------------------------------------------------------------- /HBase/replication: -------------------------------------------------------------------------------- 1 | hbase(main):003:0> add_peer '1', CLUSTER_KEY => 'd1.aus.cloudera.site,d2.aus.cloudera.site,d3.aus.cloudera.site:2181:/hbase' 2 | hbase(main):003:0> disable_peer("1") 3 | hbase(main):003:0> enable_table_replication 'emp' 4 | hbase(main):003:0> enable_table_replication 'emp1' 5 | 6 | hbase$ hbase snapshot create -n emp1_4aug -t emp1 7 | 8 | hbase$ hbase org.apache.hadoop.hbase.snapshot.ExportSnapshot -snapshot emp1_4aug -copy-to hdfs://d2.aus.cloudera.site:8020/hbase -mappers 2 9 | 10 | hbase(main):003:0> enable_peer("1") 11 | 12 | 13 | # The above steps are to be used when the soruce cluster already has data/tables. 14 | # In a new cluster with no data, we do not need to export snapshot and disable_peer (To build backlog for WALs) 15 | 16 | 17 | 18 | hbase(main):003:0> list_peers 19 | PEER_ID CLUSTER_KEY ENDPOINT_CLASSNAME STATE REPLICATE_ALL NAMESPACES TABLE_CFS BANDWIDTH SERIAL 20 | 1 d1.aus.cloudera.site,d2.aus.cloudera.site,d3.aus.cloudera.site:2181:/hbase ENABLED true 0 false 21 | 1 row(s) 22 | Took 0.0125 seconds 23 | => # 24 | 25 | hbase(main):004:0> list_peer_configs 26 | PeerId 1 27 | Cluster Key d1.aus.cloudera.site,d2.aus.cloudera.site,d3.aus.cloudera.site:2181:/hbase 28 | 29 | Took 0.0090 seconds 30 | => {"1"=>#} 31 | -------------------------------------------------------------------------------- /HBase/tez-setup: -------------------------------------------------------------------------------- 1 | Tez Configuration 2 | 3 | 1. Download Tez tar ball: 4 | 5 | $ su - hadoop 6 | $ wget www-us.apache.org/dist/tez/0.8.4/apache-tez-0.8.4-bin.tar.gz 7 | 8 | untar it in any directory and set path to it. Should be readable by the user running hive. 9 | 10 | $ tar xzvf apache-tez-0.8.4-bin.tar.gz 11 | $ ln -s apache-tez-0.8.4-bin tez 12 | 13 | Copy the tez tarball to a path on HDFS. 14 | 15 | $ hadoop fs -mkdir -p /apps/tez 16 | $ hadoop fs -put tez/share/tez.tar.gz /apps/tez 17 | $ hadoop fs -put hive/lib/hive-exec-1.2.2.jar /apps/tez 18 | 19 | $ vi tez/conf/tez-site.xml 20 | 21 | 22 | 23 | 24 | tez.lib.uris 25 | /apps/tez/tez.tar.gz "This path is the HDFS path, can be speficied using the hdfs://path syntax as well" 26 | 27 | 28 | 29 | tez.am.resource.memory.mb 30 | 2048 31 | 32 | 33 | 34 | 35 | Set ENV 36 | 37 | vi /etc/profile.d/hadoopenv.sh or .bash_profile 38 | 39 | export TEZ_CONF_DIR=/home/hadoop/tez/conf 40 | export TEZ_JARS=/home/hadoop/tez/ 41 | 42 | export HADOOP_CLASSPATH=${TEZ_CONF_DIR}:${TEZ_JARS}/*:${TEZ_JARS}/lib/*:$HADOOP_CLASSPATH 43 | 44 | Set the execution mode in the hive configuration: 45 | 46 | $ vi hive/conf/hive-site.xml 47 | 48 | 49 | 50 | hive.execution.engine 51 | tez 52 | 53 | 54 | 55 | This can be done at the hive/beeline prompt as well: 56 | 57 | hive> set hive.execution.engine=tez; 58 | 59 | 60 | Test by running any example: 61 | 62 | hive> select count(*) from pokes; 63 | Query ID = hadoop_20180414105904_37f4b946-30cc-447a-8878-be956d0b222e 64 | Total jobs = 1 65 | Launching Job 1 out of 1 66 | 67 | 68 | Status: Running (Executing on YARN cluster with App id application_1523714759756_0007) 69 | 70 | -------------------------------------------------------------------------------- 71 | VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED 72 | -------------------------------------------------------------------------------- 73 | Map 1 .......... 
SUCCEEDED 1 1 0 0 0 0 74 | Reducer 2 ...... SUCCEEDED 1 1 0 0 0 0 75 | -------------------------------------------------------------------------------- 76 | VERTICES: 02/02 [==========================>>] 100% ELAPSED TIME: 6.88 s 77 | -------------------------------------------------------------------------------- 78 | OK 79 | 500 80 | Time taken: 9.721 seconds, Fetched: 1 row(s) 81 | 82 | 83 | Important thing to keep in mind that the env must be set on the nodes which are edge nodes, i.e hive client nodes. 84 | Hive server and metaserver etc will be as talked previously. 85 | -------------------------------------------------------------------------------- /HBase/untitled.txt: -------------------------------------------------------------------------------- 1 | parted /dev/sdb --script -- mklabel msdos 2 | parted /dev/sdb --script -- mkpart primary 0 -1 3 | mkfs.ext3 /dev/sdb1 4 | mkdir -p /space/disk1 5 | mount /dev/sdb1 /space/disk1 6 | 7 | useradd hadoop; echo hadoop | passwd --stdin hadoop 8 | chown -R hadoop:hadoop /space 9 | 10 | 11 | yum install jdk -y -------------------------------------------------------------------------------- /Hive_performance: -------------------------------------------------------------------------------- 1 | -XX:-UseGCOverheadLimit 2 | 3 | SET mapred.child.java.opts="-server1g -XX:+UseConcMarkSweepGC -XX:-UseGCOverheadLimit"; 4 | 5 | To enable the optimization 6 | 7 | set hive.auto.convert.join = true 8 | set hive.optimize.skewjoin = true 9 | 10 | 11 | When you are working with a large number of small files, Hive uses CombineHiveInputFormat by default. 12 | In terms of MapReduce, it ultimately translates to using CombineFileInputFormat that creates virtual splits over multiple files, 13 | grouped by common node, rack when possible. The size of the combined split is determined by 14 | 15 | mapred.max.split.size 16 | or 17 | mapreduce.input.fileinputformat.split.maxsize ( in yarn/MR2); 18 | 19 | So if you want to have less splits(less mapper) you need to set this parameter higher. 
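
For example (illustrative value only), to produce fewer, larger splits with the
default CombineHiveInputFormat:

set mapreduce.input.fileinputformat.split.maxsize=256000000;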
20 |
21 | http://stackoverflow.com/questions/17852838/what-is-the-default-size-that-each-hadoop-mapper-will-read
22 |
23 | http://www.ericlin.me/how-to-control-the-number-of-mappers-required-for-a-hive-query
24 |
25 |
26 | set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
27 | set mapred.map.tasks = 20;
28 |
29 | Controlling split size:
30 |
31 | set mapreduce.input.fileinputformat.split.minsize=100000000;
32 | reference: https://hadoopjournal.wordpress.com/2015/06/13/set-mappers-in-pig-hive-and-mapreduce/
33 |
-------------------------------------------------------------------------------- /Jars/azure.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Jars/azure.tar.gz
-------------------------------------------------------------------------------- /Jars/hadoop-azure-3.1.1.3.1.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Jars/hadoop-azure-3.1.1.3.1.0.0-SNAPSHOT.jar
-------------------------------------------------------------------------------- /Jars/jce_policy-8.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Jars/jce_policy-8.zip
-------------------------------------------------------------------------------- /Kafka/commands: --------------------------------------------------------------------------------
1 | # Make sure to set up the Kafka environment variables, e.g.:
2 |
3 | export KAFKA_HOME=/home/hadoop/kafka
4 | PATH=$KAFKA_HOME/bin:$PATH
5 |
6 | Commands:
7 |
8 | kafka-server-start.sh kafka/config/server.properties
9 |
10 | Run as daemon:
11 |
12 | kafka-server-start.sh -daemon kafka/config/server.properties
13 |
14 | [hadoop@gw1 ~]$ jps
15 | 4581 Kafka
16 |
17 | # Stop:
18 |
19 | kafka-server-stop.sh
20 |
21 | # Useful commands:
22 |
23 | # Create Topic
24 | kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test
25 |
26 | # List topics
27 | kafka-topics.sh --list --zookeeper localhost:2181
28 | kafka-topics.sh --list --zookeeper n1.dilithium.com:2181
29 |
30 | echo "Hello, Kafka" | kafka-console-producer.sh --broker-list :9092 --topic MyTopic > /dev/null
31 | kafka-console-consumer.sh --zookeeper <> --topic MyTopic --from-beginning
32 |
33 | Examples:
34 |
35 | $ kafka-topics.sh --create --zookeeper n1.dilithium.com:2181 --replication-factor 1 --partitions 1 --topic test
36 | Created topic "test".
37 | $ kafka-topics.sh --list --zookeeper n1.dilithium.com:2181 38 | test 39 | 40 | echo "Hello, Kafka" | kafka-console-producer.sh --broker-list gw1.dilithium.com:9092 --topic test > /dev/null 41 | kafka-console-consumer.sh --bootstrap-server gw1.dilithium.com:9092 --topic test --from-beginning 42 | 43 | $ kafka-log-dirs.sh --describe --bootstrap-server gw1.dilithium.com:9092 44 | Querying brokers for log directories information 45 | Received log directory information from brokers 0 46 | {"version":1,"brokers":[{"broker":0,"logDirs":[{"logDir":"/data/kafka","error":null,"partitions":[]}]}]} 47 | 48 | 49 | Benchmarks 50 | 51 | These are just on my test lab(1 core VM, 2 GB RAM) 52 | 53 | $ kafka-producer-perf-test.sh --topic bench --num-records 1000000 --throughput 150000 --record-size 100 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=67108864 compression.type=none batch.size=8196 54 | 71189 records sent, 13945.0 records/sec (1.33 MB/sec), 1090.8 ms avg latency, 1612.0 max latency. 55 | 170124 records sent, 34018.0 records/sec (3.24 MB/sec), 2294.4 ms avg latency, 3198.0 max latency. 56 | 186553 records sent, 36882.8 records/sec (3.52 MB/sec), 4227.2 ms avg latency, 5537.0 max latency. 57 | 239463 records sent, 47892.6 records/sec (4.57 MB/sec), 7076.9 ms avg latency, 7590.0 max latency. 58 | 1000000 records sent, 39799.410969 records/sec (3.80 MB/sec), 5569.86 ms avg latency, 8151.00 ms max latency, 6986 ms 50th, 8012 ms 95th, 8107 ms 99th, 8143 ms 99.9th 59 | 60 | $ kafka-consumer-perf-test.sh --topic gsd --broker-list gw1.dilithium.com:9092 --messages 1000000 -threads 1 --num-fetch-threads 1 --print-metrics 61 | 62 | $ kafka-consumer-perf-test.sh --topic gsd --broker-list gw1.dilithium.com:9092 --messages 1000000 -threads 1 --num-fetch-threads 1 63 | start.time, end.time, data.consumed.in.MB, MB.sec, data.consumed.in.nMsg, nMsg.sec, rebalance.time.ms, fetch.time.ms, fetch.MB.sec, fetch.nMsg.sec 64 | 2019-02-11 19:23:15:397, 2019-02-11 19:23:20:199, 95.3787, 19.8623, 1000118, 208271.1370, 51, 4751, 20.0755, 210506.8407 65 | 66 | $ kafka-consumer-perf-test.sh --topic gsd --broker-list gw1.dilithium.com:9092 --messages 1000000 -threads 1 --num-fetch-threads 2 67 | start.time, end.time, data.consumed.in.MB, MB.sec, data.consumed.in.nMsg, nMsg.sec, rebalance.time.ms, fetch.time.ms, fetch.MB.sec, fetch.nMsg.sec 68 | 2019-02-11 19:23:49:632, 2019-02-11 19:23:54:701, 95.3787, 18.8161, 1000118, 197300.8483, 135, 4934, 19.3309, 202699.2298 69 | . 
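While benchmarking, it also helps to confirm the topic layout and any active consumer groups first. A couple of commands for that (hostnames assume the same lab setup as above):

kafka-topics.sh --describe --zookeeper n1.dilithium.com:2181 --topic gsd
kafka-consumer-groups.sh --bootstrap-server gw1.dilithium.com:9092 --list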
70 | 71 | Benchmark with various throughtput, message size: 72 | 73 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 150000 --record-size 100 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 74 | 75 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 150000 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 76 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 15 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 77 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 1500 -record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 78 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 150 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 79 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 150000 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 80 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 15000000 --record-size 1 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 81 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput 15000000 --record-size 1000 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 82 | kafka-producer-perf-test.sh --topic gsd --num-records 1000000 --throughput -1 --record-size 1000 --producer-props acks=1 bootstrap.servers=gw1.dilithium.com:9092 buffer.memory=32108864 compression.type=none batch.size=8196 83 | -------------------------------------------------------------------------------- /Kafka/kafka-env.sh: -------------------------------------------------------------------------------- 1 | # Create this file as it is not part of the distro 2 | 3 | #!/bin/bash 4 | 5 | # Set KAFKA specific environment variables here. 6 | 7 | # The java implementation to use. 
8 | export JAVA_HOME=/usr/java/default 9 | export PATH=$PATH:$JAVA_HOME/bin 10 | #export PID_DIR={{kafka_pid_dir}} 11 | #export LOG_DIR={{kafka_log_dir}} 12 | #export JMX_PORT=9093 13 | 14 | export KAFKA_HEAP_OPTS="-Xmx1g -Xms1g" 15 | export KAFKA_JVM_PERFORMANCE_OPTS="-XX:MetaspaceSize=96m -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 -XX:G1HeapRegionSize=16M -XX:MinMetaspaceFreeRatio=50 -XX:MaxMetaspaceFreeRatio=80" 16 | -------------------------------------------------------------------------------- /Kafka/kafka_ganglia2.txt: -------------------------------------------------------------------------------- 1 | { 2 | "servers" : [ { 3 | "port" : "9999", <--- Defined Kafka JMX Port 4 | "host" : "192.168.1.18", <--- Kafka Server 5 | "queries" : [ { 6 | "outputWriters" : [ { 7 | "@class" : 8 | "com.googlecode.jmxtrans.model.output.KeyOutWriter", 9 | "settings" : { 10 | "outputFile" : "/tmp/bufferPool_direct_stats.txt", 11 | "v31" : false 12 | } 13 | } ], 14 | "obj" : "java.nio:type=BufferPool,name=direct", 15 | "resultAlias": "bufferPool.direct", 16 | "attr" : [ "Count", "MemoryUsed", "Name", "ObjectName", "TotalCapacity" ] 17 | }, { 18 | "outputWriters" : [ { 19 | "@class" : 20 | "com.googlecode.jmxtrans.model.output.KeyOutWriter", 21 | "settings" : { 22 | "outputFile" : "/tmp/bufferPool_mapped_stats.txt", 23 | "v31" : false 24 | } 25 | } ], 26 | "obj" : "java.nio:type=BufferPool,name=mapped", 27 | "resultAlias": "bufferPool.mapped", 28 | "attr" : [ "Count", "MemoryUsed", "Name", "ObjectName", "TotalCapacity" ] 29 | }, { 30 | "outputWriters" : [ { 31 | "@class" : 32 | "com.googlecode.jmxtrans.model.output.KeyOutWriter", 33 | "settings" : { 34 | "outputFile" : "/tmp/kafka_log4j_stats.txt", 35 | "v31" : false 36 | } 37 | } ], 38 | "obj" : "kafka:type=kafka.Log4jController", 39 | "resultAlias": "kafka.log4jController", 40 | "attr" : [ "Loggers" ] 41 | }, { 42 | "outputWriters" : [ { 43 | "@class" : 44 | "com.googlecode.jmxtrans.model.output.KeyOutWriter", 45 | "settings" : { 46 | "outputFile" : "/tmp/kafka_socketServer_stats.txt", 47 | "v31" : false 48 | } 49 | } ], 50 | "obj" : "kafka:type=kafka.SocketServerStats", 51 | "resultAlias": "kafka.socketServerStats", 52 | "attr" : [ "AvgFetchRequestMs", "AvgProduceRequestMs", "BytesReadPerSecond", "BytesWrittenPerSecond", "FetchRequestsPerSecond", "MaxFetchRequestMs", "MaxProduceRequestMs" , "NumFetchRequests" , "NumProduceRequests" , "ProduceRequestsPerSecond", "TotalBytesRead", "TotalBytesWritten", "TotalFetchRequestMs", "TotalProduceRequestMs" ] 53 | } ], 54 | "numQueryThreads" : 2 55 | } ] 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /Kafka/kakfa_rsyslog.txt: -------------------------------------------------------------------------------- 1 | rsyslog (base, includes imfile) 2 | rsyslog-kafka 3 | 4 | /etc/rsyslog.conf 5 | ----------------- 6 | 7 | 8 | $WorkDirectory /var/lib/rsyslog # where to place spool files 9 | 10 | $MainMsgQueueType LinkedList 11 | $MainMsgQueueFileName mainmsgq 12 | $MainMsgQueueSaveOnShutdown on 13 | $MainMsgQueueSize 15000 14 | $MainMsgQueueHighWatermark 10000 15 | $MainMsgQueueLowWatermark 1000 16 | $MainMsgQueueMaxDiskSpace 53687091 # 512KB, most containers have 17 | 18 | 19 | /etc/rsyslog.d/kafka.conf 20 | ------------------------- 21 | 22 | module(load="omkafka") # provides omkafka 23 | # Use rainerscript, as below#$ActionQueueSize 1500000 24 | #$ActionQueueType LinkedList 25 | #$ActionQueueFileName omkafkaq 26 | 
#$ActionResumeRetryCount -1 27 | #$ActionQueueSaveOnShutdown on 28 | #$ActionQueueHighWatermark 1000000 29 | #$ActionQueueLowWatermark 100000 30 | #$ActionQueueMaxDiskSpace 536870912 # 512MB, most containers have 31 | #$ActionQueueMaxDiskSpace 536870912 # 512MB, most containers have <8GB of space 32 | #$MainMsgQueueDiscardMark 400000 # Low < Discard < High < DiskSpace 33 | #$MainMsgQueueDiscardSeverity 4 # Discard anything lower than warning 34 | 35 | *.* action(type="omkafka" topic="rsyslog-prod" 36 | broker="kafka1.example.com,kafka2.example.com,kafka3.example.com" 37 | queue.filename="omkafkaq" queue.spoolDirectory="/var/lib/rsyslog" 38 | queue.size="300000" queue.maxdiskspace="536870912" 39 | queue.lowwatermark="20000" queue.highwatermark="200000" 40 | queue.discardmark="250000" queue.type="LinkedList" 41 | queue.discardseverity="4" 42 | queue.saveonshutdown="on" queue.dequeuebatchsize="4" 43 | partitions.auto="on" errorFile="/var/log/rsyslog.err" 44 | confParam=[ "compression.codec=snappy", 45 | "socket.timeout.ms=1000", 46 | "socket.keepalive.enable=true"] 47 | ) 48 | -------------------------------------------------------------------------------- /Kafka/server.properties: -------------------------------------------------------------------------------- 1 | # Only need to change the below for each broker. This is a very basic kafka config 2 | 3 | # The id of the broker. This must be set to a unique integer for each broker. 4 | broker.id=0 5 | 6 | # root directory for all kafka znodes. 7 | zookeeper.connect=n1.dilithium.com:2181,n2.dilithium.com:2181,sn.dilithium.com:2181 8 | -------------------------------------------------------------------------------- /Notes/Benchmarking.txt: -------------------------------------------------------------------------------- 1 | Test Hadoop 2 | ============ 3 | 4 | hadoop jar hadoop/hadoop-test-0.20.205.0.jar TestDFSIO -write -nrFiles 10 -fileSize 1000 5 | hadoop jar hadoop/hadoop-test-0.20.205.0.jar TestDFSIO -read -nrFiles 10 -fileSize 1000 6 | 7 | hadoop jar hadoop/hadoop-test-0.20.205.0.jar TestDFSIO -clean 8 | 9 | 10 | Generate Tera Data 11 | ================== 12 | 13 | hadoop jar hadoop/hadoop-examples-0.20.205.0.jar teragen 1000 /user/hduser/terasort-input 14 | 15 | hadoop jar hadoop/hadoop-examples-0.20.205.0.jar terasort /user/hduser/terasort-input /user/hduser/terasort-output 16 | 17 | hadoop job -history all /user/hduser/terasort-input 18 | 19 | 20 | 21 | hadoop jar hadoop/hadoop-examples-0.20.205.0.jar nnbench -operation create_write \ 22 | -maps 12 -reduces 6 -blockSize 1 -bytesToWrite 0 -numberOfFiles 1000 \ 23 | -replicationFactorPerFile 3 -readFileAfterOpen true \ 24 | -baseDir /benchmarks/NNBench-`hostname -s` 25 | 26 | 27 | hadoop jar hadoop/hadoop-examples-0.20.205.0.jar mrbench -numRuns 50 -------------------------------------------------------------------------------- /Notes/Hadoop_lab.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Notes/Hadoop_lab.doc -------------------------------------------------------------------------------- /Notes/Hadoop_upgrade.txt: -------------------------------------------------------------------------------- 1 | 2 | Hadoop Upgrade 3 | =============== 4 | 5 | 1. hadoop dfsadmin -upgradeProgress status 6 | 7 | 2. Stop all client applications running on the MapReduce cluster. 8 | 9 | 3. 
Perform a filesystem check 10 | hadoop fsck / -files -blocks -locations > dfs-v-old-fsck-1.log 11 | 12 | 4. Save a complete listing of the HDFS namespace to a local file 13 | hadoop dfs -lsr / > dfs-v-old-lsr-1.log 14 | 15 | 5. Create a list of DataNodes participating in the cluster: 16 | hadoop dfsadmin -report > dfs-v-old-report-1.log 17 | 18 | 6. Optionally backup HDFS data 19 | 20 | 7. Upgrade process: 21 | Point to the new directory, update environment variables. 22 | 23 | 8. hadoop-daemon.sh start namenode -upgrade 24 | 25 | 9. hadoop dfsadmin -upgradeProgress status 26 | 27 | 10. Now start the datanode, after pointing to the new hadoop directory 28 | 29 | 11. hadoop dfsadmin -safemode get 30 | 31 | 12. hadoop dfsadmin -finalizeUpgrade 32 | 33 | 34 | -------------------------------------------------------------------------------- /Notes/Performance.txt: -------------------------------------------------------------------------------- 1 | CPU-related parameters:mapred.tasktracker.map and reduce.tasks.maximum 2 | Decide the maximum number of map/reduce tasks that will be run simultaneously by a task tracker. These two parameters are the most relative ones to CPU utilization. The default value of both parameters is 2. Properly increasing their values according to your cluster condition increases the CPU utilization and therefore improves the performance. For example, assume each node of the cluster has 4 CPUs supporting simultaneous multi-threading, and each CPU has 2 cores; then the total number of daemons should be no more than 4x2x2=16. Considering DN and TT would take 2 slots, there are at most 14 slots for map/reduce tasks, so the best value is 7 for both parameters. 3 | 4 | Set this parameter in mapred-site.xml. 5 | 6 | Memory-related parameter:mapred.child.java.opts 7 | This is the main parameter for JVM tuning. The default value is -Xmx200m, which gives each child task thread 200 MB of memory at most. You can increase this value if the job is large, but should make sure it won't cause swap, which significantly reduces performance. 8 | 9 | Let's examine how this parameter can affect the total memory usage. Assume the maximum number of map/reduce tasks is set to 7, and mapred.child.java.opts is left to the default value. Then memory cost of running tasks will be 2x7x200 MB =2800 MB. If each worker node has both DN and TT daemons, and each daemon costs 1 GB memory by default, the total memory allocated would be around 4.8 GB. 10 | 11 | Set this parameter in mapred-site.xml. 12 | 13 | Disk I/O-related parameters:mapred.compress.map.output, mapred.output.compress, and mapred.map.output.compression.codec 14 | These are parameters that control whether to compress the output, in which mapred.compress.map.output is for map output compression, mapred.output.compress is for job output compression, and mapred.map.output.compression.codec is for compression code. All of these options are turned off by default. 15 | 16 | Turning on output compression can speed up disk (local/Hadoop Distributed File System (HDFS)) writes and reduce total time of data transfer (in both shuffle and HDFS writing phase), while on the other hand cost additional overhead during the compression/decompression process. 17 | 18 | According to personal experience, turning on compression is not effective for sequence filing with random keys/values. One suggestion is to turn on compression only when the data you're dealing with is large and organized (especially natural language data). 
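As an illustration, enabling map output compression with the property names discussed above might look like this (Snappy is just one possible codec and assumes the native library is installed):

<property>
  <name>mapred.compress.map.output</name>
  <value>true</value>
</property>
<property>
  <name>mapred.map.output.compression.codec</name>
  <value>org.apache.hadoop.io.compress.SnappyCodec</value>
</property>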
19 | 20 | Set these parameters in mapred-site.xml. 21 | 22 | io.sort.mb parameter: 23 | This parameter sets the buffer size for map-side sorting, in units of MB, 100 by default. The greater the value, the fewer spills to the disk, thus reducing I/O times on the map side. Notice that increasing this value increases memory required by each map task. 24 | 25 | According to experience, when the map output is large, and the map-side I/O is frequent, you should try increasing this value. 26 | 27 | Set this parameter in mapred-site.xml. 28 | 29 | io.sort.factor parameter 30 | This parameter sets the number of input streams (files) to be merged at once in both map and reduce tasks. The greater this value, the fewer spills to the disk, thus reducing I/O times on both the map and reduce sides. Notice that increasing this value might cost more garbage collection activities if memory allocated for each task is not large enough. 31 | 32 | According to experience, when there is a large number of spills to the disk, and I/O times of the sort and shuffle phase is high, you should try increasing this value. 33 | 34 | Set this parameter in mapred-site.xml. 35 | 36 | mapred.job.reduce.input.buffer.percent parameter 37 | This parameter sets the percentage of memory (relative to the maximum heap size) to retain map outputs during the reduce phase. When the shuffle is concluded, any remaining map outputs in memory must consume less than this threshold before the reduce phase can begin, 0 by default. The greater this value is, the less merge on the disk, thus reducing I/O times on the local disk during the reduce phase. Notice that increasing this value might cost more garbage collection activities if memory allocated for each task is not large enough. 38 | 39 | According to experience, when map output is large, and local disk I/O is frequent during the reduce through sort phases, you should try increasing this value. -------------------------------------------------------------------------------- /Notes/backup.txt: -------------------------------------------------------------------------------- 1 | Hadoop Backup and Recovery 2 | ========================== 3 | 4 | 5 | 6 | dfs.secondary.http.address 7 | 192.168.1.68:50090 8 | 9 | 10 | 11 | 1. Secondary namenode checkpointing 12 | 13 | If you want to explicitly specify the file to be used by the namenode 14 | 15 | hadoop-daemons.sh --hosts masters start secondarynamenode 16 | 17 | hdfs secondarynamenode -checkpoint force 18 | 19 | 2. hadoop namenode -importCheckpoint 20 | 21 | 22 | fs.checkpoint.dir 23 | /data/new 24 | 25 | 26 | 3. Save NameSpace 27 | 28 | hadoop dfsadmin -safemode enter 29 | 30 | hadoop dfsadmin -saveNamespace 31 | 32 | Remember it updates under the Namespace directory. 33 | 34 | 4. Metadata Save 35 | 36 | hdfs dfsadmin -metasave filename.txt 37 | 38 | 5. Can do a detailed view of the namespace (above 0.21) 39 | 40 | hdfs oiv -i /data/namenode/current/fsimage -o fsimage.txt 41 | 42 | 43 | 44 | dfs.secondary.http.address 45 | 192.168.1.68:50090 46 | -------------------------------------------------------------------------------- /Notes/cassandra2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Notes/cassandra2.pdf -------------------------------------------------------------------------------- /Notes/class3_questions: -------------------------------------------------------------------------------- 1 | 1. 
when name node started what will be "keyword" to identify from log that at what timestamp namenode started and which log should we see? 2 | 2. Does Secondary Name node starts automatically? 3 | 3. "copyFromLocal" is locally load the data from the same node.. if yes, than if we want to load data from NAS or SAN or any client is there any specific command .. 4 | 4. Is there any command we can use to override if same file is present in HDFS already? or we always have to remove existing first? 5 | 5. If we change path hdfs then we need to reformat? 6 | 6. how different is hadoop fsck from linux fsck 7 | 7. Should we run "hadoop dfsadmin" & "hadoop fsck" (Admin) commands only from name node and "hadoop fs" files related command from any of the name or datanode in cluster? 8 | 8. how to start data node on selected machines instead of all as this earlier command is doing? 9 | 9. what is .meta and .curr files are created and what is difference. what if .meta file is deleted will data also be lost 10 | 10. in what cicumstances it will choose another node in case dn1 near to full capacity 11 | 11. only shows 1 live node, although running it several time, it's changing between the live datanodes, but only shows 1 live at a time. Is it normal? What could be the problem 12 | 12. is it possible to force the data to go into a particular data node? 13 | 13. so at some edits will too huge , does it rotate also? 14 | 14. can we have both NN and SNN on same node? or is it best practice to separate them 15 | 15. how we know which rack machine belongs to? 16 | 16. editing nodes in include and exclude files does not require a reboot? 17 | 17. In what kind of cases we might need to exclude a particular data node? Why would we create a DN and exclude it from the cluster? 18 | 18. Let say I have servers from 2 datacenters one in new ATLANTA and one in NEW YORK. Let say ATLANTA datacenter is down because of FLOOD. How can we recover our cluster from such disaster recovery. In that case can we configure our data file to consider servers of ATLANTA datacenter as ONE RACK and NEWYORK servers as another rack? 19 | 20 | -------------------------------------------------------------------------------- /Notes/class4_questions: -------------------------------------------------------------------------------- 1 | https://www.packtpub.com/books/content/sizing-and-configuring-your-hadoop-cluster 2 | why do we need replication of replication? why do we need to have replication of data on the same server again 3 | what was that sdb1 & sdc1, is that new partition for dn1 & dn2 4 | is this similar to RAID 1(mirroring)? 5 | do we need to setup this only if we do not have raid 1 already? 6 | Isnt its a good practice to create these directories on NFS server? other than local? 7 | after chaning any configuration setting , do we need to run --format everytime to reflect the change? 8 | will thses two disks need to be kept in different racks to avaoid data loss 9 | If the disk io speed is different significantly between the local and NFS one, will this cause adverse effect for namenode? 10 | it will be overhead because instead of rsyncing 1 directory it has to copy 2 dirs to DR right? 11 | so which meta data name node is refering to if both mount points are up? 12 | what is the read policy? will it be only from the first disk specified? 13 | does two disk drives follow propotional fill algoritm? 14 | So in that case we dont have control how end clients are sending jobs? 
they can send any size 15 | So when name node is down and we have to make secondary as primary. I am confused why we need to change the hostname of secondary to primary? Instead of that if we would have configured namenode location by GSLB instead of direct hostname. Then we don’t need to change hostname 16 | you are setting Quota from command line. So it will be flushed once we restart the namenode? 17 | -------------------------------------------------------------------------------- /Notes/cloudera.txt: -------------------------------------------------------------------------------- 1 | Cloudera Manager 2 | ================ 3 | 4 | ./cloudera-manager-installer.bin --skip_repo_package=1 5 | 6 | 7 | 8 | 1. Cloudera Manager - GUI 9 | 2. Cloudera Packages CDH4 - Hadoop packages -------------------------------------------------------------------------------- /Notes/disk_partition: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | hdd="/dev/sdc /dev/sdd /dev/sde /dev/sdf" 3 | 4 | count=0 5 | 6 | for disk in $hdd; do 7 | #echo -e "n\np\n\n\n\nw\n" | fdisk $disk; 8 | 9 | fs="${disk}1" 10 | #mkfs.xfs $fs; 11 | 12 | twoDigitCount=$(printf "%02d" $count) 13 | mount="/data/$twoDigitCount" 14 | mkdir -p $mount; 15 | 16 | mount $fs $mount; 17 | 18 | count=$((count+1)) 19 | done 20 | -------------------------------------------------------------------------------- /Notes/hadoop_ports.txt: -------------------------------------------------------------------------------- 1 | Hadoop nodes communication ports 2 | 3 | No. name protocol port # configuration file parameter name description 4 | 1 ssh tcp *:22 /etc/ssh/sshd_config Port ssh server port for ssh communication 5 | 2 HDFS default port tcp localhost:9000 core-site.xml fs.default.name HDFS port for clients. 6 | 3 secondary name node administration tcp 0.0.0.0:50090 hdfs-site.xml dfs.secondary.http.address The secondary namenode http server address and port. If the port is 0 then the server will start on a free port. 7 | 4 data node communication tcp 0.0.0.0:50010 hdfs-site.xml dfs.datanode.address 8 | 5 data node administration tcp 0.0.0.0:50075 hdfs-site.xml dfs.datanode.http.address 9 | 6 data node IPC communication tcp 0.0.0.0:50020 hdfs-site.xml dfs.datanode.ipc.address 10 | 7 name node administration tcp 0.0.0.0:50070 hdfs-site.xml dfs.http.address 11 | 8 data node administration tcp 0.0.0.0:50475 hdfs-site.xml dfs.datanode.https.address 12 | 9 name node administration tcp 0.0.0.0:50470 hdfs-site.xml dfs.https.address 13 | 10 MapReduce job tracker tcp 0.0.0.0:9001 mapred-site.xml mapred.job.tracker The port of Job Tracker accepting for job request. 
14 | 11 job tracker administration tcp 0.0.0.0:50030 mapred-site.xml mapred.job.tracker.http.address 15 | 12 task tracker administration tcp 0.0.0.0:50060 mapred-site.xml mapred.task.tracker.http.address -------------------------------------------------------------------------------- /Notes/hadoop_ports_firewall.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Notes/hadoop_ports_firewall.xls -------------------------------------------------------------------------------- /Notes/installation.txt: -------------------------------------------------------------------------------- 1 | core-site.xml 2 | 3 | 4 | fs.default.name 5 | hdfs://nn1.cluster1.com:9000 6 | 7 | 8 | hdfs-site.xml 9 | 10 | 11 | dfs.name.dir 12 | /data/namenode 13 | true 14 | 15 | 16 | 17 | dfs.data.dir 18 | /space/disk1,/space/disk2 19 | true 20 | 21 | 22 | 23 | dfs.replication 24 | 1 25 | 26 | 27 | 28 | dfs.block.size 29 | 67108864 30 | 31 | 32 | 33 | dfs.hosts.exclude 34 | /home/hadoop/excludes 35 | true 36 | 37 | 38 | 39 | dfs.hosts 40 | /home/hadoop/include 41 | true 42 | 43 | 44 | mapred-site 45 | 46 | 47 | 48 | mapred.job.tracker 49 | jt.cluster1.com:9001 50 | 51 | 52 | 53 | 54 | export JAVA_HOME=/usr/java/jdk1.7.0_25/ 55 | export HADOOP_HOME=/home/hadoop/hadoop 56 | export HADOOP_PID_DIR=/home/hadoop/pids 57 | export HADOOP_HEAPSIZE=500 58 | 59 | export HADOOP_LOG_DIR=${HADOOP_HOME}/logs 60 | export HADOOP_HOME_WARN_SUPPRESS="TRUE" 61 | 62 | 63 | 64 | export JAVA_HOME=/usr/java/jdk1.7.0_25/ 65 | 66 | PATH=$JAVA_HOME/bin:$PATH:$HOME/bin 67 | PATH=$PATH:/home/hadoop/hadoop/bin 68 | 69 | export PATH 70 | 71 | ================ 72 | 73 | heartbeat.recheck.interval 74 | 15 75 | Determines datanode heartbeat interval in seconds 76 | 77 | 78 | If above doesn't work - try the following (seems to be version-dependent): 79 | 80 | 81 | dfs.heartbeat.recheck.interval 82 | 15 83 | Determines datanode heartbeat interval in seconds. 
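Note: these are the Hadoop 1.x property names; on Hadoop 2.x and later they still work but are deprecated. The usual equivalents are:

fs.default.name  -> fs.defaultFS
dfs.name.dir     -> dfs.namenode.name.dir
dfs.data.dir     -> dfs.datanode.data.dir
dfs.http.address -> dfs.namenode.http-address

mapred.job.tracker has no direct equivalent under YARN; the ResourceManager addresses in yarn-site.xml take its place.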
84 | 85 | -------------------------------------------------------------------------------- /Notes/pig.txt: -------------------------------------------------------------------------------- 1 | export PIG_HOME=/home/hadoop/pig/ 2 | 3 | 4 | A = load 'passwd' using PigStorage(':'); 5 | B = foreach A generate $0 as id; 6 | store B into 'id.out'; 7 | 8 | 9 | 10 | pig -x local id.pig 11 | 12 | pig -x mapreduce id.pig 13 | 14 | 15 | passwd = LOAD '/etc/passwd' USING PigStorage(':') AS (user:chararray, \ 16 | passwd:chararray, uid:int, gid:int, userinfo:chararray, home:chararray, \ 17 | shell:chararray); 18 | grunt> DUMP passwd; 19 | 20 | 21 | grunt> counts = FOREACH grp_shell GENERATE group, COUNT(passwd); 22 | grunt> DUMP counts; 23 | 24 | 25 | 26 | A = load 'test'; 27 | 28 | B = foreach A generate flatten(TOKENIZE((chararray)$0)) as word; 29 | 30 | C = group B by word; 31 | 32 | D = foreach C generate COUNT(B), group; 33 | 34 | store D into 'wordcount'; 35 | 36 | ================== 37 | 38 | A = load 'http_access_2011-07-07.log' using PigStorage('-') as (f0,f1,f2,f3,f4); 39 | B = foreach A generate f0; 40 | C = distinct B; 41 | dump C; 42 | 43 | A = load 'http_access_2011-07-07.log' using PigStorage('"') as (f0,f1,f2,f3,f4,f5); 44 | B = foreach A generate f5; 45 | C = distinct B; 46 | dump C; 47 | 48 | A = load 'http_access_2011-07-07.log' using PigStorage('"') as (f0,f1,f2,f3,f4); 49 | B = foreach A generate f1; 50 | C = distinct B; 51 | dump C; 52 | 53 | ============== 54 | yum install ant* 55 | 56 | For Hadoop-2.0 57 | 58 | ant clean jar-withouthadoop -Dhadoopversion=23 59 | 60 | or 61 | 62 | ant clean jar-all -Dhadoopversion=23 -------------------------------------------------------------------------------- /Notes/questions.txt: -------------------------------------------------------------------------------- 1 | 1) If we add new DataNodes to the cluster will HDFS move the blocks to the newly added nodes in order to balance disk space utilization between the nodes? 2 | 3 | a) yes, it will automatically do balancing 4 | b) no, we have to manually to re-balancing (correct) 5 | 6 | 2) The name-node will stay in safe mode till all under-replicated files are fully replicated? 7 | 8 | a)TRUE b) FALSE (correct) 9 | 10 | 3) How do I set up a hadoop data node to use multiple volumes? 11 | 12 | a) We cannot do that b) We can use comma seperated fields (correct) c) This can only be done with SAN storage 13 | 14 | 4) Can a Hadoop client renames a file or a directory containing a file while another client is still writing into it? 15 | 16 | a) yes, it can (correct) b) No, hadoop does locking 17 | 18 | 5) Will the command bin/hadoop dfs -ls /projects/* list all the files under /projects ? 19 | 20 | a) yes (correct, but better to safeguard it with single quotes) b) no 21 | 22 | 6) Can we have multiple files in HDFS use different block sizes? 23 | 24 | a) yes (correct) b) no 25 | 26 | 7) How do you gracefully stop a running job? 27 | 28 | a) hadoop job -kill jobid(correct) b) kill the task tracker c) it can not be done 29 | 30 | 8) What is the best java version to use for Hadoop? 31 | 32 | a) It does not matter b) Must be greater then java2.6 c) greater then 1.6 (correct) 33 | 34 | 9) What is the command for adding the hosts newly added to the mapred.include file? 35 | 36 | a) hadoop dfsadmin -refreshNodes b) hadoop rmadmin -refreshNodes (correct) 37 | 38 | 10) What will happen, if we set the number of reducers to 0 ? 
39 | 40 | a) job will fail b) the map-tasks r written directly to the disk (correct) 41 | 42 | 11) How many maximum JVM run on the slave node? 43 | 44 | a) only one as there is only one tasktracker b) 2 one each for tasktracker, datanode c) It depends upon task instances (correct) 45 | 46 | 12) Where is the intermidiate mapper output stored? 47 | 48 | a) It is stored in tmp folder on hdfs b) It is stored on local filesystem(correct) c) It is only in Memory 49 | 50 | 13) When does mappers run ? 51 | 52 | a) They start immediately when job is submitted b) They start only after the mapper finish (correct) 53 | 54 | 55 | 14) What action occurs automatically on a cluster when a DataNode is marked as dead? 56 | 57 | A. The NameNode forces re-replication of all the blocks which were stored on the dead DataNode. 58 | B. The next time a client submits job that requires blocks from the dead DataNode, the JobTracker receives no heart beats from the DataNode. The JobTracker tells the NameNode that the DataNode is dead, which triggers block re-replication on the cluster. 59 | C. The replication factor of the files which had blocks stored on the dead DataNode is temporarily reduced, until the dead DataNode is recovered and returned to the cluster. 60 | D. The NameNode informs the client which write the blocks that are no longer available; the client then re-writes the blocks to a different DataNode. 61 | 62 | 15) QUESTION: 5 63 | Which three distcp features can you utilize on a Hadoop cluster? 64 | A. Use distcp to copy files only between two clusters or more. You cannot use distcp to copy data between directories inside the same cluster. 65 | B. Use distcp to copy HBase table files. 66 | C. Use distcp to copy physical blocks from the source to the target destination in your cluster. 67 | D. Use distcp to copy data between directories inside the same cluster. E. Use distcp to run an internal MapReduce job to copy files. 68 | Answer: B, D, E 69 | 70 | 16) What is the recommended disk configuration for slave nodes in your Hadoop cluster with 6 x 2 TB hard drives? 71 | A. RAID 10 B. JBOD 72 | C. RAID 5 D. RAID 1+0 73 | Answer: B 74 | 75 | 17) Your Hadoop cluster has 25 nodes with a total of 100 TB (4 TB per node) of raw disk space allocated HDFS storage. Assuming Hadoop's default configuration, how much data will you be able to store? 76 | A. Approximately 100TB B. Approximately 25TB C. Approximately 10TB D. Approximately 33 TB 77 | Answer: D 78 | 79 | 18) The most important consideration for slave nodes in a Hadoop cluster running production jobs that require short turnaround times is: 80 | A. The ratio between the amount of memory and the number of disk drives. 81 | B. The ratio between the amount of memory and the total storage capacity. 82 | C. The ratio between the number of processor cores and the amount of memory. D. The ratio between the number of processor cores and total storage capacity. E. The ratio between the number of processor cores and number of disk drives. 83 | Answer: D 84 | 85 | 19) Your existing Hadoop cluster has 30 slave nodes, each of which has 4 x 2T hard drives. You plan to add another 10 nodes. How much disk space can your new nodes contain? 86 | A. The new nodes must all contain 8TB of disk space, but it does not matter how the disks are configured 87 | B. The new nodes cannot contain more than 8TB of disk space 88 | C. The new nodes can contain any amount of disk space 89 | D. 
The new nodes must all contain 4 x 2TB hard drives Answer: C 90 | 91 | 20) On a cluster running MapReduce v1 (MRv1), a MapReduce job is given a directory of 10 plain text as its input directory. Each file is made up of 3 HDFS blocks. How many Mappers will run? 92 | A. We cannot say; the number of Mappers is determined by the developer B. 30 93 | C. 10 94 | D. 1 95 | Answer: B 96 | 97 | 21) Which scheduler would you deploy to ensure that your cluster allows short jobs to finish within a reasonable time without starving long-running jobs? 98 | A. FIFO Scheduler 99 | B. Fair Scheduler 100 | C. Capacity Scheduler 101 | D. Completely Fair Scheduler (CFS) 102 | Answer: B 103 | 104 | 22) You are planning a Hadoop duster, and you expect to be receiving just under 1TB of data per week which will be stored on the cluster, using Hadoop's default replication. You decide that your slave nodes will be configured with 4 x 1TB disks. Calculate how many slave nodes you need to deploy at a minimum to store one year's worth of data. 105 | A. 100 slave nodes B. 100 slave nodes C. 10 slave nodes D. 50 slave nodes 106 | Answer: D 107 | 108 | 23) On a cluster running MapReduce v1 (MRv1), a MapReduce job is given a directory of 10 plain text as its input directory. Each file is made up of 3 HDFS blocks. How many Mappers will run? 109 | A. We cannot say; the number of Mappers is determined by the developer B. 30 110 | C. 10 111 | D. 1 112 | Answer: A 113 | 114 | 24) For each job, the Hadoop framework generates task log files. Where are Hadoop's task log files stored? 115 | A. Cached on the local disk of the slave node running the task, then purged immediately upon task completion. 116 | B. Cached on the local disk of the slave node running the task, then copied into HDFS. 117 | C. In HDFS, in the directory of the user who generates the job. 118 | D. On the local disk of the slave node running the task. 119 | 120 | Answer: D 121 | 122 | 123 | -------------------------------------------------------------------------------- /Notes/quick-links: -------------------------------------------------------------------------------- 1 | AMS: https://cwiki.apache.org/confluence/display/AMBARI/Known+Issues 2 | -------------------------------------------------------------------------------- /Notes/quiz4.txt: -------------------------------------------------------------------------------- 1 | 1) How do you gracefully stop a running job? 2 | 3 | a) hadoop job -kill jobid(correct) b) kill the task tracker c) it can not be done 4 | 5 | 2) What will happen, if we set the number of reducers to 0 ? 6 | 7 | a) job will fail b) the map-tasks r written directly to the disk (correct) 8 | 9 | 3) Where is the intermidiate mapper output stored? 10 | 11 | a) It is stored in tmp folder on hdfs b) It is stored on local filesystem(correct) c) It is only in Memory 12 | 13 | 4) When does mappers run ? 14 | 15 | a) They start immediately when job is submitted b) They start only after the mapper finish (correct) 16 | 17 | 5) Which property set the max number of tasktrackers ? (B is correct) 18 | 19 | a) mapred.tasktracker.map.tasks b) mapred.tasktracker.map.tasks.maximum c) map.tasks.maximum 20 | -------------------------------------------------------------------------------- /Notes/quiz7.txt: -------------------------------------------------------------------------------- 1 | 1) What is HBase? 2 | 3 | a) Is an RDMS database b) Hbase is Column-Oriented c) Distributed database d) Both b and c 4 | 5 | 2) Why we use HBase ? 
6 | 7 | a) It is a DB on top of HDFS b) Hbase provide random read and write on large data set. c) HBase is same as MySql 8 | 9 | 3) What is the maximum size of string data type supported by Hive? 10 | 11 | a) 64MB b) It depends upon the HDFS block size c) 2GB (correct) 12 | 13 | 4) In Hadoop ‘Reading‘ is done in parallel and ‘Writing‘ is not in HDFS. 14 | 15 | a) TRUE (correct) 16 | b) FALSE 17 | 18 | 5) Multiple users can use same metastore in 'Embedded metastore Mode'. 19 | 20 | a) TRUE 21 | b) FALSE (Correct) 22 | 23 | 6) Hbase 'CopyTable' utlitiy can be used to: 24 | 25 | a) Copy a partial table b) Full table c) It is not a valid command d) a and b (correct) 26 | -------------------------------------------------------------------------------- /Notes/quota.txt: -------------------------------------------------------------------------------- 1 | Applying Quota 2 | -------------- 3 | 4 | hadoop dfsadmin -setSpaceQuota 1m 5 | 6 | 7 | dfsadmin -setQuota 8 | 9 | dfsadmin -clrQuota 10 | 11 | dfsadmin -setSpaceQuota 12 | 13 | dfsadmin -clrSpaceQuota 14 | 15 | 16 | Distcp 17 | ====== 18 | 19 | hadoop distcp hdfs://nn1:8020/foo/bar hdfs://nn2:8020/bar/foo 20 | 21 | hdfs://nn1:8020/foo/a hdfs://nn1:8020/foo/b 22 | 23 | hadoop distcp hdfs://nn1.cluster1.com:9000/jobtracker hdfs://nn1.cluster1.com:9000/newtracker 24 | 25 | 26 | Trash 27 | ======= 28 | 29 | 30 | fs.trash.interval 31 | 40 32 | 33 | 34 | SetRep 35 | ===== 36 | hadoop dfs -setrep -R -w 3 /chandra -------------------------------------------------------------------------------- /Notes/rack.txt: -------------------------------------------------------------------------------- 1 | while [ $# -gt 0 ] ; do 2 | nodeArg=$1 3 | exec< /home/hadoop/topology.data 4 | result="" 5 | while read line ; do 6 | ar=( $line ) 7 | if [ "${ar[0]}" = "$nodeArg" ] ; then 8 | result="${ar[1]}" 9 | fi 10 | done 11 | shift 12 | if [ -z "$result" ] ; then 13 | echo -n "/default" 14 | else 15 | echo -n "$result " 16 | fi 17 | done 18 | 19 | 20 | 21 | topology.script.file.name 22 | /home/hadoop/hadoop/conf/topology.sh 23 | 24 | ==================== 25 | The Above works very well on Hadoop 1 but for hadoop 2, make sure to have the correct format emiited by the script. It 26 | takes IP addresses instead of DNS name and also there are multiple classes like simpleDNS and Table based. We do not need to do anything if we are using a script as above, but for Java invocations and other tabular formats we need to modify the "topology.node.switch.mapping.impl" 27 | -------------------------------------------------------------------------------- /Notes/remove_datanode.txt: -------------------------------------------------------------------------------- 1 | Add/Remove a Datanode 2 | ===================== 3 | 4 | Decommission a host gracefully 5 | 6 | 7 | dfs.hosts.exclude 8 | /home/hadoop/excludes 9 | true 10 | 11 | 12 | Similarly for Jobtracker. 13 | 14 | 15 | mapred.hosts.exclude 16 | /home/hadoop/excludes 17 | true 18 | 19 | 20 | mapred.hosts.exclude in mapred-site.xml 21 | 22 | Add the FQDN to the exclude file and refresh 23 | 24 | Update for the Namenode 25 | ----------------------- 26 | 27 | hadoop dfsadmin -refreshNodes 28 | 29 | Update for Jobtracker 30 | ---------------------- 31 | 32 | hadoop mradmin -refreshNodes 33 | 34 | 35 | Add Hosts: 36 | 37 | 1. 
dfs.hosts in the hdfs-site.xml, mapred.hosts 38 | 39 | 40 | ================================================ 41 | 42 | Cluster Balancing 43 | ----------------- 44 | 45 | hadoop balancer -threshold 40 46 | 47 | ============================================== 48 | 49 | Add Disk Space to a datanode 50 | ---------------------------- 51 | 52 | How do you add storage to cluster 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | dfs.hosts 61 | /home/hadoop/include 62 | true 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /Notes/repo_server.txt: -------------------------------------------------------------------------------- 1 | Setup Repo Server 2 | ================= 3 | 4 | Mount Centos DVD and install: 5 | 6 | cd /media/Rhel 6 DVD/Packages/ 7 | 8 | # yum install vsftpd* 9 | # yum install createrepo* 10 | 11 | # mkdir /var/ftp/pub/Centos65 12 | 13 | cp -a /media/RHEL_6_DVD/* /var/ftp/pub/Centos65/ 14 | 15 | # createrepo -v /var/ftp/pub/Centos65/ 16 | 17 | 18 | # service vsftpd restart 19 | 20 | ======================== 21 | 22 | On all the nodes 23 | 24 | # rm -rf /etc/yum.repos.d/* 25 | # vi /etc/yum.repos.d/server.repo 26 | 27 | [server] 28 | name=Centos 6.5 repository 29 | baseurl=ftp:///pub/Centos65/ 30 | gpgcheck=0 31 | enable=1 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /Notes/scoop.txt: -------------------------------------------------------------------------------- 1 | export SQOOP_HOME=/usr/lib/sqoop 2 | export PATH=$PATH:$SQOOP_HOME/bin 3 | 4 | 5 | Step 2: Configure the MySQL Service and Connector 6 | 7 | Download mysql-connector-java-5.0.5.jar file and copy it to $SQOOP_HOME/lib directory. 8 | 9 | Step 3: Sqoop Installation 10 | 11 | Sqoop Installation Tutorial for instructions of how to install Sqoop. 12 | 13 | Database and table creation in MySQL 14 | 15 | First connect to MySQL 16 | 17 | $ mysql -u root -p 18 | 19 | Enter password: 20 | 21 | Create database ‘testDb’ and use ‘testDb’ database as a current database. 22 | 23 | mysql> create database testDb; 24 | 25 | mysql> use testDb; 26 | 27 | Create table ‘student’ 28 | 29 | mysql> create table student(id integer,name char(20)); 30 | 31 | Add following 2 records to the table 32 | 33 | mysql> insert into student values(1,'Archana'); 34 | 35 | mysql> insert into student values(2,'XYZ'); 36 | 37 | Exit from MySQL 38 | 39 | mysql> exit; 40 | 41 | Sqoop import 42 | 1. Importing a table into HDFS 43 | 44 | 1st way 45 | 46 | Command for import 47 | 48 | sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --m 1 49 | 50 | Execute the sqoop import 51 | 52 | Here we are using database ‘testDb’ , username ‘root’, password ‘hadoop123′, and table student. 53 | 54 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --m 1 55 | 56 | ——————- NOTE——————– 57 | 58 | If you have not defined primary key for your table then you have to give ‘-m 1′ option for import. 59 | Otherwise it gives error 60 | ERROR tool.ImportTool: Error during import: No primary key could be found for table student1. Please specify one with --split-by or perform a sequential import with '-m 1'. 
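If the table does have a numeric key column (the student table above has an integer id), the import can instead be parallelised by naming the split column explicitly, for example:

sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --split-by id -m 2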
61 | 62 | 2nd Way 63 | 64 | Create a config file $HOME/import.txt add following to the config file 65 | 66 | import.txt 67 | 68 | import 69 | --connect 70 | jdbc:mysql://localhost/testDb 71 | --username 72 | root 73 | --password 74 | hadoop123 75 | 76 | Execute the sqoop import 77 | 78 | sqoop --options-file /home/hduser/import.txt --table student -m 1 79 | 80 | Once import is done you can find student.jar, student.class and student.java at following location /tmp/sqoop-hduser/compile/—-/student.jar 81 | 82 | Files created in HDFS 83 | 84 | $ hadoop dfs -ls -R student 85 | 86 | Found 3 items 87 | 88 | -rw-r--r-- 1 hduser supergroup 0 2013-09-13 15:38 /user/hduser/student/_SUCCESS 89 | 90 | drwxr-xr-x - hduser supergroup 0 2013-09-13 15:38 /user/hduser/student/_logs 91 | 92 | -rw-r--r-- 1 hduser supergroup 16 2013-09-13 15:38 /user/hduser/student/part-m-00000 93 | 94 | Data file contents 95 | 96 | $ hadoop dfs -cat /user/hduser/student/part-m-00000 97 | 98 | 1,Archana 99 | 2,XYZ 100 | 101 | 2 Import all rows of a table in MySQL, but specific columns of the table 102 | 103 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --columns "name" -m 1 104 | 105 | Data file contents 106 | 107 | $ hadoop dfs -cat /user/hduser/student/part-m-00000 108 | 109 | Archana 110 | Xyz 111 | 112 | 3 Import all columns, filter rows using where clause 113 | 114 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --where "id>1" -m 1 --target-dir /user/hduser/ar 115 | 116 | Data file contents 117 | 118 | $ hadoop dfs -cat /user/hduser/ar/part-m-00000 119 | 2,XYZ -------------------------------------------------------------------------------- /Notes/sqoop.txt: -------------------------------------------------------------------------------- 1 | export SQOOP_HOME=/usr/lib/sqoop 2 | export PATH=$PATH:$SQOOP_HOME/bin 3 | 4 | 5 | Step 2: Configure the MySQL Service and Connector 6 | 7 | Download mysql-connector-java-5.0.5.jar file and copy it to $SQOOP_HOME/lib directory. 8 | 9 | Step 3: Sqoop Installation 10 | 11 | Sqoop Installation Tutorial for instructions of how to install Sqoop. 12 | 13 | Database and table creation in MySQL 14 | 15 | First connect to MySQL 16 | 17 | $ mysql -u root -p 18 | 19 | Enter password: 20 | 21 | Create database ‘testDb’ and use ‘testDb’ database as a current database. 22 | 23 | mysql> create database testDb; 24 | 25 | mysql> use testDb; 26 | 27 | Create table ‘student’ 28 | 29 | mysql> create table student(id integer,name char(20)); 30 | 31 | Add following 2 records to the table 32 | 33 | mysql> insert into student values(1,'Archana'); 34 | 35 | mysql> insert into student values(2,'XYZ'); 36 | 37 | Exit from MySQL 38 | 39 | mysql> exit; 40 | 41 | Sqoop import 42 | 1. Importing a table into HDFS 43 | 44 | 1st way 45 | 46 | Command for import 47 | 48 | sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --m 1 49 | 50 | Execute the sqoop import 51 | 52 | Here we are using database ‘testDb’ , username ‘root’, password ‘hadoop123′, and table student. 53 | 54 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --m 1 55 | 56 | ——————- NOTE——————– 57 | 58 | If you have not defined primary key for your table then you have to give ‘-m 1′ option for import. 
59 | Otherwise it gives error 60 | ERROR tool.ImportTool: Error during import: No primary key could be found for table student1. Please specify one with --split-by or perform a sequential import with '-m 1'. 61 | 62 | 2nd Way 63 | 64 | Create a config file $HOME/import.txt add following to the config file 65 | 66 | import.txt 67 | 68 | import 69 | --connect 70 | jdbc:mysql://localhost/testDb 71 | --username 72 | root 73 | --password 74 | hadoop123 75 | 76 | Execute the sqoop import 77 | 78 | sqoop --options-file /home/hduser/import.txt --table student -m 1 79 | 80 | Once import is done you can find student.jar, student.class and student.java at following location /tmp/sqoop-hduser/compile/—-/student.jar 81 | 82 | Files created in HDFS 83 | 84 | $ hadoop dfs -ls -R student 85 | 86 | Found 3 items 87 | 88 | -rw-r--r-- 1 hduser supergroup 0 2013-09-13 15:38 /user/hduser/student/_SUCCESS 89 | 90 | drwxr-xr-x - hduser supergroup 0 2013-09-13 15:38 /user/hduser/student/_logs 91 | 92 | -rw-r--r-- 1 hduser supergroup 16 2013-09-13 15:38 /user/hduser/student/part-m-00000 93 | 94 | Data file contents 95 | 96 | $ hadoop dfs -cat /user/hduser/student/part-m-00000 97 | 98 | 1,Archana 99 | 2,XYZ 100 | 101 | 2 Import all rows of a table in MySQL, but specific columns of the table 102 | 103 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --columns "name" -m 1 104 | 105 | Data file contents 106 | 107 | $ hadoop dfs -cat /user/hduser/student/part-m-00000 108 | 109 | Archana 110 | Xyz 111 | 112 | 3 Import all columns, filter rows using where clause 113 | 114 | $ sqoop import --connect jdbc:mysql://localhost/testDb --username root --password hadoop123 --table student --where "id>1" -m 1 --target-dir /user/hduser/ar 115 | 116 | Data file contents 117 | 118 | $ hadoop dfs -cat /user/hduser/ar/part-m-00000 119 | 2,XYZ -------------------------------------------------------------------------------- /Notes/sqoop1.txt: -------------------------------------------------------------------------------- 1 | sqoop list-databases --connect jdbc:mysql://client.cluster1.com/employee --username sqoop --password passwd 2 | 3 | sqoop list-tables --connect jdbc:mysql://client.cluster1.com/employee --username sqoop --password passwd 4 | 5 | 6 | sqoop import --connect jdbc:mysql://client.cluster1.com/employee --username sqoop --password passwd --table student -m 1 --target-dir /user/sqoop/employee 7 | 8 | sqoop import --connect jdbc:mysql://client.cluster1.com/employee --username sqoop --password passwd --table student -m 1 --target-dir /user/sqoop/employee 9 | 10 | 11 | sqoop --options-file SqoopImportOptions.txt \ 12 | --table employees \ 13 | --where "emp_no > 499948" \ 14 | --as-textfile \ 15 | -m 1 \ 16 | --target-dir /user/airawat/sqoop-mysql/employeeGtTest 17 | 18 | 19 | sqoop --options-file SqoopImportOptions.txt \ 20 | --query 'select EMP_NO,FIRST_NAME,LAST_NAME from employees where $CONDITIONS' \ 21 | --fetch-size=50000 \ 22 | --split-by EMP_NO \ 23 | --direct \ 24 | --target-dir /user/airawat/sqoop-mysql/FetchSize 25 | 26 | sqoop --options-file SqoopImportOptions.txt \ 27 | 28 | --query 'select EMP_NO,FIRST_NAME,LAST_NAME from employees where $CONDITIONS' \ 29 | -z \ 30 | --split-by EMP_NO \ 31 | --direct \ 32 | --target-dir /user/airawat/sqoop-mysql/CompressedSampl 33 | 34 | ================= 35 | mysql> create table employee(id varchar(20),name varchar(20),salary varchar(10)); 36 | 37 | hive -> CREATE External TABLE emp_hive (id INT, name STRING, salary 
STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE location '/user/hadoop/table'; 38 | 39 | 40 | sqoop import --connect jdbc:mysql://repo.cluster1.com/test --username hadoop --password hivepassword --table employee --target-dir /user/hadoop/table -m 1 --incremental append -check-column id 41 | 42 | #!/bin/bash 43 | 44 | for i in `seq 1 100` 45 | do 46 | echo "insert into test.employee(id,name,salary) values('${i}','Am${i}','10000');" 47 | done 48 | -------------------------------------------------------------------------------- /Notes/yarn.txt: -------------------------------------------------------------------------------- 1 | hadoop-daemon.sh start namenode 2 | hadoop-daemon.sh start datanode 3 | 4 | yarn-daemon.sh start resourcemanager 5 | yarn-daemon.sh start nodemanager 6 | 7 | 8 | 9 | yarn.resourcemanager.address 10 | ha-nn1.hacluster1.com:8032 11 | the host is the hostname of the ResourceManager and the port is the port on 12 | which the clients can talk to the Resource Manager. 13 | 14 | 15 | 16 | yarn.resourcemanager.scheduler.address 17 | ha-nn1.hacluster1.com:8030 18 | host is the hostname of the resourcemanager and port is the port 19 | on which the Applications in the cluster talk to the Resource Manager. 20 | 21 | 22 | 23 | 24 | yarn.resourcemanager.resource-tracker.address 25 | ha-nn1.hacluster1.com:8031 26 | host is the hostname of the resource manager and 27 | port is the port on which the NodeManagers contact the Resource Manager. 28 | 29 | 30 | 31 | 32 | yarn.nodemanager.address 33 | 0.0.0.0:9004 34 | the nodemanagers bind to this port 35 | 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | @ Netxillon Technologies. You are allowed to use and modify any work here, provided you acknowlege the source back. 3 | Please contact at trainings@netxillon.com for any questions. 4 | 5 | Disclaimer: There is no responsibility for any kind of damage caused, by using this github. Please make sure you understand the things here before implementing them in production. 6 | ``` 7 | ``` 8 | http://www.netxillon.com 9 | For any help you can reach me at: trainings@netxillon.com 10 | ``` 11 | 12 | #### Courses 13 | 14 | Hadoop Cluster Configurations 15 | The config files are from running cluster. Feel free to use them, but please drop an email with your feedback. 16 | 17 | I provide Advanced Hadoop Administration and DevOps trainings: 18 | > Hadoop, HBase, Kafka, Spark 19 | > Ansible automation for Hadoop Stack 20 | > Advanced Linux Optmizations 21 | 22 | Advanced Hadoop Training: I will be covering topics like: detailed kerberos, Encryption, Centerlized caching, Storage policy, Ranger, Knox, Hadoop Performance Tuning and Production Use cases. Contact me for details. 23 | 24 | > "Doing a course is not a guarantee for a job, but having a solid foundation surely is" 25 | 26 | For Details on Courses offered, please refer to the folder **Courses_Offered**. 27 | -------------------------------------------------------------------------------- /Schedulers/capacity-scheduler.xml: -------------------------------------------------------------------------------- 1 | 14 | 15 | 16 | 17 | yarn.scheduler.capacity.maximum-applications 18 | 10000 19 | 20 | Maximum number of applications that can be pending and running. 
21 | 22 | 23 | 24 | 25 | yarn.scheduler.capacity.maximum-am-resource-percent 26 | 0.1 27 | 28 | Maximum percent of resources in the cluster which can be used to run 29 | application masters i.e. controls number of concurrent running 30 | applications. 31 | 32 | 33 | 34 | 35 | yarn.scheduler.capacity.resource-calculator 36 | org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator 37 | 38 | The ResourceCalculator implementation to be used to compare 39 | Resources in the scheduler. 40 | The default i.e. DefaultResourceCalculator only uses Memory while 41 | DominantResourceCalculator uses dominant-resource to compare 42 | multi-dimensional resources such as Memory, CPU etc. 43 | 44 | 45 | 46 | 47 | yarn.scheduler.capacity.root.queues 48 | default,sales,marketing 49 | 50 | The queues at the this level (root is the root queue). 51 | 52 | 53 | 54 | 55 | yarn.scheduler.capacity.root.default.capacity 56 | 50 57 | Default queue target capacity. 58 | 59 | 60 | 61 | yarn.scheduler.capacity.root.default.user-limit-factor 62 | 1 63 | 64 | Default queue user limit a percentage from 0.0 to 1.0. 65 | 66 | 67 | 68 | 69 | yarn.scheduler.capacity.root.default.maximum-capacity 70 | 100 71 | 72 | The maximum capacity of the default queue. 73 | 74 | 75 | 76 | 77 | yarn.scheduler.capacity.root.default.state 78 | RUNNING 79 | 80 | The state of the default queue. State can be one of RUNNING or STOPPED. 81 | 82 | 83 | 84 | 85 | yarn.scheduler.capacity.root.default.acl_submit_applications 86 | * 87 | 88 | The ACL of who can submit jobs to the default queue. 89 | 90 | 91 | 92 | 93 | yarn.scheduler.capacity.root.default.acl_administer_queue 94 | * 95 | 96 | The ACL of who can administer jobs on the default queue. 97 | 98 | 99 | 100 | 101 | yarn.scheduler.capacity.node-locality-delay 102 | 40 103 | 104 | Number of missed scheduling opportunities after which the CapacityScheduler 105 | attempts to schedule rack-local containers. 106 | Typically this should be set to number of nodes in the cluster, By default is setting 107 | approximately number of nodes in one rack which is 40. 
108 | 109 | 110 | 111 | # sales queue 112 | 113 | 114 | yarn.scheduler.capacity.root.sales.capacity 115 | 30 116 | 117 | 118 | 119 | yarn.scheduler.capacity.root.sales.user-limit-factor 120 | 1 121 | 122 | 123 | 124 | yarn.scheduler.capacity.root.sales.maximum-capacity 125 | 100 126 | 127 | 128 | 129 | yarn.scheduler.capacity.root.sales.state 130 | RUNNING 131 | 132 | 133 | 134 | yarn.scheduler.capacity.root.sales.acl_submit_applications 135 | * 136 | 137 | 138 | 139 | yarn.scheduler.capacity.root.sales.acl_administer_queue 140 | * 141 | 142 | 143 | # Marketing Queue 144 | 145 | 146 | yarn.scheduler.capacity.root.marketing.capacity 147 | 20 148 | 149 | 150 | 151 | yarn.scheduler.capacity.root.marketing.user-limit-factor 152 | 1 153 | 154 | 155 | 156 | yarn.scheduler.capacity.root.marketing.maximum-capacity 157 | 100 158 | 159 | 160 | 161 | yarn.scheduler.capacity.root.marketing.state 162 | RUNNING 163 | 164 | 165 | 166 | yarn.scheduler.capacity.root.marketing.acl_submit_applications 167 | * 168 | 169 | 170 | 171 | yarn.scheduler.capacity.root.marketing.acl_administer_queue 172 | * 173 | 174 | 175 | 176 | yarn.scheduler.capacity.queue-mappings 177 | 178 | 179 | A list of mappings that will be used to assign jobs to queues 180 | The syntax for this list is [u|g]:[name]:[queue_name][,next mapping]* 181 | Typically this list will be used to map users to queues, 182 | for example, u:%user:%user maps all users to queues with the same name 183 | as the user. 184 | 185 | 186 | 187 | 188 | yarn.scheduler.capacity.queue-mappings-override.enable 189 | false 190 | 191 | If a queue mapping is present, will it override the value specified 192 | by the user? This can be used by administrators to place jobs in queues 193 | that are different than the one specified by the user. 194 | The default is false. 
195 | 196 | 197 | 198 | 199 | -------------------------------------------------------------------------------- /Schedulers/commands: -------------------------------------------------------------------------------- 1 | Hadoop 1: 2 | hadoop jar hadoop/hadoop-examples-1.2.1.jar wordcount -Dmapred.job.queue.name=high /project/input /output2233231 3 | 4 | Hadoop 2: 5 | yarn jar hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.3.jar wordcount -Dmapred.job.queue.name=sales /test /out 6 | 7 | Useful Commands: 8 | $ yarn rmadmin -refreshQueues 9 | $ mapred queue -list 10 | -------------------------------------------------------------------------------- /Schedulers/fair-scheduler.xml: -------------------------------------------------------------------------------- 1 | Hadoop 1, we used the concept "pool" as well, but later it was standarized to queues 2 | 3 | 4 | 5 | 6 | 10 7 | 5 8 | 9 | 10 | 11 | #Examples 12 | 13 | 14 | 10000 mb,0vcores 15 | 90000 mb,0vcores 16 | 50 17 | 0.1 18 | 2.0 19 | fair 20 | 21 | charlie 22 | 5000 mb,0vcores 23 | 24 | 25 | 26 | 0.5 27 | 28 | 30 | 31 | 3.0 32 | 33 | 34 | 35 | 30 36 | 37 | 5 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Schedulers/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | mapred.job.tracker 3 | jt.cluster1.com:9001 4 | 5 | 6 | 7 | mapred.jobtracker.taskScheduler 8 | org.apache.hadoop.mapred.FairScheduler 9 | 10 | 11 | 12 | mapred.fairscheduler.allocation.file 13 | /home/hadoop/hadoop/conf/fair-scheduler.xml 14 | 15 | 16 | 17 | mapred.fairscheduler.poolnameproperty 18 | mapred.job.queue.name 19 | true 20 | 21 | 22 | 23 | mapred.queue.names 24 | default,high,low 25 | 26 | -------------------------------------------------------------------------------- /Schedulers/user-mappings.txt: -------------------------------------------------------------------------------- 1 | 2 | yarn.scheduler.capacity.queue-mappings 3 | u:hdfs:marketing 4 | 5 | A list of mappings that will be used to assign jobs to queues 6 | The syntax for this list is [u|g]:[name]:[queue_name][,next mapping]* 7 | Typically this list will be used to map users to queues, 8 | for example, u:%user:%user maps all users to queues with the same name 9 | as the user. 10 | 11 | 12 | 13 | u:%user:%primary_group 14 | 15 | 16 | yarn.scheduler.capacity.queue-mappings 17 | u:%user:%primary_group 18 | 19 | 20 | 21 | yarn.scheduler.capacity.queue-mappings 22 | u:maria:engineering,g:webadmins:weblog 23 | 24 | 25 | 26 | yarn.scheduler.capacity.queue-mappings-override.enable 27 | false 28 | 29 | If a queue mapping is present and override is set to true, it will override the queue value specified 30 | by the user. This can be used by administrators to place jobs in queues 31 | that are different than the one specified by the user. 32 | The default is false - user can specify to a non-default queue. 33 | 34 | 35 | -------------------------------------------------------------------------------- /Schedulers/yarn-site.xml_capacity: -------------------------------------------------------------------------------- 1 | # Capacity Scheduler is the default scheduler. 
So, we do not need to configure the below in Hadoop 2.x 2 | 3 | 4 | 5 | yarn.resourcemanager.scheduler.class 6 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler 7 | yarn-default.xml 8 | 9 | -------------------------------------------------------------------------------- /Schedulers/yarn-site.xml_fair: -------------------------------------------------------------------------------- 1 | 2 | yarn.resourcemanager.scheduler.class 3 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler 4 | 5 | 6 | 7 | yarn.scheduler.fair.allocation.file 8 | hadoop/conf/fair-scheduler.xml 9 | 10 | 11 | -------------------------------------------------------------------------------- /Security/README.md: -------------------------------------------------------------------------------- 1 | Important Points before starting with Security: 2 | =============================================== 3 | 1. Ensure NTP is working and all nodes are in sync. 4 | 2. Ensure every system has the right entropy, at least 1000; refer to the installation of rngd under the Kerberos install script. 5 | - This ensures faster cryptographic operations for keys, principals, etc. 6 | 3. For Kerberos, make sure Java is patched with the JCE unlimited-strength policy (unrestricted key length). 7 | 4. If not using SASL for Datanodes, ensure JSVC_HOME points to the jsvc binary. 8 | 9 | This is a very vast topic with a lot to cover: 10 | 11 | - The integrations can be with AD, FreeIPA, OpenLDAP or Kerberos. 12 | - SIEM, Rhino, etc. 13 | 14 | For any specific needs, please contact me at trainings@netxillon.com 15 | -------------------------------------------------------------------------------- /Security/SSL_Configs/CA/README.txt: -------------------------------------------------------------------------------- 1 | This is to set up a CA and get all certs signed by that CA.
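A minimal sketch of the overall flow, assuming the CA layout under /etc/pki/CA already exists; hostnames, key names and passphrases below are illustrative, and the full command set follows in commands_CA_JKS:

# One-time CA setup (on the CA host): create the CA key and a self-signed root cert
openssl genrsa -out private/myca.key -aes128 2048
openssl req -new -x509 -key private/myca.key -days 365 > CA.crt

# Per node (node1 is an example name): generate a key and CSR, sign the CSR with the CA, then verify
openssl req -new -newkey rsa:2048 -nodes -keyout node1.key -out node1.csr
openssl ca -in node1.csr -out node1.crt
openssl verify -CAfile CA.crt node1.crt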
2 | -------------------------------------------------------------------------------- /Security/SSL_Configs/commands_CA_JKS: -------------------------------------------------------------------------------- 1 | # yum install openssl-devel 2 | cd /etc/pki/CA/ 3 | 4 | ls -l crl/ 5 | ls -l newcerts/ 6 | ls -l private/ 7 | vi /etc/pki/tls/openssl.cnf 8 | touch /etc/pki/CA/index.txt 9 | echo 01 > /etc/pki/CA/serial 10 | 11 | openssl genrsa -out private/myca.key -des3 2048 12 | or openssl genrsa -out private/myca.key -aes128 2048 13 | 14 | openssl req -new -x509 -key private/myca.key -days 365 > CA.crt 15 | ---------------- 16 | more refined way: 17 | openssl req -new -sha256 -key private/myca.key -nodes -out rootCA.csr 18 | openssl x509 -req -days 3650 -extensions v3_ca -in rootCA.csr -signkey private/myca.key -out rootCA.pem 19 | ------------------ 20 | 21 | mkdir certs 22 | cd certs/ 23 | openssl req -new -newkey rsa:2048 -nodes -keyout dilithium.key -out dilithium.csr 24 | 25 | openssl ca -in dilithium.csr -out dilithium.crt 26 | openssl req -new -newkey rsa:2048 -nodes -keyout cluster1.key -out cluster1.csr 27 | openssl ca -in cluster1.csr -out cluster1.crt 28 | openssl req -new -newkey rsa:2048 -nodes -keyout cluster1.key -out cluster1.csr 29 | openssl ca -in cluster1.csr -out cluster1.crt 30 | 31 | openssl verify -CAfile /etc/pki/CA/CA.crt certs/dilithium.crt 32 | 33 | 34 | openssl verify cluster1.crt 35 | openssl verify dilithium.crt 36 | 37 | 38 | Hadoop JKS steps: CA signed 39 | --------------------------- 40 | 41 | keytool -genkey -alias `hostname -s` -keyalg RSA -dname "CN=`hostname -f`,OU=Netxillon Technologies,O=Netxillon Technologies,L=Melbourne,ST=Victoria,C=AU" -keypass password -keystore keystore.jks -storepass password 42 | 43 | keytool -certreq -alias `hostname -s` -keyalg RSA -file `hostname -s`.csr -keystore keystore.jks -storepass password 44 | 45 | openssl ca -batch -passin pass:redhat -in `hostname -s`.csr -out `hostname -s`.crt 46 | 47 | keytool -import -keystore keystore.jks -file CA.crt -alias CARoot -storepass password -noprompt 48 | 49 | keytool -import -keystore keystore.jks -file `hostname -s`.crt -alias `hostname -s` -keypass password -storepass password -noprompt 50 | 51 | keytool -importcert -keystore truststore.jks -file CA.crt -alias CARoot -storepass password -noprompt 52 | 53 | Good to do: 54 | =========== 55 | keytool -exportcert -alias caroot -keystore /etc/security/keys/truststore.jks -file /usr/java/default/jre/lib/security/cacerts 56 | 57 | 58 | Verify PEM format or not 59 | ======================== 60 | 61 | openssl x509 -inform PEM -in CA.crt 62 | openssl x509 -inform PEM -in CA.pem 63 | openssl x509 -inform PEM -in cm1.opta.com-server.pem 64 | 65 | Verify cert presented by Server 66 | -------------------------------- 67 | openssl s_client -verify 100 -showcerts -CAfile <($JAVA_HOME/bin/keytool -list -rfc -keystore $JAVA_HOME/jre/lib/security/jssecacerts -storepass changeit) -connect cm1.opta.com:7183 68 | 69 | openssl s_client -connect cm1.opta.com:7183 2>/dev/null 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | fs.default.name 10 | hdfs://nn1.cluster1.com:9000 11 | 12 | 13 | 14 | hadoop.rpc.protection 15 | privacy 16 | 17 | 18 | 19 | hadoop.ssl.require.client.cert 20 | false 21 | 22 | 23 | 24 | hadoop.ssl.hostname.verifier 25 | DEFAULT 26 | 27 | 28 | 29 | hadoop.ssl.keystores.factory.class 30 | org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory 31 | 32 | 33 | 34 | hadoop.ssl.server.conf 35 | ssl-server.xml 36 | 37 | 38 | 39 | hadoop.ssl.client.conf 40 | 
ssl-client.xml 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /Security/SSL_Configs/hadoop_ssl_configs/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | dfs.encrypt.data.transfer 10 | true 11 | 12 | 13 | 14 | dfs.block.access.token.enable 15 | true 16 | 17 | 18 | 19 | dfs.data.transfer.protection 20 | privacy 21 | 22 | 23 | 24 | dfs.namenode.secondary.https-address 25 | nn1.cluster1.com:50091 26 | 27 | 28 | 29 | dfs.namenode.https-address 30 | nn1.cluster1.com:50470 31 | 32 | 33 | 34 | dfs.webhdfs.enabled 35 | true 36 | 37 | 38 | 39 | dfs.https.enable 40 | true 41 | 42 | 43 | 44 | dfs.http.policy 45 | HTTPS_ONLY 46 | 47 | 48 | 49 | dfs.name.dir 50 | /data/nn1,/data/nn2 51 | 52 | 53 | 54 | dfs.data.dir 55 | /data/d1,/data/d2 56 | 57 | 58 | 59 | dfs.replication 60 | 1 61 | 62 | 63 | 64 | dfs.datanode.https.address 65 | 0.0.0.0:50475 66 | 67 | 68 | 69 | dfs.datanode.address 70 | 0.0.0.0:10019 71 | 72 | 73 | 74 | dfs.datanode.http.address 75 | 0.0.0.0:10022 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /Security/SSL_Configs/hadoop_ssl_configs/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | mapreduce.framework.name 10 | yarn 11 | 12 | 13 | 14 | hadoop.ssl.enabled 15 | true 16 | 17 | 18 | 19 | mapreduce.shuffle.ssl.enabled 20 | true 21 | 22 | 23 | 24 | hadoop.ssl.require.client.cert 25 | false 26 | 27 | 28 | 29 | hadoop.ssl.hostname.verifier 30 | DEFAULT 31 | true 32 | 33 | 34 | 35 | hadoop.ssl.keystores.factory.class 36 | org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory 37 | true 38 | 39 | 40 | 41 | hadoop.ssl.server.conf 42 | ssl-server.xml 43 | true 44 | 45 | 46 | 47 | hadoop.ssl.client.conf 48 | ssl-client.xml 49 | true 50 | 51 | 52 | 53 | 54 | mapreduce.jobhistory.http.policy 55 | HTTPS_ONLY 56 | 57 | 58 | 59 | mapreduce.jobhistory.webapp.https.address 60 | rm1.cluster1.com:19889 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /Security/SSL_Configs/hadoop_ssl_configs/ssl-client.xml: -------------------------------------------------------------------------------- 1 | [hadoop@ip-172-31-15-180 ~]$ cat /etc/hadoop/conf/ssl-client.xml 2 | 3 | 4 | 5 | 6 | 7 | 8 | ssl.client.truststore.location 9 | ${user.home}/keystore/final.jks 10 | Truststore to be used by clients like distcp. Must be 11 | specified. 12 | 13 | 14 | 15 | 16 | ssl.client.truststore.password 17 | password 18 | Optional. Default value is "". 19 | 20 | 21 | 22 | 23 | ssl.client.truststore.type 24 | jks 25 | Optional. The keystore file format, default value is "jks". 26 | 27 | 28 | 29 | 30 | ssl.client.truststore.reload.interval 31 | 10000 32 | Truststore reload check interval, in milliseconds. 33 | Default value is 10000 (10 seconds). 34 | 35 | 36 | 37 | 38 | ssl.client.keystore.location 39 | ${user.home}/keystore/keystore.jks 40 | Keystore to be used by clients like distcp. Must be 41 | specified. 42 | 43 | 44 | 45 | 46 | ssl.client.keystore.password 47 | password 48 | Optional. Default value is "". 49 | 50 | 51 | 52 | 53 | ssl.client.keystore.keypassword 54 | password 55 | Optional. Default value is "". 56 | 57 | 58 | 59 | 60 | ssl.client.keystore.type 61 | jks 62 | Optional. The keystore file format, default value is "jks". 
63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /Security/SSL_Configs/hadoop_ssl_configs/ssl-server.xml: -------------------------------------------------------------------------------- 1 | [hadoop@ip-172-31-15-180 ~]$ cat /etc/hadoop/conf/ssl-server.xml 2 | 3 | 4 | 5 | 6 | 7 | 8 | ssl.server.keystore.type 9 | jks 10 | 11 | 12 | ssl.server.keystore.location 13 | /home/hadoop/keystore/keystore.jks 14 | 15 | 16 | ssl.server.keystore.password 17 | password 18 | 19 | 20 | 21 | 22 | ssl.server.truststore.type 23 | jks 24 | 25 | 26 | ssl.server.truststore.location 27 | /home/hadoop/keystore/truststore.jks 28 | 29 | 30 | ssl.server.truststore.password 31 | password 32 | 33 | 34 | ssl.server.truststore.reload.interval 35 | 10000 36 | 37 | 38 | -------------------------------------------------------------------------------- /Security/SSL_Configs/hadoop_ssl_configs/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | yarn.resourcemanager.resource-tracker.address 9 | rm1.cluster1.com:9001 10 | 11 | 12 | 13 | yarn.resourcemanager.scheduler.address 14 | rm1.cluster1.com:9002 15 | 16 | 17 | 18 | yarn.resourcemanager.address 19 | rm1.cluster1.com:9003 20 | 21 | 22 | 23 | yarn.nodemanager.aux-services 24 | mapreduce_shuffle 25 | 26 | 27 | 28 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 29 | org.apache.hadoop.mapred.ShuffleHandler 30 | 31 | 32 | 33 | yarn.http.policy 34 | HTTPS_ONLY 35 | 36 | 37 | 38 | yarn.resourcemanager.webapp.https.address 39 | rm1.cluster1.com:8089 40 | 41 | 42 | 43 | yarn.log.server.url 44 | https://rm1.cluster1.com:19889/jobhistory/logs 45 | 46 | 47 | 48 | yarn.nodemanager.webapp.https.address 49 | 0.0.0.0:8090 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /Security/kerberos/JT/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | fs.default.name 10 | hdfs://nn1.cluster1.com:9000 11 | 12 | 13 | 14 | hadoop.security.authentication 15 | kerberos 16 | 17 | 18 | 19 | hadoop.security.authorization 20 | true 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Security/kerberos/JT/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | dfs.data.dir 10 | /space/d1 11 | true 12 | 13 | 14 | 15 | dfs.replication 16 | 1 17 | 18 | 19 | 20 | dfs.permissions.supergroup 21 | hadoop 22 | 23 | 24 | 25 | dfs.permissions.superusergroup 26 | hadoop 27 | 28 | 29 | 30 | dfs.datanode.data.dir.perm 31 | 700 32 | 33 | 34 | 35 | dfs.datanode.address 36 | 192.168.1.74:1004 37 | 38 | 39 | 40 | dfs.datanode.http.address 41 | 192.168.1.74:1006 42 | 43 | 44 | 45 | dfs.datanode.keytab.file 46 | /home/hadoop/dn.hdfs.keytab 47 | 48 | 49 | 50 | dfs.datanode.kerberos.principal 51 | dn/_HOST@CLUSTER1.COM 52 | 53 | 54 | 55 | dfs.datanode.kerberos.https.principal 56 | host/_HOST@CLUSTER1.COM 57 | 58 | 59 | 60 | dfs.namenode.kerberos.principal 61 | nn/_HOST@CLUSTER1.COM 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /Security/kerberos/JT/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | mapred.job.tracker 10 | jt1.cluster1.com:9001 11 | 12 | 13 | 14 | mapreduce.jobtracker.kerberos.principal 
15 | mapred/_HOST@CLUSTER1.COM 16 | 17 | 18 | 19 | mapreduce.jobtracker.kerberos.https.principal 20 | host/_HOST@CLUSTER1.COM 21 | 22 | 23 | 24 | mapreduce.jobtracker.keytab.file 25 | /home/hadoop/mapred.keytab 26 | 27 | 28 | 29 | mapreduce.tasktracker.kerberos.principal 30 | mapred/_HOST@CLUSTER1.COM 31 | 32 | 33 | 34 | mapreduce.tasktracker.kerberos.https.principal 35 | host/_HOST@CLUSTER1.COM 36 | 37 | 38 | 39 | mapreduce.tasktracker.keytab.file 40 | /home/hadoop/tt.mapred.keytab 41 | 42 | 43 | 44 | mapred.local.dir 45 | /space/tmp 46 | 47 | 48 | 49 | mapreduce.tasktracker.group 50 | mapred 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /Security/kerberos/JT/taskcontroller.cfg: -------------------------------------------------------------------------------- 1 | mapred.local.dir=/space/tmp#configured value of mapred.local.dir. It can be a list of comma separated paths. 2 | hadoop.log.dir=/home/hadoop/log#configured value of hadoop.log.dir. 3 | mapred.tasktracker.tasks.sleeptime-before-sigkill=#sleep time before sig kill is to be sent to process group after sigterm is sent. Should be in seconds 4 | mapreduce.tasktracker.group=#configured value of mapreduce.tasktracker.group. 5 | 6 | 7 | mapred.task.tracker.task-controller 8 | org.apache.hadoop.mapred.LinuxTaskController 9 | 10 | 11 | 12 | mapreduce.tasktracker.group 13 | mapred 14 | 15 | -------------------------------------------------------------------------------- /Security/kerberos/Jsvc_download.txt: -------------------------------------------------------------------------------- 1 | 1. http://commons.apache.org/proper/commons-daemon/download_daemon.cgi 2 | 3 | Downlaod package: commons-daemon-1.1.0-native-src.tar.gz 4 | 5 | $ tar -xzvf commons-daemon-1.1.0-native-src.tar.gz 6 | $ cd commons-daemon-1.1.0-native-src/unix 7 | $ ./configure && make 8 | $ cp jsvc /usr/lib 9 | 10 | 11 | Under hadoop-env.sh 12 | 13 | export JSVC_HOME=/usr/lib 14 | 15 | 2. Directly download the binary: http://archive.apache.org/dist/commons/daemon/binaries/ 16 | -------------------------------------------------------------------------------- /Security/kerberos/Namenode_Datanode/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | fs.default.name 10 | hdfs://nn1.cluster1.com:9000 11 | 12 | 13 | 14 | hadoop.security.authentication 15 | kerberos 16 | 17 | 18 | 19 | hadoop.security.authorization 20 | true 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Security/kerberos/Namenode_Datanode/hadoop-env.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Set Hadoop-specific environment variables here. 18 | 19 | # The only required environment variable is JAVA_HOME. All others are 20 | # optional. When running a distributed configuration it is best to 21 | # set JAVA_HOME in this file, so that it is correctly defined on 22 | # remote nodes. 23 | 24 | export JAVA_HOME=/usr/java/latest 25 | 26 | # The java implementation to use. 27 | export JAVA_HOME=${JAVA_HOME} 28 | 29 | # The jsvc implementation to use. Jsvc is required to run secure datanodes 30 | # that bind to privileged ports to provide authentication of data transfer 31 | # protocol. Jsvc is not required if SASL is configured for authentication of 32 | # data transfer protocol using non-privileged ports. 33 | #export JSVC_HOME=${JSVC_HOME} 34 | 35 | export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"} 36 | 37 | # Extra Java CLASSPATH elements. Automatically insert capacity-scheduler. 38 | for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do 39 | if [ "$HADOOP_CLASSPATH" ]; then 40 | export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f 41 | else 42 | export HADOOP_CLASSPATH=$f 43 | fi 44 | done 45 | 46 | # The maximum amount of heap to use, in MB. Default is 1000. 47 | #export HADOOP_HEAPSIZE= 48 | #export HADOOP_NAMENODE_INIT_HEAPSIZE="" 49 | 50 | # Extra Java runtime options. Empty by default. 51 | export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true -Djavax.net.debug=ssl:handshake" 52 | 53 | # Command specific options appended to HADOOP_OPTS when specified 54 | export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} -Djavax.net.debug=ssl $HADOOP_NAMENODE_OPTS" 55 | export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS" 56 | 57 | export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS" 58 | 59 | export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS" 60 | export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS" 61 | 62 | # The following applies to multiple commands (fs, dfs, fsck, distcp etc) 63 | export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS" 64 | #HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS" 65 | 66 | # On secure datanodes, user to run the datanode as after dropping privileges. 67 | # This **MUST** be uncommented to enable secure HDFS if using privileged ports 68 | # to provide authentication of data transfer protocol. This **MUST NOT** be 69 | # defined if SASL is configured for authentication of data transfer protocol 70 | # using non-privileged ports. 71 | export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER} 72 | 73 | # Where log files are stored. $HADOOP_HOME/logs by default. 74 | #export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER 75 | 76 | # Where log files are stored in the secure data environment. 77 | export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER} 78 | 79 | ### 80 | # HDFS Mover specific parameters 81 | ### 82 | # Specify the JVM options to be used when starting the HDFS Mover. 83 | # These options will be appended to the options specified as HADOOP_OPTS 84 | # and therefore may override any similar flags set in HADOOP_OPTS 85 | # 86 | # export HADOOP_MOVER_OPTS="" 87 | 88 | ### 89 | # Advanced Users Only! 
90 | ### 91 | 92 | # The directory where pid files are stored. /tmp by default. 93 | # NOTE: this should be set to a directory that can only be written to by 94 | # the user that will run the hadoop daemons. Otherwise there is the 95 | # potential for a symlink attack. 96 | export HADOOP_PID_DIR=${HADOOP_PID_DIR} 97 | export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR} 98 | 99 | # A string representing this instance of hadoop. $USER by default. 100 | export HADOOP_IDENT_STRING=$USER 101 | -------------------------------------------------------------------------------- /Security/kerberos/Namenode_Datanode/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.name.dir 5 | /data/nn1,/data/nn2 6 | 7 | 8 | 9 | dfs.data.dir 10 | /data/d1,/data/d2 11 | 12 | 13 | 14 | dfs.replication 15 | 1 16 | 17 | 18 | 19 | dfs.permissions.supergroup 20 | hadoop 21 | 22 | 23 | # Kerberos configuration 24 | 25 | 26 | dfs.block.access.token.enable 27 | true 28 | 29 | 30 | 31 | dfs.namenode.keytab.file 32 | /opt/cluster/security/nn.hdfs.keytab 33 | 34 | 35 | 36 | dfs.namenode.kerberos.principal 37 | hdfs/_HOST@CLUSTER1.COM 38 | 39 | 40 | 41 | dfs.namenode.kerberos.http.principal 42 | host/_HOST@CLUSTER1.COM 43 | 44 | 45 | 46 | dfs.web.authentication.kerberos.principal 47 | HTTP/_HOST@CLUSTER1.COM 48 | 49 | 50 | 51 | dfs.namenode.kerberos.internal.spnego.principal 52 | ${dfs.web.authentication.kerberos.principal} 53 | 54 | 55 | # Datanode configuration 56 | 57 | 58 | dfs.datanode.data.dir.perm 59 | 700 60 | 61 | 62 | 63 | dfs.datanode.address 64 | 0.0.0.0:1004 65 | 66 | 67 | 68 | dfs.datanode.http.address 69 | 0.0.0.0:1006 70 | 71 | 72 | 73 | dfs.datanode.keytab.file 74 | /opt/cluster/security/dn.hdfs.keytab 75 | 76 | 77 | 78 | dfs.datanode.kerberos.principal 79 | hdfs/_HOST@CLUSTER1.COM 80 | 81 | 82 | 83 | dfs.datanode.kerberos.http.principal 84 | host/_HOST@CLUSTER1.COM 85 | 86 | 87 | 88 | dfs.web.authentication.kerberos.principal 89 | HTTP/_HOST@CLUSTER1.COM 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /Security/kerberos/Namenode_Datanode/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | mapred.job.tracker 10 | jt1.cluster1.com:9001 11 | 12 | 13 | 14 | mapreduce.jobtracker.kerberos.principal 15 | mapred/_HOST@CLUSTER1.COM 16 | 17 | 18 | 19 | mapreduce.jobtracker.kerberos.https.principal 20 | host/_HOST@CLUSTER1.COM 21 | 22 | 23 | 24 | mapreduce.jobtracker.keytab.file 25 | /home/hadoop/mapred.keytab 26 | 27 | 28 | 29 | mapreduce.tasktracker.kerberos.principal 30 | mapred/_HOST@CLUSTER1.COM 31 | 32 | 33 | 34 | mapreduce.tasktracker.kerberos.https.principal 35 | host/_HOST@CLUSTER1.COM 36 | 37 | 38 | 39 | mapreduce.tasktracker.keytab.file 40 | /home/hadoop/tt.mapred.keytab 41 | 42 | 43 | 44 | mapred.local.dir 45 | /space/tmp 46 | 47 | 48 | 49 | mapreduce.tasktracker.group 50 | mapred 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /Security/kerberos/Namenode_Datanode/taskcontroller.cfg: -------------------------------------------------------------------------------- 1 | mapred.local.dir=/space/tmp#configured value of mapred.local.dir. It can be a list of comma separated paths. 2 | hadoop.log.dir=/home/hadoop/log#configured value of hadoop.log.dir. 
3 | mapred.tasktracker.tasks.sleeptime-before-sigkill=#sleep time before sig kill is to be sent to process group after sigterm is sent. Should be in seconds 4 | mapreduce.tasktracker.group=#configured value of mapreduce.tasktracker.group. 5 | 6 | 7 | mapred.task.tracker.task-controller 8 | org.apache.hadoop.mapred.LinuxTaskController 9 | 10 | 11 | 12 | mapreduce.tasktracker.group 13 | mapred 14 | 15 | -------------------------------------------------------------------------------- /Security/kerberos/README.md: -------------------------------------------------------------------------------- 1 | In production remove legacy encryption algo's and use only: 2 | 3 | default_tkt_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96 4 | default_tgs_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96 5 | permitted_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96 6 | 7 | 8 | Debugging: 9 | 10 | $ export HADOOP_ROOT_LOGGER=TRACE,console; export HADOOP_JAAS_DEBUG=true; export HADOOP_OPTS="-Dsun.security.krb5.debug=true" 11 | 12 | $ hadoop fs -ls / > >(tee fsls-logfile.txt) 2>&1 13 | 14 | $ export KRB5_TRACE=/tmp/kinit.log 15 | -------------------------------------------------------------------------------- /Security/kerberos/kdc.conf: -------------------------------------------------------------------------------- 1 | # On KDC server /var/kerberos/krb5kdc/kdc.conf 2 | 3 | [kdcdefaults] 4 | kdc_ports = 88 5 | kdc_tcp_ports = 88 6 | 7 | [realms] 8 | CLUSTER1.COM = { 9 | #master_key_type = aes256-cts 10 | max_renewable_life = 7d 0h 0m 0s #Needed for Kerberos auto ticket renewing for long running jobs and Hue KGT renewer 11 | acl_file = /var/kerberos/krb5kdc/kadm5.acl 12 | dict_file = /usr/share/dict/words 13 | admin_keytab = /var/kerberos/krb5kdc/kadm5.keytab 14 | supported_enctypes = aes256-cts:normal aes128-cts:normal 15 | default_principal_flags = +renewable #Needed for Kerberos auto ticket renewing for long running jobs and Hue KGT renewer 16 | } 17 | 18 | 19 | 20 | # Also, we need the below steps: 21 | kadmin.local: modprinc -maxrenewlife 90day krbtgt/NETXILLON.COM 22 | kadmin.local: modprinc -maxrenewlife 90day +allow_renewable hue/edge1.netxillon.com@NETXILLON.COM 23 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/README.md: -------------------------------------------------------------------------------- 1 | @Netxillon Technologies. These scripts I used for hadoop1.0 and 2.0, please update the service principals accordingly. Example tt, is no longer valid in hadoop2.0. 2 | Well, we can use any name for the service principal, but just to be consistent on naming conventions each service has a respective principal. 
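As a hedged illustration of that point, a Hadoop 2.x variant of the create_*_princs.sh loops might look like the sketch below; the host list and keytab names are examples only, and only hdfs, yarn, mapred and HTTP principals are typically needed since tt/jt are gone:

#!/bin/bash
# Illustrative sketch: create Hadoop 2.x service principals per host and export them to keytabs.
for h in `cat dn_host_list`
do
  kadmin.local -q "addprinc -randkey hdfs/$h"
  kadmin.local -q "addprinc -randkey yarn/$h"
  kadmin.local -q "addprinc -randkey HTTP/$h"
  kadmin.local -q "xst -norandkey -k hdfs.keytab hdfs/$h HTTP/$h"
  kadmin.local -q "xst -norandkey -k yarn.keytab yarn/$h HTTP/$h"
done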
3 | 4 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/add_users.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for j in `cat user_list` 4 | do 5 | echo -e "hadoop\nhadoop" | kadmin.local -q "addprinc $j" 6 | done 7 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/copy_keytabs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | clush -g all --copy nn.hdfs.keytab --dest=/opt/cluster/security/ 4 | clush -g all --copy dn.hdfs.keytab --dest=/opt/cluster/security/ 5 | clush -g all --copy user.hdfs.keytab --dest=/opt/cluster/security/ 6 | clush -g all -b "chown -R hdfs:hadoop /opt/cluster/" 7 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/create_dn_princs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Generate Hosts principals 4 | 5 | for i in `cat dn_host_list` 6 | do 7 | kadmin.local -q "addprinc -randkey host/$i" 8 | kadmin.local -q "addprinc -randkey HTTP/$i" 9 | kadmin.local -q "addprinc -randkey hdfs/$i" 10 | kadmin.local -q "xst -norandkey -k dn.hdfs.keytab host/$i" 11 | kadmin.local -q "xst -norandkey -k dn.hdfs.keytab HTTP/$i" 12 | kadmin.local -q "xst -norandkey -k dn.hdfs.keytab hdfs/$i" 13 | done 14 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/create_nn_princs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for k in `cat nn_host_list` 4 | do 5 | kadmin.local -q "addprinc -randkey host/$k" 6 | kadmin.local -q "addprinc -randkey HTTP/$k" 7 | kadmin.local -q "addprinc -randkey hdfs/$k" 8 | 9 | kadmin.local -q "xst -norandkey -k nn.hdfs.keytab host/$k" 10 | kadmin.local -q "xst -norandkey -k nn.hdfs.keytab HTTP/$k" 11 | kadmin.local -q "xst -norandkey -k nn.hdfs.keytab hdfs/$k" 12 | done 13 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/create_partions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in `cat hosts` 4 | do 5 | ssh $i 'echo -e "o\nn\np\n1\n\n\nw"' 6 | done 7 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/create_user_keytab.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for k in `cat user_host_list` 4 | do 5 | kadmin.local -q "xst -norandkey -k user.hdfs.keytab host/$k" 6 | done 7 | 8 | for p in `cat user_list` 9 | do 10 | kadmin.local -q "xst -norandkey -k user.hdfs.keytab $p" 11 | done 12 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/delete_list: -------------------------------------------------------------------------------- 1 | HTTP/dn1.cluster1.com@CLUSTER1.COM 2 | HTTP/dn2.cluster1.com@CLUSTER1.COM 3 | HTTP/dn3.cluster1.com@CLUSTER1.COM 4 | HTTP/dn4.cluster1.com@CLUSTER1.COM 5 | HTTP/nn1.cluster1.com@CLUSTER1.COM 6 | dn/dn1.cluster1.com@CLUSTER1.COM 7 | dn/dn2.cluster1.com@CLUSTER1.COM 8 | dn/dn3.cluster1.com@CLUSTER1.COM 9 | dn/dn4.cluster1.com@CLUSTER1.COM 10 | host/dn1.cluster1.com@CLUSTER1.COM 11 | 
host/dn2.cluster1.com@CLUSTER1.COM 12 | host/dn3.cluster1.com@CLUSTER1.COM 13 | host/dn4.cluster1.com@CLUSTER1.COM 14 | host/nn1.cluster1.com@CLUSTER1.COM 15 | nn/nn1.cluster1.com@CLUSTER1.COM 16 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/delete_princs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source list_princs.sh | egrep "host|nn|http|dn|mapred|jt|tt" > delete_list 4 | 5 | for i in `cat delete_list` 6 | do 7 | kadmin.local -q "delprinc -force $i" 8 | done 9 | 10 | rm -rf *.keytab 11 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/dn_host_list: -------------------------------------------------------------------------------- 1 | dn1.cluster1.com 2 | dn2.cluster1.com 3 | dn3.cluster1.com 4 | dn4.cluster1.com 5 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/hosts: -------------------------------------------------------------------------------- 1 | 192.168.1.10 2 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/install_krb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | yum clean all 4 | yum install -y krb5-server krb5-workstation krb5-devel pam_krb5 krb5-libs 5 | 6 | yum install rng-tools -y 7 | 8 | echo 'EXTRAOPTIONS="-r /dev/urandom"' > /etc/sysconfig/rngd 9 | service rngd restart 10 | yum install ntp -y 11 | 12 | chkconfig ntpd on 13 | chkconfig rngd on 14 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/list_princs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kadmin.local -q "listprincs" 4 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/nn_host_list: -------------------------------------------------------------------------------- 1 | nn1.cluster1.com 2 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/setup_kerberos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: The Kerberos realm does not need to be the same as the domain name. Just update the [domain_realm] section mapping correctly.
4 | echo -e "redhat\nredhat" | kdb5_util create -r NETXILLON.COM -s 5 | 6 | echo -e "redhat\nredhat" | kadmin.local -q "addprinc root/admin" 7 | 8 | kadmin.local -q "ktadd -k /var/kerberos/krb5kdc/kadm5.keytab kadmin/admin" 9 | kadmin.local -q "ktadd -k /var/kerberos/krb5kdc/kadm5.keytab kadmin/changepw" 10 | 11 | /etc/init.d/kadmin restart 12 | /etc/init.d/krb5kdc restart 13 | 14 | chkconfig krb5kdc on 15 | chkconfig kadmin on 16 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/user_host_list: -------------------------------------------------------------------------------- 1 | nn1.cluster1.com 2 | dn1.cluster1.com 3 | dn2.cluster1.com 4 | dn3.cluster1.com 5 | dn4.cluster1.com 6 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_scripts/user_list: -------------------------------------------------------------------------------- 1 | hdfs 2 | -------------------------------------------------------------------------------- /Security/kerberos/kerberos_user_mappings.txt: -------------------------------------------------------------------------------- 1 | This file talks about mapping the kerberos princial with local users. 2 | 3 | We can have a NN principal as: 4 | nn/_HOST@CLUSTER1.COM or hdfs/_HOST@CLUSTER1.COM 5 | 6 | If it is the first way, when Datanode sends the user (dnUsername), it will be as user "dn", which does not exist anywhere. So, NN will complain that the user "dn" is not part of supergroup. Which is right! 7 | 8 | For this we need to map users as below on all nodes under core-site.xml 9 | 10 | 11 | [hdfs@nn1 ~]$ hadoop org.apache.hadoop.security.HadoopKerberosName host/nn1.cluster1.com@CLUSTER1.COM 12 | Name: host/nn1.cluster1.com@CLUSTER1.COM to host 13 | 14 | [hdfs@nn1 ~]$ hadoop org.apache.hadoop.security.HadoopKerberosName dn/dn1.cluster1.com@CLUSTER1.COM 15 | Name: dn/dn1.cluster1.com@CLUSTER1.COM to dn 16 | 17 | After adding the rule as below in core-site.xml: 18 | 19 | 20 | hadoop.security.auth_to_local 21 | 22 | RULE:[2:$1/$2@$0](dn/.*@.*CLUSTER1.COM)s/.*/hdfs/ 23 | DEFAULT 24 | 25 | 26 | 27 | [hdfs@nn1 hadoop]$ hadoop org.apache.hadoop.security.HadoopKerberosName host/nn1.cluster1.com@CLUSTER1.COM 28 | Name: host/nn1.cluster1.com@CLUSTER1.COM to host 29 | [hdfs@nn1 ~]$ hadoop org.apache.hadoop.security.HadoopKerberosName dn/dn1.cluster1.com@CLUSTER1.COM 30 | Name: dn/dn1.cluster1.com@CLUSTER1.COM to hdfs 31 | 32 | See above that the user "dn" is translated to user "hdfs", which is part of the supergroup. All Good! 
33 | 34 | Good Read: https://www.cloudera.com/documentation/enterprise/5-8-x/topics/cdh_sg_kerbprin_to_sn.html 35 | -------------------------------------------------------------------------------- /Security/kerberos/krb5.conf: -------------------------------------------------------------------------------- 1 | # On all nodes, including KDC: /etc/krb5.conf 2 | 3 | [libdefaults] 4 | default_realm = CLUSTER1.COM 5 | dns_lookup_realm = false 6 | dns_lookup_kdc = false 7 | ticket_lifetime = 24h 8 | renew_lifetime = 7d 9 | forwardable = true 10 | 11 | default_tkt_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96 12 | default_tgs_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96 13 | permitted_enctypes = aes256-cts-hmac-sha1-96 aes128-cts-hmac-sha1-96 14 | 15 | #default_tkt_enctypes = des3-cbc-sha1 des-cbc-crc 16 | #default_tgs_enctypes = des3-cbc-sha1 des-cbc-crc 17 | #permitted_enctypes = des3-cbc-sha1 des-cbc-crc 18 | udp_preference_limit = 1 19 | 20 | [realms] 21 | CLUSTER1.COM = { 22 | kdc = repo.cluster1.com:88 23 | admin_server = repo.cluster1.com:749 24 | default_domain = cluster1.com 25 | } 26 | 27 | [domain_realm] 28 | .cluster1.com = CLUSTER1.COM 29 | cluster1.com = CLUSTER1.COM 30 | 31 | [logging] 32 | kdc = FILE:/var/log/krb5kdc.log 33 | admin_server = FILE:/var/log/kadmin.log 34 | default = FILE:/var/log/krb5lib.log 35 | -------------------------------------------------------------------------------- /Security/kms/kms-setup: -------------------------------------------------------------------------------- 1 | core-site.xml 2 | 3 | 4 | hadoop.security.key.provider.path 5 | kms://http@nn1.cluster1.com:16000/kms 6 | 7 | 8 | hdfs-site.xml file and make the changes shown here: 9 | 10 | dfs.encryption.key.provider.uri 11 | kms://http@nn1.cluster1.com:16000/kms 12 | 13 | 14 | /opt/cluster/hadoop/etc/hadoop/kms-env.sh: 15 | 16 | export KMS_TEMP=${KMS_HOME}/temp 17 | 18 | 19 | kms.sh start 20 | hadoop key list 21 | hadoop key create key1 22 | hadoop fs -mkdir /secure_zone 23 | hdfs crypto -createZone -keyName key1 -path /secure_zone 24 | 25 | hdfs crypto -listZones 26 | 27 | hadoop fs -put wordcount /secure_zone 28 | hadoop fs -cat /secure_zone/wordcount 29 | hadoop fs -mkdir /unsecure 30 | -------------------------------------------------------------------------------- /Security/ldap/Installation_steps: -------------------------------------------------------------------------------- 1 | yum -y install openldap compat-openldap openldap-clients openldap-servers openldap-servers-sql openldap-devel 2 | yum -y install nss-pam-ldapd pam_ldap 3 | 4 | cp /usr/share/openldap-servers/DB_CONFIG.example /var/lib/ldap/DB_CONFIG 5 | cp /usr/share/openldap-servers/slapd.conf.obsolete slapd.conf 6 | 7 | edit "slapd.conf" and add/change the lines as below: 8 | 9 | suffix "dc=cluster1,dc=com" 10 | rootdn "cn=Manager,dc=cluster1,dc=com" 11 | rootpw {SSHA}2F2+4O43lt9jnPLrh6gjJ8tIVksTSSEg 12 | 13 | The password is generated using "slappasswd" 14 | 15 | slaptest -f /etc/openldap/slapd.conf -F /etc/openldap/slapd.d/ 16 | 17 | chown -R ldap:ldap /var/run/openldap/ 18 | chown -R ldap:ldap /var/lib/ldap 19 | chown -R ldap:ldap /etc/openldap/slap.d 20 | 21 | ldapadd -f base.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 22 | ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 23 | ldapadd -f base1.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 24 | ldapadd -f users.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 25 | ldapadd -f adduser.ldif -D 
cn=Manager,dc=cluster1,dc=com -x -w redhat 26 | 27 | #PhpldapAdmin installation; need epel repo 28 | 29 | yum -y install httpd php php-ldap phpldapadmin 30 | 31 | Then change the files as below: 32 | 33 | /etc/phpldapadmin/config.php 34 | 35 | $servers->setValue('server','name','Netxillon LDAP Server'); 36 | $servers->setValue('server','host','192.168.1.254'); 37 | $servers->setValue('login','bind_id','cn=Manager,dc=cluster1,dc=com'); 38 | 39 | comment //$servers->setValue('login','attr','uid'); 40 | uncomment $servers->setValue('login','attr','dn'); 41 | 42 | Change Deny rule to Allow in the http and restart the httpd 43 | 44 | # On Client nodes: 45 | 46 | authconfig --enableldap --enableldapauth --ldapserver=repo.cluster1.com --ldapbasedn="dc=cluster1,dc=com" --enablemkhomedir --update 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /Security/ldap/addmembers.ldif: -------------------------------------------------------------------------------- 1 | dn: cn=hadoop,ou=groups,dc=cluster1,dc=com 2 | changetype: modify 3 | add: memberuid 4 | memberuid: hdfs1 5 | -------------------------------------------------------------------------------- /Security/ldap/adduser.ldif: -------------------------------------------------------------------------------- 1 | dn: uid=hdfs1,ou=users,dc=cluster1,dc=com 2 | objectClass: top 3 | objectClass: account 4 | objectClass: posixAccount 5 | objectClass: shadowAccount 6 | cn: hdfs1 7 | uid: hdfs1 8 | uidNumber: 509 9 | gidNumber: 509 10 | homeDirectory: /home/hdfs1 11 | loginShell: /bin/bash 12 | gecos: adam 13 | userPassword: {crypt}x 14 | shadowLastChange: 0 15 | shadowMax: 0 16 | shadowWarning: 0 17 | -------------------------------------------------------------------------------- /Security/ldap/base.ldif: -------------------------------------------------------------------------------- 1 | dn: dc=cluster1,dc=com 2 | objectClass: dcObject 3 | objectClass: organization 4 | dc: cluster1 5 | o : cluster1 6 | -------------------------------------------------------------------------------- /Security/ldap/base1.ldif: -------------------------------------------------------------------------------- 1 | dn:ou=groups, dc=cluster1, dc=com 2 | objectclass: top 3 | objectclass: organizationalUnit 4 | ou: groups 5 | 6 | dn:ou=people, dc=cluster1, dc=com 7 | objectclass: top 8 | objectclass: organizationalUnit 9 | ou: people 10 | -------------------------------------------------------------------------------- /Security/ldap/base2.ldif: -------------------------------------------------------------------------------- 1 | dn: dc=cluster1,dc=com 2 | objectClass: top 3 | objectclass: organization 4 | o: cluster1 5 | -------------------------------------------------------------------------------- /Security/ldap/commands: -------------------------------------------------------------------------------- 1 | ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/cosine.ldif 2 | 962 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/* 3 | 963 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/collective.ldif 4 | 964 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/core.ldif 5 | 965 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/inetorgperson.ldif 6 | 966 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/nis.ldif 7 | 967 ldapadd -Y EXTERNAL -H ldapi:/// -f /etc/openldap/schema/ppolicy.ldif 8 | 969 ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 9 | 971 ldapadd -f 
adduser.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 10 | 972 ldappasswd -s welcome123 -W -D "cn=Manager,dc=cluster1,dc=com" -x "uid=hdfs1,ou=users,dc=cluster1,dc=com" 11 | 974 yum install pam_ldap -y 12 | 976 cat /etc/openldap/ldap.conf 13 | 989 vi /etc/openldap/ldap.conf 14 | 997 authconfig --enableldap --enableldapauth --ldapserver=repo.cluster1.com --ldapbasedn="dc=cluster1,dc=com" --enablemkhomedir --update 15 | 998 cat /etc/openldap/ldap.conf 16 | 1004 cd ldap/ 17 | 1006 ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 18 | 1008 ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 19 | 1010 ldapsearch -x -W -D "cn=Manager,dc=cluster1,dc=com" -b "uid=hdfs1,ou=users,dc=cluster1,dc=com" "(objectclass=*)" 20 | 1011 ldapsearch -x -W -D "cn=Manager,dc=cluster1,dc=com" -b "uid=hdfs1,ou=Users,dc=cluster1,dc=com" "(objectclass=*)" 21 | 1012 ldapdelete -W -D "cn=Manager,dc=cluster1,dc=com" "uid=hdfs1,ou=users,dc=cluster1,dc=com" 22 | 1013 ldapsearch -x -W -D "cn=Manager,dc=cluster1,dc=com" -b "uid=hdfs1,ou=Users,dc=cluster1,dc=com" "(objectclass=*)" 23 | 1016 ldapadd -f groupadd.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 24 | 1017 ldapadd -f adduser.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 25 | 1020 ldapadd -f addmembers.ldif -D cn=Manager,dc=cluster1,dc=com -x -w redhat 26 | 1024 ldapsearch -x -W -D "cn=Manager,dc=cluster1,dc=com" -b "uid=hdfs1,ou=Users,dc=cluster1,dc=com" "(objectclass=*)" 27 | -------------------------------------------------------------------------------- /Security/ldap/groupadd.ldif: -------------------------------------------------------------------------------- 1 | dn: cn=hdfs1,ou=groups,dc=cluster1,dc=com 2 | objectClass: top 3 | objectClass: posixGroup 4 | gidNumber: 509 5 | -------------------------------------------------------------------------------- /Security/ldap/slapd.conf.obsolete: -------------------------------------------------------------------------------- 1 | # 2 | # See slapd.conf(5) for details on configuration options. 3 | # This file should NOT be world readable. 4 | # 5 | 6 | include /etc/openldap/schema/corba.schema 7 | include /etc/openldap/schema/core.schema 8 | include /etc/openldap/schema/cosine.schema 9 | include /etc/openldap/schema/duaconf.schema 10 | include /etc/openldap/schema/dyngroup.schema 11 | include /etc/openldap/schema/inetorgperson.schema 12 | include /etc/openldap/schema/java.schema 13 | include /etc/openldap/schema/misc.schema 14 | include /etc/openldap/schema/nis.schema 15 | include /etc/openldap/schema/openldap.schema 16 | include /etc/openldap/schema/ppolicy.schema 17 | include /etc/openldap/schema/collective.schema 18 | 19 | # Allow LDAPv2 client connections. This is NOT the default. 20 | allow bind_v2 21 | 22 | # Do not enable referrals until AFTER you have a working directory 23 | # service AND an understanding of referrals. 
24 | #referral ldap://root.openldap.org 25 | 26 | pidfile /var/run/openldap/slapd.pid 27 | argsfile /var/run/openldap/slapd.args 28 | 29 | # Load dynamic backend modules 30 | # - modulepath is architecture dependent value (32/64-bit system) 31 | # - back_sql.la overlay requires openldap-server-sql package 32 | # - dyngroup.la and dynlist.la cannot be used at the same time 33 | 34 | # modulepath /usr/lib/openldap 35 | # modulepath /usr/lib64/openldap 36 | 37 | # moduleload accesslog.la 38 | # moduleload auditlog.la 39 | # moduleload back_sql.la 40 | # moduleload chain.la 41 | # moduleload collect.la 42 | # moduleload constraint.la 43 | # moduleload dds.la 44 | # moduleload deref.la 45 | # moduleload dyngroup.la 46 | # moduleload dynlist.la 47 | # moduleload memberof.la 48 | # moduleload pbind.la 49 | # moduleload pcache.la 50 | # moduleload ppolicy.la 51 | # moduleload refint.la 52 | # moduleload retcode.la 53 | # moduleload rwm.la 54 | # moduleload seqmod.la 55 | # moduleload smbk5pwd.la 56 | # moduleload sssvlv.la 57 | # moduleload syncprov.la 58 | # moduleload translucent.la 59 | # moduleload unique.la 60 | # moduleload valsort.la 61 | 62 | # The next three lines allow use of TLS for encrypting connections using a 63 | # dummy test certificate which you can generate by running 64 | # /usr/libexec/openldap/generate-server-cert.sh. Your client software may balk 65 | # at self-signed certificates, however. 66 | TLSCACertificatePath /etc/openldap/certs 67 | TLSCertificateFile "\"OpenLDAP Server\"" 68 | TLSCertificateKeyFile /etc/openldap/certs/password 69 | 70 | # Sample security restrictions 71 | # Require integrity protection (prevent hijacking) 72 | # Require 112-bit (3DES or better) encryption for updates 73 | # Require 63-bit encryption for simple bind 74 | # security ssf=1 update_ssf=112 simple_bind=64 75 | 76 | # Sample access control policy: 77 | # Root DSE: allow anyone to read it 78 | # Subschema (sub)entry DSE: allow anyone to read it 79 | # Other DSEs: 80 | # Allow self write access 81 | # Allow authenticated users read access 82 | # Allow anonymous users to authenticate 83 | # Directives needed to implement policy: 84 | # access to dn.base="" by * read 85 | # access to dn.base="cn=Subschema" by * read 86 | # access to * 87 | # by self write 88 | # by users read 89 | # by anonymous auth 90 | # 91 | # if no access controls are present, the default policy 92 | # allows anyone and everyone to read anything but restricts 93 | # updates to rootdn. (e.g., "access to * by * read") 94 | # 95 | # rootdn can always read and write EVERYTHING! 96 | 97 | # enable on-the-fly configuration (cn=config) 98 | database config 99 | access to * 100 | by dn.exact="gidNumber=0+uidNumber=0,cn=peercred,cn=external,cn=auth" manage 101 | by * none 102 | 103 | # enable server status monitoring (cn=monitor) 104 | database monitor 105 | access to * 106 | by dn.exact="gidNumber=0+uidNumber=0,cn=peercred,cn=external,cn=auth" read 107 | by dn.exact="cn=Manager,dc=my-domain,dc=com" read 108 | by * none 109 | 110 | ####################################################################### 111 | # database definitions 112 | ####################################################################### 113 | 114 | database bdb 115 | suffix "dc=my-domain,dc=com" 116 | checkpoint 1024 15 117 | rootdn "cn=Manager,dc=my-domain,dc=com" 118 | # Cleartext passwords, especially for the rootdn, should 119 | # be avoided. See slappasswd(8) and slapd.conf(5) for details. 120 | # Use of strong authentication encouraged. 
121 | # rootpw secret 122 | # rootpw {crypt}ijFYNcSNctBYg 123 | 124 | # The database directory MUST exist prior to running slapd AND 125 | # should only be accessible by the slapd and slap tools. 126 | # Mode 700 recommended. 127 | directory /var/lib/ldap 128 | 129 | # Indices to maintain for this database 130 | index objectClass eq,pres 131 | index ou,cn,mail,surname,givenname eq,pres,sub 132 | index uidNumber,gidNumber,loginShell eq,pres 133 | index uid,memberUid eq,pres,sub 134 | index nisMapName,nisMapEntry eq,pres,sub 135 | 136 | # Replicas of this database 137 | #replogfile /var/lib/ldap/openldap-master-replog 138 | #replica host=ldap-1.example.com:389 starttls=critical 139 | # bindmethod=sasl saslmech=GSSAPI 140 | # authcId=host/ldap-master.example.com@EXAMPLE.COM 141 | -------------------------------------------------------------------------------- /Security/ldap/test.ldif: -------------------------------------------------------------------------------- 1 | dn: cn=Jim Bob,ou=people,dc=cluster1,dc=com 2 | objectclass: top 3 | objectclass: person 4 | objectclass: organizationalPerson 5 | objectclass: inetOrgPerson 6 | cn: Jim Bob 7 | sn: Bob 8 | mail: jimbob@example.com 9 | ou: sales 10 | -------------------------------------------------------------------------------- /Security/ldap/users.ldif: -------------------------------------------------------------------------------- 1 | dn: ou=Users,dc=cluster1,dc=com 2 | objectClass: organizationalUnit 3 | ou: Users 4 | -------------------------------------------------------------------------------- /Spark/Spark_build: -------------------------------------------------------------------------------- 1 | $ which mvn 2 | /opt/apache-maven-3.3.9/bin/mvn 3 | 4 | $ cat /etc/profile.d/maven.sh 5 | export JAVA_HOME=/usr/java/latest 6 | export M3_HOME=/opt/apache-maven-3.3.9 7 | export PATH=$JAVA_HOME/bin:${M3_HOME}/bin:/home/ec2-user/jruby-9.1.1.0/bin/:$PATH 8 | 9 | 10 | Build: 11 | 12 | $ git clone git://git.apache.org/spark.git 13 | $ cd spark 14 | $ mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -Dscala-2.11 -Phive -Phive-thriftserver -DskipTests clean package 15 | 16 | To build a distribution: 17 | 18 | ./dev/make-distribution.sh --tgz -Phadoop-2.7 -Phive -Phive-thriftserver -Pyarn -DskipTests 19 | -------------------------------------------------------------------------------- /Spark/examples.txt: -------------------------------------------------------------------------------- 1 | # To get rid of the warning on Hadoop native library 2 | export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native 3 | 4 | [hdfs@edge1 ~]$ spark-shell --master yarn --driver-memory 512m --executor-memory 512m 5 | scala> 6 | val file = sc.textFile("/test") 7 | val counts = file.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey(_ + _) 8 | counts.saveAsTextFile("/tmp/wordcount") 9 | counts.count() 10 | 11 | 12 | Examples using Python: 13 | 14 | spark-submit ~/sparkPython/square.py --master yarn --deploy-mode cluster 15 | 16 | spark-submit ~/sparkPython/wordcount.py --master yarn --deploy-mode cluster (Copy the file on which count needs to be done on hdfs path) 17 | -------------------------------------------------------------------------------- /Spark/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.master yarn 2 | spark.eventLog.enabled true 3 | spark.driver.memory 1024m 4 | spark.yarn.am.memory 1024m 5 | 6 | spark.yarn.jars hdfs://nn1.dilithium.com:9000/spark_jars/jars/* 7 | or 8 | spark.yarn.archive 
hdfs://nn1.dilithium.com:9000/spark_jars/spark-libs.jar 9 | 10 | #if using archive: $ jar cv0f spark-libs.jar -C $SPARK_HOME/jars/ . 11 | 12 | spark.serializer org.apache.spark.serializer.KryoSerializer 13 | spark.eventLog.dir hdfs://nn1.dilithium.com:9000/spark_logs 14 | spark.history.fs.logDirectory hdfs://nn1.dilithium.com:9000/spark_logs 15 | spark.history.provider org.apache.spark.deploy.history.FsHistoryProvider 16 | spark.history.fs.update.interval 10s 17 | spark.history.ui.port 18080 18 | 19 | 20 | 21 | yarn-site.xml (Tested on spark 2.2.1) 22 | 23 | 24 | 25 | yarn.nodemanager.aux-services.spark_shuffle.class 26 | org.apache.spark.network.yarn.YarnShuffleService 27 | 28 | 29 | 30 | yarn.nodemanager.aux-services.spark2_shuffle.class 31 | org.apache.spark.network.yarn.YarnShuffleService 32 | 33 | 34 | 35 | yarn.nodemanager.aux-services 36 | mapreduce_shuffle,spark_shuffle,spark2_shuffle 37 | 38 | -------------------------------------------------------------------------------- /Spark/sparkPython/erfunction.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | sc = SparkContext() 3 | log = sc.textFile("/Users/pkuma380/sparkPython/error.txt") 4 | 5 | def errorcontain(s): 6 | return "ERROR" in s 7 | f_log = log.filter(errorcontain) 8 | for line in f_log.take(10): 9 | print "Start output", line 10 | 11 | -------------------------------------------------------------------------------- /Spark/sparkPython/error.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | sc = SparkContext() 3 | log = sc.textFile("/Users/pkuma380/sparkPython/error.txt") 4 | f_log = log.filter(lambda data: "ERROR" in data) 5 | for line in f_log.take(10): 6 | print line 7 | -------------------------------------------------------------------------------- /Spark/sparkPython/error.txt: -------------------------------------------------------------------------------- 1 | Spark Command: /Library/Java/JavaVirtualMachines/jdk1.8.0_60.jdk/Contents/Home//bin/java -cp /usr/local/spark/spark-1.3.1-bin-hadoop2.6/sbin/../conf:/usr/local/spark/spark-1.3.1-bin-hadoop2.6/lib/spark-assembly-1.3.1-hadoop2.6.0.jar:/usr/local/spark/spark-1.3.1-bin-hadoop2.6/lib/datanucleus-api-jdo-3.2.6.jar:/usr/local/spark/spark-1.3.1-bin-hadoop2.6/lib/datanucleus-core-3.2.10.jar:/usr/local/spark/spark-1.3.1-bin-hadoop2.6/lib/datanucleus-rdbms-3.2.9.jar -Dspark.akka.logLifecycleEvents=true -Xms512m -Xmx512m org.apache.spark.deploy.master.Master --ip BGLC02M965AFH01 --port 7077 --webui-port 8080 2 | ======================================== 3 | 4 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 5 | 16/02/17 20:19:40 INFO Master: Registered signal handlers for [TERM, HUP, INT] 6 | 16/02/17 20:20:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable 7 | 16/02/17 20:20:11 INFO SecurityManager: Changing view acls to: pkuma380 8 | 16/02/17 20:20:11 INFO SecurityManager: Changing modify acls to: pkuma380 9 | 16/02/17 20:20:11 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(pkuma380); users with modify permissions: Set(pkuma380) 10 | 16/02/17 20:20:11 INFO Slf4jLogger: Slf4jLogger started 11 | 16/02/17 20:20:11 INFO Remoting: Starting remoting 12 | 16/02/17 20:20:12 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkMaster@BGLC02M965AFH01:7077] 13 | 16/02/17 20:20:12 INFO Remoting: Remoting now listens on addresses: [akka.tcp://sparkMaster@BGLC02M965AFH01:7077] 14 | 16/02/17 20:20:12 INFO Utils: Successfully started service 'sparkMaster' on port 7077. 15 | 16/02/17 20:20:12 INFO Server: jetty-8.y.z-SNAPSHOT 16 | 16/02/17 20:20:12 INFO AbstractConnector: Started SelectChannelConnector@BGLC02M965AFH01:6066 17 | 16/02/17 20:20:12 INFO Utils: Successfully started service on port 6066. 18 | 16/02/17 20:20:12 INFO StandaloneRestServer: Started REST server for submitting applications on port 6066 19 | 16/02/17 20:20:12 INFO Master: Starting Spark master at spark://BGLC02M965AFH01:7077 20 | 16/02/17 20:20:12 INFO Master: Running Spark version 1.3.1 21 | 16/02/17 20:20:13 INFO Server: jetty-8.y.z-SNAPSHOT 22 | 16/02/17 20:20:13 INFO AbstractConnector: Started SelectChannelConnector@0.0.0.0:8080 23 | 16/02/17 20:20:13 INFO Utils: Successfully started service 'MasterUI' on port 8080. 24 | 16/02/17 20:20:13 INFO MasterWebUI: Started MasterWebUI at http://192.168.0.51:8080 25 | 16/02/17 20:20:13 INFO Master: I have been elected leader! New state: ALIVE 26 | 16/02/18 15:14:37 ERROR Master: RECEIVED SIGNAL 15: SIGTERM 27 | 16/02/18 15:14:37 ERROR Master: RECEIVED SIGNAL 15: SIGTERM 28 | -------------------------------------------------------------------------------- /Spark/sparkPython/logparser.py: -------------------------------------------------------------------------------- 1 | import sys 2 | data = "/Users/pkuma380/sparkPython/error.txt" 3 | for line in open(data): 4 | #columns = line.split(" ") 5 | #if len(columns) > 1: 6 | if '16' in line: 7 | date =line.split("(\s+)") 8 | 9 | print date 10 | -------------------------------------------------------------------------------- /Spark/sparkPython/pivot.txt: -------------------------------------------------------------------------------- 1 | userid age country number_of_calls 2 | x01 41 US 3 3 | x01 41 UK 1 4 | x01 41 CA 2 5 | x01 72 US 4 6 | x02 72 UK 6 7 | x02 72 CA 7 8 | x02 72 XX 8 9 | x02 72 XB 8 10 | x02 72 NA 9 11 | -------------------------------------------------------------------------------- /Spark/sparkPython/square.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | sc = SparkContext() 3 | #data = sc.parallelize([1,2,3,4,5]) 4 | 5 | def square(sq): 6 | return sq * sq 7 | data = sc.parallelize([1,2,3,4,5]) 8 | sq= data.map(square) 9 | for line in sq.collect(): 10 | print line 11 | 12 | -------------------------------------------------------------------------------- /Spark/sparkPython/wordcount.py: -------------------------------------------------------------------------------- 1 | from operator import add 2 | from pyspark import SparkContext 3 | sc= SparkContext() 4 | file = sc.textFile("/user/hdfs/sparkPython/wordcount.txt") 5 | word = file.flatMap(lambda x: x.split(" ")) 6 | mapword = 
word.map(lambda x: (x, 1)) 7 | reduceword = mapword.reduceByKey(add) 8 | output = reduceword.collect() 9 | nums = sc.parallelize([output]) 10 | for i in nums.collect(): 11 | print i 12 | -------------------------------------------------------------------------------- /Spark/sparkPython/wordcount.txt: -------------------------------------------------------------------------------- 1 | 49 2 | 2 3 | volumename 4 | ainduk 5 | apps 6 | axp 7 | axp.admin 8 | axp.apptests 9 | axp.hivequerylogs 10 | axp.mirror 11 | axp.mirror.home 12 | bjaya 13 | bmanikya 14 | dprichar 15 | dschexna 16 | gsing140 17 | hyalama 18 | idn 19 | idn.home 20 | kvall3 21 | kvarakan 22 | mapr.cldb.internal 23 | mapr.cluster.root 24 | mapr.hbase 25 | mapr.tmp 26 | mirror-cstonedb-vol2-test 27 | mirror-silver-datameer 28 | mysqlbcp 29 | naveenmirrortest 30 | ngupt131 31 | phari 32 | pigtemp 33 | PlatinumDR_Mysql_Backups 34 | psing141 35 | rsyncappsvrs 36 | rsynces 37 | rsyncmllab 38 | rsyncplatdrm5 39 | rsyncsilverm5 40 | rsyncsilverm7 41 | rsyncskytree 42 | rsyncstorm 43 | smanubo 44 | spark 45 | spool4 46 | spoudel 47 | twilli1 48 | ukris 49 | users 50 | vkomat 51 | zsmit3 52 | zsmit3 53 | -------------------------------------------------------------------------------- /Spark/spark_command.txt: -------------------------------------------------------------------------------- 1 | YARN Node Labels: 2 | 3 | $ spark-submit --class org.apache.spark.examples.SparkPi --queue root.prod --conf spark.yarn.am.nodeLabelExpression=spark --conf spark.yarn.executor.nodeLabelExpression=spark --executor-memory 512m --num-executors 1 --driver-memory 512m --master yarn --deploy-mode cluster /opt/cloudera/parcels/CDH/jars/spark-examples*.jar 10 4 | 5 | # https://www.ibm.com/support/pages/yarn-node-labels-label-based-scheduling-and-resource-isolation-hadoop-dev 6 | # https://docs.cloudera.com/runtime/7.0.2/yarn-allocate-resources/topics/yarn-configuring-node-labels.html 7 | # https://docs.cloudera.com/cdp-private-cloud-base/7.1.5/yarn-allocate-resources/topics/yarn-associate-node-labels-with-queues.html 8 | 9 | Start cluster, after settign ssh-passphrase from master (Only for non Yarn cluster) 10 | 11 | $ /opt/cluster/spark/sbin/stop-all.sh 12 | $ /opt/cluster/spark/sbin/start-all.sh 13 | 14 | 15 | Tip 16 | === 17 | To avoid loading assembly jar every time, set env variable as below, as copying jar to hadoop 18 | 19 | export SPARK_JAR=hdfs://nn1.dilithium.com:9000/user/hdfs/share/lib/spark-assembly-1.4.1-hadoop2.6.0.jar 20 | 21 | Submit jobs in 3 modes 22 | 23 | $ spark-submit --class org.apache.spark.examples.SparkPi /opt/cluster/spark/lib/spark-examples-1.6.1-hadoop2.2.0.jar 10 --master spark://rt1.cyrus.com:7077 24 | $ spark-submit --class org.apache.spark.examples.SparkPi /opt/cluster/spark/lib/spark-examples-1.6.1-hadoop2.2.0.jar 100 --master yarn --deploy-mode cluster 25 | $ spark-submit --class org.apache.spark.examples.SparkPi /opt/cluster/spark/lib/spark-examples-1.6.1-hadoop2.2.0.jar 100 --master yarn --deploy-mode client 26 | 27 | $ spark-submit --class org.apache.spark.examples.SparkPi /usr/lib/spark/lib/spark-examples-1.6.1-hadoop2.7.2-amzn-1.jar 100 --master yarn-master 28 | 29 | Other ways of running it 30 | ------------------------- 31 | $ spark-shell --master yarn 32 | $ spark-submit --verbose ~/sparkPython/square.py --master yarn --deploy-mode cluster 33 | $ spark-submit --verbose ~/sparkPython/square.py --master yarn-cluster --deploy-mode cluster 34 | $ spark-submit --verbose ~/sparkPython/square.py --master yarn-client 
--deploy-mode cluster 35 | 36 | -------------------------------------------------------------------------------- /Spark/spark_standalone_cluster.txt: -------------------------------------------------------------------------------- 1 | Spark Standalone Cluster Setup 2 | ============================== 3 | 4 | # nodes 5 | 6 | edge1.dilithium.com(master) 7 | edge2.dilithium.com(worker) 8 | hbm1.dilithium.com(worker) 9 | ------------------------------------- 10 | [hdfs@edge1 conf]$ cat spark-env.sh 11 | 12 | HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop 13 | 14 | SPARK_LOCAL_IP=192.168.1.18 15 | SPARK_MASTER_IP=edge1.dilithium.com 16 | 17 | export SPARK_WORKER_MEMORY=256m 18 | export SPARK_EXECUTOR_MEMORY=128m 19 | export SPARK_WORKER_INSTANCES=1 20 | export SPARK_WORKER_CORES=1 21 | export SPARK_WORKER_DIR=/home/hdfs/work/sparkdata 22 | ------------------------------------------------- 23 | [hdfs@edge2 conf]$ cat spark-env.sh 24 | 25 | HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop 26 | 27 | SPARK_LOCAL_IP=192.168.1.19 28 | SPARK_MASTER_IP=edge1.dilithium.com 29 | 30 | export SPARK_WORKER_MEMORY=256m 31 | export SPARK_EXECUTOR_MEMORY=128m 32 | export SPARK_WORKER_INSTANCES=2 33 | export SPARK_WORKER_CORES=1 34 | export SPARK_WORKER_DIR=/home/hdfs/work/sparkdata 35 | ------------------------------------------------- 36 | [hdfs@hbm1 conf]$ cat spark-env.sh 37 | 38 | HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop 39 | 40 | SPARK_LOCAL_IP=192.168.1.30 41 | SPARK_MASTER_IP=edge1.dilithium.com 42 | 43 | export SPARK_WORKER_MEMORY=256m 44 | export SPARK_EXECUTOR_MEMORY=128m 45 | export SPARK_WORKER_INSTANCES=2 46 | export SPARK_WORKER_CORES=1 47 | export SPARK_WORKER_DIR=/home/hdfs/work/sparkdata 48 | -------------------------------------------------- 49 | On Master node (edge1) 50 | 51 | [hdfs@edge1 conf]$ cat slaves 52 | # A Spark Worker will be started on each of the machines listed below. 53 | 54 | edge2.dilithium.com 55 | hbm1.dilithium.com 56 | -------------------- 57 | On all nodes in the cluster 58 | 59 | [hdfs@edge1 conf]$ cat spark-defaults.conf 60 | # Default system properties included when running spark-submit. 61 | # This is useful for setting default environmental settings.
62 | 63 | # Example: 64 | spark.master spark://edge1.dilithium.com:7077 65 | spark.eventLog.enabled true 66 | spark.serializer org.apache.spark.serializer.KryoSerializer 67 | spark.eventLog.dir hdfs://nn1.dilithium.com:9000/user/hdfs/spark_logs 68 | 69 | -------------------------------------------------------------------------------- /Spark/yarn-site.xml.spark: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | yarn.nodemanager.aux-services 8 | mapreduce_shuffle,spark_shuffle,spark2_shuffle 9 | 10 | 11 | 12 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 13 | org.apache.hadoop.mapred.ShuffleHandler 14 | 15 | 16 | 17 | yarn.nodemanager.aux-services.spark_shuffle.class 18 | org.apache.spark.network.yarn.YarnShuffleService 19 | 20 | 21 | 22 | 23 | yarn.nodemanager.aux-services.spark2_shuffle.class 24 | org.apache.spark.network.yarn.YarnShuffleService 25 | 26 | 27 | 28 | yarn.resourcemanager.resource-tracker.address 29 | rm1.dilithium.com:9001 30 | 31 | 32 | 33 | yarn.resourcemanager.scheduler.address 34 | rm1.dilithium.com:9002 35 | 36 | 37 | 38 | yarn.resourcemanager.address 39 | rm1.dilithium.com:9003 40 | 41 | 42 | # 43 | #yarn.nodemanager.local-dirs 44 | #file:/space/tmp1,file:/space/tmp2 45 | # 46 | 47 | 48 | yarn.nodemanager.resource.memory-mb 49 | 3072 50 | 51 | 52 | 53 | yarn.scheduler.minimum-allocation-mb 54 | 256 55 | 56 | 57 | 58 | yarn.scheduler.maximum-allocation-mb 59 | 3072 60 | 61 | 62 | 63 | yarn.scheduler.minimum-allocation-vcores 64 | 1 65 | 66 | 67 | 68 | yarn.scheduler.maximum-allocation-vcores 69 | 12 70 | 71 | 72 | 73 | yarn.nodemanager.resource.cpu-vcores 74 | 12 75 | 76 | 77 | 78 | 79 | yarn.nodemanager.vmem-pmem-ratio 80 | 2.1 81 | 82 | 83 | # 84 | # yarn.nodemanager.vmem-check-enabled 85 | # false 86 | # Whether virtual memory limits will be enforced for containers 87 | # 88 | 89 | 90 | yarn.log-aggregation-enable 91 | true 92 | 93 | 94 | 95 | Where to aggregate logs to. 
96 | yarn.nodemanager.remote-app-log-dir 97 | /tmp/logs 98 | 99 | 100 | 101 | yarn.log-aggregation.retain-seconds 102 | 259200 103 | 104 | 105 | 106 | yarn.log-aggregation.retain-check-interval-seconds 107 | 3600 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /Yarn_tuning/Yarn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/Yarn_tuning/Yarn.pdf -------------------------------------------------------------------------------- /Yarn_tuning/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | mapreduce.map.memory.mb 3 | 768 4 | 5 | 6 | 7 | mapreduce.reduce.memory.mb 8 | 768 9 | 10 | 11 | 12 | mapreduce.map.java.opts 13 | -Xmx512m 14 | 15 | 16 | 17 | mapreduce.reduce.java.opts 18 | -Xmx512m 19 | 20 | -------------------------------------------------------------------------------- /Yarn_tuning/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | yarn.app.mapreduce.am.resource.mb 3 | 1024 4 | 5 | 6 | 7 | yarn.nodemanager.resource.memory-mb 8 | 2048 9 | 10 | 11 | 12 | yarn.scheduler.minimum-allocation-mb 13 | 512 14 | 15 | 16 | 17 | yarn.scheduler.maximum-allocation-mb 18 | 1024 19 | 20 | 21 | 22 | yarn.scheduler.minimum-allocation-vcores 23 | 1 24 | 25 | 26 | 27 | yarn.scheduler.maximum-allocation-vcores 28 | 2 29 | 30 | 31 | 32 | yarn.nodemanager.pmem-check-enabled 33 | false 34 | 35 | 36 | 37 | yarn.nodemanager.vmem-check-enabled 38 | false 39 | 40 | 41 | 42 | yarn.nodemanager.vmem-pmem-ratio 43 | 2.1 44 | 45 | 46 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /hadoop1.0/README.md: -------------------------------------------------------------------------------- 1 | For all the configurations related to Hadoop 1.x, please use the branch below 2 | 3 | https://github.com/netxillon/hadoop/tree/Hadoop1 4 | -------------------------------------------------------------------------------- /hadoop2.0/bash_profile: -------------------------------------------------------------------------------- 1 | # .bash_profile 2 | 3 | # Get the aliases and functions 4 | if [ -f ~/.bashrc ]; then 5 | .
~/.bashrc 6 | fi 7 | 8 | # User specific environment and startup programs 9 | 10 | # User specific aliases and functions 11 | 12 | #export HADOOP_HOME=/home/hadoop/hadoop-2.2.0 13 | 14 | export HADOOP_HOME=/home/hadoop/hadoop 15 | export HADOOP_MAPRED_HOME=$HADOOP_HOME 16 | export HADOOP_COMMON_HOME=$HADOOP_HOME 17 | export HADOOP_HDFS_HOME=$HADOOP_HOME 18 | export YARN_HOME=$HADOOP_HOME 19 | export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop 20 | export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop 21 | 22 | export JAVA_HOME=/usr/java/default 23 | 24 | 25 | PATH=$HADOOP_HOME/bin/:$HADOOP_HOME/sbin/:$JAVA_HOME/bin/:$PATH 26 | export PATH 27 | -------------------------------------------------------------------------------- /hadoop2.0/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fs.defaultFS 4 | hdfs://ha-nn1.hacluster1.com:9000 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /hadoop2.0/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | Namenode 2 | ======== 3 | 4 | 5 | 6 | 7 | dfs.namenode.name.dir 8 | file:/data/namenode 9 | 10 | 11 | 12 | dfs.replication 13 | 1 14 | 15 | 16 | 17 | dfs.blocksize 18 | 134217728 19 | 20 | 21 | 22 | 23 | Datanode 24 | ======== 25 | 26 | 27 | 28 | 29 | dfs.datanode.data.dir 30 | file:/data/datanode 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /hadoop2.0/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | # The framework can be local, classic or yarn 4 | 5 | 6 | mapreduce.framework.name 7 | yarn 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /hadoop2.0/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | yarn.nodemanager.aux-services 5 | mapreduce_shuffle 6 | 7 | 8 | 9 | yarn.nodemanager.aux-services.mapreduce.shuffle.class 10 | org.apache.hadoop.mapred.ShuffleHandler 11 | 12 | 13 | 14 | yarn.resourcemanager.resource-tracker.address 15 | nn2.cluster1.com:9001 16 | 17 | 18 | 19 | yarn.resourcemanager.scheduler.address 20 | nn2.cluster1.com:9002 21 | 22 | 23 | 24 | yarn.resourcemanager.address 25 | nn2.cluster1.com:9003 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /hadoop_build64bit: -------------------------------------------------------------------------------- 1 | Build 64 bit Hadoop 2 | =================== 3 | 4 | 5 | 1. yum -y install gcc gcc-c++ openssl-devel make cmake zlib* libssl* autoconf automake libtool cyrus-sasl* libgsasl-devel* java-1.8.0-openjdk.x86_64 java-1.8.0-openjdk-devel.x86_64 6 | 7 | 2. Download Maven: wget http://mirrors.gigenet.com/apache/maven/maven-3/3.3.3/binaries/apache-maven-3.3.3-bin.tar.gz 8 | 9 | tar -zxf apache-maven-3.3.3-bin.tar.gz -C /opt/ 10 | 11 | setup maven environment 12 | 13 | [root@repo67 ~]# cat /etc/profile.d/maven.sh 14 | export JAVA_HOME=/usr/java/latest 15 | export M3_HOME=/opt/apache-maven-3.3.3 16 | export PATH=$JAVA_HOME/bin:/opt/apache-maven-3.3.3/bin:$PATH 17 | 18 | 3. Download protobuf: wget https://github.com/google/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.gz 19 | 20 | tar -xzf protobuf-2.5.0.tar.gz -C /opt 21 | 22 | cd /opt/protobuf-2.5.0/ 23 | ./configure 24 | make;make install 25 | 26 | 4. 
Download the latest stable Hadoop source code, for example hadoop-2.7.2-src.tar.gz 27 | 28 | tar -xzf hadoop-2.7.2-src.tar.gz -C /opt/ 29 | cd /opt/hadoop-2.7.2-src 30 | mvn package -Pdist,native -DskipTests -Dtar -Dmaven.skip.test=true -Dmaven.javadoc.skip=true 31 | 32 | You will see a tarball under hadoop-2.7.2-src/hadoop-dist/target/ 33 | 34 | Enjoy !! 35 | 36 | 37 | Updated for maven 3.6.3 and protobuf 3.7.1 38 | ------------------------------------------ 39 | Supported version: Hadoop 3.3.0 40 | 41 | 1. wget http://mirror.intergrid.com.au/apache/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz 42 | 43 | 2. wget https://cmake.org/files/v3.6/cmake-3.6.2.tar.gz 44 | tar -zxvf cmake-3.6.2.tar.gz 45 | cd cmake-3.6.2 46 | ./bootstrap --prefix=/usr/local 47 | make; make install 48 | PATH=/usr/local/bin:$PATH 49 | 50 | 3. wget https://github.com/protocolbuffers/protobuf/releases/download/v3.7.1/protobuf-cpp-3.7.1.tar.gz 51 | 52 | - For all versions prior to Hadoop 3.3.0, protobuf version = 2.5.0. 53 | - For Hadoop 3.x and higher, the cmake version must be greater than 3.2 54 | -------------------------------------------------------------------------------- /jumbune: -------------------------------------------------------------------------------- 1 | hadoop-env.sh 2 | 3 | export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=5677 $HADOOP_NAMENODE_OPTS" 4 | export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=5679 $HADOOP_DATANODE_OPTS" 5 | 6 | yarn-env.sh 7 | 8 | export YARN_NODEMANAGER_OPTS="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=5678 $YARN_NODEMANAGER_OPTS" 9 | export YARN_RESOURCEMANAGER_OPTS="-Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=5680 $YARN_RESOURCEMANAGER_OPTS" 10 | -------------------------------------------------------------------------------- /logging: -------------------------------------------------------------------------------- 1 | 2 | We can get and set the log level dynamically using the daemonlog command 3 | =================== 4 | 5 | $ hadoop daemonlog -getlevel master1.cyrus.com:50070 org.apache.hadoop.dfs.NameNode 6 | Connecting to http://master1.cyrus.com:50070/logLevel?log=org.apache.hadoop.dfs.NameNode 7 | Submitted Log Name: org.apache.hadoop.dfs.NameNode 8 | Log Class: org.apache.commons.logging.impl.Log4JLogger 9 | Effective level: INFO 10 | 11 | 12 | $ hadoop daemonlog -getlevel master1.cyrus.com:50070 org.apache.hadoop.hdfs.server.namenode.NameNode 13 | Connecting to http://master1.cyrus.com:50070/logLevel?log=org.apache.hadoop.hdfs.server.namenode.NameNode 14 | Submitted Log Name: org.apache.hadoop.hdfs.server.namenode.NameNode 15 | Log Class: org.apache.commons.logging.impl.Log4JLogger 16 | Effective level: INFO 17 | 18 | +++++++++++++ 19 | 20 | The logs are of the format /var/log/hadoop/hadoop-$HADOOP_IDENT_STRING-<daemon>-<hostname>.log 21 | 22 | Thinking of changing $HADOOP_IDENT_STRING ? 23 | 24 | Not a good idea: 25 | 26 | $HADOOP_IDENT_STRING=$USER (Do not change it to any custom value, because the PID files etc. are all tracked by it 27 | and your scripts like hadoop-daemon.sh will fail.)
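To complement the -getlevel examples above, daemonlog can also change the level of a running daemon. A minimal sketch, reusing the same NameNode host and HTTP port as above; the change is transient and reverts to the log4j.properties setting when the daemon restarts:

$ hadoop daemonlog -setlevel master1.cyrus.com:50070 org.apache.hadoop.hdfs.server.namenode.NameNode DEBUG
# confirm the new level, then set it back once done debugging
$ hadoop daemonlog -getlevel master1.cyrus.com:50070 org.apache.hadoop.hdfs.server.namenode.NameNode
$ hadoop daemonlog -setlevel master1.cyrus.com:50070 org.apache.hadoop.hdfs.server.namenode.NameNode INFO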
28 | -------------------------------------------------------------------------------- /map_scripts/job.txt: -------------------------------------------------------------------------------- 1 | $ hadoop jar contrib/streaming/hadoop-*streaming*.jar -file /home/hadoop/mapper.py -mapper /home/hadoop/mapper.py -file /home/hadoop/reducer.py -reducer /home/hadoop/reducer.py -input /input -output /output 2 | 3 | 4 | 5 | hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar -D stream.num.map.output.key.fields=2 -input /input /out -mapper /home/hadoop/mapper.sh -reducer /home/hadoop/reducer.sh -------------------------------------------------------------------------------- /map_scripts/mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | # input comes from STDIN (standard input) 6 | for line in sys.stdin: 7 | # remove leading and trailing whitespace 8 | line = line.strip() 9 | # split the line into words 10 | words = line.split() 11 | # increase counters 12 | for word in words: 13 | # write the results to STDOUT (standard output); 14 | # what we output here will be the input for the 15 | # Reduce step, i.e. the input for reducer.py 16 | # 17 | # tab-delimited; the trivial word count is 1 18 | print '%s\t%s' % (word, 1) 19 | -------------------------------------------------------------------------------- /map_scripts/mapper.sh: -------------------------------------------------------------------------------- 1 | [training@localhost steve]$ cat maptf.sh 2 | #!/bin/bash 3 | 4 | exclude="\.\,?!\-_:;\]\[\#\|\$()\"" 5 | while read split; do 6 | for word in $split; do 7 | term=`echo "${word//[$exclude]/}" | tr [:upper:] [:lower:]` 8 | if [ -n "$term" ]; then 9 | printf "%s\t%s\t%s\n" "$term" "$map_input_file" "1" 10 | fi 11 | done 12 | done -------------------------------------------------------------------------------- /map_scripts/reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from operator import itemgetter 4 | import sys 5 | 6 | current_word = None 7 | current_count = 0 8 | word = None 9 | 10 | # input comes from STDIN 11 | for line in sys.stdin: 12 | # remove leading and trailing whitespace 13 | line = line.strip() 14 | 15 | # parse the input we got from mapper.py 16 | word, count = line.split('\t', 1) 17 | 18 | # convert count (currently a string) to int 19 | try: 20 | count = int(count) 21 | except ValueError: 22 | # count was not a number, so silently 23 | # ignore/discard this line 24 | continue 25 | 26 | # this IF-switch only works because Hadoop sorts map output 27 | # by key (here: word) before it is passed to the reducer 28 | if current_word == word: 29 | current_count += count 30 | else: 31 | if current_word: 32 | # write result to STDOUT 33 | print '%s\t%s' % (current_word, current_count) 34 | current_count = count 35 | current_word = word 36 | 37 | # do not forget to output the last word if needed! 
38 | if current_word == word: 39 | print '%s\t%s' % (current_word, current_count) 40 | -------------------------------------------------------------------------------- /map_scripts/reducer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | read currterm currfile currnum 4 | while read term file num; do 5 | if [[ $term = "$currterm" ]] && [[ $file = "$currfile" ]]; then 6 | currnum=$(( currnum + num )) 7 | else 8 | printf "%s\t%s\t%s\n" "$currterm" "$currfile" "$currnum" 9 | currterm="$term" 10 | currfile="$file" 11 | currnum="$num" 12 | fi 13 | done 14 | printf "%s\t%s\t%s\n" "$currterm" "$currfile" "$currnum" -------------------------------------------------------------------------------- /zookeeper.txt: -------------------------------------------------------------------------------- 1 | Deploying ZooKeeper Cluster (Multi-Server) Setup 2 | 3 | Let’s begin installation and configuration of ZooKeeper. 4 | 5 | Step 1: Directory Structure creation, as decided in the designing section 6 | 7 | mac-book-pro:demo aman$ mkdir -p /Users/aman/zookeeper/zk-server-1 \ 8 | /Users/aman/zookeeper/zk-server-2 \ 9 | /Users/aman/zookeeper/zk-server-3 \ 10 | /Users/aman/zookeeper/zk-server-4 \ 11 | /Users/aman/zookeeper/zk-server-5 12 | 13 | mac-book-pro:demo aman$ mkdir -p /Users/aman/zookeeper/data/zk1 \ 14 | /Users/aman/zookeeper/data/zk2 \ 15 | /Users/aman/zookeeper/data/zk3 \ 16 | /Users/aman/zookeeper/data/zk4 \ 17 | /Users/aman/zookeeper/data/zk5 18 | 19 | mac-book-pro:demo aman$ mkdir -p /Users/aman/zookeeper/log/zk1 \ 20 | /Users/aman/zookeeper/log/zk2 \ 21 | /Users/aman/zookeeper/log/zk3 \ 22 | /Users/aman/zookeeper/log/zk4 \ 23 | /Users/aman/zookeeper/log/zk5 24 | 25 | Let’s take a look above created directory structure- 26 | 27 | mac-book-pro:demo aman$ tree /Users/aman/zookeeper 28 | 29 | /Users/aman/zookeeper 30 | |-data 31 | |---zk1 32 | |---zk2 33 | |---zk3 34 | |---zk4 35 | |---zk5 36 | |-log 37 | |---zk1 38 | |---zk2 39 | |---zk3 40 | |---zk4 41 | |---zk5 42 | |-zk-server-1 43 | |-zk-server-2 44 | |-zk-server-3 45 | |-zk-server-4 46 | |-zk-server-5 47 | 48 | mac-book-pro:demo aman$ 49 | 50 | Okay, looks good! 51 | 52 | Step 2: Creating a ZooKeeper Server ID, basically this file reside in the ZooKeeper data directory. Go on choose your favorite text editor 53 | 54 | # just enter a value '1' in the file. Save the file, do the same for rest of ZooKeeper 55 | mac-book-pro:demo aman$ vi /Users/aman/zookeeper/data/zk1/myid 56 | 57 | # follow the same way to fill server id 58 | vi /Users/aman/zookeeper/data/zk2/myid 59 | vi /Users/aman/zookeeper/data/zk3/myid 60 | vi /Users/aman/zookeeper/data/zk4/myid 61 | vi /Users/aman/zookeeper/data/zk5/myid 62 | 63 | Step 3: Downloading ZooKeeper Release 64 | 65 | Download a ZooKeeper from http://hadoop.apache.org/zookeeper/releases.html; this article utilize the version 3.4.4 of ZooKeeper. However same principle is applied for other version too. 
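The five myid files described in Step 2 can also be written in one pass; a minimal sketch, assuming the data directories created in Step 1:

# write server id 1..5 into each data directory
mac-book-pro:demo aman$ for i in 1 2 3 4 5; do echo "$i" > /Users/aman/zookeeper/data/zk$i/myid; done
# quick check that each file holds its server id
mac-book-pro:demo aman$ cat /Users/aman/zookeeper/data/zk*/myid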
66 | 67 | Step 4: Extract & prepare ZooKeeper for deployment 68 | 69 | mac-book-pro:demo aman$ gzip -dc ~/Downloads/soft/zookeeper-3.4.4.tar.gz | tar -xf - -C /tmp 70 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-1/ 71 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-2/ 72 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-3/ 73 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-4/ 74 | mac-book-pro:demo aman$ cp -r /tmp/zookeeper-3.4.4/* /Users/aman/zookeeper/zk-server-5/ 75 | 76 | Once done don’t forget to cleanup the ‘/tmp/zookeeper-3.4.4′ 77 | 78 | Step 5: Preparing ZooKeeper configuration called ‘zoo.cfg‘ at ‘{zk-server-1}/conf/zoo.cfg‘. Here I will show you for Server 1 and perform same steps with appropriate values (clientPort, dataDir, dataLogDir) for respective ZooKeeper server. 79 | 80 | mac-book-pro:demo aman$ vi /Users/aman/zookeeper/zk-server-1/conf/zoo.cfg 81 | 82 | Place below configuration into it. 83 | 84 | 85 | # The number of milliseconds of each tick 86 | tickTime=2000 87 | 88 | # The number of ticks that the initial synchronization phase can take 89 | initLimit=10 90 | 91 | # The number of ticks that can pass between 92 | # sending a request and getting an acknowledgement 93 | syncLimit=5 94 | 95 | # the directory where the snapshot is stored. 96 | # Choose appropriately for your environment 97 | dataDir=/Users/aman/zookeeper/data/zk1 98 | 99 | # the port at which the clients will connect 100 | clientPort=2181 101 | 102 | # the directory where transaction log is stored. 103 | # this parameter provides dedicated log device for ZooKeeper 104 | dataLogDir=/Users/aman/zookeeper/log/zk1 105 | 106 | # ZooKeeper server and its port no. 107 | # ZooKeeper ensemble should know about every other machine in the ensemble 108 | # specify server id by creating 'myid' file in the dataDir 109 | # use hostname instead of IP address for convenient maintenance 110 | server.1=localhost:2888:3888 111 | server.2=localhost:2889:3889 112 | server.3=localhost:2890:3890 113 | server.4=localhost:2891:3891 114 | server.5=localhost:2892:3892 115 | -------------------------------------------------------------------------------- /zookeeper_oozie/oozie-server.txt: -------------------------------------------------------------------------------- 1 | Oozie Server Setup 2 | 3 | Copy the built binaries to the home directory as ‘oozie’ 4 | 5 | $ cd ../../ 6 | $ cp -R oozie-3.3.2/distro/target/oozie-3.3.2-distro/oozie-3.3.2/ oozie 7 | 8 | Create the required libext directory 9 | 10 | $ cd oozie 11 | $ mkdir libext 12 | 13 | Copy all the required jars from hadooplibs to the libext directory using the following command: 14 | 15 | $ cp ../oozie-3.3.2/hadooplibs/target/oozie-3.3.2-hadooplibs.tar.gz . 16 | $ tar xzvf oozie-3.3.2-hadooplibs.tar.gz 17 | $ cp oozie-3.3.2/hadooplibs/hadooplib-1.1.1.oozie-3.3.2/* libext/ 18 | 19 | Get Ext2Js – This library is not bundled with Oozie and needs to be downloaded separately. This library is used for the Oozie Web Console: 20 | 21 | $ cd libext 22 | $ wget http://extjs.com/deploy/ext-2.2.zip 23 | $ cd .. 24 | 25 | Update ../hadoop/conf/core-site.xml as follows: 26 | 27 | 28 | hadoop.proxyuser.hduser.hosts 29 | localhost 30 | 31 | 32 | hadoop.proxyuser.hduser.groups 33 | hadoop 34 | 35 | 36 | Here, ‘hduser’ is the username and it belongs to ‘hadoop’ group. 
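The proxyuser properties above are read by the NameNode and JobTracker at startup, so the change to core-site.xml has to be picked up before Oozie can impersonate users. A minimal sketch, assuming the Hadoop 1.1.1 install used here lives at ../hadoop; the refresh commands avoid a full restart, but their availability should be verified on your release:

$ cd ../hadoop
$ bin/stop-all.sh && bin/start-all.sh
# or, without a restart (verify these admin options exist on your version):
$ bin/hadoop dfsadmin -refreshSuperUserGroupsConfiguration
$ bin/hadoop mradmin -refreshSuperUserGroupsConfiguration
$ cd ../oozie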
37 | 38 | Prepare the WAR file 39 | 40 | $ ./bin/oozie-setup.sh prepare-war 41 | 42 | setting CATALINA_OPTS="$CATALINA_OPTS -Xmx1024m" 43 | 44 | INFO: Adding extension: /home/hduser/oozie/libext/commons-beanutils-1.7.0.jar 45 | INFO: Adding extension: /home/hduser/oozie/libext/commons-beanutils-core-1.8.0.jar 46 | INFO: Adding extension: /home/hduser/oozie/libext/commons-codec-1.4.jar 47 | INFO: Adding extension: /home/hduser/oozie/libext/commons-collections-3.2.1.jar 48 | INFO: Adding extension: /home/hduser/oozie/libext/commons-configuration-1.6.jar 49 | INFO: Adding extension: /home/hduser/oozie/libext/commons-digester-1.8.jar 50 | INFO: Adding extension: /home/hduser/oozie/libext/commons-el-1.0.jar 51 | INFO: Adding extension: /home/hduser/oozie/libext/commons-io-2.1.jar 52 | INFO: Adding extension: /home/hduser/oozie/libext/commons-lang-2.4.jar 53 | INFO: Adding extension: /home/hduser/oozie/libext/commons-logging-1.1.jar 54 | INFO: Adding extension: /home/hduser/oozie/libext/commons-math-2.1.jar 55 | INFO: Adding extension: /home/hduser/oozie/libext/commons-net-1.4.1.jar 56 | INFO: Adding extension: /home/hduser/oozie/libext/hadoop-client-1.1.1.jar 57 | INFO: Adding extension: /home/hduser/oozie/libext/hadoop-core-1.1.1.jar 58 | INFO: Adding extension: /home/hduser/oozie/libext/hsqldb-1.8.0.7.jar 59 | INFO: Adding extension: /home/hduser/oozie/libext/jackson-core-asl-1.8.8.jar 60 | INFO: Adding extension: /home/hduser/oozie/libext/jackson-mapper-asl-1.8.8.jar 61 | INFO: Adding extension: /home/hduser/oozie/libext/log4j-1.2.16.jar 62 | INFO: Adding extension: /home/hduser/oozie/libext/oro-2.0.8.jar 63 | INFO: Adding extension: /home/hduser/oozie/libext/xmlenc-0.52.jar 64 | 65 | New Oozie WAR file with added 'ExtJS library, JARs' at /home/hduser/oozie/oozie-server/webapps/oozie.war 66 | 67 | INFO: Oozie is ready to be started 68 | 69 | Create sharelib on HDFS 70 | 71 | $ ./bin/oozie-setup.sh sharelib create -fs hdfs://localhost:54310 72 | setting CATALINA_OPTS="$CATALINA_OPTS -Xmx1024m" 73 | the destination path for sharelib is: /user/hduser/share/lib 74 | 75 | Create the OoozieDB 76 | 77 | $ ./bin/ooziedb.sh create -sqlfile oozie.sql -run 78 | setting CATALINA_OPTS="$CATALINA_OPTS -Xmx1024m" 79 | 80 | Validate DB Connection 81 | DONE 82 | Check DB schema does not exist 83 | DONE 84 | Check OOZIE_SYS table does not exist 85 | DONE 86 | Create SQL schema 87 | DONE 88 | Create OOZIE_SYS table 89 | DONE 90 | 91 | Oozie DB has been created for Oozie version '3.3.2' 92 | 93 | The SQL commands have been written to: oozie.sql 94 | 95 | To start Oozie as a daemon use the following command: 96 | 97 | $ ./bin/oozied.sh start 98 | 99 | Setting OOZIE_HOME: /home/hduser/oozie 100 | Setting OOZIE_CONFIG: /home/hduser/oozie/conf 101 | Sourcing: /home/hduser/oozie/conf/oozie-env.sh 102 | setting CATALINA_OPTS="$CATALINA_OPTS -Xmx1024m" 103 | Setting OOZIE_CONFIG_FILE: oozie-site.xml 104 | Setting OOZIE_DATA: /home/hduser/oozie/data 105 | Setting OOZIE_LOG: /home/hduser/oozie/logs 106 | Setting OOZIE_LOG4J_FILE: oozie-log4j.properties 107 | Setting OOZIE_LOG4J_RELOAD: 10 108 | Setting OOZIE_HTTP_HOSTNAME: rohit-VirtualBox 109 | Setting OOZIE_HTTP_PORT: 11000 110 | Setting OOZIE_ADMIN_PORT: 11001 111 | Setting OOZIE_HTTPS_PORT: 11443 112 | Setting OOZIE_BASE_URL: http://rohit-VirtualBox:11000/oozie 113 | Setting CATALINA_BASE: /home/hduser/oozie/oozie-server 114 | Setting OOZIE_HTTPS_KEYSTORE_FILE: /home/hduser/.keystore 115 | Setting OOZIE_HTTPS_KEYSTORE_PASS: password 116 | Setting CATALINA_OUT: 
/home/hduser/oozie/logs/catalina.out 117 | Setting CATALINA_PID: /home/hduser/oozie/oozie-server/temp/oozie.pid 118 | 119 | Using CATALINA_OPTS: -Xmx1024m -Dderby.stream.error.file=/home/hduser/oozie/logs/derby.log 120 | Adding to CATALINA_OPTS: -Doozie.home.dir=/home/hduser/oozie -Doozie.config.dir=/home/hduser/oozie/conf -Doozie.log.dir=/home/hduser/oozie/logs -Doozie.data.dir=/home/hduser/oozie/data -Doozie.config.file=oozie-site.xml -Doozie.log4j.file=oozie-log4j.properties -Doozie.log4j.reload=10 -Doozie.http.hostname=rohit-VirtualBox -Doozie.admin.port=11001 -Doozie.http.port=11000 -Doozie.https.port=11443 -Doozie.base.url=http://rohit-VirtualBox:11000/oozie -Doozie.https.keystore.file=/home/hduser/.keystore -Doozie.https.keystore.pass=password -Djava.library.path= 121 | 122 | Using CATALINA_BASE: /home/hduser/oozie/oozie-server 123 | Using CATALINA_HOME: /home/hduser/oozie/oozie-server 124 | Using CATALINA_TMPDIR: /home/hduser/oozie/oozie-server/temp 125 | Using JRE_HOME: /usr/lib/jvm/java-6-oracle 126 | Using CLASSPATH: /home/hduser/oozie/oozie-server/bin/bootstrap.jar 127 | Using CATALINA_PID: /home/hduser/oozie/oozie-server/temp/oozie.pid 128 | 129 | To start Oozie as a foreground process use the following command: 130 | 131 | $ ./bin/oozied.sh run 132 | 133 | Check the Oozie log file logs/oozie.log to ensure Oozie started properly. 134 | 135 | Use the following command to check the status of Oozie from command line: 136 | 137 | $ ./bin/oozie admin -oozie http://localhost:11000/oozie -status 138 | System mode: NORMAL 139 | 140 | URL for the Oozie Web Console is http://localhost:11000/oozie 141 | -------------------------------------------------------------------------------- /zookeeper_oozie/zookeeper.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netxillon/Hadoop/ebeccd5adbc95c20994a7119a83cfc9c4e1181b5/zookeeper_oozie/zookeeper.txt --------------------------------------------------------------------------------
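Following on from the Oozie server setup above: with the server reporting NORMAL, a quick end-to-end check is to run one of the bundled examples. A minimal sketch, assuming the oozie-examples.tar.gz shipped in the distro directory and a job.properties already edited to point at your NameNode and JobTracker:

$ tar -xzf oozie-examples.tar.gz
$ hadoop fs -put examples examples
$ ./bin/oozie job -oozie http://localhost:11000/oozie -config examples/apps/map-reduce/job.properties -run
$ ./bin/oozie job -oozie http://localhost:11000/oozie -info <job-id printed by -run>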