├── LICENSE ├── README.md ├── hbase-hive-pig ├── README.md ├── create_hbase_table.sql └── load_hbase.pig ├── hbase-tables ├── README.md ├── create_schemas.py ├── data │ ├── gethue.com.html │ ├── gethue.pdf │ └── hue-logo.png ├── load_binary.py ├── load_data.log └── load_data.sh ├── hcatalog ├── README.md ├── avg_salary.hql └── avg_salary.pig ├── hive-udf ├── README.md ├── myudfs.jar ├── org │ └── hue │ │ └── udf │ │ └── MyUpper.java └── pom.xml ├── hive-workflow ├── README.md ├── create_table.hql ├── insert_table.hql ├── select_table.hql └── select_top_n.sql ├── hue-ha ├── README.md └── howto.txt ├── hue-saml ├── shibboleth-conf │ ├── attribute-filter.xml │ ├── attribute-resolver.xml │ ├── handler.xml │ ├── internal.xml │ ├── logging.xml │ ├── login.config │ ├── relying-party.xml │ └── service.xml └── tomcat6-conf │ ├── server.xml │ └── web.xml ├── impala ├── README.md ├── avro_converter.pig ├── create_avro_table.sql ├── create_parquet_table.sql ├── sample_tables │ ├── alltypes.sql │ ├── alltypes.zip │ └── table_100_cols │ │ ├── 100_cols.txt │ │ └── create.sql └── select_table.hql ├── notebook └── shared_rdd │ ├── README.md │ ├── hue-sharedrdd-notebook.json │ ├── shareable_rdd.py │ ├── shared_rdd.ipynb │ └── shared_rdd.py ├── oozie ├── credentials │ ├── hive-config.xml │ ├── hive.sql │ └── workflow.xml ├── el-functions │ └── rkanter │ │ └── MyELFunctions.java ├── hiveserver2-action │ ├── hive-site3.xml │ ├── select_genericl.sql │ └── workflow.xml └── workflow_demo │ ├── bundle.xml │ ├── coordinator.xml │ ├── job.properties │ ├── lib │ └── piggybank.jar │ ├── script.pig │ └── workflow.xml ├── pig-json-python-udf ├── README.md ├── clean_json.pig ├── converter.py └── python_udf.pig ├── search └── indexing │ ├── README.md │ └── apache_logs.py ├── solr-local-search ├── README.md ├── data_subset.sql ├── load_index.sh ├── solr_local │ ├── conf │ │ ├── admin-extra.html │ │ ├── admin-extra.menu-bottom.html │ │ ├── admin-extra.menu-top.html │ │ ├── currency.xml │ │ ├── elevate.xml │ │ ├── lang │ │ │ ├── contractions_ca.txt │ │ │ ├── contractions_fr.txt │ │ │ ├── contractions_ga.txt │ │ │ ├── contractions_it.txt │ │ │ ├── hyphenations_ga.txt │ │ │ ├── stemdict_nl.txt │ │ │ ├── stoptags_ja.txt │ │ │ ├── stopwords_ar.txt │ │ │ ├── stopwords_bg.txt │ │ │ ├── stopwords_ca.txt │ │ │ ├── stopwords_cz.txt │ │ │ ├── stopwords_da.txt │ │ │ ├── stopwords_de.txt │ │ │ ├── stopwords_el.txt │ │ │ ├── stopwords_en.txt │ │ │ ├── stopwords_es.txt │ │ │ ├── stopwords_eu.txt │ │ │ ├── stopwords_fa.txt │ │ │ ├── stopwords_fi.txt │ │ │ ├── stopwords_fr.txt │ │ │ ├── stopwords_ga.txt │ │ │ ├── stopwords_gl.txt │ │ │ ├── stopwords_hi.txt │ │ │ ├── stopwords_hu.txt │ │ │ ├── stopwords_hy.txt │ │ │ ├── stopwords_id.txt │ │ │ ├── stopwords_it.txt │ │ │ ├── stopwords_ja.txt │ │ │ ├── stopwords_lv.txt │ │ │ ├── stopwords_nl.txt │ │ │ ├── stopwords_no.txt │ │ │ ├── stopwords_pt.txt │ │ │ ├── stopwords_ro.txt │ │ │ ├── stopwords_ru.txt │ │ │ ├── stopwords_sv.txt │ │ │ ├── stopwords_th.txt │ │ │ ├── stopwords_tr.txt │ │ │ └── userdict_ja.txt │ │ ├── mapping-FoldToASCII.txt │ │ ├── mapping-ISOLatin1Accent.txt │ │ ├── protwords.txt │ │ ├── schema.xml │ │ ├── scripts.conf │ │ ├── solrconfig.xml │ │ ├── spellings.txt │ │ ├── stopwords.txt │ │ ├── synonyms.txt │ │ ├── update-script.js │ │ ├── velocity │ │ │ ├── VM_global_library.vm │ │ │ ├── browse.vm │ │ │ ├── cluster.vm │ │ │ ├── clusterResults.vm │ │ │ ├── debug.vm │ │ │ ├── did_you_mean.vm │ │ │ ├── facet_fields.vm │ │ │ ├── facet_pivot.vm │ │ │ ├── facet_queries.vm │ │ │ ├── 
facet_ranges.vm │ │ │ ├── facets.vm │ │ │ ├── footer.vm │ │ │ ├── head.vm │ │ │ ├── header.vm │ │ │ ├── hit.vm │ │ │ ├── hitGrouped.vm │ │ │ ├── join-doc.vm │ │ │ ├── jquery.autocomplete.css │ │ │ ├── jquery.autocomplete.js │ │ │ ├── layout.vm │ │ │ ├── main.css │ │ │ ├── product-doc.vm │ │ │ ├── query.vm │ │ │ ├── queryGroup.vm │ │ │ ├── querySpatial.vm │ │ │ ├── richtext-doc.vm │ │ │ ├── suggest.vm │ │ │ └── tabs.vm │ │ └── xslt │ │ │ ├── example.xsl │ │ │ ├── example_atom.xsl │ │ │ ├── example_rss.xsl │ │ │ ├── luke.xsl │ │ │ └── updateXml.xsl │ └── reviews.conf └── yelp_40.csv ├── spark └── bikeshare │ ├── 201408_weather_data.csv │ ├── README.md │ ├── index_data.csv │ ├── notebook.txt │ └── weather-data.spark.hue.json ├── sqoop2 ├── README.md ├── create_table.sql └── stats.pig └── static └── hue-3.5.png /README.md: -------------------------------------------------------------------------------- 1 | Hadoop Tutorials and Examples 2 | ============================= 3 | 4 | Source, data and tutorials of the Hue video series, the [Web UI for Apache Hadoop](http://gethue.com). 5 | Follow [@gethue](https://twitter.com/gethue) 6 | 7 | List of all [tutorials](http://gethue.com/category/tutorial/). 8 | 9 | Search 10 | - [Hadoop search: Dynamic search dashboards with Solr](http://gethue.com/hadoop-search-dynamic-search-dashboards-with-solr/) 11 | - [Analyse Apache logs and build your own Web Analytics dashboard with Hadoop and Solr](http://gethue.com/analyse-apache-logs-and-build-your-own-web-analytics-dashboard-with-hadoop-and-solr/) 12 | 13 | Spark 14 | - [Get started with Spark: deploy Spark Server and compute Pi from your Web Browser](http://gethue.com/get-started-with-spark-deploy-spark-server-and-compute-pi-from-your-web-browser/) 15 | 16 | Hive, HBase, Pig 17 | - [Get started with Hadoop: Collect and Analyse Twitter data](http://gethue.com/how-to-analyze-twitter-data-with-hue) | [Hive, Flume, HDFS, Oozie](https://github.com/romainr/cdh-twitter-example) 18 | - [How to access Hive in Pig with HCatalog in Hue](http://gethue.com/hadoop-tutorial-how-to-access-hive-in-pig-with) | [HCatalog](hcatalog) 19 | - [High Availability of Hue](http://gethue.com/hadoop-tutorial-high-availability-of-hue) | [Hue HA](hue-ha) 20 | - [How to create example tables in HBase](http://gethue.com/hadoop-tutorial-how-to-create-example-tables-in-hbase) | [HBase tables](hbase-tables) 21 | - [Build and use a Hive UDF in 1 minute](http://gethue.com/hadoop-tutorial-hive-udf-in-1-minute) | [Hive UDF](hive-udf) 22 | - [The Web UI for HBase: HBase Browser](http://gethue.com/the-web-ui-for-hbase-hbase-browser) 23 | 24 | Season 2 25 | - [1. Prepare the data for analysis with Pig and Python UDF](http://gethue.com/hadoop-tutorials-ii-1-prepare-the-data-for-analysis) | [Pig Python UDF and Json](pig-json-python-udf) 26 | - [2. Execute Hive queries and schedule them with Oozie](http://gethue.com/video-series-ii-2-execute-hive-queries-and-schedule) 27 | - [3. Schedule Hive queries with Oozie coordinators](http://gethue.com/hadoop-tutorials-ii-3-schedule-hive-queries-with) 28 | - [4. Fast SQL with the Impala Query Editor](http://gethue.com/fast-sql-with-the-impala-query-editor) 29 | - [5. Bundle Oozie coordinators with Hue](http://gethue.com/hadoop-tutorial-bundle-oozie-coordinators-with-hue) 30 | - [6. 
Use Pig and Hive with HBase](http://gethue.com/hadoop-tutorial-use-pig-and-hive-with-hbase)

![image](static/hue-3.5.png?raw=true)

-------------------------------------------------------------------------------- /hbase-hive-pig/README.md: --------------------------------------------------------------------------------
Blog URL
========

TBD
-------------------------------------------------------------------------------- /hbase-hive-pig/create_hbase_table.sql: --------------------------------------------------------------------------------

-- Create table in Hive/HBase

SET hbase.zookeeper.quorum=localhost;

CREATE TABLE top_cool_hbase (key string, value map<string, int>)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,review:")
TBLPROPERTIES ("hbase.table.name" = "top_cool");


-- Insert data

ADD JAR /usr/lib/hive/lib/zookeeper.jar;
ADD JAR /usr/lib/hive/lib/hbase.jar;
ADD JAR /usr/lib/hive/lib/hive-hbase-handler-0.10.0-cdh4.3.0.jar;
ADD JAR /usr/lib/hive/lib/guava-11.0.2.jar;

INSERT OVERWRITE TABLE top_cool_hbase SELECT name, map(`date`, cast(coolness as int)) FROM top_cool;


-- Insert cooler data: review stars keyed by restaurant name

INSERT OVERWRITE TABLE top_cool_hbase SELECT name, map(`date`, cast(r.stars as int)) FROM review r JOIN business b ON r.business_id = b.business_id;

-------------------------------------------------------------------------------- /hbase-hive-pig/load_hbase.pig: --------------------------------------------------------------------------------
REGISTER /usr/lib/zookeeper/zookeeper-3.4.5-cdh4.3.0.jar
REGISTER /usr/lib/hbase/hbase-0.94.6-cdh4.3.0-security.jar

set hbase.zookeeper.quorum 'localhost'

data = LOAD 'hbase://top_cool'
       USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('review:*', '-loadKey true')
       AS (name:CHARARRAY, dates:MAP[]);

counts = FOREACH data GENERATE name, dates#'2012-12-02';

DUMP counts;

-------------------------------------------------------------------------------- /hbase-tables/README.md: --------------------------------------------------------------------------------
Blog URL
========

[How to create example tables in HBase](http://gethue.com/hadoop-tutorial-how-to-create-example-tables-in-hbase/) | [HBase tables](hbase-tables)

-------------------------------------------------------------------------------- /hbase-tables/create_schemas.py: --------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Generates columns and cell data for an analytics table of 1000+ columns
# cf. 
url 5 | # 6 | 7 | # create 'analytics', 'hour', 'day', 'total' 8 | 9 | 10 | import itertools 11 | import random 12 | 13 | random.seed(1) 14 | 15 | ROWS = 1000 16 | HOURS = range(0, 25) 17 | DAYS = range(0, 366) 18 | COUNTRIES = ['US', 'France', 'Italy'] 19 | FAMILLIES = ['hour', 'day', 'total'] 20 | 21 | 22 | # Utilities 23 | 24 | def columns_hours(): 25 | FAMILLY = 'hour' 26 | cols = [] 27 | for hour in HOURS: 28 | cols.append('%s:%02d-%s' % (FAMILLY, hour, 'total')) 29 | for country in COUNTRIES: 30 | cols.append('%s:%02d-%s' % (FAMILLY, hour, country)) 31 | return cols 32 | 33 | def columns_days(): 34 | FAMILLY = 'day' 35 | cols = [] 36 | for day in DAYS: 37 | cols.append('%s:%03d-%s' % (FAMILLY, day, 'total')) 38 | for country in COUNTRIES: 39 | cols.append('%s:%03d-%s' % (FAMILLY, day, country)) 40 | return cols 41 | 42 | def columns_total(): 43 | FAMILLY = 'total' 44 | return ['%s:%s' % (FAMILLY, col) for col in ['total'] + COUNTRIES] 45 | 46 | def get_domain(n): 47 | return ['domain.%s' % n] 48 | 49 | def total(): 50 | return [count_by_country(10000)] 51 | 52 | def days(): 53 | return [count_by_country(1000) for day in DAYS] 54 | 55 | def hours(): 56 | return [count_by_country(100) for hour in HOURS] 57 | 58 | def count_by_country(n): 59 | counts = [random.randrange(1, n) for country in COUNTRIES] 60 | return [sum(counts)] + counts 61 | 62 | def print_columns(): 63 | all_cols = columns_hours() + columns_days() + columns_total() 64 | print "-Dimporttsv.columns=HBASE_ROW_KEY," + ','.join(['%s' % col for col in all_cols]) 65 | 66 | def generate_data(data_file): 67 | f = open(data_file, 'w') 68 | 69 | for i in xrange(ROWS): 70 | a = hours() + days() + total() 71 | f.write('\t'.join(get_domain(i) + map(str, itertools.chain.from_iterable(a))) + '\n') 72 | 73 | print data_file + ' genererated' 74 | 75 | 76 | # Main 77 | 78 | # 79 | # Print columns and generate data into a file 80 | # 81 | print_columns() 82 | generate_data('/tmp/hbase-analytics.tsv') 83 | -------------------------------------------------------------------------------- /hbase-tables/data/gethue.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/romainr/hadoop-tutorials-examples/a070880c7d44402997080d860bb49cc234dff879/hbase-tables/data/gethue.pdf -------------------------------------------------------------------------------- /hbase-tables/data/hue-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/romainr/hadoop-tutorials-examples/a070880c7d44402997080d860bb49cc234dff879/hbase-tables/data/hue-logo.png -------------------------------------------------------------------------------- /hbase-tables/load_binary.py: -------------------------------------------------------------------------------- 1 | ## 2 | ## Insert various data into HBase 3 | ## 4 | 5 | ## cd $HUE_HOME (e.g. 
cd /usr/share/hue(/opt/cloudera/parcels/CDH-XXXXX/share/hue if using parcels)) 6 | ## build/env/bin/hue shell 7 | ## 8 | 9 | from hbase.api import HbaseApi 10 | 11 | HbaseApi().putRow('Cluster', 'events', 'hue-20130801', {'doc:txt': 'Hue is awesome!'}) 12 | HbaseApi().putRow('Cluster', 'events', 'hue-20130801', {'doc:json': '{"user": "hue", "coolness": "extra"}'}) 13 | HbaseApi().putRow('Cluster', 'events', 'hue-20130802', {'doc:version': 'I like HBase'}) 14 | HbaseApi().putRow('Cluster', 'events', 'hue-20130802', {'doc:version': 'I LOVE HBase'}) 15 | 16 | 17 | ## From https://github.com/romainr/hadoop-tutorials-examples 18 | ## cd /tmp 19 | ## git clone https://github.com/romainr/hadoop-tutorials-examples.git 20 | 21 | root='/tmp/hadoop-tutorials-examples' 22 | 23 | HbaseApi().putRow('Cluster', 'events', 'hue-20130801', {'doc:img': open(root + '/hbase-tables/data/hue-logo.png', "rb").read()}) 24 | HbaseApi().putRow('Cluster', 'events', 'hue-20130801', {'doc:html': open(root + '/hbase-tables/data/gethue.com.html', "rb").read()}) 25 | HbaseApi().putRow('Cluster', 'events', 'hue-20130801', {'doc:pdf': open(root + '/hbase-tables/data/gethue.pdf', "rb").read()}) 26 | 27 | -------------------------------------------------------------------------------- /hcatalog/README.md: -------------------------------------------------------------------------------- 1 | Blog URL 2 | ======== 3 | 4 | [How to access Hive in Pig with HCatalog in Hue](http://gethue.tumblr.com/post/56804308712/hadoop-tutorial-how-to-access-hive-in-pig-with) 5 | 6 | -------------------------------------------------------------------------------- /hcatalog/avg_salary.hql: -------------------------------------------------------------------------------- 1 | SELECT AVG(salary) FROM sample_07; 2 | 3 | -------------------------------------------------------------------------------- /hcatalog/avg_salary.pig: -------------------------------------------------------------------------------- 1 | -- Load table 'sample_07' 2 | sample_07 = LOAD 'sample_07' USING org.apache.hcatalog.pig.HCatLoader(); 3 | 4 | -- Compute the average salary of the table 5 | salaries = GROUP sample_07 ALL; 6 | out = FOREACH salaries GENERATE AVG(sample_07.salary); 7 | DUMP out; 8 | 9 | -------------------------------------------------------------------------------- /hive-udf/README.md: -------------------------------------------------------------------------------- 1 | Blog URL 2 | ======== 3 | 4 | [Build and use a Hive UDF in 1 minute](http://gethue.com/hadoop-tutorial-hive-udf-in-1-minute/) 5 | 6 | 7 | How to 8 | ====== 9 | 10 | - Just use the precompiled [myudfs.jar](myudfs.jar) 11 | 12 | - Or compile it with: 13 | ``` 14 | javac -cp $(ls /usr/lib/hive/lib/hive-exec*.jar):/usr/lib/hadoop/hadoop-common.jar org/hue/udf/MyUpper.java 15 | jar -cf myudfs.jar -C . . 
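# Once built, a hedged sketch of how the UDF is typically registered and called from Hive
# (the function name, the JAR path and the sample_07 table/column are placeholders, not
# taken from this repo):
#   hive> ADD JAR /path/to/myudfs.jar;
#   hive> CREATE TEMPORARY FUNCTION myUpper AS 'org.hue.udf.MyUpper';
#   hive> SELECT myUpper(description) FROM sample_07 LIMIT 10;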
16 | ``` 17 | -------------------------------------------------------------------------------- /hive-udf/myudfs.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/romainr/hadoop-tutorials-examples/a070880c7d44402997080d860bb49cc234dff879/hive-udf/myudfs.jar -------------------------------------------------------------------------------- /hive-udf/org/hue/udf/MyUpper.java: -------------------------------------------------------------------------------- 1 | 2 | package org.hue.udf; 3 | 4 | import org.apache.hadoop.hive.ql.exec.UDF; 5 | import org.apache.hadoop.io.Text; 6 | 7 | 8 | public final class MyUpper extends UDF { 9 | public Text evaluate(final Text s) { 10 | if (s == null) { return null; } 11 | return new Text(s.toString().toUpperCase()); 12 | } 13 | } 14 | 15 | -------------------------------------------------------------------------------- /hive-udf/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 20 | 21 | 28 | 29 | 4.0.0 30 | 31 | org.hue.udf 32 | myudfs 33 | 1.0-SNAPSHOT 34 | jar 35 | 36 | myudfs 37 | http://gethue.com 38 | 39 | 40 | 1.1.0-cdh5.5.0-SNAPSHOT 41 | 2.6.0-cdh5.5.0-SNAPSHOT 42 | 43 | 44 | 45 | 46 | 47 | 48 | org.apache.maven.plugins 49 | maven-compiler-plugin 50 | 2.3.2 51 | 52 | 1.6 53 | 1.6 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | org.apache.hive 63 | hive-exec 64 | ${hive.version} 65 | 66 | 67 | org.apache.hadoop 68 | hadoop-common 69 | ${hadoop.version} 70 | 71 | 72 | 73 | 74 | 75 | cloudera 76 | https://repository.cloudera.com/content/repositories/snapshots 77 | 78 | true 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /hive-workflow/README.md: -------------------------------------------------------------------------------- 1 | Blog URL 2 | ======== 3 | 4 | TBD 5 | -------------------------------------------------------------------------------- /hive-workflow/create_table.hql: -------------------------------------------------------------------------------- 1 | CREATE TABLE top_cool AS 2 | SELECT r.business_id, name, SUM(cool) AS coolness, '$date' as `date` 3 | FROM review r JOIN business b 4 | ON (r.business_id = b.business_id) 5 | WHERE categories LIKE '%Restaurants%' 6 | AND `date` = '$date' 7 | GROUP BY r.business_id, name 8 | ORDER BY coolness DESC 9 | LIMIT 10 10 | 11 | -------------------------------------------------------------------------------- /hive-workflow/insert_table.hql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE top_cool 2 | SELECT r.business_id, name, SUM(cool) AS coolness, '${date}' as `date` 3 | FROM review r JOIN business b 4 | ON (r.business_id = b.business_id) 5 | WHERE categories LIKE '%Restaurants%' 6 | AND `date` = '${date}' 7 | GROUP BY r.business_id, name 8 | ORDER BY coolness DESC 9 | LIMIT 10 10 | 11 | -------------------------------------------------------------------------------- /hive-workflow/select_table.hql: -------------------------------------------------------------------------------- 1 | SELECT r.business_id, name, SUM(cool) AS coolness, '$date' as `date` 2 | FROM review r JOIN business b 3 | ON (r.business_id = b.business_id) 4 | WHERE categories LIKE '%Restaurants%' 5 | AND `date` = '$date' 6 | GROUP BY r.business_id, name 7 | ORDER BY coolness DESC 8 | LIMIT 10 9 | -------------------------------------------------------------------------------- /hive-workflow/select_top_n.sql: 
--------------------------------------------------------------------------------
CREATE TABLE top_cool AS
SELECT r.business_id, name, SUM(cool) AS coolness, '${date}' as `date`
FROM review r JOIN business b
ON (r.business_id = b.business_id)
WHERE categories LIKE '%Restaurants%'
AND `date` = '${date}'
GROUP BY r.business_id, name
ORDER BY coolness DESC
LIMIT ${n}
-------------------------------------------------------------------------------- /hue-ha/README.md: --------------------------------------------------------------------------------
Blog URL
========

[High Availability of Hue](http://gethue.com/automatic-high-availability-with-hue-and-cloudera-manager/) | [Hue HA](hue-ha)

-------------------------------------------------------------------------------- /hue-ha/howto.txt: --------------------------------------------------------------------------------
1. Install rsyslogd, edit its config file (/etc/rsyslog.conf), add the following lines at the very bottom, then restart rsyslogd:

   $ModLoad imudp
   $UDPServerAddress 127.0.0.1
   $UDPServerRun 514

2. Add the following to /etc/rsyslog.d/haproxy.conf:

   local6.* /var/log/haproxy.log

3. Create the file /tmp/haproxy.conf and add the following lines to it:

   global
       daemon
       nbproc 1
       maxconn 100000
       log 127.0.0.1 local6 debug

   defaults
       option http-server-close
       mode http
       timeout http-request 5s
       timeout connect 5s
       timeout server 10s
       timeout client 10s

   listen Hue 0.0.0.0:80
       log global
       mode http
       stats enable
       balance source
       server hue1 hue-ha-test1-1.ent.cloudera.com:8888 cookie ServerA check inter 2000 fall 3
       server hue2 hue-ha-test1-2.ent.cloudera.com:8888 cookie ServerB check inter 2000 fall 3

4. Download and gunzip HAProxy from here: http://haproxy.1wt.eu/download/1.4/bin/haproxy-1.4.24-pcre-40kses-linux-i586.stripped.gz.

5. Run ./haproxy-1.4.24-pcre-40kses-linux-i586.stripped -f /tmp/haproxy.conf

6. Go to localhost:7000 and see that Hue is running. Show /var/log/haproxy.log.

7. Go to http://hue-ha-test1-1.ent.cloudera.com:7180/ (admin:admin) and stop the Hue instance that you're on (you will probably have to stop each instance to figure out which one).

8. Go to Hue (it should appear down once). Refresh and show that Hue is back up. Show /var/log/haproxy.log.
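A quick way to check the failover from a shell (a hedged sketch; adjust the host and port to whatever the "listen Hue" line of your haproxy.conf binds):

   curl -I http://localhost:80/      # should return an HTTP response from whichever Hue backend is up
   tail -f /var/log/haproxy.log      # shows which backend served the request and any health-check state changes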
55 | 56 | -------------------------------------------------------------------------------- /hue-saml/shibboleth-conf/attribute-filter.xml: -------------------------------------------------------------------------------- 1 | 2 | 9 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 39 | 62 | 63 | 66 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /hue-saml/shibboleth-conf/logging.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 21 | 22 | 26 | 27 | 30 | 31 | /opt/shibboleth-idp/logs/idp-access.log 32 | 33 | 34 | /opt/shibboleth-idp/logs/idp-access-%d{yyyy-MM-dd}.log 35 | 36 | 37 | 38 | UTF-8 39 | %msg%n 40 | 41 | 42 | 43 | 44 | /opt/shibboleth-idp/logs/idp-audit.log 45 | 46 | 47 | /opt/shibboleth-idp/logs/idp-audit-%d{yyyy-MM-dd}.log 48 | 49 | 50 | 51 | UTF-8 52 | %msg%n 53 | 54 | 55 | 56 | 57 | /opt/shibboleth-idp/logs/idp-process.log 58 | 59 | 60 | /opt/shibboleth-idp/logs/idp-process-%d{yyyy-MM-dd}.log 61 | 62 | 63 | 64 | UTF-8 65 | %date{HH:mm:ss.SSS} - %level [%logger:%line] - %msg%n 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /hue-saml/shibboleth-conf/login.config: -------------------------------------------------------------------------------- 1 | /* 2 | This is the JAAS configuration file used by the Shibboleth IdP. 3 | 4 | A JAAS configuration file is a grouping of LoginModules defined in the following manner: 5 | ; 6 | 7 | LoginModuleClass - fully qualified class name of the LoginModule class 8 | Flag - indicates whether the requirement level for the modules; 9 | allowed values: required, requisite, sufficient, optional 10 | ModuleOptions - a space delimited list of name="value" options 11 | 12 | For complete documentation on the format of this file see: 13 | http://java.sun.com/j2se/1.5.0/docs/api/javax/security/auth/login/Configuration.html 14 | 15 | For LoginModules available within the Sun JVM see: 16 | http://java.sun.com/j2se/1.5.0/docs/guide/security/jaas/tutorials/LoginConfigFile.html 17 | 18 | Warning: Do NOT use Sun's JNDI LoginModule to authentication against an LDAP directory, 19 | Use the LdapLoginModule that ships with Shibboleth and is demonstrated below. 
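    In the LdapLoginModule example below: ldapUrl points at the directory server,
    baseDn is the subtree that is searched for accounts, ssl toggles LDAPS, and
    userFilter is the search filter used to match the login name ({0} is replaced
    by the username that was typed).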
20 | 21 | Note, the application identifier MUST be ShibUserPassAuth 22 | */ 23 | 24 | 25 | ShibUserPassAuth { 26 | 27 | // Example LDAP authentication 28 | // See: https://wiki.shibboleth.net/confluence/display/SHIB2/IdPAuthUserPass 29 | 30 | edu.vt.middleware.ldap.jaas.LdapLoginModule required 31 | ldapUrl="ldap://localhost:1389" 32 | baseDn="ou=People,dc=example,dc=com" 33 | ssl="false" 34 | userFilter="uid={0}"; 35 | 36 | 37 | // Example Kerberos authentication, requires Sun's JVM 38 | // See: https://wiki.shibboleth.net/confluence/display/SHIB2/IdPAuthUserPass 39 | /* 40 | com.sun.security.auth.module.Krb5LoginModule required 41 | useKeyTab="true" 42 | keyTab="/path/to/idp/keytab/file"; 43 | */ 44 | 45 | }; 46 | -------------------------------------------------------------------------------- /hue-saml/shibboleth-conf/service.xml: -------------------------------------------------------------------------------- 1 | 2 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 25 | 26 | 29 | 30 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 43 | 48 | -------------------------------------------------------------------------------- /impala/README.md: -------------------------------------------------------------------------------- 1 | Blog URL 2 | ======== 3 | 4 | TBD 5 | -------------------------------------------------------------------------------- /impala/avro_converter.pig: -------------------------------------------------------------------------------- 1 | REGISTER piggybank.jar 2 | 3 | 4 | data = load '/user/hive/warehouse/review/yelp_academic_dataset_review_clean.json' 5 | AS (funny:INT, useful:INT, cool:INT, user_id:CHARARRAY, review_id:CHARARRAY, 6 | stars:INT, text:CHARARRAY, business_id:CHARARRAY, date:CHARARRAY, type:CHARARRAY); 7 | 8 | data_clean = 9 | FILTER data 10 | BY funny IS NOT NULL 11 | AND useful IS NOT NULL 12 | AND cool IS NOT NULL 13 | AND user_id IS NOT NULL 14 | AND review_id IS NOT NULL 15 | AND business_id IS NOT NULL 16 | AND stars IS NOT NULL 17 | AND date IS NOT NULL 18 | AND type IS NOT NULL; 19 | 20 | STORE data_clean INTO 'impala/reviews_avro' 21 | USING org.apache.pig.piggybank.storage.avro.AvroStorage( 22 | '{ 23 | "schema": { 24 | "name": "review", 25 | "type": "record", 26 | "fields": [ 27 | {"name":"funny", "type":"int"}, 28 | {"name":"useful", "type":"int"}, 29 | {"name":"cool", "type":"int"}, 30 | {"name":"user_id", "type":"string"} 31 | {"name":"review_id", "type":"string"}, 32 | {"name":"stars", "type":"int"}, 33 | {"name":"text", "type":"string"}, 34 | {"name":"business_id", "type":"string"}, 35 | {"name":"date", "type":"string"}, 36 | {"name":"type", "type":"string"}, 37 | ]} 38 | }'); 39 | 40 | -------------------------------------------------------------------------------- /impala/create_avro_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE review_avro 2 | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' 3 | STORED AS 4 | inputformat 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' 5 | outputformat 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' 6 | LOCATION '/user/romain/impala/reviews_avro' 7 | tblproperties ('avro.schema.literal'='{ 8 | "name": "review", 9 | "type": "record", 10 | "fields": [ 11 | {"name":"business_id", "type":"string"}, 12 | {"name":"cool", "type":"int"}, 13 | {"name":"date", "type":"string"}, 14 | {"name":"funny", "type":"int"}, 15 | {"name":"review_id", "type":"string"}, 16 | {"name":"stars", "type":"int"}, 17 | {"name":"text", 
"type":"string"}, 18 | {"name":"type", "type":"string"}, 19 | {"name":"useful", "type":"int"}, 20 | {"name":"user_id", "type":"string"}]}'); 21 | 22 | -------------------------------------------------------------------------------- /impala/create_parquet_table.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE TABLE review_parquet LIKE review STORED AS PARQUETFILE; 3 | 4 | INSERT OVERWRITE review_parquet SELECT * FROM review; 5 | 6 | -------------------------------------------------------------------------------- /impala/sample_tables/alltypes.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS functional; 2 | DROP TABLE IF EXISTS functional.alltypes; 3 | CREATE EXTERNAL TABLE IF NOT EXISTS functional.alltypes ( 4 | id int COMMENT 'Add a comment', 5 | bool_col boolean, 6 | tinyint_col tinyint, 7 | smallint_col smallint, 8 | int_col int, 9 | bigint_col bigint, 10 | float_col float, 11 | double_col double, 12 | date_string_col string, 13 | string_col string, 14 | timestamp_col timestamp) 15 | PARTITIONED BY (year int, month int) 16 | ROW FORMAT delimited fields terminated by ',' escaped by '\\' 17 | STORED AS TEXTFILE 18 | LOCATION '/user/admin/alltypes/alltypes'; 19 | 20 | USE functional; 21 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2009, month=1); 22 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2009, month=2); 23 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2009, month=3); 24 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2009, month=4); 25 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2009, month=5); 26 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2009, month=6); 27 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2009, month=7); 28 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2009, month=8); 29 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2009, month=9); 30 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2009, month=10); 31 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2009, month=11); 32 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2009, month=12); 33 | 34 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2010, month=1); 35 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2010, month=2); 36 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2010, month=3); 37 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2010, month=4); 38 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2010, month=5); 39 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2010, month=6); 40 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2010, month=7); 41 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2010, month=8); 42 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2010, month=9); 43 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2010, month=10); 44 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2010, month=11); 45 | ALTER TABLE alltypes ADD IF NOT EXISTS PARTITION(year=2010, month=12); 46 | -------------------------------------------------------------------------------- /impala/sample_tables/alltypes.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/romainr/hadoop-tutorials-examples/a070880c7d44402997080d860bb49cc234dff879/impala/sample_tables/alltypes.zip 
-------------------------------------------------------------------------------- /impala/sample_tables/table_100_cols/create.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_100 ( 2 | id_0 int COMMENT 'Add a comment', 3 | bool_col_0 boolean, 4 | tinyint_col_0 tinyint, 5 | smallint_col_0 smallint, 6 | int_col_0 int, 7 | bigint_col_0 bigint, 8 | float_col_0 float, 9 | double_col_0 double, 10 | date_string_col_0 string, 11 | string_col_0 string, 12 | timestamp_col_0 timestamp, 13 | id_1 int COMMENT 'Add a comment', 14 | bool_col_1 boolean, 15 | tinyint_col_1 tinyint, 16 | smallint_col_1 smallint, 17 | int_col_1 int, 18 | bigint_col_1 bigint, 19 | float_col_1 float, 20 | double_col_1 double, 21 | date_string_col_1 string, 22 | string_col_1 string, 23 | timestamp_col_1 timestamp, 24 | id_2 int COMMENT 'Add a comment', 25 | bool_col_2 boolean, 26 | tinyint_col_2 tinyint, 27 | smallint_col_2 smallint, 28 | int_col_2 int, 29 | bigint_col_2 bigint, 30 | float_col_2 float, 31 | double_col_2 double, 32 | date_string_col_2 string, 33 | string_col_2 string, 34 | timestamp_col_2 timestamp, 35 | id_3 int COMMENT 'Add a comment', 36 | bool_col_3 boolean, 37 | tinyint_col_3 tinyint, 38 | smallint_col_3 smallint, 39 | int_col_3 int, 40 | bigint_col_3 bigint, 41 | float_col_3 float, 42 | double_col_3 double, 43 | date_string_col_3 string, 44 | string_col_3 string, 45 | timestamp_col_3 timestamp, 46 | id_4 int COMMENT 'Add a comment', 47 | bool_col_4 boolean, 48 | tinyint_col_4 tinyint, 49 | smallint_col_4 smallint, 50 | int_col_4 int, 51 | bigint_col_4 bigint, 52 | float_col_4 float, 53 | double_col_4 double, 54 | date_string_col_4 string, 55 | string_col_4 string, 56 | timestamp_col_4 timestamp, 57 | id_5 int COMMENT 'Add a comment', 58 | bool_col_5 boolean, 59 | tinyint_col_5 tinyint, 60 | smallint_col_5 smallint, 61 | int_col_5 int, 62 | bigint_col_5 bigint, 63 | float_col_5 float, 64 | double_col_5 double, 65 | date_string_col_5 string, 66 | string_col_5 string, 67 | timestamp_col_5 timestamp, 68 | id_6 int COMMENT 'Add a comment', 69 | bool_col_6 boolean, 70 | tinyint_col_6 tinyint, 71 | smallint_col_6 smallint, 72 | int_col_6 int, 73 | bigint_col_6 bigint, 74 | float_col_6 float, 75 | double_col_6 double, 76 | date_string_col_6 string, 77 | string_col_6 string, 78 | timestamp_col_6 timestamp, 79 | id_7 int COMMENT 'Add a comment', 80 | bool_col_7 boolean, 81 | tinyint_col_7 tinyint, 82 | smallint_col_7 smallint, 83 | int_col_7 int, 84 | bigint_col_7 bigint, 85 | float_col_7 float, 86 | double_col_7 double, 87 | date_string_col_7 string, 88 | string_col_7 string, 89 | timestamp_col_7 timestamp, 90 | id_8 int COMMENT 'Add a comment', 91 | bool_col_8 boolean, 92 | tinyint_col_8 tinyint, 93 | smallint_col_8 smallint, 94 | int_col_8 int, 95 | bigint_col_8 bigint, 96 | float_col_8 float, 97 | double_col_8 double, 98 | date_string_col_8 string, 99 | string_col_8 string, 100 | timestamp_col_8 timestamp, 101 | id_9 int COMMENT 'Add a comment', 102 | bool_col_9 boolean, 103 | tinyint_col_9 tinyint, 104 | smallint_col_9 smallint, 105 | int_col_9 int, 106 | bigint_col_9 bigint, 107 | float_col_9 float, 108 | double_col_9 double, 109 | date_string_col_9 string, 110 | string_col_9 string, 111 | timestamp_col_9 timestamp, 112 | id_10 int COMMENT 'Add a comment', 113 | bool_col_10 boolean, 114 | tinyint_col_10 tinyint, 115 | smallint_col_10 smallint, 116 | int_col_10 int, 117 | bigint_col_10 bigint, 118 | float_col_10 float, 119 | 
double_col_10 double, 120 | date_string_col_10 string, 121 | string_col_10 string, 122 | timestamp_col_10 timestamp 123 | 124 | ) 125 | 126 | ROW FORMAT delimited fields terminated by ',' escaped by '\\' 127 | STORED AS TEXTFILE 128 | LOCATION '/user/admin/table_100'; 129 | -------------------------------------------------------------------------------- /impala/select_table.hql: -------------------------------------------------------------------------------- 1 | SELECT r.business_id, name, SUM(cool) AS coolness, '$date' as `date` 2 | FROM review r JOIN business b 3 | ON (r.business_id = b.business_id) 4 | WHERE categories LIKE '%Restaurants%' 5 | AND `date` = '$date' 6 | GROUP BY r.business_id, name 7 | ORDER BY coolness DESC 8 | LIMIT 10 9 | -------------------------------------------------------------------------------- /notebook/shared_rdd/README.md: -------------------------------------------------------------------------------- 1 | Blog 2 | ==== 3 | 4 | [How to use the Livy Spark REST Job Server API for sharing Spark RDDs and contexts](http://gethue.com/how-to-use-the-livy-spark-rest-job-server-api-for-sharing-spark-rdds-and-contexts/) 5 | 6 | Read more on: 7 | http://gethue.com/spark/ 8 | 9 | -------------------------------------------------------------------------------- /notebook/shared_rdd/shareable_rdd.py: -------------------------------------------------------------------------------- 1 | 2 | # Start a named RDD on a remote Livy PypSpark session that simulates a shared in memory key/value store. 3 | # To start in a Livy PySpark session. 4 | 5 | class ShareableRdd(): 6 | 7 | def __init__(self): 8 | self.data = sc.parallelize([]) 9 | 10 | def get(self, key): 11 | return self.data.filter(lambda row: row[0] == key).take(1)[0] 12 | 13 | def set(self, key, value): 14 | new_key = sc.parallelize([[key, value]]) 15 | self.data = self.data.union(new_key) 16 | 17 | 18 | a = ShareableRdd() 19 | 20 | a.set('ak', 'Alaska') 21 | a.set('ca', 'California') 22 | 23 | 24 | a.get('ak') 25 | 26 | -------------------------------------------------------------------------------- /notebook/shared_rdd/shared_rdd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import requests\n", 12 | "import json\n", 13 | "\n", 14 | "\n", 15 | "class SharedRdd():\n", 16 | " \"\"\"\n", 17 | " Perform REST calls to a remote PySpark shell containing a Shared named RDD.\n", 18 | " \"\"\" \n", 19 | " def __init__(self, session_url, name):\n", 20 | " self.session_url = session_url\n", 21 | " self.name = name\n", 22 | " \n", 23 | " def get(self, key):\n", 24 | " return self._curl('%(rdd)s.get(\"%(key)s\")' % {'rdd': self.name, 'key': key})\n", 25 | " \n", 26 | " def set(self, key, value):\n", 27 | " return self._curl('%(rdd)s.set(\"%(key)s\", \"%(value)s\")' % {'rdd': self.name, 'key': key, 'value': value})\n", 28 | " \n", 29 | " def _curl(self, code):\n", 30 | " statements_url = self.session_url + '/statements'\n", 31 | " data = {'code': code}\n", 32 | " r = requests.post(statements_url, data=json.dumps(data), headers={'Content-Type': 'application/json'})\n", 33 | " resp = r.json()\n", 34 | " statement_id = str(resp['id'])\n", 35 | " while resp['state'] == 'running':\n", 36 | " r = requests.get(statements_url + '/' + statement_id)\n", 37 | " resp = r.json() \n", 38 | " if 'output' in resp:\n", 39 | " return 
resp['output']['data']\n", 40 | " else:\n", 41 | " return resp['data']\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "states = SharedRdd('http://localhost:8998/sessions/0', 'states')" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "print states.get('ak')\n", 64 | "print states.get('ca')" 65 | ] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "Python 2", 71 | "language": "python", 72 | "name": "python2" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 2 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython2", 84 | "version": "2.7.10" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 0 89 | } 90 | -------------------------------------------------------------------------------- /notebook/shared_rdd/shared_rdd.py: -------------------------------------------------------------------------------- 1 | 2 | # 3 | # Access a named RDD on a remote Livy PypSpark session that simulates a shared in memory key/value store. 4 | # To type in a regular Python shell. 5 | # Depends on: pip install requests 6 | # 7 | import requests 8 | import json 9 | 10 | 11 | class SharedRdd2(): 12 | 13 | def __init__(self, session_url, name): 14 | self.session_url = session_url 15 | self.name = name 16 | 17 | def get(self, key): 18 | return self._curl('%(rdd)s.get("%(key)s")' % {'rdd': self.name, 'key': key}) 19 | 20 | def set(self, key, value): 21 | return self._curl('%(rdd)s.set("%(key)s", "%(value)s")' % {'rdd': self.name, 'key': key, 'value': value}) 22 | 23 | def _curl(self, code): 24 | statements_url = self.session_url + '/statements' 25 | data = {'code': code} 26 | print data 27 | print statements_url 28 | r = requests.post(statements_url, data=json.dumps(data), headers={'Content-Type': 'application/json'}) 29 | resp = r.json() 30 | statement_id = str(resp['id']) 31 | while resp['state'] == 'running': 32 | r = requests.get(statements_url + '/' + statement_id) 33 | resp = r.json() 34 | if 'output' in resp: # Case Livy returns automatically 35 | return resp['output']['data'] 36 | else: 37 | return resp['data'] 38 | 39 | 40 | states = SharedRdd2('http://localhost:8998/sessions/0', 'states') 41 | 42 | 43 | states.get('ak') 44 | states.get('ca') 45 | -------------------------------------------------------------------------------- /oozie/credentials/hive-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 21 | 22 | 30 | 31 | 32 | hive.metastore.local 33 | false 34 | 35 | 36 | hive.metastore.uris 37 | thrift://test-cdh5-hue.ent.cloudera.com:9083 38 | 39 | 40 | hive.metastore.client.socket.timeout 41 | 300 42 | 43 | 44 | hive.metastore.warehouse.dir 45 | /user/hive/warehouse 46 | 47 | 48 | hive.warehouse.subdir.inherit.perms 49 | true 50 | 51 | 52 | mapred.reduce.tasks 53 | -1 54 | 55 | 56 | hive.exec.reducers.bytes.per.reducer 57 | 1073741824 58 | 59 | 60 | hive.exec.reducers.max 61 | 999 62 | 63 | 64 | hive.metastore.execute.setugi 65 | true 66 | 67 | 68 | hive.support.concurrency 69 | true 70 | 71 | 72 | hive.zookeeper.quorum 73 | test-cdh5-hue.ent.cloudera.com 74 | 75 | 76 | hive.zookeeper.client.port 77 | 2181 78 | 79 | 
80 | 81 | 82 | hive.zookeeper.namespace 83 | hive_zookeeper_namespace_HIVE-1 84 | 85 | 86 | hive.server2.enable.doAs 87 | true 88 | 89 | 90 | fs.hdfs.impl.disable.cache 91 | true 92 | 93 | 94 | hive.metastore.sasl.enabled 95 | true 96 | 97 | 98 | hive.server2.authentication 99 | kerberos 100 | 101 | 102 | hive.metastore.kerberos.principal 103 | hive/_HOST@ENT.CLOUDERA.COM 104 | 105 | 106 | hive.server2.authentication.kerberos.principal 107 | hive/_HOST@ENT.CLOUDERA.COM 108 | 109 | 110 | hive.stats.dbclass 111 | jdbc:mysql 112 | 113 | 114 | hive.stats.jdbcdriver 115 | com.mysql.jdbc.Driver 116 | 117 | 118 | hive.stats.dbconnectionstring 119 | jdbc:mysql://test-cdh5-hue.ent.cloudera.com:3306/hive1?user=hive1&password=hive1 120 | 121 | 122 | hive.aux.jars.path 123 | file:///usr/share/java//mysql-connector-java.jar 124 | 125 | 126 | hbase.zookeeper.quorum 127 | test-cdh5-hue.ent.cloudera.com 128 | 129 | 130 | hbase.zookeeper.property.clientPort 131 | 2181 132 | 133 | 134 | -------------------------------------------------------------------------------- /oozie/credentials/hive.sql: -------------------------------------------------------------------------------- 1 | show tables; 2 | 3 | select count(*) from sample_07; -------------------------------------------------------------------------------- /oozie/credentials/workflow.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | hcat.metastore.uri 7 | thrift://test-cdh5-hue.ent.cloudera.com:9083 8 | 9 | 10 | hcat.metastore.principal 11 | hive/test-cdh5-hue.ent.cloudera.com@ENT.CLOUDERA.COM 12 | 13 | 14 | 15 | 16 | 17 | 18 | ${jobTracker} 19 | ${nameNode} 20 | hive-config.xml 21 | 22 | hive-config.xml#hive-config.xml 23 | 24 | 25 | 26 | 27 | 28 | Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /oozie/el-functions/rkanter/MyELFunctions.java: -------------------------------------------------------------------------------- 1 | package rkanter; 2 | 3 | public class MyELFunctions { 4 | /** 5 | * Compares two strings while ignoring case. 6 | * 7 | * @param s1 first string. 8 | * @param s2 second string. 
9 | * @return true if s1 and s2 are equal, regardless of case, and false otherwise 10 | */ 11 | public static boolean equalsIgnoreCase(String s1, String s2) { 12 | if (s1 == null) { 13 | return (s2 == null); 14 | } 15 | return s1.equalsIgnoreCase(s2); 16 | } 17 | } -------------------------------------------------------------------------------- /oozie/hiveserver2-action/hive-site3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | hive.metastore.local 6 | false 7 | 8 | 9 | hive.metastore.uris 10 | thrift://test-cdh5-hue.ent.cloudera.com:9083 11 | 12 | 13 | hive.metastore.client.socket.timeout 14 | 300 15 | 16 | 17 | hive.metastore.warehouse.dir 18 | /user/hive/warehouse 19 | 20 | 21 | hive.warehouse.subdir.inherit.perms 22 | true 23 | 24 | 25 | mapred.reduce.tasks 26 | -1 27 | 28 | 29 | hive.exec.reducers.bytes.per.reducer 30 | 1073741824 31 | 32 | 33 | hive.exec.reducers.max 34 | 999 35 | 36 | 37 | hive.metastore.execute.setugi 38 | true 39 | 40 | 41 | hive.support.concurrency 42 | true 43 | 44 | 45 | hive.zookeeper.quorum 46 | test-cdh5-hue.ent.cloudera.com 47 | 48 | 49 | hive.zookeeper.client.port 50 | 2181 51 | 52 | 53 | 54 | 55 | hive.zookeeper.namespace 56 | hive_zookeeper_namespace_HIVE-1 57 | 58 | 59 | hive.server2.enable.doAs 60 | true 61 | 62 | 63 | fs.hdfs.impl.disable.cache 64 | true 65 | 66 | 67 | hive.metastore.sasl.enabled 68 | true 69 | 70 | 71 | hive.server2.authentication 72 | kerberos 73 | 74 | 75 | hive.metastore.kerberos.principal 76 | hive/_HOST@ENT.CLOUDERA.COM 77 | 78 | 79 | hive.server2.authentication.kerberos.principal 80 | hive/_HOST@ENT.CLOUDERA.COM 81 | 82 | 83 | hive.stats.dbclass 84 | jdbc:mysql 85 | 86 | 87 | hive.stats.jdbcdriver 88 | com.mysql.jdbc.Driver 89 | 90 | 91 | hive.stats.dbconnectionstring 92 | jdbc:mysql://test-cdh5-hue.ent.cloudera.com:3306/hive1?user=hive1&password=hive1 93 | 94 | 95 | hive.aux.jars.path 96 | file:///usr/share/java//mysql-connector-java.jar 97 | 98 | 99 | hbase.zookeeper.quorum 100 | test-cdh5-hue.ent.cloudera.com 101 | 102 | 103 | hbase.zookeeper.property.clientPort 104 | 2181 105 | 106 | 107 | -------------------------------------------------------------------------------- /oozie/hiveserver2-action/select_genericl.sql: -------------------------------------------------------------------------------- 1 | !connect ${connectString} systest ${hive.password} org.apache.hive.jdbc.HiveDriver 2 | select count(*) from ${tableName}; 3 | -------------------------------------------------------------------------------- /oozie/hiveserver2-action/workflow.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | hive2.jdbc.url 7 | jdbc:hive2://test-cdh5-hue.ent.cloudera.com:10000/default 8 | 9 | 10 | hive2.server.principal 11 | hive/test-cdh5-hue.ent.cloudera.com@ENT.CLOUDERA.COM 12 | 13 | 14 | 15 | 16 | 17 | 18 | ${jobTracker} 19 | ${nameNode} 20 | hive-site3.xml 21 | 22 | 23 | oozie.action.sharelib.for.hive 24 | hive2 25 | 26 | 27 | oozie.launcher.action.main.class 28 | org.apache.oozie.action.hadoop.Hive2Main 29 | 30 | 31 | 32 | connectString=${connectString} 33 | tableName=${tableName} 34 | 35 | 36 | 37 | 38 | 39 | Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /oozie/workflow_demo/bundle.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 
oozie.use.system.libpath 7 | true 8 | 9 | 10 | 11 | 2013-06-01T00:00Z 12 | 13 | 14 | ${nameNode}/user/hue/oozie/deployments/_romain_-oozie-827-1384552268.08 15 | 16 | 17 | wf_application_path 18 | hdfs://localhost:8020/user/hue/oozie/deployments/_romain_-oozie-807-1384552240.98 19 | 20 | 21 | 22 | 23 | ${nameNode}/user/hue/oozie/deployments/_romain_-oozie-828-1384552298.63 24 | 25 | 26 | wf_application_path 27 | hdfs://localhost:8020/user/hue/oozie/deployments/_romain_-oozie-806-1384552283.32 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /oozie/workflow_demo/coordinator.xml: -------------------------------------------------------------------------------- 1 | 5 | 6 | 8 | /user/romain/words/201207${DAY} 9 | 10 | 11 | 12 | 13 | 14 | ${coord:current(0)} 15 | 16 | 17 | 18 | 19 | ...workflow.xml 20 | 21 | 22 | input 23 | ${coord:dataOut('DAILY_WORDS')} 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /oozie/workflow_demo/job.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Hadoop tutorials project 3 | # 4 | 5 | input=/user/hue/pig/examples/data/midsummer.txt 6 | 7 | oozie.use.system.libpath=true 8 | 9 | nameNode=hdfs://localhost:8020 10 | jobTracker=localhost:8021 11 | queueName=default 12 | examplesRoot=examples 13 | 14 | oozie.wf.application.path=/user/romain/examples/workflow_demo 15 | outputDir=map-reduce 16 | -------------------------------------------------------------------------------- /oozie/workflow_demo/lib/piggybank.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/romainr/hadoop-tutorials-examples/a070880c7d44402997080d860bb49cc234dff879/oozie/workflow_demo/lib/piggybank.jar -------------------------------------------------------------------------------- /oozie/workflow_demo/script.pig: -------------------------------------------------------------------------------- 1 | REGISTER piggybank.jar; 2 | 3 | data = LOAD '$input' as (text:CHARARRAY); 4 | upper_case = FOREACH data GENERATE org.apache.pig.piggybank.evaluation.string.UPPER(text); 5 | 6 | STORE upper_case INTO '$output' ; 7 | -------------------------------------------------------------------------------- /oozie/workflow_demo/workflow.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ${jobTracker} 6 | ${nameNode} 7 | 8 | -param 9 | input=${input} 10 | -param 11 | output=${output} 12 | 13 | 14 | 15 | 16 | 17 | Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] 18 | 19 | 20 | -------------------------------------------------------------------------------- /pig-json-python-udf/README.md: -------------------------------------------------------------------------------- 1 | Blog URL 2 | ======== 3 | 4 | [Prepare the data for analysis with Pig and Python UDF](http://gethue.com/hadoop-tutorials-ii-1-prepare-the-data-for-analysis/) 5 | -------------------------------------------------------------------------------- /pig-json-python-udf/clean_json.pig: -------------------------------------------------------------------------------- 1 | -- funny, useful, cool,user_id, review_id, stars, text, date, type, business_id 2 | 3 | reviews = load 'yelp_academic_dataset_review.json' 4 | using JsonLoader('votes:map[],user_id:chararray,review_id:chararray,stars:int,date:chararray,text:chararray,type:chararray,business_id:chararray'); 
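-- 'votes' is loaded as a Pig map, so votes#'funny' etc. below pull the individual vote
-- counts out of it; the (INT) casts turn them into plain integer columns, and the nested
-- REPLACE calls strip newlines and tabs from the review text so each record stays on one
-- line in the tab-separated output.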
5 | 6 | tabs = FOREACH reviews generate 7 | (INT) votes#'funny', (INT) votes#'useful', (INT) votes#'cool', user_id, review_id, stars, REPLACE(REPLACE(text, '\n', ''), '\t', ''), date, type, business_id; 8 | 9 | STORE tabs INTO 'yelp_academic_dataset_review.tsv'; 10 | 11 | -------------------------------------------------------------------------------- /pig-json-python-udf/converter.py: -------------------------------------------------------------------------------- 1 | from com.xhaus.jyson import JysonCodec as json 2 | 3 | @outputSchema("business:chararray") 4 | def tsvify(line): 5 | business_json = json.loads(line) 6 | business = map(unicode, business_json.values()) 7 | return '\t'.join(business).replace('\n', ' ').encode('utf-8') 8 | 9 | -------------------------------------------------------------------------------- /pig-json-python-udf/python_udf.pig: -------------------------------------------------------------------------------- 1 | -- city, review_count, name, neighborhoods, type, business_id, full_address, state, longitude, stars, latitude, open, categories 2 | 3 | REGISTER 'converter.py' USING jython AS converter; 4 | 5 | 6 | reviews = LOAD 'yelp_academic_dataset_business.json' AS (line:CHARARRAY); 7 | 8 | tsv = FOREACH reviews GENERATE converter.tsvify(line); 9 | 10 | STORE tsv INTO 'yelp_academic_dataset_business.tsv'; 11 | 12 | -------------------------------------------------------------------------------- /search/indexing/README.md: -------------------------------------------------------------------------------- 1 | Blog URL 2 | ======== 3 | 4 | Dynamic search dashboards with Solr 5 | http://gethue.com/hadoop-search-dynamic-search-dashboards-with-solr/ 6 | 7 | How to index 8 | http://gethue.com/index-and-search-data-with-hadoop-and-solr 9 | 10 | -------------------------------------------------------------------------------- /search/indexing/apache_logs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Parses Apache Log files into a CSV data file ready to be indexed by Solr. 4 | 5 | Input format: 6 | demo.gethue.com:80 49.206.186.56 - - [04/May/2014:07:38:53 +0000] "GET /oozie/?format=json&type=running HTTP/1.1" 200 324 "http://demo.gethue.com/oozie/" "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.132 Safari/537.36" 7 | 8 | Requires these libraries: 9 | pip install pyyaml ua-parser 10 | https://github.com/tobie/ua-parser 11 | 12 | pip install pygeoip 13 | https://github.com/appliedsec/pygeoip 14 | 15 | Download http://geolite.maxmind.com/download/geoip/database/GeoLiteCity.dat.gz 16 | from http://dev.maxmind.com/geoip/legacy/geolite/ 17 | 18 | Script is quick & dirty and given as an example. 19 | """ 20 | 21 | import csv 22 | import re 23 | import uuid 24 | 25 | import pygeoip 26 | 27 | from datetime import datetime 28 | from ua_parser import user_agent_parser 29 | 30 | 31 | INPUT_LOGS = 'other_vhosts_access.log' 32 | OUTPUT_CSV = 'index_data.csv' 33 | MAX_ROWS = 1000 34 | 35 | LINE_RE = re.compile('(?P.+?) (?P[(\d\.)]+) - - \[(?P