├── .gitignore ├── .gitmodules ├── README.md ├── avro ├── .gitignore ├── README.org ├── devices │ ├── README.md │ ├── avro │ │ ├── __init__.py │ │ ├── datafile.py │ │ ├── io.py │ │ ├── ipc.py │ │ ├── protocol.py │ │ ├── schema.py │ │ ├── tool.py │ │ └── txipc.py │ ├── devices.avro │ └── read_devices.py ├── download_avro.sh ├── read_users.py ├── run.sh ├── user.avsc ├── write_bunch_of_users.py └── write_users.py ├── cloudera-director └── api-examples │ └── list-instances.py ├── cloudera-manager └── python_rest_api │ ├── README.md │ ├── cm-dump-config.py │ ├── simple_cluster_properties.py │ └── simple_config_settings.py ├── data-engineering ├── README.md ├── create_txnl_tbl.hql ├── create_txnl_tbl.sh ├── insert_into_products_txnl.hql ├── insert_into_products_txnl.sh ├── query_txnl.hql ├── query_txnl_using_hive.sh ├── query_txnl_using_impala.sh ├── update_txnl_using_hive.hql ├── update_txnl_using_hive.sh └── view_dir_structure.sh ├── hbase ├── apitests │ ├── README.md │ ├── pom.xml │ ├── run.sh │ └── src │ │ ├── main │ │ └── java │ │ │ ├── gui │ │ │ └── HBaseBlaster.java │ │ │ ├── misc │ │ │ └── CreateTable.java │ │ │ └── util │ │ │ └── HBaseUtility.java │ │ └── test │ │ └── java │ │ └── apitests │ │ └── AppTest.java ├── authorization │ └── simple │ │ └── README.md ├── client_maven_example │ └── pom.xml ├── colfam_flush │ ├── .gitignore │ ├── README.md │ └── insert_random_words.py ├── groovy │ └── loadRandomData │ │ ├── README.md │ │ ├── compile.sh │ │ ├── loadRandomData.groovy │ │ └── run.sh ├── hbase-sandbox │ ├── README.md │ ├── pom.xml │ ├── runStopRowThing.sh │ └── src │ │ └── main │ │ ├── java │ │ └── hbasesandbox │ │ │ ├── StopRowThing.java │ │ │ └── UtilityThing.java │ │ └── resources │ │ └── log4j.properties ├── hbase_blocks │ └── hbase_blocks.rb ├── hbase_hive_impala │ ├── README.org │ ├── create_hbase_table.rb │ ├── create_hive_hbase_table.sql │ ├── map-hive-to-hbase-ratings.sql │ ├── put_data_into_sales_aggregate_table.sh │ └── select_some_ratings.sql ├── hotSpots │ ├── README.md │ ├── TransactionFactory.groovy │ ├── TransactionImporter.groovy │ ├── compile.sh │ └── run.sh ├── null_safe_joins │ ├── create_and_query.sql │ ├── create_and_query.txt │ └── data.rb ├── random_words_python │ ├── README.md │ └── insert_random_words.py ├── recordGenerator │ ├── PutRandomRecords.groovy │ └── run.sh ├── schema_design │ └── schema_design.org ├── shell_stuff │ ├── README.md │ ├── check_new_stuff │ │ ├── alter_table_properties_async.rb │ │ ├── alter_versions_async.rb │ │ ├── colfam_async.rb │ │ ├── create_simple_table.rb │ │ ├── create_table_shorthand.rb │ │ ├── create_table_two_colfams.rb │ │ ├── create_table_two_versions.rb │ │ ├── delete.rb │ │ ├── delete_colfam_async.rb │ │ ├── get_colfam.rb │ │ ├── namespace_create.rb │ │ ├── namespace_tables.rb │ │ ├── new_shell_commands.rb │ │ ├── parameter_create_tbl.rb │ │ ├── run_parameter_script.sh │ │ ├── scan_examples.rb │ │ ├── scan_filter.rb │ │ ├── unknown_arguments_warning.rb │ │ └── versions_async.rb │ ├── inspect_HTable.rb │ ├── list_regions.rb │ └── list_tables.rb └── simpleConnection │ ├── SimpleCreateAndPut.groovy │ ├── compile.sh │ └── run.sh ├── hdfs ├── data-visibility │ ├── README.md │ └── foo.pl ├── hdfs-cheatsheet.md ├── replication │ └── run.sh └── webhdfs-httpfs │ ├── run.sh │ └── testdata.txt ├── hive ├── crlf │ ├── data.txt │ ├── data_unix.txt │ ├── get_max.sql │ └── run.sh ├── debate │ ├── analyze_debate.hql │ └── debate.txt ├── incremental_insert │ ├── README.TXT │ ├── employees.txt │ ├── join_table.sql │ ├── load_and_run.sql │ 
├── more_nicknames.txt │ ├── nicknames.txt │ └── run.sh ├── partition-example │ ├── README.TXT │ ├── create_and_load_employees.sql │ ├── employees.txt │ ├── get_partition_info.sql │ ├── partition_employees.sql │ ├── partition_employees_keep_orig_data.sql │ └── run.sh ├── simple_queries │ ├── README.md │ ├── create_tables.sql │ ├── customers.txt │ ├── load_data.sh │ ├── orders.txt │ └── subquery_in_where.sql ├── transform │ ├── awk-example.sh │ ├── legalpets.pl │ └── transform-pets.hql └── wordcount │ ├── README.TXT │ ├── README.md │ ├── compare.hql │ ├── create-external-table-for-mapreduce-output.hql │ ├── run-mr-and-hive-queries.sh │ ├── run.sh │ └── wordcount.hql ├── impala ├── README.md ├── analytic-functions │ ├── ads.txt │ ├── avg_ads.sql │ ├── avg_ads.txt │ ├── create_ads.sql │ ├── impala-version.txt │ ├── lag_ads.sql │ ├── lag_ads.txt │ └── run.sh ├── datatypes │ └── decimal_vs_integer │ │ ├── README.md │ │ ├── create_table.sql │ │ ├── data.txt │ │ ├── run.sh │ │ └── run_queries.sql ├── dyn_test │ ├── README.md │ ├── branch_totals_monday.txt │ ├── branch_totals_tuesday.txt │ ├── dyn_part.sql │ ├── run_me_hive.sh │ └── run_me_impala.sh ├── file_format_shootout │ ├── README.TXT │ ├── count.sh │ ├── create_and_populate_parquet_table.sql │ ├── create_rc_and_sequencefile_table.sql │ ├── drop_tables.sql │ ├── du_tables.sh │ ├── populate_rc_and_sequencefile_table.sql │ ├── q19.sql │ ├── run.sh │ └── run_q_19.sh ├── google-ngrams │ ├── README.md │ ├── count_spark.sql │ ├── find_spark.sql │ └── run.sh ├── impala-impyla-playground │ ├── README.TXT │ ├── data │ │ └── simple.txt │ └── simple.py ├── impyla │ └── query_impala.py ├── parquet │ ├── README.txt │ └── run.sh ├── refresh-and-invalidate │ ├── create-table.sql │ ├── monday.txt │ ├── run.sh │ ├── tuesday.txt │ └── wednesday.txt ├── simple_queries │ ├── create_tables.sql │ ├── customers.txt │ ├── load_data.sh │ ├── orders.txt │ └── subquery_in_where.sql ├── timestamps │ ├── README.md │ ├── queries.sql │ └── querying_timestamps.sql ├── tpcds │ ├── frequent_customers.sql │ └── run.sh └── tuning │ ├── compare_store_sales.sql │ ├── results.txt │ └── show_summary.sql ├── kafka-examples ├── .gitignore ├── README.md ├── THIS_IS_COOL.properties ├── log4jConfigs │ └── seekToBeginning.properties ├── pom.xml └── src │ └── main │ ├── java │ └── hadoop │ │ └── examples │ │ └── kafka │ │ ├── AdminClientExamples.java │ │ ├── SeekToBeginningListener.java │ │ ├── SimpleConsumer.java │ │ └── SimpleProducer.java │ └── resources │ └── log4j.properties ├── kite-sdk ├── README.md ├── install-kite-cli.sh └── simple-cli │ ├── README.md │ ├── run.sh │ ├── sandwich.avsc │ └── sandwiches.csv ├── kudu ├── dataframes │ └── kuduDF.scala └── range-partitioning │ ├── README.md │ ├── RUNME.sh │ ├── create_hashed_metrics.sql │ ├── create_people.sql │ └── people.txt ├── mr ├── kill_job_from_mapper │ ├── README.md │ ├── compile.sh │ ├── run.sh │ ├── solution │ │ ├── KillMapper.java │ │ └── TryKill.java │ └── somedata.txt ├── local_jobrunner │ ├── detect-local-filesystem │ │ └── LocalJobRunnerDriver.java │ └── simple-example │ │ ├── .gitignore │ │ ├── SimpleDriver.java │ │ ├── compile.sh │ │ ├── run.sh │ │ └── somedata.txt ├── map_only_streaming │ ├── mapper.pl │ └── run.sh ├── maven_project_template_CDH4 │ ├── .classpath │ ├── .project │ ├── .settings │ │ └── org.eclipse.jdt.core.prefs │ ├── README.txt │ ├── TUTORIAL │ │ └── Maven and CDH4.odt │ ├── pom.xml │ ├── pom.xml~ │ ├── src │ │ ├── main │ │ │ └── java │ │ │ │ ├── CDHTRAINING │ │ │ │ └── App.java │ │ │ │ ├── 
SumReducer.java │ │ │ │ └── WordMapper.java │ │ └── test │ │ │ └── java │ │ │ ├── CDHTRAINING │ │ │ └── AppTest.java │ │ │ └── TestWordCount.java │ └── target │ │ ├── classes │ │ ├── CDHTRAINING │ │ │ └── App.class │ │ ├── SumReducer.class │ │ └── WordMapper.class │ │ ├── surefire-reports │ │ ├── CDHTRAINING.AppTest.txt │ │ ├── TEST-CDHTRAINING.AppTest.xml │ │ ├── TEST-TestWordCount.xml │ │ └── TestWordCount.txt │ │ └── test-classes │ │ ├── CDHTRAINING │ │ └── AppTest.class │ │ └── TestWordCount.class ├── nlineinputformat │ ├── README.md │ ├── generate_task_list.py │ ├── mapper.pl │ ├── run.sh │ └── task_list.txt ├── rest_api │ └── basic.sh ├── streaming_config_dumper │ ├── mapper.pl │ ├── part-00000 │ ├── reducer.pl │ ├── run.sh │ └── something.txt ├── total_order_partitioner │ ├── .classpath │ ├── .project │ ├── .settings │ │ └── org.eclipse.jdt.core.prefs │ ├── README.txt │ ├── bin │ │ └── solution │ │ │ ├── ProcessLogs.class │ │ │ ├── domain │ │ │ └── MapperFunction.class │ │ │ └── mr │ │ │ ├── CountReducer.class │ │ │ ├── IdentityMapper.class │ │ │ ├── LogMonthMapper.class │ │ │ ├── SumReducer.class │ │ │ └── WordMapper.class │ ├── conf │ │ └── log4j.properties │ ├── src │ │ └── solution │ │ │ ├── ProcessLogs.java │ │ │ ├── domain │ │ │ └── MapperFunction.java │ │ │ └── mr │ │ │ ├── CountReducer.java │ │ │ ├── IdentityMapper.java │ │ │ ├── LogMonthMapper.java │ │ │ ├── SumReducer.java │ │ │ └── WordMapper.java │ └── tot-ord-part.jardesc └── yarn_containers │ ├── README.md │ ├── SleepJob.java │ ├── run.sh │ └── test_container_boundaries │ ├── README.md │ ├── SleepJobWithArray.java │ └── run.sh ├── pig ├── configuration │ ├── README.md │ ├── fixpig.sh │ ├── log4j.local │ ├── log4j.properties │ ├── pig.properties │ ├── pig.properties.data_analyst_vm │ └── pig0.12 │ │ ├── README.md │ │ ├── conf │ │ └── log4j.properties │ │ ├── run.sh │ │ ├── sales.pig │ │ └── sales.txt ├── explain-split-vs-filter │ ├── explain-using-dot.sh │ ├── using-filter.pig │ ├── using-split.pig │ └── webcrawl.txt ├── generate │ ├── conf │ │ └── log4j.properties │ ├── run.sh │ ├── sales.pig │ └── sales.txt ├── hcatalog │ ├── run.sh │ ├── sample_store_sales.pig │ └── store_sales.pig ├── local-mode-hacks │ ├── README.md │ ├── read_some_hdfs_data.pig │ ├── run.sh │ └── somedata.txt ├── round │ ├── data.txt │ ├── results.txt │ └── round_this.pig └── sampling │ └── sample_tpcds.pig ├── spark ├── data-generator │ └── hash-data-generator.scala ├── data-parsing │ ├── data-parsing-using-try.scala │ └── data-parsing.scala ├── dataframes │ ├── README.org │ ├── alias.scala │ ├── analyzingExerciseUsingSparkSQL.py │ ├── antiJoin.py │ ├── columnExpresssions.scala │ ├── column_rename_after_joins.py │ ├── corruptRecords.py │ ├── corruptRecords.scala │ ├── creatingDataFrames.scala │ ├── grouping.scala │ ├── hadoop-examples-data │ │ ├── 01.csv │ │ ├── data.csv │ │ ├── interests.json │ │ ├── left.csv │ │ ├── maxVals.json │ │ ├── people.json │ │ ├── right.csv │ │ └── two.csv │ ├── hdfs_partitions.py │ ├── joins.py │ ├── joins.scala │ ├── rowFunctions.scala │ ├── saveDataFrameToDataSource.scala │ ├── schemasCSV.scala │ ├── spark_sql_udfs.py │ ├── windowing.py │ └── withColumn.scala ├── get-python-examples.sh ├── local_file.scala ├── log-level │ ├── README.md │ ├── log4j.properties │ ├── log4j.properties.with.debug.log.level │ └── run.sh ├── maven_example │ ├── README.md │ └── pom.xml ├── pair │ ├── sales.txt │ ├── sales_by_salesperson.scala │ ├── weblogs.scala │ └── weblogs.txt ├── run.sh ├── simple │ ├── core-site.xml │ ├── count.scala │ 
├── data.txt │ └── log4j.properties ├── somedata.txt ├── spark-sql-scripts │ ├── README.md │ ├── computeStats.md │ ├── create_table_and_load.scala │ ├── create_table_and_load_parquet.scala │ ├── data │ │ └── people.json │ └── parse_json.scala ├── spark-sql │ ├── README.md │ ├── data │ │ └── favorite_foods │ │ │ └── favorite_foods.txt │ ├── pom.xml │ └── src │ │ └── main │ │ ├── resource │ │ └── log4j.properties │ │ └── scala │ │ └── examples │ │ ├── SimpleShowTables.scala │ │ ├── explode_and_friends.scala │ │ └── more_sql.scala ├── sparkml │ └── kmeans.py ├── structured_streaming_sensors │ ├── rate_source_simple.py │ └── rate_source_with_more_data.py ├── tf-idf │ └── tf-idf.spark ├── wordcount.scala └── wordlength_with_details │ ├── data.txt │ ├── log4j.properties │ └── wordlength_with_details.scala ├── sql-diffs ├── .gitignore ├── README.md ├── accts.txt ├── load_data.sh ├── query1.sql ├── query2.sql ├── query3.sql ├── query4.sql ├── query5.sql └── run_queries.sh ├── sqoop ├── README.md └── sqoop-job-create └── utils ├── random_crash └── run.sh ├── setup_env.sh ├── sleepjob ├── allocations.xml ├── bigjob.sh ├── submitLONGSleepJob.sh └── submitReportsBOSSPool.sh └── teragen-and-terasort.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | .DS_Store 3 | *.class 4 | *.jar 5 | *.pyc 6 | *.log 7 | *.*~ 8 | metastore_db 9 | *.classpath 10 | *.project 11 | target/ 12 | tags 13 | .settings 14 | index.html 15 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "impala-tpcds-kit"] 2 | path = impala-tpcds-kit 3 | url = hadoop-examples:NathanNeff/impala-tpcds-kit.git 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Hadoop Examples 2 | =============== 3 | 4 | Hadoop Examples is a set of simple example scripts to illustrate Hadoop ecosystem 5 | tools like Hive and Pig. 6 | 7 | Installation 8 | ------------- 9 | EXAMPLES_DIR is an environment variable you can set to point to the directory 10 | where the hadoop-examples.jar is installed. 11 | 12 | There is also a script: utils/setup_env.sh that can be sourced inside other 13 | shell scripts to try to find the hadoop-examples.jar. It is ugly, but 14 | sometimes convenient :-/ 15 | 16 | # Release Notes 17 | 18 | ## HBase Block Size 19 | 20 | November 2016 21 | 22 | HBase Block Size utility =hbase/hbase_blocks/hbase_blocks.rb= creates a table with 23 | a specified HBase block size. Writes data, flushes, then uses admin object to 24 | get the region name. Displays exact command =hbase hfile= to use to view the store 25 | file's index. Some okay/kinda cool JRuby stuff there. 26 | 27 | 28 | Streaming Config Dumper 29 | ----------------------- 30 | 31 | MapReduce scripts to print their ENV variables, which also include 32 | Hadoop configuration stuff for streaming jobs. 
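As a rough illustration (not part of the repo — the bundled mapper under =mr/streaming_config_dumper/= is written in Perl), a Hadoop Streaming mapper that dumps its environment can be as small as the following Python sketch:

    #!/usr/bin/env python
    # Emit every environment variable the streaming task was started with,
    # one tab-separated name/value pair per line.
    import os
    import sys

    for name, value in sorted(os.environ.items()):
        sys.stdout.write("%s\t%s\n" % (name, value))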
33 | 34 | See =mr/streaming_config_dumper/= 35 | 36 | 37 | Hive and Pig 38 | ------------ 39 | 12/20/2013 40 | 41 | - Incremental insert example in Hive 42 | Inserts non-duplicate data into a join table from incrementally updated 43 | source tables See hive/incremental_insert/ 44 | 45 | - Added example of Pig's EXPLAIN command to show a diagram of the execution plan 46 | for SPLIT versus FILTER 47 | See pig/explain-split-vs-filter/ 48 | 49 | - Added example of Hive's PARTITION feature 50 | See hive/partition-example/ 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /avro/.gitignore: -------------------------------------------------------------------------------- 1 | users.avro 2 | -------------------------------------------------------------------------------- /avro/README.org: -------------------------------------------------------------------------------- 1 | Tutorial from: 2 | http://avro.apache.org/docs/current/gettingstartedpython.html 3 | 4 | And: 5 | http://www.harshj.com/2010/04/25/writing-and-reading-avro-data-files-using-python/ 6 | -------------------------------------------------------------------------------- /avro/devices/README.md: -------------------------------------------------------------------------------- 1 | The avro directory was copied from an Avro egg that 2 | came with Hue 3 | -------------------------------------------------------------------------------- /avro/devices/avro/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
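# NOTE: this vendored copy of the Avro Python package was taken from the Avro
# egg shipped with Hue (see ../README.md); it is not maintained in this repo.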
16 | 17 | __all__ = ['schema', 'io', 'datafile', 'protocol', 'ipc'] 18 | 19 | -------------------------------------------------------------------------------- /avro/devices/devices.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/avro/devices/devices.avro -------------------------------------------------------------------------------- /avro/devices/read_devices.py: -------------------------------------------------------------------------------- 1 | import avro.schema 2 | from avro.datafile import DataFileReader, DataFileWriter 3 | from avro.io import DatumReader, DatumWriter 4 | 5 | reader = DataFileReader(open("devices.avro", "r"), DatumReader()) 6 | for device in reader: 7 | print device 8 | reader.close() 9 | -------------------------------------------------------------------------------- /avro/download_avro.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | AVRO_TARBALL=avro-1.7.4.tar.gz 3 | MD5FILE=avro-1.7.4.tar.gz.md5 4 | MIRROR=http://www.eng.lsu.edu/mirrors/apache/avro/avro-1.7.4/py 5 | 6 | test -f $MD5FILE || wget http://www.us.apache.org/dist/avro/stable/py/$MD5FILE 7 | test -f $AVRO_TARBALL || wget $MIRROR/$AVRO_TARBALL 8 | md5sum -c $MD5FILE 9 | -------------------------------------------------------------------------------- /avro/read_users.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import avro.schema 3 | from avro.datafile import DataFileReader 4 | from avro.io import DatumReader 5 | 6 | reader = DataFileReader(open("users.avro", "r"), DatumReader()) 7 | for user in reader: 8 | print user 9 | reader.close() 10 | -------------------------------------------------------------------------------- /avro/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Find your own eggs on your system 3 | EGG_DIR=/opt/cloudera/parcels/CDH-5.1.2-1.cdh5.1.2.p0.3/lib/hue/build/env/lib/python2.6/site-packages/avro-1.7.6-py2.6.egg 4 | export PYTHONPATH=$EGG_DIR 5 | python ./write_bunch_of_users.py 6 | python ./read_users.py 7 | -------------------------------------------------------------------------------- /avro/user.avsc: -------------------------------------------------------------------------------- 1 | {"namespace": "example.avro", 2 | "type": "record", 3 | "name": "User", 4 | "fields": [ 5 | {"name": "fullname", "type": "string"}, 6 | {"name": "favorite_number", "type": ["int", "null"]}, 7 | {"name": "favorite_color", "type": ["string", "null"]} 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /avro/write_bunch_of_users.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import avro.schema 3 | from avro.datafile import DataFileReader, DataFileWriter 4 | from avro.io import DatumReader, DatumWriter 5 | 6 | schema = avro.schema.parse(open("user.avsc").read()) 7 | writer = DataFileWriter(open("users.avro", "w"), DatumWriter(), schema) 8 | 9 | dictionary_file = open('/usr/share/dict/words', 'r') 10 | 11 | for word in dictionary_file: 12 | print "Adding " + word 13 | writer.append({"fullname": word, "favorite_number": len(word)}) 14 | if word > "l": 15 | break 16 | writer.close() 17 | -------------------------------------------------------------------------------- 
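Aside, not part of the repo: the Avro scripts in this directory target Python 2 (note the `print user` statements and text-mode file handles). A minimal sketch of the same write/read round trip under Python 3, assuming a recent pip-installed `avro` package (1.10+):

    #!/usr/bin/env python3
    # Sketch: Python 3 version of the write/read round trip from
    # write_users.py and read_users.py; files are opened in binary mode.
    import avro.schema
    from avro.datafile import DataFileReader, DataFileWriter
    from avro.io import DatumReader, DatumWriter

    schema = avro.schema.parse(open("user.avsc", "rb").read())

    writer = DataFileWriter(open("users.avro", "wb"), DatumWriter(), schema)
    writer.append({"fullname": "Alyssa", "favorite_number": 256})
    writer.append({"fullname": "Ben", "favorite_number": 7, "favorite_color": "red"})
    writer.close()

    reader = DataFileReader(open("users.avro", "rb"), DatumReader())
    for user in reader:
        print(user)
    reader.close()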
/avro/write_users.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import avro.schema 3 | from avro.datafile import DataFileReader, DataFileWriter 4 | from avro.io import DatumReader, DatumWriter 5 | 6 | schema = avro.schema.parse(open("user.avsc").read()) 7 | 8 | writer = DataFileWriter(open("users.avro", "w"), DatumWriter(), schema) 9 | writer.append({"fullname": "Alyssa", "favorite_number": 256}) 10 | writer.append({"fullname": "Ben", "favorite_number": 7, "favorite_color": "red"}) 11 | writer.close() 12 | -------------------------------------------------------------------------------- /cloudera-director/api-examples/list-instances.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | from cloudera.director.common.client import ApiClient 3 | from cloudera.director.latest import AuthenticationApi, EnvironmentsApi, DeploymentsApi, ClustersApi 4 | from cloudera.director.latest.models import Login 5 | 6 | client = ApiClient("http://localhost:7189") 7 | AuthenticationApi(client).login(Login(username="admin", password="admin")) 8 | for envName in EnvironmentsApi(client).list(): 9 | print "Environment: %s" % envName 10 | if DeploymentsApi(client).list(envName): 11 | for depName in DeploymentsApi(client).list(envName): 12 | print "\tDeployment: %s" % depName 13 | if ClustersApi(client).list(envName, depName): 14 | for clusterName in ClustersApi(client).list(envName, depName): 15 | print "\t\tCluster: %s" % clusterName 16 | cluster = ClustersApi(client).get(envName, depName, clusterName) 17 | if cluster.instances: 18 | for instance in cluster.instances: 19 | print "\t\t\tInstance: %s %s" % (instance.properties['publicIpAddress'], instance.health.status) 20 | 21 | -------------------------------------------------------------------------------- /cloudera-manager/python_rest_api/README.md: -------------------------------------------------------------------------------- 1 | # Python REST API for Cloudera Manager 2 | 3 | These are my dabblings for using the Python interface to 4 | work with the Cloudera REST API 5 | 6 | # Installation 7 | 8 | Basically, I ran =sudo pip install cm_api= 9 | 10 | - http://cloudera.github.io/cm_api/ 11 | 12 | - Great examples of installation and usage 13 | - http://cloudera.github.io/cm_api/docs/python-client/ 14 | 15 | 16 | - When you deal with the results of these Python calls, the "objects" fit this model: 17 | - http://cloudera.github.io/cm_api/epydoc/5.0.0/index.html 18 | - http://cloudera.github.io/cm_api/apidocs/v6/model.html 19 | 20 | - The terms used by Cloudera Manager, like "Roles", "Role Groups" etc. are detailed here, 21 | and it pays off big-time to understand the hierarchy and relationships between these entities 22 | 23 | http://www.cloudera.com/content/cloudera-content/cloudera-docs/CM5/latest/Cloudera-Manager-Introduction/cm5i_primer.html?scroll=concept_wfj_tny_jk_unique_1 24 | 25 | # Examples 26 | 27 | ## Simple List Cluster Properties 28 | - Prints names of the clusters that are managed by CM 29 | - Dives a bit into the properties of clusters, like hosts, roles, etc. 
30 | - [Simple Cluster Properties](simple_cluster_properties.py) 31 | 32 | ## Dump Cluster Configurations 33 | - [CM Dump Config](cm-dump-config.py) 34 | 35 | ## Show Configuration for Role Instance 36 | - Shows configuration settings for a specific DataNode in a cluster 37 | - [Simple Configuration Settings Example](simple_config_settings.py) 38 | 39 | 40 | -------------------------------------------------------------------------------- /cloudera-manager/python_rest_api/simple_cluster_properties.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | from cm_api.api_client import ApiResource 4 | 5 | cm_host = "" 6 | if len(sys.argv) > 1: 7 | cm_host = sys.argv[1] 8 | else: 9 | sys.stderr.write("Usage: simple_cluster_properties.py ") 10 | sys.exit(1) 11 | 12 | api = ApiResource(cm_host, username="admin", password="admin") 13 | 14 | def printClusterNames(): 15 | for c in api.get_all_clusters(): 16 | print "Cluster \"%s\" is version %s" % (c.name, c.version) 17 | 18 | # Host Object Model http://cloudera.github.io/cm_api/apidocs/v6/ns0_apiHost.html 19 | def printClusterHosts(): 20 | for c in api.get_all_clusters(): 21 | # cluster.get_all_hosts returns ApiHostRefs, which need to be looked up 22 | print "Hosts in cluster \"%s\" are: " % c.name 23 | for host_ref in c.list_hosts(): 24 | host = api.get_host(host_ref.hostId) 25 | print host.hostname 26 | 27 | def printHostTemplates(host_template_name): 28 | for c in api.get_all_clusters(): 29 | print c.get_all_host_templates() 30 | host_template = c.get_host_template(host_template_name) 31 | if host_template is not None: 32 | print "I found host template \"%s\":" % host_template_name 33 | print host_template 34 | 35 | printClusterNames() 36 | printClusterHosts() 37 | printHostTemplates("ThisGuy") 38 | -------------------------------------------------------------------------------- /cloudera-manager/python_rest_api/simple_config_settings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Display configuration settings for DataNodes 3 | import sys 4 | from cm_api.api_client import ApiResource 5 | 6 | cm_host = None 7 | cluster_name = None 8 | 9 | if len(sys.argv) > 2: 10 | cm_host = sys.argv[1] 11 | cluster_name = sys.argv[2] 12 | else: 13 | sys.stderr.write("Usage: simple_config_settings.py ") 14 | sys.exit(1) 15 | 16 | api = ApiResource(cm_host, username="admin", password="admin") 17 | 18 | # The service api must be retrieved from the cluster api 19 | def printDataNodeConfig(): 20 | c = api.get_cluster(cluster_name) 21 | dn_groups = [] 22 | hdfs = None 23 | for service in c.get_all_services(): 24 | if service.type == "HDFS": 25 | hdfs = service 26 | 27 | for group in hdfs.get_all_role_config_groups(): 28 | if group.roleType == 'DATANODE': 29 | dn_groups.append(group) 30 | 31 | for cg in dn_groups: 32 | print "Found config group: " + cg.name 33 | 34 | dn_config = dn_groups[0].get_config(view='full') 35 | 36 | print "Each datanode will store data on these local directories: \n%s" % dn_config['dfs_data_dir_list'].value 37 | print "Each datanode can use up to this amount on each disk for HDFS: \n%s" % dn_config['dfs_datanode_du_reserved'].value 38 | 39 | printDataNodeConfig() 40 | -------------------------------------------------------------------------------- /data-engineering/README.md: -------------------------------------------------------------------------------- 1 | # Comparison of Hive / Impala in CDP 7.2 2 
| 3 | https://docs.cloudera.com/runtime/7.2.0/using-hiveql/topics/hive-orc-parquet-compare.html 4 | # Run the following in order 5 | 6 | 1) create_txnl_tbl.sh 7 | 8 | 1) insert_into_products_txnl.sh 9 | 10 | -- Impala currently does not query fully transactional tables 11 | -- (Coming soon) 12 | -- https://issues.apache.org/jira/browse/IMPALA-9042 13 | 14 | 1) query_txnl_using_impala.sh 15 | 16 | -- Hive does 17 | 1) query_txnl_using_hive.sh 18 | 19 | -- View directory structure in table 20 | 1) ./view_dir_structure.sh 21 | 22 | -- Update data in txnl table 23 | 1) ./update_txnl_using_hive.sh 24 | 25 | -- View directory structure in table 26 | 1) ./view_dir_structure.sh 27 | 28 | -------------------------------------------------------------------------------- /data-engineering/create_txnl_tbl.hql: -------------------------------------------------------------------------------- 1 | USE analyst; 2 | 3 | DROP TABLE IF EXISTS products_txnl; 4 | CREATE TABLE `products_txnl`( 5 | `prod_id` int, 6 | `brand` string, 7 | `name` string, 8 | `price` int, 9 | `cost` int, 10 | `shipping_wt` int); 11 | 12 | DESCRIBE FORMATTED products_txnl; 13 | 14 | -------------------------------------------------------------------------------- /data-engineering/create_txnl_tbl.sh: -------------------------------------------------------------------------------- 1 | beeline --verbose=false -u jdbc:hive2://localhost:10000 -f create_txnl_tbl.hql 2 | -------------------------------------------------------------------------------- /data-engineering/insert_into_products_txnl.hql: -------------------------------------------------------------------------------- 1 | USE analyst; 2 | INSERT INTO products_txnl 3 | SELECT * from products; 4 | -------------------------------------------------------------------------------- /data-engineering/insert_into_products_txnl.sh: -------------------------------------------------------------------------------- 1 | beeline --verbose=false --report -u jdbc:hive2://localhost:10000 -f insert_into_products_txnl.hql 2 | -------------------------------------------------------------------------------- /data-engineering/query_txnl.hql: -------------------------------------------------------------------------------- 1 | USE analyst; 2 | SELECT MIN(price) FROM analyst.products_txnl; 3 | -------------------------------------------------------------------------------- /data-engineering/query_txnl_using_hive.sh: -------------------------------------------------------------------------------- 1 | beeline --verbose=false -u jdbc:hive2://localhost:10000 -f "query_txnl.hql" 2 | cat query_txnl.hql 3 | -------------------------------------------------------------------------------- /data-engineering/query_txnl_using_impala.sh: -------------------------------------------------------------------------------- 1 | impala-shell -q "REFRESH analyst.products_txnl;" 2 | impala-shell -f query_txnl.hql 3 | -------------------------------------------------------------------------------- /data-engineering/update_txnl_using_hive.hql: -------------------------------------------------------------------------------- 1 | USE analyst; 2 | UPDATE products_txnl 3 | SET price = 100 4 | WHERE price < 100; 5 | -------------------------------------------------------------------------------- /data-engineering/update_txnl_using_hive.sh: -------------------------------------------------------------------------------- 1 | beeline --verbose=false -u jdbc:hive2://localhost:10000 -f update_txnl_using_hive.hql 2 | 
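Aside (not part of the repo): the query in query_txnl.hql can also be submitted over HiveServer2 from Python using impyla, which this repo already uses under impala/impyla/. A minimal sketch, assuming an unsecured HiveServer2 on localhost:10000 — the auth_mechanism and credentials shown here are assumptions and must match the actual HiveServer2 configuration:

    #!/usr/bin/env python
    # Sketch: run the transactional-table query from Python via HiveServer2.
    from impala.dbapi import connect

    conn = connect(host="localhost", port=10000,
                   auth_mechanism="PLAIN", user="hive", password="anything")
    cur = conn.cursor()
    cur.execute("SELECT MIN(price) FROM analyst.products_txnl")
    print(cur.fetchall())
    cur.close()
    conn.close()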
-------------------------------------------------------------------------------- /data-engineering/view_dir_structure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hdfs dfs -ls /warehouse/tablespace/managed/hive/analyst.db/products_txnl 3 | -------------------------------------------------------------------------------- /hbase/apitests/README.md: -------------------------------------------------------------------------------- 1 | # Java example of using HBaseConfiguration.create() 2 | 3 | ## Compile using Maven: 4 | 5 | mvn compile 6 | 7 | ## To run: 8 | 9 | cd target/classes 10 | java -cp `hbase classpath` apitests.CreateTable [useold] 11 | 12 | 13 | -------------------------------------------------------------------------------- /hbase/apitests/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | apitests 6 | apitests 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | apitests 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | cloudera 20 | https://repository.cloudera.com/artifactory/cloudera-repos/ 21 | 22 | 23 | 24 | 25 | 26 | junit 27 | junit 28 | 3.8.1 29 | test 30 | 31 | 32 | 33 | org.apache.hbase 34 | hbase-client 35 | 0.98.6-cdh5.2.0 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | org.codehaus.mojo 44 | exec-maven-plugin 45 | 1.4.0 46 | 47 | java 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /hbase/apitests/run.sh: -------------------------------------------------------------------------------- 1 | java -cp `hbase classpath`:target/classes apitests.CreateTable 2 | -------------------------------------------------------------------------------- /hbase/client_maven_example/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | hbaseExample 4 | hbaseExample 5 | 0.0.1-SNAPSHOT 6 | 7 | 0.98.6-cdh5.2.0 8 | 9 | 10 | 11 | cloudera 12 | https://repository.cloudera.com/artifactory/cloudera-repos/ 13 | 14 | 15 | 16 | 17 | org.apache.hbase 18 | hbase-client 19 | ${hbase.version} 20 | 21 | 22 | mysql 23 | mysql-connector-java 24 | 8.0.16 25 | 26 | 27 | 28 | src 29 | 30 | 31 | maven-compiler-plugin 32 | 3.1 33 | 34 | 1.7 35 | 1.7 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /hbase/colfam_flush/.gitignore: -------------------------------------------------------------------------------- 1 | hbase/ 2 | thrift/ 3 | -------------------------------------------------------------------------------- /hbase/colfam_flush/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | Script to insert random words into HBase 4 | 5 | # The HBase Python Thrift libraries must be installed 6 | 7 | For this script to work. 8 | 9 | # TODO 10 | 11 | - Find what the licensing is on the Python/Hbase thrift libraries and 12 | distribute with this code 13 | 14 | -------------------------------------------------------------------------------- /hbase/groovy/loadRandomData/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | These scripts are written using the Groovy programming language. 4 | 5 | Groovy is an excellent language that runs on the Java Virtual Machine. 
6 | 7 | Simple instructions for installing Groovy can be found at: 8 | http://groovy.codehaus.org/Installing+Groovy 9 | 10 | # Movie Lens Data 11 | 12 | ./loadRandomData.groovy relies on the Movie Lens dataset: 13 | 14 | - http://files.grouplens.org/datasets/movielens/ml-10m-README.html 15 | -------------------------------------------------------------------------------- /hbase/groovy/loadRandomData/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Just compile/syntax check the file 3 | groovyc -cp `hadoop classpath`:`hbase classpath` ./loadRandomData.groovy 4 | -------------------------------------------------------------------------------- /hbase/groovy/loadRandomData/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | groovy -classpath `hbase classpath` loadRandomData.groovy 3 | -------------------------------------------------------------------------------- /hbase/hbase-sandbox/README.md: -------------------------------------------------------------------------------- 1 | # HBase Sandbox 2 | 3 | ## Stop Row Thing 4 | 5 | Java class that creates a table, splits it into X number of regions, then performs various Scans. 6 | Scans use no start/stop row, stop row and PrefixFilter to demonstrate the # of regions that are 7 | scanned and rows returned using various combinations of start/stop row, and Prefix filters. 8 | 9 | To run, use: 10 | 11 | ./runStopRowThing.sh 12 | 13 | You can use Eclipse to work with the code. To create an Eclipse project, use: 14 | 15 | mvn eclipse:eclipse 16 | 17 | It should then be possible to use Eclipse to modify / run the Java source code. 18 | 19 | Log level can be tuned by editing src/main/resources/log4j.properties and 20 | setting hbase.root.logger=INFO,console 21 | -------------------------------------------------------------------------------- /hbase/hbase-sandbox/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | hbasesandbox 6 | hbasesandbox 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | HBase Sandbox 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | cloudera 20 | https://repository.cloudera.com/artifactory/cloudera-repos/ 21 | 22 | 23 | 24 | 25 | 26 | junit 27 | junit 28 | 3.8.1 29 | test 30 | 31 | 32 | 33 | org.apache.hbase 34 | hbase-client 35 | 0.98.6-cdh5.2.0 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | org.codehaus.mojo 44 | exec-maven-plugin 45 | 1.4.0 46 | 47 | java 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /hbase/hbase-sandbox/runStopRowThing.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | mvn package 4 | java -cp target/hbasesandbox-1.0-SNAPSHOT.jar:`hbase classpath` hbasesandbox.StopRowThing 5 | -------------------------------------------------------------------------------- /hbase/hbase_hive_impala/README.org: -------------------------------------------------------------------------------- 1 | * This assumes that data is in HBase 2 | in the users.ratings column family 3 | 4 | key column = movieid 5 | 6 | userid value = rating 7 | 8 | Example, bob rated movie #100 a 5 and rated movie #123 with a 4: 9 | 10 | key 11 | 12 | bob 100:5, 123:4 13 | -------------------------------------------------------------------------------- /hbase/hbase_hive_impala/create_hbase_table.rb: 
-------------------------------------------------------------------------------- 1 | # Create HBase Table 2 | def drop_if_exists_and_create(name, *args) 3 | if @hbase.admin(@formatter).exists?(name.to_s) 4 | @hbase.admin(@formatter).disable name 5 | @hbase.admin(@formatter).drop name 6 | puts "Droppped table: " + name 7 | end 8 | 9 | @hbase.admin(@formatter).create name, *args 10 | puts "Created table: " + name + "\n\n" 11 | end 12 | 13 | drop_if_exists_and_create 'nate_hbase_sales_grouped', { NAME => 'cf1' } 14 | drop_if_exists_and_create 'nate_hbase_movie_ratings', { NAME => 'ratings' } 15 | 16 | put 'nate_hbase_movie_ratings', 'nate', 'ratings:star_wars', '5' 17 | put 'nate_hbase_movie_ratings', 'nate', 'ratings:clone_wars', '1' 18 | 19 | put 'nate_hbase_movie_ratings', 'steve', 'ratings:star_wars', '5' 20 | put 'nate_hbase_movie_ratings', 'steve', 'ratings:clone_wars', '1' 21 | 22 | put 'nate_hbase_movie_ratings', 'dumbo', 'ratings:star_wars', '1' 23 | put 'nate_hbase_movie_ratings', 'dumbo', 'ratings:clone_wars', '5' 24 | 25 | put 'nate_hbase_movie_ratings', 'suzie', 'ratings:beaches', '4' 26 | put 'nate_hbase_movie_ratings', 'suzie', 'ratings:magnolia', '4' 27 | 28 | exit 29 | -------------------------------------------------------------------------------- /hbase/hbase_hive_impala/create_hive_hbase_table.sql: -------------------------------------------------------------------------------- 1 | -- The following JARs might need to be added. 2 | -- Use Hive's ADD JAR command 3 | -- zookeeper.jar; 4 | -- hive-hbase-handler.jar; 5 | -- guava-11.0.2.jar; 6 | -- hbase-client.jar; 7 | -- hbase-common.jar; 8 | -- hbase-hadoop-compat.jar; 9 | -- hbase-hadoop2-compat.jar; 10 | -- hbase-protocol.jar; 11 | -- hbase-server.jar; 12 | -- htrace-core.jar; 13 | 14 | DROP TABLE IF EXISTS sales_grouped; 15 | 16 | CREATE EXTERNAL TABLE sales_grouped 17 | (customer_id INT, 18 | total_sales INT) 19 | STORED BY 20 | 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' 21 | WITH SERDEPROPERTIES ("hbase.columns.mapping" = 22 | ":key, 23 | cf1:total_sales") 24 | TBLPROPERTIES 25 | ("hbase.table.name" = "hbase_sales_grouped"); 26 | 27 | 28 | DROP TABLE IF EXISTS the_movie_ratings; 29 | 30 | -- ratings is simply the entire 'ratings' column family 31 | CREATE EXTERNAL TABLE the_movie_ratings 32 | (userid STRING, 33 | movie_ratings MAP) 34 | STORED BY 35 | 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' 36 | WITH SERDEPROPERTIES ("hbase.columns.mapping" = 37 | ":key, 38 | ratings:") -- just map entire column family to the movie_ratings MAP 39 | TBLPROPERTIES 40 | ("hbase.table.name" = "nate_hbase_movie_ratings"); 41 | -------------------------------------------------------------------------------- /hbase/hbase_hive_impala/map-hive-to-hbase-ratings.sql: -------------------------------------------------------------------------------- 1 | -- You may need to set hbase.zookeeper.quorum 2 | set hbase.zookeeper.quorum= 3 | 4 | -- The following JARs may be necessary, and distro/version dependent 5 | -- hbase-0.94.6--security.jar; 6 | -- hive-hbase-handler-0.10.0-.jar; 7 | 8 | CREATE EXTERNAL TABLE IF NOT EXISTS hbase_ratings 9 | (userid int, ratings MAP, lname STRING) 10 | STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' 11 | WITH SERDEPROPERTIES ("hbase.columns.mapping" =":key, ratings:, info:lname") 12 | TBLPROPERTIES ("hbase.table.name" = "user"); 13 | 14 | -- Find ratings for movie ID 400 15 | SELECT userid, ratings['2997'] FROM hbase_ratings WHERE ratings['2997'] IS NOT NULL; 16 | 17 | CREATE TABLE IF NOT 
EXISTS exported_hbase_ratings(userid int, movieid int, rating int) 18 | -------------------------------------------------------------------------------- /hbase/hbase_hive_impala/put_data_into_sales_aggregate_table.sh: -------------------------------------------------------------------------------- 1 | # Create HBase Table 2 | create 'hbase_sales_grouped', 'cf1' 3 | -------------------------------------------------------------------------------- /hbase/hbase_hive_impala/select_some_ratings.sql: -------------------------------------------------------------------------------- 1 | -- These add jar statements will vary, depending on distro 2 | -- zookeeper.jar; 3 | -- hive-hbase-handler.jar; 4 | -- guava-11.0.2.jar; 5 | -- hbase-client.jar; 6 | -- hbase-common.jar; 7 | -- hbase-hadoop-compat.jar; 8 | -- hbase-hadoop2-compat.jar; 9 | -- hbase-protocol.jar; 10 | -- hbase-server.jar; 11 | -- htrace-core.jar; 12 | 13 | -- SELECT * FROM the_movie_ratings WHERE movie_ratings['star_wars'] IS NOT NULL; 14 | 15 | -- Find top 2 users with the most movie ratings 16 | SELECT userid, MAP_KEYS(movie_ratings) AS the_count 17 | FROM the_movie_ratings; 18 | 19 | -- WHERE movie_ratings['star_wars'] IS NOT NULL 20 | -- GROUP BY the_count 21 | -- ORDER BY the_count DESC 22 | -- LIMIT 2; 23 | 24 | -------------------------------------------------------------------------------- /hbase/hotSpots/README.md: -------------------------------------------------------------------------------- 1 | # hotSpots 2 | 3 | Groovy scripts to create hotspots 4 | -------------------------------------------------------------------------------- /hbase/hotSpots/TransactionFactory.groovy: -------------------------------------------------------------------------------- 1 | public class TransactionFactory { 2 | 3 | Random rand = new Random() 4 | 5 | def curPrice = 100 6 | 7 | def tickers = [ 8 | [ symbol : 'IBM', price : 100], 9 | [ symbol : 'AAPL', price : 200], 10 | [ symbol : 'MSFT', price : 300], 11 | [ symbol : 'INTC', price : 50], 12 | [ symbol : 'BWLD', price : 200] 13 | ] 14 | 15 | def getNewTrans() { 16 | def dt = new Date() 17 | def ticker = tickers[rand.nextInt(tickers.size())] 18 | def thekey = dt.format('yyyy/MM/dd HH:mm:ss:SS') + ' ' + ticker['symbol'] 19 | 20 | return [ key: thekey, dt: dt.time.toString(), price:ticker['price'], symbol:ticker['symbol'] ] 21 | } 22 | 23 | 24 | } 25 | -------------------------------------------------------------------------------- /hbase/hotSpots/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | groovyc TransactionFactory.groovy 3 | -------------------------------------------------------------------------------- /hbase/hotSpots/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | groovy --classpath `hbase classpath` ./TransactionImporter.groovy --numRows=10000000 3 | -------------------------------------------------------------------------------- /hbase/null_safe_joins/data.rb: -------------------------------------------------------------------------------- 1 | create 'hbase_a', 'cf1' 2 | create 'hbase_b', 'cf1' 3 | 4 | put 'hbase_a', '1', 'cf1:record_id', '1' 5 | put 'hbase_a', '1', 'cf1:record_name', 'bob' 6 | 7 | put 'hbase_b', '1', 'cf1:record_id', '1' 8 | put 'hbase_b', '1', 'cf1:record_name', 'B robert' 9 | 10 | # record_id for bob in both tables where record_id is NULL 11 | put 'hbase_a', '2', 'cf1:record_name', 'steve' 12 | put 'hbase_b', '2', 'cf1:record_name', 'steves 
record' 13 | 14 | exit 15 | -------------------------------------------------------------------------------- /hbase/random_words_python/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | Script to insert random words into HBase 4 | 5 | # TODO 6 | 7 | Find what the licensing is on the Python/Hbase thrift libraries and distribute 8 | with this code 9 | 10 | -------------------------------------------------------------------------------- /hbase/random_words_python/insert_random_words.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from thrift.transport import TSocket 4 | from thrift.protocol import TBinaryProtocol 5 | from thrift.transport import TTransport 6 | from hbase import Hbase 7 | import os 8 | import os.path 9 | import random 10 | import sys 11 | 12 | if len(sys.argv) > 1: 13 | thriftserver = sys.argv[1] 14 | else: 15 | thriftserver = "localhost" 16 | 17 | random.seed() 18 | words = [line.strip() for line in open('/usr/share/dict/words')] 19 | max = len(words) - 1 20 | 21 | # Connect to HBase Thrift server 22 | # Assumes that thrift server is localhost 23 | transport = TTransport.TBufferedTransport(TSocket.TSocket(thriftserver, 9090)) 24 | protocol = TBinaryProtocol.TBinaryProtocolAccelerated(transport) 25 | 26 | # Create and open the client connection 27 | client = Hbase.Client(protocol) 28 | transport.open() 29 | 30 | # Create a list of mutations per batch 31 | mutationsbatch = [] 32 | 33 | # Create 1 billion data rows 34 | num_rows = 1000000 35 | batchsize = 10000 36 | 37 | mutationsbatch = [] 38 | for x in range(0, num_rows - 1): 39 | if x % batchsize == 0: 40 | print "Pushing " + str(x) 41 | client.mutateRows("words", mutationsbatch) 42 | mutationsbatch = [] 43 | r = random.randint(0, max) 44 | 45 | row = [] 46 | 47 | # Add this cell 48 | row.append(Hbase.Mutation(column="w:" + words[r] + "@" + str(x), value=str(1))) 49 | 50 | thisword = words[r] 51 | mutationsbatch.append(Hbase.BatchMutation(row=thisword + str(r),mutations=row)) 52 | 53 | # Run the mutations for the words 54 | client.mutateRows("words", mutationsbatch) 55 | 56 | transport.close() 57 | -------------------------------------------------------------------------------- /hbase/recordGenerator/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ~/tools/groovy/bin/groovy -classpath `hbase classpath` ./PutRandomRecords.groovy 3 | -------------------------------------------------------------------------------- /hbase/shell_stuff/README.md: -------------------------------------------------------------------------------- 1 | This is playing around with the hbase shell, and trying to 2 | get to underlying Java objects. 3 | 4 | To run these things, use: 5 | 6 | hbase shell 7 | 8 | 9 | Example: 10 | 11 | hbase shell inspect_HTable.rb 12 | 13 | # Run with parameters 14 | 15 | The check_new_stuff/run_parameter_script.sh is rudimentary example of invoking hbase/ruby 16 | script with parameters. 17 | 18 | 19 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/alter_table_properties_async.rb: -------------------------------------------------------------------------------- 1 | tbl = "test_movie" 2 | if @hbase.admin(@formatter).exists?(tbl) 3 | puts "Table #{tbl} already exists. 
Please drop or use different table" 4 | exit 1 5 | end 6 | 7 | create tbl, { NAME => 'desc'}, { READONLY => 'true' } 8 | put tbl, 'Star Wars', 'desc:title', 'Star Wars' 9 | 10 | alter_async tbl, READONLY => 'false' 11 | put tbl, 'Star Wars', 'desc:title', 'Star Wars' 12 | 13 | disable tbl 14 | drop tbl 15 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/alter_versions_async.rb: -------------------------------------------------------------------------------- 1 | tbl = "test_movie" 2 | disable tbl 3 | drop tbl 4 | if @hbase.admin(@formatter).exists?(tbl) 5 | puts "Table #{tbl} already exists. Please drop or use different table" 6 | exit 1 7 | end 8 | 9 | create tbl, { NAME => 'desc' } 10 | put tbl, 'Star Wars', 'desc:title', 'Star Wars' 11 | put tbl, 'Star Wars', 'desc:title', 'Star Wars:A New Hope' 12 | 13 | get tbl, 'Star Wars', { COLUMNS => 'desc:title', VERSIONS => 2 } 14 | 15 | alter_async tbl, NAME => 'desc', VERSIONS => 2 16 | put tbl, 'Star Wars', 'desc:title', 'Star Wars:A New New Hope' 17 | get tbl, 'Star Wars', { COLUMNS => 'desc:title', VERSIONS => 2 } 18 | 19 | puts "Checking alter status" 20 | alter_status tbl 21 | 22 | disable tbl 23 | drop tbl 24 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/colfam_async.rb: -------------------------------------------------------------------------------- 1 | tbl = 'nate_alter' 2 | 3 | if @hbase.admin(@formatter).exists?(tbl) 4 | puts "Table '#{tbl}' already exists. Please drop it first." 5 | exit 1 6 | end 7 | 8 | create tbl, { NAME => 'cf1' } 9 | put tbl, '1', 'cf1:col1', 'value' 10 | alter tbl, NAME => 'cf2' 11 | put tbl, '1', 'cf2:col2', 'value' 12 | alter tbl, NAME => 'cf3' 13 | put tbl, '1', 'cf3:col3', 'value' 14 | describe tbl 15 | scan tbl 16 | 17 | disable tbl 18 | drop tbl 19 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/create_simple_table.rb: -------------------------------------------------------------------------------- 1 | tbl = 'test_simple' 2 | 3 | create tbl, {NAME => 'desc'} 4 | describe tbl 5 | put tbl, 'Star Wars', 'desc:title', 'Star Wars' 6 | get tbl, 'Star Wars' 7 | 8 | disable tbl 9 | drop tbl 10 | exit 11 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/create_table_shorthand.rb: -------------------------------------------------------------------------------- 1 | # Shorthand 2 | tbl = 'test_shorthand' 3 | create tbl, 'movie', 'desc', 'media' 4 | put tbl, 'Phantom', 'desc:title', 'Phantom Menace' 5 | put tbl, 'Phantom', 'media:thumbs_down', 'thumbs_down' 6 | get tbl, 'Phantom' 7 | 8 | disable tbl 9 | drop tbl 10 | 11 | exit 12 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/create_table_two_colfams.rb: -------------------------------------------------------------------------------- 1 | # Create two colfams 2 | tbl = 'test_two_colfams' 3 | 4 | create tbl, {NAME => 'desc'}, {NAME => 'media'} 5 | describe tbl 6 | 7 | put tbl, 'Jedi', 'desc:title', 'Return of the Jedi' 8 | put tbl, 'Jedi', 'media:fanboy_picture', 'fanboy\'s picture' 9 | 10 | get tbl, 'Jedi' 11 | 12 | disable tbl 13 | drop tbl 14 | exit 15 | 16 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/create_table_two_versions.rb: 
-------------------------------------------------------------------------------- 1 | # Create tbl w/two versions 2 | tbl = 'test_two_vers' 3 | create tbl, {NAME => 'desc', VERSIONS => 2} 4 | describe tbl 5 | 6 | put tbl, 'Empire', 'desc:title', 'Empire Wimps Out' 7 | put tbl, 'Empire', 'desc:title', 'Empire Strikes Back' 8 | 9 | get tbl, 'Empire', { COLUMN=>'desc:title', VERSIONS=> 2} 10 | 11 | disable tbl 12 | drop tbl 13 | 14 | exit 15 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/delete.rb: -------------------------------------------------------------------------------- 1 | tbl = "test_movie" 2 | if @hbase.admin(@formatter).exists?(tbl) 3 | puts "Table #{tbl} already exists. Please drop or use different table" 4 | exit 1 5 | end 6 | 7 | create tbl, { NAME => 'desc', VERSIONS => 3 } 8 | put tbl, 'rowkey1', 'desc:title', 'New Hope' 9 | put tbl, 'rowkey1', 'desc:year', '1975', 1 10 | put tbl, 'rowkey1', 'desc:year', '1976', 2 11 | put tbl, 'rowkey1', 'desc:year', '1977', 3 12 | 13 | put tbl, 'rowkey2', 'desc:title', 'Empire Strikes Back' 14 | put tbl, 'rowkey2', 'desc:year', '1975', 1 15 | put tbl, 'rowkey2', 'desc:year', '1976', 2 16 | put tbl, 'rowkey2', 'desc:year', '1980', 3 17 | 18 | put tbl, 'rowkey3', 'desc:title', 'Return of the Jedi' 19 | put tbl, 'rowkey3', 'desc:year', '1975' 20 | put tbl, 'rowkey3', 'desc:year', '1976' 21 | put tbl, 'rowkey3', 'desc:year', '1982' 22 | 23 | puts "We have all three rows" 24 | scan tbl 25 | 26 | delete tbl, 'rowkey3', 'desc:year' 27 | puts "No Jedi years should be visible here" 28 | scan tbl, { STARTROW => 'rowkey3' } 29 | 30 | puts "No Empire year before 1980 should be here" 31 | delete tbl, 'rowkey2', 'desc:year', 2 32 | scan tbl, { STARTROW => 'rowkey2', ENDROW => 'rowkey3', VERSIONS => 3 } 33 | 34 | puts "No Star Wars rows should be here" 35 | deleteall tbl, 'rowkey1' 36 | scan tbl 37 | 38 | puts "No more rows should be here:" 39 | truncate tbl 40 | scan tbl 41 | 42 | disable tbl 43 | drop tbl 44 | 45 | exit 46 | 47 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/delete_colfam_async.rb: -------------------------------------------------------------------------------- 1 | tbl = 'nate_alter' 2 | 3 | if @hbase.admin(@formatter).exists?(tbl) 4 | puts "Table '#{tbl}' already exists. Please drop it first." 5 | exit 1 6 | end 7 | 8 | create tbl, { NAME => 'cf1' }, { NAME => 'cf2' } 9 | put tbl, 1, 'cf1:col1', 'value1' 10 | put tbl, 1, 'cf2:col2', 'value1' 11 | 12 | scan tbl 13 | 14 | puts "Now deleting cf2" 15 | alter tbl, NAME => 'cf2', METHOD => 'delete' 16 | 17 | scan tbl 18 | 19 | puts "*" * 10, "Watch this, we can delete only remaining colfam" 20 | alter tbl, NAME => 'cf1', METHOD => 'delete' 21 | scan tbl 22 | describe tbl 23 | 24 | disable tbl 25 | drop tbl 26 | exit 27 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/get_colfam.rb: -------------------------------------------------------------------------------- 1 | tbl = "test_movie" 2 | if @hbase.admin(@formatter).exists?(tbl) 3 | puts "Table #{tbl} already exists. 
Please drop or use different table" 4 | exit 1 5 | end 6 | 7 | create tbl, { NAME => 'desc' }, { NAME => 'ratings' } 8 | put tbl, 'Star Wars', 'desc:title', 'New Hope' 9 | put tbl, 'Star Wars', 'desc:year', '1977', 1274032629664 10 | put tbl, 'Star Wars', 'desc:year', '1978', 1274032629663 11 | put tbl, 'Star Wars', 'ratings:bob', '1' 12 | put tbl, 'Star Wars', 'ratings:steve', '5' 13 | 14 | 15 | puts "Getting data: We should only see data from desc colfam\n:" + 16 | "And we should see 1977 because it has a later timestamp" 17 | get tbl, 'Star Wars', { COLUMN => 'desc' } 18 | 19 | puts "Getting data with ['desc']" 20 | get tbl, 'Star Wars', { COLUMN => ['desc'] } 21 | 22 | # Note ['desc:'] invalid as of (at least) CDH 5.2 23 | puts "Getting data with ['desc:']" 24 | get tbl, 'Star Wars', { COLUMN => ['desc:'] } 25 | 26 | # Note 'desc:' invalid as of (at least) CDH 5.2 27 | puts "Getting the data with 'desc:'" 28 | get tbl, 'Star Wars', { COLUMN => 'desc:' } 29 | 30 | puts "Disabling/dropping #{tbl}" 31 | disable tbl 32 | drop tbl 33 | 34 | exit 35 | 36 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/namespace_create.rb: -------------------------------------------------------------------------------- 1 | create_namespace 'namespaceName' 2 | alter_namespace 'namespaceName', { METHOD => 'set', 'SOME_PROPERTY' => 'SOME_VALUE' } 3 | drop_namespace 'namespaceName' 4 | 5 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/namespace_tables.rb: -------------------------------------------------------------------------------- 1 | puts "Simple table" 2 | create_namespace 'entertainment' 3 | create 'entertainment:movie', { NAME => 'desc' } 4 | disable 'entertainment:movie' 5 | drop 'entertainment:movie' 6 | 7 | puts "Versions = 2" 8 | create 'entertainment:movie', { NAME => 'desc', VERSIONS => 2 } 9 | disable 'entertainment:movie' 10 | drop 'entertainment:movie' 11 | 12 | puts "Two colfams" 13 | create 'entertainment:movie', { NAME => 'desc', VERSIONS => 2 } 14 | disable 'entertainment:movie' 15 | drop 'entertainment:movie' 16 | 17 | # Only empty namespaces can be removed 18 | drop_namespace 'entertainment' 19 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/new_shell_commands.rb: -------------------------------------------------------------------------------- 1 | # Tested create and acceptance of VERSIONS parameter 2 | disable 'nate_movie' 3 | drop 'nate_movie' 4 | 5 | # 04-08 Shell Command Syntax 6 | create 'nate_movie', {NAME => 'desc', VERSIONS => 5} 7 | 8 | # Verification 9 | put 'nate_movie', 1, 'desc:title', 'Star Wars' 10 | put 'nate_movie', 1, 'desc:title', 'Star Wars version 2' 11 | put 'nate_movie', 1, 'desc:title', 'Star Wars version 3' 12 | put 'nate_movie', 1, 'desc:title', 'Star Wars version 4' 13 | put 'nate_movie', 1, 'desc:title', 'Star Wars version 5' 14 | 15 | get 'nate_movie', '1', {COLUMN=>'desc', VERSIONS=>3} 16 | 17 | # Pass parameters test 18 | if ARGV.length() 19 | get ARGV[1], ARGV[2] 20 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/parameter_create_tbl.rb: -------------------------------------------------------------------------------- 1 | if ARGV && ARGV.length() == 2 2 | tbl = ARGV[0] 3 | colfam = ARGV[1] 4 | create tbl, colfam 5 | describe tbl 6 | else 7 | puts "Usage: parameter_create_tbl.rb " 8 | 
exit 1 9 | end 10 | exit 0 11 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/run_parameter_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hbase shell parameter_create_tbl.rb sometable somecolfam 3 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/scan_examples.rb: -------------------------------------------------------------------------------- 1 | tbl = "test_movie" 2 | if @hbase.admin(@formatter).exists?(tbl) 3 | puts "Table #{tbl} already exists. Please drop or use different table" 4 | exit 1 5 | end 6 | 7 | create tbl, { NAME => 'desc' }, { NAME => 'media' } 8 | put tbl, 'rowkey1', 'desc:title', 'New Hope' 9 | put tbl, 'rowkey1', 'media:type', 'Tape' 10 | 11 | put tbl, 'rowkey2', 'desc:title', 'Empire Strikes Back' 12 | put tbl, 'rowkey2', 'desc:year', '1980' 13 | put tbl, 'rowkey2', 'media:type', 'Tape' 14 | 15 | put tbl, 'rowkey3', 'desc:title', 'Jedi' 16 | put tbl, 'rowkey3', 'media:type', 'Tape' 17 | 18 | put tbl, 'rowkey4', 'desc:title', 'Phantom' 19 | put tbl, 'rowkey4', 'media:type', 'DVD' 20 | 21 | put tbl, 'rowkey5', 'desc:title', 'Clone' 22 | put tbl, 'rowkey5', 'media:type', 'DVD' 23 | 24 | 25 | scan tbl 26 | 27 | puts "limiting to 1" 28 | scan tbl, { LIMIT => 1 } 29 | 30 | puts "startrow of rowkey1, end of rowkey4" 31 | scan tbl, { STARTROW => 'rowkey1', STOPROW => 'rowkey4' } 32 | 33 | puts "only retrieve title and type fields" 34 | scan tbl, { COLUMNS => [ 'desc:title', 'media:type' ] } 35 | 36 | put tbl, 'desc:duration', 120 37 | scan tbl, { FILTER => "SingleColumnValueFilter('desc', 'duration', =, 'binary:120')" } 38 | 39 | disable tbl 40 | drop tbl 41 | 42 | exit 43 | 44 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/scan_filter.rb: -------------------------------------------------------------------------------- 1 | tbl = "test_movie" 2 | if @hbase.admin(@formatter).exists?(tbl) 3 | puts "Table #{tbl} already exists. Please drop or use different table" 4 | exit 1 5 | end 6 | 7 | create tbl, { NAME => 'desc' }, { NAME => 'media' } 8 | 9 | put tbl, 'Star Wars', 'desc:duration', 'binary:120' 10 | put tbl, 'Empire', 'desc:duration', 100 11 | put tbl, 'Jedi', 'desc:duration', '120' 12 | 13 | # Binary tells the filter what kind of comparator to use 14 | scan tbl, { FILTER => "SingleColumnValueFilter('desc', 'duration', =, 'binary:120')" } 15 | 16 | disable tbl 17 | drop tbl 18 | 19 | exit 20 | 21 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/unknown_arguments_warning.rb: -------------------------------------------------------------------------------- 1 | # Thankfully, invalid/unknown arguments cause HBase Shell 2 | # to print a warning 3 | create 'sometable', { NAME => 'cf1', VERSION => 1, VERSIONS => 2 } 4 | disable 'sometable' 5 | drop 'sometable' 6 | exit 7 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/versions_async.rb: -------------------------------------------------------------------------------- 1 | tbl = 'nate_alter' 2 | 3 | if @hbase.admin(@formatter).exists?(tbl) 4 | puts "Table '#{tbl}' already exists. Please drop it first." 
5 | exit 1 6 | end 7 | 8 | create tbl, { NAME => 'cf1' } 9 | 10 | put tbl, '1', 'cf1:col1', 'value' 11 | put tbl, '1', 'cf1:col1', 'value' 12 | put tbl, '1', 'cf1:col1', 'value' 13 | put tbl, '1', 'cf1:col1', 'value' 14 | 15 | puts "*" * 10, "Get row -- we see 1 version is kept." 16 | get tbl, '1', { COLUMN => 'cf1:col1', VERSIONS => 5 } 17 | 18 | puts "*" * 10, "Now, alter versions to 5" 19 | alter tbl, NAME => 'cf1', VERSIONS => '5' 20 | 21 | put tbl, '1', 'cf1:col1', 'value' 22 | put tbl, '1', 'cf1:col1', 'value' 23 | put tbl, '1', 'cf1:col1', 'value' 24 | put tbl, '1', 'cf1:col1', 'value' 25 | puts "*" * 10, "Now we have more versions retained" 26 | get tbl, '1', { COLUMN => 'cf1:col1', VERSIONS => 5 } 27 | 28 | disable tbl 29 | drop tbl 30 | -------------------------------------------------------------------------------- /hbase/shell_stuff/inspect_HTable.rb: -------------------------------------------------------------------------------- 1 | tbl = get_table('njn_transactions') 2 | # Get to the underlying table for the REAL power, Jedi! 3 | puts "Here's the methods of the tbl.table: " 4 | puts tbl.table.methods 5 | exit 6 | -------------------------------------------------------------------------------- /hbase/shell_stuff/list_regions.rb: -------------------------------------------------------------------------------- 1 | require 'test/unit' 2 | extend Test::Unit::Assertions 3 | 4 | tbl = get_table('njn_transactions') 5 | # Get to the underlying tbl.table for the REAL power, my apprentice. 6 | puts "Here's the regions:" 7 | # get_region_locations returns a "NavigableMap" Java object that has a RegionInfo as a key, and ServerName as value 8 | # http://archive.cloudera.com/cdh5/cdh/5/hbase-0.96.1.1-cdh5.0.1/devapidocs/org/apache/hadoop/hbase/client/HTable.html#getRegionLocations%28%29 9 | tbl.table.get_region_locations.each_with_index do |region_thingy, idx| 10 | puts "-" * 100 11 | puts "Region " + idx.to_s 12 | 13 | # get_region_name_as_string is the same thing as .regionName(), except String vs.
Byte Array 14 | regionName = region_thingy[0].get_region_name_as_string 15 | assert_equal regionName, Bytes.toString(region_thingy[0].regionName) 16 | 17 | # Print out info about this region 18 | puts "Region Name is: " + regionName 19 | puts region_thingy[0].toString() 20 | end 21 | 22 | puts "Now printing start keys of each region in this table: " 23 | tbl.table.get_start_keys.each do |byte_array_start_key| 24 | puts Bytes.toString(byte_array_start_key) 25 | end 26 | exit 27 | -------------------------------------------------------------------------------- /hbase/shell_stuff/list_tables.rb: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.hbase.client.HBaseAdmin 2 | 3 | admin = HBaseAdmin.new(@hbase.configuration) 4 | puts admin.getTableNames().to_a 5 | exit 6 | -------------------------------------------------------------------------------- /hbase/simpleConnection/SimpleCreateAndPut.groovy: -------------------------------------------------------------------------------- 1 | #!/bin/env groovy 2 | // This will load data from movieratings flat file into a table xx_users 3 | // The data is simply put into the key, and fake values are put into ratings column family 4 | // it is simply used to show data ingestion using the HBase API 5 | import org.apache.hadoop.hbase.HBaseConfiguration 6 | import org.apache.hadoop.hbase.HTableDescriptor 7 | import org.apache.hadoop.hbase.HColumnDescriptor 8 | import org.apache.hadoop.hbase.client.HBaseAdmin 9 | import org.apache.hadoop.hbase.client.HConnectionManager 10 | import org.apache.hadoop.hbase.client.HConnection 11 | import org.apache.hadoop.hbase.client.Put 12 | import org.apache.hadoop.hbase.util.Bytes 13 | import groovy.time.* 14 | 15 | /* Setup */ 16 | def ratingsTable 17 | def tableName = "njn_users" 18 | def shouldCreateTable = true 19 | def shouldPreSplit = true 20 | 21 | HBaseConfiguration conf = new HBaseConfiguration() 22 | HConnection connection = HConnectionManager.createConnection(conf) 23 | 24 | if (shouldCreateTable) { 25 | admin = new HBaseAdmin(conf) 26 | 27 | if (admin.tableExists(tableName)) { 28 | admin.disableTable(tableName) 29 | admin.deleteTable(tableName) 30 | } 31 | 32 | def desc = new HTableDescriptor(Bytes.toBytes(tableName)) 33 | desc.addFamily(new HColumnDescriptor(Bytes.toBytes("info"))) 34 | admin.createTable(desc) 35 | } 36 | 37 | ratingsTable = connection.getTable(tableName) 38 | 39 | def start = new Date() 40 | 41 | Put p = new Put(Bytes.toBytes("StevesKey")) 42 | p.add(Bytes.toBytes("info"), Bytes.toBytes("fname"), Bytes.toBytes("Steve")) 43 | ratingsTable.put(p) 44 | 45 | TimeDuration duration = TimeCategory.minus(new Date(), start) 46 | 47 | println "Done, a one-row insert took " + duration 48 | 49 | -------------------------------------------------------------------------------- /hbase/simpleConnection/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | groovyc -classpath `hbase classpath` ./SimpleCreateAndPut.groovy 3 | -------------------------------------------------------------------------------- /hbase/simpleConnection/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | groovy -classpath `hbase classpath` ./SimpleCreateAndPut.groovy 3 | -------------------------------------------------------------------------------- /hdfs/data-visibility/README.md: -------------------------------------------------------------------------------- 1 | 
# README 2 | 3 | Illustrate that a file need not be a full HDFS block before 4 | data can be read from the file. 5 | 6 | 1. Run this in a shell: 7 | 8 | ./foo.pl | hadoop fs -put - data.txt 9 | 10 | 2. Open another shell and run this 11 | 12 | hadoop fs -ls data.txt.\_COPYING\_ 13 | 14 | hadoop fs -cat data.txt.\_COPYING\_ | head -n 10 15 | 16 | 3. *Note* Don't forget to Ctrl-C ./foo.pl!!!! 17 | -------------------------------------------------------------------------------- /hdfs/data-visibility/foo.pl: -------------------------------------------------------------------------------- 1 | #!/bin/env perl 2 | for($i=0; $i<=1_000_000; $i++) { 3 | if ($i % 10000 == 0) { 4 | sleep(1); 5 | print "Sleeping " . `date`; 6 | } 7 | print $i, "\n"; 8 | } 9 | -------------------------------------------------------------------------------- /hdfs/replication/run.sh: -------------------------------------------------------------------------------- 1 | TMP_FILE=words_`date "+%F%s"` 2 | hadoop fs -put /usr/share/dict/words $TMP_FILE 3 | hadoop fs -setrep 4 $TMP_FILE 4 | echo "Check out $TMP_FILE in your home dir." 5 | sleep 10 6 | hadoop fs -setrep 3 $TMP_FILE 7 | -------------------------------------------------------------------------------- /hdfs/webhdfs-httpfs/testdata.txt: -------------------------------------------------------------------------------- 1 | The quick brown fox 2 | jumped over the lazy dog 3 | -------------------------------------------------------------------------------- /hive/crlf/data.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | -------------------------------------------------------------------------------- /hive/crlf/data_unix.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | -------------------------------------------------------------------------------- /hive/crlf/get_max.sql: -------------------------------------------------------------------------------- 1 | select max(junk) as the_max, count(junk) as the_count 2 | from dosjunk 3 | group by junk 4 | order by the_max limit 100000 5 | -------------------------------------------------------------------------------- /hive/crlf/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script runs impala and hive and MapReduce wordcount. 3 | # Please remove whatever you don't want to run. 
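# Assumes a Hive table named 'dosjunk' already exists (presumably loaded from data.txt / data_unix.txt here)
# and that $EXAMPLES_DIR points at the directory containing hadoop-examples.jar.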
4 | set -e 5 | IMPALA_SERVER=$1 6 | OUTPUT_DIR=output/wc_dosjunk 7 | impala-shell -i $IMPALA_SERVER -f ./get_max.sql 8 | hive -f ./get_max.sql 9 | hadoop fs -test -d $OUTPUT_DIR && hadoop fs -rm -r $OUTPUT_DIR 10 | hadoop jar $EXAMPLES_DIR/hadoop-examples.jar wordcount dosjunk $OUTPUT_DIR 11 | hadoop fs -getmerge $OUTPUT_DIR wordcount_output.txt 12 | echo "Wordcount output is in wordcount_output.txt" 13 | -------------------------------------------------------------------------------- /hive/debate/analyze_debate.hql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS debate; 2 | CREATE TABLE debate(debatetext string); 3 | 4 | LOAD DATA LOCAL INPATH 'debate.txt' 5 | INTO TABLE debate; 6 | 7 | SELECT EXPLODE(NGRAMS( 8 | SENTENCES(debate.debatetext), 4, 10)) -- <<< Try this with 3 or 4 and see how results change 9 | AS x 10 | FROM debate 11 | -------------------------------------------------------------------------------- /hive/incremental_insert/README.TXT: -------------------------------------------------------------------------------- 1 | Read load_and_run.sql for a description of this project 2 | 3 | Run run.sh to see it in action 4 | -------------------------------------------------------------------------------- /hive/incremental_insert/employees.txt: -------------------------------------------------------------------------------- 1 | bobsupervisor 2 | steveprogrammer 3 | -------------------------------------------------------------------------------- /hive/incremental_insert/join_table.sql: -------------------------------------------------------------------------------- 1 | -- INSERT into join_table the name, title, nickname from the employees and nicknames, 2 | -- but DO NOT duplicate existing data in join_table 3 | INSERT INTO TABLE join_table 4 | SELECT e.name, e.title, n.nickname 5 | FROM employees e 6 | JOIN nicknames n ON e.name = n.name 7 | LEFT OUTER JOIN join_table jt 8 | ON (jt.name = e.name AND n.nickname = jt.nickname AND jt.title = e.title) 9 | WHERE jt.name IS NULL; 10 | -------------------------------------------------------------------------------- /hive/incremental_insert/load_and_run.sql: -------------------------------------------------------------------------------- 1 | -- This example shows how to use Hive to insert non-duplicate data 2 | -- into a join table. 3 | 4 | -- Employees table: 5 | -- bob supervisor 6 | 7 | -- Nicknames table: 8 | -- bob bob_nickname 9 | 10 | -- We create a join_table with a very simple initial 11 | -- dataset: 12 | -- bob supervisor bob_nickname 13 | -- steve programmer steve_nickname 14 | 15 | -- Then, we load *more* nicknames into the nicknames table, and only 16 | -- insert the new nickname relations into the join_table. 
17 | -- bob another_bob_nickname 18 | 19 | -- We want the resulting join_table to include only: 20 | -- bob supervisor bob_nickname 21 | -- bob supervisor another_bob_nickname 22 | -- steve programmer steve_nickname 23 | -- steve programmer another_steve_nickname 24 | 25 | -- We don't want to get duplicate data in the join_table 26 | ADD FILE join_table.sql; 27 | CREATE DATABASE IF NOT EXISTS incremental_insert; 28 | 29 | USE incremental_insert; 30 | 31 | DROP TABLE IF EXISTS employees; 32 | CREATE TABLE employees(name STRING, title STRING); 33 | LOAD DATA LOCAL INPATH 'employees.txt' INTO TABLE employees; 34 | 35 | DROP TABLE IF EXISTS nicknames; 36 | CREATE TABLE nicknames(name STRING, nickname STRING); 37 | LOAD DATA LOCAL INPATH 'nicknames.txt' INTO TABLE nicknames; 38 | 39 | DROP TABLE IF EXISTS join_table; 40 | CREATE TABLE join_table(name STRING, title STRING, nickname STRING); 41 | 42 | -- Run the join 43 | SOURCE join_table.sql; 44 | SELECT COUNT(*) FROM join_table; 45 | 46 | -- Now, load more nicknames 47 | LOAD DATA LOCAL INPATH 'more_nicknames.txt' INTO TABLE nicknames; 48 | 49 | -- Run the join again 50 | SOURCE join_table.sql; 51 | SELECT COUNT(*) FROM join_table; 52 | 53 | -------------------------------------------------------------------------------- /hive/incremental_insert/more_nicknames.txt: -------------------------------------------------------------------------------- 1 | bobanother_bob_nickname 2 | steveanother_steve_nickname 3 | -------------------------------------------------------------------------------- /hive/incremental_insert/nicknames.txt: -------------------------------------------------------------------------------- 1 | bobbob_nickname 2 | stevesteve_nickname 3 | -------------------------------------------------------------------------------- /hive/incremental_insert/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hive -S -v -f load_and_run.sql 3 | -------------------------------------------------------------------------------- /hive/partition-example/README.TXT: -------------------------------------------------------------------------------- 1 | The script run.sh will do everything for you. 2 | 3 | get_partition_info.sql has an example of EXPLAIN EXTENDED 4 | to show that Hive will use Partitions in a SELECT statement. 
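As a rough sketch (see get_partition_info.sql for the actual script; point the
query at the partitioned table so the plan shows the partition pruning):

    EXPLAIN EXTENDED
    SELECT name FROM db1.employees_partitioned
    WHERE state = 'MO';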
5 | Note that Hive is really smart, and if you SELECT * according to 6 | a partition, Hive will not run MapReduce, it will just perform a 7 | hadoop fs -get /user/hive/warehouse/yourtable/partition= :) 8 | -------------------------------------------------------------------------------- /hive/partition-example/create_and_load_employees.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS db1; 2 | DROP TABLE IF EXISTS db1.employees; 3 | CREATE TABLE db1.employees(name STRING, state STRING) 4 | ROW FORMAT DELIMITED 5 | FIELDS TERMINATED BY '\t'; 6 | LOAD DATA LOCAL INPATH 'employees.txt' INTO TABLE db1.employees; 7 | -------------------------------------------------------------------------------- /hive/partition-example/employees.txt: -------------------------------------------------------------------------------- 1 | Bob CA 2 | Steve CA 3 | Andy TX 4 | Sherry TX 5 | Silvia TX 6 | Cynthia TX 7 | Tex TX 8 | Alvin TX 9 | Nate LA 10 | Jerry TX 11 | Doug TX 12 | Terry CA 13 | Betty TX 14 | Bertha TX 15 | Walter TX 16 | Gus TX 17 | Jesse CA 18 | Lydia TX 19 | Hank TX 20 | Marie TX 21 | Fen IL 22 | Mike TX 23 | Jack CA 24 | Ben TX 25 | Ian NY 26 | Sarah TX 27 | Charles MO 28 | Tom MO 29 | Mirko CO 30 | Ted OH 31 | Kaufman MA 32 | Andrew MA 33 | -------------------------------------------------------------------------------- /hive/partition-example/get_partition_info.sql: -------------------------------------------------------------------------------- 1 | USE db1; 2 | SHOW PARTITIONS employees_partitioned; 3 | 4 | EXPLAIN EXTENDED 5 | SELECT name FROM EMPLOYEES 6 | WHERE state = 'MO'; 7 | -------------------------------------------------------------------------------- /hive/partition-example/partition_employees.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS db1.employees_partitioned; 2 | CREATE TABLE db1.employees_partitioned(name STRING) 3 | PARTITIONED BY (state STRING) 4 | ROW FORMAT DELIMITED 5 | FIELDS TERMINATED BY '\t'; 6 | 7 | set hive.exec.dynamic.partition=true; 8 | set hive.exec.dynamic.partition.mode=nonstrict; 9 | -- The columns you're partitioning by should be listed at the END of the SELECT statement 10 | INSERT OVERWRITE TABLE db1.employees_partitioned 11 | PARTITION (state) 12 | SELECT name, state FROM db1.employees; 13 | -------------------------------------------------------------------------------- /hive/partition-example/partition_employees_keep_orig_data.sql: -------------------------------------------------------------------------------- 1 | -- What if you want to keep the original data (state) 2 | -- in the partitioned table? Then create a dummy field 3 | -- in the original table, and select the "state" field twice 4 | -- below. 
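-- (Concretely: SELECT name, state AS orig_state, state FROM db1.employees, as in the INSERT at the bottom of this file.)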
5 | DROP TABLE IF EXISTS db1.employees_partitioned_keep_orig_data; 6 | CREATE TABLE db1.employees_partitioned_keep_orig_data(name STRING, orig_state STRING) 7 | PARTITIONED BY (state STRING) 8 | ROW FORMAT DELIMITED 9 | FIELDS TERMINATED BY '\t'; 10 | 11 | set hive.exec.dynamic.partition=true; 12 | set hive.exec.dynamic.partition.mode=nonstrict; 13 | INSERT OVERWRITE TABLE db1.employees_partitioned_keep_orig_data 14 | PARTITION (state) 15 | SELECT name, state AS orig_state, state FROM db1.employees; 16 | -------------------------------------------------------------------------------- /hive/partition-example/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Script should stop if there's a failure 3 | set -e 4 | hive -S -f create_and_load_employees.sql 5 | hive -S -f partition_employees.sql 6 | hive -S -f partition_employees_keep_orig_data.sql 7 | echo "Browse the data in the /user/hive/warehouse/db1 directory" 8 | hive -S -v -f get_partition_info.sql 9 | -------------------------------------------------------------------------------- /hive/simple_queries/README.md: -------------------------------------------------------------------------------- 1 | # Simple Queries 2 | 3 | Tests for simple queries in Hive 4 | 5 | - subquery_in_where.sql : Meant to test LEFT SEMI JOIN versus subqueries in WHERE clauses. 6 | -------------------------------------------------------------------------------- /hive/simple_queries/create_tables.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS he; 2 | CREATE TABLE IF NOT EXISTS he.customers( 3 | cust_id STRING, 4 | first_name STRING) 5 | ROW FORMAT DELIMITED 6 | FIELDS TERMINATED BY '\t'; 7 | 8 | CREATE TABLE IF NOT EXISTS he.orders( 9 | order_id INT, 10 | cust_id STRING, 11 | first_name STRING, 12 | order_date STRING) 13 | ROW FORMAT DELIMITED 14 | FIELDS TERMINATED BY '\t'; 15 | -------------------------------------------------------------------------------- /hive/simple_queries/customers.txt: -------------------------------------------------------------------------------- 1 | nate nate 2 | bob bob 3 | steve steve 4 | carl carl 5 | sandy sandy 6 | tom tom 7 | rip rip 8 | zip zip 9 | -------------------------------------------------------------------------------- /hive/simple_queries/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | hive -f create_tables.sql 4 | hdfs dfs -put -f customers.txt /user/training 5 | hdfs dfs -put -f orders.txt /user/training 6 | hive -e "LOAD DATA INPATH '/user/training/customers.txt' OVERWRITE INTO TABLE he.customers" 7 | hive -e "LOAD DATA INPATH '/user/training/orders.txt' OVERWRITE INTO TABLE he.orders" 8 | -------------------------------------------------------------------------------- /hive/simple_queries/orders.txt: -------------------------------------------------------------------------------- 1 | 1 nate product1 2011-01-01 2 | 2 bob product1 2011-01-01 3 | 3 steve product1 2011-01-01 4 | 4 carl product1 2011-01-01 5 | 5 sandy product1 2011-01-01 6 | 6 tom product1 2011-01-01 7 | 6 rip product1 2011-01-01 8 | 7 rip product1 2012-01-01 9 | -------------------------------------------------------------------------------- /hive/simple_queries/subquery_in_where.sql: -------------------------------------------------------------------------------- 1 | SELECT c.cust_id FROM he.customers c 2 | WHERE cust_id IN 3 | (SELECT o.cust_id 
FROM he.orders o 4 | WHERE YEAR(o.order_date) = 2012); 5 | 6 | SELECT c.cust_id 7 | FROM he.customers c 8 | LEFT SEMI JOIN he.orders o 9 | ON (c.cust_id = o.cust_id 10 | AND YEAR(o.order_date) = 2012); 11 | -------------------------------------------------------------------------------- /hive/transform/awk-example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "BEFORE: " 3 | cat ../sample-data/transform-example.txt 4 | echo "AFTER AWK FILTER" 5 | cat ../sample-data/transform-example.txt | awk '! a[$1]++' 6 | -------------------------------------------------------------------------------- /hive/transform/legalpets.pl: -------------------------------------------------------------------------------- 1 | #!/bin/env perl 2 | %legal = qw/dog 1 3 | cat 1 4 | ferret 1 5 | bird 1 6 | chimpanzee 1/; 7 | 8 | my @petsIveSeen = (); 9 | while ($pet = <>) { 10 | chomp($pet); 11 | # debug -- this goes to /var/log/hadoop/userlogs///stderr 12 | print STDERR $pet; 13 | if ($legal{$pet}) { 14 | print "$pet\tYES\n"; 15 | } 16 | else { 17 | print "$pet\tNO\n"; 18 | } 19 | 20 | push(@petsIveSeen, $pet); 21 | } 22 | # debug -- this goes to /var/log/hadoop/userlogs///stderr 23 | print STDERR join(',', @petsIveSeen); 24 | -------------------------------------------------------------------------------- /hive/transform/transform-pets.hql: -------------------------------------------------------------------------------- 1 | ADD FILE /home/training/src/training-scripts/hive/transform/legalpets.pl; 2 | 3 | FROM pets 4 | SELECT TRANSFORM ( pet ) 5 | USING "legalpets.pl" 6 | AS name, islegal; 7 | -------------------------------------------------------------------------------- /hive/wordcount/README.TXT: -------------------------------------------------------------------------------- 1 | Basic idea: 2 | 3 | 1) Run wordcount on shakespare using MapReduce 4 | 2) Run wordcount on shakespeare using Hive and compare differences 5 | 6 | Steps 7 | 1) Upload shakespeare to cluster 8 | 2) Run wordcount using MapReduce 9 | 3) Create a table 'mapred_wordcount' containing the output from MapReduce job 10 | 11 | 4) Load shakespeare into a Hive table 12 | 5) Run Hive Query to create a hive_wordcount table 13 | 6) Compare hive_wordcount to mapred_wordcount 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | First run ./run-comparison.sh 23 | Then you can use ./create-external-table-for-mapreduce-output.hql 24 | to view the output of the mapreduce job more easily, 25 | or you can simply use 26 | 27 | hadoop fs -get output/wordcount 28 | -------------------------------------------------------------------------------- /hive/wordcount/README.md: -------------------------------------------------------------------------------- 1 | # Wordcount using Hive 2 | 3 | Upload data to a directory in HDFS. 
Specify 4 | the **absolute** path to the directory to *run.sh* 5 | 6 | Run ./run.sh /absolute/path/to/your/data 7 | -------------------------------------------------------------------------------- /hive/wordcount/compare.hql: -------------------------------------------------------------------------------- 1 | SET hive.cli.print.header=true; 2 | 3 | SELECT wordcount.word, wordcount.count AS hive_count, mr_wordcount.count AS mr_count 4 | FROM wordcount 5 | FULL OUTER JOIN mr_wordcount on (mr_wordcount.word = wordcount.word) 6 | WHERE wordcount.word IS NULL or 7 | mr_wordcount.word IS NULL; 8 | -------------------------------------------------------------------------------- /hive/wordcount/create-external-table-for-mapreduce-output.hql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE mr_wordcount 2 | (word STRING, count INT) 3 | ROW FORMAT DELIMITED 4 | FIELDS TERMINATED BY '\t' 5 | LOCATION '/user/training/output/wordcount' 6 | -------------------------------------------------------------------------------- /hive/wordcount/run-mr-and-hive-queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SHAKESPEARE_DIR=/user/training/shakespeare 3 | LOCAL_DATA_DIR=~/training_materials/developer/data/ 4 | OUTPUT_DIR=/user/training/output/wordcount 5 | 6 | # Already set in ~/.bashrc 7 | # EXAMPLES_DIR=/usr/lib/hadoop-0.20-mapreduce 8 | 9 | hadoop fs -test -d $SHAKESPEARE_DIR || { 10 | 11 | test -d $LOCAL_DATA_DIR/shakespeare || \ 12 | tar -C $LOCAL_DATA_DIR -xzvf $LOCAL_DATA_DIR/shakespeare.tar.gz 13 | 14 | hadoop fs -put $LOCAL_DATA_DIR/shakespeare $SHAKESPEARE_DIR 15 | } 16 | 17 | echo "RUNNING MAPREDUCE JOB" 18 | 19 | hadoop fs -rm -R $OUTPUT_DIR 20 | hadoop jar $EXAMPLES_DIR/hadoop-examples.jar wordcount $SHAKESPEARE_DIR \ 21 | $OUTPUT_DIR 22 | 23 | echo "RUNNING HIVE QUERY" 24 | hive -f ./wordcount.hql 25 | 26 | -------------------------------------------------------------------------------- /hive/wordcount/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DATA_DIRECTORY=$1 3 | test -z "$DATA_DIRECTORY" && { 4 | echo "Usage: run.sh " 5 | exit 1 6 | } 7 | 8 | hadoop fs -test -e $DATA_DIRECTORY || { 9 | echo "HDFS directory $DATA_DIRECTORY doesn't exist" 10 | echo "Usage: run.sh " 11 | exit 1 12 | } 13 | echo "RUNNING HIVE QUERY" 14 | hive -S -d input_directory=$DATA_DIRECTORY -f ./wordcount.hql 15 | 16 | -------------------------------------------------------------------------------- /hive/wordcount/wordcount.hql: -------------------------------------------------------------------------------- 1 | DROP TABLE shakespeare; 2 | CREATE EXTERNAL TABLE shakespeare (line STRING) 3 | LOCATION '/user/training/shakespeare'; 4 | 5 | DROP TABLE IF EXISTS wordcount; 6 | CREATE TABLE wordcount AS 7 | SELECT word, count(1) AS count 8 | FROM 9 | (SELECT explode(split(lcase(line), '\\W+')) AS word 10 | FROM shakespeare) words 11 | GROUP BY word ORDER BY word; 12 | -------------------------------------------------------------------------------- /impala/README.md: -------------------------------------------------------------------------------- 1 | # Simple Queries 2 | 3 | Tests for simple queries in Impala 4 | 5 | - subquery_in_where.sql : Meant to test LEFT SEMI JOIN versus subqueries in WHERE clauses. 
6 | - Related Hive code/"test" in ../../hive/simple_queries 7 | -------------------------------------------------------------------------------- /impala/analytic-functions/ads.txt: -------------------------------------------------------------------------------- 1 | 2015-05-01 losing_clicks 2 | 2015-05-01 losing_clicks 3 | 2015-05-01 losing_clicks 4 | 2015-05-01 losing_clicks 5 | 2015-05-01 losing_clicks 6 | 2015-05-01 losing_clicks 7 | 2015-05-01 losing_clicks 8 | 2015-05-01 gaining 9 | 2015-05-02 losing_clicks 10 | 2015-05-02 losing_clicks 11 | 2015-05-02 losing_clicks 12 | 2015-05-02 losing_clicks 13 | 2015-05-02 losing_clicks 14 | 2015-05-02 losing_clicks 15 | 2015-05-02 gaining 16 | 2015-05-02 gaining 17 | 2015-05-03 losing_clicks 18 | 2015-05-03 losing_clicks 19 | 2015-05-03 losing_clicks 20 | 2015-05-03 losing_clicks 21 | 2015-05-03 losing_clicks 22 | 2015-05-03 gaining 23 | 2015-05-03 gaining 24 | 2015-05-03 gaining 25 | 2015-05-04 losing_clicks 26 | 2015-05-04 losing_clicks 27 | 2015-05-04 losing_clicks 28 | 2015-05-04 losing_clicks 29 | 2015-05-04 gaining 30 | 2015-05-04 gaining 31 | 2015-05-04 gaining 32 | 2015-05-04 gaining 33 | 2015-05-05 losing_clicks 34 | 2015-05-05 losing_clicks 35 | 2015-05-05 losing_clicks 36 | 2015-05-05 gaining 37 | 2015-05-05 gaining 38 | 2015-05-05 gaining 39 | 2015-05-05 gaining 40 | 2015-05-05 gaining 41 | 2015-05-06 losing_clicks 42 | 2015-05-06 losing_clicks 43 | 2015-05-06 gaining 44 | 2015-05-06 gaining 45 | 2015-05-06 gaining 46 | 2015-05-06 gaining 47 | 2015-05-06 gaining 48 | 2015-05-06 gaining 49 | 2015-05-07 losing_clicks 50 | 2015-05-07 gaining 51 | 2015-05-07 gaining 52 | 2015-05-07 gaining 53 | 2015-05-07 gaining 54 | 2015-05-07 gaining 55 | 2015-05-07 gaining 56 | 2015-05-07 gaining 57 | 2015-05-08 gaining 58 | 2015-05-08 gaining 59 | 2015-05-08 gaining 60 | 2015-05-08 gaining 61 | 2015-05-08 gaining 62 | 2015-05-08 gaining 63 | 2015-05-08 gaining 64 | 2015-05-08 gaining 65 | -------------------------------------------------------------------------------- /impala/analytic-functions/avg_ads.sql: -------------------------------------------------------------------------------- 1 | SELECT display_date, display_site, n, 2 | AVG(n) OVER 3 | (PARTITION BY display_site 4 | ORDER BY display_date 5 | ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS wavg 6 | FROM ( 7 | SELECT display_date, display_site, 8 | count(display_date) AS n 9 | FROM ads GROUP BY display_date, display_site 10 | ) ads 11 | ORDER BY display_site, display_date; 12 | -------------------------------------------------------------------------------- /impala/analytic-functions/avg_ads.txt: -------------------------------------------------------------------------------- 1 | +---------------------+---------------+---+------+ 2 | | display_date | display_site | n | wavg | 3 | +---------------------+---------------+---+------+ 4 | | 2015-05-01 00:00:00 | gaining | 1 | 1 | 5 | | 2015-05-02 00:00:00 | gaining | 2 | 1.5 | 6 | | 2015-05-03 00:00:00 | gaining | 3 | 2 | 7 | | 2015-05-04 00:00:00 | gaining | 4 | 2.5 | 8 | | 2015-05-05 00:00:00 | gaining | 5 | 3.5 | 9 | | 2015-05-06 00:00:00 | gaining | 6 | 4.5 | 10 | | 2015-05-07 00:00:00 | gaining | 7 | 5.5 | 11 | | 2015-05-08 00:00:00 | gaining | 8 | 6.5 | 12 | | 2015-05-01 00:00:00 | losing_clicks | 7 | 7 | 13 | | 2015-05-02 00:00:00 | losing_clicks | 6 | 6.5 | 14 | | 2015-05-03 00:00:00 | losing_clicks | 5 | 6 | 15 | | 2015-05-04 00:00:00 | losing_clicks | 4 | 5.5 | 16 | | 2015-05-05 00:00:00 | losing_clicks | 3 | 4.5 | 17 | | 2015-05-06 
00:00:00 | losing_clicks | 2 | 3.5 | 18 | | 2015-05-07 00:00:00 | losing_clicks | 1 | 2.5 | 19 | +---------------------+---------------+---+------+ 20 | -------------------------------------------------------------------------------- /impala/analytic-functions/create_ads.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS ads; 2 | 3 | CREATE EXTERNAL TABLE ads 4 | (display_date TIMESTAMP, 5 | display_site STRING) 6 | ROW FORMAT DELIMITED 7 | FIELDS TERMINATED BY '\t' 8 | LOCATION '/dualcore/ads'; 9 | -------------------------------------------------------------------------------- /impala/analytic-functions/impala-version.txt: -------------------------------------------------------------------------------- 1 | Impala Shell v2.0.0-cdh5 (ecf30af) built on Sat Oct 11 13:56:06 PDT 2014 2 | Hadoop 2.5.0-cdh5.2.0 3 | Subversion http://github.com/cloudera/hadoop -r e1f20a08bde76a33b79df026d00a0c91b2298387 4 | Compiled by jenkins on 2014-10-11T21:00Z 5 | Compiled with protoc 2.5.0 6 | From source with checksum 309bccd135b199bdfdd6df5f3f4153d 7 | This command was run using /usr/lib/hadoop/hadoop-common-2.5.0-cdh5.2.0.jar 8 | -------------------------------------------------------------------------------- /impala/analytic-functions/lag_ads.sql: -------------------------------------------------------------------------------- 1 | SELECT display_date, display_site, n, 2 | LAG(n) OVER 3 | (PARTITION BY display_site 4 | ORDER BY display_date) AS nprev 5 | FROM ( 6 | SELECT display_date, display_site, 7 | count(display_date) AS n 8 | FROM ads GROUP BY display_date, display_site 9 | ) ads 10 | ORDER BY display_site, display_date; 11 | -------------------------------------------------------------------------------- /impala/analytic-functions/lag_ads.txt: -------------------------------------------------------------------------------- 1 | +---------------------+---------------+---+-------+ 2 | | display_date | display_site | n | nprev | 3 | +---------------------+---------------+---+-------+ 4 | | 2015-05-01 00:00:00 | gaining | 1 | NULL | 5 | | 2015-05-02 00:00:00 | gaining | 2 | 1 | 6 | | 2015-05-03 00:00:00 | gaining | 3 | 2 | 7 | | 2015-05-04 00:00:00 | gaining | 4 | 3 | 8 | | 2015-05-05 00:00:00 | gaining | 5 | 4 | 9 | | 2015-05-06 00:00:00 | gaining | 6 | 5 | 10 | | 2015-05-07 00:00:00 | gaining | 7 | 6 | 11 | | 2015-05-08 00:00:00 | gaining | 8 | 7 | 12 | | 2015-05-01 00:00:00 | losing_clicks | 7 | NULL | 13 | | 2015-05-02 00:00:00 | losing_clicks | 6 | 7 | 14 | | 2015-05-03 00:00:00 | losing_clicks | 5 | 6 | 15 | | 2015-05-04 00:00:00 | losing_clicks | 4 | 5 | 16 | | 2015-05-05 00:00:00 | losing_clicks | 3 | 4 | 17 | | 2015-05-06 00:00:00 | losing_clicks | 2 | 3 | 18 | | 2015-05-07 00:00:00 | losing_clicks | 1 | 2 | 19 | +---------------------+---------------+---+-------+ 20 | -------------------------------------------------------------------------------- /impala/analytic-functions/run.sh: -------------------------------------------------------------------------------- 1 | hdfs dfs -mkdir -p /dualcore/ads 2 | hdfs dfs -put ads.txt /dualcore/ads 3 | impala-shell -f create_ads.sql 4 | impala-shell -f lag_ads.sql -o lag_ads.txt 5 | impala-shell -f avg_ads.sql -o avg_ads.txt 6 | echo "Look in lag_ads.txt, and avg_ads.txt" 7 | -------------------------------------------------------------------------------- /impala/datatypes/decimal_vs_integer/README.md: -------------------------------------------------------------------------------- 1 | # Decimal 
vs. Integer 2 | 3 | Playing with Impala data types 4 | -------------------------------------------------------------------------------- /impala/datatypes/decimal_vs_integer/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE test_decimal 2 | (userid STRING, 3 | some_number DECIMAL); 4 | 5 | CREATE EXTERNAL TABLE test_integer 6 | (userid STRING, 7 | some_number INT) 8 | LOCATION '/user/hive/warehouse/test_decimal'; 9 | 10 | LOAD DATA INPATH 'data.txt' INTO TABLE test_decimal; 11 | INVALIDATE METADATA; 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /impala/datatypes/decimal_vs_integer/data.txt: -------------------------------------------------------------------------------- 1 | bob 10 2 | bob 10.00 3 | bob 10.0 4 | nate .11 5 | -------------------------------------------------------------------------------- /impala/datatypes/decimal_vs_integer/run.sh: -------------------------------------------------------------------------------- 1 | IMPALAD=$1 2 | hdfs dfs -put data.txt 3 | impala-shell -i $IMPALAD -f ./create_table.sql 4 | impala-shell -i $IMPALAD -f ./run_queries.sql 5 | -------------------------------------------------------------------------------- /impala/datatypes/decimal_vs_integer/run_queries.sql: -------------------------------------------------------------------------------- 1 | SELECT some_number FROM test_decimal; 2 | SELECT some_number FROM test_integer; 3 | -------------------------------------------------------------------------------- /impala/dyn_test/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Partition Insert Tests 2 | 3 | Test whether existing partitions are affected by dynamic 4 | partition INSERT OVERWRITE statements.
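The heart of the test is a dynamic-partition INSERT OVERWRITE, roughly
(see dyn_part.sql plus run_me_hive.sh / run_me_impala.sh for the full sequence):

    INSERT OVERWRITE TABLE branch_totals
    PARTITION(year)
    SELECT branch, total, year
    FROM monday_totals;

It is run once with Monday's data and once with Tuesday's; the question is
whether the year=2015 partition (present only in Monday's load) survives the
second INSERT OVERWRITE.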
5 | -------------------------------------------------------------------------------- /impala/dyn_test/branch_totals_monday.txt: -------------------------------------------------------------------------------- 1 | 2015 branch1 100 2 | 2016 branch2 100 3 | -------------------------------------------------------------------------------- /impala/dyn_test/branch_totals_tuesday.txt: -------------------------------------------------------------------------------- 1 | 2016 branch2 200 2 | 2017 branch3 1000 3 | -------------------------------------------------------------------------------- /impala/dyn_test/dyn_part.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS branch_totals; 2 | DROP TABLE IF EXISTS monday_totals; 3 | DROP TABLE IF EXISTS tuesday_totals; 4 | 5 | CREATE TABLE monday_totals( 6 | year INT, 7 | branch STRING, 8 | total INT 9 | ) 10 | ROW FORMAT DELIMITED 11 | FIELDS TERMINATED BY '\t'; 12 | 13 | LOAD DATA INPATH '/loudacre/branch_totals_monday.txt' 14 | INTO TABLE monday_totals; 15 | 16 | CREATE TABLE tuesday_totals LIKE monday_totals; 17 | 18 | LOAD DATA INPATH '/loudacre/branch_totals_tuesday.txt' 19 | INTO TABLE tuesday_totals; 20 | 21 | CREATE TABLE branch_totals( 22 | branch STRING, 23 | total INT) 24 | PARTITIONED BY (year INT) 25 | STORED AS PARQUET; 26 | 27 | INSERT OVERWRITE TABLE branch_totals 28 | PARTITION(year) 29 | SELECT branch, total, year 30 | FROM monday_totals; 31 | 32 | SELECT year, branch, total 33 | FROM branch_totals 34 | ORDER BY year; 35 | 36 | INSERT OVERWRITE TABLE branch_totals 37 | PARTITION(year) 38 | SELECT branch, total, year 39 | FROM tuesday_totals; 40 | 41 | SELECT year, branch, total 42 | FROM branch_totals 43 | ORDER BY year; 44 | -------------------------------------------------------------------------------- /impala/dyn_test/run_me_hive.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | hdfs dfs -mkdir -p /loudacre 3 | hdfs dfs -put -f branch_totals_monday.txt /loudacre 4 | hdfs dfs -put -f branch_totals_tuesday.txt /loudacre 5 | beeline -u jdbc:hive2://localhost:10000 \ 6 | -f dyn_part.sql \ 7 | --silent=true \ 8 | --hiveconf hive.exec.dynamic.partition=true \ 9 | --hiveconf hive.exec.dynamic.partition.mode=nonstrict 10 | 11 | 12 | -------------------------------------------------------------------------------- /impala/dyn_test/run_me_impala.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | hdfs dfs -mkdir -p /loudacre 3 | hdfs dfs -put -f branch_totals_monday.txt /loudacre 4 | hdfs dfs -put -f branch_totals_tuesday.txt /loudacre 5 | impala-shell -f dyn_part.sql --quiet 6 | -------------------------------------------------------------------------------- /impala/file_format_shootout/README.TXT: -------------------------------------------------------------------------------- 1 | There should be a database/table tpcds_sample.store_sales 2 | -------------------------------------------------------------------------------- /impala/file_format_shootout/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Simply a script to run some counts, selects on the various tables 3 | IMPALAD=$1 4 | if [[ -z "$IMPALAD" ]]; then 5 | echo "Usage $0 " 6 | exit 1 7 | fi 8 | 9 | impala-shell --impalad $IMPALAD -q "INVALIDATE METADATA;" 10 | 11 | for tbl in seq_store_sales parquet_store_sales rc_store_sales store_sales 12 | do 13 | 
QUERY=$(cat < substr(s_zip, 1, 5) 24 | and ss_store_sk = s_store_sk 25 | and ss_sold_date_sk between 2451484 and 2451513 -- partition key filter 26 | group by 27 | i_brand, 28 | i_brand_id, 29 | i_manufact_id, 30 | i_manufact 31 | order by 32 | ext_price desc, 33 | i_brand, 34 | i_brand_id, 35 | i_manufact_id, 36 | i_manufact 37 | limit 100; 38 | -- end query 1 in stream 0 using template query19.tpl 39 | -------------------------------------------------------------------------------- /impala/file_format_shootout/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | IMPALAD=$1 4 | test -z "$IMPALAD" && { 5 | echo "Usage: $0 " 6 | exit 1 7 | } 8 | 9 | echo "This will drop and recreate parquet, sequencefile, rcfile data!. Press ENTER to continue, Ctrl-C to cancel" 10 | read GOAHEAD 11 | 12 | # Display commands before being run 13 | set -x 14 | 15 | # Zap tables prior to running. There should be a tpcds_sample database with store_sales table in it. 16 | hive -f ./drop_tables.sql 17 | # First, create the parquet table! 18 | impala-shell --impalad $IMPALAD --refresh_after_connect -f ./create_and_populate_parquet_table.sql 19 | 20 | # Then, use Impala to cheat and easily create/define the RC Table Definition 21 | # I haven't found out how to use the CREATE TABLE LIKE in Hive with RCFilez 22 | impala-shell --impalad $IMPALAD --refresh_after_connect -f ./create_rc_and_sequencefile_table.sql 23 | 24 | # Now, Use Hive to populate RC and SequenceFile Tables. Impala can't do that yet. 25 | # http://www.cloudera.com/content/cloudera-content/cloudera-docs/Impala/latest/Installing-and-Using-Impala/ciiu_file_formats.html 26 | hive -f ./populate_rc_and_sequencefile_table.sql 27 | -------------------------------------------------------------------------------- /impala/file_format_shootout/run_q_19.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | IMPALAD=$1 3 | DB=$2 4 | if [[ -z "$IMPALAD" ]]; then 5 | echo "Usage $0 " 6 | exit 1 7 | fi 8 | if [[ -z "$DB" ]]; then 9 | echo "Usage $0 " 10 | exit 1 11 | fi 12 | 13 | impala-shell --database=$DB --impalad=$IMPALAD -q "alter table big_tpcds_parquet.store_sales set cached in 'four_gig_pool';" 14 | # impala-shell --database=$DB --impalad=$IMPALAD -q "COMPUTE STATS store_sales;" 15 | impala-shell --database=$DB --impalad=$IMPALAD --query_file=./q19.sql 16 | # Don't run this vvvvvvvvvv 17 | # impala-shell --big_tpcds --impalad $IMPALAD -q "INVALIDATE METADATA;" 18 | -------------------------------------------------------------------------------- /impala/google-ngrams/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | Impala script to search through some google-ngrams data from a *very* small 4 | subset of http://storage.googleapis.com/books/ngrams/books/datasetsv2.html 5 | -------------------------------------------------------------------------------- /impala/google-ngrams/count_spark.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE IF NOT EXISTS 2 | google_ngrams(line STRING) 3 | LOCATION '/google-ngrams'; 4 | 5 | REFRESH google_ngrams; 6 | 7 | SELECT COUNT(line) 8 | FROM google_ngrams 9 | WHERE line LIKE "%spark%"; 10 | -------------------------------------------------------------------------------- /impala/google-ngrams/find_spark.sql: -------------------------------------------------------------------------------- 1 | 
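-- Count google-ngrams lines that mention "spark" (currently the same query as count_spark.sql).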
CREATE EXTERNAL TABLE IF NOT EXISTS 2 | google_ngrams(line STRING) 3 | LOCATION '/google-ngrams'; 4 | 5 | REFRESH google_ngrams; 6 | 7 | SELECT COUNT(line) 8 | FROM google_ngrams 9 | WHERE line LIKE "%spark%"; 10 | -------------------------------------------------------------------------------- /impala/google-ngrams/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ -z "$1" ]]; then 3 | echo "Usage: $0 " 4 | exit 1 5 | fi 6 | 7 | impala-shell -i $1 -f find_spark.sql 8 | -------------------------------------------------------------------------------- /impala/impala-impyla-playground/README.TXT: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | impala-impyla-playground is a set of loose scripts to use python's dynamic 4 | language features to easily write "unit test" for queries written in Impala 5 | -------------------------------------------------------------------------------- /impala/impala-impyla-playground/data/simple.txt: -------------------------------------------------------------------------------- 1 | 1 Nate 2 | 1 Nate 3 | 1 Nate 4 | 1 Nate 5 | 1 Nate 6 | 1 Nate 7 | 1 Nate 8 | 1 Nate 9 | 1 Nate 10 | 1 Nate 11 | 1 Nate 12 | 1 Nate 13 | 1 Nate 14 | 1 Nate 15 | 1 Nate 16 | 1 Nate 17 | 1 Nate 18 | 1 Nate 19 | 1 Nate 20 | 1 Nate 21 | 1 Nate 22 | 1 Nate 23 | 1 Nate 24 | 1 Nate 25 | 1 Nate 26 | 1 Nate 27 | 1 Nate 28 | 1 Nate 29 | 1 Nate 30 | 1 Nate 31 | 1 Nate 32 | 1 Nate 33 | 1 Nate 34 | 1 Nate 35 | 1 Nate 36 | 1 Nate 37 | 1 Nate 38 | 1 Nate 39 | 1 Nate 40 | 1 Nate 41 | 42 | -------------------------------------------------------------------------------- /impala/impala-impyla-playground/simple.py: -------------------------------------------------------------------------------- 1 | from impala.dbapi import connect 2 | conn = connect(host='localhost', port=21050) 3 | cur = conn.cursor() 4 | cur.execute(""" 5 | CREATE EXTERNAL TABLE IF NOT EXISTS simple(id INT, name STRING) 6 | ROW FORMAT DELIMITED 7 | FIELDS TERMINATED BY '\t' 8 | STORED AS TEXTFILE 9 | LOCATION '/user/cloudera/tables/simple'; 10 | """) 11 | 12 | cur.execute("SELECT * FROM simple"); 13 | results = cur.fetchall() 14 | assert 41 == len(results); 15 | 16 | 17 | -------------------------------------------------------------------------------- /impala/impyla/query_impala.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | import sys 3 | from impala.dbapi import connect 4 | 5 | which_db = "tpcds" 6 | impalad = "" 7 | 8 | if len(sys.argv) > 1: 9 | impalad = sys.argv[1] 10 | else: 11 | print "Usage: query_impyla.py " 12 | exit(1) 13 | 14 | def message(m): 15 | print "-" * 20 16 | print m 17 | 18 | def show_tables(db): 19 | cur.execute('SHOW TABLES IN %s' % db) 20 | tables = cur.fetchall() 21 | message("The tables in the %s database are: " % db) 22 | print tables 23 | 24 | def top_five_customers(db): 25 | # This method is obviously database dependent and assumes the TPCDS-DB 26 | 27 | cur.execute("DESCRIBE %s.customer" % db) 28 | 29 | message("Showing customer schema") 30 | print "Fields in customer are:" 31 | for fieldz in cur.fetchall(): 32 | print "%-25s %-25s" % (fieldz[0], fieldz[1]) 33 | 34 | message("Customer Data") 35 | cur.execute("SELECT c_last_name, c_first_name FROM %s.customer WHERE c_last_name IS NOT NULL ORDER BY c_last_name DESC LIMIT 50" % db) 36 | customers = cur.fetchall() 37 | 38 | print "%-25s %-25s\n%s" % ("Last Name", "First Name", "-" * 
50) 39 | for c in customers: 40 | print "%-25s %-25s" % c 41 | 42 | 43 | conn = connect(host=impalad, port = 21050) 44 | cur = conn.cursor() 45 | 46 | show_tables(which_db) 47 | 48 | top_five_customers(which_db) 49 | 50 | -------------------------------------------------------------------------------- /impala/parquet/README.txt: -------------------------------------------------------------------------------- 1 | Query: select max(ss_coupon_amt) FROM tpcds.store_sales LIMIT 10 2 | +--------------------+ 3 | | max(ss_coupon_amt) | 4 | +--------------------+ 5 | | 19225 | 6 | +--------------------+ 7 | Returned 1 row(s) in 467.99s 8 | 9 | real 7m48.449s 10 | user 0m0.835s 11 | sys 0m0.230s 12 | 13 | -- Parquet 14 | Query: select max(ss_coupon_amt) FROM tpcds.parquet_store_sales LIMIT 10 15 | +--------------------+ 16 | | max(ss_coupon_amt) | 17 | +--------------------+ 18 | | 19225 | 19 | +--------------------+ 20 | Returned 1 row(s) in 7.44s 21 | real 0m7.869s 22 | user 0m0.430s 23 | sys 0m0.057s 24 | -------------------------------------------------------------------------------- /impala/parquet/run.sh: -------------------------------------------------------------------------------- 1 | IMPALAD=$1 2 | if [[ -z "$IMPALAD" ]]; then 3 | echo "Usage: run.sh " 4 | exit 1 5 | fi 6 | echo "Running against Parquet table......" 7 | time impala-shell --impalad $IMPALAD -q "select ss_coupon_amt FROM tpcds.parquet_store_sales WHERE ss_coupon_amt IS NOT NULL ORDER BY ss_coupon_amt DESC LIMIT 10;" 8 | echo "Running against Text table......" 9 | time impala-shell --impalad $IMPALAD -q "select ss_coupon_amt FROM tpcds.store_sales WHERE ss_coupon_amt IS NOT NULL ORDER BY ss_coupon_amt DESC LIMIT 10;" 10 | -------------------------------------------------------------------------------- /impala/refresh-and-invalidate/create-table.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS hadoop_examples; 2 | USE hadoop_examples; 3 | DROP TABLE IF EXISTS refresh_test; 4 | CREATE TABLE IF NOT EXISTS refresh_test(id INT); 5 | -------------------------------------------------------------------------------- /impala/refresh-and-invalidate/monday.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | -------------------------------------------------------------------------------- /impala/refresh-and-invalidate/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | IMPALAD1=$1 4 | IMPALAD2=$2 5 | DBNAME=hadoop_examples 6 | TBLNAME=refresh_test 7 | 8 | if [[ -z "$IMPALAD1" || -z "$IMPALAD2" ]]; then 9 | echo "run.sh " 10 | exit 1 11 | fi 12 | 13 | # Using Hive, otherwise Impala doesn't drop existing files :-O 14 | hive -S -f ./create-table.sql 15 | hive -S -e "LOAD DATA LOCAL INPATH 'monday.txt' INTO TABLE $DBNAME.$TBLNAME" 16 | 17 | echo "Issuing query to $IMPALAD1 for # of rows" 18 | impala-shell --quiet --impalad $IMPALAD1 -q "SELECT COUNT(*) FROM $DBNAME.$TBLNAME" 19 | 20 | echo "Issuing query to $IMPALAD2 for # of rows" 21 | impala-shell --quiet --impalad $IMPALAD2 -q "SELECT COUNT(*) FROM $DBNAME.$TBLNAME" 22 | 23 | echo "Loading more data, but NO refresh" 24 | hive -S -e "LOAD DATA LOCAL INPATH 'tuesday.txt' INTO TABLE $DBNAME.$TBLNAME" 25 | 26 | echo "Issuing query to $IMPALAD1 for # of rows, should still see only 10" 27 | impala-shell --quiet --impalad $IMPALAD1 -q "SELECT COUNT(*) FROM 
$DBNAME.$TBLNAME" 28 | 29 | echo "Issuing REFRESH to $IMPALAD1 for # of rows, should now see 20" 30 | impala-shell --quiet --impalad $IMPALAD1 -q "REFRESH $DBNAME.$TBLNAME;SELECT COUNT(*) FROM $DBNAME.$TBLNAME" 31 | 32 | echo "Issuing count query to $IMPALAD2, withOUT refresh, should see 20 due to catalog server caching" 33 | impala-shell --quiet --impalad $IMPALAD2 -q "SELECT COUNT(*) FROM $DBNAME.$TBLNAME" 34 | -------------------------------------------------------------------------------- /impala/refresh-and-invalidate/tuesday.txt: -------------------------------------------------------------------------------- 1 | 11 2 | 12 3 | 13 4 | 14 5 | 15 6 | 16 7 | 17 8 | 18 9 | 19 10 | 20 11 | -------------------------------------------------------------------------------- /impala/refresh-and-invalidate/wednesday.txt: -------------------------------------------------------------------------------- 1 | 21 2 | 22 3 | 23 4 | 24 5 | 25 6 | 26 7 | 27 8 | 28 9 | 29 10 | 30 11 | -------------------------------------------------------------------------------- /impala/simple_queries/create_tables.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS he; 2 | CREATE TABLE IF NOT EXISTS he.customers( 3 | cust_id STRING, 4 | first_name STRING) 5 | ROW FORMAT DELIMITED 6 | FIELDS TERMINATED BY '\t'; 7 | 8 | CREATE TABLE IF NOT EXISTS he.orders( 9 | order_id INT, 10 | cust_id STRING, 11 | first_name STRING, 12 | order_date STRING) 13 | ROW FORMAT DELIMITED 14 | FIELDS TERMINATED BY '\t'; 15 | -------------------------------------------------------------------------------- /impala/simple_queries/customers.txt: -------------------------------------------------------------------------------- 1 | nate nate 2 | bob bob 3 | steve steve 4 | carl carl 5 | sandy sandy 6 | tom tom 7 | rip rip 8 | zip zip 9 | -------------------------------------------------------------------------------- /impala/simple_queries/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | impala-shell -f create_tables.sql 4 | hdfs dfs -put -f customers.txt /user/training 5 | hdfs dfs -put -f orders.txt /user/training 6 | impala-shell -q "LOAD DATA INPATH '/user/training/customers.txt' OVERWRITE INTO TABLE he.customers" 7 | impala-shell -q "LOAD DATA INPATH '/user/training/orders.txt' OVERWRITE INTO TABLE he.orders" 8 | -------------------------------------------------------------------------------- /impala/simple_queries/orders.txt: -------------------------------------------------------------------------------- 1 | 1 nate product1 2011-01-01 2 | 2 bob product1 2011-01-01 3 | 3 steve product1 2011-01-01 4 | 4 carl product1 2011-01-01 5 | 5 sandy product1 2011-01-01 6 | 6 tom product1 2011-01-01 7 | 6 rip product1 2011-01-01 8 | 7 rip product1 2012-01-01 9 | -------------------------------------------------------------------------------- /impala/simple_queries/subquery_in_where.sql: -------------------------------------------------------------------------------- 1 | SELECT c.cust_id FROM he.customers c 2 | WHERE cust_id IN 3 | (SELECT o.cust_id FROM he.orders o 4 | WHERE YEAR(o.order_date) = 2012); 5 | 6 | SELECT c.cust_id 7 | FROM he.customers c 8 | LEFT SEMI JOIN he.orders o 9 | ON (c.cust_id = o.cust_id 10 | AND YEAR(o.order_date) = 2012); 11 | -------------------------------------------------------------------------------- /impala/timestamps/README.md: 
-------------------------------------------------------------------------------- 1 | # Timestamps 2 | 3 | Example "queries" are from 4 | 5 | http://www.cloudera.com/content/cloudera/en/documentation/cloudera-impala/latest/topics/impala_timestamp.html#timestamp 6 | -------------------------------------------------------------------------------- /impala/timestamps/queries.sql: -------------------------------------------------------------------------------- 1 | select cast('1966-07-30' as timestamp); 2 | select cast('1985-09-25 17:45:30.005' as timestamp); 3 | select cast('08:30:00' as timestamp); 4 | select hour('1970-01-01 15:30:00'); -- Succeeds, returns 15. 5 | select hour('1970-01-01 15:30'); -- Returns NULL because seconds field required. 6 | select hour('1970-01-01 27:30:00'); -- Returns NULL because hour value out of range. 7 | select dayofweek('2004-06-13'); -- Returns 1, representing Sunday. 8 | select dayname('2004-06-13'); -- Returns 'Sunday'. 9 | select date_add('2004-06-13', 365); -- Returns 2005-06-13 with zeros for hh:mm:ss fields. 10 | select day('2004-06-13'); -- Returns 13. 11 | select datediff('1989-12-31','1984-09-01'); -- How many days between these 2 dates? 12 | select now(); -- Returns current date and time in local timezone. 13 | -------------------------------------------------------------------------------- /impala/timestamps/querying_timestamps.sql: -------------------------------------------------------------------------------- 1 | create table if not exists dates_and_times (actual_value_as_string STRING, t timestamp); 2 | 3 | insert into dates_and_times values 4 | ('1966-07-30', '1966-07-30'), 5 | ('1985-09-25 17:45:30.005', '1985-09-25 17:45:30.005'), 6 | ('08:30:00', '08:30:00'), 7 | (CAST(now() AS STRING), now()); 8 | 9 | select actual_value_as_string, hour(t) 10 | from dates_and_times 11 | order by actual_value_as_string; 12 | -------------------------------------------------------------------------------- /impala/tpcds/frequent_customers.sql: -------------------------------------------------------------------------------- 1 | -- For kicks, try store_sales versus parquet_store_sales 2 | -- For smaller data size, use tpcds_sample database 3 | USE tpcds_parquet; 4 | SELECT ss_customer_sk, 5 | COUNT(*) AS num_purchases, 6 | SUM(ss_net_profit) AS total_profit 7 | FROM store_sales 8 | GROUP BY ss_customer_sk 9 | ORDER BY num_purchases DESC 10 | LIMIT 100; 11 | -------------------------------------------------------------------------------- /impala/tpcds/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 100GB dataset! 
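# NOTE: frequent_customers.sql selects its own database (tpcds_parquet); the TABLE variable below is informational only.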
3 | # 15 seconds with Parquet, versus 480 seconds with plain text 4 | IMPALAD=$1 5 | TABLE=tpcds.parquet_store_sales 6 | if [[ -z "$IMPALAD" ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | impala-shell --impalad $IMPALAD -f ./frequent_customers.sql 12 | -------------------------------------------------------------------------------- /impala/tuning/compare_store_sales.sql: -------------------------------------------------------------------------------- 1 | SELECT count(*) FROM big_tpcds_parquet.store_sales; 2 | 3 | SUMMARY; 4 | PROFILE; 5 | 6 | SELECT count(*) FROM big_tpcds.store_sales; 7 | 8 | SUMMARY; 9 | PROFILE; 10 | 11 | -------------------------------------------------------------------------------- /impala/tuning/show_summary.sql: -------------------------------------------------------------------------------- 1 | /* You can run SUMMARY right after a query to get overall stats 2 | * and PROFILE to get details 3 | */ 4 | 5 | SELECT count(*) FROM shakespeare; 6 | 7 | SUMMARY; 8 | 9 | PROFILE; 10 | 11 | -------------------------------------------------------------------------------- /kafka-examples/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | he-kafka-examples.iml 3 | runme.sh 4 | -------------------------------------------------------------------------------- /kafka-examples/README.md: -------------------------------------------------------------------------------- 1 | # Running these 2 | 3 | I like the Maven exec plugin: 4 | 5 | mvn exec:java -Dexec.mainClass="com.cloudera.kafkaexamples.SimpleProducer" 6 | 7 | Also kinda cool to override the log4j properties at runtime: 8 | 9 | mvn exec:java \ 10 | -Dexec.mainClass="com.cloudera.kafkaexamples.SimpleProducer" \ 11 | -Dlog4j.configuration="file:/full/path/to/THIS_IS_COOL.properties" 12 | 13 | ## Running the From-Beginning Example 14 | 15 | You need to supply TOPIC and BOOTSTRAP_SERVERS environment variable(s) 16 | 17 | export TOPIC=customers 18 | export BOOTSTRAP_SERVERS= 19 | mvn exec:java \ 20 | -Dexec.mainClass="com.cloudera.kafkaexamples.SimpleConsumer" \ 21 | -Dexec.args="--from-beginning --group-id foogroup --bootstrap-server $BOOTSTRAP_SERVERS --topic $TOPIC" \ 22 | -Dlog4j.configuration="file:./log4jConfigs/seekToBeginning.properties" 23 | -------------------------------------------------------------------------------- /kafka-examples/THIS_IS_COOL.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=TRACE, stdout, fileAppender 2 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 4 | log4j.appender.stdout.layout.ConversionPattern=[%d] %p %m thread %X (%c)%n 5 | 6 | log4j.appender.fileAppender=org.apache.log4j.FileAppender 7 | log4j.appender.fileAppender.File=kafka-request.log 8 | log4j.appender.fileAppender.Append=False 9 | 10 | log4j.appender.fileAppender.layout=org.apache.log4j.EnhancedPatternLayout 11 | log4j.appender.fileAppender.layout.ConversionPattern= %-4r [%t] %-5p %c - %m%n 12 | 13 | 14 | 15 | # Turn on all our debugging info 16 | #log4j.logger.kafka=TRACE,fileAppender 17 | #log4j.logger.kafka.producer.async.DefaultEventHandler=DEBUG,stdout 18 | #log4j.logger.kafka.consumer.PartitionTopicInfo=TRACE,stdout 19 | #log4j.logger.kafka.request.logger=TRACE,fileAppender 20 | #log4j.additivity.kafka.request.logger=false 21 | #log4j.logger.kafka.network.Processor=TRACE,fileAppender 22 | 
#log4j.additivity.kafka.network.Processor=false 23 | #log4j.logger.org.I0Itec.zkclient.ZkClient=DEBUG 24 | -------------------------------------------------------------------------------- /kafka-examples/log4jConfigs/seekToBeginning.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout, fileAppender 2 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.layout=org.apache.log4j.EnhancedPatternLayout 4 | log4j.appender.stdout.layout.ConversionPattern= %d{HH:mm:ss,SSS} %-5p %c - %m%n 5 | 6 | log4j.appender.fileAppender=org.apache.log4j.FileAppender 7 | log4j.appender.fileAppender.File=kafka-request.log 8 | log4j.appender.fileAppender.Append=False 9 | 10 | log4j.appender.fileAppender.layout=org.apache.log4j.EnhancedPatternLayout 11 | # http://logging.apache.org/log4j/1.2/apidocs/org/apache/log4j/EnhancedPatternLayout.html 12 | # Date, priority, class, message and linefeed. 13 | log4j.appender.fileAppender.layout.ConversionPattern= %d{HH:mm:ss,SSS} %-5p %c - %m%n 14 | 15 | # Turn on all our debugging info 16 | log4j.logger.com.cloudera.kafkaexamples=DEBUG,fileAppender,stdout 17 | -------------------------------------------------------------------------------- /kafka-examples/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=TRACE, fileAppender 2 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 4 | log4j.appender.stdout.layout.ConversionPattern=[%d] %p %m (%c)%n 5 | 6 | log4j.appender.fileAppender=org.apache.log4j.FileAppender 7 | log4j.appender.fileAppender.File=kafka-request.log 8 | log4j.appender.fileAppender.Append=False 9 | 10 | log4j.appender.fileAppender.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.fileAppender.layout.ConversionPattern= %-4r [%t] %-5p %c %x - %m%n 12 | 13 | 14 | # Turn on all our debugging info 15 | #log4j.logger.kafka=TRACE,fileAppender 16 | #log4j.logger.kafka.producer.async.DefaultEventHandler=DEBUG,stdout 17 | #log4j.logger.kafka.consumer.PartitionTopicInfo=TRACE,stdout 18 | #log4j.logger.kafka.request.logger=TRACE,fileAppender 19 | #log4j.additivity.kafka.request.logger=false 20 | #log4j.logger.kafka.network.Processor=TRACE,fileAppender 21 | #log4j.additivity.kafka.network.Processor=false 22 | #log4j.logger.org.I0Itec.zkclient.ZkClient=DEBUG -------------------------------------------------------------------------------- /kite-sdk/README.md: -------------------------------------------------------------------------------- 1 | # Kite SDK Examples 2 | 3 | Playing with Kite SDK: http://kitesdk.org 4 | 5 | # Kite CLI 6 | 7 | http://kitesdk.org/docs/0.18.0/Install-Kite.html 8 | 9 | See install-kite-cli.sh 10 | -------------------------------------------------------------------------------- /kite-sdk/install-kite-cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Download kite-dataset commandline executable 3 | SOME_DIR_ON_PATH=~/bin 4 | cd $SOME_DIR_ON_PATH 5 | curl http://central.maven.org/maven2/org/kitesdk/kite-tools/0.18.0/kite-tools-0.18.0-binary.jar -o $SOME_DIR_ON_PATH/kite-dataset 6 | chmod +x $SOME_DIR_ON_PATH/kite-dataset 7 | -------------------------------------------------------------------------------- /kite-sdk/simple-cli/README.md: -------------------------------------------------------------------------------- 1 | Using 
HBASE_HOME=/opt/cloudera/parcels/CDH/lib/hadoop/../hbase 2 | 3 | Needed to export HIVE_HOME=/opt/cloudera/parcels/CDH/lib/hadoop/../hive 4 | -------------------------------------------------------------------------------- /kite-sdk/simple-cli/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export HIVE_HOME=/opt/cloudera/parcels/CDH/lib/hadoop/../hive 3 | debug=true kite-dataset -v create sandwiches -s sandwich.avsc 4 | -------------------------------------------------------------------------------- /kite-sdk/simple-cli/sandwich.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type" : "record", 3 | "name" : "Sandwich", 4 | "doc" : "Schema generated by Kite", 5 | "fields" : [ { 6 | "name" : "name", 7 | "type" : [ "null", "string" ], 8 | "doc" : "Type inferred from 'Reuben'" 9 | }, { 10 | "name" : "description", 11 | "type" : [ "null", "string" ], 12 | "doc" : "Type inferred from 'Pastrami and sauerkraut on toasted rye with Russian dressing.'" 13 | } ] 14 | } -------------------------------------------------------------------------------- /kite-sdk/simple-cli/sandwiches.csv: -------------------------------------------------------------------------------- 1 | name,description 2 | Reuben,Pastrami and sauerkraut on toasted rye with Russian dressing. 3 | PBJ,Peanut butter and grape jelly on white bread. 4 | -------------------------------------------------------------------------------- /kudu/dataframes/kuduDF.scala: -------------------------------------------------------------------------------- 1 | val customersDF = spark.read.format("org.apache.kudu.spark.kudu"). 2 | option("kudu.master", "master-2:7051"). 3 | option("kudu.table", "customers"). 4 | load() 5 | 6 | customersDF.show(10) 7 | 8 | // Reverse the name 9 | val customersReversedNameDF = customersDF.withColumn("name", reverse(customersDF("name"))) 10 | 11 | customersReversedNameDF.write.format("org.apache.kudu.spark.kudu"). 12 | option("kudu.master", "master-2:7051"). 13 | option("kudu.table", "customers"). 14 | mode("append"). 15 | save() 16 | 17 | // Requery the data, m'kay? 18 | val customersDFAfterReverse = spark.read.format("org.apache.kudu.spark.kudu"). 19 | option("kudu.master", "master-2:7051"). 20 | option("kudu.table", "customers"). 21 | load() 22 | customersDFAfterReverse.show(10) 23 | -------------------------------------------------------------------------------- /kudu/range-partitioning/README.md: -------------------------------------------------------------------------------- 1 | Partitioning example 2 | 3 | Run the RUNME.sh script,then go to the Kudu tablet server, and 4 | there should be 27 tablets. 
Only 2 of the tablets should have data: 5 | 6 | * Tablet with - "a" as the bound 7 | * Tablet with "z" - as the bound 8 | 9 | Screenshot: https://www.evernote.com/l/AOKPPTreBGBGJowqPxYFX22VxNR0yGt1_QY 10 | -------------------------------------------------------------------------------- /kudu/range-partitioning/RUNME.sh: -------------------------------------------------------------------------------- 1 | hdfs dfs -mkdir -p /user/training/people 2 | hdfs dfs -rm /user/training/people/* 3 | hdfs dfs -put ./people.txt /user/training/people 4 | impala-shell -f ./create_people.sql 5 | -------------------------------------------------------------------------------- /kudu/range-partitioning/create_hashed_metrics.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS kudu_logs; 2 | DROP TABLE IF EXISTS logs; 3 | 4 | CREATE EXTERNAL TABLE logs 5 | (time BIGINT, 6 | metric STRING) 7 | ROW FORMAT DELIMITED 8 | FIELDS TERMINATED BY '\t' 9 | LOCATION '/user/training/logs'; 10 | 11 | CREATE TABLE kudu_logs 12 | DISTRIBUTE BY HASH(time) INTO 5 BUCKETS, RANGE(metric) 13 | SPLIT ROWS( 14 | ('1') 15 | ) 16 | TBLPROPERTIES( 17 | 'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler', 18 | 'kudu.table_name' = 'kudu_logs', 19 | 'kudu.master_addresses' = 'localhost:7051', 20 | 'kudu.key_columns' = 'time, metric') 21 | AS SELECT * FROM logs; 22 | -------------------------------------------------------------------------------- /kudu/range-partitioning/create_people.sql: -------------------------------------------------------------------------------- 1 | -- NOTE: First mkdir /user/training/people then 2 | -- hdfs dfs -put people.txt /user/training/people 3 | DROP TABLE IF EXISTS kudu_people; 4 | DROP TABLE IF EXISTS people; 5 | 6 | CREATE EXTERNAL TABLE people 7 | (name STRING) 8 | ROW FORMAT DELIMITED 9 | FIELDS TERMINATED BY '\t' 10 | LOCATION '/user/training/people'; 11 | 12 | CREATE TABLE kudu_people 13 | DISTRIBUTE BY RANGE(name) 14 | SPLIT ROWS( 15 | ("a"), 16 | ("b"), 17 | ("c"), 18 | ("d"), 19 | ("e"), 20 | ("f"), 21 | ("g"), 22 | ("h"), 23 | ("i"), 24 | ("j"), 25 | ("k"), 26 | ("l"), 27 | ("m"), 28 | ("n"), 29 | ("o"), 30 | ("p"), 31 | ("q"), 32 | ("r"), 33 | ("s"), 34 | ("t"), 35 | ("u"), 36 | ("v"), 37 | ("w"), 38 | ("x"), 39 | ("y"), 40 | ("z")) 41 | TBLPROPERTIES( 42 | 'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler', 43 | 'kudu.table_name' = 'kudu_people', 44 | 'kudu.master_addresses' = 'localhost:7051', 45 | 'kudu.key_columns' = 'name') 46 | AS SELECT * FROM people; 47 | -------------------------------------------------------------------------------- /kudu/range-partitioning/people.txt: -------------------------------------------------------------------------------- 1 | }this guy 2 | {curly brace dude} 3 | {{two curly brace dude} 4 | _underscore dude 5 | !someone 6 | [another person 7 | [[another person 8 | [[[another person 9 | -------------------------------------------------------------------------------- /mr/kill_job_from_mapper/README.md: -------------------------------------------------------------------------------- 1 | # Try Kill 2 | 3 | This MapReduce job shows how to kill a job from a Mapper / Reducer 4 | using the Context object. 5 | 6 | The downgrade() method was the toughest thing to find, and 7 | it helped to look through the source code for JobClient. 8 | 9 | # Setup 10 | 11 | Simply use ./run.sh to see this job kill itself from a Map. 
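For reference, here is a minimal sketch of the Context → JobClient → downgrade() path described above. It is **not** the shipped `solution.TryKill` code (see `./solution` for that); the class name and the map logic below are illustrative only.

```java
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative only -- not the actual solution.TryKill mapper.
public class KillingMapper extends Mapper<LongWritable, Text, Text, Text> {

  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    if (!value.toString().contains("100")) {
      // The new-API job ID from the Context has to be downgraded to the
      // old-API type before JobClient can look the job up and kill it.
      JobClient client = new JobClient(new JobConf(context.getConfiguration()));
      RunningJob running =
          client.getJob(org.apache.hadoop.mapred.JobID.downgrade(context.getJobID()));
      running.killJob();
      return;
    }
    context.write(new Text("ok"), value);
  }
}
```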
12 | 13 | The job uses NLineInputFormat to create 1 Mapper for each line of 14 | ./somedata.txt. The Maps all read their *one* line of massive data, 15 | and if they don't find a "100", then the Map calls "killJob". 16 | 17 | # Challenge 18 | 19 | It would be cool for the Mapper who kills the job to report itself to the 20 | master (Application Master or JobTracker). This would make finding the 21 | "offending" Map task much easier for admins by looking at the Job History log, 22 | instead of scouring through 100 Maps whose state is "FAILED". 23 | 24 | The "state" of the Maps in this job is either "SUCCEEDED" or "FAILED". 25 | It would seem that "KILLED" would be a more fitting state for the Maps that 26 | were running when the job was killed. Hmmmm. 27 | -------------------------------------------------------------------------------- /mr/kill_job_from_mapper/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | javac -cp `hadoop classpath` solution/*java 3 | -------------------------------------------------------------------------------- /mr/kill_job_from_mapper/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | hadoop fs -put -f somedata.txt 4 | rm -f solution/*.class 5 | rm -f TryKill.jar 6 | javac -cp `hadoop classpath` solution/*java 7 | jar cvf TryKill.jar solution/*.class 8 | hadoop jar TryKill.jar solution.TryKill -Dmapred.job.name="Job Kill From Mapper" somedata.txt 9 | -------------------------------------------------------------------------------- /mr/kill_job_from_mapper/somedata.txt: -------------------------------------------------------------------------------- 1 | 100 2 | 100 3 | 100 4 | 100 5 | 100 6 | 100 7 | 100 8 | CHAOS MONKEY 9 | 100 10 | 100 11 | 100 12 | -------------------------------------------------------------------------------- /mr/local_jobrunner/simple-example/.gitignore: -------------------------------------------------------------------------------- 1 | outputDir 2 | -------------------------------------------------------------------------------- /mr/local_jobrunner/simple-example/SimpleDriver.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.util.Tool; 2 | import org.apache.hadoop.util.ToolRunner; 3 | import org.apache.hadoop.conf.Configured; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 7 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 8 | import org.apache.hadoop.mapreduce.Job; 9 | 10 | public class SimpleDriver extends Configured implements Tool { 11 | public static void main(String[] args) throws Exception { 12 | int exitCode = ToolRunner.run(new Configuration(), new SimpleDriver(), args); 13 | } 14 | public int run (String [] args) throws Exception { 15 | if (args.length != 2) { 16 | System.out.printf( 17 | args.length + " - Usage: SimpleDriver \n"); 18 | System.exit(-1); 19 | } 20 | 21 | // Example of "new" way to instantiate Job 22 | Job job = Job.getInstance(getConf()); 23 | job.setJarByClass(SimpleDriver.class); 24 | job.setJobName("New Job constuctor example"); 25 | 26 | FileInputFormat.setInputPaths(job, new Path(args[0])); 27 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 28 | 29 | boolean success = job.waitForCompletion(true); 30 | return success ? 
0 : 1; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /mr/local_jobrunner/simple-example/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | JAR_FILE=SimpleDriver.jar 4 | rm -f *.class 5 | rm -f $JAR_FILE 6 | javac -Xlint:deprecation -cp `hadoop classpath` *java 7 | jar cvf $JAR_FILE *.class 8 | -------------------------------------------------------------------------------- /mr/local_jobrunner/simple-example/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | JAR_FILE=SimpleDriver.jar 4 | ./compile.sh || exit 1 5 | test -d outputDir && rm -rf outputDir 6 | 7 | hadoop jar $JAR_FILE SimpleDriver -fs=file:/// -jt=local -Dmapred.job.name="Simple THIS!" somedata.txt outputDir 8 | -------------------------------------------------------------------------------- /mr/local_jobrunner/simple-example/somedata.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 4A 6 | B 7 | C 8 | D 9 | 45 10 | 65 11 | 45 12 | 33 13 | 14 | -------------------------------------------------------------------------------- /mr/map_only_streaming/mapper.pl: -------------------------------------------------------------------------------- 1 | #!/bin/env perl 2 | while (<>) { 3 | $num++ 4 | } 5 | print "$num\tYep.\n"; 6 | -------------------------------------------------------------------------------- /mr/map_only_streaming/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | STREAMING_JAR=/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar 3 | MAPPER=mapper.pl 4 | OUTPUT_DIR=output/map_only_streaming 5 | INPUT=tpcds_data.dat 6 | 7 | hadoop fs -rm -R $OUTPUT_DIR 8 | 9 | hadoop jar $STREAMING_JAR \ 10 | -D mapred.map.tasks=20 \ 11 | -D mapred.reduce.tasks=0 \ 12 | -input $INPUT -output $OUTPUT_DIR \ 13 | -mapper $MAPPER -file $MAPPER 14 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | MR1a 4 | NO_M2ECLIPSE_SUPPORT: Project files created with the maven-eclipse-plugin are not supported in M2Eclipse. 5 | 6 | 7 | 8 | org.eclipse.jdt.core.javabuilder 9 | 10 | 11 | 12 | org.eclipse.jdt.core.javanature 13 | 14 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | #Mon Oct 22 09:22:55 CEST 2012 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 3 | eclipse.preferences.version=1 4 | org.eclipse.jdt.core.compiler.source=1.6 5 | org.eclipse.jdt.core.compiler.compliance=1.6 6 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/README.txt: -------------------------------------------------------------------------------- 1 | Project Template for CDH4.2 Maven based projects. 
2 | 3 | 4 | 5 | mvn clean 6 | mvn compile 7 | mvn test 8 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/TUTORIAL/Maven and CDH4.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/maven_project_template_CDH4/TUTORIAL/Maven and CDH4.odt -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/src/main/java/CDHTRAINING/App.java: -------------------------------------------------------------------------------- 1 | package CDHTRAINING; 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" ); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/src/main/java/SumReducer.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.Iterator; 3 | 4 | import org.apache.hadoop.io.IntWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapred.OutputCollector; 7 | import org.apache.hadoop.mapred.MapReduceBase; 8 | import org.apache.hadoop.mapred.Reducer; 9 | import org.apache.hadoop.mapred.Reporter; 10 | 11 | public class SumReducer extends MapReduceBase implements 12 | Reducer { 13 | 14 | @Override 15 | public void reduce(Text key, Iterator values, 16 | OutputCollector output, Reporter reporter) 17 | throws IOException { 18 | 19 | int wordCount = 0; 20 | while (values.hasNext()) { 21 | IntWritable value = values.next(); 22 | wordCount += value.get(); 23 | } 24 | output.collect(key, new IntWritable(wordCount)); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/src/main/java/WordMapper.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapred.MapReduceBase; 7 | import org.apache.hadoop.mapred.Mapper; 8 | import org.apache.hadoop.mapred.OutputCollector; 9 | import org.apache.hadoop.mapred.Reporter; 10 | 11 | public class WordMapper extends MapReduceBase implements 12 | Mapper { 13 | 14 | @Override 15 | public void map(LongWritable key, Text value, 16 | OutputCollector output, Reporter reporter) 17 | throws IOException { 18 | String s = value.toString(); 19 | for (String word : s.split("\\W+")) { 20 | if (word.length() > 0) { 21 | output.collect(new Text(word), new IntWritable(1)); 22 | } 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/src/test/java/CDHTRAINING/AppTest.java: -------------------------------------------------------------------------------- 1 | package CDHTRAINING; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/classes/CDHTRAINING/App.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/maven_project_template_CDH4/target/classes/CDHTRAINING/App.class -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/classes/SumReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/maven_project_template_CDH4/target/classes/SumReducer.class -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/classes/WordMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/maven_project_template_CDH4/target/classes/WordMapper.class -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/surefire-reports/CDHTRAINING.AppTest.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | Test set: CDHTRAINING.AppTest 3 | ------------------------------------------------------------------------------- 4 | Tests run: 1, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 0.012 sec 5 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/surefire-reports/TestWordCount.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | Test set: TestWordCount 3 | ------------------------------------------------------------------------------- 4 | Tests run: 3, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 1.584 sec 5 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/test-classes/CDHTRAINING/AppTest.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/maven_project_template_CDH4/target/test-classes/CDHTRAINING/AppTest.class -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/test-classes/TestWordCount.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/maven_project_template_CDH4/target/test-classes/TestWordCount.class -------------------------------------------------------------------------------- /mr/nlineinputformat/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | This example shows NLineInputFormat to take records from 4 | task_list.txt and simply print them using ./mapper.pl 5 | 6 | It shows the NLineInputFormat can be used to create X mappers 7 | per line -- in this example, there's 1 mapper for each line of the 8 | input file. 9 | 10 | -------------------------------------------------------------------------------- /mr/nlineinputformat/generate_task_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | if len(sys.argv) > 1: 5 | num_lines = int(sys.argv[1]) 6 | else: 7 | num_lines = 1000 8 | 9 | for i in range(0, num_lines): 10 | print i 11 | -------------------------------------------------------------------------------- /mr/nlineinputformat/mapper.pl: -------------------------------------------------------------------------------- 1 | #!/bin/env perl 2 | while (<>) { 3 | print $_; 4 | } 5 | -------------------------------------------------------------------------------- /mr/nlineinputformat/task_list.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | 0 12 | 0 13 | 1 14 | 1 15 | 1 16 | 2 17 | 3 18 | 4 19 | 5 20 | 6 21 | 7 22 | 8 23 | 9 24 | 0 25 | 1 26 | 2 27 | 2 28 | 3 29 | 4 30 | 5 31 | 6 32 | 7 33 | 8 34 | 9 35 | 0 36 | 1 37 | 2 38 | 2 39 | 3 40 | 4 41 | 5 42 | 6 43 | 7 44 | 8 45 | 9 46 | 0 47 | 1 48 | 2 49 | -------------------------------------------------------------------------------- /mr/rest_api/basic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # http://hadoop.apache.org/docs/r2.3.0/hadoop-yarn/hadoop-yarn-site/WebServicesIntro.html#URIs 3 | RESOURCE_MGR=$1 4 | test -z "$1" && { 5 | echo "Usage: $0 " 6 | exit 1 7 | } 8 | curl --compressed -H "Accept: application/json" -X GET "http://$RESOURCE_MGR:8088/ws/v1/cluster" 9 | -------------------------------------------------------------------------------- /mr/streaming_config_dumper/mapper.pl: -------------------------------------------------------------------------------- 1 | #!/bin/env perl 2 | # Just print this mapper's env vars 3 | while(<>) {} # <--- this is weird, the scripts won't finish (They'll crash with "Broken Pipe" errors unless you close STDIN explicitly 4 | # or use a while(<>) {} 5 | foreach $key(keys(%ENV)) { 6 | print $key, "\t", $ENV{$key}, "\n"; 7 | } 8 | print STDERR "I've fallen and can't get up! 
+ $ENV{map_input_file}\n"; 9 | -------------------------------------------------------------------------------- /mr/streaming_config_dumper/reducer.pl: -------------------------------------------------------------------------------- 1 | #!/bin/env perl 2 | # Get all key value pairs, and just concatenate them together 3 | use strict; 4 | use warnings; 5 | my $curr_key; 6 | my $prev_key; 7 | my $curr_val; 8 | my $val; 9 | 10 | while(<>) { 11 | ($curr_key, $curr_val) = split /\t/; 12 | if ($prev_key && ($curr_key ne $prev_key)) { 13 | print $prev_key, "\t", $val, "\n"; 14 | $prev_key = $curr_key; 15 | $val = $curr_val; 16 | } 17 | else { 18 | $prev_key = $curr_key; 19 | $val .= " $curr_val"; 20 | } 21 | } 22 | if ($curr_key) { 23 | print $curr_key, "\t", $val, "\n"; 24 | } 25 | 26 | -------------------------------------------------------------------------------- /mr/streaming_config_dumper/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Purpose of this is to collect the ENV stuff from all map tasks 3 | # shows a lot of the cool stuff that Hadoop gives to streaming tasks 4 | MAPPER=mapper.pl 5 | REDUCER=reducer.pl 6 | OUTPUT_DIR=output/nothing 7 | INPUT=something.txt 8 | 9 | hadoop fs -test -e /user/training/something.txt || hadoop fs -put something.txt 10 | hadoop fs -rm -R $OUTPUT_DIR 11 | 12 | hadoop jar /usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-*.jar \ 13 | -D com.example.something=hello \ 14 | -input $INPUT -output $OUTPUT_DIR \ 15 | -mapper $MAPPER -file $MAPPER \ 16 | -reducer $REDUCER -file $REDUCER 17 | -------------------------------------------------------------------------------- /mr/streaming_config_dumper/something.txt: -------------------------------------------------------------------------------- 1 | something 2 | wherefore 3 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | total.order.partitioner 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | #Thu Apr 19 09:46:53 CDT 2012 2 | eclipse.preferences.version=1 3 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 6 | org.eclipse.jdt.core.compiler.compliance=1.6 7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 12 | org.eclipse.jdt.core.compiler.source=1.6 13 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/README.txt: -------------------------------------------------------------------------------- 1 | This is an example implementation for the TotalOrderPartitioner: 2 | 3 | Run it with our weblog test data: 4 | 5 | hadoop jar tot-ord-part.jar solution.ProcessLogs weblog RESULTxyz 6 | 
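The driver wiring lives in solution.ProcessLogs (only the compiled classes are kept
under bin/), so as a reminder, a typical TotalOrderPartitioner setup looks roughly like
the sketch below. The input format, sampler settings and partition-file path here are
assumptions, not necessarily what ProcessLogs actually does:

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
    import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

    // Sketch only -- see solution.ProcessLogs for the real driver.
    public class TotalOrderSketch {
      public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJarByClass(TotalOrderSketch.class);
        job.setJobName("Total order sketch");

        // Keys pass through the map unchanged (cf. solution.mr.IdentityMapper),
        // so cut points sampled from the input apply to the map output as well.
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setNumReduceTasks(3);

        // Sample the input, write the cut points, and partition on them so that
        // every key sent to reducer N sorts before every key sent to reducer N+1.
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("_partitions"));
        InputSampler.writePartitionFile(job, new InputSampler.RandomSampler<Text, Text>(0.1, 1000, 10));
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // ... set the mapper/reducer classes (this repo ships IdentityMapper,
        // LogMonthMapper, CountReducer and SumReducer), then:
        System.exit(job.waitForCompletion(true) ? 0 : 1);
      }
    }
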
-------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/ProcessLogs.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/ProcessLogs.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/domain/MapperFunction.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/domain/MapperFunction.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/mr/CountReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/mr/CountReducer.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/mr/IdentityMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/mr/IdentityMapper.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/mr/LogMonthMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/mr/LogMonthMapper.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/mr/SumReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/mr/SumReducer.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/mr/WordMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/mr/WordMapper.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/src/solution/domain/MapperFunction.java: -------------------------------------------------------------------------------- 1 | package solution.domain; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | 6 | import org.apache.hadoop.io.Text; 7 | 8 | /** 9 | * This is a helper class, which encapsulates the logic of our mapper in a 10 | * "non hadoop" class which can be tested even without MRUnit. 
11 | * 12 | * @author training 13 | * 14 | */ 15 | public class MapperFunction { 16 | 17 | public static List months = Arrays.asList("Jan", "Feb", "Mar", 18 | "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"); 19 | 20 | static String[] kv = new String[2]; 21 | 22 | /** 23 | * Example input line: 96.7.4.14 - - [24/Apr/2011:04:20:11 -0400] 24 | * "GET /cat.jpg HTTP/1.1" 200 12433 25 | * 26 | */ 27 | public static String[] getKVPair(String value) { 28 | 29 | kv[0] = null; 30 | kv[1] = null; 31 | 32 | /* 33 | * Split the input line into space-delimited fields. 34 | */ 35 | String[] fields = value.split(" "); 36 | 37 | if (fields.length > 3) { 38 | 39 | /* 40 | * Save the first field in the line as the IP address. 41 | */ 42 | // String ip = fields[0]; 43 | kv[0] = fields[0]; 44 | 45 | /* 46 | * The fourth field contains [dd/Mmm/yyyy:hh:mm:ss]. Split the 47 | * fourth field into "/" delimited fields. The second of these 48 | * contains the month. 49 | */ 50 | String[] dtFields = fields[3].split("/"); 51 | 52 | if (dtFields.length > 1) { 53 | 54 | String theMonth = dtFields[1]; 55 | 56 | /* check if it's a valid month, if so, write it out */ 57 | if (months.contains(theMonth)) { 58 | kv[1] = theMonth; 59 | } 60 | } 61 | } 62 | 63 | return kv; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/src/solution/mr/CountReducer.java: -------------------------------------------------------------------------------- 1 | package solution.mr; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | /* Counts the number of values associated with a key */ 10 | 11 | public class CountReducer extends Reducer { 12 | 13 | @Override 14 | public void reduce(Text key, Iterable values, Context context) 15 | throws IOException, InterruptedException { 16 | 17 | /* 18 | * Iterate over the values iterable and count the number 19 | * of values in it. Emit the key (unchanged) and an IntWritable 20 | * containing the number of values. 21 | */ 22 | 23 | int count = 0; 24 | 25 | /* 26 | * Use for loop to count items in the iterator. 27 | */ 28 | 29 | /* Ignore warnings that we 30 | * don't use the value -- in this case, we only need to count the 31 | * values, not use them. 
32 | */ 33 | for (@SuppressWarnings("unused") 34 | Text value : values) { 35 | 36 | /* 37 | * for each item in the list, increment the count 38 | */ 39 | count++; 40 | } 41 | 42 | context.write(key, new IntWritable(count)); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/src/solution/mr/IdentityMapper.java: -------------------------------------------------------------------------------- 1 | package solution.mr; 2 | 3 | import java.io.IOException; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | 11 | import solution.domain.MapperFunction; 12 | 13 | 14 | public class IdentityMapper extends Mapper { 15 | 16 | 17 | /** 18 | * Example input line: 19 | * 96.7.4.14 - - [24/Apr/2011:04:20:11 -0400] "GET /cat.jpg HTTP/1.1" 200 12433 20 | * 21 | */ 22 | @Override 23 | public void map(Text key, Text value, Context context) 24 | throws IOException, InterruptedException { 25 | 26 | context.write( key, value); 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/src/solution/mr/LogMonthMapper.java: -------------------------------------------------------------------------------- 1 | package solution.mr; 2 | 3 | import java.io.IOException; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | 11 | import solution.domain.MapperFunction; 12 | 13 | 14 | public class LogMonthMapper extends Mapper { 15 | 16 | Text k = new Text(); 17 | Text v = new Text(); 18 | 19 | /** 20 | * Example input line: 21 | * 96.7.4.14 - - [24/Apr/2011:04:20:11 -0400] "GET /cat.jpg HTTP/1.1" 200 12433 22 | * 23 | */ 24 | @Override 25 | public void map(LongWritable key, Text value, Context context) 26 | throws IOException, InterruptedException { 27 | 28 | String[] kv = MapperFunction.getKVPair(value.toString()); 29 | 30 | 31 | if ( kv[1] != null ) { 32 | k.set( kv[1]+"."+kv[0] ); 33 | v.set( kv[1] ); 34 | context.write( k, v); 35 | } 36 | 37 | } 38 | 39 | 40 | 41 | } 42 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/src/solution/mr/SumReducer.java: -------------------------------------------------------------------------------- 1 | package solution.mr; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | /* 10 | * To define a reduce function for your MapReduce job, subclass 11 | * the Reducer class and override the reduce method. 12 | * The class definition requires four parameters: 13 | * The data type of the input key (which is the output key type 14 | * from the mapper) 15 | * The data type of the input value (which is the output value 16 | * type from the mapper) 17 | * The data type of the output key 18 | * The data type of the output value 19 | */ 20 | public class SumReducer extends Reducer { 21 | 22 | /* 23 | * The reduce method runs once for each key received from 24 | * the shuffle and sort phase of the MapReduce framework. 25 | * The method receives a key of type Text, a set of values of type 26 | * IntWritable, and a Context object. 
27 | */ 28 | @Override 29 | public void reduce(Text key, Iterable values, Context context) 30 | throws IOException, InterruptedException { 31 | int wordCount = 0; 32 | 33 | /* 34 | * For each value in the set of values passed to us by the mapper: 35 | */ 36 | for (IntWritable value : values) { 37 | 38 | /* 39 | * Add the value to the word count counter for this key. 40 | */ 41 | wordCount += value.get(); 42 | } 43 | 44 | /* 45 | * Call the write method on the Context object to emit a key 46 | * and a value from the reduce method. 47 | */ 48 | context.write(key, new IntWritable(wordCount)); 49 | } 50 | } -------------------------------------------------------------------------------- /mr/total_order_partitioner/tot-ord-part.jardesc: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /mr/yarn_containers/README.md: -------------------------------------------------------------------------------- 1 | # Container Memory Allocation 2 | 3 | These are the properties that I had to change/verify to make sure 4 | that NodeManagers could allocate, for example 8 containers for 5 | 512 MB maps, and 4 containers for 1024 MB maps 6 | 7 | ```xml 8 | 9 | yarn.nodemanager.resource.memory-mb 10 | 4096 11 | yarn-site.xml 12 | 13 | 14 | 15 | yarn.scheduler.minimum-allocation-mb 16 | 512 17 | yarn-site.xml 18 | 19 | 20 | 21 | yarn.nodemanager.resource.cpu-vcores 22 | 8 23 | yarn-site.xml 24 | 25 | ``` 26 | 27 | Also, you can set 28 | 29 | ``mapreduce.map.memory.mb`` or ``mapreduce.map.reduce.mb`` on a per-job basis like this: 30 | 31 | hadoop jar ./SleepJob.jar SleepJob -Dmapreduce.map.memory.mb=1024 -m 100 -r 10 -mt 240000 32 | 33 | Or, set the defaults in the mapred-site.xml. 34 | 35 | And, of course, if you really want the JVMs to actually use the memory or not 36 | use it, you must specify: 37 | 38 | mapreduce.map.java.opts # (Default is 200MB!!) 39 | mapreduce.reduce.java.opts # (Default is 200MB!!) 40 | 41 | -------------------------------------------------------------------------------- /mr/yarn_containers/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm SleepJob.class 3 | javac -cp `hadoop classpath` ./SleepJob.java 4 | jar cvf SleepJob.jar *.class 5 | # This will spawn 4 containers per Node Manager if each NM has 6 | # 4 GB of yarn.nodemanager.resource.memory-mb 7 | # (if it's the only job running of course) 8 | hadoop jar ./SleepJob.jar SleepJob -Dmapreduce.map.memory.mb=1024 -m 100 -r 10 -mt 240000 9 | 10 | # This will spawn 8 containers per Node Manager if each NM has 11 | # 4 GB of yarn.nodemanager.resource.memory-mb 12 | # (if it's the only job running of course) 13 | hadoop jar ./SleepJob.jar SleepJob -Dmapreduce.map.memory.mb=512 -m 100 -r 10 -mt 240000 14 | 15 | -------------------------------------------------------------------------------- /mr/yarn_containers/test_container_boundaries/README.md: -------------------------------------------------------------------------------- 1 | # Container Memory Management 2 | 3 | See ./run.sh and SleepJobWithArray.java for how YARN kills 4 | tasks which request more memory than their containers have. The SleepJobWithArray simply 5 | tries to instantiate an array of ints that is greater than the container's memory size. 
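Roughly, the interesting part of that job looks like the sketch below. This is a hedged
reconstruction, not the real SleepJobWithArray.java (which lives in this directory); only
the `initBigArray` / `bigArraySize` flags passed by ./run.sh are taken as given, and the
surrounding class is illustrative.

```java
import org.apache.hadoop.conf.Configuration;

// Illustrative sketch -- see SleepJobWithArray.java for the real thing.
public class BigArraySketch {
  private static int[] bigArray;

  public static void maybeAllocate(Configuration conf) {
    if (conf.getBoolean("initBigArray", false)) {
      // 256,000,000 ints is roughly 1 GB of zero-filled heap, far beyond a
      // 256 MB container, so the NodeManager's physical-memory check kills
      // the task; with bigArraySize=1,000,000 (about 4 MB) it survives.
      bigArray = new int[conf.getInt("bigArraySize", 0)];
    }
  }
}
```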
6 | 7 | The weird thing is that I can't get YARN to kill the Java process simply because it 8 | tries to start with an Xmx (Or Xms) that's greater than the container's memory size. 9 | 10 | The java process actually has to have the code that requests > memory than the YARN container 11 | has. Notice that in SleepJobWithArray, it requests an array of 512 MB, which is under 12 | the Java Heap Size that's requested Xmx=1024m. 13 | 14 | -------------------------------------------------------------------------------- /mr/yarn_containers/test_container_boundaries/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Test how many times a naughty process is retried when it violates the container's memory size 3 | set -e 4 | rm -f *.class 5 | javac -cp `hadoop classpath` ./SleepJobWithArray.java 6 | jar cvf SleepJobWithArray.jar *.class 7 | 8 | # Max container size is 256 MB, but Java heap is 1024. 9 | # even *that* won't kill the job. Java code actually 10 | # must request the memory. 11 | 12 | # This won't be killed 13 | hadoop jar ./SleepJobWithArray.jar SleepJobWithArray \ 14 | -Dmapreduce.job.name="Sleep without init array" \ 15 | -Dmapreduce.map.memory.mb=256 \ 16 | -Dmapreduce.map.java.opts=-Xms1024m \ 17 | -DinitBigArray=false \ 18 | -m 1 -r 1 -mt 1000 19 | 20 | # This won't crash either 21 | hadoop jar ./SleepJobWithArray.jar SleepJobWithArray \ 22 | -Dmapreduce.job.name="Sleep, but init smallish array" \ 23 | -Dmapreduce.map.memory.mb=256 \ 24 | -Dmapreduce.map.java.opts=-Xms1024m \ 25 | -DinitBigArray=true \ 26 | -DbigArraySize=1000000 \ 27 | -m 1 -r 1 -mt 1000 28 | 29 | # This will crash after overstepping 256M container limit? 30 | hadoop jar ./SleepJobWithArray.jar SleepJobWithArray \ 31 | -Dmapreduce.job.name="Sleep, but init big array" \ 32 | -Dmapreduce.map.memory.mb=256 \ 33 | -Dmapreduce.map.java.opts=-Xms1024m \ 34 | -DinitBigArray=true \ 35 | -DbigArraySize=256000000 \ 36 | -m 1 -r 1 -mt 1000 37 | 38 | # This will NOT crash because we bumped the container size 39 | hadoop jar ./SleepJobWithArray.jar SleepJobWithArray \ 40 | -Dmapreduce.job.name="Sleep, but init big array" \ 41 | -Dmapreduce.map.memory.mb=512 \ 42 | -Dmapreduce.map.java.opts=-Xms1024m \ 43 | -DinitBigArray=true \ 44 | -DbigArraySize=256000000 \ 45 | -m 1 -r 1 -mt 1000 46 | -------------------------------------------------------------------------------- /pig/configuration/README.md: -------------------------------------------------------------------------------- 1 | # Configuration playground 2 | 3 | # Log verbosity 4 | 5 | The main examples deal with supressing log messages from Pig 6 | 7 | Copy the following files to /etc/pig/conf: 8 | 9 | log4j.local is an attempt to silence anything in localmode that's not ERROR 10 | -------------------------------------------------------------------------------- /pig/configuration/fixpig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to deploy log4j.properties 3 | sudo cp /etc/pig/conf/log4j.properties /etc/pig/conf/log4j.properties.bak 4 | sudo cp log4j.local /etc/pig/conf/log4j.properties 5 | -------------------------------------------------------------------------------- /pig/configuration/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. 
See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # ***** Set root logger level to DEBUG and its only appender to A. 17 | log4j.logger.org.apache.pig=ERROR, A 18 | log4j.logger.org.apache.hadoop.conf.Configuration=ERROR, A 19 | 20 | # ***** A is set to be a ConsoleAppender. 21 | log4j.appender.A=org.apache.log4j.ConsoleAppender 22 | # ***** A uses PatternLayout. 23 | log4j.appender.A.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 25 | 26 | 27 | -------------------------------------------------------------------------------- /pig/configuration/pig0.12/README.md: -------------------------------------------------------------------------------- 1 | # Pig 0.12 log4j -- suppress INFO and lower logs 2 | 3 | Had to change log level for a couple more packages 4 | See conf/log4j.properties 5 | 6 | 7 | Also, not a promis the this is the best way, but... 8 | -------------------------------------------------------------------------------- /pig/configuration/pig0.12/conf/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # ***** Set root logger level to DEBUG and its only appender to A. 17 | log4j.logger.mapred=ERROR, A 18 | log4j.logger.org.apache.hadoop.metrics.jvm=ERROR, A 19 | log4j.logger.org.apache.hadoop.mapreduce=ERROR, A 20 | log4j.logger.org.apache.hadoop.mapred=ERROR, A 21 | log4j.logger.org.apache.pig=ERROR, A 22 | log4j.logger.org.apache.hadoop.conf.Configuration=ERROR, A 23 | 24 | # ***** A is set to be a ConsoleAppender. 25 | log4j.appender.A=org.apache.log4j.ConsoleAppender 26 | # ***** A uses PatternLayout. 
27 | log4j.appender.A.layout=org.apache.log4j.PatternLayout 28 | log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 29 | 30 | 31 | -------------------------------------------------------------------------------- /pig/configuration/pig0.12/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Show log4j properties needed for Pig 0.12 CDH *quiet* mode 3 | pig -4 ./conf/log4j.properties -x local sales.pig 4 | -------------------------------------------------------------------------------- /pig/configuration/pig0.12/sales.pig: -------------------------------------------------------------------------------- 1 | sales = LOAD 'sales.txt' AS (salesperson_id, amount); 2 | sales_tuples = FOREACH sales GENERATE (salesperson_id, amount); -- This generates a bag with a single tuple field 3 | 4 | -- This is what you want (No parens) 5 | sales_bag = FOREACH sales GENERATE salesperson_id, amount; 6 | sh echo "This is sales BAG"; 7 | DUMP sales_bag; 8 | sh echo "This is sales TUPLE"; 9 | DUMP sales_tuples; 10 | -------------------------------------------------------------------------------- /pig/configuration/pig0.12/sales.txt: -------------------------------------------------------------------------------- 1 | bob 100 2 | steve 200 3 | -------------------------------------------------------------------------------- /pig/explain-split-vs-filter/explain-using-dot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo yum install graphviz 3 | pig -x local -e 'explain -script ./using-filter.pig' -dot -out using-filter.dot 4 | pig -x local -e 'explain -script ./using-split.pig' -dot -out using-split.dot 5 | for fil in *dot; do 6 | # output all graphs in the *dot files (there's three per .dot file) 7 | # automagically generate filenames ($fil.1.pdf, $fil.2.pdf, etc) 8 | # enjoy 9 | dot -Tpdf $fil -O 10 | done 11 | -------------------------------------------------------------------------------- /pig/explain-split-vs-filter/using-filter.pig: -------------------------------------------------------------------------------- 1 | wlogs = load 'webcrawl.txt' as (pageid, url, timestamp); 2 | apr03 = filter wlogs by timestamp < '20110404'; 3 | apr02 = filter wlogs by timestamp < '20110403' and timestamp > '20110401'; 4 | apr01 = filter wlogs by timestamp < '20110402' and timestamp > '20110331'; 5 | store apr03 into 'filter/20110403'; 6 | store apr02 into 'filter/20110402'; 7 | store apr01 into 'filter/20110401'; 8 | -------------------------------------------------------------------------------- /pig/explain-split-vs-filter/using-split.pig: -------------------------------------------------------------------------------- 1 | wlogs = load 'webcrawl.txt' as (pageid, url, timestamp); 2 | split wlogs into apr03 if timestamp < '20110404', 3 | apr02 if timestamp < '20110403' and timestamp > '20110401', 4 | apr01 if timestamp < '20110402' and timestamp > '20110331'; 5 | 6 | store apr03 into 'split/20110403'; 7 | store apr02 into 'split/20110402'; 8 | store apr01 into 'split/20110401'; 9 | -------------------------------------------------------------------------------- /pig/explain-split-vs-filter/webcrawl.txt: -------------------------------------------------------------------------------- 1 | http://pig.apache.org 1 
{(http://pig.apache.org/index.html),(http://pig.apache.org/releases.html),(http://pig.apache.org/about.html),(http://pig.apache.org/mailing_lists.html),(http://pig.apache.org/whoweare.html),(http://pig.apache.org/bylaws.html),(http://pig.apache.org/privacypolicy.html),(http://pig.apache.org/issue_tracking.html),(http://pig.apache.org/version_control.html),(http://pig.apache.org/philosophy.html)} 2 | http://pig.apache.org/index.html 1 {} 3 | -------------------------------------------------------------------------------- /pig/generate/conf/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # ***** Set root logger level to DEBUG and its only appender to A. 17 | log4j.logger.mapred=ERROR, A 18 | log4j.logger.org.apache.hadoop.metrics.jvm=ERROR, A 19 | log4j.logger.org.apache.hadoop.mapreduce=ERROR, A 20 | log4j.logger.org.apache.hadoop.mapred=ERROR, A 21 | log4j.logger.org.apache.pig=ERROR, A 22 | log4j.logger.org.apache.hadoop.conf.Configuration=ERROR, A 23 | 24 | # ***** A is set to be a ConsoleAppender. 25 | log4j.appender.A=org.apache.log4j.ConsoleAppender 26 | # ***** A uses PatternLayout. 
27 | log4j.appender.A.layout=org.apache.log4j.PatternLayout 28 | log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 29 | 30 | 31 | -------------------------------------------------------------------------------- /pig/generate/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Show log4j properties needed for Pig 0.12 CDH *quiet* mode 3 | pig -4 ./conf/log4j.properties -x local sales.pig 4 | -------------------------------------------------------------------------------- /pig/generate/sales.pig: -------------------------------------------------------------------------------- 1 | sales = LOAD 'sales.txt' AS (salesperson_id, amount); 2 | sales_tuples = FOREACH sales GENERATE (salesperson_id, amount); -- This generates a bag with a single tuple field 3 | 4 | -- This is what you want (No parens) 5 | sales_bag = FOREACH sales GENERATE salesperson_id, amount; 6 | sh echo "This is sales BAG"; 7 | DUMP sales_bag; 8 | sh echo "This is sales TUPLE"; 9 | DUMP sales_tuples; 10 | -------------------------------------------------------------------------------- /pig/generate/sales.txt: -------------------------------------------------------------------------------- 1 | bob 100 2 | steve 200 3 | -------------------------------------------------------------------------------- /pig/hcatalog/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # sudo yum install hcatalog 3 | pig -useHCatalog store_sales.pig 4 | -------------------------------------------------------------------------------- /pig/hcatalog/sample_store_sales.pig: -------------------------------------------------------------------------------- 1 | salez = LOAD 'tpcds.parquet_store_sales' USING org.apache.hcatalog.pig.HCatLoader(); 2 | sampld = SAMPLE salez 0.1; 3 | STORE sampld INTO 'zanky'; 4 | -------------------------------------------------------------------------------- /pig/hcatalog/store_sales.pig: -------------------------------------------------------------------------------- 1 | salez = LOAD 'tpcds.store_sales' USING org.apache.hcatalog.pig.HCatLoader(); 2 | DESCRIBE salez; 3 | -------------------------------------------------------------------------------- /pig/local-mode-hacks/README.md: -------------------------------------------------------------------------------- 1 | # Local Mode Hacks 2 | 3 | Wanted to find out if Pig can be called: 4 | 5 | - Both local execution and local filesystem is easy `$pig -x local` 6 | - Local execution but HDFS filesystem. `$ pig -jt local` seems to work, even on YARN 7 | 8 | Yay! 
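As a quick reference, a minimal sketch of the two invocations described above (`somescript.pig` is just a placeholder name):

    # Local execution, local filesystem
    pig -x local somescript.pig

    # Local execution, but paths in the script resolve against HDFS
    pig -jt local somescript.pig

See `run.sh` in this directory for a working example of the second form.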
9 | -------------------------------------------------------------------------------- /pig/local-mode-hacks/read_some_hdfs_data.pig: -------------------------------------------------------------------------------- 1 | hdfs_data = LOAD 'THISISINHDFS.txt'; 2 | grpd = GROUP hdfs_data ALL; 3 | counted = FOREACH grpd GENERATE COUNT(hdfs_data); 4 | DUMP counted; 5 | 6 | -------------------------------------------------------------------------------- /pig/local-mode-hacks/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Put some data into HDFS and run Pig locally, but refer to data in HDFS 3 | hadoop fs -put -f somedata.txt THISISINHDFS.txt 4 | pig -jt local read_some_hdfs_data.pig 5 | -------------------------------------------------------------------------------- /pig/local-mode-hacks/somedata.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | -------------------------------------------------------------------------------- /pig/round/data.txt: -------------------------------------------------------------------------------- 1 | bob 1.456 2 | bob 1.456 3 | bob 1.456 4 | bob 1.456 5 | bob 1.456 6 | bob 1.456 7 | bob 1.456 8 | steve 123.456 9 | supersteve 123.456 10 | supersteve 123.756 11 | supersteve 123.756 12 | supersteve 123.856 13 | supersteve 123.856 14 | supersteve 123.156 15 | fluffy 123.456 16 | ted 123.700 17 | harsh 123.456 18 | ian 123.456 19 | peabody 123.456 20 | squirko 123.456 21 | mirko 123.456 22 | kai 123.456 23 | sarah 123.456 24 | ted 123.700 25 | -------------------------------------------------------------------------------- /pig/round/results.txt: -------------------------------------------------------------------------------- 1 | bob 1.5 2 | bob 1.5 3 | bob 1.5 4 | bob 1.5 5 | bob 1.5 6 | bob 1.5 7 | bob 1.5 8 | steve 123.5 9 | supersteve 123.5 10 | supersteve 123.8 11 | supersteve 123.8 12 | supersteve 123.9 13 | supersteve 123.9 14 | supersteve 123.2 15 | fluffy 123.5 16 | ted 123.7 17 | harsh 123.5 18 | ian 123.5 19 | peabody 123.5 20 | squirko 123.5 21 | mirko 123.5 22 | kai 123.5 23 | sarah 123.5 24 | ted 123.7 25 | -------------------------------------------------------------------------------- /pig/round/round_this.pig: -------------------------------------------------------------------------------- 1 | data = LOAD 'data.txt' AS (name:chararray, amount:float); 2 | round = FOREACH data GENERATE name, (float)(ROUND(amount*10))/10 AS data; 3 | STORE round INTO 'round'; 4 | -------------------------------------------------------------------------------- /pig/sampling/sample_tpcds.pig: -------------------------------------------------------------------------------- 1 | sales = LOAD 'tpcds/store_sales'; 2 | sampl = SAMPLE sales 0.01; 3 | STORE sampl INTO 'tpcds_sample/store_sales'; 4 | -------------------------------------------------------------------------------- /spark/data-generator/hash-data-generator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | curl https://raw.githubusercontent.com/eneko/data-repository/master/data/words.txt > words.txt 3 | sudo -u hdfs hdfs dfs -mkdir /user/ec2-user 4 | sudo -u hdfs hdfs dfs -chown ec2-user /user/ec2-user 5 | hdfs dfs -put words.txt /user/ec2-user/words 6 | */ 7 | import java.security.MessageDigest 8 | 9 | val words = sc.textFile("words", 20) 10 | val moreWords = words.flatMap(word => List(word, word.toUpperCase(), 11 
| word.toLowerCase(), 12 | word.reverse.toUpperCase(), 13 | word + "!", 14 | word + "?", 15 | word.reverse.toLowerCase())) 16 | 17 | val md5s = moreWords.mapPartitions{iterator => 18 | val md5 = MessageDigest.getInstance("MD5") 19 | val sha1 = MessageDigest.getInstance("SHA-1") 20 | val sha256 = MessageDigest.getInstance("SHA-256") 21 | iterator.map(word => List(word, 22 | md5.digest(word.getBytes).map("%02x".format(_)).mkString, 23 | sha1.digest(word.getBytes).map("%02x".format(_)).mkString, 24 | sha256.digest(word.getBytes).map("%02x".format(_)).mkString).mkString("\t")) 25 | } 26 | 27 | md5s.saveAsTextFile("hash-data") 28 | -------------------------------------------------------------------------------- /spark/data-parsing/data-parsing-using-try.scala: -------------------------------------------------------------------------------- 1 | import scala.util.{Try, Failure, Success} 2 | // Define test data 3 | // With thanks to http://rcardin.github.io/big-data/apache-spark/scala/programming/2016/09/25/try-again-apache-spark.html 4 | val orig_data = Array( 5 | "1", 6 | "2", 7 | "trash", 8 | "4" 9 | ) 10 | 11 | val data = sc.parallelize(orig_data) 12 | val weblogs = data.map(line => Try(line.toInt)) 13 | println(weblogs.getClass) 14 | val good = weblogs.filter(d => d.isSuccess) 15 | weblogs.collect() 16 | good.collect() 17 | -------------------------------------------------------------------------------- /spark/data-parsing/data-parsing.scala: -------------------------------------------------------------------------------- 1 | // Define test data 2 | val orig_data = Array( 3 | "1.2.3.4 - 12345 \"[1/1/2017 12:00:00]\" \"/some.jpg GET\" 200 9999", 4 | "1.2.3.4 - 12345 \"[1/1/2017 12:00:02]\" \"/home.html GET\" 200 9997", 5 | "trash", 6 | "1.2.3.5 - aaaaa" 7 | ) 8 | 9 | // case class to mimic data 10 | case class Weblog(ip:String, userid:String, req_ts:String) 11 | 12 | // Define regex to parse test data into case class (only ip, userid and req_ts 13 | // for now) 14 | val regex = """(.*) - (\d+) \"\[(.+)\]\".*""".r 15 | 16 | // Fancy print/debug function 17 | def printWeblog(weblog:Weblog) = 18 | println(s"""Data: $weblog 19 | Class: ${weblog.getClass} 20 | IP:${weblog.ip} 21 | Userid:${weblog.userid} 22 | Request Timestamp:${weblog.req_ts} 23 | ----------------""") 24 | 25 | // Go! 26 | val data = sc.parallelize(orig_data) 27 | val weblogs = data.map{ 28 | case regex(ip, userid, req_ts) => 29 | Weblog(ip, userid, req_ts) 30 | case line => 31 | Console.err.println(s"Unexpected line: $line") 32 | Weblog("error", line, "") 33 | } 34 | 35 | val errors = weblogs.filter(wl => wl.ip == "error") 36 | val not_errors = weblogs.filter(wl => wl.ip != "error") 37 | 38 | println("These are errors") 39 | for (r <- errors.collect()) { 40 | printWeblog(r) 41 | } 42 | 43 | println("These are NOT errors") 44 | for (r <- not_errors.collect()) { 45 | printWeblog(r) 46 | } 47 | 48 | -------------------------------------------------------------------------------- /spark/dataframes/README.org: -------------------------------------------------------------------------------- 1 | * Installation 2 | 3 | Most of these examples use data from the "hadoop-examples-data" directory in this 4 | repository. 
I recommend to run: 5 | 6 | $ hdfs dfs -put hadoop-examples-data 7 | 8 | * Column Renaming 9 | 10 | - Example in column_rename_after_joins.py 11 | - https://sparkbyexamples.com/spark/rename-a-column-on-spark-dataframes/ 12 | - https://stackoverflow.com/questions/50287558/how-to-rename-duplicated-columns-after-join 13 | - https://stackoverflow.com/questions/33778664/spark-dataframe-distinguish-columns-with-duplicated-name#33779190 14 | - References this KB article: https://kb.databricks.com/data/join-two-dataframes-duplicated-columns.html 15 | -------------------------------------------------------------------------------- /spark/dataframes/alias.scala: -------------------------------------------------------------------------------- 1 | // Note: Put data/people.json into the HDFS home directory 2 | val df = spark.read.json("people.json") 3 | 4 | df.select($"firstName" as "fn_as", 5 | $"firstName" alias "fname_alias", 6 | $"firstName" name "First_name").show() 7 | 8 | -------------------------------------------------------------------------------- /spark/dataframes/analyzingExerciseUsingSparkSQL.py: -------------------------------------------------------------------------------- 1 | # First, create / massage all possible data sources 2 | # ----- Accounts 3 | # Create a DataFrame based on the Hive accounts table 4 | accountsDF = spark.read.table("devsh.accounts")\ 5 | .select("acct_num", "acct_close_dt")\ 6 | .withColumnRenamed("acct_num", "account_id") 7 | 8 | accountsDF.createOrReplaceTempView("accounts") 9 | 10 | # ----- Account Devices 11 | # Load accountdevice data to HDFS in another terminal window 12 | # $ hdfs dfs -put $DEVDATA/accountdevice/ /devsh_loudacre/ 13 | accountDeviceDF = spark.read.option("header","true").\ 14 | option("inferSchema","true")\ 15 | .csv("/devsh_loudacre/accountdevice") 16 | 17 | accountDeviceDF.createOrReplaceTempView("account_devices") 18 | 19 | # ----- Devices 20 | devicesDF = spark.read.json("/devsh_loudacre/devices.json").withColumnRenamed("devnum", "device_id") 21 | devicesDF.createOrReplaceTempView("devices") 22 | 23 | # Spark SQL!!!!! 
24 | 25 | sql = """ 26 | SELECT d.device_id, d.make, d.model, COUNT(*) AS number_of_devices 27 | FROM accounts a 28 | JOIN account_devices ad ON a.account_id = ad.account_id 29 | JOIN devices d ON d.device_id = ad.device_id 30 | WHERE a.acct_close_dt IS NULL 31 | GROUP BY d.device_id, d.make, d.model 32 | ORDER BY number_of_devices DESC 33 | """ 34 | 35 | activeDeviceCountsDF = spark.sql(sql) 36 | activeDeviceCountsDF.write\ 37 | .mode("overwrite")\ 38 | .option("path","/devsh_loudacre/active_device_counts_using_spark_sql")\ 39 | .saveAsTable("devsh.active_device_counts_using_spark_sql") 40 | -------------------------------------------------------------------------------- /spark/dataframes/antiJoin.py: -------------------------------------------------------------------------------- 1 | leftFile = "hadoop-examples-data/left.csv" 2 | rightFile = "hadoop-examples-data/right.csv" 3 | opts = { 4 | "inferSchema" : True, 5 | "header" : True } 6 | 7 | leftDF = spark.read.options(**opts).csv(leftFile) 8 | print("This is leftDF:") 9 | leftDF.show() 10 | 11 | rightDF = spark.read.options(**opts).csv(rightFile) 12 | print("This is rightDF:") 13 | rightDF.show() 14 | 15 | antiJoinLeftRightDF = leftDF.join(rightDF, "name", "left_anti") 16 | print("This is left anti join right") 17 | antiJoinLeftRightDF.show() 18 | 19 | antiJoinRightLeftDF = rightDF.join(leftDF, "name", "left_anti") 20 | print("This is right anti join left") 21 | antiJoinRightLeftDF.show() 22 | -------------------------------------------------------------------------------- /spark/dataframes/columnExpresssions.scala: -------------------------------------------------------------------------------- 1 | // Note: Put data/people.json into the HDFS home directory 2 | val df = spark.read.json("people.json") 3 | 4 | assert("org.apache.spark.sql.Column" == df("firstName").getClass.getName) 5 | 6 | val firstNameTimesThree = df("firstName") * 3 7 | assert("org.apache.spark.sql.Column" == firstNameTimesThree.getClass.getName) 8 | 9 | val someColLike = $"firstName" like "Bo%" 10 | assert("org.apache.spark.sql.Column" == someColLike.getClass.getName) 11 | 12 | -------------------------------------------------------------------------------- /spark/dataframes/creatingDataFrames.scala: -------------------------------------------------------------------------------- 1 | import sys.process._ 2 | 3 | if(Seq("hdfs", "dfs", "-test", "-d", "people").! == 0) { 4 | Seq("hdfs", "dfs", "-rm", "-R", "people").! 5 | } 6 | val someDFReader = spark.read 7 | assert("org.apache.spark.sql.DataFrameReader" == someDFReader.getClass.getName) 8 | 9 | val first = Seq(Tuple1("Arvin"), Tuple1("Betty")) 10 | val second = Seq(Tuple1("Chris"), Tuple1("Derek")) 11 | val third = Seq(Tuple1("Eric"), Tuple1("Ferris")) 12 | 13 | spark.createDataFrame(first).write.save("people") 14 | 15 | spark.createDataFrame(second). 16 | write.mode("append").save("people") 17 | 18 | spark.createDataFrame(third). 19 | write.mode("ignore").save("people") 20 | 21 | val allPeople = spark.read.load("people") 22 | allPeople.count() 23 | "hdfs dfs -ls people".! 
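// Note on the save modes used above (the default mode is "error if exists",
// which is why the "people" directory is removed first): "append" adds new
// files alongside the existing ones, while "ignore" silently skips the write
// when the path already contains data -- so the third DataFrame ("Eric",
// "Ferris") never lands in the output and allPeople.count() should be 4.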
24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /spark/dataframes/grouping.scala: -------------------------------------------------------------------------------- 1 | // Note: Put data/people.json into the HDFS home directory 2 | val df = spark.read.json("people.json") 3 | 4 | val grpd = df.groupBy("testScore").agg(min($"firstName") as "First FirstName", 5 | max($"firstName") as "Last FirstName", 6 | min($"studyTime") as "min studytime", 7 | sum($"studyTime") as "all_studytime"). 8 | where($"all_studytime" > 100) 9 | grpd.show() 10 | 11 | // Grouping with LEFT OUTER JOIN, and showing zero for 12 | // count of rows with no corresponding child elements 13 | 14 | val customerSeq = Seq("""{ "firstName":"Nate", "id":1 }""", 15 | """{ "firstName":"Jackie", "id":2 }""") 16 | val customersDS = spark.createDataset(customerSeq) 17 | val customersDF = spark.read.json(customersDS) 18 | 19 | val ordersSeq = Seq("""{ "cust_id":1, "product":"Something", "order_id":2}""") 20 | val ordersDS = spark.createDataset(ordersSeq) 21 | val ordersDF = spark.read.json(ordersDS) 22 | 23 | val customerOrders = customersDF.join(ordersDF, $"id" === $"cust_id", "left_outer") 24 | 25 | val customerOrderCounts = customerOrders. 26 | groupBy($"firstName", $"id"). 27 | agg(count($"cust_id") as "num_orders") 28 | 29 | customerOrderCounts.show() 30 | 31 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/01.csv: -------------------------------------------------------------------------------- 1 | firstname,lastname,middle 2 | Bob,Johnson,Midi 3 | Steve,Worrell,Midrow 4 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/data.csv: -------------------------------------------------------------------------------- 1 | robert,10 2 | julie,20 3 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/interests.json: -------------------------------------------------------------------------------- 1 | { "fname":"Julie", "interest":"Pets" } 2 | { "fname":"Julie", "interest":"Cats" } 3 | { "fname":"Julie", "interest":"Dogs" } 4 | { "fname":"Aaron", "interest":"Economics" } 5 | { "fname":"Steve", "interest":"Real Estate" } 6 | { "fname":"aaron", "interest":"Fake Estate" } 7 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/left.csv: -------------------------------------------------------------------------------- 1 | name,region 2 | Nate,west 3 | Bob,west 4 | Sir Robin,west 5 | Bartholomew,west 6 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/maxVals.json: -------------------------------------------------------------------------------- 1 | { "product":"Car", "price":2147483650 } 2 | { "product":"Maxwell TV", "price":2147483648 } 3 | { "product":"Radio", "price":2147483647 } 4 | { "product":"Junk", "price":-2147483650 } 5 | { "product":"Defunct", "price":-2147483647 } 6 | { "product":"Mortgage", "price":-2147483646 } 7 | { "product":"Bad Price", "price":"$23000" } 8 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/people.json: -------------------------------------------------------------------------------- 1 | { "firstName":"Julie", "testScore":30, "studyTime":100 } 2 | { 
"firstName":"Aaron", "testScore":20, "studyTime":200 } 3 | { "firstName":"Steve", "testScore":40, "studyTime":300 } 4 | { "firstName":"Walter", "testScore":10, "studyTime":400 } 5 | { "firstName":"Josie", "testScore":30, "studyTime":500 } 6 | { "firstName":"Willie", "testScore":10, "studyTime":600 } 7 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/right.csv: -------------------------------------------------------------------------------- 1 | name,region 2 | Nate,east 3 | Bob,east 4 | Sir Robin,east 5 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/two.csv: -------------------------------------------------------------------------------- 1 | lastname,firstname 2 | Smith,Terry 3 | Jackson,Billy 4 | -------------------------------------------------------------------------------- /spark/dataframes/joins.scala: -------------------------------------------------------------------------------- 1 | // Note: Put data/people.json into the HDFS home directory 2 | val dfPeople = spark.read.json("people.json") 3 | val dfInterests = spark.read.json("interests.json") 4 | 5 | dfPeople.join(dfInterests, lower($"firstName") === lower($"fname"), "left_outer"). 6 | select("firstName", "interest").show() 7 | 8 | dfPeople.join(dfInterests, lower($"firstName") === lower($"fname"), "right_outer"). 9 | select("firstName", "interest").show() 10 | 11 | dfPeople.join(dfInterests).where($"firstName" === $"fname"). 12 | select("firstName", "interest").show() 13 | -------------------------------------------------------------------------------- /spark/dataframes/rowFunctions.scala: -------------------------------------------------------------------------------- 1 | // Misc. functions on Row objects 2 | val jsonSeq = Seq("""{ "firstName":"Nate", "iq":0, "prev_iq":200}""", 3 | """{ "firstName":"Jackie", "iq":200, "prev_iq":200 }""", 4 | """{ "firstName":"Ricky", "iq":-20}""") 5 | val peopleDS = spark.createDataset(jsonSeq) 6 | 7 | val peopleDF = spark.read.json(peopleDS) 8 | val firstRow = peopleDF.take(3)(2) 9 | 10 | println(firstRow.getClass) 11 | 12 | // println(firstRow.getAs[String]("firstName")) 13 | val firstName = firstRow.getAs[String]("firstName") 14 | val iq = firstRow.getAs[Long]("iq") 15 | println(firstName) 16 | println(firstName.getClass) 17 | println(iq) 18 | println(iq.getClass) 19 | -------------------------------------------------------------------------------- /spark/dataframes/saveDataFrameToDataSource.scala: -------------------------------------------------------------------------------- 1 | val tableName = "testpeople" 2 | 3 | val first = Seq(Tuple1("Arvin"), Tuple1("Betty")) 4 | val second = Seq(Tuple1("Chris"), Tuple1("Derek")) 5 | val third = Seq(Tuple1("Eric"), Tuple1("Ferris")) 6 | 7 | println("Writing first") 8 | spark.createDataFrame(first).write.mode("overwrite").saveAsTable(tableName) 9 | 10 | println("Writing second") 11 | spark.createDataFrame(second). 12 | write.mode("append").saveAsTable(tableName) 13 | 14 | println("Writing third") 15 | spark.createDataFrame(third).write. 16 | option("path", "peopleinmyhomedir"). 
17 | saveAsTable(tableName) 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /spark/dataframes/schemasCSV.scala: -------------------------------------------------------------------------------- 1 | // Put the CSV files in this directory into a 2 | // directory "csvdata" in HDFS prior to running this 3 | val opts = Map("header" -> "true", 4 | "inferSchema" -> "true") 5 | val csvDF = spark.read.options(opts).csv("csvdata") 6 | csvDF.printSchema() 7 | // csvDF.show() 8 | 9 | 10 | // Try specifying a schema and see what happens 11 | import org.apache.spark.sql.types.{StringType,StructField,StructType} 12 | val columnsList = List( 13 | StructField("firstname", StringType), 14 | StructField("lastname", StringType) 15 | ) 16 | 17 | val peopleSchema = StructType(columnsList) 18 | val csvDFWithSchema = spark.read.option("header", "true"). 19 | schema(peopleSchema).csv("csvdata") 20 | csvDFWithSchema.show() 21 | -------------------------------------------------------------------------------- /spark/dataframes/spark_sql_udfs.py: -------------------------------------------------------------------------------- 1 | # Example of using Python UDFs in Spark SQL 2 | def my_uppercase(x): 3 | upper(x) 4 | 5 | my_uppercase_udf = udf(my_uppercase, returnType=IntegerType()) 6 | rides_clean.createOrReplaceTempView("rides_clean") 7 | spark.udf.register("my_uppercase_udf", my_uppercase_udf) 8 | spark.sql("select date_time, my_uppercase_udf(date_time) from rides_clean").show() 9 | -------------------------------------------------------------------------------- /spark/dataframes/windowing.py: -------------------------------------------------------------------------------- 1 | # Spark SQL syntax for windowing 2 | # https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-window.html 3 | df = spark.range(10) 4 | df.createOrReplaceTempView("df") 5 | spark.sql(""" 6 | SELECT id, 7 | COUNT(id) OVER (ORDER BY id ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) cum_count, 8 | SUM(id) OVER (ORDER BY id ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) cum_count 9 | FROM df 10 | """).show() 11 | 12 | Example of LAG 13 | spark.sql(""" 14 | SELECT rider_id, date_time, 15 | LAG(date_time) OVER (PARTITION BY rider_id ORDER BY date_time) date_time_previous 16 | FROM rides 17 | """).show() 18 | -------------------------------------------------------------------------------- /spark/dataframes/withColumn.scala: -------------------------------------------------------------------------------- 1 | val flightsSeq = Seq("""{ "flight_num":1, "dt":"2017-01-01" }""", 2 | """{ "flight_num":2, "dt":"2017-01-02" }""", 3 | """{ "flight_num":3, "dt":"2017-01-03" }""", 4 | """{ "flight_num":4, "dt":"2017-01-04" }""") 5 | 6 | val stringRDD = sc.parallelize(flightsSeq) 7 | val flightsDS = spark.read.json(stringRDD) 8 | 9 | // There's a better way to do this (see below!) 10 | val enhanced = flightsDS.withColumn( 11 | "DayOfWeek", date_format($"dt", "E")).withColumn( 12 | "isSaturday", date_format($"dt", "E") === "Sat").withColumn( 13 | "isSunday", date_format($"dt", "E") === "Sun").withColumn( 14 | "isMonday", date_format($"dt", "E") === "Mon").withColumn( 15 | "isTuesday", date_format($"dt", "E") === "Tue").withColumn( 16 | "isWednesday", date_format($"dt", "E") === "Wed").withColumn( 17 | "isThursday", date_format($"dt", "E") === "Thu").withColumn( 18 | "isFriday", date_format($"dt", "E") === "Fri") 19 | 20 | 21 | enhanced.show() 22 | 23 | // There's a better (e.g. 
functional way to do this but for now . . . .) 24 | var enhancedBetter = flightsDS 25 | for (day <- Array("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")) { 26 | enhancedBetter = 27 | enhancedBetter.withColumn("is" + day, 28 | date_format($"dt", "E") === day) 29 | } 30 | 31 | -------------------------------------------------------------------------------- /spark/get-python-examples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # You can find the python tar gz using a command like this: 3 | # sudo find / -follow -iname "*python*tar*gz" 2>/dev/null 4 | PYTHON_EXAMPLES_FILE=/opt/cloudera/parcels/CDH/lib/spark/python.tar.gz 5 | cd 6 | mkdir -p ~/python-examples && cd ~/python-examples 7 | tar -xzvf $PYTHON_EXAMPLES_FILE 8 | ls -lR 9 | 10 | # Example of running code (Remove the # in front of spark-submit) 11 | # spark-submit als.py 12 | # 2>/dev/null is a quick and dirty add which silences noisy log messages 13 | # spark-submit pi.py 100 2>/dev/null 14 | -------------------------------------------------------------------------------- /spark/local_file.scala: -------------------------------------------------------------------------------- 1 | val f = sc.textFile("file:///some_local_file.txt") 2 | f.count() 3 | -------------------------------------------------------------------------------- /spark/log-level/README.md: -------------------------------------------------------------------------------- 1 | # Log-Levels 2 | 3 | Run ./run.sh to see the difference. If ./log4j.properties is in the working 4 | directory of Spark-Shell, then it will (hopefully) read and adhere to the log4j.properties 5 | in the directory where you fire off the shell. 6 | 7 | ./log4j.properties.with.debug.log.level is an example of setting Spark's 8 | log level to DEBUG. 
9 | 10 | ./log4j.properties is an example of setting Spark's log level to ERROR 11 | (to include only critical error messages in Spark's output) 12 | 13 | $SPARK_HOME/conf 14 | 15 | Example, if you installed Spark in your home directory, 16 | $HOME/tools/spark-1.0.0/conf/log4j.properties 17 | -------------------------------------------------------------------------------- /spark/log-level/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 11 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 12 | -------------------------------------------------------------------------------- /spark/log-level/log4j.properties.with.debug.log.level: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=DEBUG, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 11 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 12 | -------------------------------------------------------------------------------- /spark/log-level/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OLD_DIR=$PWD 3 | cd $HOME 4 | echo "Running Spark Shell from your $HOME directory. There (hopefully is no) log4j.properties in $HOME." 5 | echo "Notice that WARN and INFO messages will appear when you run spark-shell" 6 | echo "Enter :quit in the Spark Shell when you come to the spark> prompt. Press a key to run" 7 | read FOO 8 | spark-shell 9 | 10 | clear 11 | cd $OLD_DIR 12 | echo "Now running Spark Shell with thel log4j.properties in this directory specified. " 13 | echo "WARN and INFO messages should NOT appear in the Spark shell." 14 | echo "Enter :quit in the Spark Shell when you come to the spark> prompt. 
Press a key to run" 15 | read FOO 16 | spark-shell 17 | -------------------------------------------------------------------------------- /spark/maven_example/README.md: -------------------------------------------------------------------------------- 1 | # Readme 2 | 3 | # This is only to be performed on the VM, and is in no 4 | # way an example of a good practice 5 | 6 | cd ~/training_materials/sparkdev/projects/countjpgs 7 | mv pom.xml pom.xml.bak 8 | curl https://raw.githubusercontent.com/NathanNeff/hadoop-examples/master/spark/maven_example/pom.xml > pom.xml 9 | 10 | # to hack the local repository on the VM, remove the local repository by un-commenting this line and running 11 | # it 12 | # rm -rf ~/.m2 13 | mvn compile 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /spark/pair/sales.txt: -------------------------------------------------------------------------------- 1 | bob 10 2 | steve 10 3 | mirko 10 4 | bob 10 5 | dave 10 6 | zip 10 7 | zip 100 8 | bob 10 9 | bob 10 10 | mirko 20 11 | -------------------------------------------------------------------------------- /spark/pair/sales_by_salesperson.scala: -------------------------------------------------------------------------------- 1 | val inputFile = "file:/home/training/src/hadoop-examples/spark/pair/sales.txt" 2 | val sales = sc.textFile(inputFile) 3 | val sales_pairs = 4 | sales.map(sale => (sale.split('\t')(0), 5 | (sale.split('\t')(1)).toInt)) 6 | val sales_by_salesperson = 7 | sales_pairs.reduceByKey((s1, s2) => 8 | s1 + s2) 9 | sales_by_salesperson.take(10) 10 | -------------------------------------------------------------------------------- /spark/pair/weblogs.scala: -------------------------------------------------------------------------------- 1 | val inputFile = "file:/home/training/src/hadoop-examples/spark/pair/weblogs.txt" 2 | val weblogs = sc.textFile(inputFile) 3 | val ips_and_page = weblogs.map(s => (s.split(' ')(0) + '::' + s.split(' ')(2))) 4 | ips_and_page.take(1) 5 | -------------------------------------------------------------------------------- /spark/pair/weblogs.txt: -------------------------------------------------------------------------------- 1 | 1.2.3.4 - /foo.html 2 | 1.2.3.5 - /foo.html 3 | 1.2.3.5 - /foo.html 4 | 1.2.3.6 - /foo.html 5 | 1.2.3.6 - /foo.html 6 | 1.2.3.4 - /foo.html 7 | 192.168.1.123 - /bar.html 8 | 192.168.1.124 - /bar.html 9 | 192.168.1.123 - /bar.html 10 | 192.168.1.124 - /bar.html 11 | 192.168.1.123 - /bar.html 12 | -------------------------------------------------------------------------------- /spark/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | spark-shell -nowarn -i wordcount.scala 3 | -------------------------------------------------------------------------------- /spark/simple/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 21 | 22 | fs.default.name 23 | file:/// 24 | 25 | 26 | -------------------------------------------------------------------------------- /spark/simple/count.scala: -------------------------------------------------------------------------------- 1 | val data = sc.textFile("data.txt") 2 | data.count() 3 | -------------------------------------------------------------------------------- /spark/simple/data.txt: -------------------------------------------------------------------------------- 1 | bob 10 2 | nate 20 3 | steve 30 4 | 
-------------------------------------------------------------------------------- /spark/simple/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 11 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 12 | -------------------------------------------------------------------------------- /spark/somedata.txt: -------------------------------------------------------------------------------- 1 | spark 2 | shark 3 | spark 4 | squark 5 | lark 6 | bark 7 | lark 8 | Spark 9 | 10 | -------------------------------------------------------------------------------- /spark/spark-sql-scripts/README.md: -------------------------------------------------------------------------------- 1 | # spark-sql-scripts 2 | 3 | This directory contains simple Spark SQL snippets 4 | that are meant to be run from a Spark Shell that 5 | already has a sqlContext initialized. 6 | 7 | # create_table_and_load.scala 8 | 9 | Creates a DataFrame from an RDD using schema from 10 | an existing table. 11 | 12 | - TODO Need to investigate deprecation warning [1] (potentially from 13 | insertInto function?) and need to investigate KeyProviderCache exception [2] 14 | 15 | However, code still runs "correctly" and inserts into table. 16 | 17 | [1] warning: there were 1 deprecation warning(s); re-run with -deprecation for details 18 | 19 | [2] ERROR hdfs.KeyProviderCache: Could not find uri with key [dfs.encryption.key.provider.uri] 20 | to create a keyProvider !! 
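If the deprecation warning in [1] really does come from DataFrame.insertInto, one likely (untested here) replacement is the DataFrameWriter form introduced in Spark 1.4, e.g. in create_table_and_load.scala:

    ips_df.write.mode("append").insertInto("hadoop_examples_ips")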
21 | 22 | -------------------------------------------------------------------------------- /spark/spark-sql-scripts/computeStats.md: -------------------------------------------------------------------------------- 1 | // Examples of table stats 2 | 3 | 4 | In Hive / Impala, run the following: 5 | 6 | CREATE TABLE accounts2 7 | STORED AS PARQUET 8 | AS SELECT * FROM accounts; 9 | 10 | In SparkSQL / Spark Shell, run the following 11 | 12 | spark.sql("describe extended accounts2").select("col_name", "data_type").collect().foreach(println) 13 | 14 | In Hive / Impala, run the following: 15 | 16 | COMPUTE STATS accounts2; 17 | 18 | Re-run this statement, notice that Spark SQL picks up the table stats (albeit very course stats like totalSize) 19 | spark.sql("describe extended accounts2").select("col_name", "data_type").collect().foreach(println) 20 | 21 | -------------------------------------------------------------------------------- /spark/spark-sql-scripts/create_table_and_load.scala: -------------------------------------------------------------------------------- 1 | // This is meant to be run inside the Spark Shell 2 | import sqlContext.implicits._ 3 | import org.apache.spark.sql._ 4 | 5 | sqlContext.sql("""CREATE EXTERNAL TABLE IF NOT EXISTS hadoop_examples_ips 6 | (ip STRING) 7 | ROW FORMAT DELIMITED 8 | FIELDS TERMINATED BY '\t'""") 9 | val ips_schema = sqlContext.table("hadoop_examples_ips").schema 10 | 11 | val data = Array("123.456.789.999 - bob - GET /cat_picture.jpg", 12 | "1.2.3.4 - steve GET /dog_picture.jpg") 13 | 14 | val weblogs = sc.parallelize(data) 15 | 16 | val ips = weblogs.map(line => Row(line.split(' ')(0))) 17 | 18 | val ips_df = sqlContext.createDataFrame(ips, ips_schema) 19 | 20 | ips_df.insertInto("hadoop_examples_ips") 21 | -------------------------------------------------------------------------------- /spark/spark-sql-scripts/create_table_and_load_parquet.scala: -------------------------------------------------------------------------------- 1 | // This is meant to be run inside the Spark Shell 2 | import sqlContext.implicits._ 3 | import org.apache.spark.sql._ 4 | 5 | val tableName = "hadoop_examples_ips_parquet" 6 | 7 | // don't forget the "s" function before """ 8 | sqlContext.sql(s"""CREATE EXTERNAL TABLE IF NOT EXISTS $tableName 9 | (ip STRING) 10 | STORED AS PARQUET""") 11 | 12 | val ips_schema = sqlContext.table(tableName).schema 13 | 14 | val data = Array("123.456.789.999 - bob - GET /cat_picture.jpg", 15 | "1.2.3.4 - steve GET /dog_picture.jpg") 16 | 17 | val weblogs = sc.parallelize(data) 18 | 19 | val ips = weblogs.map(line => Row(line.split(' ')(0))) 20 | 21 | val ips_df = sqlContext.createDataFrame(ips, ips_schema) 22 | 23 | ips_df.insertInto(tableName) 24 | -------------------------------------------------------------------------------- /spark/spark-sql-scripts/data/people.json: -------------------------------------------------------------------------------- 1 | {"name":"Bob", "relatives": [ {"name":"Robert","rel":"dad"}, {"name":"Roberta","rel":"mother"}]} 2 | {"name":"Steve", "relatives": [ {"name":"Steven","rel":"dad"}]} 3 | {"name":"Sharon", "relatives": [ {"name":"Ozzy","rel":"husband"}]} 4 | {"name":"Han Solo"} 5 | -------------------------------------------------------------------------------- /spark/spark-sql-scripts/parse_json.scala: -------------------------------------------------------------------------------- 1 | import sqlContext.implicits._ 2 | import org.apache.spark.sql._ 3 | 4 | val curDir = System.getProperty("user.dir") 5 | val 
json = sqlContext.read.json("file:" + curDir + "/data/people.json") 6 | json.printSchema() 7 | json.registerTempTable("json") 8 | 9 | /* 10 | TODO figure out what the table_name versus column_name 11 | means when EXPLODING arrays of structs 12 | https://docs.databricks.com/spark/latest/spark-sql/language-manual/select.html 13 | (Remove OUTER to only get people with relatives) 14 | */ 15 | val flattened_table = sqlContext.sql("""SELECT name, r.name AS relative_name, r.rel as foo 16 | FROM json 17 | LATERAL VIEW OUTER EXPLODE(relatives) r AS r""") 18 | 19 | flattened_table.show() 20 | flattened_table.printSchema 21 | 22 | 23 | -------------------------------------------------------------------------------- /spark/spark-sql/README.md: -------------------------------------------------------------------------------- 1 | ## Step 1: Add data to HDFS 2 | 3 | hdfs dfs -mkdir /user/training/ 4 | hdfs dfs -put data/favorite_foods /user/training 5 | 6 | ## Step 2 7 | 8 | ## Step 3: Profit! 9 | 10 | mvn package 11 | spark-submit --class examples.ExplodeAndFriends target/SparkSQLExamples-1.0.jar 12 | -------------------------------------------------------------------------------- /spark/spark-sql/data/favorite_foods/favorite_foods.txt: -------------------------------------------------------------------------------- 1 | 1 chocolate,bread,pot roast,chicken 2 | 2 pizza,milk,flowers,led zeppelin,rabbit,duck,pheasant,pheasant 3 | 3 nothing 4 | 4 5 | 5 cheese,milk,bananas,apples 6 | 6 carrots,celery,steak 7 | 7 chili 8 | 8 turkey 9 | 9 roasted duck 10 | 10 possum soup 11 | 11 stone soup,french fries 12 | 12 hamburgers,quail,milk 13 | -------------------------------------------------------------------------------- /spark/spark-sql/src/main/scala/examples/SimpleShowTables.scala: -------------------------------------------------------------------------------- 1 | package examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.hive._ 6 | 7 | object MoreSQL { 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext() 11 | sc.setLogLevel("FATAL") 12 | 13 | val sqlContext = new HiveContext(sc) 14 | 15 | sqlContext.sql("DROP TABLE IF EXISTS favorite_foods") 16 | val sql = """ 17 | CREATE EXTERNAL TABLE favorite_foods( 18 | userid INT, 19 | favorite_foods STRING) 20 | ROW FORMAT DELIMITED 21 | FIELDS TERMINATED BY '\t' 22 | LOCATION '/user/training/favorite_foods' 23 | """ 24 | sqlContext.sql(sql) 25 | 26 | 27 | // http://spark.apache.org/docs/1.6.0/api/scala/index.html#org.apache.spark.sql.DataFrame 28 | val fav_foods = sqlContext.read.table("favorite_foods") 29 | 30 | val expl = fav_foods.explode("favorite_foods", "favorite_food") { 31 | foods:String => foods.split(",") 32 | } 33 | 34 | expl.columns 35 | expl.show() 36 | 37 | val lateral = sqlContext.sql("SELECT userid, favorite_food FROM favorite_foods LATERAL VIEW explode(SPLIT(favorite_foods,',')) adTable AS favorite_food") 38 | 39 | lateral.columns 40 | lateral.show() 41 | 42 | sc.stop 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /spark/spark-sql/src/main/scala/examples/explode_and_friends.scala: -------------------------------------------------------------------------------- 1 | package examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.hive._ 6 | 7 | object ExplodeAndFriends { 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext() 
11 | sc.setLogLevel("FATAL") 12 | 13 | val sqlContext = new HiveContext(sc) 14 | 15 | sqlContext.sql("DROP TABLE IF EXISTS favorite_foods") 16 | val sql = """ 17 | CREATE EXTERNAL TABLE favorite_foods( 18 | userid INT, 19 | favorite_foods STRING) 20 | ROW FORMAT DELIMITED 21 | FIELDS TERMINATED BY '\t' 22 | LOCATION '/user/training/favorite_foods' 23 | """ 24 | sqlContext.sql(sql) 25 | 26 | 27 | // http://spark.apache.org/docs/1.6.0/api/scala/index.html#org.apache.spark.sql.DataFrame 28 | val fav_foods = sqlContext.read.table("favorite_foods") 29 | 30 | val expl = fav_foods.explode("favorite_foods", "favorite_food") { 31 | foods:String => foods.split(",") 32 | } 33 | 34 | expl.columns 35 | expl.show() 36 | 37 | val lateral = sqlContext.sql("SELECT userid, favorite_food FROM favorite_foods LATERAL VIEW explode(SPLIT(favorite_foods,',')) adTable AS favorite_food") 38 | 39 | lateral.columns 40 | lateral.show() 41 | 42 | sc.stop 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /spark/spark-sql/src/main/scala/examples/more_sql.scala: -------------------------------------------------------------------------------- 1 | package examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.hive._ 6 | 7 | object MoreSQL { 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext() 11 | sc.setLogLevel("FATAL") 12 | 13 | val sqlContext = new HiveContext(sc) 14 | 15 | sqlContext.sql("DROP TABLE IF EXISTS favorite_foods") 16 | val sql = """ 17 | CREATE EXTERNAL TABLE favorite_foods( 18 | userid INT, 19 | favorite_foods STRING) 20 | ROW FORMAT DELIMITED 21 | FIELDS TERMINATED BY '\t' 22 | LOCATION '/user/training/favorite_foods' 23 | """ 24 | sqlContext.sql(sql) 25 | 26 | 27 | // http://spark.apache.org/docs/1.6.0/api/scala/index.html#org.apache.spark.sql.DataFrame 28 | val fav_foods = sqlContext.read.table("favorite_foods") 29 | 30 | val expl = fav_foods.explode("favorite_foods", "favorite_food") { 31 | foods:String => foods.split(",") 32 | } 33 | 34 | expl.columns 35 | expl.show() 36 | 37 | val lateral = sqlContext.sql("SELECT userid, favorite_food FROM favorite_foods LATERAL VIEW explode(SPLIT(favorite_foods,',')) adTable AS favorite_food") 38 | 39 | lateral.columns 40 | lateral.show() 41 | 42 | sc.stop 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /spark/sparkml/kmeans.py: -------------------------------------------------------------------------------- 1 | from pyspark.ml.clustering import KMeans 2 | from pyspark.ml import Pipeline 3 | from pyspark.ml import PipelineModel 4 | from pyspark.sql import Row 5 | from pyspark.ml.feature import VectorAssembler 6 | from pyspark.sql.functions import split, col 7 | 8 | # Load the data from a directory with device status data including 9 | # latitude, longitude 10 | # Create a DataFrame that #has 2 columns named Lat and Lon from the 4th and 5th 11 | # fields in the file 12 | 13 | filename = input("Please enter the HDFS Directory where the data is located:") 14 | latLonDF = spark.read.csv(filename).\ 15 | select(col('_c3').cast('float').alias('lat'),\ 16 | col('_c4').cast('float').alias('lon'))\ 17 | .where("lat <> 0 and lon <> 0") 18 | 19 | 20 | # Create a vector assembler that will take in our DataFrame and convert the 21 | # inputCols specified 22 | va = VectorAssembler(inputCols=["lat","lon"],outputCol="features") 23 | 24 | # Use the vector assembler to transform the 
DataFame which will add a new 25 | # column called 'features' which will be of the Vector type 26 | 27 | vectorDF = va.transform(latLonDF) 28 | 29 | # Create a Kmeans estimator that takes the "features" column as input and set 30 | # the value for K to 5 with a tolerance of .01 and a seed # of 12345 31 | km= KMeans(k=5,tol=.01,seed=12345, featuresCol="features") 32 | 33 | kmModel = km.fit(vectorDF) 34 | 35 | # Print out the cluster centers 36 | for center in kmModel.clusterCenters(): print(center) 37 | predictionDF = kmModel.transform(vectorDF) 38 | predictionDF.show() 39 | 40 | # Same process via an ML pipeline 41 | # pl = Pipeline(stages=[va,km]) 42 | # plmodel = pl.fit(latLonDF) 43 | # predictions = plmodel.transform(latLonDF) 44 | # plmodel.write().overwrite().save("/loudacre/pipelineModel/") 45 | # plmodel1 = PipelineModel.load("/loudacre/pipelineModel/") 46 | # predictions.show(5) 47 | 48 | -------------------------------------------------------------------------------- /spark/structured_streaming_sensors/rate_source_simple.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple example of Spark Structured Streaming's rate source 3 | 4 | References: Changing a column's name: 5 | - https://stackoverflow.com/questions/34077353 6 | """ 7 | from pyspark.sql import SparkSession 8 | 9 | spark = SparkSession.builder.getOrCreate() 10 | 11 | # read data from a set of streaming files 12 | rateDF = spark.readStream.format("rate").option("rowsPerSecond", 50).load() 13 | 14 | # rateDF.printSchema() 15 | # root 16 | # |-- timestamp: timestamp (nullable = true) 17 | # |-- value: long (nullable = true) 18 | 19 | rateQuery = rateDF.writeStream.format("console").option("truncate", False).start() 20 | 21 | # Remove all columns except "timestamp" and rename 22 | renamedQuery = rateDF.selectExpr("timestamp AS ts").\ 23 | writeStream.\ 24 | format("console").option("truncate", False).start() 25 | 26 | # Call rateQuery.stop() 27 | # Call renamedQuery.stop() 28 | -------------------------------------------------------------------------------- /spark/tf-idf/tf-idf.spark: -------------------------------------------------------------------------------- 1 | myfiles = "hdfs://localhost:8020/user/training/mytext/" 2 | mytext = sc.wholeTextFiles(myfiles) 3 | mylines = mytext.map(lambda (filename, content) : ((os.path.basename(filename),content.replace("\n", " ")))) 4 | mywords = mylines.flatMapValues(lambda content : content.split(" ")) 5 | 6 | tf = mywords.map(lambda (filename, word) : ((filename, word), 1)).reduceByKey(lambda a,b : a+b) 7 | 8 | bign = mytext.count() 9 | 10 | df = tf.map(lambda ((file,word),count) : (word,1)).countByKey() 11 | 12 | import math 13 | 14 | tfidf = tf.map(lambda ((file,word),count) : ((file,word),count*math.log(bign/df.get(word)))) 15 | tfidf.collect() 16 | -------------------------------------------------------------------------------- /spark/wordcount.scala: -------------------------------------------------------------------------------- 1 | val textFile = sc.textFile("somedata.txt") 2 | textFile.count() // Number of items in this RDD 3 | textFile.first() // First item in this RDD 4 | val linesWithSpark = textFile.filter(line => line.contains("Spark")) 5 | textFile.filter(line => line.contains("Spark")).count() // How many lines contain "Spark"? 
6 | println linesWithSpark 7 | exit 8 | -------------------------------------------------------------------------------- /spark/wordlength_with_details/data.txt: -------------------------------------------------------------------------------- 1 | one oompf 2 | otto 3 | ottoman 4 | terabyte 5 | tough 6 | thyme 7 | -------------------------------------------------------------------------------- /spark/wordlength_with_details/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 13 | -------------------------------------------------------------------------------- /spark/wordlength_with_details/wordlength_with_details.scala: -------------------------------------------------------------------------------- 1 | val input_dir = System.getProperty("user.dir") 2 | 3 | val words = sc.textFile("file:" + input_dir + "/data.txt"). 4 | flatMap(line => line.split(" ")) 5 | 6 | /* ("o", "other"), 7 | * ("o", "otto"), 8 | * ("t", "tomcat"), 9 | * ("o", "otto"), 10 | */ 11 | val first_letter_and_word = words.map( 12 | word => (word.substring(0,1), word)) 13 | 14 | val first_letter_counts = 15 | first_letter_and_word.map{ 16 | case (first_letter, word) => (first_letter, 1)}. 17 | reduceByKey((x,y) => (x + y)) 18 | 19 | 20 | /* Produce: 21 | * ("o", "otto,other") 22 | * ("t", "too",tomcat") 23 | */ 24 | val first_letter_and_wordlist = 25 | first_letter_and_word.distinct(). 26 | groupByKey(). 27 | mapValues(_.mkString(",")) 28 | 29 | val counts_with_words = 30 | first_letter_counts.join(first_letter_and_wordlist) 31 | 32 | val output_dir = "file:" + input_dir + "/wordlength_with_details" 33 | counts_with_words.saveAsTextFile(output_dir) 34 | 35 | 36 | -------------------------------------------------------------------------------- /sql-diffs/.gitignore: -------------------------------------------------------------------------------- 1 | *results.txt 2 | -------------------------------------------------------------------------------- /sql-diffs/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | Illustration of variations in the SQL-dialects 4 | on a given version of Hadoop/Impala. 5 | 6 | To load data from ./accts.txt 7 | 8 | ./load_data.sh 9 | 10 | To compare SQL variations, look at query*.sql files, then run 11 | them using: 12 | 13 | ./run_queries.sh 14 | 15 | To compare output, review output files: 16 | 17 | ./query1.sql-hive-results.txt vs. 18 | ./query1.sql-impala-results.txt vs. 19 | ./query1.sql-mysql-results.txt 20 | 21 | You could use some diff script or just run diff. 
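For example, to eyeball the differences for the first query (assuming ./run_queries.sh has already produced the result files):

    diff query1.sql-hive-results.txt query1.sql-impala-results.txt
    diff query1.sql-hive-results.txt query1.sql-mysql-results.txt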
22 | -------------------------------------------------------------------------------- /sql-diffs/accts.txt: -------------------------------------------------------------------------------- 1 | 1 Aaron Aardvark CO 2 | 2 Betty Aardvark CO 3 | 3 Cathy Aardvark IL 4 | 4 Steve Snake MO 5 | 5 Steve Slither CO 6 | 6 Steve Zeppelin CO 7 | 7 Lenny Zeppelin CO 8 | 8 Sal Stevens IL 9 | 20 Aaron Aardvark CO 10 | 21 Aaron Aardvark CO 11 | 22 Sandwichhead Aardvark CO 12 | 22 Sandwichhead Bigfoot CO 13 | -------------------------------------------------------------------------------- /sql-diffs/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | DB_NAME=$1 4 | USER_ID=$2 5 | PASSWORD=$3 6 | if [[ -z "$DB_NAME" || -z "$USER_ID" || -z "$PASSWORD" ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Load MySQL 12 | mysql -u $USER_ID --password=$PASSWORD < " 9 | exit 1 10 | fi 11 | for SQL_QUERY_FILE in query*.sql; do 12 | impala-shell -d $DB_NAME -f $SQL_QUERY_FILE \ 13 | -o $SQL_QUERY_FILE-impala-results.txt \ 14 | --delimited 15 | mysql -u $USER_ID --password=$PASSWORD $DB_NAME --column-names=false \ 16 | < $SQL_QUERY_FILE > $SQL_QUERY_FILE-mysql-results.txt 17 | 18 | beeline -u jdbc:hive2://localhost:10000/$DB_NAME --silent --verbose=false \ 19 | --showHeader=false \ 20 | --username=$USER_ID --password=$PASSWORD \ 21 | -f $SQL_QUERY_FILE --outputformat=tsv2 | sed -e '/^$d/d' > $SQL_QUERY_FILE-hive-results.txt 22 | done 23 | -------------------------------------------------------------------------------- /sqoop/README.md: -------------------------------------------------------------------------------- 1 | # Sqoop Job 2 | 3 | 4 | execute ./sqoop-job-create 5 | sqoop job --list 6 | sqoop job --show 7 | sqoop job --exec import-accounts 8 | sqoop job --show 9 | -------------------------------------------------------------------------------- /sqoop/sqoop-job-create: -------------------------------------------------------------------------------- 1 | sqoop job --create import-accounts -- \ 2 | import \ 3 | --connect jdbc:mysql://servername/dbname \ 4 | --username something --password something \ 5 | --table accounts \ 6 | --target-dir /accounts \ 7 | --null-string '\\N' \ 8 | --null-non-string '\\N' \ 9 | --incremental append \ 10 | --check-column acct_num 11 | -------------------------------------------------------------------------------- /utils/random_crash/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_DIR=input/random-crash 3 | OUTPUT_DIR=output/random-crash 4 | STREAMING_JAR=/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-*.jar 5 | 6 | hadoop fs -test -d $INPUT_DIR && hadoop fs -rm -R $INPUT_DIR 7 | hadoop fs -test -d $OUTPUT_DIR && hadoop fs -rm -R $OUTPUT_DIR 8 | hadoop fs -mkdir $INPUT_DIR 9 | 10 | for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20;do 11 | hadoop fs -touchz $INPUT_DIR/$i 12 | done 13 | 14 | for j in 1 2 3 4 15 | do 16 | echo "Starting job $j" 17 | hadoop jar $STREAMING_JAR \ 18 | -Dmapred.reduce.tasks=0 \ 19 | -input $INPUT_DIR \ 20 | -output $OUTPUT_DIR \ 21 | -mapper mapper.pl \ 22 | -file mapper.pl \ 23 | -Dmapred.job.name=Random_Crash_$j & 24 | done 25 | -------------------------------------------------------------------------------- /utils/setup_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for dir in $EXAMPLES_DIR 
/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce 3 | do 4 | test -d $dir && export EXAMPLES_DIR=$dir 5 | done 6 | 7 | test -d $EXAMPLES_DIR || { 8 | echo "Can't find examples dir $EXAMPLES_DIR" 9 | exit 1 10 | } 11 | 12 | export EXAMPLES_JAR=$EXAMPLES_DIR/hadoop-examples.jar 13 | 14 | test -f $EXAMPLES_JAR || { 15 | echo "Can't find $EXAMPLES_JAR" 16 | exit 1 17 | } 18 | -------------------------------------------------------------------------------- /utils/sleepjob/allocations.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 8 5 | 3 6 | FIFO 7 | 30 8 | 9 | 10 | 11 | 12 | 60 13 | 14 | -------------------------------------------------------------------------------- /utils/sleepjob/bigjob.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUTPUT_DIR=/user/training/nate/output/wordcount2 3 | INPUT_DIR=/user/training/nate/input 4 | hadoop fs -test -d $OUTPUT_DIR && hadoop fs -rm -R $OUTPUT_DIR 5 | hadoop jar /usr/lib/hadoop-0.20-mapreduce/hadoop-examples.jar wordcount $INPUT_DIR $OUTPUT_DIR 6 | -------------------------------------------------------------------------------- /utils/sleepjob/submitLONGSleepJob.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hadoop jar ~/assets/sleep.jar SleepJob \ 3 | -D pool.name="BallHog" \ 4 | -D mapred.job.name="BallHogJob" \ 5 | -m 10 -r 10 -mt 300000 -rt 300000 & 6 | echo "Just submitted BallHog job" 7 | -------------------------------------------------------------------------------- /utils/sleepjob/submitReportsBOSSPool.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for month in January February March April May; do 3 | hadoop jar ~/assets/sleep.jar SleepJob \ 4 | -D pool.name="boss" \ 5 | -D mapred.job.name="FIFO $month" \ 6 | -m 10 -r 10 -mt 30000 -rt 30000 & 7 | sleep 2 8 | done 9 | #-fs hdfs://greg:8020 \ 10 | #-jt hari:8021 \ 11 | -------------------------------------------------------------------------------- /utils/teragen-and-terasort.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage is here: 3 | # http://www.michael-noll.com/blog/2011/04/09/benchmarking-and-stress-testing-an-hadoop-cluster-with-terasort-testdfsio-nnbench-mrbench/#teragen-generate-the-terasort-input-data-if-needed 4 | INPUT_DIR=data/teragendata_10_gb 5 | OUTPUT_DIR=output/terasort_10_gb 6 | 7 | # 1 GB = 10000000 8 | # 20 GB = 200000000 9 | TERAGEN_SIZE=100000000 10 | # Try Bumping # of Maps for Teragen :-) 11 | TERAGEN_MAPS=20 12 | FORCE_TERAGEN=1 13 | # Set RUN_COMPARISON to 1 if you want to run terasort (again) with io.sort.mb set differently 14 | RUN_COMPARISON=1 15 | OUTPUT_DIR2=output/terasort10_gb_again 16 | 17 | # If you don't have EXAMPLES_DIR set, you can manually set it here, 18 | # or let this code loop through possible dirs and find an existing directory 19 | test -z "$EXAMPLES_DIR" && { 20 | for d in /usr/lib/hadoop-0.20-mapreduce/ /opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce; 21 | do 22 | test -d $d && EXAMPLES_DIR=$d 23 | done 24 | } 25 | 26 | # hadoop jar hadoop-*examples*.jar teragen 27 | if ! 
hadoop fs -test -e $INPUT_DIR/part-00000 || [[ "$FORCE_TERAGEN" -ne "0" ]]; then 28 | echo "------------- Creating test data --------------" 29 | hadoop fs -rm -R $INPUT_DIR 30 | 31 | hadoop jar $EXAMPLES_DIR/hadoop-examples.jar teragen -Dmapred.map.tasks=$TERAGEN_MAPS $TERAGEN_SIZE $INPUT_DIR 32 | fi 33 | 34 | hadoop fs -test -d $OUTPUT_DIR && hadoop fs -rm -R $OUTPUT_DIR 35 | hadoop jar $EXAMPLES_DIR/hadoop-examples.jar terasort $INPUT_DIR $OUTPUT_DIR 36 | 37 | if [[ "1" -eq "$RUN_COMPARISON" ]]; then 38 | # This might be needed if running on EC2 (Admin Class) or smaller cluster 39 | # with less than 1GB Heap Size for Mappers 40 | # -Dmapred.child.java.opts=-Xmx512m 41 | hadoop jar $EXAMPLES_DIR/hadoop-examples.jar terasort -Dio.sort.mb=256 $INPUT_DIR $OUTPUT_DIR2 42 | fi 43 | --------------------------------------------------------------------------------