├── .gitignore ├── .gitmodules ├── README.md ├── avro ├── .gitignore ├── README.org ├── devices │ ├── README.md │ ├── avro │ │ ├── __init__.py │ │ ├── datafile.py │ │ ├── io.py │ │ ├── ipc.py │ │ ├── protocol.py │ │ ├── schema.py │ │ ├── tool.py │ │ └── txipc.py │ ├── devices.avro │ └── read_devices.py ├── download_avro.sh ├── read_users.py ├── run.sh ├── user.avsc ├── write_bunch_of_users.py └── write_users.py ├── cloudera-director └── api-examples │ └── list-instances.py ├── cloudera-manager └── python_rest_api │ ├── README.md │ ├── cm-dump-config.py │ ├── simple_cluster_properties.py │ └── simple_config_settings.py ├── data-engineering ├── README.md ├── create_txnl_tbl.hql ├── create_txnl_tbl.sh ├── insert_into_products_txnl.hql ├── insert_into_products_txnl.sh ├── query_txnl.hql ├── query_txnl_using_hive.sh ├── query_txnl_using_impala.sh ├── update_txnl_using_hive.hql ├── update_txnl_using_hive.sh └── view_dir_structure.sh ├── hbase ├── apitests │ ├── README.md │ ├── pom.xml │ ├── run.sh │ └── src │ │ ├── main │ │ └── java │ │ │ ├── gui │ │ │ └── HBaseBlaster.java │ │ │ ├── misc │ │ │ └── CreateTable.java │ │ │ └── util │ │ │ └── HBaseUtility.java │ │ └── test │ │ └── java │ │ └── apitests │ │ └── AppTest.java ├── authorization │ └── simple │ │ └── README.md ├── client_maven_example │ └── pom.xml ├── colfam_flush │ ├── .gitignore │ ├── README.md │ └── insert_random_words.py ├── groovy │ └── loadRandomData │ │ ├── README.md │ │ ├── compile.sh │ │ ├── loadRandomData.groovy │ │ └── run.sh ├── hbase-sandbox │ ├── README.md │ ├── pom.xml │ ├── runStopRowThing.sh │ └── src │ │ └── main │ │ ├── java │ │ └── hbasesandbox │ │ │ ├── StopRowThing.java │ │ │ └── UtilityThing.java │ │ └── resources │ │ └── log4j.properties ├── hbase_blocks │ └── hbase_blocks.rb ├── hbase_hive_impala │ ├── README.org │ ├── create_hbase_table.rb │ ├── create_hive_hbase_table.sql │ ├── map-hive-to-hbase-ratings.sql │ ├── put_data_into_sales_aggregate_table.sh │ └── select_some_ratings.sql ├── hotSpots │ ├── README.md │ ├── TransactionFactory.groovy │ ├── TransactionImporter.groovy │ ├── compile.sh │ └── run.sh ├── null_safe_joins │ ├── create_and_query.sql │ ├── create_and_query.txt │ └── data.rb ├── random_words_python │ ├── README.md │ └── insert_random_words.py ├── recordGenerator │ ├── PutRandomRecords.groovy │ └── run.sh ├── schema_design │ └── schema_design.org ├── shell_stuff │ ├── README.md │ ├── check_new_stuff │ │ ├── alter_table_properties_async.rb │ │ ├── alter_versions_async.rb │ │ ├── colfam_async.rb │ │ ├── create_simple_table.rb │ │ ├── create_table_shorthand.rb │ │ ├── create_table_two_colfams.rb │ │ ├── create_table_two_versions.rb │ │ ├── delete.rb │ │ ├── delete_colfam_async.rb │ │ ├── get_colfam.rb │ │ ├── namespace_create.rb │ │ ├── namespace_tables.rb │ │ ├── new_shell_commands.rb │ │ ├── parameter_create_tbl.rb │ │ ├── run_parameter_script.sh │ │ ├── scan_examples.rb │ │ ├── scan_filter.rb │ │ ├── unknown_arguments_warning.rb │ │ └── versions_async.rb │ ├── inspect_HTable.rb │ ├── list_regions.rb │ └── list_tables.rb └── simpleConnection │ ├── SimpleCreateAndPut.groovy │ ├── compile.sh │ └── run.sh ├── hdfs ├── data-visibility │ ├── README.md │ └── foo.pl ├── hdfs-cheatsheet.md ├── replication │ └── run.sh └── webhdfs-httpfs │ ├── run.sh │ └── testdata.txt ├── hive ├── crlf │ ├── data.txt │ ├── data_unix.txt │ ├── get_max.sql │ └── run.sh ├── debate │ ├── analyze_debate.hql │ └── debate.txt ├── incremental_insert │ ├── README.TXT │ ├── employees.txt │ ├── join_table.sql │ ├── load_and_run.sql │ 
├── more_nicknames.txt │ ├── nicknames.txt │ └── run.sh ├── partition-example │ ├── README.TXT │ ├── create_and_load_employees.sql │ ├── employees.txt │ ├── get_partition_info.sql │ ├── partition_employees.sql │ ├── partition_employees_keep_orig_data.sql │ └── run.sh ├── simple_queries │ ├── README.md │ ├── create_tables.sql │ ├── customers.txt │ ├── load_data.sh │ ├── orders.txt │ └── subquery_in_where.sql ├── transform │ ├── awk-example.sh │ ├── legalpets.pl │ └── transform-pets.hql └── wordcount │ ├── README.TXT │ ├── README.md │ ├── compare.hql │ ├── create-external-table-for-mapreduce-output.hql │ ├── run-mr-and-hive-queries.sh │ ├── run.sh │ └── wordcount.hql ├── impala ├── README.md ├── analytic-functions │ ├── ads.txt │ ├── avg_ads.sql │ ├── avg_ads.txt │ ├── create_ads.sql │ ├── impala-version.txt │ ├── lag_ads.sql │ ├── lag_ads.txt │ └── run.sh ├── datatypes │ └── decimal_vs_integer │ │ ├── README.md │ │ ├── create_table.sql │ │ ├── data.txt │ │ ├── run.sh │ │ └── run_queries.sql ├── dyn_test │ ├── README.md │ ├── branch_totals_monday.txt │ ├── branch_totals_tuesday.txt │ ├── dyn_part.sql │ ├── run_me_hive.sh │ └── run_me_impala.sh ├── file_format_shootout │ ├── README.TXT │ ├── count.sh │ ├── create_and_populate_parquet_table.sql │ ├── create_rc_and_sequencefile_table.sql │ ├── drop_tables.sql │ ├── du_tables.sh │ ├── populate_rc_and_sequencefile_table.sql │ ├── q19.sql │ ├── run.sh │ └── run_q_19.sh ├── google-ngrams │ ├── README.md │ ├── count_spark.sql │ ├── find_spark.sql │ └── run.sh ├── impala-impyla-playground │ ├── README.TXT │ ├── data │ │ └── simple.txt │ └── simple.py ├── impyla │ └── query_impala.py ├── parquet │ ├── README.txt │ └── run.sh ├── refresh-and-invalidate │ ├── create-table.sql │ ├── monday.txt │ ├── run.sh │ ├── tuesday.txt │ └── wednesday.txt ├── simple_queries │ ├── create_tables.sql │ ├── customers.txt │ ├── load_data.sh │ ├── orders.txt │ └── subquery_in_where.sql ├── timestamps │ ├── README.md │ ├── queries.sql │ └── querying_timestamps.sql ├── tpcds │ ├── frequent_customers.sql │ └── run.sh └── tuning │ ├── compare_store_sales.sql │ ├── results.txt │ └── show_summary.sql ├── kafka-examples ├── .gitignore ├── README.md ├── THIS_IS_COOL.properties ├── log4jConfigs │ └── seekToBeginning.properties ├── pom.xml └── src │ └── main │ ├── java │ └── hadoop │ │ └── examples │ │ └── kafka │ │ ├── AdminClientExamples.java │ │ ├── SeekToBeginningListener.java │ │ ├── SimpleConsumer.java │ │ └── SimpleProducer.java │ └── resources │ └── log4j.properties ├── kite-sdk ├── README.md ├── install-kite-cli.sh └── simple-cli │ ├── README.md │ ├── run.sh │ ├── sandwich.avsc │ └── sandwiches.csv ├── kudu ├── dataframes │ └── kuduDF.scala └── range-partitioning │ ├── README.md │ ├── RUNME.sh │ ├── create_hashed_metrics.sql │ ├── create_people.sql │ └── people.txt ├── mr ├── kill_job_from_mapper │ ├── README.md │ ├── compile.sh │ ├── run.sh │ ├── solution │ │ ├── KillMapper.java │ │ └── TryKill.java │ └── somedata.txt ├── local_jobrunner │ ├── detect-local-filesystem │ │ └── LocalJobRunnerDriver.java │ └── simple-example │ │ ├── .gitignore │ │ ├── SimpleDriver.java │ │ ├── compile.sh │ │ ├── run.sh │ │ └── somedata.txt ├── map_only_streaming │ ├── mapper.pl │ └── run.sh ├── maven_project_template_CDH4 │ ├── .classpath │ ├── .project │ ├── .settings │ │ └── org.eclipse.jdt.core.prefs │ ├── README.txt │ ├── TUTORIAL │ │ └── Maven and CDH4.odt │ ├── pom.xml │ ├── pom.xml~ │ ├── src │ │ ├── main │ │ │ └── java │ │ │ │ ├── CDHTRAINING │ │ │ │ └── App.java │ │ │ │ ├── 
SumReducer.java │ │ │ │ └── WordMapper.java │ │ └── test │ │ │ └── java │ │ │ ├── CDHTRAINING │ │ │ └── AppTest.java │ │ │ └── TestWordCount.java │ └── target │ │ ├── classes │ │ ├── CDHTRAINING │ │ │ └── App.class │ │ ├── SumReducer.class │ │ └── WordMapper.class │ │ ├── surefire-reports │ │ ├── CDHTRAINING.AppTest.txt │ │ ├── TEST-CDHTRAINING.AppTest.xml │ │ ├── TEST-TestWordCount.xml │ │ └── TestWordCount.txt │ │ └── test-classes │ │ ├── CDHTRAINING │ │ └── AppTest.class │ │ └── TestWordCount.class ├── nlineinputformat │ ├── README.md │ ├── generate_task_list.py │ ├── mapper.pl │ ├── run.sh │ └── task_list.txt ├── rest_api │ └── basic.sh ├── streaming_config_dumper │ ├── mapper.pl │ ├── part-00000 │ ├── reducer.pl │ ├── run.sh │ └── something.txt ├── total_order_partitioner │ ├── .classpath │ ├── .project │ ├── .settings │ │ └── org.eclipse.jdt.core.prefs │ ├── README.txt │ ├── bin │ │ └── solution │ │ │ ├── ProcessLogs.class │ │ │ ├── domain │ │ │ └── MapperFunction.class │ │ │ └── mr │ │ │ ├── CountReducer.class │ │ │ ├── IdentityMapper.class │ │ │ ├── LogMonthMapper.class │ │ │ ├── SumReducer.class │ │ │ └── WordMapper.class │ ├── conf │ │ └── log4j.properties │ ├── src │ │ └── solution │ │ │ ├── ProcessLogs.java │ │ │ ├── domain │ │ │ └── MapperFunction.java │ │ │ └── mr │ │ │ ├── CountReducer.java │ │ │ ├── IdentityMapper.java │ │ │ ├── LogMonthMapper.java │ │ │ ├── SumReducer.java │ │ │ └── WordMapper.java │ └── tot-ord-part.jardesc └── yarn_containers │ ├── README.md │ ├── SleepJob.java │ ├── run.sh │ └── test_container_boundaries │ ├── README.md │ ├── SleepJobWithArray.java │ └── run.sh ├── pig ├── configuration │ ├── README.md │ ├── fixpig.sh │ ├── log4j.local │ ├── log4j.properties │ ├── pig.properties │ ├── pig.properties.data_analyst_vm │ └── pig0.12 │ │ ├── README.md │ │ ├── conf │ │ └── log4j.properties │ │ ├── run.sh │ │ ├── sales.pig │ │ └── sales.txt ├── explain-split-vs-filter │ ├── explain-using-dot.sh │ ├── using-filter.pig │ ├── using-split.pig │ └── webcrawl.txt ├── generate │ ├── conf │ │ └── log4j.properties │ ├── run.sh │ ├── sales.pig │ └── sales.txt ├── hcatalog │ ├── run.sh │ ├── sample_store_sales.pig │ └── store_sales.pig ├── local-mode-hacks │ ├── README.md │ ├── read_some_hdfs_data.pig │ ├── run.sh │ └── somedata.txt ├── round │ ├── data.txt │ ├── results.txt │ └── round_this.pig └── sampling │ └── sample_tpcds.pig ├── spark ├── data-generator │ └── hash-data-generator.scala ├── data-parsing │ ├── data-parsing-using-try.scala │ └── data-parsing.scala ├── dataframes │ ├── README.org │ ├── alias.scala │ ├── analyzingExerciseUsingSparkSQL.py │ ├── antiJoin.py │ ├── columnExpresssions.scala │ ├── column_rename_after_joins.py │ ├── corruptRecords.py │ ├── corruptRecords.scala │ ├── creatingDataFrames.scala │ ├── grouping.scala │ ├── hadoop-examples-data │ │ ├── 01.csv │ │ ├── data.csv │ │ ├── interests.json │ │ ├── left.csv │ │ ├── maxVals.json │ │ ├── people.json │ │ ├── right.csv │ │ └── two.csv │ ├── hdfs_partitions.py │ ├── joins.py │ ├── joins.scala │ ├── rowFunctions.scala │ ├── saveDataFrameToDataSource.scala │ ├── schemasCSV.scala │ ├── spark_sql_udfs.py │ ├── windowing.py │ └── withColumn.scala ├── get-python-examples.sh ├── local_file.scala ├── log-level │ ├── README.md │ ├── log4j.properties │ ├── log4j.properties.with.debug.log.level │ └── run.sh ├── maven_example │ ├── README.md │ └── pom.xml ├── pair │ ├── sales.txt │ ├── sales_by_salesperson.scala │ ├── weblogs.scala │ └── weblogs.txt ├── run.sh ├── simple │ ├── core-site.xml │ ├── count.scala │ 
├── data.txt │ └── log4j.properties ├── somedata.txt ├── spark-sql-scripts │ ├── README.md │ ├── computeStats.md │ ├── create_table_and_load.scala │ ├── create_table_and_load_parquet.scala │ ├── data │ │ └── people.json │ └── parse_json.scala ├── spark-sql │ ├── README.md │ ├── data │ │ └── favorite_foods │ │ │ └── favorite_foods.txt │ ├── pom.xml │ └── src │ │ └── main │ │ ├── resource │ │ └── log4j.properties │ │ └── scala │ │ └── examples │ │ ├── SimpleShowTables.scala │ │ ├── explode_and_friends.scala │ │ └── more_sql.scala ├── sparkml │ └── kmeans.py ├── structured_streaming_sensors │ ├── rate_source_simple.py │ └── rate_source_with_more_data.py ├── tf-idf │ └── tf-idf.spark ├── wordcount.scala └── wordlength_with_details │ ├── data.txt │ ├── log4j.properties │ └── wordlength_with_details.scala ├── sql-diffs ├── .gitignore ├── README.md ├── accts.txt ├── load_data.sh ├── query1.sql ├── query2.sql ├── query3.sql ├── query4.sql ├── query5.sql └── run_queries.sh ├── sqoop ├── README.md └── sqoop-job-create └── utils ├── random_crash └── run.sh ├── setup_env.sh ├── sleepjob ├── allocations.xml ├── bigjob.sh ├── submitLONGSleepJob.sh └── submitReportsBOSSPool.sh └── teragen-and-terasort.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | .DS_Store 3 | *.class 4 | *.jar 5 | *.pyc 6 | *.log 7 | *.*~ 8 | metastore_db 9 | *.classpath 10 | *.project 11 | target/ 12 | tags 13 | .settings 14 | index.html 15 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "impala-tpcds-kit"] 2 | path = impala-tpcds-kit 3 | url = hadoop-examples:NathanNeff/impala-tpcds-kit.git 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Hadoop Examples 2 | =============== 3 | 4 | Hadoop Examples is a set of simple example scripts to illustrate Hadoop ecosystem 5 | tools like Hive and Pig. 6 | 7 | Installation 8 | ------------- 9 | EXAMPLES_DIR is an environment variable you can set to point to the directory 10 | where the hadoop-examples.jar is installed. 11 | 12 | There is also a script: utils/setup_env.sh that can be sourced inside other 13 | shell scripts to try to find the hadoop-examples.jar. It is ugly, but 14 | sometimes convenient :-/ 15 | 16 | # Release Notes 17 | 18 | ## HBase Block Size 19 | 20 | November 2016 21 | 22 | HBase Block Size utility =hbase/hbase_blocks/hbase_blocks.rb= creates a table with 23 | a specified HBase block size. Writes data, flushes, then uses admin object to 24 | get the region name. Displays exact command =hbase hfile= to use to view the store 25 | file's index. Some okay/kinda cool JRuby stuff there. 26 | 27 | 28 | Streaming Config Dumper 29 | ----------------------- 30 | 31 | MapReduce scripts to print their ENV variables, which also include 32 | Hadoop configuration stuff for streaming jobs. 
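As a rough illustration (not part of the repo — the bundled mapper under =mr/streaming_config_dumper/= is written in Perl), a Hadoop Streaming mapper that dumps its environment can be as small as the following Python sketch:

    #!/usr/bin/env python
    # Emit every environment variable the streaming task was started with,
    # one tab-separated name/value pair per line.
    import os
    import sys

    for name, value in sorted(os.environ.items()):
        sys.stdout.write("%s\t%s\n" % (name, value))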
33 | 34 | See =mr/streaming_config_dumper/= 35 | 36 | 37 | Hive and Pig 38 | ------------ 39 | 12/20/2013 40 | 41 | - Incremental insert example in Hive 42 | Inserts non-duplicate data into a join table from incrementally updated 43 | source tables See hive/incremental_insert/ 44 | 45 | - Added example of Pig's EXPLAIN command to show a diagram of the execution plan 46 | for SPLIT versus FILTER 47 | See pig/explain-split-vs-filter/ 48 | 49 | - Added example of Hive's PARTITION feature 50 | See hive/partition-example/ 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /avro/.gitignore: -------------------------------------------------------------------------------- 1 | users.avro 2 | -------------------------------------------------------------------------------- /avro/README.org: -------------------------------------------------------------------------------- 1 | Tutorial from: 2 | http://avro.apache.org/docs/current/gettingstartedpython.html 3 | 4 | And: 5 | http://www.harshj.com/2010/04/25/writing-and-reading-avro-data-files-using-python/ 6 | -------------------------------------------------------------------------------- /avro/devices/README.md: -------------------------------------------------------------------------------- 1 | The avro directory was copied from an Avro egg that 2 | came with Hue 3 | -------------------------------------------------------------------------------- /avro/devices/avro/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
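# NOTE: this vendored copy of the Avro Python package was taken from the Avro
# egg shipped with Hue (see ../README.md); it is not maintained in this repo.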
16 | 17 | __all__ = ['schema', 'io', 'datafile', 'protocol', 'ipc'] 18 | 19 | -------------------------------------------------------------------------------- /avro/devices/devices.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/avro/devices/devices.avro -------------------------------------------------------------------------------- /avro/devices/read_devices.py: -------------------------------------------------------------------------------- 1 | import avro.schema 2 | from avro.datafile import DataFileReader, DataFileWriter 3 | from avro.io import DatumReader, DatumWriter 4 | 5 | reader = DataFileReader(open("devices.avro", "r"), DatumReader()) 6 | for device in reader: 7 | print device 8 | reader.close() 9 | -------------------------------------------------------------------------------- /avro/download_avro.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | AVRO_TARBALL=avro-1.7.4.tar.gz 3 | MD5FILE=avro-1.7.4.tar.gz.md5 4 | MIRROR=http://www.eng.lsu.edu/mirrors/apache/avro/avro-1.7.4/py 5 | 6 | test -f $MD5FILE || wget http://www.us.apache.org/dist/avro/stable/py/$MD5FILE 7 | test -f $AVRO_TARBALL || wget $MIRROR/$AVRO_TARBALL 8 | md5sum -c $MD5FILE 9 | -------------------------------------------------------------------------------- /avro/read_users.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import avro.schema 3 | from avro.datafile import DataFileReader 4 | from avro.io import DatumReader 5 | 6 | reader = DataFileReader(open("users.avro", "r"), DatumReader()) 7 | for user in reader: 8 | print user 9 | reader.close() 10 | -------------------------------------------------------------------------------- /avro/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Find your own eggs on your system 3 | EGG_DIR=/opt/cloudera/parcels/CDH-5.1.2-1.cdh5.1.2.p0.3/lib/hue/build/env/lib/python2.6/site-packages/avro-1.7.6-py2.6.egg 4 | export PYTHONPATH=$EGG_DIR 5 | python ./write_bunch_of_users.py 6 | python ./read_users.py 7 | -------------------------------------------------------------------------------- /avro/user.avsc: -------------------------------------------------------------------------------- 1 | {"namespace": "example.avro", 2 | "type": "record", 3 | "name": "User", 4 | "fields": [ 5 | {"name": "fullname", "type": "string"}, 6 | {"name": "favorite_number", "type": ["int", "null"]}, 7 | {"name": "favorite_color", "type": ["string", "null"]} 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /avro/write_bunch_of_users.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import avro.schema 3 | from avro.datafile import DataFileReader, DataFileWriter 4 | from avro.io import DatumReader, DatumWriter 5 | 6 | schema = avro.schema.parse(open("user.avsc").read()) 7 | writer = DataFileWriter(open("users.avro", "w"), DatumWriter(), schema) 8 | 9 | dictionary_file = open('/usr/share/dict/words', 'r') 10 | 11 | for word in dictionary_file: 12 | print "Adding " + word 13 | writer.append({"fullname": word, "favorite_number": len(word)}) 14 | if word > "l": 15 | break 16 | writer.close() 17 | -------------------------------------------------------------------------------- 
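Aside, not part of the repo: the Avro scripts in this directory target Python 2 (note the `print user` statements and text-mode file handles). A minimal sketch of the same write/read round trip under Python 3, assuming a recent pip-installed `avro` package (1.10+):

    #!/usr/bin/env python3
    # Sketch: Python 3 version of the write/read round trip from
    # write_users.py and read_users.py; files are opened in binary mode.
    import avro.schema
    from avro.datafile import DataFileReader, DataFileWriter
    from avro.io import DatumReader, DatumWriter

    schema = avro.schema.parse(open("user.avsc", "rb").read())

    writer = DataFileWriter(open("users.avro", "wb"), DatumWriter(), schema)
    writer.append({"fullname": "Alyssa", "favorite_number": 256})
    writer.append({"fullname": "Ben", "favorite_number": 7, "favorite_color": "red"})
    writer.close()

    reader = DataFileReader(open("users.avro", "rb"), DatumReader())
    for user in reader:
        print(user)
    reader.close()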
/avro/write_users.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import avro.schema 3 | from avro.datafile import DataFileReader, DataFileWriter 4 | from avro.io import DatumReader, DatumWriter 5 | 6 | schema = avro.schema.parse(open("user.avsc").read()) 7 | 8 | writer = DataFileWriter(open("users.avro", "w"), DatumWriter(), schema) 9 | writer.append({"fullname": "Alyssa", "favorite_number": 256}) 10 | writer.append({"fullname": "Ben", "favorite_number": 7, "favorite_color": "red"}) 11 | writer.close() 12 | -------------------------------------------------------------------------------- /cloudera-director/api-examples/list-instances.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | from cloudera.director.common.client import ApiClient 3 | from cloudera.director.latest import AuthenticationApi, EnvironmentsApi, DeploymentsApi, ClustersApi 4 | from cloudera.director.latest.models import Login 5 | 6 | client = ApiClient("http://localhost:7189") 7 | AuthenticationApi(client).login(Login(username="admin", password="admin")) 8 | for envName in EnvironmentsApi(client).list(): 9 | print "Environment: %s" % envName 10 | if DeploymentsApi(client).list(envName): 11 | for depName in DeploymentsApi(client).list(envName): 12 | print "\tDeployment: %s" % depName 13 | if ClustersApi(client).list(envName, depName): 14 | for clusterName in ClustersApi(client).list(envName, depName): 15 | print "\t\tCluster: %s" % clusterName 16 | cluster = ClustersApi(client).get(envName, depName, clusterName) 17 | if cluster.instances: 18 | for instance in cluster.instances: 19 | print "\t\t\tInstance: %s %s" % (instance.properties['publicIpAddress'], instance.health.status) 20 | 21 | -------------------------------------------------------------------------------- /cloudera-manager/python_rest_api/README.md: -------------------------------------------------------------------------------- 1 | # Python REST API for Cloudera Manager 2 | 3 | These are my dabblings for using the Python interface to 4 | work with the Cloudera REST API 5 | 6 | # Installation 7 | 8 | Basically, I ran =sudo pip install cm_api= 9 | 10 | - http://cloudera.github.io/cm_api/ 11 | 12 | - Great examples of installation and usage 13 | - http://cloudera.github.io/cm_api/docs/python-client/ 14 | 15 | 16 | - When you deal with the results of these Python calls, the "objects" fit this model: 17 | - http://cloudera.github.io/cm_api/epydoc/5.0.0/index.html 18 | - http://cloudera.github.io/cm_api/apidocs/v6/model.html 19 | 20 | - The terms used by Cloudera Manager, like "Roles", "Role Groups" etc. are detailed here, 21 | and it pays off big-time to understand the hierarchy and relationships between these entities 22 | 23 | http://www.cloudera.com/content/cloudera-content/cloudera-docs/CM5/latest/Cloudera-Manager-Introduction/cm5i_primer.html?scroll=concept_wfj_tny_jk_unique_1 24 | 25 | # Examples 26 | 27 | ## Simple List Cluster Properties 28 | - Prints names of the clusters that are managed by CM 29 | - Dives a bit into the properties of clusters, like hosts, roles, etc. 
30 | - [Simple Cluster Properties](simple_cluster_properties.py) 31 | 32 | ## Dump Cluster Configurations 33 | - [CM Dump Config](cm-dump-config.py) 34 | 35 | ## Show Configuration for Role Instance 36 | - Shows configuration settings for a specific DataNode in a cluster 37 | - [Simple Configuration Settings Example](simple_config_settings.py) 38 | 39 | 40 | -------------------------------------------------------------------------------- /cloudera-manager/python_rest_api/simple_cluster_properties.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | from cm_api.api_client import ApiResource 4 | 5 | cm_host = "" 6 | if len(sys.argv) > 1: 7 | cm_host = sys.argv[1] 8 | else: 9 | sys.stderr.write("Usage: simple_cluster_properties.py ") 10 | sys.exit(1) 11 | 12 | api = ApiResource(cm_host, username="admin", password="admin") 13 | 14 | def printClusterNames(): 15 | for c in api.get_all_clusters(): 16 | print "Cluster \"%s\" is version %s" % (c.name, c.version) 17 | 18 | # Host Object Model http://cloudera.github.io/cm_api/apidocs/v6/ns0_apiHost.html 19 | def printClusterHosts(): 20 | for c in api.get_all_clusters(): 21 | # cluster.get_all_hosts returns ApiHostRefs, which need to be looked up 22 | print "Hosts in cluster \"%s\" are: " % c.name 23 | for host_ref in c.list_hosts(): 24 | host = api.get_host(host_ref.hostId) 25 | print host.hostname 26 | 27 | def printHostTemplates(host_template_name): 28 | for c in api.get_all_clusters(): 29 | print c.get_all_host_templates() 30 | host_template = c.get_host_template(host_template_name) 31 | if host_template is not None: 32 | print "I found host template \"%s\":" % host_template_name 33 | print host_template 34 | 35 | printClusterNames() 36 | printClusterHosts() 37 | printHostTemplates("ThisGuy") 38 | -------------------------------------------------------------------------------- /cloudera-manager/python_rest_api/simple_config_settings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Display configuration settings for DataNodes 3 | import sys 4 | from cm_api.api_client import ApiResource 5 | 6 | cm_host = None 7 | cluster_name = None 8 | 9 | if len(sys.argv) > 2: 10 | cm_host = sys.argv[1] 11 | cluster_name = sys.argv[2] 12 | else: 13 | sys.stderr.write("Usage: simple_config_settings.py ") 14 | sys.exit(1) 15 | 16 | api = ApiResource(cm_host, username="admin", password="admin") 17 | 18 | # The service api must be retrieved from the cluster api 19 | def printDataNodeConfig(): 20 | c = api.get_cluster(cluster_name) 21 | dn_groups = [] 22 | hdfs = None 23 | for service in c.get_all_services(): 24 | if service.type == "HDFS": 25 | hdfs = service 26 | 27 | for group in hdfs.get_all_role_config_groups(): 28 | if group.roleType == 'DATANODE': 29 | dn_groups.append(group) 30 | 31 | for cg in dn_groups: 32 | print "Found config group: " + cg.name 33 | 34 | dn_config = dn_groups[0].get_config(view='full') 35 | 36 | print "Each datanode will store data on these local directories: \n%s" % dn_config['dfs_data_dir_list'].value 37 | print "Each datanode can use up to this amount on each disk for HDFS: \n%s" % dn_config['dfs_datanode_du_reserved'].value 38 | 39 | printDataNodeConfig() 40 | -------------------------------------------------------------------------------- /data-engineering/README.md: -------------------------------------------------------------------------------- 1 | # Comparison of Hive / Impala in CDP 7.2 2 
| 3 | https://docs.cloudera.com/runtime/7.2.0/using-hiveql/topics/hive-orc-parquet-compare.html 4 | # Run the following in order 5 | 6 | 1) create_txnl_tbl.sh 7 | 8 | 1) insert_into_products_txnl.sh 9 | 10 | -- Impala currently does not query fully transactional tables 11 | -- (Coming soon) 12 | -- https://issues.apache.org/jira/browse/IMPALA-9042 13 | 14 | 1) query_txnl_using_impala.sh 15 | 16 | -- Hive does 17 | 1) query_txnl_using_hive.sh 18 | 19 | -- View directory structure in table 20 | 1) ./view_dir_structure.sh 21 | 22 | -- Update data in txnl table 23 | 1) ./update_txnl_using_hive.sh 24 | 25 | -- View directory structure in table 26 | 1) ./view_dir_structure.sh 27 | 28 | -------------------------------------------------------------------------------- /data-engineering/create_txnl_tbl.hql: -------------------------------------------------------------------------------- 1 | USE analyst; 2 | 3 | DROP TABLE IF EXISTS products_txnl; 4 | CREATE TABLE `products_txnl`( 5 | `prod_id` int, 6 | `brand` string, 7 | `name` string, 8 | `price` int, 9 | `cost` int, 10 | `shipping_wt` int); 11 | 12 | DESCRIBE FORMATTED products_txnl; 13 | 14 | -------------------------------------------------------------------------------- /data-engineering/create_txnl_tbl.sh: -------------------------------------------------------------------------------- 1 | beeline --verbose=false -u jdbc:hive2://localhost:10000 -f create_txnl_tbl.hql 2 | -------------------------------------------------------------------------------- /data-engineering/insert_into_products_txnl.hql: -------------------------------------------------------------------------------- 1 | USE analyst; 2 | INSERT INTO products_txnl 3 | SELECT * from products; 4 | -------------------------------------------------------------------------------- /data-engineering/insert_into_products_txnl.sh: -------------------------------------------------------------------------------- 1 | beeline --verbose=false --report -u jdbc:hive2://localhost:10000 -f insert_into_products_txnl.hql 2 | -------------------------------------------------------------------------------- /data-engineering/query_txnl.hql: -------------------------------------------------------------------------------- 1 | USE analyst; 2 | SELECT MIN(price) FROM analyst.products_txnl; 3 | -------------------------------------------------------------------------------- /data-engineering/query_txnl_using_hive.sh: -------------------------------------------------------------------------------- 1 | beeline --verbose=false -u jdbc:hive2://localhost:10000 -f "query_txnl.hql" 2 | cat query_txnl.hql 3 | -------------------------------------------------------------------------------- /data-engineering/query_txnl_using_impala.sh: -------------------------------------------------------------------------------- 1 | impala-shell -q "REFRESH analyst.products_txnl;" 2 | impala-shell -f query_txnl.hql 3 | -------------------------------------------------------------------------------- /data-engineering/update_txnl_using_hive.hql: -------------------------------------------------------------------------------- 1 | USE analyst; 2 | UPDATE products_txnl 3 | SET price = 100 4 | WHERE price < 100; 5 | -------------------------------------------------------------------------------- /data-engineering/update_txnl_using_hive.sh: -------------------------------------------------------------------------------- 1 | beeline --verbose=false -u jdbc:hive2://localhost:10000 -f update_txnl_using_hive.hql 2 | 
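Aside (not part of the repo): the query in query_txnl.hql can also be submitted over HiveServer2 from Python using impyla, which this repo already uses under impala/impyla/. A minimal sketch, assuming an unsecured HiveServer2 on localhost:10000 — the auth_mechanism and credentials shown here are assumptions and must match the actual HiveServer2 configuration:

    #!/usr/bin/env python
    # Sketch: run the transactional-table query from Python via HiveServer2.
    from impala.dbapi import connect

    conn = connect(host="localhost", port=10000,
                   auth_mechanism="PLAIN", user="hive", password="anything")
    cur = conn.cursor()
    cur.execute("SELECT MIN(price) FROM analyst.products_txnl")
    print(cur.fetchall())
    cur.close()
    conn.close()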
-------------------------------------------------------------------------------- /data-engineering/view_dir_structure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hdfs dfs -ls /warehouse/tablespace/managed/hive/analyst.db/products_txnl 3 | -------------------------------------------------------------------------------- /hbase/apitests/README.md: -------------------------------------------------------------------------------- 1 | # Java example of using HBaseConfiguration.create() 2 | 3 | ## Compile using Maven: 4 | 5 | mvn compile 6 | 7 | ## To run: 8 | 9 | cd target/classes 10 | java -cp `hbase classpath` apitests.CreateTable [useold] 11 | 12 | 13 | -------------------------------------------------------------------------------- /hbase/apitests/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | apitests 6 | apitests 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | apitests 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | cloudera 20 | https://repository.cloudera.com/artifactory/cloudera-repos/ 21 | 22 | 23 | 24 | 25 | 26 | junit 27 | junit 28 | 3.8.1 29 | test 30 | 31 | 32 | 33 | org.apache.hbase 34 | hbase-client 35 | 0.98.6-cdh5.2.0 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | org.codehaus.mojo 44 | exec-maven-plugin 45 | 1.4.0 46 | 47 | java 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /hbase/apitests/run.sh: -------------------------------------------------------------------------------- 1 | java -cp `hbase classpath`:target/classes apitests.CreateTable 2 | -------------------------------------------------------------------------------- /hbase/client_maven_example/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | hbaseExample 4 | hbaseExample 5 | 0.0.1-SNAPSHOT 6 | 7 | 0.98.6-cdh5.2.0 8 | 9 | 10 | 11 | cloudera 12 | https://repository.cloudera.com/artifactory/cloudera-repos/ 13 | 14 | 15 | 16 | 17 | org.apache.hbase 18 | hbase-client 19 | ${hbase.version} 20 | 21 | 22 | mysql 23 | mysql-connector-java 24 | 8.0.16 25 | 26 | 27 | 28 | src 29 | 30 | 31 | maven-compiler-plugin 32 | 3.1 33 | 34 | 1.7 35 | 1.7 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /hbase/colfam_flush/.gitignore: -------------------------------------------------------------------------------- 1 | hbase/ 2 | thrift/ 3 | -------------------------------------------------------------------------------- /hbase/colfam_flush/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | Script to insert random words into HBase 4 | 5 | # The HBase Python Thrift libraries must be installed 6 | 7 | For this script to work. 8 | 9 | # TODO 10 | 11 | - Find what the licensing is on the Python/Hbase thrift libraries and 12 | distribute with this code 13 | 14 | -------------------------------------------------------------------------------- /hbase/groovy/loadRandomData/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | These scripts are written using the Groovy programming language. 4 | 5 | Groovy is an excellent language that runs on the Java Virtual Machine. 
6 | 7 | Simple instructions for installing Groovy can be found at: 8 | http://groovy.codehaus.org/Installing+Groovy 9 | 10 | # Movie Lens Data 11 | 12 | ./loadRandomData.groovy relies on the Movie Lens dataset: 13 | 14 | - http://files.grouplens.org/datasets/movielens/ml-10m-README.html 15 | -------------------------------------------------------------------------------- /hbase/groovy/loadRandomData/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Just compile/syntax check the file 3 | groovyc -cp `hadoop classpath`:`hbase classpath` ./loadRandomData.groovy 4 | -------------------------------------------------------------------------------- /hbase/groovy/loadRandomData/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | groovy -classpath `hbase classpath` loadRandomData.groovy 3 | -------------------------------------------------------------------------------- /hbase/hbase-sandbox/README.md: -------------------------------------------------------------------------------- 1 | # HBase Sandbox 2 | 3 | ## Stop Row Thing 4 | 5 | Java class that creates a table, splits it into X number of regions, then performs various Scans. 6 | Scans use no start/stop row, stop row and PrefixFilter to demonstrate the # of regions that are 7 | scanned and rows returned using various combinations of start/stop row, and Prefix filters. 8 | 9 | To run, use: 10 | 11 | ./runStopRowThing.sh 12 | 13 | You can use Eclipse to work with the code. To create an Eclipse project, use: 14 | 15 | mvn eclipse:eclipse 16 | 17 | It should then be possible to use Eclipse to modify / run the Java source code. 18 | 19 | Log level can be tuned by editing src/main/resources/log4j.properties and 20 | setting hbase.root.logger=INFO,console 21 | -------------------------------------------------------------------------------- /hbase/hbase-sandbox/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | hbasesandbox 6 | hbasesandbox 7 | 1.0-SNAPSHOT 8 | jar 9 | 10 | HBase Sandbox 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | cloudera 20 | https://repository.cloudera.com/artifactory/cloudera-repos/ 21 | 22 | 23 | 24 | 25 | 26 | junit 27 | junit 28 | 3.8.1 29 | test 30 | 31 | 32 | 33 | org.apache.hbase 34 | hbase-client 35 | 0.98.6-cdh5.2.0 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | org.codehaus.mojo 44 | exec-maven-plugin 45 | 1.4.0 46 | 47 | java 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /hbase/hbase-sandbox/runStopRowThing.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | mvn package 4 | java -cp target/hbasesandbox-1.0-SNAPSHOT.jar:`hbase classpath` hbasesandbox.StopRowThing 5 | -------------------------------------------------------------------------------- /hbase/hbase_hive_impala/README.org: -------------------------------------------------------------------------------- 1 | * This assumes that data is in HBase 2 | in the users.ratings column family 3 | 4 | key column = movieid 5 | 6 | userid value = rating 7 | 8 | Example, bob rated movie #100 a 5 and rated movie #123 with a 4: 9 | 10 | key 11 | 12 | bob 100:5, 123:4 13 | -------------------------------------------------------------------------------- /hbase/hbase_hive_impala/create_hbase_table.rb: 
-------------------------------------------------------------------------------- 1 | # Create HBase Table 2 | def drop_if_exists_and_create(name, *args) 3 | if @hbase.admin(@formatter).exists?(name.to_s) 4 | @hbase.admin(@formatter).disable name 5 | @hbase.admin(@formatter).drop name 6 | puts "Droppped table: " + name 7 | end 8 | 9 | @hbase.admin(@formatter).create name, *args 10 | puts "Created table: " + name + "\n\n" 11 | end 12 | 13 | drop_if_exists_and_create 'nate_hbase_sales_grouped', { NAME => 'cf1' } 14 | drop_if_exists_and_create 'nate_hbase_movie_ratings', { NAME => 'ratings' } 15 | 16 | put 'nate_hbase_movie_ratings', 'nate', 'ratings:star_wars', '5' 17 | put 'nate_hbase_movie_ratings', 'nate', 'ratings:clone_wars', '1' 18 | 19 | put 'nate_hbase_movie_ratings', 'steve', 'ratings:star_wars', '5' 20 | put 'nate_hbase_movie_ratings', 'steve', 'ratings:clone_wars', '1' 21 | 22 | put 'nate_hbase_movie_ratings', 'dumbo', 'ratings:star_wars', '1' 23 | put 'nate_hbase_movie_ratings', 'dumbo', 'ratings:clone_wars', '5' 24 | 25 | put 'nate_hbase_movie_ratings', 'suzie', 'ratings:beaches', '4' 26 | put 'nate_hbase_movie_ratings', 'suzie', 'ratings:magnolia', '4' 27 | 28 | exit 29 | -------------------------------------------------------------------------------- /hbase/hbase_hive_impala/create_hive_hbase_table.sql: -------------------------------------------------------------------------------- 1 | -- The following JARs might need to be added. 2 | -- Use Hive's ADD JAR command 3 | -- zookeeper.jar; 4 | -- hive-hbase-handler.jar; 5 | -- guava-11.0.2.jar; 6 | -- hbase-client.jar; 7 | -- hbase-common.jar; 8 | -- hbase-hadoop-compat.jar; 9 | -- hbase-hadoop2-compat.jar; 10 | -- hbase-protocol.jar; 11 | -- hbase-server.jar; 12 | -- htrace-core.jar; 13 | 14 | DROP TABLE IF EXISTS sales_grouped; 15 | 16 | CREATE EXTERNAL TABLE sales_grouped 17 | (customer_id INT, 18 | total_sales INT) 19 | STORED BY 20 | 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' 21 | WITH SERDEPROPERTIES ("hbase.columns.mapping" = 22 | ":key, 23 | cf1:total_sales") 24 | TBLPROPERTIES 25 | ("hbase.table.name" = "hbase_sales_grouped"); 26 | 27 | 28 | DROP TABLE IF EXISTS the_movie_ratings; 29 | 30 | -- ratings is simply the entire 'ratings' column family 31 | CREATE EXTERNAL TABLE the_movie_ratings 32 | (userid STRING, 33 | movie_ratings MAP) 34 | STORED BY 35 | 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' 36 | WITH SERDEPROPERTIES ("hbase.columns.mapping" = 37 | ":key, 38 | ratings:") -- just map entire column family to the movie_ratings MAP 39 | TBLPROPERTIES 40 | ("hbase.table.name" = "nate_hbase_movie_ratings"); 41 | -------------------------------------------------------------------------------- /hbase/hbase_hive_impala/map-hive-to-hbase-ratings.sql: -------------------------------------------------------------------------------- 1 | -- You may need to set hbase.zookeeper.quorum 2 | set hbase.zookeeper.quorum= 3 | 4 | -- The following JARs may be necessary, and distro/version dependent 5 | -- hbase-0.94.6--security.jar; 6 | -- hive-hbase-handler-0.10.0-.jar; 7 | 8 | CREATE EXTERNAL TABLE IF NOT EXISTS hbase_ratings 9 | (userid int, ratings MAP, lname STRING) 10 | STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' 11 | WITH SERDEPROPERTIES ("hbase.columns.mapping" =":key, ratings:, info:lname") 12 | TBLPROPERTIES ("hbase.table.name" = "user"); 13 | 14 | -- Find ratings for movie ID 400 15 | SELECT userid, ratings['2997'] FROM hbase_ratings WHERE ratings['2997'] IS NOT NULL; 16 | 17 | CREATE TABLE IF NOT 
EXISTS exported_hbase_ratings(userid int, movieid int, rating int) 18 | -------------------------------------------------------------------------------- /hbase/hbase_hive_impala/put_data_into_sales_aggregate_table.sh: -------------------------------------------------------------------------------- 1 | # Create HBase Table 2 | create 'hbase_sales_grouped', 'cf1' 3 | -------------------------------------------------------------------------------- /hbase/hbase_hive_impala/select_some_ratings.sql: -------------------------------------------------------------------------------- 1 | -- These add jar statements will vary, depending on distro 2 | -- zookeeper.jar; 3 | -- hive-hbase-handler.jar; 4 | -- guava-11.0.2.jar; 5 | -- hbase-client.jar; 6 | -- hbase-common.jar; 7 | -- hbase-hadoop-compat.jar; 8 | -- hbase-hadoop2-compat.jar; 9 | -- hbase-protocol.jar; 10 | -- hbase-server.jar; 11 | -- htrace-core.jar; 12 | 13 | -- SELECT * FROM the_movie_ratings WHERE movie_ratings['star_wars'] IS NOT NULL; 14 | 15 | -- Find top 2 users with the most movie ratings 16 | SELECT userid, MAP_KEYS(movie_ratings) AS the_count 17 | FROM the_movie_ratings; 18 | 19 | -- WHERE movie_ratings['star_wars'] IS NOT NULL 20 | -- GROUP BY the_count 21 | -- ORDER BY the_count DESC 22 | -- LIMIT 2; 23 | 24 | -------------------------------------------------------------------------------- /hbase/hotSpots/README.md: -------------------------------------------------------------------------------- 1 | # hotSpots 2 | 3 | Groovy scripts to create hotspots 4 | -------------------------------------------------------------------------------- /hbase/hotSpots/TransactionFactory.groovy: -------------------------------------------------------------------------------- 1 | public class TransactionFactory { 2 | 3 | Random rand = new Random() 4 | 5 | def curPrice = 100 6 | 7 | def tickers = [ 8 | [ symbol : 'IBM', price : 100], 9 | [ symbol : 'AAPL', price : 200], 10 | [ symbol : 'MSFT', price : 300], 11 | [ symbol : 'INTC', price : 50], 12 | [ symbol : 'BWLD', price : 200] 13 | ] 14 | 15 | def getNewTrans() { 16 | def dt = new Date() 17 | def ticker = tickers[rand.nextInt(tickers.size())] 18 | def thekey = dt.format('yyyy/MM/dd HH:mm:ss:SS') + ' ' + ticker['symbol'] 19 | 20 | return [ key: thekey, dt: dt.time.toString(), price:ticker['price'], symbol:ticker['symbol'] ] 21 | } 22 | 23 | 24 | } 25 | -------------------------------------------------------------------------------- /hbase/hotSpots/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | groovyc TransactionFactory.groovy 3 | -------------------------------------------------------------------------------- /hbase/hotSpots/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | groovy --classpath `hbase classpath` ./TransactionImporter.groovy --numRows=10000000 3 | -------------------------------------------------------------------------------- /hbase/null_safe_joins/data.rb: -------------------------------------------------------------------------------- 1 | create 'hbase_a', 'cf1' 2 | create 'hbase_b', 'cf1' 3 | 4 | put 'hbase_a', '1', 'cf1:record_id', '1' 5 | put 'hbase_a', '1', 'cf1:record_name', 'bob' 6 | 7 | put 'hbase_b', '1', 'cf1:record_id', '1' 8 | put 'hbase_b', '1', 'cf1:record_name', 'B robert' 9 | 10 | # record_id for bob in both tables where record_id is NULL 11 | put 'hbase_a', '2', 'cf1:record_name', 'steve' 12 | put 'hbase_b', '2', 'cf1:record_name', 'steves 
record' 13 | 14 | exit 15 | -------------------------------------------------------------------------------- /hbase/random_words_python/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | Script to insert random words into HBase 4 | 5 | # TODO 6 | 7 | Find what the licensing is on the Python/Hbase thrift libraries and distribute 8 | with this code 9 | 10 | -------------------------------------------------------------------------------- /hbase/random_words_python/insert_random_words.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from thrift.transport import TSocket 4 | from thrift.protocol import TBinaryProtocol 5 | from thrift.transport import TTransport 6 | from hbase import Hbase 7 | import os 8 | import os.path 9 | import random 10 | import sys 11 | 12 | if len(sys.argv) > 1: 13 | thriftserver = sys.argv[1] 14 | else: 15 | thriftserver = "localhost" 16 | 17 | random.seed() 18 | words = [line.strip() for line in open('/usr/share/dict/words')] 19 | max = len(words) - 1 20 | 21 | # Connect to HBase Thrift server 22 | # Assumes that thrift server is localhost 23 | transport = TTransport.TBufferedTransport(TSocket.TSocket(thriftserver, 9090)) 24 | protocol = TBinaryProtocol.TBinaryProtocolAccelerated(transport) 25 | 26 | # Create and open the client connection 27 | client = Hbase.Client(protocol) 28 | transport.open() 29 | 30 | # Create a list of mutations per batch 31 | mutationsbatch = [] 32 | 33 | # Create 1 billion data rows 34 | num_rows = 1000000 35 | batchsize = 10000 36 | 37 | mutationsbatch = [] 38 | for x in range(0, num_rows - 1): 39 | if x % batchsize == 0: 40 | print "Pushing " + str(x) 41 | client.mutateRows("words", mutationsbatch) 42 | mutationsbatch = [] 43 | r = random.randint(0, max) 44 | 45 | row = [] 46 | 47 | # Add this cell 48 | row.append(Hbase.Mutation(column="w:" + words[r] + "@" + str(x), value=str(1))) 49 | 50 | thisword = words[r] 51 | mutationsbatch.append(Hbase.BatchMutation(row=thisword + str(r),mutations=row)) 52 | 53 | # Run the mutations for the words 54 | client.mutateRows("words", mutationsbatch) 55 | 56 | transport.close() 57 | -------------------------------------------------------------------------------- /hbase/recordGenerator/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ~/tools/groovy/bin/groovy -classpath `hbase classpath` ./PutRandomRecords.groovy 3 | -------------------------------------------------------------------------------- /hbase/shell_stuff/README.md: -------------------------------------------------------------------------------- 1 | This is playing around with the hbase shell, and trying to 2 | get to underlying Java objects. 3 | 4 | To run these things, use: 5 | 6 | hbase shell 7 | 8 | 9 | Example: 10 | 11 | hbase shell inspect_HTable.rb 12 | 13 | # Run with parameters 14 | 15 | The check_new_stuff/run_parameter_script.sh is rudimentary example of invoking hbase/ruby 16 | script with parameters. 17 | 18 | 19 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/alter_table_properties_async.rb: -------------------------------------------------------------------------------- 1 | tbl = "test_movie" 2 | if @hbase.admin(@formatter).exists?(tbl) 3 | puts "Table #{tbl} already exists. 
Please drop or use different table" 4 | exit 1 5 | end 6 | 7 | create tbl, { NAME => 'desc'}, { READONLY => 'true' } 8 | put tbl, 'Star Wars', 'desc:title', 'Star Wars' 9 | 10 | alter_async tbl, READONLY => 'false' 11 | put tbl, 'Star Wars', 'desc:title', 'Star Wars' 12 | 13 | disable tbl 14 | drop tbl 15 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/alter_versions_async.rb: -------------------------------------------------------------------------------- 1 | tbl = "test_movie" 2 | disable tbl 3 | drop tbl 4 | if @hbase.admin(@formatter).exists?(tbl) 5 | puts "Table #{tbl} already exists. Please drop or use different table" 6 | exit 1 7 | end 8 | 9 | create tbl, { NAME => 'desc' } 10 | put tbl, 'Star Wars', 'desc:title', 'Star Wars' 11 | put tbl, 'Star Wars', 'desc:title', 'Star Wars:A New Hope' 12 | 13 | get tbl, 'Star Wars', { COLUMNS => 'desc:title', VERSIONS => 2 } 14 | 15 | alter_async tbl, NAME => 'desc', VERSIONS => 2 16 | put tbl, 'Star Wars', 'desc:title', 'Star Wars:A New New Hope' 17 | get tbl, 'Star Wars', { COLUMNS => 'desc:title', VERSIONS => 2 } 18 | 19 | puts "Checking alter status" 20 | alter_status tbl 21 | 22 | disable tbl 23 | drop tbl 24 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/colfam_async.rb: -------------------------------------------------------------------------------- 1 | tbl = 'nate_alter' 2 | 3 | if @hbase.admin(@formatter).exists?(tbl) 4 | puts "Table '#{tbl}' already exists. Please drop it first." 5 | exit 1 6 | end 7 | 8 | create tbl, { NAME => 'cf1' } 9 | put tbl, '1', 'cf1:col1', 'value' 10 | alter tbl, NAME => 'cf2' 11 | put tbl, '1', 'cf2:col2', 'value' 12 | alter tbl, NAME => 'cf3' 13 | put tbl, '1', 'cf3:col3', 'value' 14 | describe tbl 15 | scan tbl 16 | 17 | disable tbl 18 | drop tbl 19 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/create_simple_table.rb: -------------------------------------------------------------------------------- 1 | tbl = 'test_simple' 2 | 3 | create tbl, {NAME => 'desc'} 4 | describe tbl 5 | put tbl, 'Star Wars', 'desc:title', 'Star Wars' 6 | get tbl, 'Star Wars' 7 | 8 | disable tbl 9 | drop tbl 10 | exit 11 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/create_table_shorthand.rb: -------------------------------------------------------------------------------- 1 | # Shorthand 2 | tbl = 'test_shorthand' 3 | create tbl, 'movie', 'desc', 'media' 4 | put tbl, 'Phantom', 'desc:title', 'Phantom Menace' 5 | put tbl, 'Phantom', 'media:thumbs_down', 'thumbs_down' 6 | get tbl, 'Phantom' 7 | 8 | disable tbl 9 | drop tbl 10 | 11 | exit 12 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/create_table_two_colfams.rb: -------------------------------------------------------------------------------- 1 | # Create two colfams 2 | tbl = 'test_two_colfams' 3 | 4 | create tbl, {NAME => 'desc'}, {NAME => 'media'} 5 | describe tbl 6 | 7 | put tbl, 'Jedi', 'desc:title', 'Return of the Jedi' 8 | put tbl, 'Jedi', 'media:fanboy_picture', 'fanboy\'s picture' 9 | 10 | get tbl, 'Jedi' 11 | 12 | disable tbl 13 | drop tbl 14 | exit 15 | 16 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/create_table_two_versions.rb: 
-------------------------------------------------------------------------------- 1 | # Create tbl w/two versions 2 | tbl = 'test_two_vers' 3 | create tbl, {NAME => 'desc', VERSIONS => 2} 4 | describe tbl 5 | 6 | put tbl, 'Empire', 'desc:title', 'Empire Wimps Out' 7 | put tbl, 'Empire', 'desc:title', 'Empire Strikes Back' 8 | 9 | get tbl, 'Empire', { COLUMN=>'desc:title', VERSIONS=> 2} 10 | 11 | disable tbl 12 | drop tbl 13 | 14 | exit 15 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/delete.rb: -------------------------------------------------------------------------------- 1 | tbl = "test_movie" 2 | if @hbase.admin(@formatter).exists?(tbl) 3 | puts "Table #{tbl} already exists. Please drop or use different table" 4 | exit 1 5 | end 6 | 7 | create tbl, { NAME => 'desc', VERSIONS => 3 } 8 | put tbl, 'rowkey1', 'desc:title', 'New Hope' 9 | put tbl, 'rowkey1', 'desc:year', '1975', 1 10 | put tbl, 'rowkey1', 'desc:year', '1976', 2 11 | put tbl, 'rowkey1', 'desc:year', '1977', 3 12 | 13 | put tbl, 'rowkey2', 'desc:title', 'Empire Strikes Back' 14 | put tbl, 'rowkey2', 'desc:year', '1975', 1 15 | put tbl, 'rowkey2', 'desc:year', '1976', 2 16 | put tbl, 'rowkey2', 'desc:year', '1980', 3 17 | 18 | put tbl, 'rowkey3', 'desc:title', 'Return of the Jedi' 19 | put tbl, 'rowkey3', 'desc:year', '1975' 20 | put tbl, 'rowkey3', 'desc:year', '1976' 21 | put tbl, 'rowkey3', 'desc:year', '1982' 22 | 23 | puts "We have all three rows" 24 | scan tbl 25 | 26 | delete tbl, 'rowkey3', 'desc:year' 27 | puts "No Jedi years should be visible here" 28 | scan tbl, { STARTROW => 'rowkey3' } 29 | 30 | puts "No Empire year before 1980 should be here" 31 | delete tbl, 'rowkey2', 'desc:year', 2 32 | scan tbl, { STARTROW => 'rowkey2', ENDROW => 'rowkey3', VERSIONS => 3 } 33 | 34 | puts "No Star Wars rows should be here" 35 | deleteall tbl, 'rowkey1' 36 | scan tbl 37 | 38 | puts "No more rows should be here:" 39 | truncate tbl 40 | scan tbl 41 | 42 | disable tbl 43 | drop tbl 44 | 45 | exit 46 | 47 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/delete_colfam_async.rb: -------------------------------------------------------------------------------- 1 | tbl = 'nate_alter' 2 | 3 | if @hbase.admin(@formatter).exists?(tbl) 4 | puts "Table '#{tbl}' already exists. Please drop it first." 5 | exit 1 6 | end 7 | 8 | create tbl, { NAME => 'cf1' }, { NAME => 'cf2' } 9 | put tbl, 1, 'cf1:col1', 'value1' 10 | put tbl, 1, 'cf2:col2', 'value1' 11 | 12 | scan tbl 13 | 14 | puts "Now deleting cf2" 15 | alter tbl, NAME => 'cf2', METHOD => 'delete' 16 | 17 | scan tbl 18 | 19 | puts "*" * 10, "Watch this, we can delete only remaining colfam" 20 | alter tbl, NAME => 'cf1', METHOD => 'delete' 21 | scan tbl 22 | describe tbl 23 | 24 | disable tbl 25 | drop tbl 26 | exit 27 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/get_colfam.rb: -------------------------------------------------------------------------------- 1 | tbl = "test_movie" 2 | if @hbase.admin(@formatter).exists?(tbl) 3 | puts "Table #{tbl} already exists. 
Please drop or use different table" 4 | exit 1 5 | end 6 | 7 | create tbl, { NAME => 'desc' }, { NAME => 'ratings' } 8 | put tbl, 'Star Wars', 'desc:title', 'New Hope' 9 | put tbl, 'Star Wars', 'desc:year', '1977', 1274032629664 10 | put tbl, 'Star Wars', 'desc:year', '1978', 1274032629663 11 | put tbl, 'Star Wars', 'ratings:bob', '1' 12 | put tbl, 'Star Wars', 'ratings:steve', '5' 13 | 14 | 15 | puts "Getting data: We should only see data from desc colfam\n:" + 16 | "And we should see 1977 because it has a later timestamp" 17 | get tbl, 'Star Wars', { COLUMN => 'desc' } 18 | 19 | puts "Getting data with ['desc']" 20 | get tbl, 'Star Wars', { COLUMN => ['desc'] } 21 | 22 | # Note ['desc:'] invalid as of (at least) CDH 5.2 23 | puts "Getting data with ['desc:']" 24 | get tbl, 'Star Wars', { COLUMN => ['desc:'] } 25 | 26 | # Note 'desc:' invalid as of (at least) CDH 5.2 27 | puts "Getting the data with 'desc:'" 28 | get tbl, 'Star Wars', { COLUMN => 'desc:' } 29 | 30 | puts "Disabling/dropping #{tbl}" 31 | disable tbl 32 | drop tbl 33 | 34 | exit 35 | 36 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/namespace_create.rb: -------------------------------------------------------------------------------- 1 | create_namespace 'namespaceName' 2 | alter_namespace 'namespaceName', { METHOD => 'set', 'SOME_PROPERTY' => 'SOME_VALUE' } 3 | drop_namespace 'namespaceName' 4 | 5 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/namespace_tables.rb: -------------------------------------------------------------------------------- 1 | puts "Simple table" 2 | create_namespace 'entertainment' 3 | create 'entertainment:movie', { NAME => 'desc' } 4 | disable 'entertainment:movie' 5 | drop 'entertainment:movie' 6 | 7 | puts "Versions = 2" 8 | create 'entertainment:movie', { NAME => 'desc', VERSIONS => 2 } 9 | disable 'entertainment:movie' 10 | drop 'entertainment:movie' 11 | 12 | puts "Two colfams" 13 | create 'entertainment:movie', { NAME => 'desc', VERSIONS => 2 } 14 | disable 'entertainment:movie' 15 | drop 'entertainment:movie' 16 | 17 | # Only empty namespaces can be removed 18 | drop_namespace 'entertainment' 19 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/new_shell_commands.rb: -------------------------------------------------------------------------------- 1 | # Tested create and acceptance of VERSIONS parameter 2 | disable 'nate_movie' 3 | drop 'nate_movie' 4 | 5 | # 04-08 Shell Command Syntax 6 | create 'nate_movie', {NAME => 'desc', VERSIONS => 5} 7 | 8 | # Verification 9 | put 'nate_movie', 1, 'desc:title', 'Star Wars' 10 | put 'nate_movie', 1, 'desc:title', 'Star Wars version 2' 11 | put 'nate_movie', 1, 'desc:title', 'Star Wars version 3' 12 | put 'nate_movie', 1, 'desc:title', 'Star Wars version 4' 13 | put 'nate_movie', 1, 'desc:title', 'Star Wars version 5' 14 | 15 | get 'nate_movie', '1', {COLUMN=>'desc', VERSIONS=>3} 16 | 17 | # Pass parameters test 18 | if ARGV.length() 19 | get ARGV[1], ARGV[2] 20 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/parameter_create_tbl.rb: -------------------------------------------------------------------------------- 1 | if ARGV && ARGV.length() == 2 2 | tbl = ARGV[0] 3 | colfam = ARGV[1] 4 | create tbl, colfam 5 | describe tbl 6 | else 7 | puts "Usage: parameter_create_tbl.rb " 8 | 
exit 1 9 | end 10 | exit 0 11 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/run_parameter_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hbase shell parameter_create_tbl.rb sometable somecolfam 3 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/scan_examples.rb: -------------------------------------------------------------------------------- 1 | tbl = "test_movie" 2 | if @hbase.admin(@formatter).exists?(tbl) 3 | puts "Table #{tbl} already exists. Please drop or use different table" 4 | exit 1 5 | end 6 | 7 | create tbl, { NAME => 'desc' }, { NAME => 'media' } 8 | put tbl, 'rowkey1', 'desc:title', 'New Hope' 9 | put tbl, 'rowkey1', 'media:type', 'Tape' 10 | 11 | put tbl, 'rowkey2', 'desc:title', 'Empire Strikes Back' 12 | put tbl, 'rowkey2', 'desc:year', '1980' 13 | put tbl, 'rowkey2', 'media:type', 'Tape' 14 | 15 | put tbl, 'rowkey3', 'desc:title', 'Jedi' 16 | put tbl, 'rowkey3', 'media:type', 'Tape' 17 | 18 | put tbl, 'rowkey4', 'desc:title', 'Phantom' 19 | put tbl, 'rowkey4', 'media:type', 'DVD' 20 | 21 | put tbl, 'rowkey5', 'desc:title', 'Clone' 22 | put tbl, 'rowkey5', 'media:type', 'DVD' 23 | 24 | 25 | scan tbl 26 | 27 | puts "limiting to 1" 28 | scan tbl, { LIMIT => 1 } 29 | 30 | puts "startrow of rowkey1, end of rowkey4" 31 | scan tbl, { STARTROW => 'rowkey1', STOPROW => 'rowkey4' } 32 | 33 | puts "only retrieve title and type fields" 34 | scan tbl, { COLUMNS => [ 'desc:title', 'media:type' ] } 35 | 36 | put tbl, 'desc:duration', 120 37 | scan tbl, { FILTER => "SingleColumnValueFilter('desc', 'duration', =, 'binary:120')" } 38 | 39 | disable tbl 40 | drop tbl 41 | 42 | exit 43 | 44 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/scan_filter.rb: -------------------------------------------------------------------------------- 1 | tbl = "test_movie" 2 | if @hbase.admin(@formatter).exists?(tbl) 3 | puts "Table #{tbl} already exists. Please drop or use different table" 4 | exit 1 5 | end 6 | 7 | create tbl, { NAME => 'desc' }, { NAME => 'media' } 8 | 9 | put tbl, 'Star Wars', 'desc:duration', 'binary:120' 10 | put tbl, 'Empire', 'desc:duration', 100 11 | put tbl, 'Jedi', 'desc:duration', '120' 12 | 13 | # Binary tells the filter what kind of comparator to use 14 | scan tbl, { FILTER => "SingleColumnValueFilter('desc', 'duration', =, 'binary:120')" } 15 | 16 | disable tbl 17 | drop tbl 18 | 19 | exit 20 | 21 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/unknown_arguments_warning.rb: -------------------------------------------------------------------------------- 1 | # Thankfully, invalid/unknown arguments cause HBase Shell 2 | # to print a warning 3 | create 'sometable', { NAME => 'cf1', VERSION => 1, VERSIONS => 2 } 4 | disable 'sometable' 5 | drop 'sometable' 6 | exit 7 | -------------------------------------------------------------------------------- /hbase/shell_stuff/check_new_stuff/versions_async.rb: -------------------------------------------------------------------------------- 1 | tbl = 'nate_alter' 2 | 3 | if @hbase.admin(@formatter).exists?(tbl) 4 | puts "Table '#{tbl}' already exists. Please drop it first." 
5 | exit 1 6 | end 7 | 8 | create tbl, { NAME => 'cf1' } 9 | 10 | put tbl, '1', 'cf1:col1', 'value' 11 | put tbl, '1', 'cf1:col1', 'value' 12 | put tbl, '1', 'cf1:col1', 'value' 13 | put tbl, '1', 'cf1:col1', 'value' 14 | 15 | puts "*" * 10, "Get row -- we see 1 version is kept." 16 | get tbl, '1', { COLUMN => 'cf1:col1', VERSIONS => 5 } 17 | 18 | puts "*" * 10, "Now, alter versions to 5" 19 | alter tbl, NAME => 'cf1', VERSIONS => '5' 20 | 21 | put tbl, '1', 'cf1:col1', 'value' 22 | put tbl, '1', 'cf1:col1', 'value' 23 | put tbl, '1', 'cf1:col1', 'value' 24 | put tbl, '1', 'cf1:col1', 'value' 25 | puts "*" * 10, "Now we have more versions retained" 26 | get tbl, '1', { COLUMN => 'cf1:col1', VERSIONS => 5 } 27 | 28 | disable tbl 29 | drop tbl 30 | -------------------------------------------------------------------------------- /hbase/shell_stuff/inspect_HTable.rb: -------------------------------------------------------------------------------- 1 | tbl = get_table('njn_transactions') 2 | # Get to the underlying table for the REAL power, Jedi! 3 | puts "Here's the methods of the tbl.table: " 4 | puts tbl.table.methods 5 | exit 6 | -------------------------------------------------------------------------------- /hbase/shell_stuff/list_regions.rb: -------------------------------------------------------------------------------- 1 | require 'test/unit' 2 | extend Test::Unit::Assertions 3 | 4 | tbl = get_table('njn_transactions') 5 | # Get to the underlying tbl.table for the REAL power, my apprentice. 6 | puts "Here's the regions:" 7 | # get_region_locations returns a "NavigableMap" Java object that has a RegionInfo as a key, and ServerName as value 8 | # http://archive.cloudera.com/cdh5/cdh/5/hbase-0.96.1.1-cdh5.0.1/devapidocs/org/apache/hadoop/hbase/client/HTable.html#getRegionLocations%28%29 9 | tbl.table.get_region_locations.each_with_index do |region_thingy, idx| 10 | puts "-" * 100 11 | puts "Region " + idx.to_s 12 | 13 | # get_region_name_as_string is the same thing as .regionName(), except String vs.
Byte Array 14 | regionName = region_thingy[0].get_region_name_as_string 15 | assert_equal regionName, Bytes.toString(region_thingy[0].regionName) 16 | 17 | # Print out info about this region 18 | puts "Region Name is: " + regionName 19 | puts region_thingy[0].toString() 20 | end 21 | 22 | puts "Now printing start keys of each region in this table: " 23 | tbl.table.get_start_keys.each do |byte_array_start_key| 24 | puts Bytes.toString(byte_array_start_key) 25 | end 26 | exit 27 | -------------------------------------------------------------------------------- /hbase/shell_stuff/list_tables.rb: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.hbase.client.HBaseAdmin 2 | 3 | admin = HBaseAdmin.new(@hbase.configuration) 4 | puts admin.getTableNames().to_a 5 | exit 6 | -------------------------------------------------------------------------------- /hbase/simpleConnection/SimpleCreateAndPut.groovy: -------------------------------------------------------------------------------- 1 | #!/bin/env groovy 2 | // This will load data from movieratings flat file into a table xx_users 3 | // The data is simply put into the key, and fake values are put into ratings column family 4 | // it is simply used to show data ingestion using the HBase API 5 | import org.apache.hadoop.hbase.HBaseConfiguration 6 | import org.apache.hadoop.hbase.HTableDescriptor 7 | import org.apache.hadoop.hbase.HColumnDescriptor 8 | import org.apache.hadoop.hbase.client.HBaseAdmin 9 | import org.apache.hadoop.hbase.client.HConnectionManager 10 | import org.apache.hadoop.hbase.client.HConnection 11 | import org.apache.hadoop.hbase.client.Put 12 | import org.apache.hadoop.hbase.util.Bytes 13 | import groovy.time.* 14 | 15 | /* Setup */ 16 | def ratingsTable 17 | def tableName = "njn_users" 18 | def shouldCreateTable = true 19 | def shouldPreSplit = true 20 | 21 | HBaseConfiguration conf = new HBaseConfiguration() 22 | HConnection connection = HConnectionManager.createConnection(conf) 23 | 24 | if (shouldCreateTable) { 25 | admin = new HBaseAdmin(conf) 26 | 27 | if (admin.tableExists(tableName)) { 28 | admin.disableTable(tableName) 29 | admin.deleteTable(tableName) 30 | } 31 | 32 | def desc = new HTableDescriptor(Bytes.toBytes(tableName)) 33 | desc.addFamily(new HColumnDescriptor(Bytes.toBytes("info"))) 34 | admin.createTable(desc) 35 | } 36 | 37 | ratingsTable = connection.getTable(tableName) 38 | 39 | def start = new Date() 40 | 41 | Put p = new Put(Bytes.toBytes("StevesKey")) 42 | p.add(Bytes.toBytes("info"), Bytes.toBytes("fname"), Bytes.toBytes("Steve")) 43 | ratingsTable.put(p) 44 | 45 | TimeDuration duration = TimeCategory.minus(new Date(), start) 46 | 47 | println "Done, a one-row insert took " + duration 48 | 49 | -------------------------------------------------------------------------------- /hbase/simpleConnection/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | groovyc -classpath `hbase classpath` ./SimpleCreateAndPut.groovy 3 | -------------------------------------------------------------------------------- /hbase/simpleConnection/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | groovy -classpath `hbase classpath` ./SimpleCreateAndPut.groovy 3 | -------------------------------------------------------------------------------- /hdfs/data-visibility/README.md: -------------------------------------------------------------------------------- 1 | 
# README 2 | 3 | Illustrate that a file need not be a full HDFS block before 4 | data can be read from the file. 5 | 6 | 1. Run this in a shell: 7 | 8 | ./foo.pl | hadoop fs -put - data.txt 9 | 10 | 2. Open another shell and run this 11 | 12 | hadoop fs -ls data.txt.\_COPYING\_ 13 | 14 | hadoop fs -cat data.txt.\_COPYING\_ | head -n 10 15 | 16 | 3. *Note* Don't forget to Ctrl-C ./foo.pl!!!! 17 | -------------------------------------------------------------------------------- /hdfs/data-visibility/foo.pl: -------------------------------------------------------------------------------- 1 | #!/bin/env perl 2 | for($i=0; $i<=1_000_000; $i++) { 3 | if ($i % 10000 == 0) { 4 | sleep(1); 5 | print "Sleeping " . `date`; 6 | } 7 | print $i, "\n"; 8 | } 9 | -------------------------------------------------------------------------------- /hdfs/replication/run.sh: -------------------------------------------------------------------------------- 1 | TMP_FILE=words_`date "+%F%s"` 2 | hadoop fs -put /usr/share/dict/words $TMP_FILE 3 | hadoop fs -setrep 4 $TMP_FILE 4 | echo "Check out $TMP_FILE in your home dir." 5 | sleep 10 6 | hadoop fs -setrep 3 $TMP_FILE 7 | -------------------------------------------------------------------------------- /hdfs/webhdfs-httpfs/testdata.txt: -------------------------------------------------------------------------------- 1 | The quick brown fox 2 | jumped over the lazy dog 3 | -------------------------------------------------------------------------------- /hive/crlf/data.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | -------------------------------------------------------------------------------- /hive/crlf/data_unix.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | -------------------------------------------------------------------------------- /hive/crlf/get_max.sql: -------------------------------------------------------------------------------- 1 | select max(junk) as the_max, count(junk) as the_count 2 | from dosjunk 3 | group by junk 4 | order by the_max limit 100000 5 | -------------------------------------------------------------------------------- /hive/crlf/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script runs impala and hive and MapReduce wordcount. 3 | # Please remove whatever you don't want to run. 
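# Assumes a Hive table named 'dosjunk' already exists (presumably loaded from data.txt / data_unix.txt here)
# and that $EXAMPLES_DIR points at the directory containing hadoop-examples.jar.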
4 | set -e 5 | IMPALA_SERVER=$1 6 | OUTPUT_DIR=output/wc_dosjunk 7 | impala-shell -i $IMPALA_SERVER -f ./get_max.sql 8 | hive -f ./get_max.sql 9 | hadoop fs -test -d $OUTPUT_DIR && hadoop fs -rm -r $OUTPUT_DIR 10 | hadoop jar $EXAMPLES_DIR/hadoop-examples.jar wordcount dosjunk $OUTPUT_DIR 11 | hadoop fs -getmerge $OUTPUT_DIR wordcount_output.txt 12 | echo "Wordcount output is in wordcount_output.txt" 13 | -------------------------------------------------------------------------------- /hive/debate/analyze_debate.hql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS debate; 2 | CREATE TABLE debate(debatetext string); 3 | 4 | LOAD DATA LOCAL INPATH 'debate.txt' 5 | INTO TABLE debate; 6 | 7 | SELECT EXPLODE(NGRAMS( 8 | SENTENCES(debate.debatetext), 4, 10)) -- <<< Try this with 3 or 4 and see how results change 9 | AS x 10 | FROM debate 11 | -------------------------------------------------------------------------------- /hive/incremental_insert/README.TXT: -------------------------------------------------------------------------------- 1 | Read load_and_run.sql for a description of this project 2 | 3 | Run run.sh to see it in action 4 | -------------------------------------------------------------------------------- /hive/incremental_insert/employees.txt: -------------------------------------------------------------------------------- 1 | bobsupervisor 2 | steveprogrammer 3 | -------------------------------------------------------------------------------- /hive/incremental_insert/join_table.sql: -------------------------------------------------------------------------------- 1 | -- INSERT into join_table the name, title, nickname from the employees and nicknames, 2 | -- but DO NOT duplicate existing data in join_table 3 | INSERT INTO TABLE join_table 4 | SELECT e.name, e.title, n.nickname 5 | FROM employees e 6 | JOIN nicknames n ON e.name = n.name 7 | LEFT OUTER JOIN join_table jt 8 | ON (jt.name = e.name AND n.nickname = jt.nickname AND jt.title = e.title) 9 | WHERE jt.name IS NULL; 10 | -------------------------------------------------------------------------------- /hive/incremental_insert/load_and_run.sql: -------------------------------------------------------------------------------- 1 | -- This example shows how to use Hive to insert non-duplicate data 2 | -- into a join table. 3 | 4 | -- Employees table: 5 | -- bob supervisor 6 | 7 | -- Nicknames table: 8 | -- bob bob_nickname 9 | 10 | -- We create a join_table with a very simple initial 11 | -- dataset: 12 | -- bob supervisor bob_nickname 13 | -- steve programmer steve_nickname 14 | 15 | -- Then, we load *more* nicknames into the nicknames table, and only 16 | -- insert the new nickname relations into the join_table. 
17 | -- bob another_bob_nickname 18 | 19 | -- We want the resulting join_table to include only: 20 | -- bob supervisor bob_nickname 21 | -- bob supervisor another_bob_nickname 22 | -- steve programmer steve_nickname 23 | -- steve programmer another_steve_nickname 24 | 25 | -- We don't want to get duplicate data in the join_table 26 | ADD FILE join_table.sql; 27 | CREATE DATABASE IF NOT EXISTS incremental_insert; 28 | 29 | USE incremental_insert; 30 | 31 | DROP TABLE IF EXISTS employees; 32 | CREATE TABLE employees(name STRING, title STRING); 33 | LOAD DATA LOCAL INPATH 'employees.txt' INTO TABLE employees; 34 | 35 | DROP TABLE IF EXISTS nicknames; 36 | CREATE TABLE nicknames(name STRING, nickname STRING); 37 | LOAD DATA LOCAL INPATH 'nicknames.txt' INTO TABLE nicknames; 38 | 39 | DROP TABLE IF EXISTS join_table; 40 | CREATE TABLE join_table(name STRING, title STRING, nickname STRING); 41 | 42 | -- Run the join 43 | SOURCE join_table.sql; 44 | SELECT COUNT(*) FROM join_table; 45 | 46 | -- Now, load more nicknames 47 | LOAD DATA LOCAL INPATH 'more_nicknames.txt' INTO TABLE nicknames; 48 | 49 | -- Run the join again 50 | SOURCE join_table.sql; 51 | SELECT COUNT(*) FROM join_table; 52 | 53 | -------------------------------------------------------------------------------- /hive/incremental_insert/more_nicknames.txt: -------------------------------------------------------------------------------- 1 | bobanother_bob_nickname 2 | steveanother_steve_nickname 3 | -------------------------------------------------------------------------------- /hive/incremental_insert/nicknames.txt: -------------------------------------------------------------------------------- 1 | bobbob_nickname 2 | stevesteve_nickname 3 | -------------------------------------------------------------------------------- /hive/incremental_insert/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hive -S -v -f load_and_run.sql 3 | -------------------------------------------------------------------------------- /hive/partition-example/README.TXT: -------------------------------------------------------------------------------- 1 | The script run.sh will do everything for you. 2 | 3 | get_partition_info.sql has an example of EXPLAIN EXTENDED 4 | to show that Hive will use Partitions in a SELECT statement. 
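As a rough sketch (see get_partition_info.sql for the actual script; point the
query at the partitioned table so the plan shows the partition pruning):

    EXPLAIN EXTENDED
    SELECT name FROM db1.employees_partitioned
    WHERE state = 'MO';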
5 | Note that Hive is really smart, and if you SELECT * according to 6 | a partition, Hive will not run MapReduce, it will just perform a 7 | hadoop fs -get /user/hive/warehouse/yourtable/partition= :) 8 | -------------------------------------------------------------------------------- /hive/partition-example/create_and_load_employees.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS db1; 2 | DROP TABLE IF EXISTS db1.employees; 3 | CREATE TABLE db1.employees(name STRING, state STRING) 4 | ROW FORMAT DELIMITED 5 | FIELDS TERMINATED BY '\t'; 6 | LOAD DATA LOCAL INPATH 'employees.txt' INTO TABLE db1.employees; 7 | -------------------------------------------------------------------------------- /hive/partition-example/employees.txt: -------------------------------------------------------------------------------- 1 | Bob CA 2 | Steve CA 3 | Andy TX 4 | Sherry TX 5 | Silvia TX 6 | Cynthia TX 7 | Tex TX 8 | Alvin TX 9 | Nate LA 10 | Jerry TX 11 | Doug TX 12 | Terry CA 13 | Betty TX 14 | Bertha TX 15 | Walter TX 16 | Gus TX 17 | Jesse CA 18 | Lydia TX 19 | Hank TX 20 | Marie TX 21 | Fen IL 22 | Mike TX 23 | Jack CA 24 | Ben TX 25 | Ian NY 26 | Sarah TX 27 | Charles MO 28 | Tom MO 29 | Mirko CO 30 | Ted OH 31 | Kaufman MA 32 | Andrew MA 33 | -------------------------------------------------------------------------------- /hive/partition-example/get_partition_info.sql: -------------------------------------------------------------------------------- 1 | USE db1; 2 | SHOW PARTITIONS employees_partitioned; 3 | 4 | EXPLAIN EXTENDED 5 | SELECT name FROM EMPLOYEES 6 | WHERE state = 'MO'; 7 | -------------------------------------------------------------------------------- /hive/partition-example/partition_employees.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS db1.employees_partitioned; 2 | CREATE TABLE db1.employees_partitioned(name STRING) 3 | PARTITIONED BY (state STRING) 4 | ROW FORMAT DELIMITED 5 | FIELDS TERMINATED BY '\t'; 6 | 7 | set hive.exec.dynamic.partition=true; 8 | set hive.exec.dynamic.partition.mode=nonstrict; 9 | -- The columns you're partitioning by should be listed at the END of the SELECT statement 10 | INSERT OVERWRITE TABLE db1.employees_partitioned 11 | PARTITION (state) 12 | SELECT name, state FROM db1.employees; 13 | -------------------------------------------------------------------------------- /hive/partition-example/partition_employees_keep_orig_data.sql: -------------------------------------------------------------------------------- 1 | -- What if you want to keep the original data (state) 2 | -- in the partitioned table? Then create a dummy field 3 | -- in the original table, and select the "state" field twice 4 | -- below. 
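-- (Concretely: SELECT name, state AS orig_state, state FROM db1.employees, as in the INSERT at the bottom of this file.)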
5 | DROP TABLE IF EXISTS db1.employees_partitioned_keep_orig_data; 6 | CREATE TABLE db1.employees_partitioned_keep_orig_data(name STRING, orig_state STRING) 7 | PARTITIONED BY (state STRING) 8 | ROW FORMAT DELIMITED 9 | FIELDS TERMINATED BY '\t'; 10 | 11 | set hive.exec.dynamic.partition=true; 12 | set hive.exec.dynamic.partition.mode=nonstrict; 13 | INSERT OVERWRITE TABLE db1.employees_partitioned_keep_orig_data 14 | PARTITION (state) 15 | SELECT name, state AS orig_state, state FROM db1.employees; 16 | -------------------------------------------------------------------------------- /hive/partition-example/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Script should stop if there's a failure 3 | set -e 4 | hive -S -f create_and_load_employees.sql 5 | hive -S -f partition_employees.sql 6 | hive -S -f partition_employees_keep_orig_data.sql 7 | echo "Browse the data in the /user/hive/warehouse/db1 directory" 8 | hive -S -v -f get_partition_info.sql 9 | -------------------------------------------------------------------------------- /hive/simple_queries/README.md: -------------------------------------------------------------------------------- 1 | # Simple Queries 2 | 3 | Tests for simple queries in Hive 4 | 5 | - subquery_in_where.sql : Meant to test LEFT SEMI JOIN versus subqueries in WHERE clauses. 6 | -------------------------------------------------------------------------------- /hive/simple_queries/create_tables.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS he; 2 | CREATE TABLE IF NOT EXISTS he.customers( 3 | cust_id STRING, 4 | first_name STRING) 5 | ROW FORMAT DELIMITED 6 | FIELDS TERMINATED BY '\t'; 7 | 8 | CREATE TABLE IF NOT EXISTS he.orders( 9 | order_id INT, 10 | cust_id STRING, 11 | first_name STRING, 12 | order_date STRING) 13 | ROW FORMAT DELIMITED 14 | FIELDS TERMINATED BY '\t'; 15 | -------------------------------------------------------------------------------- /hive/simple_queries/customers.txt: -------------------------------------------------------------------------------- 1 | nate nate 2 | bob bob 3 | steve steve 4 | carl carl 5 | sandy sandy 6 | tom tom 7 | rip rip 8 | zip zip 9 | -------------------------------------------------------------------------------- /hive/simple_queries/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | hive -f create_tables.sql 4 | hdfs dfs -put -f customers.txt /user/training 5 | hdfs dfs -put -f orders.txt /user/training 6 | hive -e "LOAD DATA INPATH '/user/training/customers.txt' OVERWRITE INTO TABLE he.customers" 7 | hive -e "LOAD DATA INPATH '/user/training/orders.txt' OVERWRITE INTO TABLE he.orders" 8 | -------------------------------------------------------------------------------- /hive/simple_queries/orders.txt: -------------------------------------------------------------------------------- 1 | 1 nate product1 2011-01-01 2 | 2 bob product1 2011-01-01 3 | 3 steve product1 2011-01-01 4 | 4 carl product1 2011-01-01 5 | 5 sandy product1 2011-01-01 6 | 6 tom product1 2011-01-01 7 | 6 rip product1 2011-01-01 8 | 7 rip product1 2012-01-01 9 | -------------------------------------------------------------------------------- /hive/simple_queries/subquery_in_where.sql: -------------------------------------------------------------------------------- 1 | SELECT c.cust_id FROM he.customers c 2 | WHERE cust_id IN 3 | (SELECT o.cust_id 
FROM he.orders o 4 | WHERE YEAR(o.order_date) = 2012); 5 | 6 | SELECT c.cust_id 7 | FROM he.customers c 8 | LEFT SEMI JOIN he.orders o 9 | ON (c.cust_id = o.cust_id 10 | AND YEAR(o.order_date) = 2012); 11 | -------------------------------------------------------------------------------- /hive/transform/awk-example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "BEFORE: " 3 | cat ../sample-data/transform-example.txt 4 | echo "AFTER AWK FILTER" 5 | cat ../sample-data/transform-example.txt | awk '! a[$1]++' 6 | -------------------------------------------------------------------------------- /hive/transform/legalpets.pl: -------------------------------------------------------------------------------- 1 | #!/bin/env perl 2 | %legal = qw/dog 1 3 | cat 1 4 | ferret 1 5 | bird 1 6 | chimpanzee 1/; 7 | 8 | my @petsIveSeen = (); 9 | while ($pet = <>) { 10 | chomp($pet); 11 | # debug -- this goes to /var/log/hadoop/userlogs///stderr 12 | print STDERR $pet; 13 | if ($legal{$pet}) { 14 | print "$pet\tYES\n"; 15 | } 16 | else { 17 | print "$pet\tNO\n"; 18 | } 19 | 20 | push(@petsIveSeen, $pet); 21 | } 22 | # debug -- this goes to /var/log/hadoop/userlogs///stderr 23 | print STDERR join(',', @petsIveSeen); 24 | -------------------------------------------------------------------------------- /hive/transform/transform-pets.hql: -------------------------------------------------------------------------------- 1 | ADD FILE /home/training/src/training-scripts/hive/transform/legalpets.pl; 2 | 3 | FROM pets 4 | SELECT TRANSFORM ( pet ) 5 | USING "legalpets.pl" 6 | AS name, islegal; 7 | -------------------------------------------------------------------------------- /hive/wordcount/README.TXT: -------------------------------------------------------------------------------- 1 | Basic idea: 2 | 3 | 1) Run wordcount on shakespare using MapReduce 4 | 2) Run wordcount on shakespeare using Hive and compare differences 5 | 6 | Steps 7 | 1) Upload shakespeare to cluster 8 | 2) Run wordcount using MapReduce 9 | 3) Create a table 'mapred_wordcount' containing the output from MapReduce job 10 | 11 | 4) Load shakespeare into a Hive table 12 | 5) Run Hive Query to create a hive_wordcount table 13 | 6) Compare hive_wordcount to mapred_wordcount 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | First run ./run-comparison.sh 23 | Then you can use ./create-external-table-for-mapreduce-output.hql 24 | to view the output of the mapreduce job more easily, 25 | or you can simply use 26 | 27 | hadoop fs -get output/wordcount 28 | -------------------------------------------------------------------------------- /hive/wordcount/README.md: -------------------------------------------------------------------------------- 1 | # Wordcount using Hive 2 | 3 | Upload data to a directory in HDFS. 
Specify 4 | the **absolute** path to the directory to *run.sh* 5 | 6 | Run ./run.sh /absolute/path/to/your/data 7 | -------------------------------------------------------------------------------- /hive/wordcount/compare.hql: -------------------------------------------------------------------------------- 1 | SET hive.cli.print.header=true; 2 | 3 | SELECT wordcount.word, wordcount.count AS hive_count, mr_wordcount.count AS mr_count 4 | FROM wordcount 5 | FULL OUTER JOIN mr_wordcount on (mr_wordcount.word = wordcount.word) 6 | WHERE wordcount.word IS NULL or 7 | mr_wordcount.word IS NULL; 8 | -------------------------------------------------------------------------------- /hive/wordcount/create-external-table-for-mapreduce-output.hql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE mr_wordcount 2 | (word STRING, count INT) 3 | ROW FORMAT DELIMITED 4 | FIELDS TERMINATED BY '\t' 5 | LOCATION '/user/training/output/wordcount' 6 | -------------------------------------------------------------------------------- /hive/wordcount/run-mr-and-hive-queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SHAKESPEARE_DIR=/user/training/shakespeare 3 | LOCAL_DATA_DIR=~/training_materials/developer/data/ 4 | OUTPUT_DIR=/user/training/output/wordcount 5 | 6 | # Already set in ~/.bashrc 7 | # EXAMPLES_DIR=/usr/lib/hadoop-0.20-mapreduce 8 | 9 | hadoop fs -test -d $SHAKESPEARE_DIR || { 10 | 11 | test -d $LOCAL_DATA_DIR/shakespeare || \ 12 | tar -C $LOCAL_DATA_DIR -xzvf $LOCAL_DATA_DIR/shakespeare.tar.gz 13 | 14 | hadoop fs -put $LOCAL_DATA_DIR/shakespeare $SHAKESPEARE_DIR 15 | } 16 | 17 | echo "RUNNING MAPREDUCE JOB" 18 | 19 | hadoop fs -rm -R $OUTPUT_DIR 20 | hadoop jar $EXAMPLES_DIR/hadoop-examples.jar wordcount $SHAKESPEARE_DIR \ 21 | $OUTPUT_DIR 22 | 23 | echo "RUNNING HIVE QUERY" 24 | hive -f ./wordcount.hql 25 | 26 | -------------------------------------------------------------------------------- /hive/wordcount/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DATA_DIRECTORY=$1 3 | test -z "$DATA_DIRECTORY" && { 4 | echo "Usage: run.sh " 5 | exit 1 6 | } 7 | 8 | hadoop fs -test -e $DATA_DIRECTORY || { 9 | echo "HDFS directory $DATA_DIRECTORY doesn't exist" 10 | echo "Usage: run.sh " 11 | exit 1 12 | } 13 | echo "RUNNING HIVE QUERY" 14 | hive -S -d input_directory=$DATA_DIRECTORY -f ./wordcount.hql 15 | 16 | -------------------------------------------------------------------------------- /hive/wordcount/wordcount.hql: -------------------------------------------------------------------------------- 1 | DROP TABLE shakespeare; 2 | CREATE EXTERNAL TABLE shakespeare (line STRING) 3 | LOCATION '/user/training/shakespeare'; 4 | 5 | DROP TABLE IF EXISTS wordcount; 6 | CREATE TABLE wordcount AS 7 | SELECT word, count(1) AS count 8 | FROM 9 | (SELECT explode(split(lcase(line), '\\W+')) AS word 10 | FROM shakespeare) words 11 | GROUP BY word ORDER BY word; 12 | -------------------------------------------------------------------------------- /impala/README.md: -------------------------------------------------------------------------------- 1 | # Simple Queries 2 | 3 | Tests for simple queries in Impala 4 | 5 | - subquery_in_where.sql : Meant to test LEFT SEMI JOIN versus subqueries in WHERE clauses. 
6 | - Related Hive code/"test" in ../../hive/simple_queries 7 | -------------------------------------------------------------------------------- /impala/analytic-functions/ads.txt: -------------------------------------------------------------------------------- 1 | 2015-05-01 losing_clicks 2 | 2015-05-01 losing_clicks 3 | 2015-05-01 losing_clicks 4 | 2015-05-01 losing_clicks 5 | 2015-05-01 losing_clicks 6 | 2015-05-01 losing_clicks 7 | 2015-05-01 losing_clicks 8 | 2015-05-01 gaining 9 | 2015-05-02 losing_clicks 10 | 2015-05-02 losing_clicks 11 | 2015-05-02 losing_clicks 12 | 2015-05-02 losing_clicks 13 | 2015-05-02 losing_clicks 14 | 2015-05-02 losing_clicks 15 | 2015-05-02 gaining 16 | 2015-05-02 gaining 17 | 2015-05-03 losing_clicks 18 | 2015-05-03 losing_clicks 19 | 2015-05-03 losing_clicks 20 | 2015-05-03 losing_clicks 21 | 2015-05-03 losing_clicks 22 | 2015-05-03 gaining 23 | 2015-05-03 gaining 24 | 2015-05-03 gaining 25 | 2015-05-04 losing_clicks 26 | 2015-05-04 losing_clicks 27 | 2015-05-04 losing_clicks 28 | 2015-05-04 losing_clicks 29 | 2015-05-04 gaining 30 | 2015-05-04 gaining 31 | 2015-05-04 gaining 32 | 2015-05-04 gaining 33 | 2015-05-05 losing_clicks 34 | 2015-05-05 losing_clicks 35 | 2015-05-05 losing_clicks 36 | 2015-05-05 gaining 37 | 2015-05-05 gaining 38 | 2015-05-05 gaining 39 | 2015-05-05 gaining 40 | 2015-05-05 gaining 41 | 2015-05-06 losing_clicks 42 | 2015-05-06 losing_clicks 43 | 2015-05-06 gaining 44 | 2015-05-06 gaining 45 | 2015-05-06 gaining 46 | 2015-05-06 gaining 47 | 2015-05-06 gaining 48 | 2015-05-06 gaining 49 | 2015-05-07 losing_clicks 50 | 2015-05-07 gaining 51 | 2015-05-07 gaining 52 | 2015-05-07 gaining 53 | 2015-05-07 gaining 54 | 2015-05-07 gaining 55 | 2015-05-07 gaining 56 | 2015-05-07 gaining 57 | 2015-05-08 gaining 58 | 2015-05-08 gaining 59 | 2015-05-08 gaining 60 | 2015-05-08 gaining 61 | 2015-05-08 gaining 62 | 2015-05-08 gaining 63 | 2015-05-08 gaining 64 | 2015-05-08 gaining 65 | -------------------------------------------------------------------------------- /impala/analytic-functions/avg_ads.sql: -------------------------------------------------------------------------------- 1 | SELECT display_date, display_site, n, 2 | AVG(n) OVER 3 | (PARTITION BY display_site 4 | ORDER BY display_date 5 | ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS wavg 6 | FROM ( 7 | SELECT display_date, display_site, 8 | count(display_date) AS n 9 | FROM ads GROUP BY display_date, display_site 10 | ) ads 11 | ORDER BY display_site, display_date; 12 | -------------------------------------------------------------------------------- /impala/analytic-functions/avg_ads.txt: -------------------------------------------------------------------------------- 1 | +---------------------+---------------+---+------+ 2 | | display_date | display_site | n | wavg | 3 | +---------------------+---------------+---+------+ 4 | | 2015-05-01 00:00:00 | gaining | 1 | 1 | 5 | | 2015-05-02 00:00:00 | gaining | 2 | 1.5 | 6 | | 2015-05-03 00:00:00 | gaining | 3 | 2 | 7 | | 2015-05-04 00:00:00 | gaining | 4 | 2.5 | 8 | | 2015-05-05 00:00:00 | gaining | 5 | 3.5 | 9 | | 2015-05-06 00:00:00 | gaining | 6 | 4.5 | 10 | | 2015-05-07 00:00:00 | gaining | 7 | 5.5 | 11 | | 2015-05-08 00:00:00 | gaining | 8 | 6.5 | 12 | | 2015-05-01 00:00:00 | losing_clicks | 7 | 7 | 13 | | 2015-05-02 00:00:00 | losing_clicks | 6 | 6.5 | 14 | | 2015-05-03 00:00:00 | losing_clicks | 5 | 6 | 15 | | 2015-05-04 00:00:00 | losing_clicks | 4 | 5.5 | 16 | | 2015-05-05 00:00:00 | losing_clicks | 3 | 4.5 | 17 | | 2015-05-06 
00:00:00 | losing_clicks | 2 | 3.5 | 18 | | 2015-05-07 00:00:00 | losing_clicks | 1 | 2.5 | 19 | +---------------------+---------------+---+------+ 20 | -------------------------------------------------------------------------------- /impala/analytic-functions/create_ads.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS ads; 2 | 3 | CREATE EXTERNAL TABLE ads 4 | (display_date TIMESTAMP, 5 | display_site STRING) 6 | ROW FORMAT DELIMITED 7 | FIELDS TERMINATED BY '\t' 8 | LOCATION '/dualcore/ads'; 9 | -------------------------------------------------------------------------------- /impala/analytic-functions/impala-version.txt: -------------------------------------------------------------------------------- 1 | Impala Shell v2.0.0-cdh5 (ecf30af) built on Sat Oct 11 13:56:06 PDT 2014 2 | Hadoop 2.5.0-cdh5.2.0 3 | Subversion http://github.com/cloudera/hadoop -r e1f20a08bde76a33b79df026d00a0c91b2298387 4 | Compiled by jenkins on 2014-10-11T21:00Z 5 | Compiled with protoc 2.5.0 6 | From source with checksum 309bccd135b199bdfdd6df5f3f4153d 7 | This command was run using /usr/lib/hadoop/hadoop-common-2.5.0-cdh5.2.0.jar 8 | -------------------------------------------------------------------------------- /impala/analytic-functions/lag_ads.sql: -------------------------------------------------------------------------------- 1 | SELECT display_date, display_site, n, 2 | LAG(n) OVER 3 | (PARTITION BY display_site 4 | ORDER BY display_date) AS nprev 5 | FROM ( 6 | SELECT display_date, display_site, 7 | count(display_date) AS n 8 | FROM ads GROUP BY display_date, display_site 9 | ) ads 10 | ORDER BY display_site, display_date; 11 | -------------------------------------------------------------------------------- /impala/analytic-functions/lag_ads.txt: -------------------------------------------------------------------------------- 1 | +---------------------+---------------+---+-------+ 2 | | display_date | display_site | n | nprev | 3 | +---------------------+---------------+---+-------+ 4 | | 2015-05-01 00:00:00 | gaining | 1 | NULL | 5 | | 2015-05-02 00:00:00 | gaining | 2 | 1 | 6 | | 2015-05-03 00:00:00 | gaining | 3 | 2 | 7 | | 2015-05-04 00:00:00 | gaining | 4 | 3 | 8 | | 2015-05-05 00:00:00 | gaining | 5 | 4 | 9 | | 2015-05-06 00:00:00 | gaining | 6 | 5 | 10 | | 2015-05-07 00:00:00 | gaining | 7 | 6 | 11 | | 2015-05-08 00:00:00 | gaining | 8 | 7 | 12 | | 2015-05-01 00:00:00 | losing_clicks | 7 | NULL | 13 | | 2015-05-02 00:00:00 | losing_clicks | 6 | 7 | 14 | | 2015-05-03 00:00:00 | losing_clicks | 5 | 6 | 15 | | 2015-05-04 00:00:00 | losing_clicks | 4 | 5 | 16 | | 2015-05-05 00:00:00 | losing_clicks | 3 | 4 | 17 | | 2015-05-06 00:00:00 | losing_clicks | 2 | 3 | 18 | | 2015-05-07 00:00:00 | losing_clicks | 1 | 2 | 19 | +---------------------+---------------+---+-------+ 20 | -------------------------------------------------------------------------------- /impala/analytic-functions/run.sh: -------------------------------------------------------------------------------- 1 | hdfs dfs -mkdir -p /dualcore/ads 2 | hdfs dfs -put ads.txt /dualcore/ads 3 | impala-shell -f create_ads.sql 4 | impala-shell -f lag_ads.sql -o lag_ads.txt 5 | impala-shell -f avg_ads.sql -o avg_ads.txt 6 | echo "Look in lag_ads.txt, and avg_ads.txt" 7 | -------------------------------------------------------------------------------- /impala/datatypes/decimal_vs_integer/README.md: -------------------------------------------------------------------------------- 1 | # Decimal 
vs. Integer 2 | 3 | Playing with Impala data types 4 | -------------------------------------------------------------------------------- /impala/datatypes/decimal_vs_integer/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE test_decimal 2 | (userid STRING, 3 | some_number DECIMAL); 4 | 5 | CREATE EXTERNAL TABLE test_integer 6 | (userid STRING, 7 | some_number INT) 8 | LOCATION '/user/hive/warehouse/test_decimal'; 9 | 10 | LOAD DATA INPATH 'data.txt' INTO TABLE test_decimal; 11 | INVALIDATE METADATA; 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /impala/datatypes/decimal_vs_integer/data.txt: -------------------------------------------------------------------------------- 1 | bob 10 2 | bob 10.00 3 | bob 10.0 4 | nate .11 5 | -------------------------------------------------------------------------------- /impala/datatypes/decimal_vs_integer/run.sh: -------------------------------------------------------------------------------- 1 | IMPALAD=$1 2 | hdfs dfs -put data.txt 3 | impala-shell -i $IMPALAD -f ./create_table.sql 4 | impala-shell -i $IMPALAD -f ./run_queries.sql 5 | -------------------------------------------------------------------------------- /impala/datatypes/decimal_vs_integer/run_queries.sql: -------------------------------------------------------------------------------- 1 | SELECT some_number FROM test_decimal; 2 | SELECT some_number FROM test_integer; 3 | -------------------------------------------------------------------------------- /impala/dyn_test/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Partition Insert Tests 2 | 3 | Test whether existing partitions are affected by dynamic 4 | partition INSERT OVERWRITE statements.
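The heart of the test is a dynamic-partition INSERT OVERWRITE, roughly
(see dyn_part.sql plus run_me_hive.sh / run_me_impala.sh for the full sequence):

    INSERT OVERWRITE TABLE branch_totals
    PARTITION(year)
    SELECT branch, total, year
    FROM monday_totals;

It is run once with Monday's data and once with Tuesday's; the question is
whether the year=2015 partition (present only in Monday's load) survives the
second INSERT OVERWRITE.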
5 | -------------------------------------------------------------------------------- /impala/dyn_test/branch_totals_monday.txt: -------------------------------------------------------------------------------- 1 | 2015 branch1 100 2 | 2016 branch2 100 3 | -------------------------------------------------------------------------------- /impala/dyn_test/branch_totals_tuesday.txt: -------------------------------------------------------------------------------- 1 | 2016 branch2 200 2 | 2017 branch3 1000 3 | -------------------------------------------------------------------------------- /impala/dyn_test/dyn_part.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS branch_totals; 2 | DROP TABLE IF EXISTS monday_totals; 3 | DROP TABLE IF EXISTS tuesday_totals; 4 | 5 | CREATE TABLE monday_totals( 6 | year INT, 7 | branch STRING, 8 | total INT 9 | ) 10 | ROW FORMAT DELIMITED 11 | FIELDS TERMINATED BY '\t'; 12 | 13 | LOAD DATA INPATH '/loudacre/branch_totals_monday.txt' 14 | INTO TABLE monday_totals; 15 | 16 | CREATE TABLE tuesday_totals LIKE monday_totals; 17 | 18 | LOAD DATA INPATH '/loudacre/branch_totals_tuesday.txt' 19 | INTO TABLE tuesday_totals; 20 | 21 | CREATE TABLE branch_totals( 22 | branch STRING, 23 | total INT) 24 | PARTITIONED BY (year INT) 25 | STORED AS PARQUET; 26 | 27 | INSERT OVERWRITE TABLE branch_totals 28 | PARTITION(year) 29 | SELECT branch, total, year 30 | FROM monday_totals; 31 | 32 | SELECT year, branch, total 33 | FROM branch_totals 34 | ORDER BY year; 35 | 36 | INSERT OVERWRITE TABLE branch_totals 37 | PARTITION(year) 38 | SELECT branch, total, year 39 | FROM tuesday_totals; 40 | 41 | SELECT year, branch, total 42 | FROM branch_totals 43 | ORDER BY year; 44 | -------------------------------------------------------------------------------- /impala/dyn_test/run_me_hive.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | hdfs dfs -mkdir -p /loudacre 3 | hdfs dfs -put -f branch_totals_monday.txt /loudacre 4 | hdfs dfs -put -f branch_totals_tuesday.txt /loudacre 5 | beeline -u jdbc:hive2://localhost:10000 \ 6 | -f dyn_part.sql \ 7 | --silent=true \ 8 | --hiveconf hive.exec.dynamic.partition=true \ 9 | --hiveconf hive.exec.dynamic.partition.mode=nonstrict 10 | 11 | 12 | -------------------------------------------------------------------------------- /impala/dyn_test/run_me_impala.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | hdfs dfs -mkdir -p /loudacre 3 | hdfs dfs -put -f branch_totals_monday.txt /loudacre 4 | hdfs dfs -put -f branch_totals_tuesday.txt /loudacre 5 | impala-shell -f dyn_part.sql --quiet 6 | -------------------------------------------------------------------------------- /impala/file_format_shootout/README.TXT: -------------------------------------------------------------------------------- 1 | There should be a database/table tpcds_sample.store_sales 2 | -------------------------------------------------------------------------------- /impala/file_format_shootout/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Simply a script to run some counts, selects on the various tables 3 | IMPALAD=$1 4 | if [[ -z "$IMPALAD" ]]; then 5 | echo "Usage $0 " 6 | exit 1 7 | fi 8 | 9 | impala-shell --impalad $IMPALAD -q "INVALIDATE METADATA;" 10 | 11 | for tbl in seq_store_sales parquet_store_sales rc_store_sales store_sales 12 | do 13 | 
QUERY=$(cat < substr(s_zip, 1, 5) 24 | and ss_store_sk = s_store_sk 25 | and ss_sold_date_sk between 2451484 and 2451513 -- partition key filter 26 | group by 27 | i_brand, 28 | i_brand_id, 29 | i_manufact_id, 30 | i_manufact 31 | order by 32 | ext_price desc, 33 | i_brand, 34 | i_brand_id, 35 | i_manufact_id, 36 | i_manufact 37 | limit 100; 38 | -- end query 1 in stream 0 using template query19.tpl 39 | -------------------------------------------------------------------------------- /impala/file_format_shootout/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | IMPALAD=$1 4 | test -z "$IMPALAD" && { 5 | echo "Usage: $0 " 6 | exit 1 7 | } 8 | 9 | echo "This will drop and recreate parquet, sequencefile, rcfile data!. Press ENTER to continue, Ctrl-C to cancel" 10 | read GOAHEAD 11 | 12 | # Display commands before being run 13 | set -x 14 | 15 | # Zap tables prior to running. There should be a tpcds_sample database with store_sales table in it. 16 | hive -f ./drop_tables.sql 17 | # First, create the parquet table! 18 | impala-shell --impalad $IMPALAD --refresh_after_connect -f ./create_and_populate_parquet_table.sql 19 | 20 | # Then, use Impala to cheat and easily create/define the RC Table Definition 21 | # I haven't found out how to use the CREATE TABLE LIKE in Hive with RCFilez 22 | impala-shell --impalad $IMPALAD --refresh_after_connect -f ./create_rc_and_sequencefile_table.sql 23 | 24 | # Now, Use Hive to populate RC and SequenceFile Tables. Impala can't do that yet. 25 | # http://www.cloudera.com/content/cloudera-content/cloudera-docs/Impala/latest/Installing-and-Using-Impala/ciiu_file_formats.html 26 | hive -f ./populate_rc_and_sequencefile_table.sql 27 | -------------------------------------------------------------------------------- /impala/file_format_shootout/run_q_19.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | IMPALAD=$1 3 | DB=$2 4 | if [[ -z "$IMPALAD" ]]; then 5 | echo "Usage $0 " 6 | exit 1 7 | fi 8 | if [[ -z "$DB" ]]; then 9 | echo "Usage $0 " 10 | exit 1 11 | fi 12 | 13 | impala-shell --database=$DB --impalad=$IMPALAD -q "alter table big_tpcds_parquet.store_sales set cached in 'four_gig_pool';" 14 | # impala-shell --database=$DB --impalad=$IMPALAD -q "COMPUTE STATS store_sales;" 15 | impala-shell --database=$DB --impalad=$IMPALAD --query_file=./q19.sql 16 | # Don't run this vvvvvvvvvv 17 | # impala-shell --big_tpcds --impalad $IMPALAD -q "INVALIDATE METADATA;" 18 | -------------------------------------------------------------------------------- /impala/google-ngrams/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | Impala script to search through some google-ngrams data from a *very* small 4 | subset of http://storage.googleapis.com/books/ngrams/books/datasetsv2.html 5 | -------------------------------------------------------------------------------- /impala/google-ngrams/count_spark.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE IF NOT EXISTS 2 | google_ngrams(line STRING) 3 | LOCATION '/google-ngrams'; 4 | 5 | REFRESH google_ngrams; 6 | 7 | SELECT COUNT(line) 8 | FROM google_ngrams 9 | WHERE line LIKE "%spark%"; 10 | -------------------------------------------------------------------------------- /impala/google-ngrams/find_spark.sql: -------------------------------------------------------------------------------- 1 | 
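-- Count google-ngrams lines that mention "spark" (currently the same query as count_spark.sql).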
CREATE EXTERNAL TABLE IF NOT EXISTS 2 | google_ngrams(line STRING) 3 | LOCATION '/google-ngrams'; 4 | 5 | REFRESH google_ngrams; 6 | 7 | SELECT COUNT(line) 8 | FROM google_ngrams 9 | WHERE line LIKE "%spark%"; 10 | -------------------------------------------------------------------------------- /impala/google-ngrams/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ -z "$1" ]]; then 3 | echo "Usage: $0 " 4 | exit 1 5 | fi 6 | 7 | impala-shell -i $1 -f find_spark.sql 8 | -------------------------------------------------------------------------------- /impala/impala-impyla-playground/README.TXT: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | impala-impyla-playground is a set of loose scripts to use python's dynamic 4 | language features to easily write "unit test" for queries written in Impala 5 | -------------------------------------------------------------------------------- /impala/impala-impyla-playground/data/simple.txt: -------------------------------------------------------------------------------- 1 | 1 Nate 2 | 1 Nate 3 | 1 Nate 4 | 1 Nate 5 | 1 Nate 6 | 1 Nate 7 | 1 Nate 8 | 1 Nate 9 | 1 Nate 10 | 1 Nate 11 | 1 Nate 12 | 1 Nate 13 | 1 Nate 14 | 1 Nate 15 | 1 Nate 16 | 1 Nate 17 | 1 Nate 18 | 1 Nate 19 | 1 Nate 20 | 1 Nate 21 | 1 Nate 22 | 1 Nate 23 | 1 Nate 24 | 1 Nate 25 | 1 Nate 26 | 1 Nate 27 | 1 Nate 28 | 1 Nate 29 | 1 Nate 30 | 1 Nate 31 | 1 Nate 32 | 1 Nate 33 | 1 Nate 34 | 1 Nate 35 | 1 Nate 36 | 1 Nate 37 | 1 Nate 38 | 1 Nate 39 | 1 Nate 40 | 1 Nate 41 | 42 | -------------------------------------------------------------------------------- /impala/impala-impyla-playground/simple.py: -------------------------------------------------------------------------------- 1 | from impala.dbapi import connect 2 | conn = connect(host='localhost', port=21050) 3 | cur = conn.cursor() 4 | cur.execute(""" 5 | CREATE EXTERNAL TABLE IF NOT EXISTS simple(id INT, name STRING) 6 | ROW FORMAT DELIMITED 7 | FIELDS TERMINATED BY '\t' 8 | STORED AS TEXTFILE 9 | LOCATION '/user/cloudera/tables/simple'; 10 | """) 11 | 12 | cur.execute("SELECT * FROM simple"); 13 | results = cur.fetchall() 14 | assert 41 == len(results); 15 | 16 | 17 | -------------------------------------------------------------------------------- /impala/impyla/query_impala.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | import sys 3 | from impala.dbapi import connect 4 | 5 | which_db = "tpcds" 6 | impalad = "" 7 | 8 | if len(sys.argv) > 1: 9 | impalad = sys.argv[1] 10 | else: 11 | print "Usage: query_impyla.py " 12 | exit(1) 13 | 14 | def message(m): 15 | print "-" * 20 16 | print m 17 | 18 | def show_tables(db): 19 | cur.execute('SHOW TABLES IN %s' % db) 20 | tables = cur.fetchall() 21 | message("The tables in the %s database are: " % db) 22 | print tables 23 | 24 | def top_five_customers(db): 25 | # This method is obviously database dependent and assumes the TPCDS-DB 26 | 27 | cur.execute("DESCRIBE %s.customer" % db) 28 | 29 | message("Showing customer schema") 30 | print "Fields in customer are:" 31 | for fieldz in cur.fetchall(): 32 | print "%-25s %-25s" % (fieldz[0], fieldz[1]) 33 | 34 | message("Customer Data") 35 | cur.execute("SELECT c_last_name, c_first_name FROM %s.customer WHERE c_last_name IS NOT NULL ORDER BY c_last_name DESC LIMIT 50" % db) 36 | customers = cur.fetchall() 37 | 38 | print "%-25s %-25s\n%s" % ("Last Name", "First Name", "-" * 
50) 39 | for c in customers: 40 | print "%-25s %-25s" % c 41 | 42 | 43 | conn = connect(host=impalad, port = 21050) 44 | cur = conn.cursor() 45 | 46 | show_tables(which_db) 47 | 48 | top_five_customers(which_db) 49 | 50 | -------------------------------------------------------------------------------- /impala/parquet/README.txt: -------------------------------------------------------------------------------- 1 | Query: select max(ss_coupon_amt) FROM tpcds.store_sales LIMIT 10 2 | +--------------------+ 3 | | max(ss_coupon_amt) | 4 | +--------------------+ 5 | | 19225 | 6 | +--------------------+ 7 | Returned 1 row(s) in 467.99s 8 | 9 | real 7m48.449s 10 | user 0m0.835s 11 | sys 0m0.230s 12 | 13 | -- Parquet 14 | Query: select max(ss_coupon_amt) FROM tpcds.parquet_store_sales LIMIT 10 15 | +--------------------+ 16 | | max(ss_coupon_amt) | 17 | +--------------------+ 18 | | 19225 | 19 | +--------------------+ 20 | Returned 1 row(s) in 7.44s 21 | real 0m7.869s 22 | user 0m0.430s 23 | sys 0m0.057s 24 | -------------------------------------------------------------------------------- /impala/parquet/run.sh: -------------------------------------------------------------------------------- 1 | IMPALAD=$1 2 | if [[ -z "$IMPALAD" ]]; then 3 | echo "Usage: run.sh " 4 | exit 1 5 | fi 6 | echo "Running against Parquet table......" 7 | time impala-shell --impalad $IMPALAD -q "select ss_coupon_amt FROM tpcds.parquet_store_sales WHERE ss_coupon_amt IS NOT NULL ORDER BY ss_coupon_amt DESC LIMIT 10;" 8 | echo "Running against Text table......" 9 | time impala-shell --impalad $IMPALAD -q "select ss_coupon_amt FROM tpcds.store_sales WHERE ss_coupon_amt IS NOT NULL ORDER BY ss_coupon_amt DESC LIMIT 10;" 10 | -------------------------------------------------------------------------------- /impala/refresh-and-invalidate/create-table.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS hadoop_examples; 2 | USE hadoop_examples; 3 | DROP TABLE IF EXISTS refresh_test; 4 | CREATE TABLE IF NOT EXISTS refresh_test(id INT); 5 | -------------------------------------------------------------------------------- /impala/refresh-and-invalidate/monday.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | -------------------------------------------------------------------------------- /impala/refresh-and-invalidate/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | IMPALAD1=$1 4 | IMPALAD2=$2 5 | DBNAME=hadoop_examples 6 | TBLNAME=refresh_test 7 | 8 | if [[ -z "$IMPALAD1" || -z "$IMPALAD2" ]]; then 9 | echo "run.sh " 10 | exit 1 11 | fi 12 | 13 | # Using Hive, otherwise Impala doesn't drop existing files :-O 14 | hive -S -f ./create-table.sql 15 | hive -S -e "LOAD DATA LOCAL INPATH 'monday.txt' INTO TABLE $DBNAME.$TBLNAME" 16 | 17 | echo "Issuing query to $IMPALAD1 for # of rows" 18 | impala-shell --quiet --impalad $IMPALAD1 -q "SELECT COUNT(*) FROM $DBNAME.$TBLNAME" 19 | 20 | echo "Issuing query to $IMPALAD2 for # of rows" 21 | impala-shell --quiet --impalad $IMPALAD2 -q "SELECT COUNT(*) FROM $DBNAME.$TBLNAME" 22 | 23 | echo "Loading more data, but NO refresh" 24 | hive -S -e "LOAD DATA LOCAL INPATH 'tuesday.txt' INTO TABLE $DBNAME.$TBLNAME" 25 | 26 | echo "Issuing query to $IMPALAD1 for # of rows, should still see only 10" 27 | impala-shell --quiet --impalad $IMPALAD1 -q "SELECT COUNT(*) FROM 
$DBNAME.$TBLNAME" 28 | 29 | echo "Issuing REFRESH to $IMPALAD1 for # of rows, should now see 20" 30 | impala-shell --quiet --impalad $IMPALAD1 -q "REFRESH $DBNAME.$TBLNAME;SELECT COUNT(*) FROM $DBNAME.$TBLNAME" 31 | 32 | echo "Issuing count query to $IMPALAD2, withOUT refresh, should see 20 due to catalog server caching" 33 | impala-shell --quiet --impalad $IMPALAD2 -q "SELECT COUNT(*) FROM $DBNAME.$TBLNAME" 34 | -------------------------------------------------------------------------------- /impala/refresh-and-invalidate/tuesday.txt: -------------------------------------------------------------------------------- 1 | 11 2 | 12 3 | 13 4 | 14 5 | 15 6 | 16 7 | 17 8 | 18 9 | 19 10 | 20 11 | -------------------------------------------------------------------------------- /impala/refresh-and-invalidate/wednesday.txt: -------------------------------------------------------------------------------- 1 | 21 2 | 22 3 | 23 4 | 24 5 | 25 6 | 26 7 | 27 8 | 28 9 | 29 10 | 30 11 | -------------------------------------------------------------------------------- /impala/simple_queries/create_tables.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS he; 2 | CREATE TABLE IF NOT EXISTS he.customers( 3 | cust_id STRING, 4 | first_name STRING) 5 | ROW FORMAT DELIMITED 6 | FIELDS TERMINATED BY '\t'; 7 | 8 | CREATE TABLE IF NOT EXISTS he.orders( 9 | order_id INT, 10 | cust_id STRING, 11 | first_name STRING, 12 | order_date STRING) 13 | ROW FORMAT DELIMITED 14 | FIELDS TERMINATED BY '\t'; 15 | -------------------------------------------------------------------------------- /impala/simple_queries/customers.txt: -------------------------------------------------------------------------------- 1 | nate nate 2 | bob bob 3 | steve steve 4 | carl carl 5 | sandy sandy 6 | tom tom 7 | rip rip 8 | zip zip 9 | -------------------------------------------------------------------------------- /impala/simple_queries/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | impala-shell -f create_tables.sql 4 | hdfs dfs -put -f customers.txt /user/training 5 | hdfs dfs -put -f orders.txt /user/training 6 | impala-shell -q "LOAD DATA INPATH '/user/training/customers.txt' OVERWRITE INTO TABLE he.customers" 7 | impala-shell -q "LOAD DATA INPATH '/user/training/orders.txt' OVERWRITE INTO TABLE he.orders" 8 | -------------------------------------------------------------------------------- /impala/simple_queries/orders.txt: -------------------------------------------------------------------------------- 1 | 1 nate product1 2011-01-01 2 | 2 bob product1 2011-01-01 3 | 3 steve product1 2011-01-01 4 | 4 carl product1 2011-01-01 5 | 5 sandy product1 2011-01-01 6 | 6 tom product1 2011-01-01 7 | 6 rip product1 2011-01-01 8 | 7 rip product1 2012-01-01 9 | -------------------------------------------------------------------------------- /impala/simple_queries/subquery_in_where.sql: -------------------------------------------------------------------------------- 1 | SELECT c.cust_id FROM he.customers c 2 | WHERE cust_id IN 3 | (SELECT o.cust_id FROM he.orders o 4 | WHERE YEAR(o.order_date) = 2012); 5 | 6 | SELECT c.cust_id 7 | FROM he.customers c 8 | LEFT SEMI JOIN he.orders o 9 | ON (c.cust_id = o.cust_id 10 | AND YEAR(o.order_date) = 2012); 11 | -------------------------------------------------------------------------------- /impala/timestamps/README.md: 
-------------------------------------------------------------------------------- 1 | # Timestamps 2 | 3 | Example "queries" are from 4 | 5 | http://www.cloudera.com/content/cloudera/en/documentation/cloudera-impala/latest/topics/impala_timestamp.html#timestamp 6 | -------------------------------------------------------------------------------- /impala/timestamps/queries.sql: -------------------------------------------------------------------------------- 1 | select cast('1966-07-30' as timestamp); 2 | select cast('1985-09-25 17:45:30.005' as timestamp); 3 | select cast('08:30:00' as timestamp); 4 | select hour('1970-01-01 15:30:00'); -- Succeeds, returns 15. 5 | select hour('1970-01-01 15:30'); -- Returns NULL because seconds field required. 6 | select hour('1970-01-01 27:30:00'); -- Returns NULL because hour value out of range. 7 | select dayofweek('2004-06-13'); -- Returns 1, representing Sunday. 8 | select dayname('2004-06-13'); -- Returns 'Sunday'. 9 | select date_add('2004-06-13', 365); -- Returns 2005-06-13 with zeros for hh:mm:ss fields. 10 | select day('2004-06-13'); -- Returns 13. 11 | select datediff('1989-12-31','1984-09-01'); -- How many days between these 2 dates? 12 | select now(); -- Returns current date and time in local timezone. 13 | -------------------------------------------------------------------------------- /impala/timestamps/querying_timestamps.sql: -------------------------------------------------------------------------------- 1 | create table if not exists dates_and_times (actual_value_as_string STRING, t timestamp); 2 | 3 | insert into dates_and_times values 4 | ('1966-07-30', '1966-07-30'), 5 | ('1985-09-25 17:45:30.005', '1985-09-25 17:45:30.005'), 6 | ('08:30:00', '08:30:00'), 7 | (CAST(now() AS STRING), now()); 8 | 9 | select actual_value_as_string, hour(t) 10 | from dates_and_times 11 | order by actual_value_as_string; 12 | -------------------------------------------------------------------------------- /impala/tpcds/frequent_customers.sql: -------------------------------------------------------------------------------- 1 | -- For kicks, try store_sales versus parquet_store_sales 2 | -- For smaller data size, use tpcds_sample database 3 | USE tpcds_parquet; 4 | SELECT ss_customer_sk, 5 | COUNT(*) AS num_purchases, 6 | SUM(ss_net_profit) AS total_profit 7 | FROM store_sales 8 | GROUP BY ss_customer_sk 9 | ORDER BY num_purchases DESC 10 | LIMIT 100; 11 | -------------------------------------------------------------------------------- /impala/tpcds/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 100GB dataset! 
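# NOTE: frequent_customers.sql selects its own database (tpcds_parquet); the TABLE variable below is informational only.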
3 | # 15 seconds with Parquet, versus 480 seconds with plain text 4 | IMPALAD=$1 5 | TABLE=tpcds.parquet_store_sales 6 | if [[ -z "$IMPALAD" ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | impala-shell --impalad $IMPALAD -f ./frequent_customers.sql 12 | -------------------------------------------------------------------------------- /impala/tuning/compare_store_sales.sql: -------------------------------------------------------------------------------- 1 | SELECT count(*) FROM big_tpcds_parquet.store_sales; 2 | 3 | SUMMARY; 4 | PROFILE; 5 | 6 | SELECT count(*) FROM big_tpcds.store_sales; 7 | 8 | SUMMARY; 9 | PROFILE; 10 | 11 | -------------------------------------------------------------------------------- /impala/tuning/show_summary.sql: -------------------------------------------------------------------------------- 1 | /* You can run SUMMARY right after a query to get overall stats 2 | * and PROFILE to get details 3 | */ 4 | 5 | SELECT count(*) FROM shakespeare; 6 | 7 | SUMMARY; 8 | 9 | PROFILE; 10 | 11 | -------------------------------------------------------------------------------- /kafka-examples/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | he-kafka-examples.iml 3 | runme.sh 4 | -------------------------------------------------------------------------------- /kafka-examples/README.md: -------------------------------------------------------------------------------- 1 | # Running these 2 | 3 | I like the Maven exec plugin: 4 | 5 | mvn exec:java -Dexec.mainClass="com.cloudera.kafkaexamples.SimpleProducer" 6 | 7 | Also kinda cool to override the log4j properties at runtime: 8 | 9 | mvn exec:java \ 10 | -Dexec.mainClass="com.cloudera.kafkaexamples.SimpleProducer" \ 11 | -Dlog4j.configuration="file:/full/path/to/THIS_IS_COOL.properties" 12 | 13 | ## Running the From-Beginning Example 14 | 15 | You need to supply TOPIC and BOOTSTRAP_SERVERS environment variable(s) 16 | 17 | export TOPIC=customers 18 | export BOOTSTRAP_SERVERS= 19 | mvn exec:java \ 20 | -Dexec.mainClass="com.cloudera.kafkaexamples.SimpleConsumer" \ 21 | -Dexec.args="--from-beginning --group-id foogroup --bootstrap-server $BOOTSTRAP_SERVERS --topic $TOPIC" \ 22 | -Dlog4j.configuration="file:./log4jConfigs/seekToBeginning.properties" 23 | -------------------------------------------------------------------------------- /kafka-examples/THIS_IS_COOL.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=TRACE, stdout, fileAppender 2 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 4 | log4j.appender.stdout.layout.ConversionPattern=[%d] %p %m thread %X (%c)%n 5 | 6 | log4j.appender.fileAppender=org.apache.log4j.FileAppender 7 | log4j.appender.fileAppender.File=kafka-request.log 8 | log4j.appender.fileAppender.Append=False 9 | 10 | log4j.appender.fileAppender.layout=org.apache.log4j.EnhancedPatternLayout 11 | log4j.appender.fileAppender.layout.ConversionPattern= %-4r [%t] %-5p %c - %m%n 12 | 13 | 14 | 15 | # Turn on all our debugging info 16 | #log4j.logger.kafka=TRACE,fileAppender 17 | #log4j.logger.kafka.producer.async.DefaultEventHandler=DEBUG,stdout 18 | #log4j.logger.kafka.consumer.PartitionTopicInfo=TRACE,stdout 19 | #log4j.logger.kafka.request.logger=TRACE,fileAppender 20 | #log4j.additivity.kafka.request.logger=false 21 | #log4j.logger.kafka.network.Processor=TRACE,fileAppender 22 | 
#log4j.additivity.kafka.network.Processor=false 23 | #log4j.logger.org.I0Itec.zkclient.ZkClient=DEBUG 24 | -------------------------------------------------------------------------------- /kafka-examples/log4jConfigs/seekToBeginning.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, stdout, fileAppender 2 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.layout=org.apache.log4j.EnhancedPatternLayout 4 | log4j.appender.stdout.layout.ConversionPattern= %d{HH:mm:ss,SSS} %-5p %c - %m%n 5 | 6 | log4j.appender.fileAppender=org.apache.log4j.FileAppender 7 | log4j.appender.fileAppender.File=kafka-request.log 8 | log4j.appender.fileAppender.Append=False 9 | 10 | log4j.appender.fileAppender.layout=org.apache.log4j.EnhancedPatternLayout 11 | # http://logging.apache.org/log4j/1.2/apidocs/org/apache/log4j/EnhancedPatternLayout.html 12 | # Date, priority, class, message and linefeed. 13 | log4j.appender.fileAppender.layout.ConversionPattern= %d{HH:mm:ss,SSS} %-5p %c - %m%n 14 | 15 | # Turn on all our debugging info 16 | log4j.logger.com.cloudera.kafkaexamples=DEBUG,fileAppender,stdout 17 | -------------------------------------------------------------------------------- /kafka-examples/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=TRACE, fileAppender 2 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 4 | log4j.appender.stdout.layout.ConversionPattern=[%d] %p %m (%c)%n 5 | 6 | log4j.appender.fileAppender=org.apache.log4j.FileAppender 7 | log4j.appender.fileAppender.File=kafka-request.log 8 | log4j.appender.fileAppender.Append=False 9 | 10 | log4j.appender.fileAppender.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.fileAppender.layout.ConversionPattern= %-4r [%t] %-5p %c %x - %m%n 12 | 13 | 14 | # Turn on all our debugging info 15 | #log4j.logger.kafka=TRACE,fileAppender 16 | #log4j.logger.kafka.producer.async.DefaultEventHandler=DEBUG,stdout 17 | #log4j.logger.kafka.consumer.PartitionTopicInfo=TRACE,stdout 18 | #log4j.logger.kafka.request.logger=TRACE,fileAppender 19 | #log4j.additivity.kafka.request.logger=false 20 | #log4j.logger.kafka.network.Processor=TRACE,fileAppender 21 | #log4j.additivity.kafka.network.Processor=false 22 | #log4j.logger.org.I0Itec.zkclient.ZkClient=DEBUG -------------------------------------------------------------------------------- /kite-sdk/README.md: -------------------------------------------------------------------------------- 1 | # Kite SDK Examples 2 | 3 | Playing with Kite SDK: http://kitesdk.org 4 | 5 | # Kite CLI 6 | 7 | http://kitesdk.org/docs/0.18.0/Install-Kite.html 8 | 9 | See install-kite-cli.sh 10 | -------------------------------------------------------------------------------- /kite-sdk/install-kite-cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Download kite-dataset commandline executable 3 | SOME_DIR_ON_PATH=~/bin 4 | cd $SOME_DIR_ON_PATH 5 | curl http://central.maven.org/maven2/org/kitesdk/kite-tools/0.18.0/kite-tools-0.18.0-binary.jar -o $SOME_DIR_ON_PATH/kite-dataset 6 | chmod +x $SOME_DIR_ON_PATH/kite-dataset 7 | -------------------------------------------------------------------------------- /kite-sdk/simple-cli/README.md: -------------------------------------------------------------------------------- 1 | Using 
HBASE_HOME=/opt/cloudera/parcels/CDH/lib/hadoop/../hbase 2 | 3 | Needed to export HIVE_HOME=/opt/cloudera/parcels/CDH/lib/hadoop/../hive 4 | -------------------------------------------------------------------------------- /kite-sdk/simple-cli/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export HIVE_HOME=/opt/cloudera/parcels/CDH/lib/hadoop/../hive 3 | debug=true kite-dataset -v create sandwiches -s sandwich.avsc 4 | -------------------------------------------------------------------------------- /kite-sdk/simple-cli/sandwich.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "type" : "record", 3 | "name" : "Sandwich", 4 | "doc" : "Schema generated by Kite", 5 | "fields" : [ { 6 | "name" : "name", 7 | "type" : [ "null", "string" ], 8 | "doc" : "Type inferred from 'Reuben'" 9 | }, { 10 | "name" : "description", 11 | "type" : [ "null", "string" ], 12 | "doc" : "Type inferred from 'Pastrami and sauerkraut on toasted rye with Russian dressing.'" 13 | } ] 14 | } -------------------------------------------------------------------------------- /kite-sdk/simple-cli/sandwiches.csv: -------------------------------------------------------------------------------- 1 | name,description 2 | Reuben,Pastrami and sauerkraut on toasted rye with Russian dressing. 3 | PBJ,Peanut butter and grape jelly on white bread. 4 | -------------------------------------------------------------------------------- /kudu/dataframes/kuduDF.scala: -------------------------------------------------------------------------------- 1 | val customersDF = spark.read.format("org.apache.kudu.spark.kudu"). 2 | option("kudu.master", "master-2:7051"). 3 | option("kudu.table", "customers"). 4 | load() 5 | 6 | customersDF.show(10) 7 | 8 | // Reverse the name 9 | val customersReversedNameDF = customersDF.withColumn("name", reverse(customersDF("name"))) 10 | 11 | customersReversedNameDF.write.format("org.apache.kudu.spark.kudu"). 12 | option("kudu.master", "master-2:7051"). 13 | option("kudu.table", "customers"). 14 | mode("append"). 15 | save() 16 | 17 | // Requery the data, m'kay? 18 | val customersDFAfterReverse = spark.read.format("org.apache.kudu.spark.kudu"). 19 | option("kudu.master", "master-2:7051"). 20 | option("kudu.table", "customers"). 21 | load() 22 | customersDFAfterReverse.show(10) 23 | -------------------------------------------------------------------------------- /kudu/range-partitioning/README.md: -------------------------------------------------------------------------------- 1 | Partitioning example 2 | 3 | Run the RUNME.sh script,then go to the Kudu tablet server, and 4 | there should be 27 tablets. 
Only 2 of the tablets should have data: 5 | 6 | * Tablet with - "a" as the bound 7 | * Tablet with "z" - as the bound 8 | 9 | Screenshot: https://www.evernote.com/l/AOKPPTreBGBGJowqPxYFX22VxNR0yGt1_QY 10 | -------------------------------------------------------------------------------- /kudu/range-partitioning/RUNME.sh: -------------------------------------------------------------------------------- 1 | hdfs dfs -mkdir -p /user/training/people 2 | hdfs dfs -rm /user/training/people/* 3 | hdfs dfs -put ./people.txt /user/training/people 4 | impala-shell -f ./create_people.sql 5 | -------------------------------------------------------------------------------- /kudu/range-partitioning/create_hashed_metrics.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS kudu_logs; 2 | DROP TABLE IF EXISTS logs; 3 | 4 | CREATE EXTERNAL TABLE logs 5 | (time BIGINT, 6 | metric STRING) 7 | ROW FORMAT DELIMITED 8 | FIELDS TERMINATED BY '\t' 9 | LOCATION '/user/training/logs'; 10 | 11 | CREATE TABLE kudu_logs 12 | DISTRIBUTE BY HASH(time) INTO 5 BUCKETS, RANGE(metric) 13 | SPLIT ROWS( 14 | ('1') 15 | ) 16 | TBLPROPERTIES( 17 | 'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler', 18 | 'kudu.table_name' = 'kudu_logs', 19 | 'kudu.master_addresses' = 'localhost:7051', 20 | 'kudu.key_columns' = 'time, metric') 21 | AS SELECT * FROM logs; 22 | -------------------------------------------------------------------------------- /kudu/range-partitioning/create_people.sql: -------------------------------------------------------------------------------- 1 | -- NOTE: First mkdir /user/training/people then 2 | -- hdfs dfs -put people.txt /user/training/people 3 | DROP TABLE IF EXISTS kudu_people; 4 | DROP TABLE IF EXISTS people; 5 | 6 | CREATE EXTERNAL TABLE people 7 | (name STRING) 8 | ROW FORMAT DELIMITED 9 | FIELDS TERMINATED BY '\t' 10 | LOCATION '/user/training/people'; 11 | 12 | CREATE TABLE kudu_people 13 | DISTRIBUTE BY RANGE(name) 14 | SPLIT ROWS( 15 | ("a"), 16 | ("b"), 17 | ("c"), 18 | ("d"), 19 | ("e"), 20 | ("f"), 21 | ("g"), 22 | ("h"), 23 | ("i"), 24 | ("j"), 25 | ("k"), 26 | ("l"), 27 | ("m"), 28 | ("n"), 29 | ("o"), 30 | ("p"), 31 | ("q"), 32 | ("r"), 33 | ("s"), 34 | ("t"), 35 | ("u"), 36 | ("v"), 37 | ("w"), 38 | ("x"), 39 | ("y"), 40 | ("z")) 41 | TBLPROPERTIES( 42 | 'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler', 43 | 'kudu.table_name' = 'kudu_people', 44 | 'kudu.master_addresses' = 'localhost:7051', 45 | 'kudu.key_columns' = 'name') 46 | AS SELECT * FROM people; 47 | -------------------------------------------------------------------------------- /kudu/range-partitioning/people.txt: -------------------------------------------------------------------------------- 1 | }this guy 2 | {curly brace dude} 3 | {{two curly brace dude} 4 | _underscore dude 5 | !someone 6 | [another person 7 | [[another person 8 | [[[another person 9 | -------------------------------------------------------------------------------- /mr/kill_job_from_mapper/README.md: -------------------------------------------------------------------------------- 1 | # Try Kill 2 | 3 | This MapReduce job shows how to kill a job from a Mapper / Reducer 4 | using the Context object. 5 | 6 | The downgrade() method was the toughest thing to find, and 7 | it helped to look through the source code for JobClient. 8 | 9 | # Setup 10 | 11 | Simply use ./run.sh to see this job kill itself from a Map. 
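For reference, here is a minimal sketch of the Context → JobClient → downgrade() path described above. It is **not** the shipped `solution.TryKill` code (see `./solution` for that); the class name and the map logic below are illustrative only.

```java
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative only -- not the actual solution.TryKill mapper.
public class KillingMapper extends Mapper<LongWritable, Text, Text, Text> {

  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    if (!value.toString().contains("100")) {
      // The new-API job ID from the Context has to be downgraded to the
      // old-API type before JobClient can look the job up and kill it.
      JobClient client = new JobClient(new JobConf(context.getConfiguration()));
      RunningJob running =
          client.getJob(org.apache.hadoop.mapred.JobID.downgrade(context.getJobID()));
      running.killJob();
      return;
    }
    context.write(new Text("ok"), value);
  }
}
```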
12 | 13 | The job uses NLineInputFormat to create 1 Mapper for each line of 14 | ./somedata.txt. The Maps all read their *one* line of massive data, 15 | and if they don't find a "100", then the Map calls "killJob". 16 | 17 | # Challenge 18 | 19 | It would be cool for the Mapper who kills the job to report itself to the 20 | master (Application Master or JobTracker). This would make finding the 21 | "offending" Map task much easier for admins by looking at the Job History log, 22 | instead of scouring through 100 Maps whose state is "FAILED". 23 | 24 | The "state" of the Maps in this job is either "SUCCEEDED" or "FAILED". 25 | It would seem that "KILLED" would be a more fitting state for the Maps that 26 | were running when the job was killed. Hmmmm. 27 | -------------------------------------------------------------------------------- /mr/kill_job_from_mapper/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | javac -cp `hadoop classpath` solution/*java 3 | -------------------------------------------------------------------------------- /mr/kill_job_from_mapper/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | hadoop fs -put -f somedata.txt 4 | rm -f solution/*.class 5 | rm -f TryKill.jar 6 | javac -cp `hadoop classpath` solution/*java 7 | jar cvf TryKill.jar solution/*.class 8 | hadoop jar TryKill.jar solution.TryKill -Dmapred.job.name="Job Kill From Mapper" somedata.txt 9 | -------------------------------------------------------------------------------- /mr/kill_job_from_mapper/somedata.txt: -------------------------------------------------------------------------------- 1 | 100 2 | 100 3 | 100 4 | 100 5 | 100 6 | 100 7 | 100 8 | CHAOS MONKEY 9 | 100 10 | 100 11 | 100 12 | -------------------------------------------------------------------------------- /mr/local_jobrunner/simple-example/.gitignore: -------------------------------------------------------------------------------- 1 | outputDir 2 | -------------------------------------------------------------------------------- /mr/local_jobrunner/simple-example/SimpleDriver.java: -------------------------------------------------------------------------------- 1 | import org.apache.hadoop.util.Tool; 2 | import org.apache.hadoop.util.ToolRunner; 3 | import org.apache.hadoop.conf.Configured; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 7 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 8 | import org.apache.hadoop.mapreduce.Job; 9 | 10 | public class SimpleDriver extends Configured implements Tool { 11 | public static void main(String[] args) throws Exception { 12 | int exitCode = ToolRunner.run(new Configuration(), new SimpleDriver(), args); 13 | } 14 | public int run (String [] args) throws Exception { 15 | if (args.length != 2) { 16 | System.out.printf( 17 | args.length + " - Usage: SimpleDriver \n"); 18 | System.exit(-1); 19 | } 20 | 21 | // Example of "new" way to instantiate Job 22 | Job job = Job.getInstance(getConf()); 23 | job.setJarByClass(SimpleDriver.class); 24 | job.setJobName("New Job constuctor example"); 25 | 26 | FileInputFormat.setInputPaths(job, new Path(args[0])); 27 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 28 | 29 | boolean success = job.waitForCompletion(true); 30 | return success ? 
0 : 1; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /mr/local_jobrunner/simple-example/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | JAR_FILE=SimpleDriver.jar 4 | rm -f *.class 5 | rm -f $JAR_FILE 6 | javac -Xlint:deprecation -cp `hadoop classpath` *java 7 | jar cvf $JAR_FILE *.class 8 | -------------------------------------------------------------------------------- /mr/local_jobrunner/simple-example/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | JAR_FILE=SimpleDriver.jar 4 | ./compile.sh || exit 1 5 | test -d outputDir && rm -rf outputDir 6 | 7 | hadoop jar $JAR_FILE SimpleDriver -fs=file:/// -jt=local -Dmapred.job.name="Simple THIS!" somedata.txt outputDir 8 | -------------------------------------------------------------------------------- /mr/local_jobrunner/simple-example/somedata.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 4A 6 | B 7 | C 8 | D 9 | 45 10 | 65 11 | 45 12 | 33 13 | 14 | -------------------------------------------------------------------------------- /mr/map_only_streaming/mapper.pl: -------------------------------------------------------------------------------- 1 | #!/bin/env perl 2 | while (<>) { 3 | $num++ 4 | } 5 | print "$num\tYep.\n"; 6 | -------------------------------------------------------------------------------- /mr/map_only_streaming/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | STREAMING_JAR=/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar 3 | MAPPER=mapper.pl 4 | OUTPUT_DIR=output/map_only_streaming 5 | INPUT=tpcds_data.dat 6 | 7 | hadoop fs -rm -R $OUTPUT_DIR 8 | 9 | hadoop jar $STREAMING_JAR \ 10 | -D mapred.map.tasks=20 \ 11 | -D mapred.reduce.tasks=0 \ 12 | -input $INPUT -output $OUTPUT_DIR \ 13 | -mapper $MAPPER -file $MAPPER 14 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | MR1a 4 | NO_M2ECLIPSE_SUPPORT: Project files created with the maven-eclipse-plugin are not supported in M2Eclipse. 5 | 6 | 7 | 8 | org.eclipse.jdt.core.javabuilder 9 | 10 | 11 | 12 | org.eclipse.jdt.core.javanature 13 | 14 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | #Mon Oct 22 09:22:55 CEST 2012 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 3 | eclipse.preferences.version=1 4 | org.eclipse.jdt.core.compiler.source=1.6 5 | org.eclipse.jdt.core.compiler.compliance=1.6 6 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/README.txt: -------------------------------------------------------------------------------- 1 | Project Template for CDH4.2 Maven based projects. 
2 | 3 | 4 | 5 | mvn clean 6 | mvn compile 7 | mvn test 8 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/TUTORIAL/Maven and CDH4.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/maven_project_template_CDH4/TUTORIAL/Maven and CDH4.odt -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/src/main/java/CDHTRAINING/App.java: -------------------------------------------------------------------------------- 1 | package CDHTRAINING; 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" ); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/src/main/java/SumReducer.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.Iterator; 3 | 4 | import org.apache.hadoop.io.IntWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapred.OutputCollector; 7 | import org.apache.hadoop.mapred.MapReduceBase; 8 | import org.apache.hadoop.mapred.Reducer; 9 | import org.apache.hadoop.mapred.Reporter; 10 | 11 | public class SumReducer extends MapReduceBase implements 12 | Reducer { 13 | 14 | @Override 15 | public void reduce(Text key, Iterator values, 16 | OutputCollector output, Reporter reporter) 17 | throws IOException { 18 | 19 | int wordCount = 0; 20 | while (values.hasNext()) { 21 | IntWritable value = values.next(); 22 | wordCount += value.get(); 23 | } 24 | output.collect(key, new IntWritable(wordCount)); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/src/main/java/WordMapper.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapred.MapReduceBase; 7 | import org.apache.hadoop.mapred.Mapper; 8 | import org.apache.hadoop.mapred.OutputCollector; 9 | import org.apache.hadoop.mapred.Reporter; 10 | 11 | public class WordMapper extends MapReduceBase implements 12 | Mapper { 13 | 14 | @Override 15 | public void map(LongWritable key, Text value, 16 | OutputCollector output, Reporter reporter) 17 | throws IOException { 18 | String s = value.toString(); 19 | for (String word : s.split("\\W+")) { 20 | if (word.length() > 0) { 21 | output.collect(new Text(word), new IntWritable(1)); 22 | } 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/src/test/java/CDHTRAINING/AppTest.java: -------------------------------------------------------------------------------- 1 | package CDHTRAINING; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/classes/CDHTRAINING/App.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/maven_project_template_CDH4/target/classes/CDHTRAINING/App.class -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/classes/SumReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/maven_project_template_CDH4/target/classes/SumReducer.class -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/classes/WordMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/maven_project_template_CDH4/target/classes/WordMapper.class -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/surefire-reports/CDHTRAINING.AppTest.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | Test set: CDHTRAINING.AppTest 3 | ------------------------------------------------------------------------------- 4 | Tests run: 1, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 0.012 sec 5 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/surefire-reports/TestWordCount.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | Test set: TestWordCount 3 | ------------------------------------------------------------------------------- 4 | Tests run: 3, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 1.584 sec 5 | -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/test-classes/CDHTRAINING/AppTest.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/maven_project_template_CDH4/target/test-classes/CDHTRAINING/AppTest.class -------------------------------------------------------------------------------- /mr/maven_project_template_CDH4/target/test-classes/TestWordCount.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/maven_project_template_CDH4/target/test-classes/TestWordCount.class -------------------------------------------------------------------------------- /mr/nlineinputformat/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | This example shows NLineInputFormat to take records from 4 | task_list.txt and simply print them using ./mapper.pl 5 | 6 | It shows the NLineInputFormat can be used to create X mappers 7 | per line -- in this example, there's 1 mapper for each line of the 8 | input file. 9 | 10 | -------------------------------------------------------------------------------- /mr/nlineinputformat/generate_task_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | if len(sys.argv) > 1: 5 | num_lines = int(sys.argv[1]) 6 | else: 7 | num_lines = 1000 8 | 9 | for i in range(0, num_lines): 10 | print i 11 | -------------------------------------------------------------------------------- /mr/nlineinputformat/mapper.pl: -------------------------------------------------------------------------------- 1 | #!/bin/env perl 2 | while (<>) { 3 | print $_; 4 | } 5 | -------------------------------------------------------------------------------- /mr/nlineinputformat/task_list.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | 0 12 | 0 13 | 1 14 | 1 15 | 1 16 | 2 17 | 3 18 | 4 19 | 5 20 | 6 21 | 7 22 | 8 23 | 9 24 | 0 25 | 1 26 | 2 27 | 2 28 | 3 29 | 4 30 | 5 31 | 6 32 | 7 33 | 8 34 | 9 35 | 0 36 | 1 37 | 2 38 | 2 39 | 3 40 | 4 41 | 5 42 | 6 43 | 7 44 | 8 45 | 9 46 | 0 47 | 1 48 | 2 49 | -------------------------------------------------------------------------------- /mr/rest_api/basic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # http://hadoop.apache.org/docs/r2.3.0/hadoop-yarn/hadoop-yarn-site/WebServicesIntro.html#URIs 3 | RESOURCE_MGR=$1 4 | test -z "$1" && { 5 | echo "Usage: $0 " 6 | exit 1 7 | } 8 | curl --compressed -H "Accept: application/json" -X GET "http://$RESOURCE_MGR:8088/ws/v1/cluster" 9 | -------------------------------------------------------------------------------- /mr/streaming_config_dumper/mapper.pl: -------------------------------------------------------------------------------- 1 | #!/bin/env perl 2 | # Just print this mapper's env vars 3 | while(<>) {} # <--- this is weird, the scripts won't finish (They'll crash with "Broken Pipe" errors unless you close STDIN explicitly 4 | # or use a while(<>) {} 5 | foreach $key(keys(%ENV)) { 6 | print $key, "\t", $ENV{$key}, "\n"; 7 | } 8 | print STDERR "I've fallen and can't get up! 
+ $ENV{map_input_file}\n"; 9 | -------------------------------------------------------------------------------- /mr/streaming_config_dumper/reducer.pl: -------------------------------------------------------------------------------- 1 | #!/bin/env perl 2 | # Get all key value pairs, and just concatenate them together 3 | use strict; 4 | use warnings; 5 | my $curr_key; 6 | my $prev_key; 7 | my $curr_val; 8 | my $val; 9 | 10 | while(<>) { 11 | ($curr_key, $curr_val) = split /\t/; 12 | if ($prev_key && ($curr_key ne $prev_key)) { 13 | print $prev_key, "\t", $val, "\n"; 14 | $prev_key = $curr_key; 15 | $val = $curr_val; 16 | } 17 | else { 18 | $prev_key = $curr_key; 19 | $val .= " $curr_val"; 20 | } 21 | } 22 | if ($curr_key) { 23 | print $curr_key, "\t", $val, "\n"; 24 | } 25 | 26 | -------------------------------------------------------------------------------- /mr/streaming_config_dumper/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Purpose of this is to collect the ENV stuff from all map tasks 3 | # shows a lot of the cool stuff that Hadoop gives to streaming tasks 4 | MAPPER=mapper.pl 5 | REDUCER=reducer.pl 6 | OUTPUT_DIR=output/nothing 7 | INPUT=something.txt 8 | 9 | hadoop fs -test -e /user/training/something.txt || hadoop fs -put something.txt 10 | hadoop fs -rm -R $OUTPUT_DIR 11 | 12 | hadoop jar /usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-*.jar \ 13 | -D com.example.something=hello \ 14 | -input $INPUT -output $OUTPUT_DIR \ 15 | -mapper $MAPPER -file $MAPPER \ 16 | -reducer $REDUCER -file $REDUCER 17 | -------------------------------------------------------------------------------- /mr/streaming_config_dumper/something.txt: -------------------------------------------------------------------------------- 1 | something 2 | wherefore 3 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | total.order.partitioner 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | #Thu Apr 19 09:46:53 CDT 2012 2 | eclipse.preferences.version=1 3 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 6 | org.eclipse.jdt.core.compiler.compliance=1.6 7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 12 | org.eclipse.jdt.core.compiler.source=1.6 13 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/README.txt: -------------------------------------------------------------------------------- 1 | This is an example implementation for the TotalOrderPartitioner: 2 | 3 | Run it with our weblog test data: 4 | 5 | hadoop jar tot-ord-part.jar solution.ProcessLogs weblog RESULTxyz 6 | 
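The driver wiring lives in solution.ProcessLogs (only the compiled classes are kept
under bin/), so as a reminder, a typical TotalOrderPartitioner setup looks roughly like
the sketch below. The input format, sampler settings and partition-file path here are
assumptions, not necessarily what ProcessLogs actually does:

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
    import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

    // Sketch only -- see solution.ProcessLogs for the real driver.
    public class TotalOrderSketch {
      public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJarByClass(TotalOrderSketch.class);
        job.setJobName("Total order sketch");

        // Keys pass through the map unchanged (cf. solution.mr.IdentityMapper),
        // so cut points sampled from the input apply to the map output as well.
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setNumReduceTasks(3);

        // Sample the input, write the cut points, and partition on them so that
        // every key sent to reducer N sorts before every key sent to reducer N+1.
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("_partitions"));
        InputSampler.writePartitionFile(job, new InputSampler.RandomSampler<Text, Text>(0.1, 1000, 10));
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // ... set the mapper/reducer classes (this repo ships IdentityMapper,
        // LogMonthMapper, CountReducer and SumReducer), then:
        System.exit(job.waitForCompletion(true) ? 0 : 1);
      }
    }
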
-------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/ProcessLogs.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/ProcessLogs.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/domain/MapperFunction.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/domain/MapperFunction.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/mr/CountReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/mr/CountReducer.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/mr/IdentityMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/mr/IdentityMapper.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/mr/LogMonthMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/mr/LogMonthMapper.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/mr/SumReducer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/mr/SumReducer.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/bin/solution/mr/WordMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanNeff/hadoop-examples/4727128aa75e72cd957f56ee229c15c1b4e69bc4/mr/total_order_partitioner/bin/solution/mr/WordMapper.class -------------------------------------------------------------------------------- /mr/total_order_partitioner/src/solution/domain/MapperFunction.java: -------------------------------------------------------------------------------- 1 | package solution.domain; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | 6 | import org.apache.hadoop.io.Text; 7 | 8 | /** 9 | * This is a helper class, which encapsulates the logic of our mapper in a 10 | * "non hadoop" class which can be tested even without MRUnit. 
11 | * 12 | * @author training 13 | * 14 | */ 15 | public class MapperFunction { 16 | 17 | public static List months = Arrays.asList("Jan", "Feb", "Mar", 18 | "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"); 19 | 20 | static String[] kv = new String[2]; 21 | 22 | /** 23 | * Example input line: 96.7.4.14 - - [24/Apr/2011:04:20:11 -0400] 24 | * "GET /cat.jpg HTTP/1.1" 200 12433 25 | * 26 | */ 27 | public static String[] getKVPair(String value) { 28 | 29 | kv[0] = null; 30 | kv[1] = null; 31 | 32 | /* 33 | * Split the input line into space-delimited fields. 34 | */ 35 | String[] fields = value.split(" "); 36 | 37 | if (fields.length > 3) { 38 | 39 | /* 40 | * Save the first field in the line as the IP address. 41 | */ 42 | // String ip = fields[0]; 43 | kv[0] = fields[0]; 44 | 45 | /* 46 | * The fourth field contains [dd/Mmm/yyyy:hh:mm:ss]. Split the 47 | * fourth field into "/" delimited fields. The second of these 48 | * contains the month. 49 | */ 50 | String[] dtFields = fields[3].split("/"); 51 | 52 | if (dtFields.length > 1) { 53 | 54 | String theMonth = dtFields[1]; 55 | 56 | /* check if it's a valid month, if so, write it out */ 57 | if (months.contains(theMonth)) { 58 | kv[1] = theMonth; 59 | } 60 | } 61 | } 62 | 63 | return kv; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/src/solution/mr/CountReducer.java: -------------------------------------------------------------------------------- 1 | package solution.mr; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | /* Counts the number of values associated with a key */ 10 | 11 | public class CountReducer extends Reducer { 12 | 13 | @Override 14 | public void reduce(Text key, Iterable values, Context context) 15 | throws IOException, InterruptedException { 16 | 17 | /* 18 | * Iterate over the values iterable and count the number 19 | * of values in it. Emit the key (unchanged) and an IntWritable 20 | * containing the number of values. 21 | */ 22 | 23 | int count = 0; 24 | 25 | /* 26 | * Use for loop to count items in the iterator. 27 | */ 28 | 29 | /* Ignore warnings that we 30 | * don't use the value -- in this case, we only need to count the 31 | * values, not use them. 
32 | */ 33 | for (@SuppressWarnings("unused") 34 | Text value : values) { 35 | 36 | /* 37 | * for each item in the list, increment the count 38 | */ 39 | count++; 40 | } 41 | 42 | context.write(key, new IntWritable(count)); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/src/solution/mr/IdentityMapper.java: -------------------------------------------------------------------------------- 1 | package solution.mr; 2 | 3 | import java.io.IOException; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | 11 | import solution.domain.MapperFunction; 12 | 13 | 14 | public class IdentityMapper extends Mapper { 15 | 16 | 17 | /** 18 | * Example input line: 19 | * 96.7.4.14 - - [24/Apr/2011:04:20:11 -0400] "GET /cat.jpg HTTP/1.1" 200 12433 20 | * 21 | */ 22 | @Override 23 | public void map(Text key, Text value, Context context) 24 | throws IOException, InterruptedException { 25 | 26 | context.write( key, value); 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/src/solution/mr/LogMonthMapper.java: -------------------------------------------------------------------------------- 1 | package solution.mr; 2 | 3 | import java.io.IOException; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | 11 | import solution.domain.MapperFunction; 12 | 13 | 14 | public class LogMonthMapper extends Mapper { 15 | 16 | Text k = new Text(); 17 | Text v = new Text(); 18 | 19 | /** 20 | * Example input line: 21 | * 96.7.4.14 - - [24/Apr/2011:04:20:11 -0400] "GET /cat.jpg HTTP/1.1" 200 12433 22 | * 23 | */ 24 | @Override 25 | public void map(LongWritable key, Text value, Context context) 26 | throws IOException, InterruptedException { 27 | 28 | String[] kv = MapperFunction.getKVPair(value.toString()); 29 | 30 | 31 | if ( kv[1] != null ) { 32 | k.set( kv[1]+"."+kv[0] ); 33 | v.set( kv[1] ); 34 | context.write( k, v); 35 | } 36 | 37 | } 38 | 39 | 40 | 41 | } 42 | -------------------------------------------------------------------------------- /mr/total_order_partitioner/src/solution/mr/SumReducer.java: -------------------------------------------------------------------------------- 1 | package solution.mr; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | /* 10 | * To define a reduce function for your MapReduce job, subclass 11 | * the Reducer class and override the reduce method. 12 | * The class definition requires four parameters: 13 | * The data type of the input key (which is the output key type 14 | * from the mapper) 15 | * The data type of the input value (which is the output value 16 | * type from the mapper) 17 | * The data type of the output key 18 | * The data type of the output value 19 | */ 20 | public class SumReducer extends Reducer { 21 | 22 | /* 23 | * The reduce method runs once for each key received from 24 | * the shuffle and sort phase of the MapReduce framework. 25 | * The method receives a key of type Text, a set of values of type 26 | * IntWritable, and a Context object. 
27 | */ 28 | @Override 29 | public void reduce(Text key, Iterable values, Context context) 30 | throws IOException, InterruptedException { 31 | int wordCount = 0; 32 | 33 | /* 34 | * For each value in the set of values passed to us by the mapper: 35 | */ 36 | for (IntWritable value : values) { 37 | 38 | /* 39 | * Add the value to the word count counter for this key. 40 | */ 41 | wordCount += value.get(); 42 | } 43 | 44 | /* 45 | * Call the write method on the Context object to emit a key 46 | * and a value from the reduce method. 47 | */ 48 | context.write(key, new IntWritable(wordCount)); 49 | } 50 | } -------------------------------------------------------------------------------- /mr/total_order_partitioner/tot-ord-part.jardesc: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /mr/yarn_containers/README.md: -------------------------------------------------------------------------------- 1 | # Container Memory Allocation 2 | 3 | These are the properties that I had to change/verify to make sure 4 | that NodeManagers could allocate, for example 8 containers for 5 | 512 MB maps, and 4 containers for 1024 MB maps 6 | 7 | ```xml 8 | 9 | yarn.nodemanager.resource.memory-mb 10 | 4096 11 | yarn-site.xml 12 | 13 | 14 | 15 | yarn.scheduler.minimum-allocation-mb 16 | 512 17 | yarn-site.xml 18 | 19 | 20 | 21 | yarn.nodemanager.resource.cpu-vcores 22 | 8 23 | yarn-site.xml 24 | 25 | ``` 26 | 27 | Also, you can set 28 | 29 | ``mapreduce.map.memory.mb`` or ``mapreduce.map.reduce.mb`` on a per-job basis like this: 30 | 31 | hadoop jar ./SleepJob.jar SleepJob -Dmapreduce.map.memory.mb=1024 -m 100 -r 10 -mt 240000 32 | 33 | Or, set the defaults in the mapred-site.xml. 34 | 35 | And, of course, if you really want the JVMs to actually use the memory or not 36 | use it, you must specify: 37 | 38 | mapreduce.map.java.opts # (Default is 200MB!!) 39 | mapreduce.reduce.java.opts # (Default is 200MB!!) 40 | 41 | -------------------------------------------------------------------------------- /mr/yarn_containers/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm SleepJob.class 3 | javac -cp `hadoop classpath` ./SleepJob.java 4 | jar cvf SleepJob.jar *.class 5 | # This will spawn 4 containers per Node Manager if each NM has 6 | # 4 GB of yarn.nodemanager.resource.memory-mb 7 | # (if it's the only job running of course) 8 | hadoop jar ./SleepJob.jar SleepJob -Dmapreduce.map.memory.mb=1024 -m 100 -r 10 -mt 240000 9 | 10 | # This will spawn 8 containers per Node Manager if each NM has 11 | # 4 GB of yarn.nodemanager.resource.memory-mb 12 | # (if it's the only job running of course) 13 | hadoop jar ./SleepJob.jar SleepJob -Dmapreduce.map.memory.mb=512 -m 100 -r 10 -mt 240000 14 | 15 | -------------------------------------------------------------------------------- /mr/yarn_containers/test_container_boundaries/README.md: -------------------------------------------------------------------------------- 1 | # Container Memory Management 2 | 3 | See ./run.sh and SleepJobWithArray.java for how YARN kills 4 | tasks which request more memory than their containers have. The SleepJobWithArray simply 5 | tries to instantiate an array of ints that is greater than the container's memory size. 
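Roughly, the interesting part of that job looks like the sketch below. This is a hedged
reconstruction, not the real SleepJobWithArray.java (which lives in this directory); only
the `initBigArray` / `bigArraySize` flags passed by ./run.sh are taken as given, and the
surrounding class is illustrative.

```java
import org.apache.hadoop.conf.Configuration;

// Illustrative sketch -- see SleepJobWithArray.java for the real thing.
public class BigArraySketch {
  private static int[] bigArray;

  public static void maybeAllocate(Configuration conf) {
    if (conf.getBoolean("initBigArray", false)) {
      // 256,000,000 ints is roughly 1 GB of zero-filled heap, far beyond a
      // 256 MB container, so the NodeManager's physical-memory check kills
      // the task; with bigArraySize=1,000,000 (about 4 MB) it survives.
      bigArray = new int[conf.getInt("bigArraySize", 0)];
    }
  }
}
```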
6 | 7 | The weird thing is that I can't get YARN to kill the Java process simply because it 8 | tries to start with an Xmx (Or Xms) that's greater than the container's memory size. 9 | 10 | The java process actually has to have the code that requests > memory than the YARN container 11 | has. Notice that in SleepJobWithArray, it requests an array of 512 MB, which is under 12 | the Java Heap Size that's requested Xmx=1024m. 13 | 14 | -------------------------------------------------------------------------------- /mr/yarn_containers/test_container_boundaries/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Test how many times a naughty process is retried when it violates the container's memory size 3 | set -e 4 | rm -f *.class 5 | javac -cp `hadoop classpath` ./SleepJobWithArray.java 6 | jar cvf SleepJobWithArray.jar *.class 7 | 8 | # Max container size is 256 MB, but Java heap is 1024. 9 | # even *that* won't kill the job. Java code actually 10 | # must request the memory. 11 | 12 | # This won't be killed 13 | hadoop jar ./SleepJobWithArray.jar SleepJobWithArray \ 14 | -Dmapreduce.job.name="Sleep without init array" \ 15 | -Dmapreduce.map.memory.mb=256 \ 16 | -Dmapreduce.map.java.opts=-Xms1024m \ 17 | -DinitBigArray=false \ 18 | -m 1 -r 1 -mt 1000 19 | 20 | # This won't crash either 21 | hadoop jar ./SleepJobWithArray.jar SleepJobWithArray \ 22 | -Dmapreduce.job.name="Sleep, but init smallish array" \ 23 | -Dmapreduce.map.memory.mb=256 \ 24 | -Dmapreduce.map.java.opts=-Xms1024m \ 25 | -DinitBigArray=true \ 26 | -DbigArraySize=1000000 \ 27 | -m 1 -r 1 -mt 1000 28 | 29 | # This will crash after overstepping 256M container limit? 30 | hadoop jar ./SleepJobWithArray.jar SleepJobWithArray \ 31 | -Dmapreduce.job.name="Sleep, but init big array" \ 32 | -Dmapreduce.map.memory.mb=256 \ 33 | -Dmapreduce.map.java.opts=-Xms1024m \ 34 | -DinitBigArray=true \ 35 | -DbigArraySize=256000000 \ 36 | -m 1 -r 1 -mt 1000 37 | 38 | # This will NOT crash because we bumped the container size 39 | hadoop jar ./SleepJobWithArray.jar SleepJobWithArray \ 40 | -Dmapreduce.job.name="Sleep, but init big array" \ 41 | -Dmapreduce.map.memory.mb=512 \ 42 | -Dmapreduce.map.java.opts=-Xms1024m \ 43 | -DinitBigArray=true \ 44 | -DbigArraySize=256000000 \ 45 | -m 1 -r 1 -mt 1000 46 | -------------------------------------------------------------------------------- /pig/configuration/README.md: -------------------------------------------------------------------------------- 1 | # Configuration playground 2 | 3 | # Log verbosity 4 | 5 | The main examples deal with supressing log messages from Pig 6 | 7 | Copy the following files to /etc/pig/conf: 8 | 9 | log4j.local is an attempt to silence anything in localmode that's not ERROR 10 | -------------------------------------------------------------------------------- /pig/configuration/fixpig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to deploy log4j.properties 3 | sudo cp /etc/pig/conf/log4j.properties /etc/pig/conf/log4j.properties.bak 4 | sudo cp log4j.local /etc/pig/conf/log4j.properties 5 | -------------------------------------------------------------------------------- /pig/configuration/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. 
See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # ***** Set root logger level to DEBUG and its only appender to A. 17 | log4j.logger.org.apache.pig=ERROR, A 18 | log4j.logger.org.apache.hadoop.conf.Configuration=ERROR, A 19 | 20 | # ***** A is set to be a ConsoleAppender. 21 | log4j.appender.A=org.apache.log4j.ConsoleAppender 22 | # ***** A uses PatternLayout. 23 | log4j.appender.A.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 25 | 26 | 27 | -------------------------------------------------------------------------------- /pig/configuration/pig0.12/README.md: -------------------------------------------------------------------------------- 1 | # Pig 0.12 log4j -- suppress INFO and lower logs 2 | 3 | Had to change log level for a couple more packages 4 | See conf/log4j.properties 5 | 6 | 7 | Also, not a promis the this is the best way, but... 8 | -------------------------------------------------------------------------------- /pig/configuration/pig0.12/conf/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # ***** Set root logger level to DEBUG and its only appender to A. 17 | log4j.logger.mapred=ERROR, A 18 | log4j.logger.org.apache.hadoop.metrics.jvm=ERROR, A 19 | log4j.logger.org.apache.hadoop.mapreduce=ERROR, A 20 | log4j.logger.org.apache.hadoop.mapred=ERROR, A 21 | log4j.logger.org.apache.pig=ERROR, A 22 | log4j.logger.org.apache.hadoop.conf.Configuration=ERROR, A 23 | 24 | # ***** A is set to be a ConsoleAppender. 25 | log4j.appender.A=org.apache.log4j.ConsoleAppender 26 | # ***** A uses PatternLayout. 
27 | log4j.appender.A.layout=org.apache.log4j.PatternLayout 28 | log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 29 | 30 | 31 | -------------------------------------------------------------------------------- /pig/configuration/pig0.12/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Show log4j properties needed for Pig 0.12 CDH *quiet* mode 3 | pig -4 ./conf/log4j.properties -x local sales.pig 4 | -------------------------------------------------------------------------------- /pig/configuration/pig0.12/sales.pig: -------------------------------------------------------------------------------- 1 | sales = LOAD 'sales.txt' AS (salesperson_id, amount); 2 | sales_tuples = FOREACH sales GENERATE (salesperson_id, amount); -- This generates a bag with a single tuple field 3 | 4 | -- This is what you want (No parens) 5 | sales_bag = FOREACH sales GENERATE salesperson_id, amount; 6 | sh echo "This is sales BAG"; 7 | DUMP sales_bag; 8 | sh echo "This is sales TUPLE"; 9 | DUMP sales_tuples; 10 | -------------------------------------------------------------------------------- /pig/configuration/pig0.12/sales.txt: -------------------------------------------------------------------------------- 1 | bob 100 2 | steve 200 3 | -------------------------------------------------------------------------------- /pig/explain-split-vs-filter/explain-using-dot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo yum install graphviz 3 | pig -x local -e 'explain -script ./using-filter.pig' -dot -out using-filter.dot 4 | pig -x local -e 'explain -script ./using-split.pig' -dot -out using-split.dot 5 | for fil in *dot; do 6 | # output all graphs in the *dot files (there's three per .dot file) 7 | # automagically generate filenames ($fil.1.pdf, $fil.2.pdf, etc) 8 | # enjoy 9 | dot -Tpdf $fil -O 10 | done 11 | -------------------------------------------------------------------------------- /pig/explain-split-vs-filter/using-filter.pig: -------------------------------------------------------------------------------- 1 | wlogs = load 'webcrawl.txt' as (pageid, url, timestamp); 2 | apr03 = filter wlogs by timestamp < '20110404'; 3 | apr02 = filter wlogs by timestamp < '20110403' and timestamp > '20110401'; 4 | apr01 = filter wlogs by timestamp < '20110402' and timestamp > '20110331'; 5 | store apr03 into 'filter/20110403'; 6 | store apr02 into 'filter/20110402'; 7 | store apr01 into 'filter/20110401'; 8 | -------------------------------------------------------------------------------- /pig/explain-split-vs-filter/using-split.pig: -------------------------------------------------------------------------------- 1 | wlogs = load 'webcrawl.txt' as (pageid, url, timestamp); 2 | split wlogs into apr03 if timestamp < '20110404', 3 | apr02 if timestamp < '20110403' and timestamp > '20110401', 4 | apr01 if timestamp < '20110402' and timestamp > '20110331'; 5 | 6 | store apr03 into 'split/20110403'; 7 | store apr02 into 'split/20110402'; 8 | store apr01 into 'split/20110401'; 9 | -------------------------------------------------------------------------------- /pig/explain-split-vs-filter/webcrawl.txt: -------------------------------------------------------------------------------- 1 | http://pig.apache.org 1 
{(http://pig.apache.org/index.html),(http://pig.apache.org/releases.html),(http://pig.apache.org/about.html),(http://pig.apache.org/mailing_lists.html),(http://pig.apache.org/whoweare.html),(http://pig.apache.org/bylaws.html),(http://pig.apache.org/privacypolicy.html),(http://pig.apache.org/issue_tracking.html),(http://pig.apache.org/version_control.html),(http://pig.apache.org/philosophy.html)} 2 | http://pig.apache.org/index.html 1 {} 3 | -------------------------------------------------------------------------------- /pig/generate/conf/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # ***** Set root logger level to DEBUG and its only appender to A. 17 | log4j.logger.mapred=ERROR, A 18 | log4j.logger.org.apache.hadoop.metrics.jvm=ERROR, A 19 | log4j.logger.org.apache.hadoop.mapreduce=ERROR, A 20 | log4j.logger.org.apache.hadoop.mapred=ERROR, A 21 | log4j.logger.org.apache.pig=ERROR, A 22 | log4j.logger.org.apache.hadoop.conf.Configuration=ERROR, A 23 | 24 | # ***** A is set to be a ConsoleAppender. 25 | log4j.appender.A=org.apache.log4j.ConsoleAppender 26 | # ***** A uses PatternLayout. 
27 | log4j.appender.A.layout=org.apache.log4j.PatternLayout 28 | log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 29 | 30 | 31 | -------------------------------------------------------------------------------- /pig/generate/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Show log4j properties needed for Pig 0.12 CDH *quiet* mode 3 | pig -4 ./conf/log4j.properties -x local sales.pig 4 | -------------------------------------------------------------------------------- /pig/generate/sales.pig: -------------------------------------------------------------------------------- 1 | sales = LOAD 'sales.txt' AS (salesperson_id, amount); 2 | sales_tuples = FOREACH sales GENERATE (salesperson_id, amount); -- This generates a bag with a single tuple field 3 | 4 | -- This is what you want (No parens) 5 | sales_bag = FOREACH sales GENERATE salesperson_id, amount; 6 | sh echo "This is sales BAG"; 7 | DUMP sales_bag; 8 | sh echo "This is sales TUPLE"; 9 | DUMP sales_tuples; 10 | -------------------------------------------------------------------------------- /pig/generate/sales.txt: -------------------------------------------------------------------------------- 1 | bob 100 2 | steve 200 3 | -------------------------------------------------------------------------------- /pig/hcatalog/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # sudo yum install hcatalog 3 | pig -useHCatalog store_sales.pig 4 | -------------------------------------------------------------------------------- /pig/hcatalog/sample_store_sales.pig: -------------------------------------------------------------------------------- 1 | salez = LOAD 'tpcds.parquet_store_sales' USING org.apache.hcatalog.pig.HCatLoader(); 2 | sampld = SAMPLE salez 0.1; 3 | STORE sampld INTO 'zanky'; 4 | -------------------------------------------------------------------------------- /pig/hcatalog/store_sales.pig: -------------------------------------------------------------------------------- 1 | salez = LOAD 'tpcds.store_sales' USING org.apache.hcatalog.pig.HCatLoader(); 2 | DESCRIBE salez; 3 | -------------------------------------------------------------------------------- /pig/local-mode-hacks/README.md: -------------------------------------------------------------------------------- 1 | # Local Mode Hacks 2 | 3 | Wanted to find out if Pig can be called: 4 | 5 | - Both local execution and local filesystem is easy `$pig -x local` 6 | - Local execution but HDFS filesystem. `$ pig -jt local` seems to work, even on YARN 7 | 8 | Yay! 
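As a quick reference, a minimal sketch of the two invocations described above (`somescript.pig` is just a placeholder name):

    # Local execution, local filesystem
    pig -x local somescript.pig

    # Local execution, but paths in the script resolve against HDFS
    pig -jt local somescript.pig

See `run.sh` in this directory for a working example of the second form.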
9 | -------------------------------------------------------------------------------- /pig/local-mode-hacks/read_some_hdfs_data.pig: -------------------------------------------------------------------------------- 1 | hdfs_data = LOAD 'THISISINHDFS.txt'; 2 | grpd = GROUP hdfs_data ALL; 3 | counted = FOREACH grpd GENERATE COUNT(hdfs_data); 4 | DUMP counted; 5 | 6 | -------------------------------------------------------------------------------- /pig/local-mode-hacks/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Put some data into HDFS and run Pig locally, but refer to data in HDFS 3 | hadoop fs -put -f somedata.txt THISISINHDFS.txt 4 | pig -jt local read_some_hdfs_data.pig 5 | -------------------------------------------------------------------------------- /pig/local-mode-hacks/somedata.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | -------------------------------------------------------------------------------- /pig/round/data.txt: -------------------------------------------------------------------------------- 1 | bob 1.456 2 | bob 1.456 3 | bob 1.456 4 | bob 1.456 5 | bob 1.456 6 | bob 1.456 7 | bob 1.456 8 | steve 123.456 9 | supersteve 123.456 10 | supersteve 123.756 11 | supersteve 123.756 12 | supersteve 123.856 13 | supersteve 123.856 14 | supersteve 123.156 15 | fluffy 123.456 16 | ted 123.700 17 | harsh 123.456 18 | ian 123.456 19 | peabody 123.456 20 | squirko 123.456 21 | mirko 123.456 22 | kai 123.456 23 | sarah 123.456 24 | ted 123.700 25 | -------------------------------------------------------------------------------- /pig/round/results.txt: -------------------------------------------------------------------------------- 1 | bob 1.5 2 | bob 1.5 3 | bob 1.5 4 | bob 1.5 5 | bob 1.5 6 | bob 1.5 7 | bob 1.5 8 | steve 123.5 9 | supersteve 123.5 10 | supersteve 123.8 11 | supersteve 123.8 12 | supersteve 123.9 13 | supersteve 123.9 14 | supersteve 123.2 15 | fluffy 123.5 16 | ted 123.7 17 | harsh 123.5 18 | ian 123.5 19 | peabody 123.5 20 | squirko 123.5 21 | mirko 123.5 22 | kai 123.5 23 | sarah 123.5 24 | ted 123.7 25 | -------------------------------------------------------------------------------- /pig/round/round_this.pig: -------------------------------------------------------------------------------- 1 | data = LOAD 'data.txt' AS (name:chararray, amount:float); 2 | round = FOREACH data GENERATE name, (float)(ROUND(amount*10))/10 AS data; 3 | STORE round INTO 'round'; 4 | -------------------------------------------------------------------------------- /pig/sampling/sample_tpcds.pig: -------------------------------------------------------------------------------- 1 | sales = LOAD 'tpcds/store_sales'; 2 | sampl = SAMPLE sales 0.01; 3 | STORE sampl INTO 'tpcds_sample/store_sales'; 4 | -------------------------------------------------------------------------------- /spark/data-generator/hash-data-generator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | curl https://raw.githubusercontent.com/eneko/data-repository/master/data/words.txt > words.txt 3 | sudo -u hdfs hdfs dfs -mkdir /user/ec2-user 4 | sudo -u hdfs hdfs dfs -chown ec2-user /user/ec2-user 5 | hdfs dfs -put words.txt /user/ec2-user/words 6 | */ 7 | import java.security.MessageDigest 8 | 9 | val words = sc.textFile("words", 20) 10 | val moreWords = words.flatMap(word => List(word, word.toUpperCase(), 11 
| word.toLowerCase(), 12 | word.reverse.toUpperCase(), 13 | word + "!", 14 | word + "?", 15 | word.reverse.toLowerCase())) 16 | 17 | val md5s = moreWords.mapPartitions{iterator => 18 | val md5 = MessageDigest.getInstance("MD5") 19 | val sha1 = MessageDigest.getInstance("SHA-1") 20 | val sha256 = MessageDigest.getInstance("SHA-256") 21 | iterator.map(word => List(word, 22 | md5.digest(word.getBytes).map("%02x".format(_)).mkString, 23 | sha1.digest(word.getBytes).map("%02x".format(_)).mkString, 24 | sha256.digest(word.getBytes).map("%02x".format(_)).mkString).mkString("\t")) 25 | } 26 | 27 | md5s.saveAsTextFile("hash-data") 28 | -------------------------------------------------------------------------------- /spark/data-parsing/data-parsing-using-try.scala: -------------------------------------------------------------------------------- 1 | import scala.util.{Try, Failure, Success} 2 | // Define test data 3 | // With thanks to http://rcardin.github.io/big-data/apache-spark/scala/programming/2016/09/25/try-again-apache-spark.html 4 | val orig_data = Array( 5 | "1", 6 | "2", 7 | "trash", 8 | "4" 9 | ) 10 | 11 | val data = sc.parallelize(orig_data) 12 | val weblogs = data.map(line => Try(line.toInt)) 13 | println(weblogs.getClass) 14 | val good = weblogs.filter(d => d.isSuccess) 15 | weblogs.collect() 16 | good.collect() 17 | -------------------------------------------------------------------------------- /spark/data-parsing/data-parsing.scala: -------------------------------------------------------------------------------- 1 | // Define test data 2 | val orig_data = Array( 3 | "1.2.3.4 - 12345 \"[1/1/2017 12:00:00]\" \"/some.jpg GET\" 200 9999", 4 | "1.2.3.4 - 12345 \"[1/1/2017 12:00:02]\" \"/home.html GET\" 200 9997", 5 | "trash", 6 | "1.2.3.5 - aaaaa" 7 | ) 8 | 9 | // case class to mimic data 10 | case class Weblog(ip:String, userid:String, req_ts:String) 11 | 12 | // Define regex to parse test data into case class (only ip, userid and req_ts 13 | // for now) 14 | val regex = """(.*) - (\d+) \"\[(.+)\]\".*""".r 15 | 16 | // Fancy print/debug function 17 | def printWeblog(weblog:Weblog) = 18 | println(s"""Data: $weblog 19 | Class: ${weblog.getClass} 20 | IP:${weblog.ip} 21 | Userid:${weblog.userid} 22 | Request Timestamp:${weblog.req_ts} 23 | ----------------""") 24 | 25 | // Go! 26 | val data = sc.parallelize(orig_data) 27 | val weblogs = data.map{ 28 | case regex(ip, userid, req_ts) => 29 | Weblog(ip, userid, req_ts) 30 | case line => 31 | Console.err.println(s"Unexpected line: $line") 32 | Weblog("error", line, "") 33 | } 34 | 35 | val errors = weblogs.filter(wl => wl.ip == "error") 36 | val not_errors = weblogs.filter(wl => wl.ip != "error") 37 | 38 | println("These are errors") 39 | for (r <- errors.collect()) { 40 | printWeblog(r) 41 | } 42 | 43 | println("These are NOT errors") 44 | for (r <- not_errors.collect()) { 45 | printWeblog(r) 46 | } 47 | 48 | -------------------------------------------------------------------------------- /spark/dataframes/README.org: -------------------------------------------------------------------------------- 1 | * Installation 2 | 3 | Most of these examples use data from the "hadoop-examples-data" directory in this 4 | repository. 
I recommend to run: 5 | 6 | $ hdfs dfs -put hadoop-examples-data 7 | 8 | * Column Renaming 9 | 10 | - Example in column_rename_after_joins.py 11 | - https://sparkbyexamples.com/spark/rename-a-column-on-spark-dataframes/ 12 | - https://stackoverflow.com/questions/50287558/how-to-rename-duplicated-columns-after-join 13 | - https://stackoverflow.com/questions/33778664/spark-dataframe-distinguish-columns-with-duplicated-name#33779190 14 | - References this KB article: https://kb.databricks.com/data/join-two-dataframes-duplicated-columns.html 15 | -------------------------------------------------------------------------------- /spark/dataframes/alias.scala: -------------------------------------------------------------------------------- 1 | // Note: Put data/people.json into the HDFS home directory 2 | val df = spark.read.json("people.json") 3 | 4 | df.select($"firstName" as "fn_as", 5 | $"firstName" alias "fname_alias", 6 | $"firstName" name "First_name").show() 7 | 8 | -------------------------------------------------------------------------------- /spark/dataframes/analyzingExerciseUsingSparkSQL.py: -------------------------------------------------------------------------------- 1 | # First, create / massage all possible data sources 2 | # ----- Accounts 3 | # Create a DataFrame based on the Hive accounts table 4 | accountsDF = spark.read.table("devsh.accounts")\ 5 | .select("acct_num", "acct_close_dt")\ 6 | .withColumnRenamed("acct_num", "account_id") 7 | 8 | accountsDF.createOrReplaceTempView("accounts") 9 | 10 | # ----- Account Devices 11 | # Load accountdevice data to HDFS in another terminal window 12 | # $ hdfs dfs -put $DEVDATA/accountdevice/ /devsh_loudacre/ 13 | accountDeviceDF = spark.read.option("header","true").\ 14 | option("inferSchema","true")\ 15 | .csv("/devsh_loudacre/accountdevice") 16 | 17 | accountDeviceDF.createOrReplaceTempView("account_devices") 18 | 19 | # ----- Devices 20 | devicesDF = spark.read.json("/devsh_loudacre/devices.json").withColumnRenamed("devnum", "device_id") 21 | devicesDF.createOrReplaceTempView("devices") 22 | 23 | # Spark SQL!!!!! 
24 | 25 | sql = """ 26 | SELECT d.device_id, d.make, d.model, COUNT(*) AS number_of_devices 27 | FROM accounts a 28 | JOIN account_devices ad ON a.account_id = ad.account_id 29 | JOIN devices d ON d.device_id = ad.device_id 30 | WHERE a.acct_close_dt IS NULL 31 | GROUP BY d.device_id, d.make, d.model 32 | ORDER BY number_of_devices DESC 33 | """ 34 | 35 | activeDeviceCountsDF = spark.sql(sql) 36 | activeDeviceCountsDF.write\ 37 | .mode("overwrite")\ 38 | .option("path","/devsh_loudacre/active_device_counts_using_spark_sql")\ 39 | .saveAsTable("devsh.active_device_counts_using_spark_sql") 40 | -------------------------------------------------------------------------------- /spark/dataframes/antiJoin.py: -------------------------------------------------------------------------------- 1 | leftFile = "hadoop-examples-data/left.csv" 2 | rightFile = "hadoop-examples-data/right.csv" 3 | opts = { 4 | "inferSchema" : True, 5 | "header" : True } 6 | 7 | leftDF = spark.read.options(**opts).csv(leftFile) 8 | print("This is leftDF:") 9 | leftDF.show() 10 | 11 | rightDF = spark.read.options(**opts).csv(rightFile) 12 | print("This is rightDF:") 13 | rightDF.show() 14 | 15 | antiJoinLeftRightDF = leftDF.join(rightDF, "name", "left_anti") 16 | print("This is left anti join right") 17 | antiJoinLeftRightDF.show() 18 | 19 | antiJoinRightLeftDF = rightDF.join(leftDF, "name", "left_anti") 20 | print("This is right anti join left") 21 | antiJoinRightLeftDF.show() 22 | -------------------------------------------------------------------------------- /spark/dataframes/columnExpresssions.scala: -------------------------------------------------------------------------------- 1 | // Note: Put data/people.json into the HDFS home directory 2 | val df = spark.read.json("people.json") 3 | 4 | assert("org.apache.spark.sql.Column" == df("firstName").getClass.getName) 5 | 6 | val firstNameTimesThree = df("firstName") * 3 7 | assert("org.apache.spark.sql.Column" == firstNameTimesThree.getClass.getName) 8 | 9 | val someColLike = $"firstName" like "Bo%" 10 | assert("org.apache.spark.sql.Column" == someColLike.getClass.getName) 11 | 12 | -------------------------------------------------------------------------------- /spark/dataframes/creatingDataFrames.scala: -------------------------------------------------------------------------------- 1 | import sys.process._ 2 | 3 | if(Seq("hdfs", "dfs", "-test", "-d", "people").! == 0) { 4 | Seq("hdfs", "dfs", "-rm", "-R", "people").! 5 | } 6 | val someDFReader = spark.read 7 | assert("org.apache.spark.sql.DataFrameReader" == someDFReader.getClass.getName) 8 | 9 | val first = Seq(Tuple1("Arvin"), Tuple1("Betty")) 10 | val second = Seq(Tuple1("Chris"), Tuple1("Derek")) 11 | val third = Seq(Tuple1("Eric"), Tuple1("Ferris")) 12 | 13 | spark.createDataFrame(first).write.save("people") 14 | 15 | spark.createDataFrame(second). 16 | write.mode("append").save("people") 17 | 18 | spark.createDataFrame(third). 19 | write.mode("ignore").save("people") 20 | 21 | val allPeople = spark.read.load("people") 22 | allPeople.count() 23 | "hdfs dfs -ls people".! 
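// Note on the save modes used above (the default mode is "error if exists",
// which is why the "people" directory is removed first): "append" adds new
// files alongside the existing ones, while "ignore" silently skips the write
// when the path already contains data -- so the third DataFrame ("Eric",
// "Ferris") never lands in the output and allPeople.count() should be 4.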
24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /spark/dataframes/grouping.scala: -------------------------------------------------------------------------------- 1 | // Note: Put data/people.json into the HDFS home directory 2 | val df = spark.read.json("people.json") 3 | 4 | val grpd = df.groupBy("testScore").agg(min($"firstName") as "First FirstName", 5 | max($"firstName") as "Last FirstName", 6 | min($"studyTime") as "min studytime", 7 | sum($"studyTime") as "all_studytime"). 8 | where($"all_studytime" > 100) 9 | grpd.show() 10 | 11 | // Grouping with LEFT OUTER JOIN, and showing zero for 12 | // count of rows with no corresponding child elements 13 | 14 | val customerSeq = Seq("""{ "firstName":"Nate", "id":1 }""", 15 | """{ "firstName":"Jackie", "id":2 }""") 16 | val customersDS = spark.createDataset(customerSeq) 17 | val customersDF = spark.read.json(customersDS) 18 | 19 | val ordersSeq = Seq("""{ "cust_id":1, "product":"Something", "order_id":2}""") 20 | val ordersDS = spark.createDataset(ordersSeq) 21 | val ordersDF = spark.read.json(ordersDS) 22 | 23 | val customerOrders = customersDF.join(ordersDF, $"id" === $"cust_id", "left_outer") 24 | 25 | val customerOrderCounts = customerOrders. 26 | groupBy($"firstName", $"id"). 27 | agg(count($"cust_id") as "num_orders") 28 | 29 | customerOrderCounts.show() 30 | 31 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/01.csv: -------------------------------------------------------------------------------- 1 | firstname,lastname,middle 2 | Bob,Johnson,Midi 3 | Steve,Worrell,Midrow 4 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/data.csv: -------------------------------------------------------------------------------- 1 | robert,10 2 | julie,20 3 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/interests.json: -------------------------------------------------------------------------------- 1 | { "fname":"Julie", "interest":"Pets" } 2 | { "fname":"Julie", "interest":"Cats" } 3 | { "fname":"Julie", "interest":"Dogs" } 4 | { "fname":"Aaron", "interest":"Economics" } 5 | { "fname":"Steve", "interest":"Real Estate" } 6 | { "fname":"aaron", "interest":"Fake Estate" } 7 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/left.csv: -------------------------------------------------------------------------------- 1 | name,region 2 | Nate,west 3 | Bob,west 4 | Sir Robin,west 5 | Bartholomew,west 6 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/maxVals.json: -------------------------------------------------------------------------------- 1 | { "product":"Car", "price":2147483650 } 2 | { "product":"Maxwell TV", "price":2147483648 } 3 | { "product":"Radio", "price":2147483647 } 4 | { "product":"Junk", "price":-2147483650 } 5 | { "product":"Defunct", "price":-2147483647 } 6 | { "product":"Mortgage", "price":-2147483646 } 7 | { "product":"Bad Price", "price":"$23000" } 8 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/people.json: -------------------------------------------------------------------------------- 1 | { "firstName":"Julie", "testScore":30, "studyTime":100 } 2 | { 
"firstName":"Aaron", "testScore":20, "studyTime":200 } 3 | { "firstName":"Steve", "testScore":40, "studyTime":300 } 4 | { "firstName":"Walter", "testScore":10, "studyTime":400 } 5 | { "firstName":"Josie", "testScore":30, "studyTime":500 } 6 | { "firstName":"Willie", "testScore":10, "studyTime":600 } 7 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/right.csv: -------------------------------------------------------------------------------- 1 | name,region 2 | Nate,east 3 | Bob,east 4 | Sir Robin,east 5 | -------------------------------------------------------------------------------- /spark/dataframes/hadoop-examples-data/two.csv: -------------------------------------------------------------------------------- 1 | lastname,firstname 2 | Smith,Terry 3 | Jackson,Billy 4 | -------------------------------------------------------------------------------- /spark/dataframes/joins.scala: -------------------------------------------------------------------------------- 1 | // Note: Put data/people.json into the HDFS home directory 2 | val dfPeople = spark.read.json("people.json") 3 | val dfInterests = spark.read.json("interests.json") 4 | 5 | dfPeople.join(dfInterests, lower($"firstName") === lower($"fname"), "left_outer"). 6 | select("firstName", "interest").show() 7 | 8 | dfPeople.join(dfInterests, lower($"firstName") === lower($"fname"), "right_outer"). 9 | select("firstName", "interest").show() 10 | 11 | dfPeople.join(dfInterests).where($"firstName" === $"fname"). 12 | select("firstName", "interest").show() 13 | -------------------------------------------------------------------------------- /spark/dataframes/rowFunctions.scala: -------------------------------------------------------------------------------- 1 | // Misc. functions on Row objects 2 | val jsonSeq = Seq("""{ "firstName":"Nate", "iq":0, "prev_iq":200}""", 3 | """{ "firstName":"Jackie", "iq":200, "prev_iq":200 }""", 4 | """{ "firstName":"Ricky", "iq":-20}""") 5 | val peopleDS = spark.createDataset(jsonSeq) 6 | 7 | val peopleDF = spark.read.json(peopleDS) 8 | val firstRow = peopleDF.take(3)(2) 9 | 10 | println(firstRow.getClass) 11 | 12 | // println(firstRow.getAs[String]("firstName")) 13 | val firstName = firstRow.getAs[String]("firstName") 14 | val iq = firstRow.getAs[Long]("iq") 15 | println(firstName) 16 | println(firstName.getClass) 17 | println(iq) 18 | println(iq.getClass) 19 | -------------------------------------------------------------------------------- /spark/dataframes/saveDataFrameToDataSource.scala: -------------------------------------------------------------------------------- 1 | val tableName = "testpeople" 2 | 3 | val first = Seq(Tuple1("Arvin"), Tuple1("Betty")) 4 | val second = Seq(Tuple1("Chris"), Tuple1("Derek")) 5 | val third = Seq(Tuple1("Eric"), Tuple1("Ferris")) 6 | 7 | println("Writing first") 8 | spark.createDataFrame(first).write.mode("overwrite").saveAsTable(tableName) 9 | 10 | println("Writing second") 11 | spark.createDataFrame(second). 12 | write.mode("append").saveAsTable(tableName) 13 | 14 | println("Writing third") 15 | spark.createDataFrame(third).write. 16 | option("path", "peopleinmyhomedir"). 
17 | saveAsTable(tableName) 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /spark/dataframes/schemasCSV.scala: -------------------------------------------------------------------------------- 1 | // Put the CSV files in this directory into a 2 | // directory "csvdata" in HDFS prior to running this 3 | val opts = Map("header" -> "true", 4 | "inferSchema" -> "true") 5 | val csvDF = spark.read.options(opts).csv("csvdata") 6 | csvDF.printSchema() 7 | // csvDF.show() 8 | 9 | 10 | // Try specifying a schema and see what happens 11 | import org.apache.spark.sql.types.{StringType,StructField,StructType} 12 | val columnsList = List( 13 | StructField("firstname", StringType), 14 | StructField("lastname", StringType) 15 | ) 16 | 17 | val peopleSchema = StructType(columnsList) 18 | val csvDFWithSchema = spark.read.option("header", "true"). 19 | schema(peopleSchema).csv("csvdata") 20 | csvDFWithSchema.show() 21 | -------------------------------------------------------------------------------- /spark/dataframes/spark_sql_udfs.py: -------------------------------------------------------------------------------- 1 | # Example of using Python UDFs in Spark SQL 2 | def my_uppercase(x): 3 | upper(x) 4 | 5 | my_uppercase_udf = udf(my_uppercase, returnType=IntegerType()) 6 | rides_clean.createOrReplaceTempView("rides_clean") 7 | spark.udf.register("my_uppercase_udf", my_uppercase_udf) 8 | spark.sql("select date_time, my_uppercase_udf(date_time) from rides_clean").show() 9 | -------------------------------------------------------------------------------- /spark/dataframes/windowing.py: -------------------------------------------------------------------------------- 1 | # Spark SQL syntax for windowing 2 | # https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-window.html 3 | df = spark.range(10) 4 | df.createOrReplaceTempView("df") 5 | spark.sql(""" 6 | SELECT id, 7 | COUNT(id) OVER (ORDER BY id ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) cum_count, 8 | SUM(id) OVER (ORDER BY id ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) cum_count 9 | FROM df 10 | """).show() 11 | 12 | Example of LAG 13 | spark.sql(""" 14 | SELECT rider_id, date_time, 15 | LAG(date_time) OVER (PARTITION BY rider_id ORDER BY date_time) date_time_previous 16 | FROM rides 17 | """).show() 18 | -------------------------------------------------------------------------------- /spark/dataframes/withColumn.scala: -------------------------------------------------------------------------------- 1 | val flightsSeq = Seq("""{ "flight_num":1, "dt":"2017-01-01" }""", 2 | """{ "flight_num":2, "dt":"2017-01-02" }""", 3 | """{ "flight_num":3, "dt":"2017-01-03" }""", 4 | """{ "flight_num":4, "dt":"2017-01-04" }""") 5 | 6 | val stringRDD = sc.parallelize(flightsSeq) 7 | val flightsDS = spark.read.json(stringRDD) 8 | 9 | // There's a better way to do this (see below!) 10 | val enhanced = flightsDS.withColumn( 11 | "DayOfWeek", date_format($"dt", "E")).withColumn( 12 | "isSaturday", date_format($"dt", "E") === "Sat").withColumn( 13 | "isSunday", date_format($"dt", "E") === "Sun").withColumn( 14 | "isMonday", date_format($"dt", "E") === "Mon").withColumn( 15 | "isTuesday", date_format($"dt", "E") === "Tue").withColumn( 16 | "isWednesday", date_format($"dt", "E") === "Wed").withColumn( 17 | "isThursday", date_format($"dt", "E") === "Thu").withColumn( 18 | "isFriday", date_format($"dt", "E") === "Fri") 19 | 20 | 21 | enhanced.show() 22 | 23 | // There's a better (e.g. 
functional way to do this but for now . . . .) 24 | var enhancedBetter = flightsDS 25 | for (day <- Array("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")) { 26 | enhancedBetter = 27 | enhancedBetter.withColumn("is" + day, 28 | date_format($"dt", "E") === day) 29 | } 30 | 31 | -------------------------------------------------------------------------------- /spark/get-python-examples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # You can find the python tar gz using a command like this: 3 | # sudo find / -follow -iname "*python*tar*gz" 2>/dev/null 4 | PYTHON_EXAMPLES_FILE=/opt/cloudera/parcels/CDH/lib/spark/python.tar.gz 5 | cd 6 | mkdir -p ~/python-examples && cd ~/python-examples 7 | tar -xzvf $PYTHON_EXAMPLES_FILE 8 | ls -lR 9 | 10 | # Example of running code (Remove the # in front of spark-submit) 11 | # spark-submit als.py 12 | # 2>/dev/null is a quick and dirty add which silences noisy log messages 13 | # spark-submit pi.py 100 2>/dev/null 14 | -------------------------------------------------------------------------------- /spark/local_file.scala: -------------------------------------------------------------------------------- 1 | val f = sc.textFile("file:///some_local_file.txt") 2 | f.count() 3 | -------------------------------------------------------------------------------- /spark/log-level/README.md: -------------------------------------------------------------------------------- 1 | # Log-Levels 2 | 3 | Run ./run.sh to see the difference. If ./log4j.properties is in the working 4 | directory of Spark-Shell, then it will (hopefully) read and adhere to the log4j.properties 5 | in the directory where you fire off the shell. 6 | 7 | ./log4j.properties.with.debug.log.level is an example of setting Spark's 8 | log level to DEBUG. 
9 | 10 | ./log4j.properties is an example of setting Spark's log level to ERROR 11 | (to include only critical error messages in Spark's output) 12 | 13 | $SPARK_HOME/conf 14 | 15 | Example, if you installed Spark in your home directory, 16 | $HOME/tools/spark-1.0.0/conf/log4j.properties 17 | -------------------------------------------------------------------------------- /spark/log-level/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 11 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 12 | -------------------------------------------------------------------------------- /spark/log-level/log4j.properties.with.debug.log.level: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=DEBUG, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 11 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 12 | -------------------------------------------------------------------------------- /spark/log-level/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OLD_DIR=$PWD 3 | cd $HOME 4 | echo "Running Spark Shell from your $HOME directory. There (hopefully is no) log4j.properties in $HOME." 5 | echo "Notice that WARN and INFO messages will appear when you run spark-shell" 6 | echo "Enter :quit in the Spark Shell when you come to the spark> prompt. Press a key to run" 7 | read FOO 8 | spark-shell 9 | 10 | clear 11 | cd $OLD_DIR 12 | echo "Now running Spark Shell with thel log4j.properties in this directory specified. " 13 | echo "WARN and INFO messages should NOT appear in the Spark shell." 14 | echo "Enter :quit in the Spark Shell when you come to the spark> prompt. 
Press a key to run" 15 | read FOO 16 | spark-shell 17 | -------------------------------------------------------------------------------- /spark/maven_example/README.md: -------------------------------------------------------------------------------- 1 | # Readme 2 | 3 | # This is only to be performed on the VM, and is in no 4 | # way an example of a good practice 5 | 6 | cd ~/training_materials/sparkdev/projects/countjpgs 7 | mv pom.xml pom.xml.bak 8 | curl https://raw.githubusercontent.com/NathanNeff/hadoop-examples/master/spark/maven_example/pom.xml > pom.xml 9 | 10 | # to hack the local repository on the VM, remove the local repository by un-commenting this line and running 11 | # it 12 | # rm -rf ~/.m2 13 | mvn compile 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /spark/pair/sales.txt: -------------------------------------------------------------------------------- 1 | bob 10 2 | steve 10 3 | mirko 10 4 | bob 10 5 | dave 10 6 | zip 10 7 | zip 100 8 | bob 10 9 | bob 10 10 | mirko 20 11 | -------------------------------------------------------------------------------- /spark/pair/sales_by_salesperson.scala: -------------------------------------------------------------------------------- 1 | val inputFile = "file:/home/training/src/hadoop-examples/spark/pair/sales.txt" 2 | val sales = sc.textFile(inputFile) 3 | val sales_pairs = 4 | sales.map(sale => (sale.split('\t')(0), 5 | (sale.split('\t')(1)).toInt)) 6 | val sales_by_salesperson = 7 | sales_pairs.reduceByKey((s1, s2) => 8 | s1 + s2) 9 | sales_by_salesperson.take(10) 10 | -------------------------------------------------------------------------------- /spark/pair/weblogs.scala: -------------------------------------------------------------------------------- 1 | val inputFile = "file:/home/training/src/hadoop-examples/spark/pair/weblogs.txt" 2 | val weblogs = sc.textFile(inputFile) 3 | val ips_and_page = weblogs.map(s => (s.split(' ')(0) + '::' + s.split(' ')(2))) 4 | ips_and_page.take(1) 5 | -------------------------------------------------------------------------------- /spark/pair/weblogs.txt: -------------------------------------------------------------------------------- 1 | 1.2.3.4 - /foo.html 2 | 1.2.3.5 - /foo.html 3 | 1.2.3.5 - /foo.html 4 | 1.2.3.6 - /foo.html 5 | 1.2.3.6 - /foo.html 6 | 1.2.3.4 - /foo.html 7 | 192.168.1.123 - /bar.html 8 | 192.168.1.124 - /bar.html 9 | 192.168.1.123 - /bar.html 10 | 192.168.1.124 - /bar.html 11 | 192.168.1.123 - /bar.html 12 | -------------------------------------------------------------------------------- /spark/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | spark-shell -nowarn -i wordcount.scala 3 | -------------------------------------------------------------------------------- /spark/simple/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 21 | 22 | fs.default.name 23 | file:/// 24 | 25 | 26 | -------------------------------------------------------------------------------- /spark/simple/count.scala: -------------------------------------------------------------------------------- 1 | val data = sc.textFile("data.txt") 2 | data.count() 3 | -------------------------------------------------------------------------------- /spark/simple/data.txt: -------------------------------------------------------------------------------- 1 | bob 10 2 | nate 20 3 | steve 30 4 | 
-------------------------------------------------------------------------------- /spark/simple/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 11 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 12 | -------------------------------------------------------------------------------- /spark/somedata.txt: -------------------------------------------------------------------------------- 1 | spark 2 | shark 3 | spark 4 | squark 5 | lark 6 | bark 7 | lark 8 | Spark 9 | 10 | -------------------------------------------------------------------------------- /spark/spark-sql-scripts/README.md: -------------------------------------------------------------------------------- 1 | # spark-sql-scripts 2 | 3 | This directory contains simple Spark SQL snippets 4 | that are meant to be run from a Spark Shell that 5 | already has a sqlContext initialized. 6 | 7 | # create_table_and_load.scala 8 | 9 | Creates a DataFrame from an RDD using schema from 10 | an existing table. 11 | 12 | - TODO Need to investigate deprecation warning [1] (potentially from 13 | insertInto function?) and need to investigate KeyProviderCache exception [2] 14 | 15 | However, code still runs "correctly" and inserts into table. 16 | 17 | [1] warning: there were 1 deprecation warning(s); re-run with -deprecation for details 18 | 19 | [2] ERROR hdfs.KeyProviderCache: Could not find uri with key [dfs.encryption.key.provider.uri] 20 | to create a keyProvider !! 
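If the deprecation warning in [1] really does come from DataFrame.insertInto, one likely (untested here) replacement is the DataFrameWriter form introduced in Spark 1.4, e.g. in create_table_and_load.scala:

    ips_df.write.mode("append").insertInto("hadoop_examples_ips")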
21 | 22 | -------------------------------------------------------------------------------- /spark/spark-sql-scripts/computeStats.md: -------------------------------------------------------------------------------- 1 | // Examples of table stats 2 | 3 | 4 | In Hive / Impala, run the following: 5 | 6 | CREATE TABLE accounts2 7 | STORED AS PARQUET 8 | AS SELECT * FROM accounts; 9 | 10 | In SparkSQL / Spark Shell, run the following 11 | 12 | spark.sql("describe extended accounts2").select("col_name", "data_type").collect().foreach(println) 13 | 14 | In Hive / Impala, run the following: 15 | 16 | COMPUTE STATS accounts2; 17 | 18 | Re-run this statement, notice that Spark SQL picks up the table stats (albeit very course stats like totalSize) 19 | spark.sql("describe extended accounts2").select("col_name", "data_type").collect().foreach(println) 20 | 21 | -------------------------------------------------------------------------------- /spark/spark-sql-scripts/create_table_and_load.scala: -------------------------------------------------------------------------------- 1 | // This is meant to be run inside the Spark Shell 2 | import sqlContext.implicits._ 3 | import org.apache.spark.sql._ 4 | 5 | sqlContext.sql("""CREATE EXTERNAL TABLE IF NOT EXISTS hadoop_examples_ips 6 | (ip STRING) 7 | ROW FORMAT DELIMITED 8 | FIELDS TERMINATED BY '\t'""") 9 | val ips_schema = sqlContext.table("hadoop_examples_ips").schema 10 | 11 | val data = Array("123.456.789.999 - bob - GET /cat_picture.jpg", 12 | "1.2.3.4 - steve GET /dog_picture.jpg") 13 | 14 | val weblogs = sc.parallelize(data) 15 | 16 | val ips = weblogs.map(line => Row(line.split(' ')(0))) 17 | 18 | val ips_df = sqlContext.createDataFrame(ips, ips_schema) 19 | 20 | ips_df.insertInto("hadoop_examples_ips") 21 | -------------------------------------------------------------------------------- /spark/spark-sql-scripts/create_table_and_load_parquet.scala: -------------------------------------------------------------------------------- 1 | // This is meant to be run inside the Spark Shell 2 | import sqlContext.implicits._ 3 | import org.apache.spark.sql._ 4 | 5 | val tableName = "hadoop_examples_ips_parquet" 6 | 7 | // don't forget the "s" function before """ 8 | sqlContext.sql(s"""CREATE EXTERNAL TABLE IF NOT EXISTS $tableName 9 | (ip STRING) 10 | STORED AS PARQUET""") 11 | 12 | val ips_schema = sqlContext.table(tableName).schema 13 | 14 | val data = Array("123.456.789.999 - bob - GET /cat_picture.jpg", 15 | "1.2.3.4 - steve GET /dog_picture.jpg") 16 | 17 | val weblogs = sc.parallelize(data) 18 | 19 | val ips = weblogs.map(line => Row(line.split(' ')(0))) 20 | 21 | val ips_df = sqlContext.createDataFrame(ips, ips_schema) 22 | 23 | ips_df.insertInto(tableName) 24 | -------------------------------------------------------------------------------- /spark/spark-sql-scripts/data/people.json: -------------------------------------------------------------------------------- 1 | {"name":"Bob", "relatives": [ {"name":"Robert","rel":"dad"}, {"name":"Roberta","rel":"mother"}]} 2 | {"name":"Steve", "relatives": [ {"name":"Steven","rel":"dad"}]} 3 | {"name":"Sharon", "relatives": [ {"name":"Ozzy","rel":"husband"}]} 4 | {"name":"Han Solo"} 5 | -------------------------------------------------------------------------------- /spark/spark-sql-scripts/parse_json.scala: -------------------------------------------------------------------------------- 1 | import sqlContext.implicits._ 2 | import org.apache.spark.sql._ 3 | 4 | val curDir = System.getProperty("user.dir") 5 | val 
json = sqlContext.read.json("file:" + curDir + "/data/people.json") 6 | json.printSchema() 7 | json.registerTempTable("json") 8 | 9 | /* 10 | TODO figure out what the table_name versus column_name 11 | means when EXPLODING arrays of structs 12 | https://docs.databricks.com/spark/latest/spark-sql/language-manual/select.html 13 | (Remove OUTER to only get people with relatives) 14 | */ 15 | val flattened_table = sqlContext.sql("""SELECT name, r.name AS relative_name, r.rel as foo 16 | FROM json 17 | LATERAL VIEW OUTER EXPLODE(relatives) r AS r""") 18 | 19 | flattened_table.show() 20 | flattened_table.printSchema 21 | 22 | 23 | -------------------------------------------------------------------------------- /spark/spark-sql/README.md: -------------------------------------------------------------------------------- 1 | ## Step 1: Add data to HDFS 2 | 3 | hdfs dfs -mkdir /user/training/ 4 | hdfs dfs -put data/favorite_foods /user/training 5 | 6 | ## Step 2 7 | 8 | ## Step 3: Profit! 9 | 10 | mvn package 11 | spark-submit --class examples.ExplodeAndFriends target/SparkSQLExamples-1.0.jar 12 | -------------------------------------------------------------------------------- /spark/spark-sql/data/favorite_foods/favorite_foods.txt: -------------------------------------------------------------------------------- 1 | 1 chocolate,bread,pot roast,chicken 2 | 2 pizza,milk,flowers,led zeppelin,rabbit,duck,pheasant,pheasant 3 | 3 nothing 4 | 4 5 | 5 cheese,milk,bananas,apples 6 | 6 carrots,celery,steak 7 | 7 chili 8 | 8 turkey 9 | 9 roasted duck 10 | 10 possum soup 11 | 11 stone soup,french fries 12 | 12 hamburgers,quail,milk 13 | -------------------------------------------------------------------------------- /spark/spark-sql/src/main/scala/examples/SimpleShowTables.scala: -------------------------------------------------------------------------------- 1 | package examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.hive._ 6 | 7 | object MoreSQL { 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext() 11 | sc.setLogLevel("FATAL") 12 | 13 | val sqlContext = new HiveContext(sc) 14 | 15 | sqlContext.sql("DROP TABLE IF EXISTS favorite_foods") 16 | val sql = """ 17 | CREATE EXTERNAL TABLE favorite_foods( 18 | userid INT, 19 | favorite_foods STRING) 20 | ROW FORMAT DELIMITED 21 | FIELDS TERMINATED BY '\t' 22 | LOCATION '/user/training/favorite_foods' 23 | """ 24 | sqlContext.sql(sql) 25 | 26 | 27 | // http://spark.apache.org/docs/1.6.0/api/scala/index.html#org.apache.spark.sql.DataFrame 28 | val fav_foods = sqlContext.read.table("favorite_foods") 29 | 30 | val expl = fav_foods.explode("favorite_foods", "favorite_food") { 31 | foods:String => foods.split(",") 32 | } 33 | 34 | expl.columns 35 | expl.show() 36 | 37 | val lateral = sqlContext.sql("SELECT userid, favorite_food FROM favorite_foods LATERAL VIEW explode(SPLIT(favorite_foods,',')) adTable AS favorite_food") 38 | 39 | lateral.columns 40 | lateral.show() 41 | 42 | sc.stop 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /spark/spark-sql/src/main/scala/examples/explode_and_friends.scala: -------------------------------------------------------------------------------- 1 | package examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.hive._ 6 | 7 | object ExplodeAndFriends { 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext() 
11 | sc.setLogLevel("FATAL") 12 | 13 | val sqlContext = new HiveContext(sc) 14 | 15 | sqlContext.sql("DROP TABLE IF EXISTS favorite_foods") 16 | val sql = """ 17 | CREATE EXTERNAL TABLE favorite_foods( 18 | userid INT, 19 | favorite_foods STRING) 20 | ROW FORMAT DELIMITED 21 | FIELDS TERMINATED BY '\t' 22 | LOCATION '/user/training/favorite_foods' 23 | """ 24 | sqlContext.sql(sql) 25 | 26 | 27 | // http://spark.apache.org/docs/1.6.0/api/scala/index.html#org.apache.spark.sql.DataFrame 28 | val fav_foods = sqlContext.read.table("favorite_foods") 29 | 30 | val expl = fav_foods.explode("favorite_foods", "favorite_food") { 31 | foods:String => foods.split(",") 32 | } 33 | 34 | expl.columns 35 | expl.show() 36 | 37 | val lateral = sqlContext.sql("SELECT userid, favorite_food FROM favorite_foods LATERAL VIEW explode(SPLIT(favorite_foods,',')) adTable AS favorite_food") 38 | 39 | lateral.columns 40 | lateral.show() 41 | 42 | sc.stop 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /spark/spark-sql/src/main/scala/examples/more_sql.scala: -------------------------------------------------------------------------------- 1 | package examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.hive._ 6 | 7 | object MoreSQL { 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext() 11 | sc.setLogLevel("FATAL") 12 | 13 | val sqlContext = new HiveContext(sc) 14 | 15 | sqlContext.sql("DROP TABLE IF EXISTS favorite_foods") 16 | val sql = """ 17 | CREATE EXTERNAL TABLE favorite_foods( 18 | userid INT, 19 | favorite_foods STRING) 20 | ROW FORMAT DELIMITED 21 | FIELDS TERMINATED BY '\t' 22 | LOCATION '/user/training/favorite_foods' 23 | """ 24 | sqlContext.sql(sql) 25 | 26 | 27 | // http://spark.apache.org/docs/1.6.0/api/scala/index.html#org.apache.spark.sql.DataFrame 28 | val fav_foods = sqlContext.read.table("favorite_foods") 29 | 30 | val expl = fav_foods.explode("favorite_foods", "favorite_food") { 31 | foods:String => foods.split(",") 32 | } 33 | 34 | expl.columns 35 | expl.show() 36 | 37 | val lateral = sqlContext.sql("SELECT userid, favorite_food FROM favorite_foods LATERAL VIEW explode(SPLIT(favorite_foods,',')) adTable AS favorite_food") 38 | 39 | lateral.columns 40 | lateral.show() 41 | 42 | sc.stop 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /spark/sparkml/kmeans.py: -------------------------------------------------------------------------------- 1 | from pyspark.ml.clustering import KMeans 2 | from pyspark.ml import Pipeline 3 | from pyspark.ml import PipelineModel 4 | from pyspark.sql import Row 5 | from pyspark.ml.feature import VectorAssembler 6 | from pyspark.sql.functions import split, col 7 | 8 | # Load the data from a directory with device status data including 9 | # latitude, longitude 10 | # Create a DataFrame that #has 2 columns named Lat and Lon from the 4th and 5th 11 | # fields in the file 12 | 13 | filename = input("Please enter the HDFS Directory where the data is located:") 14 | latLonDF = spark.read.csv(filename).\ 15 | select(col('_c3').cast('float').alias('lat'),\ 16 | col('_c4').cast('float').alias('lon'))\ 17 | .where("lat <> 0 and lon <> 0") 18 | 19 | 20 | # Create a vector assembler that will take in our DataFrame and convert the 21 | # inputCols specified 22 | va = VectorAssembler(inputCols=["lat","lon"],outputCol="features") 23 | 24 | # Use the vector assembler to transform the 
DataFame which will add a new 25 | # column called 'features' which will be of the Vector type 26 | 27 | vectorDF = va.transform(latLonDF) 28 | 29 | # Create a Kmeans estimator that takes the "features" column as input and set 30 | # the value for K to 5 with a tolerance of .01 and a seed # of 12345 31 | km= KMeans(k=5,tol=.01,seed=12345, featuresCol="features") 32 | 33 | kmModel = km.fit(vectorDF) 34 | 35 | # Print out the cluster centers 36 | for center in kmModel.clusterCenters(): print(center) 37 | predictionDF = kmModel.transform(vectorDF) 38 | predictionDF.show() 39 | 40 | # Same process via an ML pipeline 41 | # pl = Pipeline(stages=[va,km]) 42 | # plmodel = pl.fit(latLonDF) 43 | # predictions = plmodel.transform(latLonDF) 44 | # plmodel.write().overwrite().save("/loudacre/pipelineModel/") 45 | # plmodel1 = PipelineModel.load("/loudacre/pipelineModel/") 46 | # predictions.show(5) 47 | 48 | -------------------------------------------------------------------------------- /spark/structured_streaming_sensors/rate_source_simple.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple example of Spark Structured Streaming's rate source 3 | 4 | References: Changing a column's name: 5 | - https://stackoverflow.com/questions/34077353 6 | """ 7 | from pyspark.sql import SparkSession 8 | 9 | spark = SparkSession.builder.getOrCreate() 10 | 11 | # read data from a set of streaming files 12 | rateDF = spark.readStream.format("rate").option("rowsPerSecond", 50).load() 13 | 14 | # rateDF.printSchema() 15 | # root 16 | # |-- timestamp: timestamp (nullable = true) 17 | # |-- value: long (nullable = true) 18 | 19 | rateQuery = rateDF.writeStream.format("console").option("truncate", False).start() 20 | 21 | # Remove all columns except "timestamp" and rename 22 | renamedQuery = rateDF.selectExpr("timestamp AS ts").\ 23 | writeStream.\ 24 | format("console").option("truncate", False).start() 25 | 26 | # Call rateQuery.stop() 27 | # Call renamedQuery.stop() 28 | -------------------------------------------------------------------------------- /spark/tf-idf/tf-idf.spark: -------------------------------------------------------------------------------- 1 | myfiles = "hdfs://localhost:8020/user/training/mytext/" 2 | mytext = sc.wholeTextFiles(myfiles) 3 | mylines = mytext.map(lambda (filename, content) : ((os.path.basename(filename),content.replace("\n", " ")))) 4 | mywords = mylines.flatMapValues(lambda content : content.split(" ")) 5 | 6 | tf = mywords.map(lambda (filename, word) : ((filename, word), 1)).reduceByKey(lambda a,b : a+b) 7 | 8 | bign = mytext.count() 9 | 10 | df = tf.map(lambda ((file,word),count) : (word,1)).countByKey() 11 | 12 | import math 13 | 14 | tfidf = tf.map(lambda ((file,word),count) : ((file,word),count*math.log(bign/df.get(word)))) 15 | tfidf.collect() 16 | -------------------------------------------------------------------------------- /spark/wordcount.scala: -------------------------------------------------------------------------------- 1 | val textFile = sc.textFile("somedata.txt") 2 | textFile.count() // Number of items in this RDD 3 | textFile.first() // First item in this RDD 4 | val linesWithSpark = textFile.filter(line => line.contains("Spark")) 5 | textFile.filter(line => line.contains("Spark")).count() // How many lines contain "Spark"? 
6 | println linesWithSpark 7 | exit 8 | -------------------------------------------------------------------------------- /spark/wordlength_with_details/data.txt: -------------------------------------------------------------------------------- 1 | one oompf 2 | otto 3 | ottoman 4 | terabyte 5 | tough 6 | thyme 7 | -------------------------------------------------------------------------------- /spark/wordlength_with_details/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 13 | -------------------------------------------------------------------------------- /spark/wordlength_with_details/wordlength_with_details.scala: -------------------------------------------------------------------------------- 1 | val input_dir = System.getProperty("user.dir") 2 | 3 | val words = sc.textFile("file:" + input_dir + "/data.txt"). 4 | flatMap(line => line.split(" ")) 5 | 6 | /* ("o", "other"), 7 | * ("o", "otto"), 8 | * ("t", "tomcat"), 9 | * ("o", "otto"), 10 | */ 11 | val first_letter_and_word = words.map( 12 | word => (word.substring(0,1), word)) 13 | 14 | val first_letter_counts = 15 | first_letter_and_word.map{ 16 | case (first_letter, word) => (first_letter, 1)}. 17 | reduceByKey((x,y) => (x + y)) 18 | 19 | 20 | /* Produce: 21 | * ("o", "otto,other") 22 | * ("t", "too",tomcat") 23 | */ 24 | val first_letter_and_wordlist = 25 | first_letter_and_word.distinct(). 26 | groupByKey(). 27 | mapValues(_.mkString(",")) 28 | 29 | val counts_with_words = 30 | first_letter_counts.join(first_letter_and_wordlist) 31 | 32 | val output_dir = "file:" + input_dir + "/wordlength_with_details" 33 | counts_with_words.saveAsTextFile(output_dir) 34 | 35 | 36 | -------------------------------------------------------------------------------- /sql-diffs/.gitignore: -------------------------------------------------------------------------------- 1 | *results.txt 2 | -------------------------------------------------------------------------------- /sql-diffs/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | Illustration of variations in the SQL-dialects 4 | on a given version of Hadoop/Impala. 5 | 6 | To load data from ./accts.txt 7 | 8 | ./load_data.sh 9 | 10 | To compare SQL variations, look at query*.sql files, then run 11 | them using: 12 | 13 | ./run_queries.sh 14 | 15 | To compare output, review output files: 16 | 17 | ./query1.sql-hive-results.txt vs. 18 | ./query1.sql-impala-results.txt vs. 19 | ./query1.sql-mysql-results.txt 20 | 21 | You could use some diff script or just run diff. 
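For example, to eyeball the differences for the first query (assuming ./run_queries.sh has already produced the result files):

    diff query1.sql-hive-results.txt query1.sql-impala-results.txt
    diff query1.sql-hive-results.txt query1.sql-mysql-results.txt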
22 | -------------------------------------------------------------------------------- /sql-diffs/accts.txt: -------------------------------------------------------------------------------- 1 | 1 Aaron Aardvark CO 2 | 2 Betty Aardvark CO 3 | 3 Cathy Aardvark IL 4 | 4 Steve Snake MO 5 | 5 Steve Slither CO 6 | 6 Steve Zeppelin CO 7 | 7 Lenny Zeppelin CO 8 | 8 Sal Stevens IL 9 | 20 Aaron Aardvark CO 10 | 21 Aaron Aardvark CO 11 | 22 Sandwichhead Aardvark CO 12 | 22 Sandwichhead Bigfoot CO 13 | -------------------------------------------------------------------------------- /sql-diffs/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | DB_NAME=$1 4 | USER_ID=$2 5 | PASSWORD=$3 6 | if [[ -z "$DB_NAME" || -z "$USER_ID" || -z "$PASSWORD" ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Load MySQL 12 | mysql -u $USER_ID --password=$PASSWORD < " 9 | exit 1 10 | fi 11 | for SQL_QUERY_FILE in query*.sql; do 12 | impala-shell -d $DB_NAME -f $SQL_QUERY_FILE \ 13 | -o $SQL_QUERY_FILE-impala-results.txt \ 14 | --delimited 15 | mysql -u $USER_ID --password=$PASSWORD $DB_NAME --column-names=false \ 16 | < $SQL_QUERY_FILE > $SQL_QUERY_FILE-mysql-results.txt 17 | 18 | beeline -u jdbc:hive2://localhost:10000/$DB_NAME --silent --verbose=false \ 19 | --showHeader=false \ 20 | --username=$USER_ID --password=$PASSWORD \ 21 | -f $SQL_QUERY_FILE --outputformat=tsv2 | sed -e '/^$d/d' > $SQL_QUERY_FILE-hive-results.txt 22 | done 23 | -------------------------------------------------------------------------------- /sqoop/README.md: -------------------------------------------------------------------------------- 1 | # Sqoop Job 2 | 3 | 4 | execute ./sqoop-job-create 5 | sqoop job --list 6 | sqoop job --show 7 | sqoop job --exec import-accounts 8 | sqoop job --show 9 | -------------------------------------------------------------------------------- /sqoop/sqoop-job-create: -------------------------------------------------------------------------------- 1 | sqoop job --create import-accounts -- \ 2 | import \ 3 | --connect jdbc:mysql://servername/dbname \ 4 | --username something --password something \ 5 | --table accounts \ 6 | --target-dir /accounts \ 7 | --null-string '\\N' \ 8 | --null-non-string '\\N' \ 9 | --incremental append \ 10 | --check-column acct_num 11 | -------------------------------------------------------------------------------- /utils/random_crash/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_DIR=input/random-crash 3 | OUTPUT_DIR=output/random-crash 4 | STREAMING_JAR=/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-*.jar 5 | 6 | hadoop fs -test -d $INPUT_DIR && hadoop fs -rm -R $INPUT_DIR 7 | hadoop fs -test -d $OUTPUT_DIR && hadoop fs -rm -R $OUTPUT_DIR 8 | hadoop fs -mkdir $INPUT_DIR 9 | 10 | for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20;do 11 | hadoop fs -touchz $INPUT_DIR/$i 12 | done 13 | 14 | for j in 1 2 3 4 15 | do 16 | echo "Starting job $j" 17 | hadoop jar $STREAMING_JAR \ 18 | -Dmapred.reduce.tasks=0 \ 19 | -input $INPUT_DIR \ 20 | -output $OUTPUT_DIR \ 21 | -mapper mapper.pl \ 22 | -file mapper.pl \ 23 | -Dmapred.job.name=Random_Crash_$j & 24 | done 25 | -------------------------------------------------------------------------------- /utils/setup_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for dir in $EXAMPLES_DIR 
/opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce 3 | do 4 | test -d $dir && export EXAMPLES_DIR=$dir 5 | done 6 | 7 | test -d $EXAMPLES_DIR || { 8 | echo "Can't find examples dir $EXAMPLES_DIR" 9 | exit 1 10 | } 11 | 12 | export EXAMPLES_JAR=$EXAMPLES_DIR/hadoop-examples.jar 13 | 14 | test -f $EXAMPLES_JAR || { 15 | echo "Can't find $EXAMPLES_JAR" 16 | exit 1 17 | } 18 | -------------------------------------------------------------------------------- /utils/sleepjob/allocations.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 8 5 | 3 6 | FIFO 7 | 30 8 | 9 | 10 | 11 | 12 | 60 13 | 14 | -------------------------------------------------------------------------------- /utils/sleepjob/bigjob.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUTPUT_DIR=/user/training/nate/output/wordcount2 3 | INPUT_DIR=/user/training/nate/input 4 | hadoop fs -test -d $OUTPUT_DIR && hadoop fs -rm -R $OUTPUT_DIR 5 | hadoop jar /usr/lib/hadoop-0.20-mapreduce/hadoop-examples.jar wordcount $INPUT_DIR $OUTPUT_DIR 6 | -------------------------------------------------------------------------------- /utils/sleepjob/submitLONGSleepJob.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hadoop jar ~/assets/sleep.jar SleepJob \ 3 | -D pool.name="BallHog" \ 4 | -D mapred.job.name="BallHogJob" \ 5 | -m 10 -r 10 -mt 300000 -rt 300000 & 6 | echo "Just submitted BallHog job" 7 | -------------------------------------------------------------------------------- /utils/sleepjob/submitReportsBOSSPool.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for month in January February March April May; do 3 | hadoop jar ~/assets/sleep.jar SleepJob \ 4 | -D pool.name="boss" \ 5 | -D mapred.job.name="FIFO $month" \ 6 | -m 10 -r 10 -mt 30000 -rt 30000 & 7 | sleep 2 8 | done 9 | #-fs hdfs://greg:8020 \ 10 | #-jt hari:8021 \ 11 | -------------------------------------------------------------------------------- /utils/teragen-and-terasort.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage is here: 3 | # http://www.michael-noll.com/blog/2011/04/09/benchmarking-and-stress-testing-an-hadoop-cluster-with-terasort-testdfsio-nnbench-mrbench/#teragen-generate-the-terasort-input-data-if-needed 4 | INPUT_DIR=data/teragendata_10_gb 5 | OUTPUT_DIR=output/terasort_10_gb 6 | 7 | # 1 GB = 10000000 8 | # 20 GB = 200000000 9 | TERAGEN_SIZE=100000000 10 | # Try Bumping # of Maps for Teragen :-) 11 | TERAGEN_MAPS=20 12 | FORCE_TERAGEN=1 13 | # Set RUN_COMPARISON to 1 if you want to run terasort (again) with io.sort.mb set differently 14 | RUN_COMPARISON=1 15 | OUTPUT_DIR2=output/terasort10_gb_again 16 | 17 | # If you don't have EXAMPLES_DIR set, you can manually set it here, 18 | # or let this code loop through possible dirs and find an existing directory 19 | test -z "$EXAMPLES_DIR" && { 20 | for d in /usr/lib/hadoop-0.20-mapreduce/ /opt/cloudera/parcels/CDH/lib/hadoop-0.20-mapreduce; 21 | do 22 | test -d $d && EXAMPLES_DIR=$d 23 | done 24 | } 25 | 26 | # hadoop jar hadoop-*examples*.jar teragen 27 | if ! 
hadoop fs -test -e $INPUT_DIR/part-00000 || [[ "$FORCE_TERAGEN" -ne "0" ]]; then 28 | echo "------------- Creating test data --------------" 29 | hadoop fs -rm -R $INPUT_DIR 30 | 31 | hadoop jar $EXAMPLES_DIR/hadoop-examples.jar teragen -Dmapred.map.tasks=$TERAGEN_MAPS $TERAGEN_SIZE $INPUT_DIR 32 | fi 33 | 34 | hadoop fs -test -d $OUTPUT_DIR && hadoop fs -rm -R $OUTPUT_DIR 35 | hadoop jar $EXAMPLES_DIR/hadoop-examples.jar terasort $INPUT_DIR $OUTPUT_DIR 36 | 37 | if [[ "1" -eq "$RUN_COMPARISON" ]]; then 38 | # This might be needed if running on EC2 (Admin Class) or smaller cluster 39 | # with less than 1GB Heap Size for Mappers 40 | # -Dmapred.child.java.opts=-Xmx512m 41 | hadoop jar $EXAMPLES_DIR/hadoop-examples.jar terasort -Dio.sort.mb=256 $INPUT_DIR $OUTPUT_DIR2 42 | fi 43 | --------------------------------------------------------------------------------