├── .gitignore ├── 01_getting_started ├── 01_getting_started.ipynb ├── 02_overview_of_spark_documentation.ipynb ├── 03_overview_of_spark_sql_cli.ipynb ├── 04_overview_of_spark_sql_properties.ipynb ├── 05_running_os_commands.ipynb ├── 06_understanding_warehouse_directory.ipynb ├── 07_managing_spark_metastore_databases.ipynb ├── 08_managing_spark_metastore_tables.ipynb ├── 09_retrieve_metadata_of_tables.ipynb ├── 10_role_of_spark_or_hive_metastore.ipynb └── 11_exercise_getting_started.ipynb ├── 02_overview_of_hdfs ├── 01_Overview of HDFS.ipynb ├── 02_using_hdfs_cli.ipynb ├── 03_getting_help_or_usage.ipynb ├── 04_listing_hdfs_files.ipynb ├── 05_managing_hdfs_directories.ipynb ├── 06_copying_files_from_local_to_hdfs.ipynb ├── 07_copying_files_from_hdfs_to_local.ipynb ├── 08_copying_files_from_hdfs_to_hdfs.ipynb ├── 09_previewing_data_in_hdfs_files.ipynb ├── 10_getting_file_metadata.ipynb ├── 11_hdfs_blocksize.ipynb ├── 12_hdfs_replication_factor.ipynb ├── 13_getting_hdfs_storage_usage.ipynb ├── 14_using_hdfs_stat_command.ipynb ├── 15_hdfs_file_permissions.ipynb ├── 16_overriding_properties.ipynb └── understanding_linux_file_system_240.ipynb ├── 03_basic_transformations ├── 01_basic_transformations.ipynb ├── 02_spark_sql_overview.ipynb ├── 03_define_problem_statement.ipynb ├── 04_preparing_tables.ipynb ├── 05_projecting_data.ipynb ├── 06_filtering_data.ipynb ├── 07_joining_tables_inner.ipynb ├── 08_joining_tables_outer.ipynb ├── 09_aggregating_data.ipynb ├── 10_sorting_data.ipynb ├── 11_conclusion_final_solution.ipynb └── 12_exercises_basic_sql_queries.ipynb ├── 04_basic_ddl_and_dml ├── 01_basic_ddl_and_dml.ipynb ├── 02_create_spark_metastore_tables.ipynb ├── 03_overview_of_data_types.ipynb ├── 04_adding_comments.ipynb ├── 05_loading_data_into_tables_local.ipynb ├── 06_loading_data_into_tables_hdfs.ipynb ├── 07_loading_data_append_and_overwrite.ipynb ├── 08_creating_external_tables.ipynb ├── 09_managed_vs_external_tables.ipynb ├── 10_overview_of_file_formats.ipynb ├── 11_dropping_tables_and_databases.ipynb ├── 12_truncating_tables.ipynb └── 13_exercises_managing_tables.ipynb ├── 05_dml_and_partitioning ├── 01_dml_and_partitioning.ipynb ├── 02_introduction_to_partitioning.ipynb ├── 03_creating_tables_using_parquet.ipynb ├── 04_load_vs_insert.ipynb ├── 05_inserting_data_using_stage_table.ipynb ├── 06_creating_partitioned_tables.ipynb ├── 07_adding_partitions_to_tables.ipynb ├── 08_loading_data_into_partitions.ipynb ├── 09_inserting_data_into_partitions.ipynb ├── 10_using_dynamic_partition_mode.ipynb └── 11_exercises_partitioned_tables.ipynb ├── 06_predefined_functions ├── 01_predefined_functions.ipynb ├── 02_overview_of_functions.ipynb ├── 03_validating_functions.ipynb ├── 04_string_manipulation_functions.ipynb ├── 05_date_manipulation_functions.ipynb ├── 06_overview_of_numeric_functions.ipynb ├── 07_data_type_conversion.ipynb ├── 08_handling_null_values.ipynb ├── 09_using_case_and_when.ipynb ├── 10_query_example_word_count.ipynb └── 11_exercises_pre_defined_functions.ipynb ├── 07_windowing_functions ├── 01_windowing_functions.ipynb ├── 02_prepare_database.ipynb ├── 03_overview_of_windowing_functions.ipynb ├── 04_aggregations_using_windowing_functions.ipynb ├── 05_using_lead_or_lag.ipynb ├── 06_getting_first_and_last_values.ipynb ├── 07_ranking_using_windowing_functions.ipynb ├── 08_order_of_execution_of_sql.ipynb ├── 09_overview_of_sub_queries.ipynb ├── 10_filtering_window_function_results.ipynb ├── 11_cumulative_or_moving_aggregations.ipynb └── 12_exercises_windowing_functions.ipynb ├── 
08_analyze_github_activity └── 02_download_data.ipynb ├── LICENSE ├── README.md ├── Spark Data Frames to Temp Views.ipynb ├── _config.yml ├── _toc.yml └── spark-sql.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | -------------------------------------------------------------------------------- /01_getting_started/01_getting_started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting Started\n", 8 | "\n", 9 | "Let us get started to get into Spark SQL. In this module we will see how to launch and use Spark SQL." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "* Overview of Spark Documentation\n", 42 | "* Launching and Using Spark SQL\n", 43 | "* Overview of Spark SQL Properties\n", 44 | "* Running OS Commands using Spark SQL\n", 45 | "* Understanding Warehouse Directory\n", 46 | "* Managing Spark Metastore Databases\n", 47 | "* Managing Spark Metastore Tables\n", 48 | "* Retrieve Metadata of Tables\n", 49 | "* Role of Spark or Hive Metastore" 50 | ] 51 | } 52 | ], 53 | "metadata": { 54 | "celltoolbar": "Tags", 55 | "kernelspec": { 56 | "display_name": "Apache Toree - Scala", 57 | "language": "scala", 58 | "name": "apache_toree_scala" 59 | }, 60 | "language_info": { 61 | "codemirror_mode": "text/x-scala", 62 | "file_extension": ".scala", 63 | "mimetype": "text/x-scala", 64 | "name": "scala", 65 | "pygments_lexer": "scala", 66 | "version": "2.11.12" 67 | } 68 | }, 69 | "nbformat": 4, 70 | "nbformat_minor": 4 71 | } 72 | -------------------------------------------------------------------------------- /01_getting_started/02_overview_of_spark_documentation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of Spark Documentation\n", 8 | "\n", 9 | "Let us go through the details related to Spark Documentation. It is very important for you to get comfortable with Spark Documentation if you are aspiring for open book certification exams like CCA 175." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "* Click [here](https://spark.apache.org/docs/latest/sql-programming-guide.html) to go to latest Spark SQL and Data Frames documentation. \n", 42 | "* We typically get documentation for latest version.\n", 43 | "* We can replace **latest** in the URL with the version of Spark to get specific version's official documentation.\n", 44 | "* Also we have resources provided by **databricks**." 
45 | ] 46 | } 47 | ], 48 | "metadata": { 49 | "celltoolbar": "Tags", 50 | "kernelspec": { 51 | "display_name": "Scala", 52 | "language": "scala", 53 | "name": "scala" 54 | }, 55 | "language_info": { 56 | "codemirror_mode": "text/x-scala", 57 | "file_extension": ".scala", 58 | "mimetype": "", 59 | "name": "Scala", 60 | "nbconverter_exporter": "", 61 | "version": "2.11.12" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 4 66 | } 67 | -------------------------------------------------------------------------------- /01_getting_started/03_overview_of_spark_sql_cli.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of Spark SQL CLI\n", 8 | "\n", 9 | "Let us understand how to launch Spark SQL CLI." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "* Logon to the gateway node of the cluster.\n", 42 | "* We have 2 versions of Spark in our labs. One can use `spark-sql` to launch Spark SQL using 1.6.x and `spark2-sql` to launch Spark SQL using 2.3.x.\n", 43 | "* Launch Spark SQL CLI using `spark-sql`. In clustered mode we might have to add additional arguments. For example\n", 44 | "\n", 45 | "```\n", 46 | "spark2-sql \\\n", 47 | " --master yarn \\\n", 48 | " --conf spark.ui.port=0 \\\n", 49 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 50 | "```\n", 51 | "* One can get help using `spark-sql --help`\n", 52 | "* For e. g.: we can use `spark-sql --database training_retail` to connect to specific database. Here is the example in clustered mode.\n", 53 | "\n", 54 | "```\n", 55 | "spark2-sql \\\n", 56 | " --master yarn \\\n", 57 | " --conf spark.ui.port=0 \\\n", 58 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse \\\n", 59 | " --database ${USER}_retail\n", 60 | "```\n", 61 | "* Spark SQL CLI will be launched and will be connected to **${USER}_retail** database.\n", 62 | "* We can validate to which database we are connected to using `SELECT current_database()`" 63 | ] 64 | } 65 | ], 66 | "metadata": { 67 | "celltoolbar": "Tags", 68 | "kernelspec": { 69 | "display_name": "Apache Toree - Scala", 70 | "language": "scala", 71 | "name": "apache_toree_scala" 72 | }, 73 | "language_info": { 74 | "codemirror_mode": "text/x-scala", 75 | "file_extension": ".scala", 76 | "mimetype": "text/x-scala", 77 | "name": "scala", 78 | "pygments_lexer": "scala", 79 | "version": "2.11.12" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 4 84 | } 85 | -------------------------------------------------------------------------------- /01_getting_started/04_overview_of_spark_sql_properties.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of Spark SQL Properties\n", 8 | "Let us understand details about Spark SQL properties which control Spark SQL run time environment. 
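A note on the Spark SQL CLI covered above: besides the interactive shell, `spark-sql` (or `spark2-sql` in our labs) also accepts the `-e` and `-f` options listed in `spark-sql --help` to run a query string or a script file non-interactively. Below is a minimal sketch, assuming the lab setup shown earlier; the script name `queries.sql` is just a placeholder.

```shell
# Run a single statement and exit
spark2-sql \
  --master yarn \
  --conf spark.ui.port=0 \
  --conf spark.sql.warehouse.dir=/user/${USER}/warehouse \
  -e "SELECT current_database()"

# Run all the statements in a script file and exit
spark2-sql \
  --master yarn \
  --conf spark.ui.port=0 \
  --conf spark.sql.warehouse.dir=/user/${USER}/warehouse \
  -f queries.sql
```

This comes in handy for scripting and scheduled jobs where an interactive session is not practical.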
" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 3, 14 | "metadata": { 15 | "tags": [ 16 | "remove-cell" 17 | ] 18 | }, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/html": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "%%HTML\n", 33 | "" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "* Spark SQL inherits properties defined for Spark. There are some Spark SQL related properties as well and these are applicable even for Data Frames.\n", 41 | "* We can review these properties using Management Tools such as **Ambari** or **Cloudera Manager Web UI**\n", 42 | "* Spark run time behavior is controlled by HDFS Properties files, YARN Properties files, Hive Properties files etc in those clusters where Spark is integrated with Hadoop and Hive.\n", 43 | "* We can get all the properties using `SET;` in Spark SQL CLI\n", 44 | "\n", 45 | "Let us review some important properties in Spark SQL. \n", 46 | "\n", 47 | "```\n", 48 | "spark.sql.warehouse.dir\n", 49 | "spark.sql.catalogImplementation\n", 50 | "```\n", 51 | "* We can review the current value using `SET spark.sql.warehouse.dir;`" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "import org.apache.spark.sql.SparkSession\n", 61 | "\n", 62 | "val username = System.getProperty(\"user.name\")\n", 63 | "val spark = SparkSession.\n", 64 | " builder.\n", 65 | " config(\"spark.ui.port\", \"0\").\n", 66 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 67 | " enableHiveSupport.\n", 68 | " master(\"yarn\").\n", 69 | " appName(s\"${username} | Spark SQL - Getting Started\").\n", 70 | " getOrCreate" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "%%sql\n", 80 | "\n", 81 | "SET" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 2, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "Waiting for a Spark session to start..." 93 | ] 94 | }, 95 | "metadata": {}, 96 | "output_type": "display_data", 97 | "source": "user" 98 | }, 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "+--------------------+--------------------+\n", 103 | "| key| value|\n", 104 | "+--------------------+--------------------+\n", 105 | "|spark.sql.warehou...|/user/itversity/w...|\n", 106 | "+--------------------+--------------------+\n", 107 | "\n" 108 | ] 109 | }, 110 | "execution_count": 2, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "%%sql\n", 117 | "\n", 118 | "SET spark.sql.warehouse.dir" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "* Properties with default values does not show up as part of `SET` command. 
But we can check and overwrite the values - for example" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "%%sql\n", 135 | "\n", 136 | "SET spark.sql.shuffle.partitions" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "* We can overwrite property by setting value using the same **SET** command, eg:" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "%%sql\n", 153 | "\n", 154 | "SET spark.sql.shuffle.partitions=2" 155 | ] 156 | } 157 | ], 158 | "metadata": { 159 | "celltoolbar": "Tags", 160 | "kernelspec": { 161 | "display_name": "Apache Toree - Scala", 162 | "language": "scala", 163 | "name": "apache_toree_scala" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": "text/x-scala", 167 | "file_extension": ".scala", 168 | "mimetype": "text/x-scala", 169 | "name": "scala", 170 | "pygments_lexer": "scala", 171 | "version": "2.11.12" 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 4 176 | } 177 | -------------------------------------------------------------------------------- /01_getting_started/05_running_os_commands.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Running OS Commands\n", 8 | "\n", 9 | "Let us understand how to run OS commands using Spark SQL CLI." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 4, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 4, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "* We can run OS commands using **!** at the beginning.\n", 42 | " * Listing local Files `!ls -ltr;`\n", 43 | " * Listing HDFS Files `!hdfs dfs -ls /public/retail_db;`" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import sys.process._\n", 53 | "\n", 54 | "\"ls -ltr\"!" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import sys.process._\n", 64 | "\n", 65 | "\"hdfs dfs -ls /public/retail_db\"!" 
66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "celltoolbar": "Tags", 71 | "kernelspec": { 72 | "display_name": "Apache Toree - Scala", 73 | "language": "scala", 74 | "name": "apache_toree_scala" 75 | }, 76 | "language_info": { 77 | "codemirror_mode": "text/x-scala", 78 | "file_extension": ".scala", 79 | "mimetype": "text/x-scala", 80 | "name": "scala", 81 | "pygments_lexer": "scala", 82 | "version": "2.11.12" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 4 87 | } 88 | -------------------------------------------------------------------------------- /01_getting_started/06_understanding_warehouse_directory.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Understanding Warehouse Directory\n", 8 | "\n", 9 | "Let us go through the details related to Spark Metastore Warehouse Directory." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 5, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 5, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "* A Database in Spark SQL is nothing but directory in underlying file system like HDFS. \n", 42 | "* A Spark Metastore Table is nothing but directory in underlying file systems like HDFS.\n", 43 | "* A Partition of Spark Metastore Table is nothing but directory in underlying file systems like HDFS under table.\n", 44 | "* Warehouse Directory is the base directory where directories related to databases, tables go by default.\n", 45 | "* It is controlled by `spark.sql.warehouse.dir`. You can get the value by saying `SET spark.sql.warehouse.dir;`\n", 46 | "> Do not overwrite this property Spark SQL CLI. It will not have any effect.\n", 47 | "* Underlying directory for a database will have **.db** extension." 
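Since databases and tables are just directories under the warehouse, a simple way to see this is to list the warehouse directory in HDFS. Here is a minimal sketch, assuming the lab convention of `/user/${USER}/warehouse` and a database named `${USER}_retail` created as shown later in this section.

```shell
# Databases appear as <database_name>.db directories under the warehouse
hdfs dfs -ls /user/${USER}/warehouse

# Tables appear as subdirectories under the corresponding .db directory
hdfs dfs -ls /user/${USER}/warehouse/${USER}_retail.db
```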
48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "import org.apache.spark.sql.SparkSession\n", 57 | "\n", 58 | "val username = System.getProperty(\"user.name\")\n", 59 | "val spark = SparkSession.\n", 60 | " builder.\n", 61 | " config(\"spark.ui.port\", \"0\").\n", 62 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 63 | " enableHiveSupport.\n", 64 | " master(\"yarn\").\n", 65 | " appName(s\"${username} | Spark SQL - Getting Started\").\n", 66 | " getOrCreate" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "%%sql\n", 76 | "\n", 77 | "SET spark.sql.warehouse.dir" 78 | ] 79 | } 80 | ], 81 | "metadata": { 82 | "celltoolbar": "Tags", 83 | "kernelspec": { 84 | "display_name": "Apache Toree - Scala", 85 | "language": "scala", 86 | "name": "apache_toree_scala" 87 | }, 88 | "language_info": { 89 | "codemirror_mode": "text/x-scala", 90 | "file_extension": ".scala", 91 | "mimetype": "text/x-scala", 92 | "name": "scala", 93 | "pygments_lexer": "scala", 94 | "version": "2.11.12" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 4 99 | } 100 | -------------------------------------------------------------------------------- /01_getting_started/07_managing_spark_metastore_databases.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Managing Spark Metastore Databases\n", 8 | "\n", 9 | "Let us undestand how to manage Spark Metastore Databases." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 8, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 8, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "* Make a habit of reviewing Language Manual.\n", 42 | "* We can create database using **CREATE DATABASE** Command.\n", 43 | "* For e. g.: `CREATE DATABASE itversity_demo;`\n", 44 | "* If the database exists it will fail. If you want to ignore with out throwing error you can use **IF NOT EXISTS**\n", 45 | "* e. g.: `CREATE DATABASE IF NOT EXISTS itversity_demo;`\n", 46 | "* We can list the databases using `SHOW databases;`\n", 47 | "* Spark Metastore is multi tenant database. To switch to a database, you can use **USE** Command. e. 
g.: `USE itversity_demo;`\n", 48 | "* We can drop empty database by using `DROP DATABASE itversity_demo;`.\n", 49 | "* Add cascade to drop all the tables and then the database `DROP DATABASE itversity_demo CASCADE;`.\n", 50 | "* We can also specify location while creating the database - `CREATE DATABASE itversity_demo LOCATION '/user/itversity/custom/itversity_demo.db'`" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import org.apache.spark.sql.SparkSession\n", 60 | "\n", 61 | "val username = System.getProperty(\"user.name\")\n", 62 | "val spark = SparkSession.\n", 63 | " builder.\n", 64 | " config(\"spark.ui.port\", \"0\").\n", 65 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 66 | " enableHiveSupport.\n", 67 | " master(\"yarn\").\n", 68 | " appName(s\"${username} | Spark SQL - Getting Started\").\n", 69 | " getOrCreate" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "%%sql\n", 79 | "\n", 80 | "DROP DATABASE IF EXISTS itversity_demo" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "%%sql\n", 90 | "\n", 91 | "CREATE DATABASE itversity_demo" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "%%sql\n", 101 | "\n", 102 | "CREATE DATABASE itversity_demo" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "%%sql\n", 112 | "\n", 113 | "CREATE DATABASE IF NOT EXISTS itversity_demo" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "%%sql\n", 123 | "\n", 124 | "SHOW databases" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "%%sql\n", 134 | "\n", 135 | "SELECT current_database()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "%%sql\n", 145 | "\n", 146 | "USE itversity_demo" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "%%sql\n", 156 | "\n", 157 | "SELECT current_database()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "val username = System.getProperty(\"user.name\")" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "import sys.process._\n", 176 | "s\"hdfs dfs -ls /user/${username}/warehouse/${username}_demo.db\"!" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "%%sql\n", 186 | "\n", 187 | "CREATE TABLE table_demo (i INT)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "import sys.process._\n", 197 | "s\"hdfs dfs -ls /user/${username}/warehouse/${username}_demo.db\"!" 
198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "%%sql\n", 207 | "\n", 208 | "DROP DATABASE itversity_demo CASCADE" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "import sys.process._\n", 218 | "s\"hdfs dfs -ls /user/${username}/warehouse/${username}_demo.db\"!" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "%%sql\n", 228 | "\n", 229 | "CREATE DATABASE itversity_demo LOCATION '/user/itversity/custom/itversity_demo.db'" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "import sys.process._\n", 239 | "s\"hdfs dfs -ls /user/${username}/custom\"! // Directory will be created if it does not exists" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "import sys.process._\n", 249 | "s\"hdfs dfs -ls /user/${username}/custom/${username}_demo.db\"!" 250 | ] 251 | } 252 | ], 253 | "metadata": { 254 | "celltoolbar": "Tags", 255 | "kernelspec": { 256 | "display_name": "Apache Toree - Scala", 257 | "language": "scala", 258 | "name": "apache_toree_scala" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": "text/x-scala", 262 | "file_extension": ".scala", 263 | "mimetype": "text/x-scala", 264 | "name": "scala", 265 | "pygments_lexer": "scala", 266 | "version": "2.11.12" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 4 271 | } 272 | -------------------------------------------------------------------------------- /01_getting_started/08_managing_spark_metastore_tables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Managing Spark Metastore Tables\n", 8 | "\n", 9 | "Let us create our first Spark Metastore table. We will also have a look into how to list the tables." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 9, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 9, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "* We will get into details related to DDL Commands at a later point in time. \n", 42 | "* For now we will just create our first table. 
We will get into the details about creating tables as part of subsequent sections.\n", 43 | "> Use your OS username as prefix for the databases, if you are using our labs" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import org.apache.spark.sql.SparkSession\n", 53 | "\n", 54 | "val username = System.getProperty(\"user.name\")\n", 55 | "val spark = SparkSession.\n", 56 | " builder.\n", 57 | " config(\"spark.ui.port\", \"0\").\n", 58 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 59 | " enableHiveSupport.\n", 60 | " master(\"yarn\").\n", 61 | " appName(s\"${username} | Spark SQL - Getting Started\").\n", 62 | " getOrCreate" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "%%sql\n", 72 | "\n", 73 | "SELECT current_database()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "%%sql\n", 83 | "\n", 84 | "DROP DATABASE itversity_retail CASCADE" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "%%sql\n", 94 | "\n", 95 | "CREATE DATABASE itversity_retail" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "%%sql\n", 105 | "\n", 106 | "USE itversity_retail" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "%%sql\n", 116 | "\n", 117 | "CREATE TABLE orders (\n", 118 | " order_id INT,\n", 119 | " order_date STRING,\n", 120 | " order_customer_id INT,\n", 121 | " order_status STRING\n", 122 | ") ROW FORMAT DELIMITED FIELDS TERMINATED BY ','" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "* We can list the tables using `SHOW tables;`" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "%%sql\n", 139 | "\n", 140 | "SHOW tables" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "* We can also drop the table using `DROP TABLE` command. We will get into more details at a later point in time.\n", 148 | "* We can also truncate the managed tables using `TRUNCATE TABLE` command." 149 | ] 150 | } 151 | ], 152 | "metadata": { 153 | "celltoolbar": "Tags", 154 | "kernelspec": { 155 | "display_name": "Apache Toree - Scala", 156 | "language": "scala", 157 | "name": "apache_toree_scala" 158 | }, 159 | "language_info": { 160 | "codemirror_mode": "text/x-scala", 161 | "file_extension": ".scala", 162 | "mimetype": "text/x-scala", 163 | "name": "scala", 164 | "pygments_lexer": "scala", 165 | "version": "2.11.12" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 4 170 | } 171 | -------------------------------------------------------------------------------- /01_getting_started/09_retrieve_metadata_of_tables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Retrieve Metadata of Tables\n", 8 | "\n", 9 | "As the table is created, let us understand how to get the metadata of a table." 
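Before getting into metadata, here is a quick way to sanity check the `orders` table created above: insert a test row and query it back. This is a minimal sketch using the lab's `spark2-sql` wrapper and assuming the table lives in the `${USER}_retail` database; the sample values are made up, and for real data loads we will use `LOAD DATA` and `INSERT ... SELECT` in later sections.

```shell
spark2-sql \
  --master yarn \
  --conf spark.ui.port=0 \
  --conf spark.sql.warehouse.dir=/user/${USER}/warehouse \
  --database ${USER}_retail \
  -e "INSERT INTO orders VALUES (1, '2013-07-25 00:00:00.0', 11599, 'CLOSED'); SELECT * FROM orders"
```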
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 10, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 10, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "* We can get metadata of Hive Tables using several commands.\n", 42 | " * DESCRIBE - e.g.: `DESCRIBE orders;`\n", 43 | " * DESCRIBE EXTENDED - e.g.: `DESCRIBE EXTENDED orders;`\n", 44 | " * DESCRIBE FORMATTED - e.g.: `DESCRIBE FORMATTED orders;`\n", 45 | "* **DESCRIBE** will give only field names and data types.\n", 46 | "* **DESCRIBE EXTENDED** will give all the metadata, but not in readable format in Hive. It is same as **DESCRIBE FORMATTED** in Spark SQL.\n", 47 | "* **DESCRIBE FORMATTED** will give metadata in readable format.\n", 48 | "\n", 49 | "**As the output is truncated using Jupyter, we will actually see the details using `spark-sql`**" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "import org.apache.spark.sql.SparkSession\n", 59 | "\n", 60 | "val username = System.getProperty(\"user.name\")\n", 61 | "val spark = SparkSession.\n", 62 | " builder.\n", 63 | " config(\"spark.ui.port\", \"0\").\n", 64 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 65 | " enableHiveSupport.\n", 66 | " master(\"yarn\").\n", 67 | " appName(s\"${username} | Spark SQL - Getting Started\").\n", 68 | " getOrCreate" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "%%sql\n", 78 | "\n", 79 | "SELECT current_database()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "%%sql\n", 89 | "\n", 90 | "USE itversity_retail" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "%%sql\n", 100 | "\n", 101 | "SHOW tables" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "%%sql\n", 111 | "\n", 112 | "DESCRIBE orders" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "%%sql\n", 122 | "\n", 123 | "DESCRIBE EXTENDED orders" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "%%sql\n", 133 | "\n", 134 | "DESCRIBE FORMATTED orders" 135 | ] 136 | } 137 | ], 138 | "metadata": { 139 | "celltoolbar": "Tags", 140 | "kernelspec": { 141 | "display_name": "Apache Toree - Scala", 142 | "language": "scala", 143 | "name": "apache_toree_scala" 144 | }, 145 | "language_info": { 146 | "codemirror_mode": "text/x-scala", 147 | "file_extension": ".scala", 148 | "mimetype": "text/x-scala", 149 | "name": "scala", 150 | "pygments_lexer": "scala", 151 | "version": "2.11.12" 152 | } 153 | }, 154 | "nbformat": 4, 155 | "nbformat_minor": 4 156 | } 157 | -------------------------------------------------------------------------------- /01_getting_started/10_role_of_spark_or_hive_metastore.ipynb: 
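In addition to the `DESCRIBE` variants above, Spark SQL also supports `SHOW CREATE TABLE`, which prints the full DDL of a table in one shot. A minimal sketch, assuming the `orders` table in `${USER}_retail` from the previous notebooks:

```shell
spark2-sql \
  --master yarn \
  --conf spark.ui.port=0 \
  --conf spark.sql.warehouse.dir=/user/${USER}/warehouse \
  --database ${USER}_retail \
  -e "SHOW CREATE TABLE orders"
```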
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Role of Spark or Hive Metastore\n", 8 | "\n", 9 | "Let us understand the role of Spark Metastore or Hive Metasore. We need to first understand details related to Metadata generated for Spark Metastore tables." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 11, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 11, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "* When we create a Spark Metastore table, there is metadata associated with it.\n", 42 | " * Table Name\n", 43 | " * Column Names and Data Types\n", 44 | " * Location\n", 45 | " * File Format\n", 46 | " * and more\n", 47 | "* This metadata has to be stored some where so that Query Engines such as Spark SQL can access the information to serve our queries.\n", 48 | "\n", 49 | "Let us understand where the metadata is stored.\n", 50 | "\n", 51 | "* Information is typically stored in relational database and it is called as metastore.\n", 52 | "* It is extensively used by Hive or Spark SQL engine for syntax and semantics check as well as execution of queries.\n", 53 | "* In our case it is stored in MySQL Database. Let us review the details by going through relevant properties." 54 | ] 55 | } 56 | ], 57 | "metadata": { 58 | "celltoolbar": "Tags", 59 | "kernelspec": { 60 | "display_name": "Apache Toree - Scala", 61 | "language": "scala", 62 | "name": "apache_toree_scala" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": "text/x-scala", 66 | "file_extension": ".scala", 67 | "mimetype": "text/x-scala", 68 | "name": "scala", 69 | "pygments_lexer": "scala", 70 | "version": "2.11.12" 71 | } 72 | }, 73 | "nbformat": 4, 74 | "nbformat_minor": 4 75 | } 76 | -------------------------------------------------------------------------------- /01_getting_started/11_exercise_getting_started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Exercise - Getting Started with Spark SQL\n", 8 | "\n", 9 | "Let's do a simple exercise to conclude this section." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 12, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 12, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "* Launch Spark SQL (don't use database) (use spark-sql command). Here is the script for our labs. In other environments, you can skip last line. 
I have also included commands to launch Spark using Scala or Python (for CCA 175 Certification purpose)\n", 42 | "\n", 43 | "**Using Spark SQL**\n", 44 | "\n", 45 | "```\n", 46 | "spark2-sql \\\n", 47 | " --master yarn \\\n", 48 | " --conf spark.ui.port=0 \\\n", 49 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 50 | "```\n", 51 | "\n", 52 | "**Using Scala**\n", 53 | "\n", 54 | "```\n", 55 | "spark2-shell \\\n", 56 | " --master yarn \\\n", 57 | " --conf spark.ui.port=0 \\\n", 58 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 59 | "```\n", 60 | "\n", 61 | "**Using Pyspark**\n", 62 | "\n", 63 | "```\n", 64 | "pyspark2 \\\n", 65 | " --master yarn \\\n", 66 | " --conf spark.ui.port=0 \\\n", 67 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 68 | "```\n", 69 | "\n", 70 | "* Create Database and exit (make sure to prefix database name with your OS username, e. g.: **training_retail** for OS user training)\n", 71 | "* Exit and launch connecting to your database\n", 72 | "* Create Table orders. You can use below script to create the table.\n", 73 | "\n", 74 | "```\n", 75 | "CREATE TABLE orders (\n", 76 | " order_id INT,\n", 77 | " order_date STRING,\n", 78 | " order_customer_id INT,\n", 79 | " order_status STRING\n", 80 | ") ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';\n", 81 | "```\n", 82 | "\n", 83 | "* List the tables\n", 84 | "* Describe the table and review the whole metadata" 85 | ] 86 | } 87 | ], 88 | "metadata": { 89 | "celltoolbar": "Tags", 90 | "kernelspec": { 91 | "display_name": "Apache Toree - Scala", 92 | "language": "scala", 93 | "name": "apache_toree_scala" 94 | }, 95 | "language_info": { 96 | "codemirror_mode": "text/x-scala", 97 | "file_extension": ".scala", 98 | "mimetype": "text/x-scala", 99 | "name": "scala", 100 | "pygments_lexer": "scala", 101 | "version": "2.11.12" 102 | } 103 | }, 104 | "nbformat": 4, 105 | "nbformat_minor": 4 106 | } 107 | -------------------------------------------------------------------------------- /02_overview_of_hdfs/01_Overview of HDFS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of HDFS\n", 8 | "\n", 9 | "As part of this module we will be covering all important aspects of HDFS that are required for development. 
We have covered all the essentials for the development.\n", 10 | "* Using HDFS CLI\n", 11 | "* Getting Help or Usage\n", 12 | "* Listing HDFS Files\n", 13 | "* Managing HDFS Directories\n", 14 | "* Copying files from HDFS to Local\n", 15 | "* Copying files from HDFS to HDFS\n", 16 | "* Previewing data in HDFS Files\n", 17 | "* Getting File Metadata\n", 18 | "* HDFS Block Size\n", 19 | "* HDFS Replication Factor\n", 20 | "* Getting HDFS Storage Usage\n", 21 | "* Using HDFS Stat Commands\n", 22 | "* HDFS File Permissions\n", 23 | "* Overriding Properties" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 3", 37 | "language": "python", 38 | "name": "python3" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 3 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython3", 50 | "version": "3.6.12" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 4 55 | } 56 | -------------------------------------------------------------------------------- /02_overview_of_hdfs/02_using_hdfs_cli.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Using HDFS CLI\n", 8 | "\n", 9 | "Let us understand how to use HDFS CLI to interact with HDFS.\n", 10 | "* Typically the cluster contain 3 types of nodes.\n", 11 | " * Gateway nodes or client nodes or edge nodes\n", 12 | " * Master nodes\n", 13 | " * Worker nodes\n", 14 | "* Developers like us will typically have access to Gateway nodes or Client nodes.\n", 15 | "* We can connect to Gateway nodes or Client nodes using SSH.\n", 16 | "* Once login, we can interact with HDFS either by using `hadoop fs` or `hdfs dfs`. Both of them are aliases to each other.\n", 17 | "* `hadoop` have other subcommands than `fs` and is typically used to interact with HDFS or Map Reduce as developers.\n", 18 | "* `hdfs` have other subcommands than `dfs`. It is typically used to not only manage files in HDFS but also administrative tasks related HDFS components such as **Namenode**, **Secondary Namenode**, **Datanode** etc.\n", 19 | "* As deveopers, our scope will be limited to use `hdfs dfs` or `hadoop fs` to interact with HDFS.\n", 20 | "* Both have sub commands and each of the sub command take additional control arguments. Let us understand the structure by taking the example of `hdfs dfs -ls -l -S -r /public`.\n", 21 | " * `hdfs` is the main command to manage all the components of HDFS.\n", 22 | " * `dfs` is the sub command to manage files in HDFS.\n", 23 | " * `-ls` is the file system command to list files in HDFS.\n", 24 | " * `-l -S -r` are control arguments for `-ls` to control the run time behavior of the command.\n", 25 | " * `/public` is the argument for the `-ls` command. It is path in HDFS. You will understad as you get into the details." 
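Putting the anatomy above together, the sketch below shows a few invocations you can try; `hadoop fs` and `hdfs dfs` are interchangeable for file system commands, and the control arguments are the ones documented by `hdfs dfs -help ls` (assuming access to the `/public` data sets in our labs).

```shell
# Same listing through either entry point
hadoop fs -ls /public
hdfs dfs -ls /public

# Human readable sizes, sorted by size in reverse (ascending) order
hdfs dfs -ls -h -S -r /public
```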
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "%%sh\n", 35 | "\n", 36 | "hadoop" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "%%sh\n", 46 | "\n", 47 | "hadoop fs -usage" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "%%sh\n", 57 | "\n", 58 | "hdfs" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "%%sh\n", 68 | "\n", 69 | "hdfs dfs -usage" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.6.12" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 4 101 | } 102 | -------------------------------------------------------------------------------- /02_overview_of_hdfs/03_getting_help_or_usage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Getting help or usage\n", 8 | "\n", 9 | "Let us explore details about how to list the commands and get the help or usage for given command." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%%HTML\n", 23 | "" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "* Even though we can run commands from almost all the nodes in the clusters, we should only use Gateway to run HDFS Commands.\n", 31 | "* First we need to make sure designated Gateway server is Gateway for HDFS service so that we can run commands from Gateway node. In our case we have designated **gw02.itversity.com** or **gw03.itversity.com** as Gateways.\n", 32 | "* Typically Namenode process will be running on port number 8020. We can also pass namenode URI to access HDFS." 
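The namenode URI used by default comes from the `fs.defaultFS` property in `core-site.xml`. A quick way to confirm it, assuming the client configuration lives under `/etc/hadoop/conf` as in these labs:

```shell
# fs.defaultFS holds the namenode URI that is prefixed to HDFS paths by default
grep -A 1 'fs.defaultFS' /etc/hadoop/conf/core-site.xml
```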
33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "%%sh\n", 42 | "\n", 43 | "head -20 /etc/hadoop/conf/core-site.xml" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "%%sh\n", 53 | "\n", 54 | "hdfs dfs -ls /user/${USER}" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "%%sh\n", 64 | "\n", 65 | "hdfs dfs -ls hdfs://nn01.itversity.com:8020/user/${USER}" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "* `hadoop fs` or `hdfs dfs` – list all the commands available\n", 73 | "* `hadoop fs -usage` – will give us basic usage for given command\n", 74 | "* `hadoop fs -help` – will give us additional information for all the commands. It is same as just running `hadoop fs` or `hdfs dfs`.\n", 75 | "* We can run help on individual commands as well - example: `hadoop fs -help ls` or `hdfs dfs -help ls`" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "%%sh\n", 85 | "\n", 86 | "hdfs dfs -help" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "%%sh\n", 96 | "\n", 97 | "hdfs dfs -usage ls" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "%%sh\n", 107 | "\n", 108 | "hdfs dfs -help ls" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "%%sh\n", 118 | "\n", 119 | "hdfs dfs -ls /public/retail_db" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [] 128 | } 129 | ], 130 | "metadata": { 131 | "kernelspec": { 132 | "display_name": "Python 3", 133 | "language": "python", 134 | "name": "python3" 135 | }, 136 | "language_info": { 137 | "codemirror_mode": { 138 | "name": "ipython", 139 | "version": 3 140 | }, 141 | "file_extension": ".py", 142 | "mimetype": "text/x-python", 143 | "name": "python", 144 | "nbconvert_exporter": "python", 145 | "pygments_lexer": "ipython3", 146 | "version": "3.6.12" 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 4 151 | } 152 | -------------------------------------------------------------------------------- /02_overview_of_hdfs/04_listing_hdfs_files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Listing HDFS Files\n", 8 | "\n", 9 | "Now let us walk through different options we have with hdfs `ls` command to list the files." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%%HTML\n", 23 | "" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "* We can get usage by running `hdfs dfs -usage ls`." 
31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "%%sh\n", 40 | "\n", 41 | "hdfs dfs -usage ls" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "* We can get help using `hdfs dfs -help ls`" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "%%sh\n", 58 | "\n", 59 | "hdfs dfs -help ls" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "* Let us list all the files in **/public/nyse_all/nyse_data** folder. It is one of the public data sets that are available under **/public**. By default files and folders are sorted in ascending order by name." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "%%sh\n", 76 | "\n", 77 | "hdfs dfs -ls /public/nyse_all/nyse_data" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "%%sh\n", 87 | "\n", 88 | "hdfs dfs -ls -r /public/nyse_all/nyse_data" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "* We can sort the files and directories by time using `-t` option. By default you will see latest files at top. We can reverse it by using `-t -r`." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "%%sh\n", 105 | "\n", 106 | "hdfs dfs -ls -t /public/nyse_all/nyse_data" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "%%sh\n", 116 | "\n", 117 | "hdfs dfs -ls -t -r /public/nyse_all/nyse_data" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "* We can sort the files and directories by size using `-S`. By default, the files will be sorted in descending order by size. We can reverse the sorting order using `-S -r`." 
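One more `-ls` control argument worth knowing is `-R`, which lists contents recursively; on newer Hadoop releases `hdfs dfs -help ls` also documents `-C`, which prints just the paths and is convenient for scripting. A small sketch against the same data set:

```shell
# Recursive listing of everything under the folder
hdfs dfs -ls -R /public/nyse_all/nyse_data

# On newer releases: paths only, handy for piping into other commands
hdfs dfs -ls -C /public/nyse_all/nyse_data
```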
125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "%%sh\n", 134 | "\n", 135 | "hdfs dfs -ls -S /public/nyse_all/nyse_data" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "%%sh\n", 145 | "\n", 146 | "hdfs dfs -ls -S -r /public/nyse_all/nyse_data" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "%%sh\n", 156 | "\n", 157 | "hdfs dfs -ls -h /public/nyse_all/nyse_data" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "%%sh\n", 167 | "\n", 168 | "hdfs dfs -ls -h -t /public/nyse_all/nyse_data" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "%%sh\n", 178 | "\n", 179 | "hdfs dfs -ls -h -S /public/nyse_all/nyse_data" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "%%sh\n", 189 | "\n", 190 | "hdfs dfs -ls -h -S -r /public/nyse_all/nyse_data" 191 | ] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.6.12" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 4 215 | } 216 | -------------------------------------------------------------------------------- /02_overview_of_hdfs/05_managing_hdfs_directories.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Managing HDFS Directories\n", 8 | "\n", 9 | "Now let us have a look at how to create directories and manage ownership." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%%HTML\n", 23 | "" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "* By default hdfs is superuser of HDFS\n", 31 | "* `hadoop fs -mkdir` or `hdfs dfs -mkdir` – to create directories\n", 32 | "* `hadoop fs -chown` or `hdfs dfs -chown` – to change ownership of files\n", 33 | "* `chown` can also be used to change the group. We can change the group using `-chgrp` command as well. Make sure to run `-help` on chgrp and check the details.\n", 34 | "* Here are the steps to create user space. Only users in HDFS group can take care of it.\n", 35 | " * Create directory with user id `itversity` under /user\n", 36 | " * Change ownership to the same name as the directory created earlier (/user/itversity)\n", 37 | " * You can validate permissions by using `hadoop fs -ls` or `hdfs dfs -ls` command on /user. Make sure to grep for the user name you are looking for.\n", 38 | "* Let's go ahead and create user space in HDFS for `itversity`. 
I have to login as sudoer and run below commands.\n", 39 | "\n", 40 | "```shell\n", 41 | "sudo -u hdfs hdfs dfs -mkdir /user/itversity\n", 42 | "sudo -u hdfs hdfs dfs -chown -R itversity:students /user/itversity\n", 43 | "hdfs dfs -ls /user|grep itversity\n", 44 | "```\n", 45 | "\n", 46 | "* You should be able to create folders under your home directory." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "%%sh\n", 56 | "\n", 57 | "hdfs dfs -ls /user/${USER}" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "%%sh\n", 67 | "\n", 68 | "hdfs dfs -mkdir /user/${USER}/retail_db" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "%%sh\n", 78 | "\n", 79 | "hdfs dfs -ls /user/${USER}" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "* You can create the directory structure using `mkdir -p`. The existing folders will be ignored and non existing folders will be created.\n", 87 | " * Let us run `hdfs dfs -mkdir -p /user/${USER}/retail_db/orders/year=2020`.\n", 88 | " * As `/user/${USER}/retail_db` already exists, it will be ignored.\n", 89 | " * Both `/user/${USER}/retail_db/orders` as well as `/user/${USER}/retail_db/orders/year=2020` will be created." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "%%sh\n", 99 | "\n", 100 | "hdfs dfs -help mkdir" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "%%sh\n", 110 | "\n", 111 | "hdfs dfs -ls -R /user/${USER}/retail_db" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "%%sh\n", 121 | "\n", 122 | "hdfs dfs -mkdir -p /user/${USER}/retail_db/orders/year=2020" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "%%sh\n", 132 | "\n", 133 | "hdfs dfs -ls -R /user/${USER}/retail_db" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "* We can delete non empty directory using `hdfs dfs -rm -R` and empty directory using `hdfs dfs -rmdir`. We will explore `hdfs dfs -rm` in detail later." 
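One caveat on deletes: on clusters where the HDFS trash feature is enabled, `hdfs dfs -rm` moves data into a `.Trash` directory rather than removing it immediately. The `-skipTrash` option (see `hdfs dfs -help rm`) bypasses that. A minimal sketch, assuming the `retail_db` folder used above:

```shell
# Permanently delete the folder, bypassing the trash directory
hdfs dfs -rm -R -skipTrash /user/${USER}/retail_db
```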
141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "%%sh\n", 150 | "\n", 151 | "hdfs dfs -help rmdir" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "%%sh\n", 161 | "\n", 162 | "hdfs dfs -rmdir /user/${USER}/retail_db/orders/year=2020" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "%%sh\n", 172 | "\n", 173 | "hdfs dfs -rm /user/${USER}/retail_db" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "%%sh\n", 183 | "\n", 184 | "hdfs dfs -rmdir /user/${USER}/retail_db" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "%%sh\n", 194 | "\n", 195 | "hdfs dfs -rm -R /user/${USER}/retail_db" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "%%sh\n", 205 | "\n", 206 | "hdfs dfs -ls /user/${USER}" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Python 3", 220 | "language": "python", 221 | "name": "python3" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.6.12" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 4 238 | } 239 | -------------------------------------------------------------------------------- /02_overview_of_hdfs/13_getting_hdfs_storage_usage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Getting HDFS Storage Usage\n", 8 | "\n", 9 | "Let us get an overview of HDFS usage using `du` and `df` commands." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%%HTML\n", 23 | "" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "* We can use `hdfs dfs -df` to get the current capacity and usage of HDFS.\n", 31 | "* We can use `hdfs dfs -du` to get the size occupied by a file or folder." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "-df [-h] [ ...] :\n", 44 | " Shows the capacity, free and used space of the filesystem. If the filesystem has\n", 45 | " multiple partitions, and no path to a particular partition is specified, then\n", 46 | " the status of the root partitions will be shown.\n", 47 | " \n", 48 | " -h Formats the sizes of files in a human-readable fashion rather than a number\n", 49 | " of bytes. 
\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "%%sh\n", 55 | "\n", 56 | "hdfs dfs -help df" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 2, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "Filesystem Size Used Available Use%\n", 69 | "hdfs://nn01.itversity.com:8020 18088946967552 7096522956949 9985130049852 39%\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "%%sh\n", 75 | "\n", 76 | "hdfs dfs -df" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "Filesystem Size Used Available Use%\n", 89 | "hdfs://nn01.itversity.com:8020 16.5 T 6.5 T 9.1 T 39%\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "%%sh\n", 95 | "\n", 96 | "hdfs dfs -df -h" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "-du [-s] [-h] ... :\n", 109 | " Show the amount of space, in bytes, used by the files that match the specified\n", 110 | " file pattern. The following flags are optional:\n", 111 | " \n", 112 | " -s Rather than showing the size of each individual file that matches the \n", 113 | " pattern, shows the total (summary) size. \n", 114 | " -h Formats the sizes of files in a human-readable fashion rather than a number\n", 115 | " of bytes. \n", 116 | " \n", 117 | " Note that, even without the -s option, this only shows size summaries one level\n", 118 | " deep into a directory.\n", 119 | " \n", 120 | " The output is in the form \n", 121 | " \tsize\tname(full path)\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "%%sh\n", 127 | "\n", 128 | "hdfs dfs -help du" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 5, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "1029 /user/itversity/retail_db/categories\n", 141 | "953719 /user/itversity/retail_db/customers\n", 142 | "60 /user/itversity/retail_db/departments\n", 143 | "5408880 /user/itversity/retail_db/order_items\n", 144 | "2999944 /user/itversity/retail_db/orders\n", 145 | "174155 /user/itversity/retail_db/products\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "%%sh\n", 151 | "\n", 152 | "hdfs dfs -du /user/${USER}/retail_db" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 6, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "9537787 /user/itversity/retail_db\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "%%sh\n", 170 | "\n", 171 | "hdfs dfs -du -s /user/${USER}/retail_db" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 7, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "1.0 K /user/itversity/retail_db/categories\n", 184 | "931.4 K /user/itversity/retail_db/customers\n", 185 | "60 /user/itversity/retail_db/departments\n", 186 | "5.2 M /user/itversity/retail_db/order_items\n", 187 | "2.9 M /user/itversity/retail_db/orders\n", 188 | "170.1 K /user/itversity/retail_db/products\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "%%sh\n", 194 | "\n", 195 | "hdfs dfs -du -h /user/${USER}/retail_db" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | 
"execution_count": 8, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "9.1 M /user/itversity/retail_db\n" 208 | ] 209 | } 210 | ], 211 | "source": [ 212 | "%%sh\n", 213 | "\n", 214 | "hdfs dfs -du -s -h /user/${USER}/retail_db" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [] 223 | } 224 | ], 225 | "metadata": { 226 | "kernelspec": { 227 | "display_name": "Python 3", 228 | "language": "python", 229 | "name": "python3" 230 | }, 231 | "language_info": { 232 | "codemirror_mode": { 233 | "name": "ipython", 234 | "version": 3 235 | }, 236 | "file_extension": ".py", 237 | "mimetype": "text/x-python", 238 | "name": "python", 239 | "nbconvert_exporter": "python", 240 | "pygments_lexer": "ipython3", 241 | "version": "3.6.12" 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 4 246 | } 247 | -------------------------------------------------------------------------------- /02_overview_of_hdfs/14_using_hdfs_stat_command.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Using HDFS Stat Commands\n", 8 | "\n", 9 | "Let us understand how to get details about HDFS files such as replication factor, block size etc." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%%HTML\n", 23 | "" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "* `hdfs dfs -stat` can be used to get the statistics related to file or directory." 
31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "%%sh\n", 40 | "\n", 41 | "hdfs dfs -help stat" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "%%sh\n", 51 | "\n", 52 | "hdfs dfs -stat /user/${USER}/retail_db/orders" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "%%sh\n", 62 | "\n", 63 | "hdfs dfs -stat %b /user/${USER}/retail_db/orders/part-00000" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "%%sh\n", 73 | "\n", 74 | "hdfs dfs -stat %F /user/${USER}/retail_db/orders/part-00000" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "%%sh\n", 84 | "\n", 85 | "hdfs dfs -stat %F /user/${USER}/retail_db/orders/" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "%%sh\n", 95 | "\n", 96 | "hdfs dfs -stat %o /user/${USER}/retail_db/orders/part-00000" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "%%sh\n", 106 | "\n", 107 | "hdfs dfs -stat %r /user/${USER}/retail_db/orders/part-00000" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "Python 3", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.6.12" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 4 139 | } 140 | -------------------------------------------------------------------------------- /02_overview_of_hdfs/understanding_linux_file_system_240.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Understanding Linux File System\n", 8 | "\n", 9 | "Let us get quick overview of Linux File System.\n", 10 | "* All the public data sets are under `/data`.\n", 11 | "* Users have read only access on them.\n", 12 | "* We will go through some of the important commands to understand how we typically manage files using Linux.\n", 13 | " * `ls` - list the files.\n", 14 | " * `mkdir` - to create empty directory.\n", 15 | " * `cp` - to copy files.\n", 16 | " * `rm` - to delete the files or directories.\n", 17 | "* All these commands deal with local files on Linux. We need to use `hdfs dfs` or `hadoop fs` to deal with files in HDFS." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "%%sh\n", 27 | "\n", 28 | "uname -a" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "* We can access files in Linux File System using `ls` command. Also the file system starts with `/`. 
It is also called as root file system. By default files and folders will be alphabetically sorted.\n", 36 | "```{note}\n", 37 | "In Windows, the local file system starts with C:\\ (C Drive), D:\\ (D Drive) etc. \n", 38 | "```" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "%%sh\n", 48 | "\n", 49 | "ls /" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "* We typically use `ls -ltr` to get the list of files and directories along with their properties.\n", 57 | " * `l` for listing properties.\n", 58 | " * `t` for sorting files and folders based up on time. By default latest files comes on top.\n", 59 | " * `r` to reverse the sorting order.\n", 60 | "* `ls -ltr` will provide the list of files and directories sorted by time where latest files comes at the end." 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "%%sh\n", 70 | "\n", 71 | "ls -ltr /" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "%%sh\n", 81 | "\n", 82 | "ls -ltr /data" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "* You will have read only access to these local files. As demonstrated below only owner (root) have write access to **retail_db** folder where as others have read and execute permissions only." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "%%sh\n", 99 | "\n", 100 | "ls -ltr /data/retail_db" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "* You can copy them to your user space with in HDFS. It will be **/user/YOUR_LOGIN_USER**.\n", 108 | "* You can determine your home directory on linux file system as well as your user space with in HDFS. You can run `echo $HOME` to get details about your home directory on linux file system. It is same as `/home/$USER`." 
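As a quick, hedged sketch of the point above about locating your Linux home directory and your HDFS user space, the two can be compared side by side (assuming the lab layout of `/home/<user>` locally and `/user/<user>` in HDFS):

```scala
// Sketch contrasting the local home directory with the HDFS user space,
// assuming the lab setup described above.
import sys.process._

val username = System.getProperty("user.name")

// Local (Linux) home directory
s"ls -ltr /home/${username}".!

// HDFS user space for the same user
s"hdfs dfs -ls /user/${username}".!
```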
109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "%%sh\n", 118 | "\n", 119 | "echo $HOME" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "* You can also copy these files to your home directory on linux file system using `cp` with options `r` and `f`.\n", 127 | "* It will take care of recursively copying folder `/data/retail_db` to `/user/${USER}/retail_db`" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "%%sh\n", 137 | "\n", 138 | "rm -rf /home/${USER}/retail_db" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "%%sh\n", 148 | "\n", 149 | "ls -ltr /home/${USER}/retail_db" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "%%sh\n", 159 | "\n", 160 | "cp -rf /data/retail_db /home/${USER}" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "%%sh\n", 170 | "\n", 171 | "ls -ltr /home/${USER}/retail_db" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "* We can also delete the folder `/user/${USER}/retail_db` recursively using `rm -rf`." 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "%%sh\n", 188 | "\n", 189 | "rm -rf /home/${USER}/retail_db" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 3", 203 | "language": "python", 204 | "name": "python3" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.6.12" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 4 221 | } 222 | -------------------------------------------------------------------------------- /03_basic_transformations/01_basic_transformations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Basic Transformations\n", 8 | "\n", 9 | "As part of this section we will see basic transformations we can perform on top of Data Frames such as filtering, aggregations, joins etc using SQL. We will build end to end solution by taking a simple problem statement." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 5, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 5, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "* Spark SQL – Overview\n", 42 | "* Define Problem Statement\n", 43 | "* Preparing Tables\n", 44 | "* Projecting Data\n", 45 | "* Filtering Data\n", 46 | "* Joining Tables - Inner\n", 47 | "* Joining Tables - Outer\n", 48 | "* Perform Aggregations\n", 49 | "* Sorting Data\n", 50 | "* Conclusion - Final Solution" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "val username = System.getProperty(\"user.name\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "import org.apache.spark.sql.SparkSession\n", 76 | "\n", 77 | "val username = System.getProperty(\"user.name\")\n", 78 | "val spark = SparkSession.\n", 79 | " builder.\n", 80 | " config(\"spark.ui.port\", \"0\").\n", 81 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 82 | " enableHiveSupport.\n", 83 | " appName(s\"${username} | Spark SQL - Basic Transformations\").\n", 84 | " master(\"yarn\").\n", 85 | " getOrCreate" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 93 | "\n", 94 | "**Using Spark SQL**\n", 95 | "\n", 96 | "```\n", 97 | "spark2-sql \\\n", 98 | " --master yarn \\\n", 99 | " --conf spark.ui.port=0 \\\n", 100 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 101 | "```\n", 102 | "\n", 103 | "**Using Scala**\n", 104 | "\n", 105 | "```\n", 106 | "spark2-shell \\\n", 107 | " --master yarn \\\n", 108 | " --conf spark.ui.port=0 \\\n", 109 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 110 | "```\n", 111 | "\n", 112 | "**Using Pyspark**\n", 113 | "\n", 114 | "```\n", 115 | "pyspark2 \\\n", 116 | " --master yarn \\\n", 117 | " --conf spark.ui.port=0 \\\n", 118 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 119 | "```" 120 | ] 121 | } 122 | ], 123 | "metadata": { 124 | "celltoolbar": "Tags", 125 | "kernelspec": { 126 | "display_name": "Scala", 127 | "language": "scala", 128 | "name": "scala" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": "text/x-scala", 132 | "file_extension": ".scala", 133 | "mimetype": "", 134 | "name": "Scala", 135 | "nbconverter_exporter": "", 136 | "version": "2.11.12" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 4 141 | } 142 | -------------------------------------------------------------------------------- /03_basic_transformations/02_spark_sql_overview.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": 
{}, 6 | "source": [ 7 | "## Spark SQL – Overview\n", 8 | "\n", 9 | "Let us get an overview of Spark SQL.\n", 10 | "\n", 11 | "Here are the standard operations which we typically perform as part of processing the data. In Spark we can perform these using Data Frame APIs or **Spark SQL**." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 6, 17 | "metadata": { 18 | "tags": [ 19 | "remove-cell" 20 | ] 21 | }, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/html": [ 26 | "" 27 | ] 28 | }, 29 | "execution_count": 6, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "%%HTML\n", 36 | "" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "* Selection or Projection – select clause\n", 44 | " * It is also called as row level transformations.\n", 45 | " * Apply standardization rules (convert names and addresses to upper case).\n", 46 | " * Mask partial data (SSN and Date of births).\n", 47 | "* Filtering data – where clause\n", 48 | " * Get orders based on date or product or category.\n", 49 | "* Joins – join (supports outer join as well)\n", 50 | " * Join multiple data sets.\n", 51 | "* Aggregations – group by and aggregations with support of functions such as sum, avg, min, max etc\n", 52 | " * Get revenue for a given order\n", 53 | " * Get revenue for each order\n", 54 | " * Get daily revenue\n", 55 | "* Sorting – order by\n", 56 | " * Sort the final output by date.\n", 57 | " * Sort the final output by date, then by revenue in descending order.\n", 58 | " * Sort the final output by state or province, then by revenue in descending order.\n", 59 | "* Analytics Functions – aggregations, ranking and windowing functions\n", 60 | " * Get top 5 stores by revenue for each state.\n", 61 | " * Get top 5 products by revenue in each category." 62 | ] 63 | } 64 | ], 65 | "metadata": { 66 | "celltoolbar": "Tags", 67 | "kernelspec": { 68 | "display_name": "Apache Toree - Scala", 69 | "language": "scala", 70 | "name": "apache_toree_scala" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": "text/x-scala", 74 | "file_extension": ".scala", 75 | "mimetype": "text/x-scala", 76 | "name": "scala", 77 | "pygments_lexer": "scala", 78 | "version": "2.11.12" 79 | } 80 | }, 81 | "nbformat": 4, 82 | "nbformat_minor": 4 83 | } 84 | -------------------------------------------------------------------------------- /03_basic_transformations/03_define_problem_statement.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Define Problem Statement\n", 8 | "\n", 9 | "Let us define problemt statement to get an overview of basic transformations using Spark SQL." 
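Before working through the problem statement, here is a compact, hedged illustration of the last bullet in the overview above (analytics functions): getting the top 5 products by revenue in each category with a ranking window function. The table and column names (`product_revenue` with `category_id`, `product_id`, `revenue`) are assumptions for illustration only; the actual tables are prepared later in this module and windowing is covered in detail in its own section.

```scala
// Hedged sketch of a ranking window function, assuming a hypothetical
// pre-aggregated product_revenue table and the spark session created in the
// surrounding notebooks.
spark.sql("""
  SELECT *
  FROM (
    SELECT category_id,
           product_id,
           revenue,
           rank() OVER (PARTITION BY category_id ORDER BY revenue DESC) AS rnk
    FROM product_revenue
  ) ranked
  WHERE rnk <= 5
  ORDER BY category_id, revenue DESC
""").show()
```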
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 7, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 7, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "* Get Daily Product Revenue using orders and order_items data set.\n", 42 | "* We have following fields in **orders**.\n", 43 | " * order_id\n", 44 | " * order_date\n", 45 | " * order_customer_id\n", 46 | " * order_status\n", 47 | "* We have following fields in **order_items**.\n", 48 | " * order_item_id\n", 49 | " * order_item_order_id\n", 50 | " * order_item_product_id\n", 51 | " * order_item_quantity\n", 52 | " * order_item_subtotal\n", 53 | " * order_item_product_price\n", 54 | "* We have one to many relationship between orders and order_items.\n", 55 | "* **orders.order_id** is **primary key** and **order_items.order_item_order_id** is foreign key to **orders.order_id**.\n", 56 | "* By the end of this module we will explore all standard transformation and get daily product revenue using following fields.\n", 57 | " * **orders.order_date**\n", 58 | " * **order_items.order_item_product_id**\n", 59 | " * **order_items.order_item_subtotal** (aggregated using date and product_id).\n", 60 | "* We will consider only **COMPLETE** or **CLOSED** orders." 61 | ] 62 | } 63 | ], 64 | "metadata": { 65 | "celltoolbar": "Tags", 66 | "kernelspec": { 67 | "display_name": "Apache Toree - Scala", 68 | "language": "scala", 69 | "name": "apache_toree_scala" 70 | }, 71 | "language_info": { 72 | "codemirror_mode": "text/x-scala", 73 | "file_extension": ".scala", 74 | "mimetype": "text/x-scala", 75 | "name": "scala", 76 | "pygments_lexer": "scala", 77 | "version": "2.11.12" 78 | } 79 | }, 80 | "nbformat": 4, 81 | "nbformat_minor": 4 82 | } 83 | -------------------------------------------------------------------------------- /03_basic_transformations/12_exercises_basic_sql_queries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Exercises - Basic SQL Queries\n", 8 | "\n", 9 | "Here are some of the exercises for which you can write SQL queries to self evaluate." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 
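For orientation, here is a hedged sketch of the query this module builds toward, using only the fields and the COMPLETE/CLOSED filter described in the problem statement above. It assumes the `orders` and `order_items` tables are created as in the "Preparing Tables" notebook and that the spark session from above is in scope; the worked-out version appears in the conclusion notebook.

```scala
// Hedged sketch of the Daily Product Revenue query described in the problem
// statement: aggregate order_item_subtotal by order_date and product id,
// considering only COMPLETE or CLOSED orders.
spark.sql("""
  SELECT o.order_date,
         oi.order_item_product_id,
         round(sum(oi.order_item_subtotal), 2) AS revenue
  FROM orders o
    JOIN order_items oi
      ON o.order_id = oi.order_item_order_id
  WHERE o.order_status IN ('COMPLETE', 'CLOSED')
  GROUP BY o.order_date, oi.order_item_product_id
  ORDER BY o.order_date, revenue DESC
""").show()
```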
17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "val username = System.getProperty(\"user.name\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import org.apache.spark.sql.SparkSession\n", 35 | "\n", 36 | "val username = System.getProperty(\"user.name\")\n", 37 | "val spark = SparkSession.\n", 38 | " builder.\n", 39 | " config(\"spark.ui.port\", \"0\").\n", 40 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 41 | " enableHiveSupport.\n", 42 | " appName(s\"${username} | Spark SQL - Basic Transformations\").\n", 43 | " master(\"yarn\").\n", 44 | " getOrCreate" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 52 | "\n", 53 | "**Using Spark SQL**\n", 54 | "\n", 55 | "```\n", 56 | "spark2-sql \\\n", 57 | " --master yarn \\\n", 58 | " --conf spark.ui.port=0 \\\n", 59 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 60 | "```\n", 61 | "\n", 62 | "**Using Scala**\n", 63 | "\n", 64 | "```\n", 65 | "spark2-shell \\\n", 66 | " --master yarn \\\n", 67 | " --conf spark.ui.port=0 \\\n", 68 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 69 | "```\n", 70 | "\n", 71 | "**Using Pyspark**\n", 72 | "\n", 73 | "```\n", 74 | "pyspark2 \\\n", 75 | " --master yarn \\\n", 76 | " --conf spark.ui.port=0 \\\n", 77 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 78 | "```" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### Exercise 1 - Customer order count\n", 86 | "\n", 87 | "Get order count per customer for the month of 2014 January.\n", 88 | "* Tables - orders and customers\n", 89 | "* Data should be sorted in descending order by count and ascending order by customer id.\n", 90 | "* Output should contain customer_id, customer_first_name, customer_last_name and customer_order_count." 
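One possible approach to Exercise 1 is sketched below, for self-checking only. The column names `customer_fname` and `customer_lname` are assumptions based on the standard retail_db data set; adjust them to match your tables.

```scala
// Hedged solution sketch for Exercise 1: order count per customer for
// January 2014, sorted by count (descending) and customer_id (ascending).
// Assumes retail_db column names customer_fname / customer_lname and the
// spark session created above.
spark.sql("""
  SELECT c.customer_id,
         c.customer_fname AS customer_first_name,
         c.customer_lname AS customer_last_name,
         count(o.order_id) AS customer_order_count
  FROM customers c
    JOIN orders o
      ON o.order_customer_id = c.customer_id
  WHERE o.order_date LIKE '2014-01%'
  GROUP BY c.customer_id, c.customer_fname, c.customer_lname
  ORDER BY customer_order_count DESC, c.customer_id
""").show()
```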
91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "### Exercise 2 - Dormant Customers\n", 98 | "\n", 99 | "Get the customer details who have not placed any order for the month of 2014 January.\n", 100 | "* Tables - orders and customers\n", 101 | "* Data should be sorted in ascending order by customer_id\n", 102 | "* Output should contain all the fields from customers" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "### Exercise 3 - Revenue Per Customer\n", 110 | "\n", 111 | "Get the revenue generated by each customer for the month of 2014 January\n", 112 | "* Tables - orders, order_items and customers\n", 113 | "* Data should be sorted in descending order by revenue and then ascending order by customer_id\n", 114 | "* Output should contain customer_id, customer_first_name, customer_last_name, customer_revenue.\n", 115 | "* If there are no orders placed by customer, then the corresponding revenue for a give customer should be 0.\n", 116 | "* Consider only COMPLETE and CLOSED orders" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "### Exercise 4 - Revenue Per Category\n", 124 | "\n", 125 | "Get the revenue generated for each category for the month of 2014 January\n", 126 | "* Tables - orders, order_items, products and categories\n", 127 | "* Data should be sorted in ascending order by category_id.\n", 128 | "* Output should contain all the fields from category along with the revenue as category_revenue.\n", 129 | "* Consider only COMPLETE and CLOSED orders" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "### Exercise 5 - Product Count Per Department\n", 137 | "\n", 138 | "Get the products for each department.\n", 139 | "* Tables - departments, categories, products\n", 140 | "* Data should be sorted in ascending order by department_id\n", 141 | "* Output should contain all the fields from department and the product count as product_count" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | } 151 | ], 152 | "metadata": { 153 | "celltoolbar": "Tags", 154 | "kernelspec": { 155 | "display_name": "Apache Toree - Scala", 156 | "language": "scala", 157 | "name": "apache_toree_scala" 158 | }, 159 | "language_info": { 160 | "codemirror_mode": "text/x-scala", 161 | "file_extension": ".scala", 162 | "mimetype": "text/x-scala", 163 | "name": "scala", 164 | "pygments_lexer": "scala", 165 | "version": "2.11.12" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 4 170 | } 171 | -------------------------------------------------------------------------------- /04_basic_ddl_and_dml/01_basic_ddl_and_dml.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Basic DDL and DML\n", 8 | "\n", 9 | "As part of this section we will primarily focus on basic DDL and DML using Spark Metastore.\n", 10 | "\n", 11 | "* Create Spark Metastore Tables\n", 12 | "* Overview of Data Types\n", 13 | "* Adding Comments\n", 14 | "* Loading Data into Tables - Local\n", 15 | "* Loading Data into Tables - HDFS\n", 16 | "* Loading Data - Append and Overwrite\n", 17 | "* Creating External Tables\n", 18 | "* Managed Tables vs. 
External Tables\n", 19 | "* Overview of File Formats\n", 20 | "* Dropping Tables and Databases\n", 21 | "* Truncating Tables" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": { 28 | "tags": [ 29 | "remove-cell" 30 | ] 31 | }, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/html": [ 36 | "" 37 | ] 38 | }, 39 | "execution_count": 1, 40 | "metadata": {}, 41 | "output_type": "execute_result" 42 | } 43 | ], 44 | "source": [ 45 | "%%HTML\n", 46 | "" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "val username = System.getProperty(\"user.name\")" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "import org.apache.spark.sql.SparkSession\n", 72 | "\n", 73 | "val username = System.getProperty(\"user.name\")\n", 74 | "val spark = SparkSession.\n", 75 | " builder.\n", 76 | " config(\"spark.ui.port\", \"0\").\n", 77 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 78 | " enableHiveSupport.\n", 79 | " appName(s\"${username} | Spark SQL - Managing Tables - Basic DDL and DML\").\n", 80 | " master(\"yarn\").\n", 81 | " getOrCreate" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 89 | "\n", 90 | "**Using Spark SQL**\n", 91 | "\n", 92 | "```\n", 93 | "spark2-sql \\\n", 94 | " --master yarn \\\n", 95 | " --conf spark.ui.port=0 \\\n", 96 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 97 | "```\n", 98 | "\n", 99 | "**Using Scala**\n", 100 | "\n", 101 | "```\n", 102 | "spark2-shell \\\n", 103 | " --master yarn \\\n", 104 | " --conf spark.ui.port=0 \\\n", 105 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 106 | "```\n", 107 | "\n", 108 | "**Using Pyspark**\n", 109 | "\n", 110 | "```\n", 111 | "pyspark2 \\\n", 112 | " --master yarn \\\n", 113 | " --conf spark.ui.port=0 \\\n", 114 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 115 | "```" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [] 124 | } 125 | ], 126 | "metadata": { 127 | "kernelspec": { 128 | "display_name": "Apache Toree - Scala", 129 | "language": "scala", 130 | "name": "apache_toree_scala" 131 | }, 132 | "language_info": { 133 | "codemirror_mode": "text/x-scala", 134 | "file_extension": ".scala", 135 | "mimetype": "text/x-scala", 136 | "name": "scala", 137 | "pygments_lexer": "scala", 138 | "version": "2.11.12" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 4 143 | } 144 | -------------------------------------------------------------------------------- /04_basic_ddl_and_dml/04_adding_comments.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Adding Comments\n", 8 | "\n", 9 | "Let us understand how to create table with comments in Hive using 
orders as example." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 4, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 4, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "val username = System.getProperty(\"user.name\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import org.apache.spark.sql.SparkSession\n", 60 | "\n", 61 | "val username = System.getProperty(\"user.name\")\n", 62 | "val spark = SparkSession.\n", 63 | " builder.\n", 64 | " config(\"spark.ui.port\", \"0\").\n", 65 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 66 | " enableHiveSupport.\n", 67 | " appName(s\"${username} | Spark SQL - Managing Tables - Basic DDL and DML\").\n", 68 | " master(\"yarn\").\n", 69 | " getOrCreate" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 77 | "\n", 78 | "**Using Spark SQL**\n", 79 | "\n", 80 | "```\n", 81 | "spark2-sql \\\n", 82 | " --master yarn \\\n", 83 | " --conf spark.ui.port=0 \\\n", 84 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 85 | "```\n", 86 | "\n", 87 | "**Using Scala**\n", 88 | "\n", 89 | "```\n", 90 | "spark2-shell \\\n", 91 | " --master yarn \\\n", 92 | " --conf spark.ui.port=0 \\\n", 93 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 94 | "```\n", 95 | "\n", 96 | "**Using Pyspark**\n", 97 | "\n", 98 | "```\n", 99 | "pyspark2 \\\n", 100 | " --master yarn \\\n", 101 | " --conf spark.ui.port=0 \\\n", 102 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 103 | "```" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "* We can specify comments for both columns as well as tables using COMMENT keyword." 
111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "%%sql\n", 120 | "\n", 121 | "USE itversity_retail" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "%%sql\n", 131 | "\n", 132 | "DROP TABLE IF EXISTS orders" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "%%sql\n", 142 | "\n", 143 | "CREATE TABLE orders (\n", 144 | " order_id INT COMMENT 'Unique order id',\n", 145 | " order_date STRING COMMENT 'Date on which order is placed',\n", 146 | " order_customer_id INT COMMENT 'Customer id who placed the order',\n", 147 | " order_status STRING COMMENT 'Current status of the order'\n", 148 | ") COMMENT 'Table to save order level details'" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "* Using Spark SQL with Python or Scala" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "spark.sql(\"USE itversity_retail\")" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "spark.sql(\"DROP TABLE orders\")" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "spark.sql(\"\"\"\n", 183 | "CREATE TABLE orders (\n", 184 | " order_id STRING COMMENT 'Unique order id',\n", 185 | " order_date STRING COMMENT 'Date on which order is placed',\n", 186 | " order_customer_id INT COMMENT 'Customer id who placed the order',\n", 187 | " order_status STRING COMMENT 'Current status of the order'\n", 188 | ") COMMENT 'Table to save order level details'\n", 189 | "\"\"\")" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "* Default field delimiter is \\001 character.\n", 197 | "* We can see the comments using `DESCRIBE orders` or `DESCRIBE FORMATTED orders`." 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "%%sql\n", 207 | "\n", 208 | "DESCRIBE orders" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "%%sql\n", 218 | "\n", 219 | "DESCRIBE FORMATTED orders" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "spark.sql(\"DESCRIBE FORMATTED orders\").show(200, false) // Scala" 229 | ] 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "Apache Toree - Scala", 235 | "language": "scala", 236 | "name": "apache_toree_scala" 237 | }, 238 | "language_info": { 239 | "name": "" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 4 244 | } 245 | -------------------------------------------------------------------------------- /04_basic_ddl_and_dml/09_managed_vs_external_tables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Managed Tables vs. 
External Tables\n", 8 | "\n", 9 | "Let us compare and contrast between Managed Tables and External Tables." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 9, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 9, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "val username = System.getProperty(\"user.name\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import org.apache.spark.sql.SparkSession\n", 60 | "\n", 61 | "val username = System.getProperty(\"user.name\")\n", 62 | "val spark = SparkSession.\n", 63 | " builder.\n", 64 | " config(\"spark.ui.port\", \"0\").\n", 65 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 66 | " enableHiveSupport.\n", 67 | " appName(s\"${username} | Spark SQL - Managing Tables - Basic DDL and DML\").\n", 68 | " master(\"yarn\").\n", 69 | " getOrCreate" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 77 | "\n", 78 | "**Using Spark SQL**\n", 79 | "\n", 80 | "```\n", 81 | "spark2-sql \\\n", 82 | " --master yarn \\\n", 83 | " --conf spark.ui.port=0 \\\n", 84 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 85 | "```\n", 86 | "\n", 87 | "**Using Scala**\n", 88 | "\n", 89 | "```\n", 90 | "spark2-shell \\\n", 91 | " --master yarn \\\n", 92 | " --conf spark.ui.port=0 \\\n", 93 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 94 | "```\n", 95 | "\n", 96 | "**Using Pyspark**\n", 97 | "\n", 98 | "```\n", 99 | "pyspark2 \\\n", 100 | " --master yarn \\\n", 101 | " --conf spark.ui.port=0 \\\n", 102 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 103 | "```" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "* When we say EXTERNAL and specify LOCATION or LOCATION alone as part of CREATE TABLE, it makes the table EXTERNAL.\n", 111 | "* Rest of the syntax is same as Managed Table.\n", 112 | "* However, when we drop **Managed Table**, it will delete metadata from metastore as well as data from HDFS.\n", 113 | "* When we drop **External Table**, only metadata will be dropped, not the data.\n", 114 | "* Typically we use **External Table** when same dataset is processed by multiple frameworks such as Hive, Pig, Spark etc.\n", 115 | "* We cannot run **TRUNCATE TABLE** command against External Tables." 
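The bullets above describe how EXTERNAL together with LOCATION makes a table external. Before the drop and truncate demonstrations below, here is a minimal sketch of that syntax, reusing the `username` value defined earlier in this notebook; the LOCATION path is illustrative.

```scala
// Hedged sketch of the EXTERNAL + LOCATION syntax described above. Dropping
// this table would remove only the metastore entry, leaving the files at the
// given LOCATION in place. The path is illustrative.
spark.sql(s"""
  CREATE EXTERNAL TABLE IF NOT EXISTS orders_ext (
    order_id INT,
    order_date STRING,
    order_customer_id INT,
    order_status STRING
  ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
  LOCATION '/user/${username}/external/retail_db/orders'
""")
```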
116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "%%sql\n", 125 | "\n", 126 | "USE itversity_retail" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "%%sql\n", 136 | "\n", 137 | "SHOW tables" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "spark.sql(\"DESCRIBE FORMATTED orders\").show(200, false)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "%%sql\n", 156 | "\n", 157 | "TRUNCATE TABLE orders" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "spark.sql(\"DESCRIBE FORMATTED order_items\").show(200, false)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "%%sql\n", 176 | "\n", 177 | "TRUNCATE TABLE order_items" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "%%sql\n", 187 | "\n", 188 | "DROP TABLE orders" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "%%sql\n", 198 | "\n", 199 | "DROP TABLE order_items" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "import sys.process._\n", 209 | "\n", 210 | "s\"hdfs dfs -ls /user/${username}/retail_db/orders\" !" 211 | ] 212 | } 213 | ], 214 | "metadata": { 215 | "kernelspec": { 216 | "display_name": "Apache Toree - Scala", 217 | "language": "scala", 218 | "name": "apache_toree_scala" 219 | }, 220 | "language_info": { 221 | "name": "" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 4 226 | } 227 | -------------------------------------------------------------------------------- /04_basic_ddl_and_dml/10_overview_of_file_formats.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of File Formats\n", 8 | "Let us go through the details about different file formats supported by STORED AS Clause." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 10, 14 | "metadata": { 15 | "tags": [ 16 | "remove-cell" 17 | ] 18 | }, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/html": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": 10, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "%%HTML\n", 33 | "" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 
41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "val username = System.getProperty(\"user.name\")" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "import org.apache.spark.sql.SparkSession\n", 59 | "\n", 60 | "val username = System.getProperty(\"user.name\")\n", 61 | "val spark = SparkSession.\n", 62 | " builder.\n", 63 | " config(\"spark.ui.port\", \"0\").\n", 64 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 65 | " enableHiveSupport.\n", 66 | " appName(s\"${username} | Spark SQL - Managing Tables - Basic DDL and DML\").\n", 67 | " master(\"yarn\").\n", 68 | " getOrCreate" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 76 | "\n", 77 | "**Using Spark SQL**\n", 78 | "\n", 79 | "```\n", 80 | "spark2-sql \\\n", 81 | " --master yarn \\\n", 82 | " --conf spark.ui.port=0 \\\n", 83 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 84 | "```\n", 85 | "\n", 86 | "**Using Scala**\n", 87 | "\n", 88 | "```\n", 89 | "spark2-shell \\\n", 90 | " --master yarn \\\n", 91 | " --conf spark.ui.port=0 \\\n", 92 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 93 | "```\n", 94 | "\n", 95 | "**Using Pyspark**\n", 96 | "\n", 97 | "```\n", 98 | "pyspark2 \\\n", 99 | " --master yarn \\\n", 100 | " --conf spark.ui.port=0 \\\n", 101 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 102 | "```" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "* Go to this [page](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL) and review supported file formats.\n", 110 | "* Supported File Formats\n", 111 | " * TEXTFILE\n", 112 | " * ORC\n", 113 | " * PARQUET\n", 114 | " * AVRO\n", 115 | " * SEQUENCEFILE - is not important\n", 116 | " * JSONFILE - only available in recent vesions of Hive.\n", 117 | " * and more\n", 118 | "* We can even specify custom file formats (out of scope)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "%%sql\n", 128 | "\n", 129 | "DROP DATABASE IF EXISTS itversity_sms CASCADE" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "%%sql\n", 139 | "\n", 140 | "CREATE DATABASE IF NOT EXISTS itversity_sms" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "%%sql\n", 150 | "\n", 151 | "USE itversity_sms" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "%%sql\n", 161 | "\n", 162 | "CREATE TABLE students (\n", 163 | " student_id INT,\n", 164 | " student_first_name STRING,\n", 165 | " student_last_name STRING,\n", 166 | " student_phone_numbers ARRAY,\n", 167 | " student_address STRUCT\n", 168 | ") STORED AS parquet" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "%%sql\n", 178 | "\n", 179 | "INSERT INTO students VALUES (1, 'Scott', 'Tiger', NULL, NULL)" 180 | ] 181 
| }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "%%sql\n", 189 | "\n", 190 | "INSERT INTO students VALUES (2, 'Donald', 'Duck', ARRAY('1234567890', '2345678901'), NULL)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "%%sql\n", 200 | "\n", 201 | "INSERT INTO students VALUES \n", 202 | " (3, 'Mickey', 'Mouse', ARRAY('1234567890', '2345678901'), STRUCT('A Street', 'One City', 'Some State', '12345')),\n", 203 | " (4, 'Bubble', 'Guppy', ARRAY('5678901234', '6789012345'), STRUCT('Bubbly Street', 'Guppy', 'La la land', '45678'))" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "%%sql\n", 213 | "\n", 214 | "SELECT * FROM students" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "import sys.process._\n", 224 | "val username = System.getProperty(\"user.name\")" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "s\"hdfs dfs -ls /user/${username}/warehouse/${username}_sms.db/students\"!" 234 | ] 235 | } 236 | ], 237 | "metadata": { 238 | "kernelspec": { 239 | "display_name": "Apache Toree - Scala", 240 | "language": "scala", 241 | "name": "apache_toree_scala" 242 | }, 243 | "language_info": { 244 | "name": "" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 4 249 | } 250 | -------------------------------------------------------------------------------- /04_basic_ddl_and_dml/11_dropping_tables_and_databases.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Dropping Tables and Databases\n", 8 | "\n", 9 | "Let us understand how to DROP Spark Metastore Tables as well as Databases." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 11, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 11, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "val username = System.getProperty(\"user.name\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import org.apache.spark.sql.SparkSession\n", 60 | "\n", 61 | "val username = System.getProperty(\"user.name\")\n", 62 | "val spark = SparkSession.\n", 63 | " builder.\n", 64 | " config(\"spark.ui.port\", \"0\").\n", 65 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 66 | " enableHiveSupport.\n", 67 | " appName(s\"${username} | Spark SQL - Managing Tables - Basic DDL and DML\").\n", 68 | " master(\"yarn\").\n", 69 | " getOrCreate" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 77 | "\n", 78 | "**Using Spark SQL**\n", 79 | "\n", 80 | "```\n", 81 | "spark2-sql \\\n", 82 | " --master yarn \\\n", 83 | " --conf spark.ui.port=0 \\\n", 84 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 85 | "```\n", 86 | "\n", 87 | "**Using Scala**\n", 88 | "\n", 89 | "```\n", 90 | "spark2-shell \\\n", 91 | " --master yarn \\\n", 92 | " --conf spark.ui.port=0 \\\n", 93 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 94 | "```\n", 95 | "\n", 96 | "**Using Pyspark**\n", 97 | "\n", 98 | "```\n", 99 | "pyspark2 \\\n", 100 | " --master yarn \\\n", 101 | " --conf spark.ui.port=0 \\\n", 102 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 103 | "```" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "* We can use **DROP TABLE** command to drop the table.. 
Let us drop orders table" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "%%sql\n", 120 | "\n", 121 | "CREATE DATABASE IF NOT EXISTS itversity_retail" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "%%sql\n", 131 | "\n", 132 | "USE itversity_retail" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "%%sql\n", 142 | "\n", 143 | "SHOW tables" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "%%sql\n", 153 | "\n", 154 | "CREATE TABLE IF NOT EXISTS orders (\n", 155 | " order_id INT,\n", 156 | " order_date STRING,\n", 157 | " order_customer_id INT,\n", 158 | " order_status STRING\n", 159 | ") ROW FORMAT DELIMITED FIELDS TERMINATED BY ','" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "%%sql\n", 169 | "\n", 170 | "DROP TABLE orders" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "%%sql\n", 180 | "\n", 181 | "DROP TABLE IF EXISTS orders" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "* **DROP TABLE** on managed table will delete both metadata in metastore as well as data in HDFS, while **DROP TABLE** on external table will only delete metadata in metastore.\n", 189 | "* We can drop database by using **DROP DATABASE** Command. However we need to drop all the tables in the database first.\n", 190 | "* Here is the example to drop the database itversity_retail - `DROP DATABASE itversity_retail`\n", 191 | "* We can also drop all the tables and databases by adding **CASCADE**." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "%%sql\n", 201 | "\n", 202 | "DROP DATABASE itversity_retail" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "%%sql\n", 212 | "\n", 213 | "DROP DATABASE IF EXISTS itversity_retail CASCADE" 214 | ] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Apache Toree - Scala", 220 | "language": "scala", 221 | "name": "apache_toree_scala" 222 | }, 223 | "language_info": { 224 | "name": "" 225 | } 226 | }, 227 | "nbformat": 4, 228 | "nbformat_minor": 4 229 | } 230 | -------------------------------------------------------------------------------- /04_basic_ddl_and_dml/12_truncating_tables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Truncating Tables\n", 8 | "\n", 9 | "Let us understand how to truncate tables." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 12, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 12, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "val username = System.getProperty(\"user.name\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import org.apache.spark.sql.SparkSession\n", 60 | "\n", 61 | "val username = System.getProperty(\"user.name\")\n", 62 | "val spark = SparkSession.\n", 63 | " builder.\n", 64 | " config(\"spark.ui.port\", \"0\").\n", 65 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 66 | " enableHiveSupport.\n", 67 | " appName(s\"${username} | Spark SQL - Managing Tables - Basic DDL and DML\").\n", 68 | " master(\"yarn\").\n", 69 | " getOrCreate" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 77 | "\n", 78 | "**Using Spark SQL**\n", 79 | "\n", 80 | "```\n", 81 | "spark2-sql \\\n", 82 | " --master yarn \\\n", 83 | " --conf spark.ui.port=0 \\\n", 84 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 85 | "```\n", 86 | "\n", 87 | "**Using Scala**\n", 88 | "\n", 89 | "```\n", 90 | "spark2-shell \\\n", 91 | " --master yarn \\\n", 92 | " --conf spark.ui.port=0 \\\n", 93 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 94 | "```\n", 95 | "\n", 96 | "**Using Pyspark**\n", 97 | "\n", 98 | "```\n", 99 | "pyspark2 \\\n", 100 | " --master yarn \\\n", 101 | " --conf spark.ui.port=0 \\\n", 102 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 103 | "```" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "* **TRUNCATE** works only for managed tables. 
Only data will be deleted, structure will be retained.\n", 111 | "* Launch Spark SQL" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "%%sql\n", 121 | "\n", 122 | "CREATE DATABASE IF NOT EXISTS itversity_retail" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "%%sql\n", 132 | "\n", 133 | "SHOW tables" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "%%sql\n", 143 | "\n", 144 | "DROP TABLE IF EXISTS orders" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "%%sql\n", 154 | "\n", 155 | "CREATE TABLE orders (\n", 156 | " order_id INT,\n", 157 | " order_date STRING,\n", 158 | " order_customer_id INT,\n", 159 | " order_status STRING\n", 160 | ") ROW FORMAT DELIMITED FIELDS TERMINATED BY ','" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "%%sql\n", 170 | "\n", 171 | "LOAD DATA LOCAL INPATH '/data/retail_db/orders'\n", 172 | " INTO TABLE orders" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "%%sql\n", 182 | "\n", 183 | "SELECT * FROM orders LIMIT 10" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "%%sql\n", 193 | "\n", 194 | "TRUNCATE TABLE orders" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "%%sql\n", 204 | "\n", 205 | "SELECT * FROM orders LIMIT 10" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "%%sql\n", 215 | "\n", 216 | "DROP TABLE IF EXISTS orders" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "%%sql\n", 226 | "\n", 227 | "CREATE EXTERNAL TABLE orders (\n", 228 | " order_id INT,\n", 229 | " order_date STRING,\n", 230 | " order_customer_id INT,\n", 231 | " order_status STRING\n", 232 | ") ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\n", 233 | "LOCATION '/user/itversity/external/retail_db/orders'" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "%%sql\n", 243 | "\n", 244 | "LOAD DATA LOCAL INPATH '/data/retail_db/orders'\n", 245 | " OVERWRITE INTO TABLE orders" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "%%sql\n", 255 | "\n", 256 | "SELECT * FROM orders LIMIT 10" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "%%sql\n", 266 | "\n", 267 | "TRUNCATE TABLE orders" 268 | ] 269 | } 270 | ], 271 | "metadata": { 272 | "kernelspec": { 273 | "display_name": "Apache Toree - Scala", 274 | "language": "scala", 275 | "name": "apache_toree_scala" 276 | }, 277 | "language_info": { 278 | 
"name": "" 279 | } 280 | }, 281 | "nbformat": 4, 282 | "nbformat_minor": 4 283 | } 284 | -------------------------------------------------------------------------------- /05_dml_and_partitioning/01_dml_and_partitioning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# DML and Partitioning\n", 8 | "\n", 9 | "As part of this section we will continue understanding further concepts related to DML and also get into the details related to partitioning tables. With respect to DML, earlier we have seen how to use LOAD command, now we will see how to use INSERT command primarily to get query results copied into a table.\n", 10 | "\n", 11 | "* Introduction to Partitioning\n", 12 | "* Creating Tables using Parquet\n", 13 | "* LOAD vs. INSERT\n", 14 | "* Inserting Data using Stage Table\n", 15 | "* Creating Partitioned Tables\n", 16 | "* Adding Partitions to Tables\n", 17 | "* Loading data into Partitions\n", 18 | "* Inserting Data into Partitions\n", 19 | "* Using Dynamic Partition Mode\n", 20 | "* Exercise - Partitioned Tables" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "tags": [ 28 | "remove-cell" 29 | ] 30 | }, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/html": [ 35 | "" 36 | ] 37 | }, 38 | "execution_count": 1, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "%%HTML\n", 45 | "" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 
53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "val username = System.getProperty(\"user.name\")" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "import org.apache.spark.sql.SparkSession\n", 71 | "\n", 72 | "val username = System.getProperty(\"user.name\")\n", 73 | "val spark = SparkSession.\n", 74 | " builder.\n", 75 | " config(\"spark.ui.port\", \"0\").\n", 76 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 77 | " enableHiveSupport.\n", 78 | " appName(s\"${username} | Spark SQL - Managing Tables - DML and Partitioning\").\n", 79 | " master(\"yarn\").\n", 80 | " getOrCreate" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 88 | "\n", 89 | "**Using Spark SQL**\n", 90 | "\n", 91 | "```\n", 92 | "spark2-sql \\\n", 93 | " --master yarn \\\n", 94 | " --conf spark.ui.port=0 \\\n", 95 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 96 | "```\n", 97 | "\n", 98 | "**Using Scala**\n", 99 | "\n", 100 | "```\n", 101 | "spark2-shell \\\n", 102 | " --master yarn \\\n", 103 | " --conf spark.ui.port=0 \\\n", 104 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 105 | "```\n", 106 | "\n", 107 | "**Using Pyspark**\n", 108 | "\n", 109 | "```\n", 110 | "pyspark2 \\\n", 111 | " --master yarn \\\n", 112 | " --conf spark.ui.port=0 \\\n", 113 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 114 | "```" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "**Unlike Hive, Spark SQL does not support Bucketing, which is similar to Hash Partitioning. However, Delta Lake does. Delta Lake is a third-party library that provides additional capabilities, such as ACID transactions, on top of Spark Metastore tables.**\n", 122 | "\n", 123 | "Let us make sure that we have the orders table with data, as we will be using it to populate partitioned tables very soon." 
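For contrast only (this sketch is an illustrative addition, not part of the original course material), bucketing in Hive is declared with `CLUSTERED BY`; as noted above, equivalent DDL is not available for Spark Metastore tables. The column names here simply reuse the orders layout:

```
-- Hive DDL (not Spark SQL), shown only to illustrate what bucketing looks like
CREATE TABLE orders_bucketed (
  order_id INT,
  order_date STRING,
  order_customer_id INT,
  order_status STRING
) CLUSTERED BY (order_id) INTO 8 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
```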
124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "%%sql\n", 133 | "\n", 134 | "USE itversity_retail" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "%%sql\n", 144 | "\n", 145 | "SHOW tables" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "%%sql\n", 155 | "\n", 156 | "DROP TABLE orders" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "%%sql\n", 166 | "\n", 167 | "SELECT current_database()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "%%sql\n", 177 | "\n", 178 | "CREATE TABLE IF NOT EXISTS itversity_retail.orders (\n", 179 | " order_id INT,\n", 180 | " order_date STRING,\n", 181 | " order_customer_id INT,\n", 182 | " order_status STRING\n", 183 | ") ROW FORMAT DELIMITED FIELDS TERMINATED BY ','" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "%%sql\n", 193 | "\n", 194 | "LOAD DATA LOCAL INPATH '/data/retail_db/orders'\n", 195 | " OVERWRITE INTO TABLE orders" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "%%sql\n", 205 | "\n", 206 | "SELECT count(1) FROM orders" 207 | ] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Apache Toree - Scala", 213 | "language": "scala", 214 | "name": "apache_toree_scala" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": "text/x-scala", 218 | "file_extension": ".scala", 219 | "mimetype": "text/x-scala", 220 | "name": "scala", 221 | "pygments_lexer": "scala", 222 | "version": "2.11.12" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 4 227 | } 228 | -------------------------------------------------------------------------------- /05_dml_and_partitioning/02_introduction_to_partitioning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Introduction to Partitioning\n", 8 | "\n", 9 | "Let us get an overview of partitioning of Spark Metastore tables." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "* It is similar to list partitioning where each partition is equal to a particular value for a given column.\n", 42 | "* Spark Metastore does not support range partitioning and bucketing. 
Bucketing, which is similar to Hash Partitioning, is supported in Hive.\n", 43 | "* Once the table is created, we can add static partitions and then load or insert data into it.\n", 44 | "* Spark Metastore also supports creation of partitions dynamically, where partitions will be created based on the partition column value.\n", 45 | "* A partitioned table can be either managed or external." 46 | ] 47 | } 48 | ], 49 | "metadata": { 50 | "kernelspec": { 51 | "display_name": "Apache Toree - Scala", 52 | "language": "scala", 53 | "name": "apache_toree_scala" 54 | }, 55 | "language_info": { 56 | "name": "" 57 | } 58 | }, 59 | "nbformat": 4, 60 | "nbformat_minor": 4 61 | } 62 | -------------------------------------------------------------------------------- /05_dml_and_partitioning/04_load_vs_insert.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## LOAD vs. INSERT\n", 8 | "\n", 9 | "Let us compare and contrast the LOAD and INSERT commands. These are the main approaches for getting data into Spark Metastore tables." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 4, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 4, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 
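As a quick preview of the comparison (a hedged sketch reusing tables from this section; the `orders_part` table is only created in a later notebook): `LOAD` copies files as-is into the table location, while `INSERT` runs a query, so data can be transformed or filtered on the way in.

```
-- LOAD: straight file copy into the table location, no data-level validation
LOAD DATA LOCAL INPATH '/data/retail_db/orders' INTO TABLE orders;

-- INSERT: populated from a query, so values can be converted or filtered
INSERT INTO TABLE orders_part PARTITION (order_month=201307)
  SELECT * FROM orders WHERE order_date LIKE '2013-07%';
```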
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "val username = System.getProperty(\"user.name\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import org.apache.spark.sql.SparkSession\n", 60 | "\n", 61 | "val username = System.getProperty(\"user.name\")\n", 62 | "val spark = SparkSession.\n", 63 | " builder.\n", 64 | " config(\"spark.ui.port\", \"0\").\n", 65 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 66 | " enableHiveSupport.\n", 67 | " appName(s\"${username} | Spark SQL - Managing Tables - DML and Partitioning\").\n", 68 | " master(\"yarn\").\n", 69 | " getOrCreate" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 77 | "\n", 78 | "**Using Spark SQL**\n", 79 | "\n", 80 | "```\n", 81 | "spark2-sql \\\n", 82 | " --master yarn \\\n", 83 | " --conf spark.ui.port=0 \\\n", 84 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 85 | "```\n", 86 | "\n", 87 | "**Using Scala**\n", 88 | "\n", 89 | "```\n", 90 | "spark2-shell \\\n", 91 | " --master yarn \\\n", 92 | " --conf spark.ui.port=0 \\\n", 93 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 94 | "```\n", 95 | "\n", 96 | "**Using Pyspark**\n", 97 | "\n", 98 | "```\n", 99 | "pyspark2 \\\n", 100 | " --master yarn \\\n", 101 | " --conf spark.ui.port=0 \\\n", 102 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 103 | "```" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "* LOAD will copy the files by dividing them into blocks.\n", 111 | "* LOAD is the fastest way of getting data into Spark Metastore tables. However, there will be minimal validations at File level. 
\n", 112 | "* There will be no transformations or validations at data level.\n", 113 | "* If it require any transformation while getting data into Spark Metastore table, then we need to use INSERT command.\n", 114 | "* Here are some of the usage scenarios of insert:\n", 115 | " * Changing delimiters in case of text file format\n", 116 | " * Changing file format\n", 117 | " * Loading data into partitioned or bucketed tables (if bucketing is supported).\n", 118 | " * Apply any other transformations at data level (widely used)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "%%sql\n", 128 | "\n", 129 | "USE itversity_retail" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "%%sql\n", 139 | "\n", 140 | "DROP TABLE IF EXISTS order_items" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "%%sql\n", 150 | "\n", 151 | "CREATE TABLE order_items (\n", 152 | " order_item_id INT,\n", 153 | " order_item_order_id INT,\n", 154 | " order_item_product_id INT,\n", 155 | " order_item_quantity INT,\n", 156 | " order_item_subtotal FLOAT,\n", 157 | " order_item_product_price FLOAT\n", 158 | ") STORED AS parquet" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "%%sql\n", 168 | "\n", 169 | "LOAD DATA LOCAL INPATH '/data/retail_db/order_items'\n", 170 | " INTO TABLE order_items" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "val username = System.getProperty(\"user.name\")" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "import sys.process._\n", 189 | "\n", 190 | "s\"hdfs dfs -ls /user/${username}/warehouse/${username}_retail.db/order_items\" !" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "%%sql\n", 200 | "\n", 201 | "SELECT * FROM order_items LIMIT 10" 202 | ] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Apache Toree - Scala", 208 | "language": "scala", 209 | "name": "apache_toree_scala" 210 | }, 211 | "language_info": { 212 | "name": "" 213 | } 214 | }, 215 | "nbformat": 4, 216 | "nbformat_minor": 4 217 | } 218 | -------------------------------------------------------------------------------- /05_dml_and_partitioning/06_creating_partitioned_tables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Creating Partitioned Tables\n", 8 | "\n", 9 | "Let us understand how to create partitioned table and get data into that table." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 6, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 6, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "val username = System.getProperty(\"user.name\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import org.apache.spark.sql.SparkSession\n", 53 | "\n", 54 | "val username = System.getProperty(\"user.name\")\n", 55 | "val spark = SparkSession.\n", 56 | " builder.\n", 57 | " config(\"spark.ui.port\", \"0\").\n", 58 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 59 | " enableHiveSupport.\n", 60 | " appName(s\"${username} | Spark SQL - Managing Tables - DML and Partitioning\").\n", 61 | " master(\"yarn\").\n", 62 | " getOrCreate" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 70 | "\n", 71 | "**Using Spark SQL**\n", 72 | "\n", 73 | "```\n", 74 | "spark2-sql \\\n", 75 | " --master yarn \\\n", 76 | " --conf spark.ui.port=0 \\\n", 77 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 78 | "```\n", 79 | "\n", 80 | "**Using Scala**\n", 81 | "\n", 82 | "```\n", 83 | "spark2-shell \\\n", 84 | " --master yarn \\\n", 85 | " --conf spark.ui.port=0 \\\n", 86 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 87 | "```\n", 88 | "\n", 89 | "**Using Pyspark**\n", 90 | "\n", 91 | "```\n", 92 | "pyspark2 \\\n", 93 | " --master yarn \\\n", 94 | " --conf spark.ui.port=0 \\\n", 95 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 96 | "```" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "* Earlier we have already created orders table. We will use that as reference and create partitioned table.\n", 104 | "* We can use `PARTITIONED BY` clause to define the **column along with data type**. In our case we will use **order_month as partition column**.\n", 105 | "* We will not be able to directly load the data into the partitioned table using our original orders data (as data is not in sync with structure).\n", 106 | "\n", 107 | "Here is the example of creating partitioned tables in Spark Metastore." 
108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "%%sql\n", 117 | "\n", 118 | "USE itversity_retail" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "%%sql\n", 128 | "\n", 129 | "SHOW tables" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "* Drop orders_part if it already exists" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "%%sql\n", 146 | "\n", 147 | "DROP TABLE IF EXISTS orders_part" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "%%sql\n", 157 | "\n", 158 | "CREATE TABLE orders_part (\n", 159 | " order_id INT,\n", 160 | " order_date STRING,\n", 161 | " order_customer_id INT,\n", 162 | " order_status STRING\n", 163 | ") PARTITIONED BY (order_month INT)\n", 164 | "ROW FORMAT DELIMITED FIELDS TERMINATED BY ','" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "%%sql\n", 174 | "\n", 175 | "DESCRIBE orders_part" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "spark.sql(\"DESCRIBE FORMATTED orders_part\").show(200, false)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "import sys.process._\n", 194 | "\n", 195 | "s\"hdfs dfs -ls /user/${username}/warehouse/${username}_retail.db/orders_part\" !" 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Apache Toree - Scala", 202 | "language": "scala", 203 | "name": "apache_toree_scala" 204 | }, 205 | "language_info": { 206 | "name": "" 207 | } 208 | }, 209 | "nbformat": 4, 210 | "nbformat_minor": 4 211 | } 212 | -------------------------------------------------------------------------------- /05_dml_and_partitioning/09_inserting_data_into_partitions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Inserting Data into Partitions\n", 8 | "\n", 9 | "Let us understand how to use insert to get data into static partitions in Spark Metastore from existing table called as orders." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 9, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 9, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "val username = System.getProperty(\"user.name\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import org.apache.spark.sql.SparkSession\n", 60 | "\n", 61 | "val username = System.getProperty(\"user.name\")\n", 62 | "val spark = SparkSession.\n", 63 | " builder.\n", 64 | " config(\"spark.ui.port\", \"0\").\n", 65 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 66 | " enableHiveSupport.\n", 67 | " appName(s\"${username} | Spark SQL - Managing Tables - DML and Partitioning\").\n", 68 | " master(\"yarn\").\n", 69 | " getOrCreate" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 77 | "\n", 78 | "**Using Spark SQL**\n", 79 | "\n", 80 | "```\n", 81 | "spark2-sql \\\n", 82 | " --master yarn \\\n", 83 | " --conf spark.ui.port=0 \\\n", 84 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 85 | "```\n", 86 | "\n", 87 | "**Using Scala**\n", 88 | "\n", 89 | "```\n", 90 | "spark2-shell \\\n", 91 | " --master yarn \\\n", 92 | " --conf spark.ui.port=0 \\\n", 93 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 94 | "```\n", 95 | "\n", 96 | "**Using Pyspark**\n", 97 | "\n", 98 | "```\n", 99 | "pyspark2 \\\n", 100 | " --master yarn \\\n", 101 | " --conf spark.ui.port=0 \\\n", 102 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 103 | "```" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "* Let us recap what is covered so far related to partitioned tables.\n", 111 | " * We have created a table called orders_part with order_month of type INT as the partition column.\n", 112 | " * We have added 4 static partitions for 201307, 201308, 201309 and 201310 using the ALTER TABLE command.\n", 113 | " * Once the table was created and the partitions were added, we pre-processed the data and got it into the partitions using the LOAD command.\n", 114 | "* It is not always practical to use the LOAD command. We typically use `INSERT` via a stage table to copy data into a partitioned table.\n", 115 | "* We can pre-create partitions in partitioned tables and insert data into partitions using an appropriate `INSERT` command. One needs to ensure that the required filter condition is applied to get the data relevant to the partition that is being populated.\n", 116 | "* We can also create partitions dynamically, which we will see as part of the next topic. A quick way to check the partitions added so far is sketched below." 
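In addition to listing the HDFS directories as done in the cells below, the partitions registered in the metastore can be reviewed directly (a small added sketch, not one of the original cells):

```
-- Lists the partitions of the partitioned table created earlier in this section
SHOW PARTITIONS itversity_retail.orders_part
```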
117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "%%sql\n", 126 | "\n", 127 | "USE itversity_retail" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "%%sql\n", 137 | "\n", 138 | "ALTER TABLE orders_part ADD PARTITION (order_month=201311)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "%%sql\n", 148 | "\n", 149 | "SELECT count(1) FROM orders_part" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "%%sql\n", 159 | "\n", 160 | "INSERT INTO TABLE orders_part PARTITION (order_month=201311)\n", 161 | " SELECT * FROM orders WHERE order_date LIKE '2013-11%'" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "%%sql\n", 171 | "\n", 172 | "SELECT count(1) FROM orders_part" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "import sys.process._\n", 182 | "\n", 183 | "s\"hdfs dfs -ls -R /user/${username}/warehouse/${username}_retail.db/orders_part\" !" 184 | ] 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "display_name": "Apache Toree - Scala", 190 | "language": "scala", 191 | "name": "apache_toree_scala" 192 | }, 193 | "language_info": { 194 | "name": "" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 4 199 | } 200 | -------------------------------------------------------------------------------- /05_dml_and_partitioning/11_exercises_partitioned_tables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Exercise - Partitioned Tables\n", 8 | "\n", 9 | "Let us take care of this exercise related to partitioning to self evaluate our comfort level in working with partitioned tables." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 11, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 11, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "val username = System.getProperty(\"user.name\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import org.apache.spark.sql.SparkSession\n", 60 | "\n", 61 | "val username = System.getProperty(\"user.name\")\n", 62 | "val spark = SparkSession.\n", 63 | " builder.\n", 64 | " config(\"spark.ui.port\", \"0\").\n", 65 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 66 | " enableHiveSupport.\n", 67 | " appName(s\"${username} | Spark SQL - Managing Tables - DML and Partitioning\").\n", 68 | " master(\"yarn\").\n", 69 | " getOrCreate" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 77 | "\n", 78 | "**Using Spark SQL**\n", 79 | "\n", 80 | "```\n", 81 | "spark2-sql \\\n", 82 | " --master yarn \\\n", 83 | " --conf spark.ui.port=0 \\\n", 84 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 85 | "```\n", 86 | "\n", 87 | "**Using Scala**\n", 88 | "\n", 89 | "```\n", 90 | "spark2-shell \\\n", 91 | " --master yarn \\\n", 92 | " --conf spark.ui.port=0 \\\n", 93 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 94 | "```\n", 95 | "\n", 96 | "**Using Pyspark**\n", 97 | "\n", 98 | "```\n", 99 | "pyspark2 \\\n", 100 | " --master yarn \\\n", 101 | " --conf spark.ui.port=0 \\\n", 102 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 103 | "```" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "* Duration: **30 Minutes**\n", 111 | "* Use data from **/data/nyse_all/nyse_data**\n", 112 | "* Use database **YOUR_OS_USER_NAME_nyse**\n", 113 | "* Create partitioned table **nyse_eod_part**\n", 114 | "* Field Names: stockticker, tradedate, openprice, highprice, lowprice, closeprice, volume\n", 115 | "* Determine correct data types based on the values\n", 116 | "* Create Managed table with \",\" as delimiter.\n", 117 | "* Partition Field should be **tradeyear** and of type **INT** (one partition for corresponding year)\n", 118 | "* Insert data into partitioned table using dynamic partition mode.\n", 119 | "* Here are the steps to come up with the solution.\n", 120 | " * Review the files under **/data/nyse_all/nyse_data** - determine data types (For example: tradedate should be INT and volume should be BIGINT)\n", 121 | " * Create database **YOUR_OS_USER_NAME_nyse** (if it does not exists)\n", 122 | " * Create non partitioned stage table\n", 123 | " * Load data into non partitioned stage table\n", 124 | " * Validate the count and also see that data is as expected by running simple select query.\n", 125 | " * Create partitioned table\n", 126 | " * Set required properties to use dynamic partition\n", 127 | " * Insert data into partitioned table - here is how you can compute year from tradedate of type int `year(to_date(cast(tradedate AS STRING), 'yyyyMMdd')) AS tradeyear`\n", 128 | " * Run below validate commands to validate" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Validation\n", 136 | "Here are the instructions to validate the results.\n", 137 | "* Run `hdfs dfs -ls /user/YOUR_OS_USER_NAME/warehouse/YOUR_OS_USER_NAME_nyse.db/nyse_eod_part`\n", 138 | "* Run `SHOW 
PARTITIONS YOUR_OS_USER_NAME_nyse.nyse_eod_part`. You should see partitions for all the years using which you have loaded the data.\n", 139 | "* Run `SELECT count(1) FROM YOUR_OS_USER_NAME_nyse.nyse_eod_part`. The count should match the number of records in our dataset.\n", 140 | "* You can compare with the output generated by this simple Python code which is validated in our labs.\n", 141 | "\n", 142 | "```\n", 143 | "import pandas as pd\n", 144 | "import glob\n", 145 | "\n", 146 | "path = r'/data/nyse_all/nyse_data' # use your path\n", 147 | "all_files = glob.glob(path + \"/*.txt.gz\")\n", 148 | "\n", 149 | "li = []\n", 150 | "\n", 151 | "for filename in all_files:\n", 152 | " df = pd.read_csv(filename, index_col=None, header=None)\n", 153 | " li.append(df)\n", 154 | "\n", 155 | "frame = pd.concat(li, axis=0, ignore_index=True)\n", 156 | "frame.shape\n", 157 | "```" 158 | ] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Apache Toree - Scala", 164 | "language": "scala", 165 | "name": "apache_toree_scala" 166 | }, 167 | "language_info": { 168 | "name": "" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 4 173 | } 174 | -------------------------------------------------------------------------------- /06_predefined_functions/01_predefined_functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Predefined Functions\n", 8 | "\n", 9 | "Let us go through the functions that can be used while processing the data. These are typically applied on columns to get derived values from existing column values." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%%HTML\n", 23 | "" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "* Overview of Functions\n", 31 | "* Validating Functions\n", 32 | "* String Manipulation Functions\n", 33 | "* Date Manipulation Functions\n", 34 | "* Overview of Numeric Functions\n", 35 | "* Data Type Conversion\n", 36 | "* Handling NULL Values\n", 37 | "* Using CASE and WHEN\n", 38 | "* Query Example - Word Count" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [] 47 | } 48 | ], 49 | "metadata": { 50 | "kernelspec": { 51 | "display_name": "Spark 2 - Scala", 52 | "language": "scala", 53 | "name": "spark_2_scala" 54 | }, 55 | "language_info": { 56 | "codemirror_mode": "text/x-scala", 57 | "file_extension": ".scala", 58 | "mimetype": "text/x-scala", 59 | "name": "scala", 60 | "pygments_lexer": "scala", 61 | "version": "2.11.12" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 4 66 | } 67 | -------------------------------------------------------------------------------- /06_predefined_functions/02_overview_of_functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of Functions\n", 8 | "Let us get overview of pre-defined functions in Spark SQL." 
9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": { 15 | "tags": [ 16 | "remove-cell" 17 | ] 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "%%HTML\n", 22 | "" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "val username = System.getProperty(\"user.name\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import org.apache.spark.sql.SparkSession\n", 48 | "\n", 49 | "val username = System.getProperty(\"user.name\")\n", 50 | "val spark = SparkSession.\n", 51 | " builder.\n", 52 | " config(\"spark.ui.port\", \"0\").\n", 53 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 54 | " enableHiveSupport.\n", 55 | " appName(s\"${username} | Spark SQL - Predefined Functions\").\n", 56 | " master(\"yarn\").\n", 57 | " getOrCreate" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 65 | "\n", 66 | "**Using Spark SQL**\n", 67 | "\n", 68 | "```\n", 69 | "spark2-sql \\\n", 70 | " --master yarn \\\n", 71 | " --conf spark.ui.port=0 \\\n", 72 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 73 | "```\n", 74 | "\n", 75 | "**Using Scala**\n", 76 | "\n", 77 | "```\n", 78 | "spark2-shell \\\n", 79 | " --master yarn \\\n", 80 | " --conf spark.ui.port=0 \\\n", 81 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 82 | "```\n", 83 | "\n", 84 | "**Using Pyspark**\n", 85 | "\n", 86 | "```\n", 87 | "pyspark2 \\\n", 88 | " --master yarn \\\n", 89 | " --conf spark.ui.port=0 \\\n", 90 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 91 | "```" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "* We can get the list of functions by running `SHOW functions`\n", 99 | "* We can use the DESCRIBE command to get the syntax and semantics of a function - `DESCRIBE FUNCTION substr` (an `EXTENDED` variant is sketched after the cells below)\n", 100 | "* Following are the categories of functions that are more commonly used.\n", 101 | " * String Manipulation\n", 102 | " * Date Manipulation\n", 103 | " * Numeric Functions\n", 104 | " * Type Conversion Functions\n", 105 | " * CASE and WHEN\n", 106 | " * and more" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "%%sql\n", 116 | "\n", 117 | "SHOW functions" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "spark.sql(\"SHOW functions\").show(300, false)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "spark.catalog.listFunctions.show(300, false)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "%%sql\n", 145 | "\n", 146 | "DESCRIBE FUNCTION substr" 147 | ] 
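As a small addition (not one of the original cells), `DESCRIBE FUNCTION EXTENDED` also prints usage examples for many functions; the Scala equivalent would be `spark.sql("DESCRIBE FUNCTION EXTENDED substr").show(false)`.

```
DESCRIBE FUNCTION EXTENDED substr
```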
148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "spark.sql(\"DESCRIBE FUNCTION substr\").show(false)" 156 | ] 157 | } 158 | ], 159 | "metadata": { 160 | "kernelspec": { 161 | "display_name": "Spark 2 - Scala", 162 | "language": "scala", 163 | "name": "spark_2_scala" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": "text/x-scala", 167 | "file_extension": ".scala", 168 | "mimetype": "text/x-scala", 169 | "name": "scala", 170 | "pygments_lexer": "scala", 171 | "version": "2.11.12" 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 4 176 | } 177 | -------------------------------------------------------------------------------- /06_predefined_functions/07_data_type_conversion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Data Type Conversion\n", 8 | "\n", 9 | "Let us understand how we can type cast to change the data type of extracted value to its original type." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%%HTML\n", 23 | "" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "val username = System.getProperty(\"user.name\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import org.apache.spark.sql.SparkSession\n", 49 | "\n", 50 | "val username = System.getProperty(\"user.name\")\n", 51 | "val spark = SparkSession.\n", 52 | " builder.\n", 53 | " config(\"spark.ui.port\", \"0\").\n", 54 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 55 | " enableHiveSupport.\n", 56 | " appName(s\"${username} | Spark SQL - Predefined Functions\").\n", 57 | " master(\"yarn\").\n", 58 | " getOrCreate" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 66 | "\n", 67 | "**Using Spark SQL**\n", 68 | "\n", 69 | "```\n", 70 | "spark2-sql \\\n", 71 | " --master yarn \\\n", 72 | " --conf spark.ui.port=0 \\\n", 73 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 74 | "```\n", 75 | "\n", 76 | "**Using Scala**\n", 77 | "\n", 78 | "```\n", 79 | "spark2-shell \\\n", 80 | " --master yarn \\\n", 81 | " --conf spark.ui.port=0 \\\n", 82 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 83 | "```\n", 84 | "\n", 85 | "**Using Pyspark**\n", 86 | "\n", 87 | "```\n", 88 | "pyspark2 \\\n", 89 | " --master yarn \\\n", 90 | " --conf spark.ui.port=0 \\\n", 91 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 92 | "```" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "%%sql\n", 102 | "\n", 103 | "SELECT current_date AS current_date" 104 | ] 105 
| }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "%%sql\n", 113 | "\n", 114 | "SELECT split(current_date, '-')[1] AS month" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "%%sql\n", 124 | "\n", 125 | "SELECT cast(split(current_date, '-')[1] AS INT) AS month" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "%%sql\n", 135 | "\n", 136 | "SELECT cast('0.04' AS FLOAT) AS result" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "%%sql\n", 146 | "\n", 147 | "SELECT cast('0.04' AS INT) AS zero" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "%%sql\n", 157 | "\n", 158 | "SELECT cast('xyz' AS INT) AS returns_null" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "%%sql\n", 168 | "\n", 169 | "CREATE EXTERNAL TABLE IF NOT EXISTS orders_single_column (\n", 170 | " s STRING\n", 171 | ") LOCATION '/user/itversity/warehouse/itversity_retail.db/orders'" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "%%sql\n", 181 | "\n", 182 | "SELECT * FROM orders_single_column LIMIT 10" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "%%sql\n", 192 | "\n", 193 | "SELECT split(s, ',')[0] AS order_id,\n", 194 | " split(s, ',')[1] AS order_date,\n", 195 | " split(s, ',')[2] AS order_customer_id,\n", 196 | " split(s, ',')[3] AS order_status\n", 197 | "FROM orders_single_column LIMIT 10" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "%%sql\n", 207 | "\n", 208 | "SELECT cast(split(s, ',')[0] AS INT) AS order_id,\n", 209 | " cast(split(s, ',')[1] AS TIMESTAMP) AS order_date,\n", 210 | " cast(split(s, ',')[2] AS INT) AS order_customer_id,\n", 211 | " cast(split(s, ',')[3] AS STRING) AS order_status\n", 212 | "FROM orders_single_column LIMIT 10" 213 | ] 214 | } 215 | ], 216 | "metadata": { 217 | "kernelspec": { 218 | "display_name": "Spark 2 - Scala", 219 | "language": "scala", 220 | "name": "spark_2_scala" 221 | }, 222 | "language_info": { 223 | "codemirror_mode": "text/x-scala", 224 | "file_extension": ".scala", 225 | "mimetype": "text/x-scala", 226 | "name": "scala", 227 | "pygments_lexer": "scala", 228 | "version": "2.11.12" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 4 233 | } 234 | -------------------------------------------------------------------------------- /06_predefined_functions/09_using_case_and_when.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Using CASE and WHEN\n", 8 | "At times we might have to select values from multiple columns conditionally." 
9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": { 15 | "tags": [ 16 | "remove-cell" 17 | ] 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "%%HTML\n", 22 | "" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "val username = System.getProperty(\"user.name\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import org.apache.spark.sql.SparkSession\n", 48 | "\n", 49 | "val username = System.getProperty(\"user.name\")\n", 50 | "val spark = SparkSession.\n", 51 | " builder.\n", 52 | " config(\"spark.ui.port\", \"0\").\n", 53 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 54 | " enableHiveSupport.\n", 55 | " appName(s\"${username} | Spark SQL - Predefined Functions\").\n", 56 | " master(\"yarn\").\n", 57 | " getOrCreate" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 65 | "\n", 66 | "**Using Spark SQL**\n", 67 | "\n", 68 | "```\n", 69 | "spark2-sql \\\n", 70 | " --master yarn \\\n", 71 | " --conf spark.ui.port=0 \\\n", 72 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 73 | "```\n", 74 | "\n", 75 | "**Using Scala**\n", 76 | "\n", 77 | "```\n", 78 | "spark2-shell \\\n", 79 | " --master yarn \\\n", 80 | " --conf spark.ui.port=0 \\\n", 81 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 82 | "```\n", 83 | "\n", 84 | "**Using Pyspark**\n", 85 | "\n", 86 | "```\n", 87 | "pyspark2 \\\n", 88 | " --master yarn \\\n", 89 | " --conf spark.ui.port=0 \\\n", 90 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 91 | "```" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "* We can use `CASE` and `WHEN` for that.\n", 99 | "* Let us implement this conditional logic to come up with a derived order_status.\n", 100 | " * If order_status is COMPLETE or CLOSED, set COMPLETED\n", 101 | " * If order_status has PENDING in it, then we will say PENDING\n", 102 | " * If order_status has PROCESSING or PAYMENT_REVIEW in it, then we will say PENDING\n", 103 | " * We will set all others as OTHER\n", 104 | "* We can also have `ELSE` as part of `CASE` and `WHEN`. A sketch of the alternative simple `CASE` form follows below." 
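Besides the searched form used in the cells below, Spark SQL also accepts the simple `CASE <expression> WHEN <value>` form. A hedged sketch (an addition, not one of the original cells):

```
-- Simple CASE form: order_status is compared against literal values
SELECT o.*,
       CASE order_status
           WHEN 'COMPLETE' THEN 'COMPLETED'
           WHEN 'CLOSED' THEN 'COMPLETED'
           ELSE order_status
       END AS updated_order_status
FROM orders o
LIMIT 10
```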
105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "%%sql\n", 114 | "\n", 115 | "USE itversity_retail" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "%%sql\n", 125 | "\n", 126 | "SHOW tables" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "%%sql\n", 136 | "\n", 137 | "SELECT DISTINCT order_status FROM orders LIMIT 10" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "%%sql\n", 147 | "\n", 148 | "SELECT o.*,\n", 149 | " CASE WHEN order_status IN ('COMPLETE', 'CLOSED') THEN 'COMPLETED'\n", 150 | " END AS updated_order_status\n", 151 | "FROM orders o\n", 152 | "LIMIT 10" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "%%sql\n", 162 | "\n", 163 | "SELECT o.*,\n", 164 | " CASE WHEN order_status IN ('COMPLETE', 'CLOSED') THEN 'COMPLETED'\n", 165 | " ELSE order_status\n", 166 | " END AS updated_order_status\n", 167 | "FROM orders o\n", 168 | "LIMIT 10" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "%%sql\n", 178 | "\n", 179 | "SELECT o.*,\n", 180 | " CASE \n", 181 | " WHEN order_status IN ('COMPLETE', 'CLOSED') THEN 'COMPLETED'\n", 182 | " WHEN order_status LIKE '%PENDING%' THEN 'PENDING'\n", 183 | " ELSE 'OTHER'\n", 184 | " END AS updated_order_status\n", 185 | "FROM orders o\n", 186 | "LIMIT 10" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "%%sql\n", 196 | "\n", 197 | "SELECT o.*,\n", 198 | " CASE \n", 199 | " WHEN order_status IN ('COMPLETE', 'CLOSED') THEN 'COMPLETED'\n", 200 | " WHEN order_status LIKE '%PENDING%' OR order_status IN ('PROCESSING', 'PAYMENT_REVIEW')\n", 201 | " THEN 'PENDING'\n", 202 | " ELSE 'OTHER'\n", 203 | " END AS updated_order_status\n", 204 | "FROM orders o\n", 205 | "LIMIT 10" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "%%sql\n", 215 | "\n", 216 | "SELECT DISTINCT order_status,\n", 217 | " CASE \n", 218 | " WHEN order_status IN ('COMPLETE', 'CLOSED') THEN 'COMPLETED'\n", 219 | " WHEN order_status LIKE '%PENDING%' OR order_status IN ('PROCESSING', 'PAYMENT_REVIEW')\n", 220 | " THEN 'PENDING'\n", 221 | " ELSE 'OTHER'\n", 222 | " END AS updated_order_status\n", 223 | "FROM orders\n", 224 | "ORDER BY updated_order_status" 225 | ] 226 | } 227 | ], 228 | "metadata": { 229 | "kernelspec": { 230 | "display_name": "Spark 2 - Scala", 231 | "language": "scala", 232 | "name": "spark_2_scala" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": "text/x-scala", 236 | "file_extension": ".scala", 237 | "mimetype": "text/x-scala", 238 | "name": "scala", 239 | "pygments_lexer": "scala", 240 | "version": "2.11.12" 241 | } 242 | }, 243 | "nbformat": 4, 244 | "nbformat_minor": 4 245 | } 246 | -------------------------------------------------------------------------------- /07_windowing_functions/01_windowing_functions.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Windowing Functions\n", 8 | "\n", 9 | "As part of this section we will primarily talk about Windowing Functions. These are also known as Analytic Functions in Databases like Oracle." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%%HTML\n", 23 | "" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "* Prepare HR Database\n", 31 | "* Overview of Windowing Functions\n", 32 | "* Aggregations using Windowing Functions\n", 33 | "* Getting LEAD and LAG values\n", 34 | "* Getting first and last values\n", 35 | "* Ranking using Windowing Functions\n", 36 | "* Understanding order of execution of SQL\n", 37 | "* Overview of Nested Sub Queries\n", 38 | "* Filtering - Window Function Results" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import org.apache.spark.sql.SparkSession\n", 48 | "\n", 49 | "val username = System.getProperty(\"user.name\")\n", 50 | "val spark = SparkSession.\n", 51 | " builder.\n", 52 | " config(\"spark.ui.port\", \"0\").\n", 53 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 54 | " enableHiveSupport.\n", 55 | " appName(s\"${username} | Spark SQL - Windowing Functions\").\n", 56 | " master(\"yarn\").\n", 57 | " getOrCreate" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "%%sql\n", 67 | "\n", 68 | "SET spark.sql.shuffle.partitions=2" 69 | ] 70 | } 71 | ], 72 | "metadata": { 73 | "kernelspec": { 74 | "display_name": "Apache Toree - Scala", 75 | "language": "scala", 76 | "name": "apache_toree_scala" 77 | }, 78 | "language_info": { 79 | "codemirror_mode": "text/x-scala", 80 | "file_extension": ".scala", 81 | "mimetype": "text/x-scala", 82 | "name": "scala", 83 | "pygments_lexer": "scala", 84 | "version": "2.11.12" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 4 89 | } 90 | -------------------------------------------------------------------------------- /07_windowing_functions/02_prepare_database.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Prepare HR Database\n", 8 | "\n", 9 | "Let us prepare HR database with **EMPLOYEES** Table. We will be using this for some of the examples as well as exercises related to Window Functions." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%%HTML\n", 23 | "" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 
31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "val username = System.getProperty(\"user.name\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import org.apache.spark.sql.SparkSession\n", 49 | "\n", 50 | "val username = System.getProperty(\"user.name\")\n", 51 | "val spark = SparkSession.\n", 52 | " builder.\n", 53 | " config(\"spark.ui.port\", \"0\").\n", 54 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 55 | " enableHiveSupport.\n", 56 | " appName(s\"${username} | Spark SQL - Windowing Functions\").\n", 57 | " master(\"yarn\").\n", 58 | " getOrCreate" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 66 | "\n", 67 | "**Using Spark SQL**\n", 68 | "\n", 69 | "```\n", 70 | "spark2-sql \\\n", 71 | " --master yarn \\\n", 72 | " --conf spark.ui.port=0 \\\n", 73 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 74 | "```\n", 75 | "\n", 76 | "**Using Scala**\n", 77 | "\n", 78 | "```\n", 79 | "spark2-shell \\\n", 80 | " --master yarn \\\n", 81 | " --conf spark.ui.port=0 \\\n", 82 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 83 | "```\n", 84 | "\n", 85 | "**Using Pyspark**\n", 86 | "\n", 87 | "```\n", 88 | "pyspark2 \\\n", 89 | " --master yarn \\\n", 90 | " --conf spark.ui.port=0 \\\n", 91 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 92 | "```" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "* Create Database **itversity_hr** (replace itversity with your OS User Name)\n", 100 | "* Create table **employees** in **itversity_hr** database.\n", 101 | "* Load data into the table.\n", 102 | "\n", 103 | "First let us start with creating the database." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "%%sql\n", 113 | "\n", 114 | "DROP DATABASE itversity_hr CASCADE" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "%%sql\n", 124 | "\n", 125 | "CREATE DATABASE itversity_hr" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "%%sql\n", 135 | "\n", 136 | "USE itversity_hr" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "%%sql\n", 146 | "\n", 147 | "SELECT current_database()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "As the database is created, let us go ahead and add table to it." 
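Before we create the table, we can optionally run a quick sanity check on the database we just created. This is only a sketch of an optional verification step (the reported HDFS location will vary based on your lab username) and is not required for the rest of the notebook.

```
DESCRIBE DATABASE EXTENDED itversity_hr
```

The output should show the database name along with the warehouse location configured for the Spark Session.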
155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "%%sql\n", 164 | "\n", 165 | "CREATE TABLE employees (\n", 166 | " employee_id int,\n", 167 | " first_name varchar(20),\n", 168 | " last_name varchar(25),\n", 169 | " email varchar(25),\n", 170 | " phone_number varchar(20),\n", 171 | " hire_date date,\n", 172 | " job_id varchar(10),\n", 173 | " salary decimal(8,2),\n", 174 | " commission_pct decimal(2,2),\n", 175 | " manager_id int,\n", 176 | " department_id int\n", 177 | ") ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t'" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Let us load the data and validate the table." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "%%sql\n", 194 | "\n", 195 | "LOAD DATA LOCAL INPATH '/data/hr_db/employees' \n", 196 | "INTO TABLE employees" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "%%sql\n", 206 | "\n", 207 | "SELECT * FROM employees LIMIT 10" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "%%sql\n", 217 | "\n", 218 | "SELECT employee_id, department_id, salary FROM employees LIMIT 10" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "%%sql\n", 228 | "\n", 229 | "SELECT count(1) FROM employees" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [] 238 | } 239 | ], 240 | "metadata": { 241 | "kernelspec": { 242 | "display_name": "Apache Toree - Scala", 243 | "language": "scala", 244 | "name": "apache_toree_scala" 245 | }, 246 | "language_info": { 247 | "codemirror_mode": "text/x-scala", 248 | "file_extension": ".scala", 249 | "mimetype": "text/x-scala", 250 | "name": "scala", 251 | "pygments_lexer": "scala", 252 | "version": "2.11.12" 253 | } 254 | }, 255 | "nbformat": 4, 256 | "nbformat_minor": 4 257 | } 258 | -------------------------------------------------------------------------------- /07_windowing_functions/03_overview_of_windowing_functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of Windowing Functions\n", 8 | "\n", 9 | "Let us get an overview of Analytics or Windowing Functions in Spark SQL." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%%HTML\n", 23 | "" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 
31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "val username = System.getProperty(\"user.name\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import org.apache.spark.sql.SparkSession\n", 49 | "\n", 50 | "val username = System.getProperty(\"user.name\")\n", 51 | "val spark = SparkSession.\n", 52 | " builder.\n", 53 | " config(\"spark.ui.port\", \"0\").\n", 54 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 55 | " enableHiveSupport.\n", 56 | " appName(s\"${username} | Spark SQL - Windowing Functions\").\n", 57 | " master(\"yarn\").\n", 58 | " getOrCreate" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 66 | "\n", 67 | "**Using Spark SQL**\n", 68 | "\n", 69 | "```\n", 70 | "spark2-sql \\\n", 71 | " --master yarn \\\n", 72 | " --conf spark.ui.port=0 \\\n", 73 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 74 | "```\n", 75 | "\n", 76 | "**Using Scala**\n", 77 | "\n", 78 | "```\n", 79 | "spark2-shell \\\n", 80 | " --master yarn \\\n", 81 | " --conf spark.ui.port=0 \\\n", 82 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 83 | "```\n", 84 | "\n", 85 | "**Using Pyspark**\n", 86 | "\n", 87 | "```\n", 88 | "pyspark2 \\\n", 89 | " --master yarn \\\n", 90 | " --conf spark.ui.port=0 \\\n", 91 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 92 | "```" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "* Aggregate Functions (`sum`, `min`, `max`, `avg`)\n", 100 | "* Window Functions (`lead`, `lag`, `first_value`, `last_value`)\n", 101 | "* Rank Functions (`rank`, `dense_rank`, `row_number`, etc.)\n", 102 | "* For all these functions we use the `OVER` clause.\n", 103 | "* For aggregate functions we typically use `PARTITION BY`.\n", 104 | "* For global ranking and windowing functions we can use `ORDER BY sorting_column` and for ranking and windowing within a partition or group we can use `PARTITION BY partition_column ORDER BY sorting_column`."
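To make these three families concrete, here is a minimal sketch of a single query that combines all of them. It only assumes the `employee_id`, `department_id` and `salary` columns of the `itversity_hr.employees` table prepared in the previous notebook; the cells below run similar queries for real.

```
-- Sketch only: one aggregate, one windowing and one ranking function in the same query
SELECT employee_id, department_id, salary,
    sum(salary) OVER (PARTITION BY department_id) AS department_salary_expense, -- aggregate
    lead(salary) OVER (PARTITION BY department_id ORDER BY salary DESC) AS lead_salary, -- windowing
    rank() OVER (PARTITION BY department_id ORDER BY salary DESC) AS rnk, -- ranking
    salary
FROM itversity_hr.employees
ORDER BY department_id, salary DESC
```

Note that each function takes its own `OVER` clause, so different partitioning and ordering strategies can be mixed within one SELECT.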
105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "%%sql\n", 114 | "\n", 115 | "USE itversity_hr" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "%%sql\n", 125 | "\n", 126 | "SELECT employee_id, department_id, salary FROM employees LIMIT 10" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "%%sql\n", 136 | "\n", 137 | "SELECT employee_id, department_id, salary,\n", 138 | " count(1) OVER (PARTITION BY department_id) AS employee_count,\n", 139 | " rank() OVER (ORDER BY salary DESC) AS rnk,\n", 140 | " lead(employee_id) OVER (PARTITION BY department_id ORDER BY salary DESC) AS lead_emp_id,\n", 141 | " lead(salary) OVER (PARTITION BY department_id ORDER BY salary DESC) AS lead_emp_sal\n", 142 | "FROM employees\n", 143 | "ORDER BY employee_id" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Apache Toree - Scala", 157 | "language": "scala", 158 | "name": "apache_toree_scala" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": "text/x-scala", 162 | "file_extension": ".scala", 163 | "mimetype": "text/x-scala", 164 | "name": "scala", 165 | "pygments_lexer": "scala", 166 | "version": "2.11.12" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 4 171 | } 172 | -------------------------------------------------------------------------------- /07_windowing_functions/09_overview_of_sub_queries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of Sub Queries\n", 8 | "\n", 9 | "Let us recap about Sub Queries." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 9, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "" 25 | ] 26 | }, 27 | "execution_count": 9, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%%HTML\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "val username = System.getProperty(\"user.name\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import org.apache.spark.sql.SparkSession\n", 60 | "\n", 61 | "val username = System.getProperty(\"user.name\")\n", 62 | "val spark = SparkSession.\n", 63 | " builder.\n", 64 | " config(\"spark.ui.port\", \"0\").\n", 65 | " config(\"spark.sql.warehouse.dir\", s\"/user/${username}/warehouse\").\n", 66 | " enableHiveSupport.\n", 67 | " appName(s\"${username} | Spark SQL - Windowing Functions\").\n", 68 | " master(\"yarn\").\n", 69 | " getOrCreate" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 77 | "\n", 78 | "**Using Spark SQL**\n", 79 | "\n", 80 | "```\n", 81 | "spark2-sql \\\n", 82 | " --master yarn \\\n", 83 | " --conf spark.ui.port=0 \\\n", 84 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 85 | "```\n", 86 | "\n", 87 | "**Using Scala**\n", 88 | "\n", 89 | "```\n", 90 | "spark2-shell \\\n", 91 | " --master yarn \\\n", 92 | " --conf spark.ui.port=0 \\\n", 93 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 94 | "```\n", 95 | "\n", 96 | "**Using Pyspark**\n", 97 | "\n", 98 | "```\n", 99 | "pyspark2 \\\n", 100 | " --master yarn \\\n", 101 | " --conf spark.ui.port=0 \\\n", 102 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 103 | "```" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "* We typically have Sub Queries in the **FROM** Clause.\n", 111 | "* We need not provide an alias for a Sub Query in the **FROM** Clause in Spark SQL. In earlier versions, you might have had to provide an alias for the Sub Query.\n", 112 | "* We quite often use Sub Queries on top of queries that use Analytics/Windowing Functions." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "%%sql\n", 122 | "\n", 123 | "SELECT * FROM (SELECT current_date)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "%%sql\n", 133 | "\n", 134 | "SELECT * FROM (SELECT current_date) AS q" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "Let us see a few more examples with respect to Sub Queries." 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "%%sql\n", 151 | "\n", 152 | "USE itversity_retail" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "%%sql\n", 162 | "\n", 163 | "SELECT * FROM (\n", 164 | " SELECT order_date, count(1) AS order_count\n", 165 | " FROM orders\n", 166 | " GROUP BY order_date\n", 167 | ") q\n", 168 | "LIMIT 10" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "```{note}\n", 176 | "Here is an example of how we can filter based on derived columns using a sub query. However, this can also be achieved with a direct query using `HAVING`. 
\n", 177 | "```" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "%%sql\n", 187 | "\n", 188 | "SELECT * FROM (\n", 189 | " SELECT order_date, count(1) AS order_count\n", 190 | " FROM orders\n", 191 | " GROUP BY order_date\n", 192 | ") q\n", 193 | "WHERE q.order_count > 10" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "%%sql\n", 203 | "\n", 204 | "SELECT order_date, count(1) AS order_count\n", 205 | "FROM orders\n", 206 | "GROUP BY order_date\n", 207 | " HAVING count(1) > 10" 208 | ] 209 | } 210 | ], 211 | "metadata": { 212 | "kernelspec": { 213 | "display_name": "Apache Toree - Scala", 214 | "language": "scala", 215 | "name": "apache_toree_scala" 216 | }, 217 | "language_info": { 218 | "codemirror_mode": "text/x-scala", 219 | "file_extension": ".scala", 220 | "mimetype": "text/x-scala", 221 | "name": "scala", 222 | "pygments_lexer": "scala", 223 | "version": "2.11.12" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 4 228 | } 229 | -------------------------------------------------------------------------------- /08_analyze_github_activity/02_download_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "IOPub data rate exceeded.\n", 13 | "The notebook server will temporarily stop sending output\n", 14 | "to the client in order to avoid crashing it.\n", 15 | "To change this limit, set the config variable\n", 16 | "`--NotebookApp.iopub_data_rate_limit`.\n", 17 | "\n", 18 | "Current values:\n", 19 | "NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n", 20 | "NotebookApp.rate_limit_window=3.0 (secs)\n", 21 | "\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "%%sh\n", 27 | "\n", 28 | "wget https://data.gharchive.org/2021-01-13-{0..23}.json.gz -P /data/gharchive/\n", 29 | "wget https://data.gharchive.org/2021-01-14-{0..23}.json.gz -P /data/gharchive/\n", 30 | "wget https://data.gharchive.org/2021-01-15-{0..23}.json.gz -P /data/gharchive/\n", 31 | "wget https://data.gharchive.org/2021-01-16-{0..23}.json.gz -P /data/gharchive/" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [] 40 | } 41 | ], 42 | "metadata": { 43 | "kernelspec": { 44 | "display_name": "Python 3", 45 | "language": "python", 46 | "name": "python3" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 3 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython3", 58 | "version": "3.6.8" 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 4 63 | } 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 itversity 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CCA 175 Spark and Hadoop Developer 2 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | title : Apache Spark using SQL 2 | author : Durga Gadiraju 3 | copyright : "ITVersity, Inc" 4 | 5 | repository: 6 | url : https://github.com/itversity/spark-sql 7 | html: 8 | use_repository_button : true 9 | use_issues_button : true 10 | use_edit_page_button : true 11 | extra_navbar : Subscribe to our Newsletter 12 | google_analytics_id : UA-80990145-12 13 | 14 | exclude_patterns : [_build, README.md, "**.ipynb_checkpoints"] 15 | 16 | execute: 17 | execute_notebooks : off -------------------------------------------------------------------------------- /_toc.yml: -------------------------------------------------------------------------------- 1 | - file: spark-sql.ipynb 2 | 3 | - file: 01_getting_started/01_getting_started 4 | sections: 5 | - file: 01_getting_started/01_getting_started 6 | - file: 01_getting_started/02_overview_of_spark_documentation 7 | - file: 01_getting_started/03_overview_of_spark_sql_cli 8 | - file: 01_getting_started/04_overview_of_spark_sql_properties 9 | - file: 01_getting_started/05_running_os_commands 10 | - file: 01_getting_started/06_understanding_warehouse_directory 11 | - file: 01_getting_started/07_managing_spark_metastore_databases 12 | - file: 01_getting_started/08_managing_spark_metastore_tables 13 | - file: 01_getting_started/09_retrieve_metadata_of_tables 14 | - file: 01_getting_started/10_role_of_spark_or_hive_metastore 15 | - file: 01_getting_started/11_exercise_getting_started 16 | 17 | - file: 02_overview_of_hdfs/01_Overview of HDFS 18 | sections: 19 | - file: 02_overview_of_hdfs/01_Overview of HDFS 20 | - file: 02_overview_of_hdfs/02_using_hdfs_cli 21 | - file: 02_overview_of_hdfs/03_getting_help_or_usage 22 | - file: 02_overview_of_hdfs/04_listing_hdfs_files 23 | - file: 02_overview_of_hdfs/05_managing_hdfs_directories 24 | - file: 02_overview_of_hdfs/06_copying_files_from_local_to_hdfs 25 | - file: 02_overview_of_hdfs/07_copying_files_from_hdfs_to_local 26 | - file: 02_overview_of_hdfs/08_copying_files_from_hdfs_to_hdfs 27 | - file: 02_overview_of_hdfs/09_previewing_data_in_hdfs_files 28 | - file: 02_overview_of_hdfs/10_getting_file_metadata 29 | - file: 02_overview_of_hdfs/11_hdfs_blocksize 30 | - file: 02_overview_of_hdfs/12_hdfs_replication_factor 31 | - file: 02_overview_of_hdfs/13_getting_hdfs_storage_usage 32 | - file: 
02_overview_of_hdfs/14_using_hdfs_stat_command 33 | - file: 02_overview_of_hdfs/15_hdfs_file_permissions 34 | - file: 02_overview_of_hdfs/16_overriding_properties 35 | - file: 02_overview_of_hdfs/understanding_linux_file_system_240 36 | 37 | - file: 03_basic_transformations/01_basic_transformations 38 | sections: 39 | - file: 03_basic_transformations/01_basic_transformations 40 | - file: 03_basic_transformations/02_spark_sql_overview 41 | - file: 03_basic_transformations/03_define_problem_statement 42 | - file: 03_basic_transformations/04_preparing_tables 43 | - file: 03_basic_transformations/05_projecting_data 44 | - file: 03_basic_transformations/06_filtering_data 45 | - file: 03_basic_transformations/07_joining_tables_inner 46 | - file: 03_basic_transformations/08_joining_tables_outer 47 | - file: 03_basic_transformations/09_aggregating_data 48 | - file: 03_basic_transformations/10_sorting_data 49 | - file: 03_basic_transformations/11_conclusion_final_solution 50 | - file: 03_basic_transformations/12_exercises_basic_sql_queries 51 | 52 | - file: 04_basic_ddl_and_dml/01_basic_ddl_and_dml 53 | sections: 54 | - file: 04_basic_ddl_and_dml/01_basic_ddl_and_dml 55 | - file: 04_basic_ddl_and_dml/02_create_spark_metastore_tables 56 | - file: 04_basic_ddl_and_dml/03_overview_of_data_types 57 | - file: 04_basic_ddl_and_dml/04_adding_comments 58 | - file: 04_basic_ddl_and_dml/05_loading_data_into_tables_local 59 | - file: 04_basic_ddl_and_dml/06_loading_data_into_tables_hdfs 60 | - file: 04_basic_ddl_and_dml/07_loading_data_append_and_overwrite 61 | - file: 04_basic_ddl_and_dml/08_creating_external_tables 62 | - file: 04_basic_ddl_and_dml/09_managed_vs_external_tables 63 | - file: 04_basic_ddl_and_dml/10_overview_of_file_formats 64 | - file: 04_basic_ddl_and_dml/11_dropping_tables_and_databases 65 | - file: 04_basic_ddl_and_dml/12_truncating_tables 66 | - file: 04_basic_ddl_and_dml/13_exercises_managing_tables 67 | 68 | - file: 05_dml_and_partitioning/01_dml_and_partitioning 69 | sections: 70 | - file: 05_dml_and_partitioning/01_dml_and_partitioning 71 | - file: 05_dml_and_partitioning/02_introduction_to_partitioning 72 | - file: 05_dml_and_partitioning/03_creating_tables_using_parquet 73 | - file: 05_dml_and_partitioning/04_load_vs_insert 74 | - file: 05_dml_and_partitioning/05_inserting_data_using_stage_table 75 | - file: 05_dml_and_partitioning/06_creating_partitioned_tables 76 | - file: 05_dml_and_partitioning/07_adding_partitions_to_tables 77 | - file: 05_dml_and_partitioning/08_loading_data_into_partitions 78 | - file: 05_dml_and_partitioning/09_inserting_data_into_partitions 79 | - file: 05_dml_and_partitioning/10_using_dynamic_partition_mode 80 | - file: 05_dml_and_partitioning/11_exercises_partitioned_tables 81 | 82 | - file: 06_predefined_functions/01_predefined_functions 83 | sections: 84 | - file: 06_predefined_functions/01_predefined_functions 85 | - file: 06_predefined_functions/02_overview_of_functions 86 | - file: 06_predefined_functions/03_validating_functions 87 | - file: 06_predefined_functions/04_string_manipulation_functions 88 | - file: 06_predefined_functions/05_date_manipulation_functions 89 | - file: 06_predefined_functions/06_overview_of_numeric_functions 90 | - file: 06_predefined_functions/07_data_type_conversion 91 | - file: 06_predefined_functions/08_handling_null_values 92 | - file: 06_predefined_functions/09_using_case_and_when 93 | - file: 06_predefined_functions/10_query_example_word_count 94 | - file: 06_predefined_functions/11_exercises_pre_defined_functions 95 
| 96 | - file: 07_windowing_functions/01_windowing_functions 97 | sections: 98 | - file: 07_windowing_functions/01_windowing_functions 99 | - file: 07_windowing_functions/02_prepare_database 100 | - file: 07_windowing_functions/03_overview_of_windowing_functions 101 | - file: 07_windowing_functions/04_aggregations_using_windowing_functions 102 | - file: 07_windowing_functions/05_using_lead_or_lag 103 | - file: 07_windowing_functions/06_getting_first_and_last_values 104 | - file: 07_windowing_functions/07_ranking_using_windowing_functions 105 | - file: 07_windowing_functions/08_order_of_execution_of_sql 106 | - file: 07_windowing_functions/09_overview_of_sub_queries 107 | - file: 07_windowing_functions/10_filtering_window_function_results 108 | - file: 07_windowing_functions/11_cumulative_or_moving_aggregations 109 | - file: 07_windowing_functions/12_exercises_windowing_functions 110 | -------------------------------------------------------------------------------- /spark-sql.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Apache Spark using SQL\n", 8 | "\n", 9 | "This course is primarily designed to help you learn Spark SQL as part of Data Engineering." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## About Spark SQL\n", 17 | "\n", 18 | "Spark SQL is one of the popular SQL frameworks in the Big Data landscape. It is an open source SQL engine based on Spark's distributed computing framework.\n", 19 | "\n", 20 | "Here are some of the common uses of Spark SQL.\n", 21 | "* Implement transformation rules as part of Data Engineering or Data Processing Pipelines.\n", 22 | "* Run Ad Hoc queries on top of data stored in storage systems such as HDFS, S3, Azure Blob, etc.\n", 23 | "* Connect BI tools such as Tableau, Power BI, etc. and run reports.\n", 24 | "\n", 25 | "You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS. You will be able to learn the same way as demonstrated." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Course Details\n", 33 | "\n", 34 | "This course is primarily designed to go through the SQL capabilities of Spark SQL. As part of this course you will be learning the following topics.\n", 35 | "\n", 36 | "* Getting Started\n", 37 | "* Basic Transformations\n", 38 | "* Basic DDL and DML\n", 39 | "* DML (Contd) and Partitioning\n", 40 | "* Predefined Functions\n", 41 | "* Windowing Functions" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Desired Audience\n", 49 | "\n", 50 | "Here is the desired audience for this course.\n", 51 | "\n", 52 | "* Experienced application developers to understand key aspects of Spark SQL.\n", 53 | "* Data Engineers and Data Warehouse Developers to understand key aspects of Spark SQL to build batch or streaming pipelines.\n", 54 | "* Testers to improve their scripting abilities to validate data in the files, tables etc."
55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Prerequisites\n", 62 | "\n", 63 | "Here are the prerequisites before signing up for the course.\n", 64 | "\n", 65 | "````{panels}\n", 66 | "**Logistics**\n", 67 | "\n", 68 | "* Computer with decent configuration\n", 69 | " * At least 4 GB RAM\n", 70 | " * 8 GB RAM is highly desired\n", 71 | "* Chrome Browser\n", 72 | "* High Speed Internet\n", 73 | "\n", 74 | "---\n", 75 | "\n", 76 | "**Desired Skills**\n", 77 | "\n", 78 | "* Engineering or Science Degree\n", 79 | "* Ability to use computer\n", 80 | "* Knowledge or working experience with databases is highly desired\n", 81 | "\n", 82 | "````" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [] 91 | } 92 | ], 93 | "metadata": { 94 | "kernelspec": { 95 | "display_name": "Python 3", 96 | "language": "python", 97 | "name": "python3" 98 | }, 99 | "language_info": { 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 3 103 | }, 104 | "file_extension": ".py", 105 | "mimetype": "text/x-python", 106 | "name": "python", 107 | "nbconvert_exporter": "python", 108 | "pygments_lexer": "ipython3", 109 | "version": "3.6.8" 110 | } 111 | }, 112 | "nbformat": 4, 113 | "nbformat_minor": 4 114 | } 115 | --------------------------------------------------------------------------------