├── .gitignore ├── 01_getting_started_pyspark ├── 01_getting_started_pyspark.ipynb ├── 02_platforms_to_practice.ipynb ├── 03_setup_spark_locally_windows.ipynb ├── 04_setup_spark_locally_mac.ipynb ├── 05_setup_spark_locally_ubuntu.ipynb ├── 06_using_itversity_labs.ipynb ├── 07_using_google_colab.ipynb ├── 08_overview_of_filesystems.ipynb ├── 09_different_spark_modules.ipynb ├── 10_spark_cluster_manager_types.ipynb ├── 11_launching_pyspark_cli.ipynb ├── 12_using_jupyter_lab_interface.ipynb └── 13_word_count_using_spark.ipynb ├── 02_quick_recap_of_python ├── 01_quick_recap_of_python.ipynb ├── 02_data_engineering_life_cycle.ipynb ├── 03_getting_started_python_and_pip.ipynb ├── 04_python_cli_or_jupyter_notebook.ipynb ├── 05_basic_programming_constructs.ipynb ├── 06_developing_functions.ipynb ├── 07_lambda_functions.ipynb ├── 08_overview_of_collections.ipynb ├── 09_overview_of_pandas_dataframes.ipynb ├── 10_limitations_of_pandas.ipynb ├── 11_development_life_cycle.ipynb └── 12_exercises_recap_of_python.ipynb ├── 03_data_processing_overview ├── 01_data_processing_overview.ipynb ├── 02_prerequisites_and_objectives.ipynb ├── 03_starting_spark_context.ipynb ├── 04_overview_of_spark_read_apis.ipynb ├── 05_understand_airlines_data.ipynb ├── 06_inferring_schema.ipynb ├── 07_previewing_airlines_data.ipynb ├── 08_overview_of_dataframe_apis.ipynb ├── 09_overview_of_functions.ipynb ├── 10_overview_of_spark_write_apis.ipynb ├── 11_reorganizing_airlines_data.ipynb ├── 12_previewing_reorganized_data.ipynb ├── 13_analyze_and_understand_data.ipynb └── 14_conclusion_data_processing_overview.ipynb ├── 04_processing_column_data ├── 01_processing_column_data.ipynb ├── 02_predefined_functions.ipynb ├── 03_create_dummy_dataframes.ipynb ├── 04_categories_of_functions.ipynb ├── 05_special_functions_col_and_lit.ipynb ├── 06_common_string_manipulation_functions.ipynb ├── 07_extracting_strings_using_substring.ipynb ├── 08_extracting_strings_using_split.ipynb ├── 09_padding_characters_around_strings.ipynb ├── 10_trimming_characters_from_strings.ipynb ├── 11_date_and_time_manipulation_functions.ipynb ├── 12_date_and_time_arithmetic.ipynb ├── 13_using_date_and_time_trunc_functions.ipynb ├── 14_date_and_time_extract_functions.ipynb ├── 15_using_to_date_and_to_timestamp.ipynb ├── 16_using_date_format_function.ipynb ├── 17_dealing_with_unix_timestamp.ipynb ├── 18_dealing_with_nulls.ipynb ├── 19_using_case_and_when.ipynb ├── 20_conclusion_predefined_functions.ipynb └── 21_exercises_predefined_functions.ipynb ├── 05_basic_transformations ├── 01_basic_transformations.ipynb ├── 02_overview_of_basic_transformations.ipynb ├── 03_data_frame_for_basic_transformations.ipynb ├── 04_basic_filtering_of_data.ipynb ├── 05_filtering_example_using_dates.ipynb ├── 06_boolean_operators.ipynb ├── 07_using_in_operator_or_isin_function.ipynb ├── 08_using_like_operator_or_function.ipynb ├── 09_using_between_operator.ipynb ├── 10_dealing_with_nulls_while_filtering.ipynb ├── 11_total_aggregations.ipynb ├── 12_aggregate_data_using_groupby.ipynb ├── 13_aggregate_data_using_rollup.ipynb ├── 14_aggregate_data_using_cube.ipynb ├── 15_overview_of_sorting_data_frames.ipynb ├── 16_solution_problem_1.ipynb ├── 17_solution_problem_2.ipynb └── 18_solution_problem_3.ipynb ├── 06_joining_data_sets ├── 01_joining_data_sets.ipynb ├── 02_preparing_data_sets_for_joins.ipynb ├── 03_analyze_data_sets_for_joins.ipynb ├── 04_problem_statements_for_joins.ipynb ├── 05_overview_of_joins.ipynb ├── 06_using_inner_joins.ipynb ├── 07_left_or_right_outer_join.ipynb ├── 
08_solutions_problem_1.ipynb ├── 09_solutions_problem_2.ipynb ├── 10_solutions_problem_3.ipynb ├── 11_solutions_problem_4.ipynb ├── 12_solutions_problem_5.ipynb ├── 13_solutions_problem_6.ipynb ├── 14_solutions_problem_7.ipynb └── 15_solutions_problem_8.ipynb ├── 07_windowing_functions ├── 01_windowing_functions.ipynb ├── 02_overview_of_windowing_functions.ipynb ├── 03_aggregate_functions.ipynb ├── 04_using_rowsBetween_and_rangeBetween.ipynb ├── 05_ranking_functions.ipynb ├── 06_using_lead_or_lag.ipynb ├── 07_using_first_and_last_functions.ipynb └── 10_aggregate_functions_examples.ipynb ├── 08_spark_metastore ├── 01_spark_metastore.ipynb ├── 02_overview_of_spark_metastore.ipynb ├── 03_exploring_spark_catalog.ipynb ├── 04_creating_metastore_tables_using_catalog.ipynb ├── 05_inferring_schema_for_tables.ipynb ├── 06_define_schema_for_tables_using_structtype.ipynb ├── 07_inserting_into_existing_tables.ipynb ├── 08_read_and_process_data_from_metastore_tables.ipynb ├── 09_creating_partitioned_tables.ipynb ├── 10_saving_as_partitioned_tables.ipynb ├── 11_creating_temp_views.ipynb └── 12_using_spark_sql.ipynb ├── 09_analyzing_gharchive_data ├── 02_download_data.ipynb ├── 03_copy_to_hdfs_landing.ipynb ├── 04_create_external_table.ipynb ├── 05_overview_of_json.ipynb ├── 06_get_new_repositories.ipynb ├── 07_get_repository_pushes.ipynb └── 99_ghdata_queries.ipynb ├── 12_special_data_types ├── 02 Create Tables with Special Types.ipynb ├── 03 Create Data Frame with Special Types.ipynb ├── 04 Exploding Arrays into Records.ipynb ├── 05 Generating Arrays from Strings.ipynb ├── 06 Processing Arrays.ipynb └── 07 Projecting Struct and Map Columns.ipynb ├── CNAME ├── GitHubActivity.ipynb ├── LICENSE ├── README.md ├── _config.yml ├── _toc.yml ├── generate_toc.ipynb └── mastering-pyspark.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /01_getting_started_pyspark/01_getting_started_pyspark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting Started" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Setup Spark Locally - Ubuntu\n", 15 | "\n", 16 | "Let us setup Spark Locally on Ubuntu.\n", 17 | "\n", 18 | "* Install the latest version of Anaconda.\n", 19 | "* Make sure Jupyter Notebook is setup and validated.\n", 20 | "* Setup Spark and Validate.\n", 21 | "* Setup Environment Variables to integrate Pyspark with Jupyter Notebook.\n", 22 | "* Launch Jupyter Notebook using the `pyspark` command.\n", 23 | "* Setup PyCharm (IDE) for application development." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Setup Spark Locally - Mac\n", 31 | "\n", 32 | "Let us setup Spark Locally on Mac.\n", 33 | "\n", 34 | "* Install the latest version of Anaconda.\n", 35 | "* Make sure Jupyter Notebook is setup and validated.\n", 36 | "* Setup Spark and Validate.\n", 37 | "* Setup Environment Variables to integrate Pyspark with Jupyter Notebook (see the sketch below for the variables typically involved).\n", 38 | "* Launch Jupyter Notebook using the `pyspark` command.\n", 39 | "* Setup PyCharm (IDE) for application development."
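The setup lists above mention environment variables for integrating Pyspark with Jupyter Notebook without naming them; here is a minimal sketch of the variables typically used so that the `pyspark` command launches Jupyter Notebook instead of the plain Python shell. The values are illustrative and are normally exported in the shell profile (for example `~/.bashrc` or `~/.zshrc`) rather than set from Python.

```python
# Sketch only: typical environment variables used to integrate PySpark with
# Jupyter Notebook. Setting them via os.environ here just documents the
# names and typical values; exporting them in the shell profile is the usual approach.
import os

os.environ['PYSPARK_PYTHON'] = 'python3'               # Python used by executors
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'        # driver launches Jupyter
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'notebook'  # i.e. `jupyter notebook`
```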
40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Signing up for ITVersity Labs\n", 47 | "\n", 48 | "Here are the steps for signing up for ITVersity Labs.\n", 49 | "* Go to https://labs.itversity.com\n", 50 | "* Sign up on our website\n", 51 | "* Purchase lab access\n", 52 | "* Go to the lab page and create a lab account\n", 53 | "* Login and practice" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Using ITVersity Labs\n", 61 | "\n", 62 | "Let us understand how to submit Spark Jobs in ITVersity Labs.\n", 63 | "\n", 64 | "* You can either use the Jupyter based environment or `pyspark` in the terminal to submit jobs in ITVersity Labs.\n", 65 | "* You can also submit Spark jobs using the `spark-submit` command.\n", 66 | "* As we are using Python, we can also use the `help` command to get the documentation - for example `help(spark.read.csv)`" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "## Interacting with File Systems\n", 74 | "\n", 75 | "Let us understand how to interact with the file system using the %fs command from a Databricks Notebook.\n", 76 | "\n", 77 | "* We can access datasets using the %fs magic command in a Databricks notebook\n", 78 | "* By default, we will see files under dbfs\n", 79 | "* We can list the files using the ls command - e.g.: `%fs ls`\n", 80 | "* Databricks provides a lot of datasets for free under databricks-datasets\n", 81 | "* If the cluster is integrated with AWS or Azure Blob Storage, we can access files by specifying the appropriate protocol (e.g.: s3:// for S3)\n", 82 | "* List of commands available under `%fs`\n", 83 | " * Copying files or directories `-cp`\n", 84 | " * Moving files or directories `-mv`\n", 85 | " * Creating directories `-mkdirs`\n", 86 | " * Deleting files and directories `-rm`\n", 87 | " * We can copy or delete directories recursively using `-r` or `--recursive`" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Getting File Metadata\n", 95 | "\n", 96 | "Let us review the source location to get the number of files and the size of the data we are going to process.\n", 97 | "\n", 98 | "* Location of airlines data - dbfs:/databricks-datasets/airlines\n", 99 | "* We can get the first 1000 files using %fs ls dbfs:/databricks-datasets/airlines\n", 100 | "* The location contains 1919 files; however, we will not be able to see all the details using the %fs command.\n", 101 | "* Databricks File System commands do not have the capability to show file metadata such as size in detail.\n", 102 | "* When the Spark cluster is started, it will create 2 objects - spark and sc\n", 103 | "* sc is of type SparkContext and spark is of type SparkSession\n", 104 | "* Spark uses HDFS APIs to interact with the file system and we can access HDFS APIs using sc._jsc and sc._jvm to get file metadata.\n", 105 | "* Here are the steps to get the file metadata.\n", 106 | " * Get the Hadoop Configuration using `sc._jsc.hadoopConfiguration()` - let's say `conf`\n", 107 | " * We can pass conf to `sc._jvm.org.apache.hadoop.fs.FileSystem.get` to get a FileSystem object - let's say `fs`\n", 108 | " * We can build a `path` object by passing the path as a string to `sc._jvm.org.apache.hadoop.fs.Path`\n", 109 | " * We can invoke `listStatus` on fs by passing path, which will return an array of FileStatus objects - let's say files.
\n", 110 | " * Each `FileStatus` object have all the metadata of each file.\n", 111 | " * We can use `len` on files to get number of files.\n", 112 | " * We can use `>getLen` on each `FileStatus` object to get the size of each file. \n", 113 | " * Cumulative size of all files can be achieved using `sum(map(lambda file: file.getLen(), files))`\n", 114 | " \n", 115 | "Let us first get list of files " 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "%fs ls dbfs:/databricks-datasets/airlines" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "Here is the consolidated script to get number of files and cumulative size of all files in a given folder." 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "conf = sc._jsc.hadoopConfiguration()\n", 141 | "fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(conf)\n", 142 | "path = sc._jvm.org.apache.hadoop.fs.Path(\"dbfs:/databricks-datasets/airlines\")\n", 143 | "\n", 144 | "files = fs.listStatus(path)\n", 145 | "sum(map(lambda file: file.getLen(), files))/1024/1024/1024" 146 | ] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Pyspark 2", 152 | "language": "python", 153 | "name": "pyspark2" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.6.8" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 4 170 | } 171 | -------------------------------------------------------------------------------- /01_getting_started_pyspark/02_platforms_to_practice.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Platforms to Practice\n", 8 | "\n", 9 | "Let us understand different platforms we can leverage to practice Apache Spark using Python.\n", 10 | "\n", 11 | "* Local Setup\n", 12 | "* Databricks Platform\n", 13 | "* Setting up your own cluster\n", 14 | "* ITVersity Labs" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.6.8" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 4 46 | } 47 | -------------------------------------------------------------------------------- /01_getting_started_pyspark/03_setup_spark_locally_windows.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Setup Spark Locally - Windows\n", 8 | "\n", 9 | "Let us understand how to setup Spark locally on Windows. 
Even though it can be setup directly, we would recommend to use virtual machine.\n", 10 | "\n", 11 | "* Here are the pre-requisites to setup Spark locally on Windows using Virtual Machine.\n", 12 | "* Make sure to setup Virtual Box and then Vagrant.\n", 13 | "* " 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [] 22 | } 23 | ], 24 | "metadata": { 25 | "kernelspec": { 26 | "display_name": "Python 3", 27 | "language": "python", 28 | "name": "python3" 29 | }, 30 | "language_info": { 31 | "codemirror_mode": { 32 | "name": "ipython", 33 | "version": 3 34 | }, 35 | "file_extension": ".py", 36 | "mimetype": "text/x-python", 37 | "name": "python", 38 | "nbconvert_exporter": "python", 39 | "pygments_lexer": "ipython3", 40 | "version": "3.6.8" 41 | } 42 | }, 43 | "nbformat": 4, 44 | "nbformat_minor": 4 45 | } 46 | -------------------------------------------------------------------------------- /01_getting_started_pyspark/04_setup_spark_locally_mac.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Setup Spark Locally - Mac\n", 8 | "\n", 9 | "Let us understand how to setup Spark locally on Mac.\n", 10 | "\n", 11 | "* Here are the pre-requisites to setup Spark Locally on mac.\n", 12 | " * At least 8 GB RAM is highly desired.\n", 13 | " * Make sure JDK 1.8 is setup\n", 14 | " * Make sure to have Python 3. If you do not have it, you can install it using **homebrew**.\n", 15 | "* Here are the steps to setup Pyspark and validate.\n", 16 | " * Create Python Virtual Environment - `python3 -m venv spark-venv`.\n", 17 | " * Activate the virtual environment - `source spark-venv/bin/activate`.\n", 18 | " * Run `pip install pyspark==2.4.6` to install Spark 2.4.6.\n", 19 | " * Run `pyspark` to launch Spark CLI using Python as programming language.\n", 20 | "* Here are some of the limitations related to running Spark locally.\n", 21 | " * You will be able to run Spark using local mode by default. But you will not be able to get the feel of Big Data.\n", 22 | " * Actual production implementations will be on multinode cluters, which run using YARN or Spark Stand Alone or Mesos.\n", 23 | " * You can understand the development process but you will not be able to explore best practices to build effective large scale data engineering solutions." 
24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 3", 37 | "language": "python", 38 | "name": "python3" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 3 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython3", 50 | "version": "3.6.8" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 4 55 | } 56 | -------------------------------------------------------------------------------- /01_getting_started_pyspark/05_setup_spark_locally_ubuntu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Setup Spark Locally - Ubuntu\n", 8 | "\n", 9 | "Let us understand how to setup Spark locally on Ubuntu.\n", 10 | "\n", 11 | "* Here are the pre-requisites to setup Spark Locally on Ubuntu.\n", 12 | " * At least 8 GB RAM is highly desired.\n", 13 | " * Make sure JDK 1.8 is setup\n", 14 | " * Make sure to have Python 3. If you do not have it, you can install it using **apt** or **snap**.\n", 15 | "* Here are the steps to setup Pyspark and validate.\n", 16 | " * Create Python Virtual Environment - `python3 -m venv spark-venv`.\n", 17 | " * Activate the virtual environment - `source spark-venv/bin/activate`.\n", 18 | " * Run `pip install pyspark==2.4.6` to install Spark 2.4.6.\n", 19 | " * Run `pyspark` to launch Spark CLI using Python as programming language.\n", 20 | "* Here are some of the limitations related to running Spark locally.\n", 21 | " * You will be able to run Spark using local mode by default. But you will not be able to get the feel of Big Data.\n", 22 | " * Actual production implementations will be on multinode cluters, which run using YARN or Spark Stand Alone or Mesos.\n", 23 | " * You can understand the development process but you will not be able to explore best practices to build effective large scale data engineering solutions." 
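To validate a local setup like the Mac and Ubuntu ones described above, a quick sanity check can be run once `pyspark==2.4.6` is installed in the active virtual environment. This is a minimal sketch; the app name and sample data are illustrative.

```python
# Minimal local-mode sanity check for a pip-installed PySpark.
from pyspark.sql import SparkSession

spark = SparkSession. \
    builder. \
    master('local[*]'). \
    appName('Validate Local Setup'). \
    getOrCreate()

print(spark.version)   # should print 2.4.6 for this install
spark.range(5).show()  # tiny DataFrame to confirm jobs actually run
spark.stop()
```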
24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 3", 37 | "language": "python", 38 | "name": "python3" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 3 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython3", 50 | "version": "3.6.8" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 4 55 | } 56 | -------------------------------------------------------------------------------- /01_getting_started_pyspark/06_using_itversity_labs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Using ITVersity Labs\n", 8 | "\n", 9 | "Let me demonstrate how to use ITVersity Labs to practice Spark.\n", 10 | "* Once you sign up for the lab, you will get access to the cluster via Jupyter based environment.\n", 11 | "* You can connect to the labs using browser and practice in interactive fashion.\n", 12 | "* You can either use our material or upload your material to practice using Jupyter based environment.\n", 13 | "* Here are some of the advantages of using our labs.\n", 14 | " * Interactive or Integrated learning experience.\n", 15 | " * Access to multi node cluster.\n", 16 | " * Pre-configured data sets as well as databases.\n", 17 | " * You will be focused on the learning rather than troubleshooting the issues." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [] 26 | } 27 | ], 28 | "metadata": { 29 | "kernelspec": { 30 | "display_name": "Python 3", 31 | "language": "python", 32 | "name": "python3" 33 | }, 34 | "language_info": { 35 | "codemirror_mode": { 36 | "name": "ipython", 37 | "version": 3 38 | }, 39 | "file_extension": ".py", 40 | "mimetype": "text/x-python", 41 | "name": "python", 42 | "nbconvert_exporter": "python", 43 | "pygments_lexer": "ipython3", 44 | "version": "3.6.8" 45 | } 46 | }, 47 | "nbformat": 4, 48 | "nbformat_minor": 4 49 | } 50 | -------------------------------------------------------------------------------- /01_getting_started_pyspark/07_using_google_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | } 10 | ], 11 | "metadata": { 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.6.8" 28 | } 29 | }, 30 | "nbformat": 4, 31 | "nbformat_minor": 4 32 | } 33 | -------------------------------------------------------------------------------- /01_getting_started_pyspark/08_overview_of_filesystems.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of File Systems\n", 8 | "\n", 9 | 
"Let us get an overview of File Systems you can work with while learning Spark.\n", 10 | "\n", 11 | "* Here are the file systems that can be used to learn Spark.\n", 12 | " * Local file system when you run in local mode.\n", 13 | " * Hadoop Distributed File System.\n", 14 | " * AWS S3\n", 15 | " * Azure Blob\n", 16 | " * GCP Cloud Storage\n", 17 | " * and other supported file systems.\n", 18 | "* It is quite straight forward to learn underlying file system. You just need to focus on the following:\n", 19 | " * Copy files into the file system from different sources.\n", 20 | " * Validate files in the file system.\n", 21 | " * Ability to preview the data using Spark related APIs or direct tools.\n", 22 | " * Delete files from the file system.\n", 23 | "* Typically we ingest data into underlying file system using tools such as Informatica, Talend, NiFi, Kafka, custom applications etc. " 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 3", 37 | "language": "python", 38 | "name": "python3" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 3 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython3", 50 | "version": "3.6.8" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 4 55 | } 56 | -------------------------------------------------------------------------------- /01_getting_started_pyspark/09_different_spark_modules.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Different Spark Modules\n", 8 | "\n", 9 | "Let us understand details about different spark modules. We will be focusing on high level modules that are made available since Spark 2.2 and later.\n", 10 | "* Here are the different Spark Modules.\n", 11 | " * Spark Core - RDD and Map Reduce APIs\n", 12 | " * Spark Data Frames and Spark SQL\n", 13 | " * Spark Structured Streaming\n", 14 | " * Spark MLLib (Data Frame based)\n", 15 | "* As engineers, we need not focus too much on Spark Core libraries to build Data Pipelines. We should focus on Spark Data Frames as well as Spark SQL." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [] 24 | } 25 | ], 26 | "metadata": { 27 | "kernelspec": { 28 | "display_name": "Python 3", 29 | "language": "python", 30 | "name": "python3" 31 | }, 32 | "language_info": { 33 | "codemirror_mode": { 34 | "name": "ipython", 35 | "version": 3 36 | }, 37 | "file_extension": ".py", 38 | "mimetype": "text/x-python", 39 | "name": "python", 40 | "nbconvert_exporter": "python", 41 | "pygments_lexer": "ipython3", 42 | "version": "3.6.8" 43 | } 44 | }, 45 | "nbformat": 4, 46 | "nbformat_minor": 4 47 | } 48 | -------------------------------------------------------------------------------- /01_getting_started_pyspark/10_spark_cluster_manager_types.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Spark Cluster Manager Types\n", 8 | "\n", 9 | "Let us get an overview of different Spark Cluster Managers on which typically Spark Applications are deployed.\n", 10 | "\n", 11 | "* Here are the supported cluster manager types.\n", 12 | " * Local (used for development and unit testing).\n", 13 | " * Stand Alone\n", 14 | " * YARN\n", 15 | " * Mesos\n", 16 | "* Here are the popular distributions which use YARN to deploy Spark Applications.\n", 17 | " * Cloudera\n", 18 | " * AWS EMR\n", 19 | " * Google Dataproc\n", 20 | " * Hortonworks\n", 21 | " * MapR\n", 22 | "* Databricks uses Stand Alone for running or deploying Spark Jobs." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.6.8" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 4 54 | } 55 | -------------------------------------------------------------------------------- /01_getting_started_pyspark/11_launching_pyspark_cli.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Launching Spark CLI\n", 8 | "\n", 9 | "Let us understand how to launch Pyspark CLI. 
We will be covering both local as well as our labs.\n", 10 | "* Once pyspark is installed you can run `pyspark` to launch Pyspark CLI.\n", 11 | "* In our labs, we have integrated Spark with Hadoop and Hive and you can interact with Hive Database as well.\n", 12 | "* You need to run the following command to launch Pyspark using Terminal.\n", 13 | "\n", 14 | "```shell\n", 15 | "export PYSPARK_PYTHON=python3\n", 16 | "export SPARK_MAJOR_VERSION=2\n", 17 | "pyspark --master yarn \\\n", 18 | " --conf spark.ui.port=0 \\\n", 19 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 20 | "```\n", 21 | "\n", 22 | "* Alternatively, you can also run the following command to launch Pyspark CLI.\n", 23 | "\n", 24 | "```shell\n", 25 | "pyspark --master yarn \\\n", 26 | " --conf spark.ui.port=0 \\\n", 27 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 28 | "```\n", 29 | "\n", 30 | "* Here is what happens when you launch Pyspark CLI.\n", 31 | " * Launches Python CLI.\n", 32 | " * All Spark related libraries will be loaded.\n", 33 | " * Creates SparkSession as well as SparkContext objects.\n", 34 | " * It facilitates us to explore Spark APIs in interactive fashion." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "Python 3", 48 | "language": "python", 49 | "name": "python3" 50 | }, 51 | "language_info": { 52 | "codemirror_mode": { 53 | "name": "ipython", 54 | "version": 3 55 | }, 56 | "file_extension": ".py", 57 | "mimetype": "text/x-python", 58 | "name": "python", 59 | "nbconvert_exporter": "python", 60 | "pygments_lexer": "ipython3", 61 | "version": "3.6.8" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 4 66 | } 67 | -------------------------------------------------------------------------------- /01_getting_started_pyspark/12_using_jupyter_lab_interface.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Using Jupyter Lab Interface\n", 8 | "\n", 9 | "As part of our labs, you can learn Spark using Jupyter based interface.\n", 10 | "* Make sure you are using right kernel **Pyspark 2** (top right corner of the notebook).\n", 11 | "* Use below code to start the Spark Session object so that you can learn Spark in interactive fashion." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from pyspark.sql import SparkSession" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import getpass\n", 30 | "username = getpass.getuser()" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 4, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "spark = SparkSession. \\\n", 40 | " builder. \\\n", 41 | " config('spark.ui.port', '0'). \\\n", 42 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 43 | " enableHiveSupport(). \\\n", 44 | " appName(f'{username} | Getting Started'). \\\n", 45 | " master('yarn'). 
\\\n", 46 | " getOrCreate()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [] 55 | } 56 | ], 57 | "metadata": { 58 | "kernelspec": { 59 | "display_name": "Pyspark 2", 60 | "language": "python", 61 | "name": "pyspark2" 62 | }, 63 | "language_info": { 64 | "codemirror_mode": { 65 | "name": "ipython", 66 | "version": 3 67 | }, 68 | "file_extension": ".py", 69 | "mimetype": "text/x-python", 70 | "name": "python", 71 | "nbconvert_exporter": "python", 72 | "pygments_lexer": "ipython3", 73 | "version": "3.6.8" 74 | } 75 | }, 76 | "nbformat": 4, 77 | "nbformat_minor": 4 78 | } 79 | -------------------------------------------------------------------------------- /01_getting_started_pyspark/13_word_count_using_spark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | } 10 | ], 11 | "metadata": { 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.6.8" 28 | } 29 | }, 30 | "nbformat": 4, 31 | "nbformat_minor": 4 32 | } 33 | -------------------------------------------------------------------------------- /02_quick_recap_of_python/01_quick_recap_of_python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Quick Recap of Python\n", 8 | "\n", 9 | "Let us quickly recap of some of the core programming concepts of Python before we get into Spark.\n", 10 | "* Data Engineering Life Cycle\n", 11 | "* Getting Started - Python and pip\n", 12 | "* Python CLI or Jupyter Notebook\n", 13 | "* Basic Programming Constructs\n", 14 | "* Developing Functions\n", 15 | "* Lambda Functions\n", 16 | "* Overview of Collections\n", 17 | "* Overview of Pandas Data Frames\n", 18 | "* Development Life Cycle" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [] 27 | } 28 | ], 29 | "metadata": { 30 | "kernelspec": { 31 | "display_name": "Pyspark 2", 32 | "language": "python", 33 | "name": "pyspark2" 34 | }, 35 | "language_info": { 36 | "codemirror_mode": { 37 | "name": "ipython", 38 | "version": 3 39 | }, 40 | "file_extension": ".py", 41 | "mimetype": "text/x-python", 42 | "name": "python", 43 | "nbconvert_exporter": "python", 44 | "pygments_lexer": "ipython3", 45 | "version": "3.6.8" 46 | }, 47 | "pycharm": { 48 | "stem_cell": { 49 | "cell_type": "raw", 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "source": [] 54 | } 55 | } 56 | }, 57 | "nbformat": 4, 58 | "nbformat_minor": 4 59 | } 60 | -------------------------------------------------------------------------------- /02_quick_recap_of_python/02_data_engineering_life_cycle.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Data Engineering Life Cycle\n", 8 | "\n", 9 | "Let us first understand the Data Engineering Life Cycle. 
We typically read the data, process it by applying business rules and write the data back to different targets\n", 10 | "* Read the data from different sources.\n", 11 | " * Files\n", 12 | " * Databases\n", 13 | " * Mainframes\n", 14 | " * APIs\n", 15 | "* Processing the data\n", 16 | " * Row Level Transformations\n", 17 | " * Aggregations\n", 18 | " * Sorting\n", 19 | " * Ranking\n", 20 | " * Joining multiple data sets\n", 21 | "* Write data to different targets.\n", 22 | " * Files\n", 23 | " * Databases\n", 24 | " * Mainframes\n", 25 | " * APIs" 26 | ] 27 | } 28 | ], 29 | "metadata": { 30 | "kernelspec": { 31 | "display_name": "Python 3", 32 | "language": "python", 33 | "name": "python3" 34 | }, 35 | "language_info": { 36 | "codemirror_mode": { 37 | "name": "ipython", 38 | "version": 3 39 | }, 40 | "file_extension": ".py", 41 | "mimetype": "text/x-python", 42 | "name": "python", 43 | "nbconvert_exporter": "python", 44 | "pygments_lexer": "ipython3", 45 | "version": "3.6.8" 46 | } 47 | }, 48 | "nbformat": 4, 49 | "nbformat_minor": 4 50 | } 51 | -------------------------------------------------------------------------------- /02_quick_recap_of_python/03_getting_started_python_and_pip.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Getting Started - Python and pip\n", 8 | "\n", 9 | "Let us get started with Python and pip.\n", 10 | "* Python is a programming language. We can install Python from [python.org](https://www.python.org).\n", 11 | "* Python contain certain functionality in the form of functions out of the box. However, over a period of time the open source community of Python have come up with libraries for the common and reusable functionality.\n", 12 | "* Those libraries can be installed using `pip`. It is nothing but Python's package manager.\n", 13 | "* You can install, upgrade, uninstall, list the details of installed libraries using `pip`.\n", 14 | "* Here is one of the common practice we follow with respect to starting a new application using Python as programming language.\n", 15 | " * Make sure right version of Python is installed for your project/application.\n", 16 | " * Create virtual environment for your project.\n", 17 | " * Activate and install required libraries.\n", 18 | " * Develop the required functionality.\n", 19 | " * We can deactivate the virtual environment or activate other virtual environment if we have to switch over to another Python application.\n", 20 | "\n", 21 | "```shell\n", 22 | "python3 -m venv demo\n", 23 | "source demo/bin/activate\n", 24 | "pip install pandas\n", 25 | "```\n", 26 | "\n", 27 | "```{note}\n", 28 | "As part of the demo we have used CLI. 
However, in actual projects we typically use IDEs such as Pycharm.\n", 29 | "```" 30 | ] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.6.8" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 4 54 | } 55 | -------------------------------------------------------------------------------- /02_quick_recap_of_python/04_python_cli_or_jupyter_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Python CLI or Jupyter Notebook\n", 8 | "\n", 9 | "We can use Python CLI or Jupyter Notebook to explore APIs.\n", 10 | "* We can launch Python CLI using `python` command.\n", 11 | "* We can launch the Jupyter Notebook using the `jupyter notebook` command.\n", 12 | "* A web service will be started on port number 8888 by default.\n", 13 | "* We can go to the browser and connect to the web server using IP address and port number.\n", 14 | "* We should be able to explore code in interactive fashion.\n", 15 | "* We can issue magic commands such as %%sh to run shell commands, %%md to document using markdown etc." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "### Tasks\n", 23 | "\n", 24 | "Let us perform these tasks to just recollect how to use Python CLI or Jupyter Notebook.\n", 25 | "* Create variables `i` and `j` assigning `10` and `20.5` respectively." 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "i = 10\n", 35 | "j = 20.5" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "* Add the values and assign result to `res`." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "30.5\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "res = i + j\n", 60 | "print(str(res))" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "* Get the `type` of `i`, `j` and `res`." 
68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "int" 79 | ] 80 | }, 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "type(i)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 4, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "text/plain": [ 98 | "float" 99 | ] 100 | }, 101 | "execution_count": 4, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "type(j)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "float" 119 | ] 120 | }, 121 | "execution_count": 5, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "type(res)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "* Get the help on `int`." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "help(int)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "\u001b[0;31mInit signature:\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m/\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 155 | "\u001b[0;31mDocstring:\u001b[0m \n", 156 | "int(x=0) -> integer\n", 157 | "int(x, base=10) -> integer\n", 158 | "\n", 159 | "Convert a number or string to an integer, or return 0 if no arguments\n", 160 | "are given. If x is a number, return x.__int__(). For floating point\n", 161 | "numbers, this truncates towards zero.\n", 162 | "\n", 163 | "If x is not a number or if base is given, then x must be a string,\n", 164 | "bytes, or bytearray instance representing an integer literal in the\n", 165 | "given base. The literal can be preceded by '+' or '-' and be surrounded\n", 166 | "by whitespace. The base defaults to 10. Valid bases are 0 and 2-36.\n", 167 | "Base 0 means to interpret the base from the string as an integer literal.\n", 168 | ">>> int('0b100', base=0)\n", 169 | "4\n", 170 | "\u001b[0;31mType:\u001b[0m type\n", 171 | "\u001b[0;31mSubclasses:\u001b[0m bool, IntEnum, IntFlag, _NamedIntConstant\n" 172 | ] 173 | }, 174 | "metadata": {}, 175 | "output_type": "display_data" 176 | } 177 | ], 178 | "source": [ 179 | "int?" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "* Get the help on `startswith` that is available on `str`." 
187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 8, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "Help on method_descriptor:\n", 199 | "\n", 200 | "startswith(...)\n", 201 | " S.startswith(prefix[, start[, end]]) -> bool\n", 202 | " \n", 203 | " Return True if S starts with the specified prefix, False otherwise.\n", 204 | " With optional start, test S beginning at that position.\n", 205 | " With optional end, stop comparing S at that position.\n", 206 | " prefix can also be a tuple of strings to try.\n", 207 | "\n" 208 | ] 209 | } 210 | ], 211 | "source": [ 212 | "help(str.startswith)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 9, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/plain": [ 223 | "\u001b[0;31mDocstring:\u001b[0m\n", 224 | "S.startswith(prefix[, start[, end]]) -> bool\n", 225 | "\n", 226 | "Return True if S starts with the specified prefix, False otherwise.\n", 227 | "With optional start, test S beginning at that position.\n", 228 | "With optional end, stop comparing S at that position.\n", 229 | "prefix can also be a tuple of strings to try.\n", 230 | "\u001b[0;31mType:\u001b[0m method_descriptor\n" 231 | ] 232 | }, 233 | "metadata": {}, 234 | "output_type": "display_data" 235 | } 236 | ], 237 | "source": [ 238 | "str.startswith?" 239 | ] 240 | } 241 | ], 242 | "metadata": { 243 | "kernelspec": { 244 | "display_name": "Python 3", 245 | "language": "python", 246 | "name": "python3" 247 | }, 248 | "language_info": { 249 | "name": "" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 4 254 | } 255 | -------------------------------------------------------------------------------- /02_quick_recap_of_python/05_basic_programming_constructs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Basic Programming Constructs\n", 8 | "\n", 9 | "Let us recollect some of the basic programming constructs of Python.\n", 10 | "\n", 11 | "* We can perform all standard arithmetic operations using standard operators.\n", 12 | " * `+` for addition\n", 13 | " * `-` for subtraction\n", 14 | " * `*` for multiplication\n", 15 | " * `/` for division\n", 16 | " * `%` for modulus\n", 17 | "* Comparison Operations (==, !=, <, >, <=, >=, etc) \n", 18 | " * All the comparison operators return a True or False (Boolean value)\n", 19 | "* Conditionals (if) \n", 20 | " * We typically use comparison operators as part of conditionals.\n", 21 | "* Loops (for) \n", 22 | " * We can iterate through collection using `for i in l` where l is a standard collection such as list or set.\n", 23 | " * Python provides special function called as `range` which will return a collection of integers between the given range. It excludes the upper bound value.\n", 24 | "* In Python, scope is defined by indentation." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### Tasks\n", 32 | " \n", 33 | "Let us perform few tasks to quickly recap basic programming constructs of Python.\n", 34 | " * Get all the odd numbers between 1 and 15." 
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 14, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "\u001b[0;31mInit signature:\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m/\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 46 | "\u001b[0;31mDocstring:\u001b[0m \n", 47 | "range(stop) -> range object\n", 48 | "range(start, stop[, step]) -> range object\n", 49 | "\n", 50 | "Return an object that produces a sequence of integers from start (inclusive)\n", 51 | "to stop (exclusive) by step. range(i, j) produces i, i+1, i+2, ..., j-1.\n", 52 | "start defaults to 0, and stop is omitted! range(4) produces 0, 1, 2, 3.\n", 53 | "These are exactly the valid indices for a list of 4 elements.\n", 54 | "When step is given, it specifies the increment (or decrement).\n", 55 | "\u001b[0;31mType:\u001b[0m type\n", 56 | "\u001b[0;31mSubclasses:\u001b[0m \n" 57 | ] 58 | }, 59 | "metadata": {}, 60 | "output_type": "display_data" 61 | } 62 | ], 63 | "source": [ 64 | "range?" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 16, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]" 76 | ] 77 | }, 78 | "execution_count": 16, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "list(range(1, 16))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 17, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "[1, 3, 5, 7, 9, 11, 13, 15]" 96 | ] 97 | }, 98 | "execution_count": 17, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "list(range(1, 16, 2))" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "* Print all those numbers which are divisible by 3 from the above list." 
112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 19, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "Not divisible by 3\n", 124 | "3\n", 125 | "Not divisible by 3\n", 126 | "Not divisible by 3\n", 127 | "9\n", 128 | "Not divisible by 3\n", 129 | "Not divisible by 3\n", 130 | "15\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "for i in list(range(1, 16, 2)):\n", 136 | " if i % 3 == 0: \n", 137 | " print(i)\n", 138 | " else:\n", 139 | " print('Not divisible by 3')" 140 | ] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.6.8" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 4 164 | } 165 | -------------------------------------------------------------------------------- /02_quick_recap_of_python/06_developing_functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Developing Functions\n", 8 | "\n", 9 | "Let us understand how to develop functions using Python as programming language.\n", 10 | "* Function starts with `def` followed by function name.\n", 11 | "* Parameters can be of different types.\n", 12 | " * Required\n", 13 | " * Keyword\n", 14 | " * Variable Number\n", 15 | " * Functions\n", 16 | "* Functions which take another function as an argument is called higher order functions.\n", 17 | "\n", 18 | "### Tasks\n", 19 | "\n", 20 | "Let us perform few tasks to understand how to develop functions in Python. \n", 21 | "* Sum of integers between lower bound and upper bound using formula." 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 20, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "def sumOfN(n):\n", 31 | " return int((n * (n + 1)) / 2)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 21, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "55" 43 | ] 44 | }, 45 | "execution_count": 21, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "sumOfN(10)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 22, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "def sumOfIntegers(lb, ub):\n", 61 | " return sumOfN(ub) - sumOfN(lb -1)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 23, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "45" 73 | ] 74 | }, 75 | "execution_count": 23, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "sumOfIntegers(5, 10)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "* Sum of integers between lower bound and upper bound using loops." 
89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 24, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "def sumOfIntegers(lb, ub):\n", 98 | " total = 0\n", 99 | " for e in range(lb, ub + 1):\n", 100 | " total += e\n", 101 | " return total" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 25, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "55" 113 | ] 114 | }, 115 | "execution_count": 25, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "sumOfIntegers(1, 10)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 26, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "45" 133 | ] 134 | }, 135 | "execution_count": 26, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "sumOfIntegers(5, 10)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "* Sum of squares of integers between lower bound and upper bound using loops." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 27, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def sumOfSquares(lb, ub):\n", 158 | " total = lb * lb\n", 159 | " for e in range(lb + 1, ub + 1):\n", 160 | " total += e * e\n", 161 | " return total" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 28, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "29" 173 | ] 174 | }, 175 | "execution_count": 28, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "sumOfSquares(2, 4)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "* Sum of the even numbers between lower bound and upper bound using loops." 
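The notebook above notes that functions taking another function as an argument are called higher-order functions, but does not demonstrate one. Before the loop-based `sumOfEvens` solution that follows, here is a hedged sketch of how the different sums could share a single higher-order helper; the name `my_sum` is illustrative and not from the original notebook.

```python
# Sketch of a higher-order function: `f` decides how each integer contributes.
def my_sum(f, lb, ub):
    total = 0
    for e in range(lb, ub + 1):
        total += f(e)
    return total

my_sum(lambda e: e, 5, 10)                      # sum of integers -> 45
my_sum(lambda e: e * e, 2, 4)                   # sum of squares  -> 29
my_sum(lambda e: e if e % 2 == 0 else 0, 2, 4)  # sum of evens    -> 6
```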
189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 29, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "def sumOfEvens(lb, ub):\n", 198 | " total = lb if lb % 2 == 0 else 0\n", 199 | " for e in range(lb + 1, ub + 1):\n", 200 | " total += e if e%2==0 else 0\n", 201 | " return total" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 30, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "6" 213 | ] 214 | }, 215 | "execution_count": 30, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "sumOfEvens(2, 4)" 222 | ] 223 | } 224 | ], 225 | "metadata": { 226 | "kernelspec": { 227 | "display_name": "Python 3", 228 | "language": "python", 229 | "name": "python3" 230 | }, 231 | "language_info": { 232 | "codemirror_mode": { 233 | "name": "ipython", 234 | "version": 3 235 | }, 236 | "file_extension": ".py", 237 | "mimetype": "text/x-python", 238 | "name": "python", 239 | "nbconvert_exporter": "python", 240 | "pygments_lexer": "ipython3", 241 | "version": "3.6.8" 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 4 246 | } 247 | -------------------------------------------------------------------------------- /02_quick_recap_of_python/08_overview_of_collections.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of Collections\n", 8 | "\n", 9 | "Let's quickly recap about Collections and Tuples in Python. We will primarily talk about collections that comes as part of Python standard library such as `list`, `set`, `dict` and `tuple`.\n", 10 | "* Group of elements with length and index - `list`\n", 11 | "* Group of unique elements - `set`\n", 12 | "* Group of key value pairs - `dict`\n", 13 | "* While `list` and `set` contain group of homogeneous elements, `dict` and `tuple` contains group of heterogeneous elements.\n", 14 | "* `list` or `set` are analogous to a database table while `dict` or `tuple` are analogous to individual record.\n", 15 | "* Typically we create list of tuples or dicts or set of tuples or dicts. Also a dict can be considered as list of pairs. A pair is nothing but a tuple with 2 elements.\n", 16 | "* `list` and `dict` are quite extensively used compared to `set` and `tuple`.\n", 17 | "* We typically use Map Reduce APIs to process the data in collections. There are also some pre-defined functions such as `len`, `sum`, `min`, `max` etc for aggregating data in collections." 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### Tasks\n", 25 | "\n", 26 | "Let us perform few tasks to quickly recap details about Collections and Tuples in Python. We will also quickly recap about Map Reduce APIs.\n", 27 | "\n", 28 | "* Create a collection of orders by reading data from a file." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "%%sh\n", 38 | "\n", 39 | "ls -ltr /data/retail_db/orders/part-00000" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "orders_path = \"/data/retail_db/orders/part-00000\"\n", 49 | "orders = open(orders_path). \\\n", 50 | " read(). 
\\\n", 51 | " splitlines()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "* Get all unique order statuses. Make sure data is sorted in alphabetical order." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "sorted(set(map(lambda o: o.split(\",\")[3], orders)))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "* Get count of all unique dates." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "len(list(map(lambda o: o.split(\",\")[1], orders)))" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "* Sort the data in orders in ascending order by order_customer_id and then order_date." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "sorted(orders, key=lambda k: (int(k.split(\",\")[2]), k.split(\",\")[1]))" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "* Create a collection of order_items by reading data from a file." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "order_items_path = \"/data/retail_db/order_items/part-00000\"\n", 116 | "order_items = open(order_items_path). \\\n", 117 | " read(). \\\n", 118 | " splitlines()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "* Get revenue for a given order_item_order_id." 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "def get_order_revenue(order_items, order_id):\n", 135 | " order_items_filtered = filter(lambda oi: \n", 136 | " int(oi.split(\",\")[1]) == 2, \n", 137 | " order_items\n", 138 | " )\n", 139 | " order_items_map = map(lambda oi: \n", 140 | " float(oi.split(\",\")[4]), \n", 141 | " order_items_filtered\n", 142 | " )\n", 143 | " return round(sum(order_items_map), 2)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "get_order_revenue(order_items, 2)" 153 | ] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": "Python 3", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "codemirror_mode": { 164 | "name": "ipython", 165 | "version": 3 166 | }, 167 | "file_extension": ".py", 168 | "mimetype": "text/x-python", 169 | "name": "python", 170 | "nbconvert_exporter": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.6.8" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 4 177 | } 178 | -------------------------------------------------------------------------------- /02_quick_recap_of_python/09_overview_of_pandas_dataframes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of Pandas Data Frames\n", 8 | "\n", 9 | "While collections are typically the group of objects or tuples or simple strings, we need to parse them to further process the data. 
This process is tedious at times.\n", 10 | "* With Data Frames we can define the structure.\n", 11 | "* Data Frame is nothing but group of rows where each row have multiple attributes with names.\n", 12 | "* Data Frame is similar to a Database Table or Spreadsheet with Header.\n", 13 | "* Pandas provide rich and simple functions to convert data in files into Data Frames and process them\n", 14 | "* Data can be read from files into Data Frame using functions such as read_csv.\n", 15 | "* We can perform all standard operations on Data Frames.\n", 16 | " * Projection or Selection \n", 17 | " * Filtering \n", 18 | " * Aggregations \n", 19 | " * Joins \n", 20 | " * Sorting" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "### Tasks\n", 28 | "\n", 29 | "Let us perform few tasks to recap the usage of Pandas Data Frames.\n", 30 | " \n", 31 | "* Read order items data from the location on your system. In mine it is /data/retail_db/order_items/part-00000. Use the information below to define schema.\n", 32 | "* It has 6 fields with the below names in the same order as specified below.\n", 33 | " * order_item_id\n", 34 | " * order_item_order_id\n", 35 | " * order_item_product_id\n", 36 | " * order_item_quantity\n", 37 | " * order_item_subtotal\n", 38 | " * order_item_product_price" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import pandas as pd\n", 48 | "order_items_path = \"/data/retail_db/order_items/part-00000\"\n", 49 | "order_items = pd. \\\n", 50 | " read_csv(order_items_path,\n", 51 | " names=[\"order_item_id\", \"order_item_order_id\",\n", 52 | " \"order_item_product_id\", \"order_item_quantity\",\n", 53 | " \"order_item_subtotal\", \"order_item_product_price\"\n", 54 | " ]\n", 55 | " )" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "* Project order_item_order_id and order_item_subtotal" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "order_items[[\"order_item_id\", \"order_item_subtotal\"]]" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "* Filter for order_item_order_id 2" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "order_items.query(\"order_item_order_id == 2\")" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "* Compute revenue for order_item_order_id 2" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "order_items. \\\n", 104 | " query(\"order_item_order_id == 2\")[\"order_item_subtotal\"]. \\\n", 105 | " sum()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "* Get number of items and revenue for each order id. Give alias to the order revenue as **revenue**." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "order_items. \\\n", 122 | " groupby(\"order_item_order_id\")[\"order_item_subtotal\"]. 
\\\n", 123 | " sum()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "order_items. \\\n", 133 | " groupby(\"order_item_order_id\")[\"order_item_subtotal\"]. \\\n", 134 | " agg(['sum', 'count']). \\\n", 135 | " rename(columns={'sum': 'revenue'})" 136 | ] 137 | } 138 | ], 139 | "metadata": { 140 | "kernelspec": { 141 | "display_name": "Python 3", 142 | "language": "python", 143 | "name": "python3" 144 | }, 145 | "language_info": { 146 | "codemirror_mode": { 147 | "name": "ipython", 148 | "version": 3 149 | }, 150 | "file_extension": ".py", 151 | "mimetype": "text/x-python", 152 | "name": "python", 153 | "nbconvert_exporter": "python", 154 | "pygments_lexer": "ipython3", 155 | "version": "3.6.8" 156 | } 157 | }, 158 | "nbformat": 4, 159 | "nbformat_minor": 4 160 | } 161 | -------------------------------------------------------------------------------- /02_quick_recap_of_python/10_limitations_of_pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Limitations of Pandas\n", 8 | "\n", 9 | "We can use Pandas for data processing. It provides rich APIs to read data from different sources, process the data and then write it to different targets.\n", 10 | "* Pandas works well for light weight data processing.\n", 11 | "* Pandas is typically single threaded, which means only one process take care of processing the data.\n", 12 | "* As data volume grows, the processing time might grow exponentially and also run into resource contention.\n", 13 | "* It is not trivial to use distributed processing using Pandas APIs. We will end up struggling with multi threading rather than business logic.\n", 14 | "* There are Distributed Computing Frameworks such as Hadoop Map Reduce, Spark etc to take care of data processing at scale on multi node Hadoop or Spark Clusters.\n", 15 | "* Both Hadoop Map Reduce and Spark comes with Distributed Computing Frameworks as well as APIs.\n", 16 | "\n", 17 | "**Pandas is typically used for light weight Data Processing and Spark is used for Data Processing at Scale.**" 18 | ] 19 | } 20 | ], 21 | "metadata": { 22 | "kernelspec": { 23 | "display_name": "Python 3", 24 | "language": "python", 25 | "name": "python3" 26 | }, 27 | "language_info": { 28 | "codemirror_mode": { 29 | "name": "ipython", 30 | "version": 3 31 | }, 32 | "file_extension": ".py", 33 | "mimetype": "text/x-python", 34 | "name": "python", 35 | "nbconvert_exporter": "python", 36 | "pygments_lexer": "ipython3", 37 | "version": "3.6.8" 38 | } 39 | }, 40 | "nbformat": 4, 41 | "nbformat_minor": 4 42 | } 43 | -------------------------------------------------------------------------------- /02_quick_recap_of_python/11_development_life_cycle.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Development Life Cycle\n", 8 | "\n", 9 | "Let us understand the development life cycle. 
We typically use IDEs such as PyCharm to develop Python based applications.\n", 10 | "\n", 11 | "* Create Project - retail\n", 12 | "* Choose the interpreter 3.x\n", 13 | "* Make sure plugins such as pandas are installed.\n", 14 | "* Create config.py script for externalizing run time parameters such as input path, output path etc.\n", 15 | "* Create app folder for the source code." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "### Tasks\n", 23 | "\n", 24 | "Let us develop a simple application to understand end to end development life cycle.\n", 25 | "\n", 26 | "* Read the data from order_items\n", 27 | "* Get revenue for each order id\n", 28 | "* Save the output which contain order id and revenue to a file.\n", 29 | "\n", 30 | "Click [here](https://github.com/dgadiraju/python-retail/tree/v1.0) for the complete code for the above tasks." 31 | ] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 3", 37 | "language": "python", 38 | "name": "python3" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 3 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython3", 50 | "version": "3.6.8" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 4 55 | } 56 | -------------------------------------------------------------------------------- /02_quick_recap_of_python/12_exercises_recap_of_python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Exercises - Recap of Python\n", 8 | "\n", 9 | "Let us perform few exercises to understand how to process the data. We will use LinkedIn data to perform some basic data processing using Python.\n", 10 | "\n", 11 | "* Get LinkedIn archive.\n", 12 | " * Go to https://linkedin.com\n", 13 | " * Me on top -> Settings & Privacy\n", 14 | " * Then go to \"How LinkedIn users your data\" -> Getting a copy of your data\n", 15 | " * Register and download. You will get a link as part of the email.\n", 16 | "* Data contain multiple CSV files. 
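The project layout above is only described in words. Below is a minimal, self-contained sketch of what the externalized configuration and the application entry point might look like; the names and output path are illustrative and are not taken from the linked repository.

```python
# Sketch of the retail project: in a real project the dictionaries below would
# live in config.py and the function in app/process.py (illustrative layout).

import pandas as pd

inputs = {"order_items": "/data/retail_db/order_items/part-00000"}
outputs = {"order_revenue": "/tmp/order_revenue.csv"}  # illustrative output path


def save_order_revenue():
    # Read the data from order_items
    order_items = pd.read_csv(
        inputs["order_items"],
        names=["order_item_id", "order_item_order_id", "order_item_product_id",
               "order_item_quantity", "order_item_subtotal", "order_item_product_price"]
    )
    # Get revenue for each order id
    order_revenue = order_items. \
        groupby("order_item_order_id")["order_item_subtotal"]. \
        sum(). \
        reset_index(name="revenue")
    # Save order id and revenue to a file
    order_revenue.to_csv(outputs["order_revenue"], index=False)


if __name__ == "__main__":
    save_order_revenue()
```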
We will limit the analysis to **Contacts.csv** and **Connections.csv**.\n", 17 | "* Get the number of **contacts** with out email ids.\n", 18 | "* Get the number of **contacts** from each source.\n", 19 | "* Get the number of **connections** with each title.\n", 20 | "* Get the number of **connections** from each company.\n", 21 | "* Get the number of **contacts** for each month in the year 2018.\n", 22 | "* Use Postgres or MySQL as databases (you can setup in your laptop) and write **connections** data to the database" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [] 31 | } 32 | ], 33 | "metadata": { 34 | "kernelspec": { 35 | "display_name": "Python 3", 36 | "language": "python", 37 | "name": "python3" 38 | }, 39 | "language_info": { 40 | "codemirror_mode": { 41 | "name": "ipython", 42 | "version": 3 43 | }, 44 | "file_extension": ".py", 45 | "mimetype": "text/x-python", 46 | "name": "python", 47 | "nbconvert_exporter": "python", 48 | "pygments_lexer": "ipython3", 49 | "version": "3.6.8" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 4 54 | } 55 | -------------------------------------------------------------------------------- /03_data_processing_overview/01_data_processing_overview.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "pycharm": { 7 | "name": "#%% md\n" 8 | } 9 | }, 10 | "source": [ 11 | "# Data Processing - Overview\n", 12 | "\n", 13 | "As part of this section we will get an overview about Data Processing using Spark with Python.\n", 14 | "* Pre-requisites and Objectives\n", 15 | "* Starting Spark Context\n", 16 | "* Overview of Spark read APIs\n", 17 | "* Understand airlines data\n", 18 | "* Inferring Schema\n", 19 | "* Previewing airlines data\n", 20 | "* Overview of Data Frame APIs\n", 21 | "* Overview of Functions\n", 22 | "* Overview of Spark Write APIs\n", 23 | "* Reorganizing airlines data\n", 24 | "* Previewing reorganized data\n", 25 | "* Analyze and Understand Data\n", 26 | "* Conclusion" 27 | ] 28 | } 29 | ], 30 | "metadata": { 31 | "kernelspec": { 32 | "display_name": "Pyspark 2", 33 | "language": "python", 34 | "name": "pyspark2" 35 | }, 36 | "language_info": { 37 | "codemirror_mode": { 38 | "name": "ipython", 39 | "version": 3 40 | }, 41 | "file_extension": ".py", 42 | "mimetype": "text/x-python", 43 | "name": "python", 44 | "nbconvert_exporter": "python", 45 | "pygments_lexer": "ipython3", 46 | "version": "3.6.8" 47 | }, 48 | "pycharm": { 49 | "stem_cell": { 50 | "cell_type": "raw", 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "source": [ 55 | "\n", 56 | "\n", 57 | "\n" 58 | ] 59 | } 60 | } 61 | }, 62 | "nbformat": 4, 63 | "nbformat_minor": 4 64 | } 65 | -------------------------------------------------------------------------------- /03_data_processing_overview/02_prerequisites_and_objectives.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "pycharm": { 7 | "name": "#%% md\n" 8 | } 9 | }, 10 | "source": [ 11 | "## Prerequisites and Objectives\n", 12 | "\n", 13 | "Let us understand prerequisites before getting into the topics related to this section.\n", 14 | "* Good understanding of Data Processing using Python.\n", 15 | "* Data Processing Life Cycle\n", 16 | " * Reading Data from files\n", 17 | " * Processing Data using 
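As a starting point for one of the connection exercises above, here is a hedged Pandas sketch. The column name `Company` and the number of preamble rows to skip are assumptions about the LinkedIn export format and may differ in your archive.

```python
import pandas as pd

# LinkedIn exports often place a few note lines above the header;
# adjust skiprows (or drop it) to match your archive (assumption).
connections = pd.read_csv("Connections.csv", skiprows=3)

# Number of connections per company (column name is an assumption)
connections_per_company = connections. \
    groupby("Company"). \
    size(). \
    sort_values(ascending=False)

print(connections_per_company.head(10))
```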
APIs\n", 18 | " * Writing Processed Data back to files\n", 19 | "* We can also use Databases as sources and sinks. It will be covered at a later point in time.\n", 20 | "* We can also read data in streaming fashion which is out of the scope of this course.\n", 21 | "\n", 22 | "We will get an overview of the Data Processing Life Cycle using Pyspark by the end of the section or module.\n", 23 | "* Read airlines data from the file.\n", 24 | "* Preview the schema and data to understand the characteristics of the data.\n", 25 | "* Get an overview of Data Frame APIs as well as functions used to process the data.\n", 26 | "* Check if there are any duplicates in the data.\n", 27 | "* Get an overview of how to write data in Data Frames to Files using File Formats such as Parquet using Compression.\n", 28 | "* Reorganize the data by month with different file format and using partitioning strategy.\n", 29 | "* We will deep dive into Data Frame APIs to process the data in subsequent sections or modules." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [] 38 | } 39 | ], 40 | "metadata": { 41 | "kernelspec": { 42 | "display_name": "Python 3", 43 | "language": "python", 44 | "name": "python3" 45 | }, 46 | "language_info": { 47 | "codemirror_mode": { 48 | "name": "ipython", 49 | "version": 3 50 | }, 51 | "file_extension": ".py", 52 | "mimetype": "text/x-python", 53 | "name": "python", 54 | "nbconvert_exporter": "python", 55 | "pygments_lexer": "ipython3", 56 | "version": "3.6.8" 57 | } 58 | }, 59 | "nbformat": 4, 60 | "nbformat_minor": 4 61 | } 62 | -------------------------------------------------------------------------------- /03_data_processing_overview/03_starting_spark_context.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Starting Spark Context\n", 8 | "\n", 9 | "Let us start Spark Context using SparkSession." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "\n" 25 | ], 26 | "text/plain": [ 27 | "" 28 | ] 29 | }, 30 | "metadata": {}, 31 | "output_type": "display_data" 32 | } 33 | ], 34 | "source": [ 35 | "%%HTML\n", 36 | "" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "* `SparkSession` is a class that is part of `pyspark.sql` package.\n", 44 | "* It is a wrapper on top of Spark Context.\n", 45 | "* When Spark application is submitted using `spark-submit` or `spark-shell` or `pyspark`, a web service called as Spark Context will be started.\n", 46 | "* Spark Context maintains the context of all the jobs that are submitted until it is killed.\n", 47 | "* `SparkSession` is nothing but wrapper on top of Spark Context.\n", 48 | "* We need to first create SparkSession object with any name. But typically we use `spark`. Once it is created, several APIs will be exposed including `read`.\n", 49 | "* We need to at least set Application Name and also specify the execution mode in which Spark Context should run while creating `SparkSession` object.\n", 50 | "* We can use `appName` to specify name for the application and `master` to specify the execution mode.\n", 51 | "* Below is the sample code snippet which will start the Spark Session object for us." 
52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from pyspark.sql import SparkSession" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "import getpass\n", 77 | "username = getpass.getuser()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "username" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "spark = SparkSession. \\\n", 96 | " builder. \\\n", 97 | " config('spark.ui.port', '0'). \\\n", 98 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 99 | " enableHiveSupport(). \\\n", 100 | " appName(f'{username} | Python - Data Processing - Overview'). \\\n", 101 | " master('yarn'). \\\n", 102 | " getOrCreate()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 110 | "\n", 111 | "**Using Spark SQL**\n", 112 | "\n", 113 | "```\n", 114 | "spark2-sql \\\n", 115 | " --master yarn \\\n", 116 | " --conf spark.ui.port=0 \\\n", 117 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 118 | "```\n", 119 | "\n", 120 | "**Using Scala**\n", 121 | "\n", 122 | "```\n", 123 | "spark2-shell \\\n", 124 | " --master yarn \\\n", 125 | " --conf spark.ui.port=0 \\\n", 126 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 127 | "```\n", 128 | "\n", 129 | "**Using Pyspark**\n", 130 | "\n", 131 | "```\n", 132 | "pyspark2 \\\n", 133 | " --master yarn \\\n", 134 | " --conf spark.ui.port=0 \\\n", 135 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 136 | "```" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "spark" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": "Pyspark 2", 159 | "language": "python", 160 | "name": "pyspark2" 161 | }, 162 | "language_info": { 163 | "codemirror_mode": { 164 | "name": "ipython", 165 | "version": 3 166 | }, 167 | "file_extension": ".py", 168 | "mimetype": "text/x-python", 169 | "name": "python", 170 | "nbconvert_exporter": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.6.12" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 4 177 | } 178 | -------------------------------------------------------------------------------- /03_data_processing_overview/04_overview_of_spark_read_apis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of Spark read APIs\n", 8 | "\n", 9 | "Let us get the overview of Spark read APIs to read files of different formats." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "\n" 25 | ], 26 | "text/plain": [ 27 | "" 28 | ] 29 | }, 30 | "metadata": {}, 31 | "output_type": "display_data" 32 | } 33 | ], 34 | "source": [ 35 | "%%HTML\n", 36 | "" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "* `spark` has a bunch of APIs to read data from files of different formats.\n", 44 | "* All APIs are exposed under `spark.read`\n", 45 | " * `text` - to read single column data from text files as well as reading each of the whole text file as one record.\n", 46 | " * `csv`- to read text files with delimiters. Default is a comma, but we can use other delimiters as well.\n", 47 | " * `json` - to read data from JSON files\n", 48 | " * `orc` - to read data from ORC files\n", 49 | " * `parquet` - to read data from Parquet files.\n", 50 | " * We can also read data from other file formats by plugging in and by using `spark.read.format`\n", 51 | "* We can also pass options based on the file formats.\n", 52 | " * `inferSchema` - to infer the data types of the columns based on the data.\n", 53 | " * `header` - to use header to get the column names in case of text files.\n", 54 | " * `schema` - to explicitly specify the schema.\n", 55 | "* We can get the help on APIs like `spark.read.csv` using `help(spark.read.csv)`.\n", 56 | "* Reading delimited data from text files." 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "from pyspark.sql import SparkSession\n", 73 | "\n", 74 | "import getpass\n", 75 | "username = getpass.getuser()\n", 76 | "\n", 77 | "spark = SparkSession. \\\n", 78 | " builder. \\\n", 79 | " config('spark.ui.port', '0'). \\\n", 80 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 81 | " enableHiveSupport(). \\\n", 82 | " appName(f'{username} | Python - Data Processing - Overview'). \\\n", 83 | " master('yarn'). 
\\\n", 84 | " getOrCreate()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 92 | "\n", 93 | "**Using Spark SQL**\n", 94 | "\n", 95 | "```\n", 96 | "spark2-sql \\\n", 97 | " --master yarn \\\n", 98 | " --conf spark.ui.port=0 \\\n", 99 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 100 | "```\n", 101 | "\n", 102 | "**Using Scala**\n", 103 | "\n", 104 | "```\n", 105 | "spark2-shell \\\n", 106 | " --master yarn \\\n", 107 | " --conf spark.ui.port=0 \\\n", 108 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 109 | "```\n", 110 | "\n", 111 | "**Using Pyspark**\n", 112 | "\n", 113 | "```\n", 114 | "pyspark2 \\\n", 115 | " --master yarn \\\n", 116 | " --conf spark.ui.port=0 \\\n", 117 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 118 | "```" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "spark" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "spark.read" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "spark.read.csv?" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "help(spark.read.csv)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "spark. \\\n", 164 | " read. \\\n", 165 | " csv('/public/retail_db/orders',\n", 166 | " schema='''\n", 167 | " order_id INT, \n", 168 | " order_date STRING, \n", 169 | " order_customer_id INT, \n", 170 | " order_status STRING\n", 171 | " '''\n", 172 | " ). \\\n", 173 | " show()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "* Reading JSON data from text files. We can infer schema from the data as each JSON object contain both column name and value.\n", 181 | "* Example for JSON\n", 182 | "\n", 183 | "```json\n", 184 | "{\n", 185 | " \"order_id\": 1, \n", 186 | " \"order_date\": \"2013-07-25 00:00:00.0\", \n", 187 | " \"order_customer_id\": 12345, \n", 188 | " \"order_status\": \"COMPLETE\"\n", 189 | "}\n", 190 | "```" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "spark.read.json?" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "spark. \\\n", 209 | " read. \\\n", 210 | " json('/public/retail_db_json/orders'). 
\\\n", 211 | " show()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Pyspark 2", 225 | "language": "python", 226 | "name": "pyspark2" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.6.12" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 4 243 | } 244 | -------------------------------------------------------------------------------- /03_data_processing_overview/05_understand_airlines_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Understand airlines data\n", 8 | "Let us read one of the files and understand more about the data to determine right API with right options to process data later." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": { 15 | "tags": [ 16 | "remove-cell" 17 | ] 18 | }, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/html": [ 23 | "\n" 24 | ], 25 | "text/plain": [ 26 | "" 27 | ] 28 | }, 29 | "metadata": {}, 30 | "output_type": "display_data" 31 | } 32 | ], 33 | "source": [ 34 | "%%HTML\n", 35 | "" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "* Our airlines data is in text file format.\n", 43 | "* We can use `spark.read.text` on one of the files to preview the data and understand the following\n", 44 | " * Whether header is present in files or not.\n", 45 | " * Field Delimiter that is being used.\n", 46 | "* Once we determine details about header and field delimiter we can use `spark.read.csv` with appropriate options to read the data." 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "from pyspark.sql import SparkSession\n", 63 | "\n", 64 | "import getpass\n", 65 | "username = getpass.getuser()\n", 66 | "\n", 67 | "spark = SparkSession. \\\n", 68 | " builder. \\\n", 69 | " config('spark.ui.port', '0'). \\\n", 70 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 71 | " enableHiveSupport(). \\\n", 72 | " appName(f'{username} | Python - Data Processing - Overview'). \\\n", 73 | " master('yarn'). 
\\\n", 74 | " getOrCreate()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 82 | "\n", 83 | "**Using Spark SQL**\n", 84 | "\n", 85 | "```\n", 86 | "spark2-sql \\\n", 87 | " --master yarn \\\n", 88 | " --conf spark.ui.port=0 \\\n", 89 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 90 | "```\n", 91 | "\n", 92 | "**Using Scala**\n", 93 | "\n", 94 | "```\n", 95 | "spark2-shell \\\n", 96 | " --master yarn \\\n", 97 | " --conf spark.ui.port=0 \\\n", 98 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 99 | "```\n", 100 | "\n", 101 | "**Using Pyspark**\n", 102 | "\n", 103 | "```\n", 104 | "pyspark2 \\\n", 105 | " --master yarn \\\n", 106 | " --conf spark.ui.port=0 \\\n", 107 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 108 | "```" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "%%sh\n", 118 | "\n", 119 | "hdfs dfs -ls -h /public/airlines_all/airlines/part-00000" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "pycharm": { 127 | "name": "#%%\n" 128 | } 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "airlines = spark.read. \\\n", 133 | " text(\"/public/airlines_all/airlines/part-00000\")" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "type(airlines)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "help(airlines.show)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "airlines.show(truncate=False)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "help(spark.read.text)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": { 175 | "pycharm": { 176 | "name": "#%% md\n" 177 | } 178 | }, 179 | "source": [ 180 | "* Data have header and each field is delimited by a comma." 181 | ] 182 | } 183 | ], 184 | "metadata": { 185 | "kernelspec": { 186 | "display_name": "Pyspark 2", 187 | "language": "python", 188 | "name": "pyspark2" 189 | }, 190 | "language_info": { 191 | "codemirror_mode": { 192 | "name": "ipython", 193 | "version": 3 194 | }, 195 | "file_extension": ".py", 196 | "mimetype": "text/x-python", 197 | "name": "python", 198 | "nbconvert_exporter": "python", 199 | "pygments_lexer": "ipython3", 200 | "version": "3.6.12" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 4 205 | } 206 | -------------------------------------------------------------------------------- /03_data_processing_overview/06_inferring_schema.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "pycharm": { 7 | "name": "#%% md\n" 8 | } 9 | }, 10 | "source": [ 11 | "## Inferring Schema\n", 12 | "\n", 13 | "Let us understand how we can quickly get schema using one file and apply on other files." 
14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "tags": [ 21 | "remove-cell" 22 | ] 23 | }, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/html": [ 28 | "\n" 29 | ], 30 | "text/plain": [ 31 | "" 32 | ] 33 | }, 34 | "metadata": {}, 35 | "output_type": "display_data" 36 | } 37 | ], 38 | "source": [ 39 | "%%HTML\n", 40 | "" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "pycharm": { 47 | "name": "#%% md\n" 48 | } 49 | }, 50 | "source": [ 51 | "* We can pass the file name pattern to `spark.read.csv` and read all the data in files under **hdfs://public/airlines_all/airlines** into Data Frame.\n", 52 | "* We can use options such as `header` and `inferSchema` to assign names and data types.\n", 53 | "* However `inferSchema` will end up going through the entire data to assign schema. We can use samplingRatio to process fraction of data and then infer the schema.\n", 54 | "* In case if the data in all the files have similar structure, we should be able to get the schema using one file and then apply it on others.\n", 55 | "* In our airlines data, schema is consistent across all the files and hence we should be able to get the schema by going through one file and apply on the entire dataset." 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "from pyspark.sql import SparkSession\n", 72 | "\n", 73 | "import getpass\n", 74 | "username = getpass.getuser()\n", 75 | "\n", 76 | "spark = SparkSession. \\\n", 77 | " builder. \\\n", 78 | " config('spark.ui.port', '0'). \\\n", 79 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 80 | " enableHiveSupport(). \\\n", 81 | " appName(f'{username} | Python - Data Processing - Overview'). \\\n", 82 | " master('yarn'). \\\n", 83 | " getOrCreate()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 91 | "\n", 92 | "**Using Spark SQL**\n", 93 | "\n", 94 | "```\n", 95 | "spark2-sql \\\n", 96 | " --master yarn \\\n", 97 | " --conf spark.ui.port=0 \\\n", 98 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 99 | "```\n", 100 | "\n", 101 | "**Using Scala**\n", 102 | "\n", 103 | "```\n", 104 | "spark2-shell \\\n", 105 | " --master yarn \\\n", 106 | " --conf spark.ui.port=0 \\\n", 107 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 108 | "```\n", 109 | "\n", 110 | "**Using Pyspark**\n", 111 | "\n", 112 | "```\n", 113 | "pyspark2 \\\n", 114 | " --master yarn \\\n", 115 | " --conf spark.ui.port=0 \\\n", 116 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 117 | "```" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "airlines_part_00000 = spark.read. 
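The bullet above mentions `samplingRatio` without showing it. A sketch is below; support for `samplingRatio` on the CSV reader depends on your Spark version, so treat it as indicative rather than guaranteed for your environment.

```python
# Infer the schema from roughly 10% of the rows instead of scanning the full file
airlines_sampled_schema = spark.read. \
    csv('/public/airlines_all/airlines/part-00000',
        header=True,
        inferSchema=True,
        samplingRatio=0.1
    ). \
    schema

airlines_sampled_schema
```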
\\\n", 127 | " csv(\"/public/airlines_all/airlines/part-00000\",\n", 128 | " header=True,\n", 129 | " inferSchema=True\n", 130 | " )" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "type(airlines_part_00000)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "airlines_part_00000.show(truncate=False)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "airlines_part_00000.printSchema()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "airlines_part_00000.schema" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "type(airlines_part_00000.schema)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "pycharm": { 183 | "name": "#%%\n" 184 | } 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "airlines_schema = spark.read. \\\n", 189 | " csv(\"/public/airlines_all/airlines/part-00000\",\n", 190 | " header=True,\n", 191 | " inferSchema=True\n", 192 | " ). \\\n", 193 | " schema" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "type(airlines_schema)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "help(spark.read.csv)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "airlines = spark.read. \\\n", 221 | " schema(airlines_schema). \\\n", 222 | " csv(\"/public/airlines_all/airlines/part*\",\n", 223 | " header=True\n", 224 | " )" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "airlines = spark.read. \\\n", 234 | " csv(\"/public/airlines_all/airlines/part*\",\n", 235 | " schema=airlines_schema,\n", 236 | " header=True\n", 237 | " )" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "help(airlines)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "airlines." 
256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "airlines.show()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "airlines.printSchema()" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "airlines.count()" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [] 291 | } 292 | ], 293 | "metadata": { 294 | "kernelspec": { 295 | "display_name": "Pyspark 2", 296 | "language": "python", 297 | "name": "pyspark2" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 3 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython3", 309 | "version": "3.6.12" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 4 314 | } 315 | -------------------------------------------------------------------------------- /03_data_processing_overview/07_previewing_airlines_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Previewing airlines data\n", 8 | "Let us preview the airlines data to understand more about it." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": { 15 | "tags": [ 16 | "remove-cell" 17 | ] 18 | }, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/html": [ 23 | "\n" 24 | ], 25 | "text/plain": [ 26 | "" 27 | ] 28 | }, 29 | "metadata": {}, 30 | "output_type": "display_data" 31 | } 32 | ], 33 | "source": [ 34 | "%%HTML\n", 35 | "" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "* As we have too many files, we will just process ten files and preview the data.\n", 43 | "* File Name: **hdfs://public/airlines_all/airlines/part-0000***\n", 44 | "* `spark.read.csv` will create a variable or object of type Data Frame." 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from pyspark.sql import SparkSession\n", 61 | "\n", 62 | "import getpass\n", 63 | "username = getpass.getuser()\n", 64 | "\n", 65 | "spark = SparkSession. \\\n", 66 | " builder. \\\n", 67 | " config('spark.ui.port', '0'). \\\n", 68 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 69 | " enableHiveSupport(). \\\n", 70 | " appName(f'{username} | Python - Data Processing - Overview'). \\\n", 71 | " master('yarn'). 
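As an alternative to inferring, a schema can also be spelled out explicitly with `StructType`. The sketch below covers only a handful of columns, with names borrowed from the original airlines on-time dataset for illustration; the real files carry many more fields.

```python
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Hand-written schema for an illustrative subset of the airlines columns
airlines_struct = StructType([
    StructField('Year', IntegerType()),
    StructField('Month', IntegerType()),
    StructField('DayofMonth', IntegerType()),
    StructField('Origin', StringType()),
    StructField('Dest', StringType()),
    StructField('ArrDelay', IntegerType()),
])
```

For CSV sources the supplied schema has to cover every column in file order, which is why inferring the schema once from part-00000 and reusing it, as shown above, is the more practical route for this dataset.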
\\\n", 72 | " getOrCreate()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 80 | "\n", 81 | "**Using Spark SQL**\n", 82 | "\n", 83 | "```\n", 84 | "spark2-sql \\\n", 85 | " --master yarn \\\n", 86 | " --conf spark.ui.port=0 \\\n", 87 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 88 | "```\n", 89 | "\n", 90 | "**Using Scala**\n", 91 | "\n", 92 | "```\n", 93 | "spark2-shell \\\n", 94 | " --master yarn \\\n", 95 | " --conf spark.ui.port=0 \\\n", 96 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 97 | "```\n", 98 | "\n", 99 | "**Using Pyspark**\n", 100 | "\n", 101 | "```\n", 102 | "pyspark2 \\\n", 103 | " --master yarn \\\n", 104 | " --conf spark.ui.port=0 \\\n", 105 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 106 | "```" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "pycharm": { 114 | "name": "#%%\n" 115 | } 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "airlines_schema = spark.read. \\\n", 120 | " csv(\"/public/airlines_all/airlines/part-00000\",\n", 121 | " header=True,\n", 122 | " inferSchema=True\n", 123 | " ). \\\n", 124 | " schema" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "airlines = spark.read. \\\n", 134 | " schema(airlines_schema). \\\n", 135 | " csv(\"/public/airlines_all/airlines/part*\",\n", 136 | " header=True\n", 137 | " )" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "A Data Frame will have structure or schema.\n", 145 | "\n", 146 | "* We can print the schema using `airlines.printSchema()`\n", 147 | "* We can preview the data using `airlines.show()`. By default it shows 20 records and some of the column values might be truncated for readability purpose.\n", 148 | "* We can review the details of show by using `help(airlines.show)`\n", 149 | "* We can pass custom number of records and say `truncate=False` to show complete information of all the records requested. It will facilitate us to preview all columns with desired number of records.\n", 150 | "\n", 151 | "\n" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "pycharm": { 159 | "name": "#%%\n" 160 | } 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "airlines.show(100, truncate=False)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "* We can get the number of records or rows in a Data Frame using `airlines.count()`\n", 172 | "* In Databricks Notebook, we can use `display` to preview the data using Visualization feature\n", 173 | "* We can perform all kinds of standard transformations on our data. We need to have good knowledge of functions on Data Frames as well as functions on columns to apply all standard transformations.\n", 174 | "* Let us also validate if there are duplicates in our data, if yes we will remove duplicates while reorganizing the data later.\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "pycharm": { 182 | "name": "#%%\n" 183 | } 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "airlines_schema = spark.read. 
\\\n", 188 | " csv(\"/public/airlines_all/airlines/part-00000\",\n", 189 | " header=True,\n", 190 | " inferSchema=True\n", 191 | " ). \\\n", 192 | " schema" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "%%sh\n", 202 | "\n", 203 | "hdfs dfs -ls /public/airlines_all/airlines/part-0000*" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "pycharm": { 211 | "name": "#%%\n" 212 | } 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "airlines = spark.read. \\\n", 217 | " schema(airlines_schema). \\\n", 218 | " csv(\"/public/airlines_all/airlines/part-0000*\",\n", 219 | " header=True\n", 220 | " )" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "airlines.printSchema()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "airlines.show()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "airlines.show(100, truncate=False)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "airlines.count()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "airlines.distinct().count()" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "Pyspark 2", 279 | "language": "python", 280 | "name": "pyspark2" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | "name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.6.12" 293 | } 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 4 297 | } 298 | -------------------------------------------------------------------------------- /03_data_processing_overview/08_overview_of_dataframe_apis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of Data Frame APIs\n", 8 | "\n", 9 | "Let us get an overview of Data Frame APIs to process data in Data Frames." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "\n" 25 | ], 26 | "text/plain": [ 27 | "" 28 | ] 29 | }, 30 | "metadata": {}, 31 | "output_type": "display_data" 32 | } 33 | ], 34 | "source": [ 35 | "%%HTML\n", 36 | "" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "* Row Level Transformations or Projection of Data can be done using `select`, `selectExpr`, `withColumn`, `drop` on Data Frame.\n", 44 | "* We typically apply functions from `pyspark.sql.functions` on columns using `select` and `withColumn`\n", 45 | "* Filtering is typically done either by using `filter` or `where` on Data Frame.\n", 46 | "* We can pass the condition to `filter` or `where` either by using SQL Style or Programming Language Style.\n", 47 | "* Global Aggregations can be performed directly on the Data Frame.\n", 48 | "* By Key or Grouping Aggregations are typically performed using `groupBy` and then aggregate functions using `agg`\n", 49 | "* We can sort the data in Data Frame using `sort` or `orderBy`\n", 50 | "* We will talk about Window Functions later. We can use use Window Functions for some advanced Aggregations and Ranking." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "from pyspark.sql import SparkSession\n", 67 | "\n", 68 | "import getpass\n", 69 | "username = getpass.getuser()\n", 70 | "\n", 71 | "spark = SparkSession. \\\n", 72 | " builder. \\\n", 73 | " config('spark.ui.port', '0'). \\\n", 74 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 75 | " enableHiveSupport(). \\\n", 76 | " appName(f'{username} | Python - Data Processing - Overview'). \\\n", 77 | " master('yarn'). 
\\\n", 78 | " getOrCreate()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 86 | "\n", 87 | "**Using Spark SQL**\n", 88 | "\n", 89 | "```\n", 90 | "spark2-sql \\\n", 91 | " --master yarn \\\n", 92 | " --conf spark.ui.port=0 \\\n", 93 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 94 | "```\n", 95 | "\n", 96 | "**Using Scala**\n", 97 | "\n", 98 | "```\n", 99 | "spark2-shell \\\n", 100 | " --master yarn \\\n", 101 | " --conf spark.ui.port=0 \\\n", 102 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 103 | "```\n", 104 | "\n", 105 | "**Using Pyspark**\n", 106 | "\n", 107 | "```\n", 108 | "pyspark2 \\\n", 109 | " --master yarn \\\n", 110 | " --conf spark.ui.port=0 \\\n", 111 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 112 | "```" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "### Tasks\n", 120 | "\n", 121 | "Let us understand how to project the data using different options such as `select`, `selectExpr`, `withColumn`, `drop.`\n", 122 | "\n", 123 | "* Create Dataframe **employees** using Collection" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "pycharm": { 131 | "name": "#%%\n" 132 | } 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "employees = [(1, \"Scott\", \"Tiger\", 1000.0, \"united states\"),\n", 137 | " (2, \"Henry\", \"Ford\", 1250.0, \"India\"),\n", 138 | " (3, \"Nick\", \"Junior\", 750.0, \"united KINGDOM\"),\n", 139 | " (4, \"Bill\", \"Gomes\", 1500.0, \"AUSTRALIA\")\n", 140 | " ]" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "type(employees)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "employees[0]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "type(employees[0])" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "spark.createDataFrame?" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "employeesDF = spark. \\\n", 186 | " createDataFrame(employees,\n", 187 | " schema=\"\"\"employee_id INT, first_name STRING, \n", 188 | " last_name STRING, salary FLOAT, nationality STRING\"\"\"\n", 189 | " )" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "employeesDF.printSchema()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "employeesDF.show()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "* Project employee first name and last name.\n" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "pycharm": { 222 | "name": "#%%\n" 223 | } 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "employeesDF. \\\n", 228 | " select(\"first_name\", \"last_name\"). 
\\\n", 229 | " show()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "* Project all the fields except for Nationality" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "pycharm": { 244 | "name": "#%%\n" 245 | } 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "employeesDF. \\\n", 250 | " drop(\"nationality\"). \\\n", 251 | " show()" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "from pyspark.sql.functions import *" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "employeesDF. \\\n", 270 | " withColumn('full_name', concat('first_name', lit(' '), 'last_name')). \\\n", 271 | " show()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "employeesDF.selectExpr('*', 'concat(first_name, \" \", last_name) AS full_name').show()" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "**We will explore most of the APIs to process data in Data Frames as we get into the data processing at a later point in time**" 288 | ] 289 | } 290 | ], 291 | "metadata": { 292 | "kernelspec": { 293 | "display_name": "Pyspark 2", 294 | "language": "python", 295 | "name": "pyspark2" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.6.12" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 4 312 | } 313 | -------------------------------------------------------------------------------- /03_data_processing_overview/11_reorganizing_airlines_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Reorganizing airlines data\n", 8 | "\n", 9 | "Let us reorganize our airlines data to fewer files where data is compressed and also partitioned by Month.\n", 10 | "* We have ~1920 files of ~64MB Size.\n", 11 | "* Data is in the range of 1987 October and 2008 December (255 months)\n", 12 | "* By default it uses ~1920 threads to process the data and it might end up with too many small files. We can avoid that by using repartition and then partition by the month.\n", 13 | "* Here are the steps we are going to follow to partition by flight month and save the data to /user/[YOUR_USER_NAME]/airlines.\n", 14 | " * Read one file first and get the schema.\n", 15 | " * Read the entire data by applying the schema from the previous step.\n", 16 | " * Add additional column flightmonth using withColumn by using lpad on month column and concat functions. 
We need to do this as the month in our data set is of type integer and we want to pad with 0 for months till september to format it into YYYYMM.\n", 17 | " * Repartition the data into 255 based on the number of months using flightmonth\n", 18 | " * Partition the data by partitionBy while writing the data to the target location.\n", 19 | " * We will use parquet file format which will automatically compresses data using Snappy algorithm.\n", 20 | " \n", 21 | "**This process will take time, once it is done we will review the target location to which data is copied by partitioning using month**" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from pyspark.sql import SparkSession\n", 38 | "\n", 39 | "import getpass\n", 40 | "username = getpass.getuser()\n", 41 | "\n", 42 | "spark = SparkSession. \\\n", 43 | " builder. \\\n", 44 | " config('spark.ui.port', '0'). \\\n", 45 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 46 | " enableHiveSupport(). \\\n", 47 | " appName(f'{username} | Python - Data Processing - Overview'). \\\n", 48 | " master('yarn'). \\\n", 49 | " getOrCreate()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 57 | "\n", 58 | "**Using Spark SQL**\n", 59 | "\n", 60 | "```\n", 61 | "spark2-sql \\\n", 62 | " --master yarn \\\n", 63 | " --conf spark.ui.port=0 \\\n", 64 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 65 | "```\n", 66 | "\n", 67 | "**Using Scala**\n", 68 | "\n", 69 | "```\n", 70 | "spark2-shell \\\n", 71 | " --master yarn \\\n", 72 | " --conf spark.ui.port=0 \\\n", 73 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 74 | "```\n", 75 | "\n", 76 | "**Using Pyspark**\n", 77 | "\n", 78 | "```\n", 79 | "pyspark2 \\\n", 80 | " --master yarn \\\n", 81 | " --conf spark.ui.port=0 \\\n", 82 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 83 | "```" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "from pyspark.sql import SparkSession\n", 93 | "\n", 94 | "# spark.stop()\n", 95 | "spark = SparkSession. \\\n", 96 | " builder. \\\n", 97 | " config('spark.dynamicAllocation.enabled', 'false'). \\\n", 98 | " config('spark.executor.instances', 40). \\\n", 99 | " appName('Data Processing - Overview'). \\\n", 100 | " master('yarn'). \\\n", 101 | " getOrCreate()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "spark" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "pycharm": { 118 | "name": "#%%\n" 119 | } 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "from pyspark.sql.functions import concat, lpad\n", 124 | "\n", 125 | "airlines_schema = spark.read. \\\n", 126 | " csv('/public/airlines_all/airlines/part-00000',\n", 127 | " header=True,\n", 128 | " inferSchema=True\n", 129 | " ). 
\\\n", 130 | " schema" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "airlines = spark.read. \\\n", 140 | " schema(airlines_schema). \\\n", 141 | " csv('/public/airlines_all/airlines/part*',\n", 142 | " header=True\n", 143 | " )" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "airlines.printSchema()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "airlines.show()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "airlines.count()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "airlines.distinct().count()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "help(airlines.write.parquet)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "spark.conf.set(\"spark.sql.shuffle.partitions\", \"255\")\n", 198 | "airlines. \\\n", 199 | " distinct(). \\\n", 200 | " withColumn('flightmonth', concat('year', lpad('month', 2, '0'))). \\\n", 201 | " repartition(255, 'flightmonth'). \\\n", 202 | " write. \\\n", 203 | " mode('overwrite'). \\\n", 204 | " partitionBy('flightmonth'). \\\n", 205 | " format('parquet'). \\\n", 206 | " save(f'/user/{username}/airlines-part')" 207 | ] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.6.8" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 4 231 | } 232 | -------------------------------------------------------------------------------- /03_data_processing_overview/12_previewing_reorganized_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "pycharm": { 7 | "name": "#%% md\n" 8 | } 9 | }, 10 | "source": [ 11 | "## Previewing reorganized data\n", 12 | "Let us preview the data using reorganized data.\n", 13 | "* We will use new location going forward - **/public/airlines_all/airlines-part**. Data is already copied into that location.\n", 14 | "* We have partitioned data by month and stored in that location.\n", 15 | "* Instead of using complete data set we will read the data from one partition **/public/airlines_all/airlines-part/flightmonth=200801**\n", 16 | "* First let us create a DataFrame object by using `spark.read.parquet(\"/public/airlines_all/airlines-part/flightmonth=200801\")` - let's say airlines. 
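As a quick sketch of that first step (assuming the Spark session started later in this notebook), the read together with the preview calls described in the next bullets looks like this:

```
airlines = spark.read.parquet('/public/airlines_all/airlines-part/flightmonth=200801')

airlines.printSchema()              # schema of the reorganized data
airlines.show(100, truncate=False)  # preview up to 100 records without truncating values
airlines.describe().show()          # basic statistics about the columns
airlines.count()                    # number of records in this partition
```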
\n", 17 | "* We can get the schema of the DataFrame using `airlines.printSchema()`\n", 18 | "* Use `airlines.show()` or `airlines.show(100, truncate=False)` to preview the data.\n", 19 | "* We can also use `display(airlines)` to get airlines data in tabular format as part of Databricks Notebook.\n", 20 | "* We can also use `airlines.describe().show()` to get some statistics about the Data Frame and `airlines.count()` to get the number of records in the DataFrame." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "from pyspark.sql import SparkSession\n", 37 | "\n", 38 | "import getpass\n", 39 | "username = getpass.getuser()\n", 40 | "\n", 41 | "spark = SparkSession. \\\n", 42 | " builder. \\\n", 43 | " config('spark.ui.port', '0'). \\\n", 44 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 45 | " enableHiveSupport(). \\\n", 46 | " appName(f'{username} | Python - Data Processing - Overview'). \\\n", 47 | " master('yarn'). \\\n", 48 | " getOrCreate()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 56 | "\n", 57 | "**Using Spark SQL**\n", 58 | "\n", 59 | "```\n", 60 | "spark2-sql \\\n", 61 | " --master yarn \\\n", 62 | " --conf spark.ui.port=0 \\\n", 63 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 64 | "```\n", 65 | "\n", 66 | "**Using Scala**\n", 67 | "\n", 68 | "```\n", 69 | "spark2-shell \\\n", 70 | " --master yarn \\\n", 71 | " --conf spark.ui.port=0 \\\n", 72 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 73 | "```\n", 74 | "\n", 75 | "**Using Pyspark**\n", 76 | "\n", 77 | "```\n", 78 | "pyspark2 \\\n", 79 | " --master yarn \\\n", 80 | " --conf spark.ui.port=0 \\\n", 81 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 82 | "```" 83 | ] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.6.8" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 4 107 | } 108 | -------------------------------------------------------------------------------- /03_data_processing_overview/13_analyze_and_understand_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Analyze and Understand Data\n", 8 | "Let us analyze and understand more about the data in detail using data of 2008 January.\n", 9 | "* First let us read the data for the month of 2008 January." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Let us start spark context for this Notebook so that we can execute the code provided. 
You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from pyspark.sql import SparkSession\n", 26 | "\n", 27 | "import getpass\n", 28 | "username = getpass.getuser()\n", 29 | "\n", 30 | "spark = SparkSession. \\\n", 31 | " builder. \\\n", 32 | " config('spark.ui.port', '0'). \\\n", 33 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 34 | " enableHiveSupport(). \\\n", 35 | " appName(f'{username} | Python - Data Processing - Overview'). \\\n", 36 | " master('yarn'). \\\n", 37 | " getOrCreate()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 45 | "\n", 46 | "**Using Spark SQL**\n", 47 | "\n", 48 | "```\n", 49 | "spark2-sql \\\n", 50 | " --master yarn \\\n", 51 | " --conf spark.ui.port=0 \\\n", 52 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 53 | "```\n", 54 | "\n", 55 | "**Using Scala**\n", 56 | "\n", 57 | "```\n", 58 | "spark2-shell \\\n", 59 | " --master yarn \\\n", 60 | " --conf spark.ui.port=0 \\\n", 61 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 62 | "```\n", 63 | "\n", 64 | "**Using Pyspark**\n", 65 | "\n", 66 | "```\n", 67 | "pyspark2 \\\n", 68 | " --master yarn \\\n", 69 | " --conf spark.ui.port=0 \\\n", 70 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 71 | "```" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "pycharm": { 79 | "name": "#%%\n" 80 | } 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "airlines_path = \"/public/airlines_all/airlines-part/flightmonth=200801\"\n", 85 | "\n", 86 | "airlines = spark. \\\n", 87 | " read. \\\n", 88 | " parquet(airlines_path)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "airlines.count()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "airlines.printSchema()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "* Get number of records - `airlines.count()`\n", 114 | "* Go through the list of columns and understand the purpose of them.\n", 115 | " * Year\n", 116 | " * Month\n", 117 | " * DayOfMonth\n", 118 | " * CRSDepTime - Scheduled Departure Time\n", 119 | " * DepTime - Actual Departure Time.\n", 120 | " * DepDelay - Departure Delay in Minutes\n", 121 | " * CRSArrTime - Scheduled Arrival Time\n", 122 | " * ArrTime - Actual Arrival Time.\n", 123 | " * ArrDelay - Arrival Delay in Minutes.\n", 124 | " * UniqueCarrier - Carrier or Airlines\n", 125 | " * FlightNum - Flight Number\n", 126 | " * Distance - Distance between Origin and Destination\n", 127 | " * IsDepDelayed - this is set to yes for those flights where departure is delayed.\n", 128 | " * IsArrDelayed -- this is set to yes for those flights where arrival is delayed.\n", 129 | "* Get number of unique origins" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "pycharm": { 137 | "name": "#%%\n" 138 | } 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "airlines. 
\\\n", 143 | " select(\"Origin\"). \\\n", 144 | " distinct(). \\\n", 145 | " count()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "* Get number of unique destinations" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "pycharm": { 160 | "name": "#%%\n" 161 | } 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "airlines. \\\n", 166 | " select(\"Dest\"). \\\n", 167 | " distinct(). \\\n", 168 | " count()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "* Get all unique carriers" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "airlines. \\\n", 185 | " select('UniqueCarrier'). \\\n", 186 | " distinct(). \\\n", 187 | " show()" 188 | ] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 3", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.6.8" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 4 212 | } 213 | -------------------------------------------------------------------------------- /03_data_processing_overview/14_conclusion_data_processing_overview.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Conclusion\n", 8 | "Let us recap about key takeaways from this module.\n", 9 | "* APIs to read the data from files into Data Frame.\n", 10 | "* Previewing Schema and the data in Data Frame.\n", 11 | "* Overview of Data Frame APIs and Functions\n", 12 | "* Writing data from Data Frame into Files\n", 13 | "* Reorganizing the airlines data by month\n", 14 | "* Simple APIs to analyze the data.\n", 15 | "Now it is time for us to deep dive into APIs to perform all the standard transformations as part of Data Processing.\n" 16 | ] 17 | } 18 | ], 19 | "metadata": { 20 | "kernelspec": { 21 | "display_name": "Python 3", 22 | "language": "python", 23 | "name": "python3" 24 | }, 25 | "language_info": { 26 | "codemirror_mode": { 27 | "name": "ipython", 28 | "version": 3 29 | }, 30 | "file_extension": ".py", 31 | "mimetype": "text/x-python", 32 | "name": "python", 33 | "nbconvert_exporter": "python", 34 | "pygments_lexer": "ipython3", 35 | "version": "3.6.8" 36 | } 37 | }, 38 | "nbformat": 4, 39 | "nbformat_minor": 4 40 | } 41 | -------------------------------------------------------------------------------- /04_processing_column_data/01_processing_column_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Processing Column Data\n", 8 | "\n", 9 | "As part of this module we will explore the functions available under `org.apache.spark.sql.functions` to derive new values from existing column values with in a Data Frame.\n", 10 | "\n", 11 | "* Pre-defined Functions\n", 12 | "* Create Dummy Data Frame\n", 13 | "* Categories of Functions\n", 14 | "* Special Functions - col and lit\n", 15 | "* String Manipulation Functions - 
1\n", 16 | "* String Manipulation Functions - 2\n", 17 | "* Date and Time Overview\n", 18 | "* Date and Time Arithmetic\n", 19 | "* Date and Time - trunc and date_trunc\n", 20 | "* Date and Time - Extracting Information\n", 21 | "* Dealing with Unix Timestamp\n", 22 | "* Example - Word Count\n", 23 | "* Conclusion" 24 | ] 25 | } 26 | ], 27 | "metadata": { 28 | "kernelspec": { 29 | "display_name": "Pyspark 2", 30 | "language": "python", 31 | "name": "pyspark2" 32 | }, 33 | "language_info": { 34 | "codemirror_mode": { 35 | "name": "ipython", 36 | "version": 3 37 | }, 38 | "file_extension": ".py", 39 | "mimetype": "text/x-python", 40 | "name": "python", 41 | "nbconvert_exporter": "python", 42 | "pygments_lexer": "ipython3", 43 | "version": "3.6.12" 44 | }, 45 | "name": "04 Processing Column Data", 46 | "notebookId": 2221802076484304, 47 | "pycharm": { 48 | "stem_cell": { 49 | "cell_type": "raw", 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "source": [ 54 | "\n", 55 | "\n", 56 | "\n" 57 | ] 58 | } 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 4 63 | } 64 | -------------------------------------------------------------------------------- /04_processing_column_data/04_categories_of_functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Categories of Functions\n", 8 | "\n", 9 | "There are approximately 300 functions under `pyspark.sql.functions`. At a higher level they can be grouped into a few categories." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "\n" 25 | ], 26 | "text/plain": [ 27 | "" 28 | ] 29 | }, 30 | "metadata": {}, 31 | "output_type": "display_data" 32 | } 33 | ], 34 | "source": [ 35 | "%%HTML\n", 36 | "" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "* String Manipulation Functions\n", 44 | " * Case Conversion - `lower`, `upper`\n", 45 | " * Getting Length - `length`\n", 46 | " * Extracting substrings - `substring`, `split`\n", 47 | " * Trimming - `trim`, `ltrim`, `rtrim`\n", 48 | " * Padding - `lpad`, `rpad`\n", 49 | " * Concatenating string - `concat`, `concat_ws`\n", 50 | "* Date Manipulation Functions\n", 51 | " * Getting current date and time - `current_date`, `current_timestamp`\n", 52 | " * Date Arithmetic - `date_add`, `date_sub`, `datediff`, `months_between`, `add_months`, `next_day`\n", 53 | " * Beginning and Ending Date or Time - `last_day`, `trunc`, `date_trunc`\n", 54 | " * Formatting Date - `date_format`\n", 55 | " * Extracting Information - `dayofyear`, `dayofmonth`, `dayofweek`, `year`, `month`\n", 56 | "* Aggregate Functions\n", 57 | " * `count`, `countDistinct`\n", 58 | " * `sum`, `avg`\n", 59 | " * `min`, `max`\n", 60 | "* Other Functions - We will explore depending on the use cases.\n", 61 | " * `CASE` and `WHEN`\n", 62 | " * `CAST` for type casting\n", 63 | " * Functions to manage special types such as `ARRAY`, `MAP`, `STRUCT` type columns\n", 64 | " * Many others" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 
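To make the categories above concrete, here is a small illustrative sketch (the single-row DataFrame is made up) applying one function from each of the first three categories:

```
from pyspark.sql.functions import upper, length, current_date, date_format, count, lit

df = spark.createDataFrame([("Scott", "Tiger")], schema="first_name STRING, last_name STRING")

df.select(upper('first_name'), length('last_name')).show()    # string manipulation
df.select(date_format(current_date(), 'yyyyMMdd')).show()     # date manipulation
df.groupBy().agg(count(lit(1)).alias('record_count')).show()  # aggregate function
```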
83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.6.12" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 4 96 | } 97 | -------------------------------------------------------------------------------- /04_processing_column_data/11_date_and_time_manipulation_functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Date and Time Manipulation Functions\n", 8 | "Let us get started with Date and Time manipulation functions. As part of this topic we will focus on the date and timestamp format." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": { 15 | "tags": [ 16 | "remove-cell" 17 | ] 18 | }, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/html": [ 23 | "\n" 24 | ], 25 | "text/plain": [ 26 | "" 27 | ] 28 | }, 29 | "metadata": {}, 30 | "output_type": "display_data" 31 | } 32 | ], 33 | "source": [ 34 | "%%HTML\n", 35 | "" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "* We can use `current_date` to get today’s server date. \n", 43 | " * Date will be returned using **yyyy-MM-dd** format.\n", 44 | "* We can use `current_timestamp` to get current server time. \n", 45 | " * Timestamp will be returned using **yyyy-MM-dd HH:mm:ss:SSS** format.\n", 46 | " * Hours will be by default in 24 hour format." 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 1, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "from pyspark.sql import SparkSession\n", 63 | "\n", 64 | "import getpass\n", 65 | "username = getpass.getuser()\n", 66 | "\n", 67 | "spark = SparkSession. \\\n", 68 | " builder. \\\n", 69 | " config('spark.ui.port', '0'). \\\n", 70 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 71 | " enableHiveSupport(). \\\n", 72 | " appName(f'{username} | Python - Processing Column Data'). \\\n", 73 | " master('yarn'). 
\\\n", 74 | " getOrCreate()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 82 | "\n", 83 | "**Using Spark SQL**\n", 84 | "\n", 85 | "```\n", 86 | "spark2-sql \\\n", 87 | " --master yarn \\\n", 88 | " --conf spark.ui.port=0 \\\n", 89 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 90 | "```\n", 91 | "\n", 92 | "**Using Scala**\n", 93 | "\n", 94 | "```\n", 95 | "spark2-shell \\\n", 96 | " --master yarn \\\n", 97 | " --conf spark.ui.port=0 \\\n", 98 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 99 | "```\n", 100 | "\n", 101 | "**Using Pyspark**\n", 102 | "\n", 103 | "```\n", 104 | "pyspark2 \\\n", 105 | " --master yarn \\\n", 106 | " --conf spark.ui.port=0 \\\n", 107 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 108 | "```" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 2, 114 | "metadata": { 115 | "pycharm": { 116 | "name": "#%%\n" 117 | } 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "l = [(\"X\", )]" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 3, 127 | "metadata": { 128 | "pycharm": { 129 | "name": "#%%\n" 130 | } 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "df = spark.createDataFrame(l).toDF(\"dummy\")" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 4, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "+-----+\n", 147 | "|dummy|\n", 148 | "+-----+\n", 149 | "| X|\n", 150 | "+-----+\n", 151 | "\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "df.show()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 5, 162 | "metadata": { 163 | "pycharm": { 164 | "name": "#%%\n" 165 | } 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "from pyspark.sql.functions import current_date, current_timestamp" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 6, 175 | "metadata": { 176 | "pycharm": { 177 | "name": "#%%\n" 178 | } 179 | }, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "+--------------+\n", 186 | "|current_date()|\n", 187 | "+--------------+\n", 188 | "| 2021-02-28|\n", 189 | "+--------------+\n", 190 | "\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "df.select(current_date()).show() #yyyy-MM-dd" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 7, 201 | "metadata": { 202 | "pycharm": { 203 | "name": "#%%\n" 204 | } 205 | }, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "+-----------------------+\n", 212 | "|current_timestamp() |\n", 213 | "+-----------------------+\n", 214 | "|2021-02-28 18:34:08.548|\n", 215 | "+-----------------------+\n", 216 | "\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "df.select(current_timestamp()).show(truncate=False) #yyyy-MM-dd HH:mm:ss.SSS" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "* We can convert a string which contain date or timestamp in non-standard format to standard date or time using `to_date` or `to_timestamp` function respectively." 
229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 8, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "from pyspark.sql.functions import lit, to_date, to_timestamp" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 9, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "+----------+\n", 250 | "| to_date|\n", 251 | "+----------+\n", 252 | "|2021-02-28|\n", 253 | "+----------+\n", 254 | "\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "df.select(to_date(lit('20210228'), 'yyyyMMdd').alias('to_date')).show()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 10, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "name": "stdout", 269 | "output_type": "stream", 270 | "text": [ 271 | "+-------------------+\n", 272 | "| to_timestamp|\n", 273 | "+-------------------+\n", 274 | "|2021-02-28 17:25:00|\n", 275 | "+-------------------+\n", 276 | "\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "df.select(to_timestamp(lit('20210228 1725'), 'yyyyMMdd HHmm').alias('to_timestamp')).show()" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [] 290 | } 291 | ], 292 | "metadata": { 293 | "kernelspec": { 294 | "display_name": "Pyspark 2", 295 | "language": "python", 296 | "name": "pyspark2" 297 | }, 298 | "language_info": { 299 | "codemirror_mode": { 300 | "name": "ipython", 301 | "version": 3 302 | }, 303 | "file_extension": ".py", 304 | "mimetype": "text/x-python", 305 | "name": "python", 306 | "nbconvert_exporter": "python", 307 | "pygments_lexer": "ipython3", 308 | "version": "3.6.12" 309 | } 310 | }, 311 | "nbformat": 4, 312 | "nbformat_minor": 4 313 | } 314 | -------------------------------------------------------------------------------- /04_processing_column_data/20_conclusion_predefined_functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Conclusion\n", 8 | "\n", 9 | "As part of this module we have gone through list of functions that can be applied on top of columns for row level transformations.\n", 10 | "\n", 11 | "* There are approximately 300 pre-defined functions.\n", 12 | "* Functions can be broadly categorized into String Manipulation Functions, Date Manipulation Functions, Numeric Functions etc.\n", 13 | "* Typically when we read data from source, we get data in the form of strings and we need to apply functions to apply standardization rules, data type conversion, transformation rules etc.\n", 14 | "* Most of these functions can be used while projection using `select`, `selectExpr`, `withColumn` etc as well as part of `filter` or `where`, `groupBy`, `orderBy` or `sort` etc.\n", 15 | "* For `selectExpr` we need to use the functions using SQL Style syntax.\n", 16 | "* There are special functions such as `col` and `lit`. `col` is used to pass column names as column type for some of the functions while `lit` is used to pass literals as values as part of expressions (eg: `concat($\"first_name\", lit(\", \"), $\"last_name\")`)." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Let us start spark context for this Notebook so that we can execute the code provided. 
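Note that the `$"first_name"` form in the concat example above is Scala syntax; an equivalent PySpark sketch (using the employeesDF created below in this notebook) is:

```
from pyspark.sql.functions import col, lit, concat

employeesDF. \
    select(concat(col('first_name'), lit(', '), col('last_name')).alias('full_name')). \
    show()
```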
You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from pyspark.sql import SparkSession\n", 33 | "\n", 34 | "import getpass\n", 35 | "username = getpass.getuser()\n", 36 | "\n", 37 | "spark = SparkSession. \\\n", 38 | " builder. \\\n", 39 | " config('spark.ui.port', '0'). \\\n", 40 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 41 | " enableHiveSupport(). \\\n", 42 | " appName(f'{username} | Python - Processing Column Data'). \\\n", 43 | " master('yarn'). \\\n", 44 | " getOrCreate()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 52 | "\n", 53 | "**Using Spark SQL**\n", 54 | "\n", 55 | "```\n", 56 | "spark2-sql \\\n", 57 | " --master yarn \\\n", 58 | " --conf spark.ui.port=0 \\\n", 59 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 60 | "```\n", 61 | "\n", 62 | "**Using Scala**\n", 63 | "\n", 64 | "```\n", 65 | "spark2-shell \\\n", 66 | " --master yarn \\\n", 67 | " --conf spark.ui.port=0 \\\n", 68 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 69 | "```\n", 70 | "\n", 71 | "**Using Pyspark**\n", 72 | "\n", 73 | "```\n", 74 | "pyspark2 \\\n", 75 | " --master yarn \\\n", 76 | " --conf spark.ui.port=0 \\\n", 77 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 78 | "```" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "employees = [(1, \"Scott\", \"Tiger\", 1000.0, 10,\n", 88 | " \"united states\", \"+1 123 456 7890\", \"123 45 6789\"\n", 89 | " ),\n", 90 | " (2, \"Henry\", \"Ford\", 1250.0, None,\n", 91 | " \"India\", \"+91 234 567 8901\", \"456 78 9123\"\n", 92 | " ),\n", 93 | " (3, \"Nick\", \"Junior\", 750.0, '',\n", 94 | " \"united KINGDOM\", \"+44 111 111 1111\", \"222 33 4444\"\n", 95 | " ),\n", 96 | " (4, \"Bill\", \"Gomes\", 1500.0, 10,\n", 97 | " \"AUSTRALIA\", \"+61 987 654 3210\", \"789 12 6118\"\n", 98 | " )\n", 99 | " ]" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "employeesDF = spark. \\\n", 109 | " createDataFrame(employees,\n", 110 | " schema=\"\"\"employee_id INT, first_name STRING, \n", 111 | " last_name STRING, salary FLOAT, bonus STRING, nationality STRING,\n", 112 | " phone_number STRING, ssn STRING\"\"\"\n", 113 | " )" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "employeesDF.show()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "from pyspark.sql.functions import coalesce, lit" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "employeesDF.withColumn('payment', col('salary') + (col('salary') * coalesce(col('bonus').cast('int'), lit(0)) / 100)). 
\\\n", 141 | " show()" 142 | ] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.6.8" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 4 166 | } 167 | -------------------------------------------------------------------------------- /04_processing_column_data/21_exercises_predefined_functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | } 10 | ], 11 | "metadata": { 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.6.8" 28 | } 29 | }, 30 | "nbformat": 4, 31 | "nbformat_minor": 4 32 | } 33 | -------------------------------------------------------------------------------- /05_basic_transformations/01_basic_transformations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Basic Transformations\n", 8 | "\n", 9 | "Let us deep dive about how we can perform basic transformations using Spark Dataframe APIs.\n", 10 | "\n", 11 | "* Overview of Basic Transformations\n", 12 | "* Overview of filtering\n", 13 | "* Total Aggregations\n", 14 | "* Grouped Aggregations\n", 15 | "* Overview of Sorting\n", 16 | "* Solutions - Problem 1\n", 17 | "* Solutions - Problem 2\n", 18 | "* Solutions - Problem 3" 19 | ] 20 | } 21 | ], 22 | "metadata": { 23 | "kernelspec": { 24 | "display_name": "Pyspark 2", 25 | "language": "python", 26 | "name": "pyspark2" 27 | }, 28 | "language_info": { 29 | "codemirror_mode": { 30 | "name": "ipython", 31 | "version": 3 32 | }, 33 | "file_extension": ".py", 34 | "mimetype": "text/x-python", 35 | "name": "python", 36 | "nbconvert_exporter": "python", 37 | "pygments_lexer": "ipython3", 38 | "version": "3.6.8" 39 | }, 40 | "pycharm": { 41 | "stem_cell": { 42 | "cell_type": "raw", 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "source": [ 47 | "\n", 48 | "\n", 49 | "\n" 50 | ] 51 | } 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 4 56 | } 57 | -------------------------------------------------------------------------------- /05_basic_transformations/02_overview_of_basic_transformations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of Basic Transformations\n", 8 | "\n", 9 | "Let us define problem statements to learn more about Data Frame APIs. We will try to cover filtering, aggregations and sorting as part of solutions for these problem statements." 
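As an illustrative sketch of those three operations together (assuming the 2008 January partition used in the earlier module and YES/NO values in the delay flags):

```
from pyspark.sql.functions import col, count, lit, sum, when

airlines = spark.read.parquet('/public/airlines_all/airlines-part/flightmonth=200801')

airlines. \
    filter("IsDepDelayed = 'YES' OR IsArrDelayed = 'YES'"). \
    groupBy('UniqueCarrier'). \
    agg(
        count(lit(1)).alias('FlightCount'),
        sum(when(col('IsDepDelayed') == 'YES', 1).otherwise(0)).alias('DepDelayedCount'),
        sum(when(col('IsArrDelayed') == 'YES', 1).otherwise(0)).alias('ArrDelayedCount')
    ). \
    orderBy(col('FlightCount').desc()). \
    show()
```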
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "\n" 25 | ], 26 | "text/plain": [ 27 | "" 28 | ] 29 | }, 30 | "metadata": {}, 31 | "output_type": "display_data" 32 | } 33 | ], 34 | "source": [ 35 | "%%HTML\n", 36 | "" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "* Get total number of flights as well as number of flights which are delayed in departure and number of flights delayed in arrival. \n", 44 | " * Output should contain 3 columns - **FlightCount**, **DepDelayedCount**, **ArrDelayedCount**\n", 45 | "* Get number of flights which are delayed in departure and number of flights delayed in arrival for each day along with number of flights departed for each day. \n", 46 | " * Output should contain 4 columns - **FlightDate**, **FlightCount**, **DepDelayedCount**, **ArrDelayedCount**\n", 47 | " * **FlightDate** should be of **yyyy-MM-dd** format.\n", 48 | " * Data should be **sorted** in ascending order by **flightDate**\n", 49 | "* Get all the flights which are departed late but arrived early (**IsArrDelayed is NO**).\n", 50 | " * Output should contain - **FlightCRSDepTime**, **UniqueCarrier**, **FlightNum**, **Origin**, **Dest**, **DepDelay**, **ArrDelay**\n", 51 | " * **FlightCRSDepTime** need to be computed using **Year**, **Month**, **DayOfMonth**, **CRSDepTime**\n", 52 | " * **FlightCRSDepTime** should be displayed using **yyyy-MM-dd HH:mm** format.\n", 53 | " * Output should be sorted by **FlightCRSDepTime** and then by the difference between **DepDelay** and **ArrDelay**\n", 54 | " * Also get the count of such flights" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [] 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "Pyspark 2", 68 | "language": "python", 69 | "name": "pyspark2" 70 | }, 71 | "language_info": { 72 | "codemirror_mode": { 73 | "name": "ipython", 74 | "version": 3 75 | }, 76 | "file_extension": ".py", 77 | "mimetype": "text/x-python", 78 | "name": "python", 79 | "nbconvert_exporter": "python", 80 | "pygments_lexer": "ipython3", 81 | "version": "3.6.12" 82 | } 83 | }, 84 | "nbformat": 4, 85 | "nbformat_minor": 4 86 | } 87 | -------------------------------------------------------------------------------- /06_joining_data_sets/01_joining_data_sets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Joining Data Sets\n", 8 | "\n", 9 | "Let us understand how to join multiple Data Sets using Spark based APIs.\n", 10 | "* Prepare Data Sets for joins\n", 11 | "* Analyze Data Sets for joins\n", 12 | "* Problem Statements for Joins\n", 13 | "* Overview of Joins\n", 14 | "* Solutions for the problem statements" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Pyspark 2", 28 | "language": "python", 29 | "name": "pyspark2" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | 
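Ahead of the problem statements and join overview that follow, here is a hedged sketch of a typical join between the 2008 January airtraffic data and airport-codes; the airport-codes path, separator and column names (IATA, State) are assumptions for illustration only:

```
from pyspark.sql.functions import count, lit, col

airtraffic = spark.read.parquet('/public/airlines_all/airlines-part/flightmonth=200801')

# Path, separator and column names below are assumed; adjust to the actual data set
airport_codes = spark.read. \
    csv('/public/airlines_all/airport-codes', sep='\t', header=True, inferSchema=True)

airtraffic. \
    join(airport_codes, airtraffic['Origin'] == airport_codes['IATA'], 'inner'). \
    groupBy('State'). \
    agg(count(lit(1)).alias('FlightCount')). \
    orderBy(col('FlightCount').desc()). \
    show()
```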
"pygments_lexer": "ipython3", 41 | "version": "3.6.8" 42 | }, 43 | "pycharm": { 44 | "stem_cell": { 45 | "cell_type": "raw", 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "source": [] 50 | } 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 4 55 | } 56 | -------------------------------------------------------------------------------- /06_joining_data_sets/04_problem_statements_for_joins.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Problem Statements for Joins\n", 8 | "\n", 9 | "Let us understand how to join Data Frames by using some problem statements. We will use 2008 January airtraffic data along with Airport Codes." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "\n" 25 | ], 26 | "text/plain": [ 27 | "" 28 | ] 29 | }, 30 | "metadata": {}, 31 | "output_type": "display_data" 32 | } 33 | ], 34 | "source": [ 35 | "%%HTML\n", 36 | "" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "* Get number of flights departed from each of the US airport.\n", 44 | "* Get number of flights departed from each of the state.\n", 45 | "* Get the list of airports in the US from which flights are not departed.\n", 46 | "* Check if there are any origins in airtraffic data which do not have record in airport-codes.\n", 47 | "* Get the total number of flights from the airports that do not contain entries in airport-codes.\n", 48 | "* Get the total number of flights per airport that do not contain entries in airport-codes." 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "Pyspark 2", 62 | "language": "python", 63 | "name": "pyspark2" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.6.12" 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 4 80 | } 81 | -------------------------------------------------------------------------------- /06_joining_data_sets/05_overview_of_joins.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of Joins\n", 8 | "\n", 9 | "Let us get an overview of joining Data Frames." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "\n" 25 | ], 26 | "text/plain": [ 27 | "" 28 | ] 29 | }, 30 | "metadata": {}, 31 | "output_type": "display_data" 32 | } 33 | ], 34 | "source": [ 35 | "%%HTML\n", 36 | "" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "* Our data cannot be stored in one table. 
It will be stored in multiple tables and the tables might be related.\n", 44 | " * When it comes to transactional systems, we typically define tables based on Normalization Principles.\n", 45 | " * When it comes to data warehousing applications, we typically define tables using Dimensional Modeling.\n", 46 | " * With either approach, data is scattered into multiple tables and relationships are defined.\n", 47 | " * Typically tables are related with one to one, one to many, many to many relationships.\n", 48 | "* When we have 2 Data Sets that are related based on a common key, we typically perform a join.\n", 49 | "* There are different types of joins.\n", 50 | " * INNER JOIN\n", 51 | " * OUTER JOIN (LEFT or RIGHT)\n", 52 | " * FULL OUTER JOIN (a LEFT OUTER JOIN b UNION a RIGHT OUTER JOIN b)\n", 53 | " " 54 | ] 55 | } 56 | ], 57 | "metadata": { 58 | "kernelspec": { 59 | "display_name": "Pyspark 2", 60 | "language": "python", 61 | "name": "pyspark2" 62 | }, 63 | "language_info": { 64 | "codemirror_mode": { 65 | "name": "ipython", 66 | "version": 3 67 | }, 68 | "file_extension": ".py", 69 | "mimetype": "text/x-python", 70 | "name": "python", 71 | "nbconvert_exporter": "python", 72 | "pygments_lexer": "ipython3", 73 | "version": "3.6.12" 74 | } 75 | }, 76 | "nbformat": 4, 77 | "nbformat_minor": 4 78 | } 79 | -------------------------------------------------------------------------------- /07_windowing_functions/01_windowing_functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Windowing Functions\n", 8 | "\n", 9 | "As part of this module let us get into Windowing Functions." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Starting Spark Context\n", 17 | "\n", 18 | "Let us start spark context for this Notebook so that we can execute the code provided." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from pyspark.sql import SparkSession\n", 28 | "\n", 29 | "import getpass\n", 30 | "username = getpass.getuser()\n", 31 | "\n", 32 | "spark = SparkSession. \\\n", 33 | " builder. \\\n", 34 | " config('spark.ui.port', '0'). \\\n", 35 | " enableHiveSupport(). \\\n", 36 | " appName(f'{username} | Python - Windowing Functions'). \\\n", 37 | " master('yarn'). 
\\\n", 38 | " getOrCreate()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "spark.conf.set('spark.sql.shuffle.partitions', '2')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Pyspark 2", 61 | "language": "python", 62 | "name": "pyspark2" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 3 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython3", 74 | "version": "3.6.12" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 4 79 | } 80 | -------------------------------------------------------------------------------- /08_spark_metastore/01_spark_metastore.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Spark Metastore\n", 8 | "\n", 9 | "Let us understand how to interact with metastore tables using Spark based APIs." 10 | ] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "Pyspark 2", 16 | "language": "python", 17 | "name": "pyspark2" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 3 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython3", 29 | "version": "3.6.12" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 4 34 | } 35 | -------------------------------------------------------------------------------- /08_spark_metastore/02_overview_of_spark_metastore.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview of Spark Metastore\n", 8 | "\n", 9 | "Let us get an overview of Spark Metastore and how we can leverage it to manage databases and tables on top of Big Data based file systems such as HDFS, s3 etc." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "tags": [ 17 | "remove-cell" 18 | ] 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "\n" 25 | ], 26 | "text/plain": [ 27 | "" 28 | ] 29 | }, 30 | "metadata": {}, 31 | "output_type": "display_data" 32 | } 33 | ], 34 | "source": [ 35 | "%%HTML\n", 36 | "" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "* Quite often we need to deal with structured data and the most popular way of processing structured data is by using Databases, Tables and then SQL.\n", 44 | "* Spark Metastore (similar to Hive Metastore) will facilitate us to manage databases and tables.\n", 45 | "* Typically Metastore is setup using traditional relational database technologies such as **Oracle**, **MySQL**, **Postgres** etc." 
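As a small sketch of what that looks like in practice (the database and table names below are made up, and a session with Hive support enabled is assumed):

```
# Create a database and a metastore-backed table, then query it with Spark SQL
spark.sql('CREATE DATABASE IF NOT EXISTS demo_db')
spark.catalog.setCurrentDatabase('demo_db')

df = spark.createDataFrame([(1, 'Scott'), (2, 'Henry')], schema='id INT, name STRING')
df.write.mode('overwrite').saveAsTable('employees_demo')   # table metadata goes to the metastore

spark.catalog.listTables()                                 # inspect tables tracked by the metastore
spark.sql('SELECT * FROM employees_demo').show()
```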
46 | ] 47 | } 48 | ], 49 | "metadata": { 50 | "kernelspec": { 51 | "display_name": "Python 3", 52 | "language": "python", 53 | "name": "python3" 54 | }, 55 | "language_info": { 56 | "codemirror_mode": { 57 | "name": "ipython", 58 | "version": 3 59 | }, 60 | "file_extension": ".py", 61 | "mimetype": "text/x-python", 62 | "name": "python", 63 | "nbconvert_exporter": "python", 64 | "pygments_lexer": "ipython3", 65 | "version": "3.6.12" 66 | } 67 | }, 68 | "nbformat": 4, 69 | "nbformat_minor": 4 70 | } 71 | -------------------------------------------------------------------------------- /09_analyzing_gharchive_data/02_download_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "IOPub data rate exceeded.\n", 13 | "The notebook server will temporarily stop sending output\n", 14 | "to the client in order to avoid crashing it.\n", 15 | "To change this limit, set the config variable\n", 16 | "`--NotebookApp.iopub_data_rate_limit`.\n", 17 | "\n", 18 | "Current values:\n", 19 | "NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n", 20 | "NotebookApp.rate_limit_window=3.0 (secs)\n", 21 | "\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "%%sh\n", 27 | "\n", 28 | "wget https://data.gharchive.org/2021-01-13-{0..23}.json.gz -P /data/gharchive/\n", 29 | "wget https://data.gharchive.org/2021-01-14-{0..23}.json.gz -P /data/gharchive/\n", 30 | "wget https://data.gharchive.org/2021-01-15-{0..23}.json.gz -P /data/gharchive/\n", 31 | "wget https://data.gharchive.org/2021-01-16-{0..23}.json.gz -P /data/gharchive/" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [] 40 | } 41 | ], 42 | "metadata": { 43 | "kernelspec": { 44 | "display_name": "Python 3", 45 | "language": "python", 46 | "name": "python3" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 3 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython3", 58 | "version": "3.6.8" 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 4 63 | } 64 | -------------------------------------------------------------------------------- /09_analyzing_gharchive_data/05_overview_of_json.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "\n", 11 | "import getpass\n", 12 | "username = getpass.getuser()\n", 13 | "\n", 14 | "spark = SparkSession. \\\n", 15 | " builder. \\\n", 16 | " config('spark.ui.port', '0'). \\\n", 17 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 18 | " enableHiveSupport(). \\\n", 19 | " appName(f'{username} | Analyze GitHub Archive Data'). \\\n", 20 | " master('yarn'). 
\\\n", 21 | " getOrCreate()" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "%%sh\n", 31 | "\n", 32 | "hdfs dfs -ls /user/${USER}/itv-github/landing/" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 5, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "ghdata = spark.read.json(f'/user/{username}/itv-github/landing/2021-01-13-0.json.gz')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "ghdata.printSchema()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "ghdata.select('repo').show()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 8, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "root\n", 72 | " |-- repo: struct (nullable = true)\n", 73 | " | |-- id: long (nullable = true)\n", 74 | " | |-- name: string (nullable = true)\n", 75 | " | |-- url: string (nullable = true)\n", 76 | "\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "ghdata.select('repo').printSchema()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 9, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "+---------+--------------------+--------------------+\n", 94 | "| id| name| url|\n", 95 | "+---------+--------------------+--------------------+\n", 96 | "| 67224522| i-RIC/prepost-gui|https://api.githu...|\n", 97 | "|329141406| kaneda96/React-quiz|https://api.githu...|\n", 98 | "|221279833|archesproject/arc...|https://api.githu...|\n", 99 | "|182814691| Audentio/kinetic|https://api.githu...|\n", 100 | "| 4542716| NixOS/nixpkgs|https://api.githu...|\n", 101 | "|329130975| eterwin/schastota|https://api.githu...|\n", 102 | "|104382627|littlebizzy/slick...|https://api.githu...|\n", 103 | "|302490178| qmk/qmk_keyboards|https://api.githu...|\n", 104 | "|156042726|MaybeNotWrong/lc-sep|https://api.githu...|\n", 105 | "|329144511|direwolf-github/e...|https://api.githu...|\n", 106 | "| 91074692|zalando/postgres-...|https://api.githu...|\n", 107 | "|280011532| GeopJr/GeopJr|https://api.githu...|\n", 108 | "| 32481543|cBioPortal/cbiopo...|https://api.githu...|\n", 109 | "|270887418|feedarchive/freen...|https://api.githu...|\n", 110 | "|322448852|ehenn345/hf_helpe...|https://api.githu...|\n", 111 | "|325641835|machinegunhairy/P...|https://api.githu...|\n", 112 | "|189429001| mlysy/realPSD|https://api.githu...|\n", 113 | "|307762661|stevennguyen99/re...|https://api.githu...|\n", 114 | "|214051777|leighmcculloch/st...|https://api.githu...|\n", 115 | "| 97922418|leanprover-commun...|https://api.githu...|\n", 116 | "+---------+--------------------+--------------------+\n", 117 | "only showing top 20 rows\n", 118 | "\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "ghdata.select('repo.id', 'repo.name', 'repo.url').show()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "ghdata.select('created_at', 'repo.*').show()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 11, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "root\n", 145 | " |-- commits: 
array (nullable = true)\n", 146 | " | |-- element: struct (containsNull = true)\n", 147 | " | | |-- author: struct (nullable = true)\n", 148 | " | | | |-- email: string (nullable = true)\n", 149 | " | | | |-- name: string (nullable = true)\n", 150 | " | | |-- distinct: boolean (nullable = true)\n", 151 | " | | |-- message: string (nullable = true)\n", 152 | " | | |-- sha: string (nullable = true)\n", 153 | " | | |-- url: string (nullable = true)\n", 154 | "\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "ghdata.select('payload.commits').printSchema()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 14, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "data": { 169 | "text/plain": [ 170 | "90911" 171 | ] 172 | }, 173 | "execution_count": 14, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "ghdata.count()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 13, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | "root\n", 192 | " |-- commits: struct (nullable = true)\n", 193 | " | |-- author: struct (nullable = true)\n", 194 | " | | |-- email: string (nullable = true)\n", 195 | " | | |-- name: string (nullable = true)\n", 196 | " | |-- distinct: boolean (nullable = true)\n", 197 | " | |-- message: string (nullable = true)\n", 198 | " | |-- sha: string (nullable = true)\n", 199 | " | |-- url: string (nullable = true)\n", 200 | "\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "from pyspark.sql.functions import explode\n", 206 | "ghdata. \\\n", 207 | " select(explode('payload.commits').alias('commits')). \\\n", 208 | " printSchema()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 15, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "75708" 220 | ] 221 | }, 222 | "execution_count": 15, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "from pyspark.sql.functions import explode\n", 229 | "ghdata. \\\n", 230 | " select(explode('payload.commits').alias('commits')). \\\n", 231 | " count()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 17, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "119495" 243 | ] 244 | }, 245 | "execution_count": 17, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "from pyspark.sql.functions import explode_outer\n", 252 | "ghdata. \\\n", 253 | " select(explode_outer('payload.commits').alias('commits')). 
\\\n", 254 | " count()" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [] 263 | } 264 | ], 265 | "metadata": { 266 | "kernelspec": { 267 | "display_name": "Pyspark 2", 268 | "language": "python", 269 | "name": "pyspark2" 270 | }, 271 | "language_info": { 272 | "codemirror_mode": { 273 | "name": "ipython", 274 | "version": 3 275 | }, 276 | "file_extension": ".py", 277 | "mimetype": "text/x-python", 278 | "name": "python", 279 | "nbconvert_exporter": "python", 280 | "pygments_lexer": "ipython3", 281 | "version": "3.6.8" 282 | } 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 4 286 | } 287 | -------------------------------------------------------------------------------- /09_analyzing_gharchive_data/06_get_new_repositories.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "\n", 11 | "import getpass\n", 12 | "username = getpass.getuser()\n", 13 | "\n", 14 | "spark = SparkSession. \\\n", 15 | " builder. \\\n", 16 | " config('spark.ui.port', '0'). \\\n", 17 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 18 | " enableHiveSupport(). \\\n", 19 | " appName(f'{username} | Analyze GitHub Archive Data'). \\\n", 20 | " master('yarn'). \\\n", 21 | " getOrCreate()" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "ghdata = spark.read.table(f'{username}_raw.ghactivity')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "ghdata.printSchema()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 5, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "+-----------------------------+\n", 52 | "|type |\n", 53 | "+-----------------------------+\n", 54 | "|PullRequestReviewEvent |\n", 55 | "|PushEvent |\n", 56 | "|GollumEvent |\n", 57 | "|ReleaseEvent |\n", 58 | "|CommitCommentEvent |\n", 59 | "|CreateEvent |\n", 60 | "|PullRequestReviewCommentEvent|\n", 61 | "|IssueCommentEvent |\n", 62 | "|DeleteEvent |\n", 63 | "|IssuesEvent |\n", 64 | "|ForkEvent |\n", 65 | "|PublicEvent |\n", 66 | "|MemberEvent |\n", 67 | "|WatchEvent |\n", 68 | "|PullRequestEvent |\n", 69 | "+-----------------------------+\n", 70 | "\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "ghdata. \\\n", 76 | " select('type'). \\\n", 77 | " distinct(). \\\n", 78 | " show(truncate=False)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "+----------+\n", 91 | "| ref_type|\n", 92 | "+----------+\n", 93 | "| null|\n", 94 | "| tag|\n", 95 | "| branch|\n", 96 | "|repository|\n", 97 | "+----------+\n", 98 | "\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "ghdata. \\\n", 104 | " select('payload.ref_type'). \\\n", 105 | " distinct(). 
\\\n", 106 | " show()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 8, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "+----------+-----------+\n", 119 | "| ref_type|event_count|\n", 120 | "+----------+-----------+\n", 121 | "| null| 8550769|\n", 122 | "| branch| 1183829|\n", 123 | "|repository| 438739|\n", 124 | "| tag| 131003|\n", 125 | "+----------+-----------+\n", 126 | "\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "from pyspark.sql.functions import count, col, lit\n", 132 | "\n", 133 | "ghdata. \\\n", 134 | " groupBy('payload.ref_type'). \\\n", 135 | " agg(count(lit(1)).alias('event_count')). \\\n", 136 | " orderBy(col('event_count').desc()). \\\n", 137 | " show()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "from pyspark.sql.functions import substring, count, col, lit\n", 147 | "\n", 148 | "ghdata. \\\n", 149 | " filter('payload.ref_type = \"repository\" AND type = \"CreateEvent\"'). \\\n", 150 | " groupBy(substring('created_at', 1, 10).alias('created_dt')). \\\n", 151 | " agg(count(lit(1)).alias('repo_count')). \\\n", 152 | " orderBy('created_dt'). \\\n", 153 | " show()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [] 162 | } 163 | ], 164 | "metadata": { 165 | "kernelspec": { 166 | "display_name": "Pyspark 2", 167 | "language": "python", 168 | "name": "pyspark2" 169 | }, 170 | "language_info": { 171 | "codemirror_mode": { 172 | "name": "ipython", 173 | "version": 3 174 | }, 175 | "file_extension": ".py", 176 | "mimetype": "text/x-python", 177 | "name": "python", 178 | "nbconvert_exporter": "python", 179 | "pygments_lexer": "ipython3", 180 | "version": "3.6.8" 181 | } 182 | }, 183 | "nbformat": 4, 184 | "nbformat_minor": 4 185 | } 186 | -------------------------------------------------------------------------------- /09_analyzing_gharchive_data/07_get_repository_pushes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "\n", 11 | "import getpass\n", 12 | "username = getpass.getuser()\n", 13 | "\n", 14 | "spark = SparkSession. \\\n", 15 | " builder. \\\n", 16 | " config('spark.ui.port', '0'). \\\n", 17 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 18 | " enableHiveSupport(). \\\n", 19 | " appName(f'{username} | Analyze GitHub Archive Data'). \\\n", 20 | " master('yarn'). 
\\\n", 21 | " getOrCreate()" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "ghdata = spark.read.table(f'{username}_raw.ghactivity')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "ghdata.printSchema()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "ghdata.count()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "from pyspark.sql.functions import count, countDistinct\n", 58 | "ghdata. \\\n", 59 | " filter('payload.push_id IS NOT NULL'). \\\n", 60 | " select(\n", 61 | " count('payload.push_id'), \n", 62 | " countDistinct('payload.push_id'),\n", 63 | " countDistinct('repo.id')\n", 64 | " ). \\\n", 65 | " show()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "ghdata. \\\n", 75 | " filter('payload.push_id IS NOT NULL'). \\\n", 76 | " select('payload.push_id'). \\\n", 77 | " distinct(). \\\n", 78 | " count()" 79 | ] 80 | } 81 | ], 82 | "metadata": { 83 | "kernelspec": { 84 | "display_name": "Pyspark 2", 85 | "language": "python", 86 | "name": "pyspark2" 87 | }, 88 | "language_info": { 89 | "codemirror_mode": { 90 | "name": "ipython", 91 | "version": 3 92 | }, 93 | "file_extension": ".py", 94 | "mimetype": "text/x-python", 95 | "name": "python", 96 | "nbconvert_exporter": "python", 97 | "pygments_lexer": "ipython3", 98 | "version": "3.6.8" 99 | } 100 | }, 101 | "nbformat": 4, 102 | "nbformat_minor": 4 103 | } 104 | -------------------------------------------------------------------------------- /09_analyzing_gharchive_data/99_ghdata_queries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "\n", 11 | "import getpass\n", 12 | "username = getpass.getuser()\n", 13 | "\n", 14 | "spark = SparkSession. \\\n", 15 | " builder. \\\n", 16 | " config('spark.ui.port', '0'). \\\n", 17 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 18 | " enableHiveSupport(). \\\n", 19 | " appName(f'{username} | Analyze GitHub Archive Data'). \\\n", 20 | " master('yarn'). 
\\\n", 21 | " getOrCreate()" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "%%sh\n", 31 | "\n", 32 | "hdfs dfs -ls /user/${USER}/itv-github/landing/" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "ghdata = spark.read.json(f'/user/{username}/itv-github/landing/2021-01-13-0.json.gz')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "ghdata.printSchema()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 7, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "+-----------------------------+-----+\n", 63 | "|type |count|\n", 64 | "+-----------------------------+-----+\n", 65 | "|PullRequestReviewEvent |2493 |\n", 66 | "|PushEvent |48569|\n", 67 | "|GollumEvent |227 |\n", 68 | "|ReleaseEvent |425 |\n", 69 | "|CommitCommentEvent |319 |\n", 70 | "|CreateEvent |11762|\n", 71 | "|PullRequestReviewCommentEvent|1727 |\n", 72 | "|IssueCommentEvent |6062 |\n", 73 | "|DeleteEvent |2812 |\n", 74 | "|IssuesEvent |2419 |\n", 75 | "|ForkEvent |1697 |\n", 76 | "|PublicEvent |376 |\n", 77 | "|MemberEvent |206 |\n", 78 | "|WatchEvent |4488 |\n", 79 | "|PullRequestEvent |7329 |\n", 80 | "+-----------------------------+-----+\n", 81 | "\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "ghdata. \\\n", 87 | " groupBy('type'). \\\n", 88 | " count(). \\\n", 89 | " show(truncate=False)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 10, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "+---------+-----+\n", 102 | "|action |count|\n", 103 | "+---------+-----+\n", 104 | "|null |65762|\n", 105 | "|created |10282|\n", 106 | "|reopened |67 |\n", 107 | "|closed |4407 |\n", 108 | "|published|425 |\n", 109 | "|opened |5274 |\n", 110 | "|added |206 |\n", 111 | "|started |4488 |\n", 112 | "+---------+-----+\n", 113 | "\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "ghdata. \\\n", 119 | " groupBy('payload.action'). \\\n", 120 | " count(). 
\\\n", 121 | " show(truncate=False)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 14, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "root\n", 134 | " |-- head: string (nullable = true)\n", 135 | "\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "ghdata.select('payload.head').printSchema()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 16, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "+--------------------+\n", 153 | "| ref|\n", 154 | "+--------------------+\n", 155 | "| null|\n", 156 | "| main|\n", 157 | "| null|\n", 158 | "|snyk-fix-1e1002e1...|\n", 159 | "| null|\n", 160 | "| refs/heads/master|\n", 161 | "| refs/heads/master|\n", 162 | "| refs/heads/main|\n", 163 | "| refs/heads/master|\n", 164 | "| null|\n", 165 | "| null|\n", 166 | "| refs/heads/master|\n", 167 | "|refs/heads/locald...|\n", 168 | "| refs/heads/master|\n", 169 | "| refs/heads/main|\n", 170 | "| null|\n", 171 | "| null|\n", 172 | "| refs/heads/main|\n", 173 | "|refs/heads/cap35-...|\n", 174 | "| null|\n", 175 | "+--------------------+\n", 176 | "only showing top 20 rows\n", 177 | "\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "ghdata.select('payload.ref').show()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [] 191 | } 192 | ], 193 | "metadata": { 194 | "kernelspec": { 195 | "display_name": "Pyspark 2", 196 | "language": "python", 197 | "name": "pyspark2" 198 | }, 199 | "language_info": { 200 | "codemirror_mode": { 201 | "name": "ipython", 202 | "version": 3 203 | }, 204 | "file_extension": ".py", 205 | "mimetype": "text/x-python", 206 | "name": "python", 207 | "nbconvert_exporter": "python", 208 | "pygments_lexer": "ipython3", 209 | "version": "3.6.8" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 4 214 | } 215 | -------------------------------------------------------------------------------- /12_special_data_types/02 Create Tables with Special Types.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Create Table with Special Types\n", 8 | "As part of this topic we will create table with special types such as `ARRAY`, `STRUCT` and `MAP`.\n", 9 | "\n", 10 | "* We can define columns with types such as `ARRAY`, `STRUCT` and `MAP`.\n", 11 | "* We will also insert some data into the table. The syntax to insert data into special type columns is almost similar. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from pyspark.sql import SparkSession\n", 28 | "\n", 29 | "import getpass\n", 30 | "username = getpass.getuser()\n", 31 | "\n", 32 | "spark = SparkSession. \\\n", 33 | " builder. \\\n", 34 | " config('spark.ui.port', '0'). \\\n", 35 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). 
\\\n", 36 | " enableHiveSupport(). \\\n", 37 | " appName(f'{username} | Python - Special Data Types'). \\\n", 38 | " master('yarn'). \\\n", 39 | " getOrCreate()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 47 | "\n", 48 | "**Using Spark SQL**\n", 49 | "\n", 50 | "```\n", 51 | "spark2-sql \\\n", 52 | " --master yarn \\\n", 53 | " --conf spark.ui.port=0 \\\n", 54 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 55 | "```\n", 56 | "\n", 57 | "**Using Scala**\n", 58 | "\n", 59 | "```\n", 60 | "spark2-shell \\\n", 61 | " --master yarn \\\n", 62 | " --conf spark.ui.port=0 \\\n", 63 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 64 | "```\n", 65 | "\n", 66 | "**Using Pyspark**\n", 67 | "\n", 68 | "```\n", 69 | "pyspark2 \\\n", 70 | " --master yarn \\\n", 71 | " --conf spark.ui.port=0 \\\n", 72 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 73 | "```" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "DataFrame[]" 85 | ] 86 | }, 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "spark.sql(f'CREATE DATABASE IF NOT EXISTS {username}_demo')" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "DataFrame[]" 105 | ] 106 | }, 107 | "execution_count": 5, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "spark.sql(f'USE {username}_demo')" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "DataFrame[]" 125 | ] 126 | }, 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "spark.sql('DROP TABLE IF EXISTS employees')" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 7, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "DataFrame[]" 145 | ] 146 | }, 147 | "execution_count": 7, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "spark.sql(\"\"\"\n", 154 | " CREATE TABLE employees (\n", 155 | " employee_id INT,\n", 156 | " employee_first_name STRING,\n", 157 | " employee_last_name STRING,\n", 158 | " employee_salary FLOAT,\n", 159 | " employee_nationality STRING,\n", 160 | " employee_email_ids ARRAY,\n", 161 | " employee_phone_numbers MAP,\n", 162 | " employee_ssn STRING,\n", 163 | " employee_address STRUCT\n", 164 | " )\n", 165 | "\"\"\")" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 8, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "DataFrame[]" 177 | ] 178 | }, 179 | "execution_count": 8, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "spark.sql(\"\"\"\n", 186 | " INSERT INTO employees\n", 187 | " VALUES (1, 'Scott', 'Tiger', 1000.0, 'United States', \n", 188 | " ARRAY('scott@tiger.com', 'stiger@companyx.com'),\n", 189 | " MAP('Home', '+1 234 567 8901', 'Office', '+1 345 678 9012'), '789 12 6118', STRUCT('1234 ABC St', 'My City', 'My 
State', 13455)\n", 190 | " )\n", 191 | "\"\"\")" 192 | ] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Pyspark 2", 198 | "language": "python", 199 | "name": "pyspark2" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.6.12" 212 | } 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 4 216 | } 217 | -------------------------------------------------------------------------------- /12_special_data_types/03 Create Data Frame with Special Types.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Create Data Frame with Special Types\n", 8 | "As part of this topic we will create Data Frame with special types such as `ARRAY`, `STRUCT` and `MAP`.\n", 9 | "\n", 10 | "* Create list with appropriate types.\n", 11 | "* Create Data Frame using list and define schema with relevant types.\n", 12 | "* We will print schema as well as preview the data.\n", 13 | "* We will also see how we can insert data in the data frame with special types into the Metastore table." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Let us start spark context for this Notebook so that we can execute the code provided. You can sign up for our [10 node state of the art cluster/labs](https://labs.itversity.com/plans) to learn Spark SQL using our unique integrated LMS." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from pyspark.sql import SparkSession\n", 30 | "\n", 31 | "import getpass\n", 32 | "username = getpass.getuser()\n", 33 | "\n", 34 | "spark = SparkSession. \\\n", 35 | " builder. \\\n", 36 | " config('spark.ui.port', '0'). \\\n", 37 | " config(\"spark.sql.warehouse.dir\", f\"/user/{username}/warehouse\"). \\\n", 38 | " enableHiveSupport(). \\\n", 39 | " appName(f'{username} | Python - Special Data Types'). \\\n", 40 | " master('yarn'). 
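Spark SQL requires element and field types for complex columns, so a `CREATE TABLE` like the one above is normally written with fully parameterized `ARRAY<...>`, `MAP<...>`, and `STRUCT<...>` types; the bare type names suggest the angle-bracketed parameters were lost in rendering. A hedged sketch of such a definition follows — the field names inside the `STRUCT` are illustrative assumptions chosen to match the inserted address values, not names taken from the original:

```python
spark.sql("""
    CREATE TABLE IF NOT EXISTS employees (
        employee_id INT,
        employee_first_name STRING,
        employee_last_name STRING,
        employee_salary FLOAT,
        employee_nationality STRING,
        employee_email_ids ARRAY<STRING>,
        employee_phone_numbers MAP<STRING, STRING>,
        employee_ssn STRING,
        employee_address STRUCT<street: STRING, city: STRING, state: STRING, postal_code: INT>
    )
""")
```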
\\\n", 41 | " getOrCreate()" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "If you are going to use CLIs, you can use Spark SQL using one of the 3 approaches.\n", 49 | "\n", 50 | "**Using Spark SQL**\n", 51 | "\n", 52 | "```\n", 53 | "spark2-sql \\\n", 54 | " --master yarn \\\n", 55 | " --conf spark.ui.port=0 \\\n", 56 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 57 | "```\n", 58 | "\n", 59 | "**Using Scala**\n", 60 | "\n", 61 | "```\n", 62 | "spark2-shell \\\n", 63 | " --master yarn \\\n", 64 | " --conf spark.ui.port=0 \\\n", 65 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 66 | "```\n", 67 | "\n", 68 | "**Using Pyspark**\n", 69 | "\n", 70 | "```\n", 71 | "pyspark2 \\\n", 72 | " --master yarn \\\n", 73 | " --conf spark.ui.port=0 \\\n", 74 | " --conf spark.sql.warehouse.dir=/user/${USER}/warehouse\n", 75 | "```" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "spark.sql(f'USE {username}_demo')" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "employees = [\n", 94 | " (2, \"Henry\", \"Ford\", 1250.0, \n", 95 | " \"India\", ['henry@ford.com', 'hford@companyx.com'], \n", 96 | " {\"Home\": \"+91 234 567 8901\", \"Office\": \"+91 345 678 9012\"}, \n", 97 | " \"456 78 9123\", ('111 BCD Cir', 'Some City', 'Some State', 500091)\n", 98 | " ),\n", 99 | " (3, \"Nick\", \"Junior\", 750.0, \n", 100 | " \"United Kingdom\", ['nick@junior.com', 'njunior@companyx.com'], \n", 101 | " {\"Home\": \"+44 111 111 1111\", \"Office\": \"+44 222 222 2222\"}, \n", 102 | " \"222 33 4444\", ('222 Giant Cly', 'UK City', 'UK Province', None)\n", 103 | " ),\n", 104 | " (4, \"Bill\", \"Gomes\", 1500.0, \n", 105 | " \"Australia\", ['bill@gomes.com', 'bgomes@companyx.com'], \n", 106 | " {\"Home\": \"+61 987 654 3210\", \"Office\": \"+61 876 543 2109\"}, \n", 107 | " \"789 12 6118\", None\n", 108 | " )\n", 109 | "]" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "employees_df = spark.createDataFrame(\n", 119 | " employees,\n", 120 | " schema=\"\"\"employee_id INT, employee_first_name STRING, employee_last_name STRING,\n", 121 | " employee_salary FLOAT, employee_nationality STRING, employee_email_ids ARRAY,\n", 122 | " employee_phone_numbers MAP, employee_ssn STRING,\n", 123 | " employee_address STRUCT\n", 124 | " \"\"\"\n", 125 | ")" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "employees_df.printSchema()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "employees_df.show(truncate=False)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "employees_df.write.insertInto('employees')" 153 | ] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": "Pyspark 2", 159 | "language": "python", 160 | "name": "pyspark2" 161 | }, 162 | "language_info": { 163 | "codemirror_mode": { 164 | "name": "ipython", 165 | "version": 3 166 | }, 167 | "file_extension": ".py", 168 | "mimetype": "text/x-python", 169 | "name": "python", 170 | 
"nbconvert_exporter": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.6.12" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 4 177 | } 178 | -------------------------------------------------------------------------------- /12_special_data_types/06 Processing Arrays.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | } 10 | ], 11 | "metadata": { 12 | "kernelspec": { 13 | "display_name": "Pyspark 2", 14 | "language": "python", 15 | "name": "pyspark2" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.6.12" 28 | } 29 | }, 30 | "nbformat": 4, 31 | "nbformat_minor": 4 32 | } 33 | -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | pyspark.itversity.com -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 itversity 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mastering Pyspark 2 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | title : Mastering Pyspark 2 | author : Durga Gadiraju 3 | copyright : "ITVersity, Inc" 4 | 5 | repository: 6 | url : https://github.com/itversity/pyspark 7 | html: 8 | use_repository_button : true 9 | use_issues_button : true 10 | use_edit_page_button : true 11 | extra_navbar : Subscribe to our Newsletter 12 | 13 | exclude_patterns : [_build, README.md, "**.ipynb_checkpoints"] 14 | 15 | execute: 16 | execute_notebooks : off -------------------------------------------------------------------------------- /_toc.yml: -------------------------------------------------------------------------------- 1 | file: mastering-pyspark 2 | 3 | sections: 4 | 5 | - file: 01_getting_started_pyspark/01_getting_started_pyspark 6 | sections: 7 | - file: 01_getting_started_pyspark/01_getting_started_pyspark 8 | - file: 01_getting_started_pyspark/02_platforms_to_practice 9 | - file: 01_getting_started_pyspark/03_setup_spark_locally_windows 10 | - file: 01_getting_started_pyspark/04_setup_spark_locally_mac 11 | - file: 01_getting_started_pyspark/05_setup_spark_locally_ubuntu 12 | - file: 01_getting_started_pyspark/06_using_itversity_labs 13 | - file: 01_getting_started_pyspark/07_using_google_colab 14 | - file: 01_getting_started_pyspark/08_overview_of_filesystems 15 | - file: 01_getting_started_pyspark/09_different_spark_modules 16 | - file: 01_getting_started_pyspark/10_spark_cluster_manager_types 17 | - file: 01_getting_started_pyspark/11_launching_pyspark_cli 18 | - file: 01_getting_started_pyspark/12_using_jupyter_lab_interface 19 | - file: 01_getting_started_pyspark/13_word_count_using_spark 20 | 21 | - file: 02_quick_recap_of_python/01_quick_recap_of_python 22 | sections: 23 | - file: 02_quick_recap_of_python/01_quick_recap_of_python 24 | - file: 02_quick_recap_of_python/02_data_engineering_life_cycle 25 | - file: 02_quick_recap_of_python/03_getting_started_python_and_pip 26 | - file: 02_quick_recap_of_python/04_python_cli_or_jupyter_notebook 27 | - file: 02_quick_recap_of_python/05_basic_programming_constructs 28 | - file: 02_quick_recap_of_python/06_developing_functions 29 | - file: 02_quick_recap_of_python/07_lambda_functions 30 | - file: 02_quick_recap_of_python/08_overview_of_collections 31 | - file: 02_quick_recap_of_python/09_overview_of_pandas_dataframes 32 | - file: 02_quick_recap_of_python/10_limitations_of_pandas 33 | - file: 02_quick_recap_of_python/11_development_life_cycle 34 | - file: 02_quick_recap_of_python/12_exercises_recap_of_python 35 | 36 | - file: 03_data_processing_overview/01_data_processing_overview 37 | sections: 38 | - file: 03_data_processing_overview/01_data_processing_overview 39 | - file: 03_data_processing_overview/02_prerequisites_and_objectives 40 | - file: 03_data_processing_overview/03_starting_spark_context 41 | - file: 03_data_processing_overview/04_overview_of_spark_read_apis 42 | - file: 03_data_processing_overview/05_understand_airlines_data 43 | - file: 03_data_processing_overview/06_inferring_schema 44 | - file: 03_data_processing_overview/07_previewing_airlines_data 45 | - file: 03_data_processing_overview/08_overview_of_dataframe_apis 46 | - file: 
03_data_processing_overview/09_overview_of_functions 47 | - file: 03_data_processing_overview/10_overview_of_spark_write_apis 48 | - file: 03_data_processing_overview/11_reorganizing_airlines_data 49 | - file: 03_data_processing_overview/12_previewing_reorganized_data 50 | - file: 03_data_processing_overview/13_analyze_and_understand_data 51 | - file: 03_data_processing_overview/14_conclusion_data_processing_overview 52 | 53 | - file: 04_processing_column_data/01_processing_column_data 54 | sections: 55 | - file: 04_processing_column_data/01_processing_column_data 56 | - file: 04_processing_column_data/02_predefined_functions 57 | - file: 04_processing_column_data/03_create_dummy_dataframes 58 | - file: 04_processing_column_data/04_categories_of_functions 59 | - file: 04_processing_column_data/05_special_functions_col_and_lit 60 | - file: 04_processing_column_data/06_common_string_manipulation_functions 61 | - file: 04_processing_column_data/07_extracting_strings_using_substring 62 | - file: 04_processing_column_data/08_extracting_strings_using_split 63 | - file: 04_processing_column_data/09_padding_characters_around_strings 64 | - file: 04_processing_column_data/10_trimming_characters_from_strings 65 | - file: 04_processing_column_data/11_date_and_time_manipulation_functions 66 | - file: 04_processing_column_data/12_date_and_time_arithmetic 67 | - file: 04_processing_column_data/13_using_date_and_time_trunc_functions 68 | - file: 04_processing_column_data/14_date_and_time_extract_functions 69 | - file: 04_processing_column_data/15_using_to_date_and_to_timestamp 70 | - file: 04_processing_column_data/16_using_date_format_function 71 | - file: 04_processing_column_data/17_dealing_with_unix_timestamp 72 | - file: 04_processing_column_data/18_dealing_with_nulls 73 | - file: 04_processing_column_data/19_using_case_and_when 74 | - file: 04_processing_column_data/20_conclusion_predefined_functions 75 | - file: 04_processing_column_data/21_exercises_predefined_functions 76 | 77 | - file: 05_basic_transformations/01_basic_transformations 78 | sections: 79 | - file: 05_basic_transformations/01_basic_transformations 80 | - file: 05_basic_transformations/02_overview_of_basic_transformations 81 | - file: 05_basic_transformations/03_data_frame_for_basic_transformations 82 | - file: 05_basic_transformations/04_basic_filtering_of_data 83 | - file: 05_basic_transformations/05_filtering_example_using_dates 84 | - file: 05_basic_transformations/06_boolean_operators 85 | - file: 05_basic_transformations/07_using_in_operator_or_isin_function 86 | - file: 05_basic_transformations/08_using_like_operator_or_function 87 | - file: 05_basic_transformations/09_using_between_operator 88 | - file: 05_basic_transformations/10_dealing_with_nulls_while_filtering 89 | - file: 05_basic_transformations/11_total_aggregations 90 | - file: 05_basic_transformations/12_aggregate_data_using_groupby 91 | - file: 05_basic_transformations/13_aggregate_data_using_rollup 92 | - file: 05_basic_transformations/14_aggregate_data_using_cube 93 | - file: 05_basic_transformations/15_overview_of_sorting_data_frames 94 | - file: 05_basic_transformations/16_solution_problem_1 95 | - file: 05_basic_transformations/17_solution_problem_2 96 | - file: 05_basic_transformations/18_solution_problem_3 97 | 98 | - file: 06_joining_data_sets/01_joining_data_sets 99 | sections: 100 | - file: 06_joining_data_sets/01_joining_data_sets 101 | - file: 06_joining_data_sets/02_preparing_data_sets_for_joins 102 | - file: 
06_joining_data_sets/03_analyze_data_sets_for_joins 103 | - file: 06_joining_data_sets/04_problem_statements_for_joins 104 | - file: 06_joining_data_sets/05_overview_of_joins 105 | - file: 06_joining_data_sets/06_using_inner_joins 106 | - file: 06_joining_data_sets/07_left_or_right_outer_join 107 | - file: 06_joining_data_sets/08_solutions_problem_1 108 | - file: 06_joining_data_sets/09_solutions_problem_2 109 | - file: 06_joining_data_sets/10_solutions_problem_3 110 | - file: 06_joining_data_sets/11_solutions_problem_4 111 | - file: 06_joining_data_sets/12_solutions_problem_5 112 | - file: 06_joining_data_sets/13_solutions_problem_6 113 | - file: 06_joining_data_sets/14_solutions_problem_7 114 | - file: 06_joining_data_sets/15_solutions_problem_8 115 | 116 | - file: 07_windowing_functions/01_windowing_functions 117 | sections: 118 | - file: 07_windowing_functions/01_windowing_functions 119 | - file: 07_windowing_functions/02_overview_of_windowing_functions 120 | - file: 07_windowing_functions/03_aggregate_functions 121 | - file: 07_windowing_functions/04_using_rowsBetween_and_rangeBetween 122 | - file: 07_windowing_functions/05_ranking_functions 123 | - file: 07_windowing_functions/06_using_lead_or_lag 124 | - file: 07_windowing_functions/07_using_first_and_last_functions 125 | - file: 07_windowing_functions/10_aggregate_functions_examples 126 | 127 | - file: 08_spark_metastore/01_spark_metastore 128 | sections: 129 | - file: 08_spark_metastore/01_spark_metastore 130 | - file: 08_spark_metastore/02_overview_of_spark_metastore 131 | - file: 08_spark_metastore/03_exploring_spark_catalog 132 | - file: 08_spark_metastore/04_creating_metastore_tables_using_catalog 133 | - file: 08_spark_metastore/05_inferring_schema_for_tables 134 | - file: 08_spark_metastore/06_define_schema_for_tables_using_structtype 135 | - file: 08_spark_metastore/07_inserting_into_existing_tables 136 | - file: 08_spark_metastore/08_read_and_process_data_from_metastore_tables 137 | - file: 08_spark_metastore/09_creating_partitioned_tables 138 | - file: 08_spark_metastore/10_saving_as_partitioned_tables-Copy1 139 | - file: 08_spark_metastore/11_creating_temp_views 140 | - file: 08_spark_metastore/12_using_spark_sql -------------------------------------------------------------------------------- /mastering-pyspark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Mastering Pyspark\n", 8 | "\n", 9 | "This course is primarily designed to master data engineering using pyspark leveraging Data Frame APIs." 10 | ] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "Python 3", 16 | "language": "python", 17 | "name": "python3" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 3 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython3", 29 | "version": "3.6.12" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 4 34 | } 35 | --------------------------------------------------------------------------------