├── .gitignore
├── 01-PySpark-Get-Started.ipynb
├── 02-Create-SparkContext.ipynb
├── 03-Create-SparkSession.ipynb
├── 04-RDD-Operations.ipynb
├── 05-DataFrame-Intro.ipynb
├── 06-DataFrame-from-various-data-source.ipynb
├── 07-DataFrame-Operations.ipynb
├── 08-Spark-SQL.ipynb
├── LICENSE
├── README.md
├── data
│   ├── data.txt
│   ├── persons.csv
│   ├── products.csv
│   ├── products.parquet
│   │   ├── ._SUCCESS.crc
│   │   ├── .part-00000-1677f901-2d4d-4de5-b1e2-436dae4a802e-c000.snappy.parquet.crc
│   │   ├── _SUCCESS
│   │   └── part-00000-1677f901-2d4d-4de5-b1e2-436dae4a802e-c000.snappy.parquet
│   ├── products_multiline.json
│   ├── products_singleline.json
│   └── stocks.txt
└── output.txt
    ├── ._SUCCESS.crc
    ├── .part-00000.crc
    ├── .part-00001.crc
    ├── .part-00002.crc
    ├── .part-00003.crc
    ├── .part-00004.crc
    ├── .part-00005.crc
    ├── .part-00006.crc
    ├── .part-00007.crc
    ├── .part-00008.crc
    ├── .part-00009.crc
    ├── .part-00010.crc
    ├── .part-00011.crc
    ├── _SUCCESS
    ├── part-00000
    ├── part-00001
    ├── part-00002
    ├── part-00003
    ├── part-00004
    ├── part-00005
    ├── part-00006
    ├── part-00007
    ├── part-00008
    ├── part-00009
    ├── part-00010
    └── part-00011

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ -------------------------------------------------------------------------------- /01-PySpark-Get-Started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "fe9fd1c0-db30-47b1-bbe2-0b1cbd97a9e2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "id": "4a915758-1498-4831-820b-a44fd888e87b", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Import PySpark\n", 26 | "from pyspark.sql import SparkSession" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "id": "bb53020b-1e79-4893-a13a-4968fa120fa3", 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stderr", 37 | "output_type": "stream", 38 | "text": [ 39 | "Setting default log level to \"WARN\".\n", 40 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 41 | "23/06/25 21:26:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "# Create a SparkSession\n", 47 | "spark = SparkSession.builder \\\n", 48 | " .appName(\"PySpark-Get-Started\") \\\n", 49 | " .getOrCreate()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "id": "dde43975-a1f5-4ad1-88a3-76eb84215f2b", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stderr", 60 | "output_type": "stream", 61 | "text": [ 62 | " \r" 63 | ] 64 | }, 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "+-------+---+\n", 70 | "| Name|Age|\n", 71 | "+-------+---+\n", 72 | "| Alice| 25|\n", 73 | "| Bob| 30|\n", 74 | "|Charlie| 35|\n", 75 | "+-------+---+\n", 76 | "\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "# Test the setup\n", 82 | "data = [(\"Alice\", 25), (\"Bob\", 30), (\"Charlie\", 35)]\n", 83 | "df = spark.createDataFrame(data, [\"Name\", \"Age\"])\n", 84 | "df.show()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "096c5a89-058c-488a-9d9e-146fdb6a44dd", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": "Python 3 (ipykernel)", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.10.8" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 5 117 | } 118 | -------------------------------------------------------------------------------- /02-Create-SparkContext.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "9b42a8f0-9a63-461c-95dc-27847f5b0a40", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "8f8e53e1-7a47-4dc4-9450-6be3c466f0b3", 21 | "metadata": {}, 22 | "source": [ 23 | "## Create SparkContext in Apache Spark version 1.x" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 4, 29 | "id": "287412ee-4384-4621-b7c0-ff50013f9785", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "from pyspark import SparkContext\n", 34 | "\n", 35 | "# Create a SparkContext object\n", 36 | "sc = SparkContext(appName=\"MySparkApplication\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 5, 42 | "id": "78f5b680-6bc0-42ef-99f6-83fd0da5e729", 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/html": [ 48 | "\n", 49 | "
\n", 50 | "

SparkContext

\n", 51 | "\n", 52 | "

Spark UI

\n", 53 | "\n", 54 | "
\n", 55 | "
Version
\n", 56 | "
v3.4.1
\n", 57 | "
Master
\n", 58 | "
local[*]
\n", 59 | "
AppName
\n", 60 | "
MySparkApplication
\n", 61 | "
\n", 62 | "
\n", 63 | " " 64 | ], 65 | "text/plain": [ 66 | "" 67 | ] 68 | }, 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "sc" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "id": "c57fa548-9522-4e51-8c2d-4c6937eb3b3e", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# Shut down the current active SparkContext\n", 86 | "sc.stop()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "6cbd0d0d-2d17-44b9-ae3b-76b413b760c3", 92 | "metadata": {}, 93 | "source": [ 94 | "## Create SparkContext in Apache Spark version 2.x and later" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "id": "0508898d-8b48-4b16-bae9-87cead0489b1", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "from pyspark.sql import SparkSession\n", 105 | "\n", 106 | "# Create a SparkSession\n", 107 | "spark = SparkSession.builder \\\n", 108 | " .appName(\"MySparkApplication\") \\\n", 109 | " .getOrCreate()\n", 110 | "\n", 111 | "# Get the SparkContext from the SparkSession\n", 112 | "sc = spark.sparkContext\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 8, 118 | "id": "82c7ca1e-dd73-4306-ab7f-01bc371eb94e", 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/html": [ 124 | "\n", 125 | "
\n", 126 | "

SparkContext

\n", 127 | "\n", 128 | "

Spark UI

\n", 129 | "\n", 130 | "
\n", 131 | "
Version
\n", 132 | "
v3.4.1
\n", 133 | "
Master
\n", 134 | "
local[*]
\n", 135 | "
AppName
\n", 136 | "
MySparkApplication
\n", 137 | "
\n", 138 | "
\n", 139 | " " 140 | ], 141 | "text/plain": [ 142 | "" 143 | ] 144 | }, 145 | "execution_count": 8, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "sc" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 9, 157 | "id": "9a3361a7-cb9d-49a3-b80b-820e490711e2", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "# Shut down the current active SparkContext\n", 162 | "sc.stop() #or spark.stop()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "10336d77-bc36-4101-8055-dd7a2496d4da", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 3 (ipykernel)", 177 | "language": "python", 178 | "name": "python3" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 3 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython3", 190 | "version": "3.10.8" 191 | } 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 5 195 | } 196 | -------------------------------------------------------------------------------- /03-Create-SparkSession.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "5f80ab89-af0e-4139-8f6a-0a382310f34c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "ee8d77ba-685b-4c20-8c09-ee2b2fc44abd", 21 | "metadata": {}, 22 | "source": [ 23 | "## Create SparkSession in Apache Spark" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "id": "1df25dc4-3ffc-490d-9c03-d022cb2e7235", 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stderr", 34 | "output_type": "stream", 35 | "text": [ 36 | "Setting default log level to \"WARN\".\n", 37 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 38 | "23/07/16 15:31:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "from pyspark.sql import SparkSession\n", 44 | "\n", 45 | "# Create a SparkSession\n", 46 | "spark = SparkSession.builder \\\n", 47 | " .appName(\"MySparkApplication\") \\\n", 48 | " .config(\"spark.executor.memory\", \"2g\") \\\n", 49 | " .config(\"spark.sql.shuffle.partitions\", \"4\") \\\n", 50 | " .getOrCreate()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "id": "db19ae45-2c01-408b-b807-f883a2d796c3", 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/html": [ 62 | "\n", 63 | "
\n", 64 | "

SparkSession - in-memory

\n", 65 | " \n", 66 | "
\n", 67 | "

SparkContext

\n", 68 | "\n", 69 | "

Spark UI

\n", 70 | "\n", 71 | "
\n", 72 | "
Version
\n", 73 | "
v3.4.1
\n", 74 | "
Master
\n", 75 | "
local[*]
\n", 76 | "
AppName
\n", 77 | "
MySparkApplication
\n", 78 | "
\n", 79 | "
\n", 80 | " \n", 81 | "
\n", 82 | " " 83 | ], 84 | "text/plain": [ 85 | "" 86 | ] 87 | }, 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "# Perform operations using the SparkSession\n", 95 | "spark" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 6, 101 | "id": "d0c09484-8c96-4c73-9dde-cd8587b0c80f", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# Shut down the current active SparkSession\n", 106 | "spark.stop()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "f11e547f-54bd-43ee-b051-6722802bd567", 112 | "metadata": {}, 113 | "source": [] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3 (ipykernel)", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.10.8" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 5 137 | } 138 | -------------------------------------------------------------------------------- /04-RDD-Operations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "5c0ddfc8-a1d1-4bb4-9cd5-180f11e4f3af", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 3, 21 | "id": "ff054b1d-f6f7-4c8f-9b50-56d3b5ed1ac9", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from pyspark.sql import SparkSession" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "id": "f4658324-c133-4921-b53f-dd6141558f98", 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stderr", 36 | "output_type": "stream", 37 | "text": [ 38 | "Setting default log level to \"WARN\".\n", 39 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 40 | "23/07/16 18:20:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "# Create a SparkSession\n", 46 | "spark = SparkSession.builder.appName(\"RDD-Demo\").getOrCreate()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "97a5a364-a829-4b89-8cb0-6872d0bdafb3", 52 | "metadata": {}, 53 | "source": [ 54 | "### How to create RDDs" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 5, 60 | "id": "0e31a7ea-d6fd-49f4-88cd-40c2bda5838a", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "numbers = [1, 2, 3, 4, 5]\n", 65 | "rdd = spark.sparkContext.parallelize(numbers)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "id": "09f84b37-e1a3-4d90-929e-a60c9063d669", 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "[1, 2, 3, 4, 5]" 78 | ] 79 | }, 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "# Collect action: Retrieve all elements of the RDD\n", 87 | "rdd.collect()" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 7, 93 | "id": "fd2ee436-1186-488f-8294-b46ce9c67cac", 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# Create an RDD from a list of tuples\n", 98 | "data = [(\"Alice\", 25), (\"Bob\", 30), (\"Charlie\", 35), (\"Alice\", 40)]\n", 99 | "rdd = spark.sparkContext.parallelize(data)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 8, 105 | "id": "adad76e0-2c10-4a41-b947-89547fe94d35", 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "All elements of the rdd: [('Alice', 25), ('Bob', 30), ('Charlie', 35), ('Alice', 40)]\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "# Collect action: Retrieve all elements of the RDD\n", 118 | "print(\"All elements of the rdd: \", rdd.collect())" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "44f98372-bc7a-43d7-b1c1-6d7d102bee29", 124 | "metadata": {}, 125 | "source": [ 126 | "### RDDs Operation: Actions " 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 9, 132 | "id": "60353b82-fd00-4e94-b11c-d31e8e005122", 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "name": "stderr", 137 | "output_type": "stream", 138 | "text": [ 139 | "[Stage 2:====================================================> (11 + 1) / 12]\r" 140 | ] 141 | }, 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "The total number of elements in rdd: 4\n" 147 | ] 148 | }, 149 | { 150 | "name": "stderr", 151 | "output_type": "stream", 152 | "text": [ 153 | " \r" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "# Count action: Count the number of elements in the RDD\n", 159 | "count = rdd.count()\n", 160 | "print(\"The total number of elements in rdd: \", count)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 10, 166 | "id": "456dbe75-0182-47ba-aaf9-34ad9ab06f55", 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "The first element of the rdd: ('Alice', 25)\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "# First action: Retrieve the first element of the RDD\n", 179 | "first_element = rdd.first()\n", 180 | "print(\"The first element of the rdd: \", first_element)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 11, 186 | 
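
A quick aside on why `collect()` and `count()` sit under "Actions": RDD transformations are lazy, and only an action forces Spark to run a job. A minimal sketch (not from the notebook, reusing the `rdd` of tuples defined above):

```python
# Transformations only build a lineage graph; no Spark job is launched here.
doubled_rdd = rdd.map(lambda x: (x[0], x[1] * 2))

# Actions such as collect() or count() trigger the actual computation.
print(doubled_rdd.collect())  # e.g. [('Alice', 50), ('Bob', 60), ...]
```
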
"id": "979ae249-efb6-4b8b-b8e5-0cddea496ff9", 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "The first two elements of the rdd: [('Alice', 25), ('Bob', 30)]\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "# Take action: Retrieve the n elements of the RDD\n", 199 | "taken_elements = rdd.take(2)\n", 200 | "print(\"The first two elements of the rdd: \", taken_elements)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 12, 206 | "id": "38bd8336-338e-4876-96e0-5e9aa19b5b36", 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stderr", 211 | "output_type": "stream", 212 | "text": [ 213 | "('Charlie', 35)\n", 214 | "('Alice', 25)\n", 215 | "('Bob', 30)\n", 216 | "('Alice', 40)\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "# Foreach action: Print each element of the RDD\n", 222 | "rdd.foreach(lambda x: print(x))" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "id": "c30f740c-f5fd-48d1-9e8b-a0f78caab408", 228 | "metadata": {}, 229 | "source": [ 230 | "### RDDs Operation: Transformations " 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 13, 236 | "id": "b3f7d23b-f246-4797-97ff-a56766657d53", 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "# Map transformation: Convert name to uppercase\n", 241 | "mapped_rdd = rdd.map(lambda x: (x[0].upper(), x[1]))" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 14, 247 | "id": "78d8ef21-d4d1-4361-b448-6c23e251e8f0", 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "rdd with uppercease name: [('ALICE', 25), ('BOB', 30), ('CHARLIE', 35), ('ALICE', 40)]\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "result = mapped_rdd.collect()\n", 260 | "print(\"rdd with uppercease name: \", result)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 15, 266 | "id": "2211dbf5-64be-4966-bcb4-e11fdbc9363f", 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "[('Charlie', 35), ('Alice', 40)]" 273 | ] 274 | }, 275 | "execution_count": 15, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "# Filter transformation: Filter records where age is greater than 30\n", 282 | "filtered_rdd = rdd.filter(lambda x: x[1] > 30)\n", 283 | "filtered_rdd.collect()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 16, 289 | "id": "325614c2-ede1-45f4-9818-6e89cb72e044", 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "[('Alice', 65), ('Bob', 30), ('Charlie', 35)]" 296 | ] 297 | }, 298 | "execution_count": 16, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "# ReduceByKey transformation: Calculate the total age for each name\n", 305 | "reduced_rdd = rdd.reduceByKey(lambda x, y: x + y)\n", 306 | "reduced_rdd.collect()" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 17, 312 | "id": "1a5e96bb-f8ce-4239-949f-184648b60ae7", 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "[('Alice', 40), ('Charlie', 35), ('Bob', 30), ('Alice', 25)]" 319 | ] 320 | }, 321 | "execution_count": 17, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "# SortBy 
transformation: Sort the RDD by age in descending order\n", 328 | "sorted_rdd = rdd.sortBy(lambda x: x[1], ascending=False)\n", 329 | "sorted_rdd.collect()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "id": "d51d11e9-260f-421d-94d9-350e5c6146bb", 335 | "metadata": {}, 336 | "source": [ 337 | "### Save RDDs to text file and read RDDs from text file" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 18, 343 | "id": "c8389f05-063a-4a52-beb2-efc4e50daa6a", 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "# Save action: Save the RDD to a text file\n", 348 | "rdd.saveAsTextFile(\"output.txt\")" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 19, 354 | "id": "2a557485-8d1b-431c-aba5-dcbe21f3970d", 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "[\"('Alice', 40)\", \"('Bob', 30)\", \"('Alice', 25)\", \"('Charlie', 35)\"]" 361 | ] 362 | }, 363 | "execution_count": 19, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "# create rdd from text file\n", 370 | "rdd_text = spark.sparkContext.textFile(\"output.txt\")\n", 371 | "rdd_text.collect()" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "id": "c799ec33-2a47-4d8e-b239-92f25c8e7a37", 377 | "metadata": {}, 378 | "source": [ 379 | "### Shut down Spark Session" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 20, 385 | "id": "b501edd4-5d9a-4ffe-8cbb-abcf077868c4", 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "spark.stop()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "id": "695cb724-4691-441e-97f1-7320c109a62f", 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [] 399 | } 400 | ], 401 | "metadata": { 402 | "kernelspec": { 403 | "display_name": "Python 3 (ipykernel)", 404 | "language": "python", 405 | "name": "python3" 406 | }, 407 | "language_info": { 408 | "codemirror_mode": { 409 | "name": "ipython", 410 | "version": 3 411 | }, 412 | "file_extension": ".py", 413 | "mimetype": "text/x-python", 414 | "name": "python", 415 | "nbconvert_exporter": "python", 416 | "pygments_lexer": "ipython3", 417 | "version": "3.10.8" 418 | } 419 | }, 420 | "nbformat": 4, 421 | "nbformat_minor": 5 422 | } 423 | -------------------------------------------------------------------------------- /05-DataFrame-Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 37, 6 | "id": "68a6d7a7-693c-4fae-804f-3d92a1a30e35", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 38, 21 | "id": "b9cb5875-598a-4d74-be55-3795530d9206", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from pyspark.sql import SparkSession\n", 26 | "from pyspark.sql.functions import desc" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 39, 32 | "id": "afd73f68-a07d-47df-952e-38f0e681320c", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Create a SparkSession\n", 37 | 
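
Worth noting about the round trip above: `saveAsTextFile("output.txt")` creates a directory with one part-file per partition (hence the `output.txt/part-00000` through `part-00011` entries in the tree at the top), and `textFile` reads those parts back as plain strings. A sketch of recovering the original tuples (the `ast.literal_eval` step is an assumption, not in the notebook):

```python
import ast

# Each line comes back as a string like "('Alice', 40)"; parse it back
# into a Python tuple on the executors.
parsed_rdd = rdd_text.map(ast.literal_eval)
print(parsed_rdd.collect())  # [('Alice', 40), ('Bob', 30), ...]
```
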
"spark = SparkSession.builder.appName(\"DataFrame-Demo\").getOrCreate()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "3b294de5-c4a9-4f03-8dbc-ee29caaf0b99", 43 | "metadata": {}, 44 | "source": [ 45 | "### Using RDDs" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 40, 51 | "id": "97c0daf5-9971-4e48-ae98-fc090a4edf14", 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stderr", 56 | "output_type": "stream", 57 | "text": [ 58 | " \r" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "rdd = spark.sparkContext.textFile(\"./data/data.txt\")\n", 64 | "result_rdd = rdd.flatMap(lambda line: line.split(\" \")) \\\n", 65 | " .map(lambda word: (word, 1)) \\\n", 66 | " .reduceByKey(lambda a, b: a + b) \\\n", 67 | " .sortBy(lambda x: x[1], ascending=False)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 41, 73 | "id": "96f3a503-e709-4248-91cf-5a55a1ee3549", 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "[('the', 12),\n", 80 | " ('of', 7),\n", 81 | " ('a', 7),\n", 82 | " ('in', 5),\n", 83 | " ('distributed', 5),\n", 84 | " ('Spark', 4),\n", 85 | " ('is', 3),\n", 86 | " ('as', 3),\n", 87 | " ('API', 3),\n", 88 | " ('on', 3)]" 89 | ] 90 | }, 91 | "execution_count": 41, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "result_rdd.take(10)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "a02b9bba-8a25-4e8e-aed9-edb6700b391b", 103 | "metadata": {}, 104 | "source": [ 105 | "### Using DataFrames" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 42, 111 | "id": "248e9131-53fb-4c39-b301-4d415dcc169d", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "df = spark.read.text(\"./data/data.txt\")\n", 116 | "\n", 117 | "result_df = df.selectExpr(\"explode(split(value, ' ')) as word\") \\\n", 118 | " .groupBy(\"word\").count().orderBy(desc(\"count\"))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 43, 124 | "id": "18231aa3-f162-4261-9a65-275f088f1675", 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "[Row(word='the', count=12),\n", 131 | " Row(word='of', count=7),\n", 132 | " Row(word='a', count=7),\n", 133 | " Row(word='in', count=5),\n", 134 | " Row(word='distributed', count=5),\n", 135 | " Row(word='Spark', count=4),\n", 136 | " Row(word='API', count=3),\n", 137 | " Row(word='RDD', count=3),\n", 138 | " Row(word='is', count=3),\n", 139 | " Row(word='on', count=3)]" 140 | ] 141 | }, 142 | "execution_count": 43, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "result_df.take(10)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 44, 154 | "id": "a4ece8d9-ed91-4b4e-802f-a6e260dc46b4", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "spark.stop()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "e509b04d-32b3-4b17-ab4e-6ed025b09762", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3 (ipykernel)", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": 
"python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.10.8" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 5 191 | } 192 | -------------------------------------------------------------------------------- /06-DataFrame-from-various-data-source.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "3d303751-5833-413a-8698-7d9cc74001cc", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "id": "fbcf8801-018a-4f78-b0ea-cb83e4660e96", 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stderr", 26 | "output_type": "stream", 27 | "text": [ 28 | "Setting default log level to \"WARN\".\n", 29 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 30 | "23/08/13 20:43:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "from pyspark.sql import SparkSession\n", 36 | "\n", 37 | "# Create a SparkSession\n", 38 | "spark = SparkSession.builder.appName(\"Create-DataFrame\").getOrCreate()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "id": "2d12d97d-4961-4b5e-b571-8df65be48d94", 44 | "metadata": {}, 45 | "source": [ 46 | "### Read CSV file into DataFrame" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "id": "b8446479-fc5d-4422-9a1e-129ece15f941", 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "id,name,category,quantity,price\n", 60 | "1,iPhone 12,Electronics,10,899.99\n", 61 | "2,Nike Air Max 90,Clothing,25,119.99\n", 62 | "3,KitchenAid Stand Mixer,Home Appliances,5,299.99\n", 63 | "4,The Great Gatsby,Books,50,12.99\n", 64 | "5,L'Oreal Paris Mascara,Beauty,100,9.99\n", 65 | "6,Yoga Mat,Sports,30,29.99\n", 66 | "7,Samsung 4K Smart TV,Electronics,8,799.99\n", 67 | "8,Levi's Jeans,Clothing,15,49.99\n", 68 | "9,Dyson Vacuum Cleaner,Home Appliances,3,399.99\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "%%bash \n", 74 | "head -10 ./data/products.csv" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "id": "65f9db29-2efb-4874-a512-9add235d292d", 80 | "metadata": {}, 81 | "source": [ 82 | "#### Read CSV with header" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "id": "7580ed5e-8cac-4af5-8955-1eea0ffe4c0a", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# Read CSV file into DataFrame\n", 93 | "csv_file_path = \"./data/products.csv\"\n", 94 | "df = spark.read.csv(csv_file_path, header=True)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "id": "e595ec04-655d-4caa-91ec-d8e298c3183b", 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "root\n", 108 | " |-- id: string (nullable = true)\n", 109 | " |-- name: string (nullable = true)\n", 110 | " |-- category: string (nullable = true)\n", 111 | " |-- quantity: string (nullable = true)\n", 112 | 
" |-- price: string (nullable = true)\n", 113 | "\n", 114 | "+---+--------------------+---------------+--------+------+\n", 115 | "| id| name| category|quantity| price|\n", 116 | "+---+--------------------+---------------+--------+------+\n", 117 | "| 1| iPhone 12| Electronics| 10|899.99|\n", 118 | "| 2| Nike Air Max 90| Clothing| 25|119.99|\n", 119 | "| 3|KitchenAid Stand ...|Home Appliances| 5|299.99|\n", 120 | "| 4| The Great Gatsby| Books| 50| 12.99|\n", 121 | "| 5|L'Oreal Paris Mas...| Beauty| 100| 9.99|\n", 122 | "+---+--------------------+---------------+--------+------+\n", 123 | "only showing top 5 rows\n", 124 | "\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "# Display schema of DataFrame\n", 130 | "df.printSchema()\n", 131 | "\n", 132 | "# Display content of DataFrame\n", 133 | "df.show(5)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "1f43038c-30e3-45fc-b927-f1c62c1cdf84", 139 | "metadata": {}, 140 | "source": [ 141 | "#### Read CSV with an explicit schema definition" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "id": "6a105df6-1196-4a1c-95df-a54fd699986d", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "# import necessary types\n", 152 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 9, 158 | "id": "cecfacfc-5488-40a8-b26e-fefc07c61d88", 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "# Define the schema\n", 163 | "schema = StructType([\n", 164 | " StructField(name=\"id\", dataType=IntegerType(), nullable=True),\n", 165 | " StructField(name=\"name\", dataType=StringType(), nullable=True),\n", 166 | " StructField(name=\"category\", dataType=StringType(), nullable=True),\n", 167 | " StructField(name=\"quantity\", dataType=IntegerType(), nullable=True),\n", 168 | " StructField(name=\"price\", dataType=DoubleType(), nullable=True)\n", 169 | "])" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 10, 175 | "id": "268f68db-1f89-4b94-979b-da0aa16b990f", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "# Read CSV file into DataFrame with schema definition\n", 180 | "csv_file_path = \"./data/products.csv\"\n", 181 | "df = spark.read.csv(csv_file_path, header=True, schema=schema)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 11, 187 | "id": "d1ffe583-5460-484c-b136-ee4715f4b0d4", 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "root\n", 195 | " |-- id: integer (nullable = true)\n", 196 | " |-- name: string (nullable = true)\n", 197 | " |-- category: string (nullable = true)\n", 198 | " |-- quantity: integer (nullable = true)\n", 199 | " |-- price: double (nullable = true)\n", 200 | "\n", 201 | "+---+--------------------+---------------+--------+------+\n", 202 | "| id| name| category|quantity| price|\n", 203 | "+---+--------------------+---------------+--------+------+\n", 204 | "| 1| iPhone 12| Electronics| 10|899.99|\n", 205 | "| 2| Nike Air Max 90| Clothing| 25|119.99|\n", 206 | "| 3|KitchenAid Stand ...|Home Appliances| 5|299.99|\n", 207 | "| 4| The Great Gatsby| Books| 50| 12.99|\n", 208 | "| 5|L'Oreal Paris Mas...| Beauty| 100| 9.99|\n", 209 | "+---+--------------------+---------------+--------+------+\n", 210 | "only showing top 5 rows\n", 211 | "\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | 
"# Display schema of DataFrame\n", 217 | "df.printSchema()\n", 218 | "\n", 219 | "# Display content of DataFrame\n", 220 | "df.show(5)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "id": "1f099209-1c77-4c2b-8142-944ad92d4723", 226 | "metadata": {}, 227 | "source": [ 228 | "#### Read CSV with inferSchema" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 12, 234 | "id": "13f37a98-6810-43f5-8229-267d6528cea5", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# Read CSV file into DataFrame with inferSchema\n", 239 | "csv_file_path = \"./data/products.csv\"\n", 240 | "df = spark.read.csv(csv_file_path, header=True, inferSchema=True)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 13, 246 | "id": "c53d37bd-6bc6-4eaf-b0c0-879d3302df8d", 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "root\n", 254 | " |-- id: integer (nullable = true)\n", 255 | " |-- name: string (nullable = true)\n", 256 | " |-- category: string (nullable = true)\n", 257 | " |-- quantity: integer (nullable = true)\n", 258 | " |-- price: double (nullable = true)\n", 259 | "\n", 260 | "+---+--------------------+---------------+--------+------+\n", 261 | "| id| name| category|quantity| price|\n", 262 | "+---+--------------------+---------------+--------+------+\n", 263 | "| 1| iPhone 12| Electronics| 10|899.99|\n", 264 | "| 2| Nike Air Max 90| Clothing| 25|119.99|\n", 265 | "| 3|KitchenAid Stand ...|Home Appliances| 5|299.99|\n", 266 | "| 4| The Great Gatsby| Books| 50| 12.99|\n", 267 | "| 5|L'Oreal Paris Mas...| Beauty| 100| 9.99|\n", 268 | "+---+--------------------+---------------+--------+------+\n", 269 | "only showing top 5 rows\n", 270 | "\n" 271 | ] 272 | } 273 | ], 274 | "source": [ 275 | "# Display schema of DataFrame\n", 276 | "df.printSchema()\n", 277 | "\n", 278 | "# Display content of DataFrame\n", 279 | "df.show(5)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "id": "16cd7777-ced0-4f55-9825-f6a868636d47", 285 | "metadata": {}, 286 | "source": [ 287 | "### Read JSON file into DataFrame" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "id": "4d339da0-ea17-474b-82e8-54785bc2ecf3", 293 | "metadata": {}, 294 | "source": [ 295 | "#### Single Line JSON" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 14, 301 | "id": "d3fb6fdc-da18-4edd-a815-3152cfc2dcfd", 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "{\"id\":1,\"name\":\"iPhone 12\",\"category\":\"Electronics\",\"quantity\":10,\"price\":899.99}\n", 309 | "{\"id\":2,\"name\":\"Nike Air Max 90\",\"category\":\"Clothing\",\"quantity\":25,\"price\":119.99}\n", 310 | "{\"id\":3,\"name\":\"KitchenAid Stand Mixer\",\"category\":\"Home Appliances\",\"quantity\":5,\"price\":299.99}\n", 311 | "{\"id\":4,\"name\":\"The Great Gatsby\",\"category\":\"Books\",\"quantity\":50,\"price\":12.99}\n", 312 | "{\"id\":5,\"name\":\"L'Oreal Paris Mascara\",\"category\":\"Beauty\",\"quantity\":100,\"price\":9.99}\n", 313 | "{\"id\":6,\"name\":\"Yoga Mat\",\"category\":\"Sports\",\"quantity\":30,\"price\":29.99}\n", 314 | "{\"id\":7,\"name\":\"Samsung 4K Smart TV\",\"category\":\"Electronics\",\"quantity\":8,\"price\":799.99}\n", 315 | "{\"id\":8,\"name\":\"Levi's Jeans\",\"category\":\"Clothing\",\"quantity\":15,\"price\":49.99}\n", 316 | "{\"id\":9,\"name\":\"Dyson Vacuum 
Cleaner\",\"category\":\"Home Appliances\",\"quantity\":3,\"price\":399.99}\n", 317 | "{\"id\":10,\"name\":\"Harry Potter Series\",\"category\":\"Books\",\"quantity\":20,\"price\":15.99}\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "%%bash\n", 323 | "head -10 data/products_singleline.json" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 15, 329 | "id": "6cfe5f2b-7720-403b-a8bf-aec931bca199", 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "# Read single line JSON\n", 334 | "# Each row is a JSON record, records are separated by new line\n", 335 | "json_file_path = \"./data/products_singleline.json\"\n", 336 | "df = spark.read.json(json_file_path)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 16, 342 | "id": "0cc72e44-36ca-44cb-8d31-36f473e52f9a", 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "root\n", 350 | " |-- category: string (nullable = true)\n", 351 | " |-- id: long (nullable = true)\n", 352 | " |-- name: string (nullable = true)\n", 353 | " |-- price: double (nullable = true)\n", 354 | " |-- quantity: long (nullable = true)\n", 355 | "\n", 356 | "+---------------+---+--------------------+------+--------+\n", 357 | "| category| id| name| price|quantity|\n", 358 | "+---------------+---+--------------------+------+--------+\n", 359 | "| Electronics| 1| iPhone 12|899.99| 10|\n", 360 | "| Clothing| 2| Nike Air Max 90|119.99| 25|\n", 361 | "|Home Appliances| 3|KitchenAid Stand ...|299.99| 5|\n", 362 | "| Books| 4| The Great Gatsby| 12.99| 50|\n", 363 | "| Beauty| 5|L'Oreal Paris Mas...| 9.99| 100|\n", 364 | "+---------------+---+--------------------+------+--------+\n", 365 | "only showing top 5 rows\n", 366 | "\n" 367 | ] 368 | } 369 | ], 370 | "source": [ 371 | "# Display schema of DataFrame\n", 372 | "df.printSchema()\n", 373 | "\n", 374 | "# Display content of DataFrame\n", 375 | "df.show(5)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "id": "6f7db26b-be18-4602-9cd8-780fc82294f6", 381 | "metadata": {}, 382 | "source": [ 383 | "#### Multi-lines JSON" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 17, 389 | "id": "c76efef7-a884-464e-8132-dabfde493dfb", 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "name": "stdout", 394 | "output_type": "stream", 395 | "text": [ 396 | "[\n", 397 | " {\n", 398 | " \"id\": 1,\n", 399 | " \"name\": \"iPhone 12\",\n", 400 | " \"category\": \"Electronics\",\n", 401 | " \"quantity\": 10,\n", 402 | " \"price\": 899.99\n", 403 | " },\n", 404 | " {\n", 405 | " \"id\": 2,\n", 406 | " \"name\": \"Nike Air Max 90\",\n", 407 | " \"category\": \"Clothing\",\n", 408 | " \"quantity\": 25,\n", 409 | " \"price\": 119.99\n", 410 | " },\n", 411 | " {\n", 412 | " \"id\": 3,\n", 413 | " \"name\": \"KitchenAid Stand Mixer\",\n", 414 | " \"category\": \"Home Appliances\",\n", 415 | " \"quantity\": 5,\n" 416 | ] 417 | } 418 | ], 419 | "source": [ 420 | "%%bash\n", 421 | "head -20 data/products_multiline.json" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 18, 427 | "id": "89755543-83fd-4f85-8829-e34ec16e9dec", 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "# Read multi-line JSON\n", 432 | "# JSON is an array of record, records are separated by a comma.\n", 433 | "# each record is defined in multiple lines\n", 434 | "json_file_path = \"./data/products_multiline.json\"\n", 435 | "df = 
spark.read.json(json_file_path, multiLine=True)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 19, 441 | "id": "c0aa892a-6784-424f-b40a-7469035cd891", 442 | "metadata": {}, 443 | "outputs": [ 444 | { 445 | "name": "stdout", 446 | "output_type": "stream", 447 | "text": [ 448 | "root\n", 449 | " |-- category: string (nullable = true)\n", 450 | " |-- id: long (nullable = true)\n", 451 | " |-- name: string (nullable = true)\n", 452 | " |-- price: double (nullable = true)\n", 453 | " |-- quantity: long (nullable = true)\n", 454 | "\n", 455 | "+---------------+---+--------------------+------+--------+\n", 456 | "| category| id| name| price|quantity|\n", 457 | "+---------------+---+--------------------+------+--------+\n", 458 | "| Electronics| 1| iPhone 12|899.99| 10|\n", 459 | "| Clothing| 2| Nike Air Max 90|119.99| 25|\n", 460 | "|Home Appliances| 3|KitchenAid Stand ...|299.99| 5|\n", 461 | "| Books| 4| The Great Gatsby| 12.99| 50|\n", 462 | "| Beauty| 5|L'Oreal Paris Mas...| 9.99| 100|\n", 463 | "+---------------+---+--------------------+------+--------+\n", 464 | "only showing top 5 rows\n", 465 | "\n" 466 | ] 467 | } 468 | ], 469 | "source": [ 470 | "# Display schema of DataFrame\n", 471 | "df.printSchema()\n", 472 | "\n", 473 | "# Display content of DataFrame\n", 474 | "df.show(5)" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 20, 480 | "id": "5f538b5b-c116-4a3a-8675-dffa5f59d047", 481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "name": "stderr", 485 | "output_type": "stream", 486 | "text": [ 487 | " \r" 488 | ] 489 | } 490 | ], 491 | "source": [ 492 | "# write dataframe into parquet file\n", 493 | "parquet_file_path = \"./data/products.parquet\"\n", 494 | "df.write.parquet(parquet_file_path)" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "id": "c0e3b6a8-6273-407c-b523-3b6a9b795d73", 500 | "metadata": {}, 501 | "source": [ 502 | "### Read parquet file into DataFrame" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 21, 508 | "id": "98025d25-3cd5-4ee4-9218-60d9b206cee0", 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "df = spark.read.parquet(parquet_file_path)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 22, 518 | "id": "394fea18-891c-4422-b484-bfb8cadb4b51", 519 | "metadata": {}, 520 | "outputs": [ 521 | { 522 | "name": "stdout", 523 | "output_type": "stream", 524 | "text": [ 525 | "root\n", 526 | " |-- category: string (nullable = true)\n", 527 | " |-- id: long (nullable = true)\n", 528 | " |-- name: string (nullable = true)\n", 529 | " |-- price: double (nullable = true)\n", 530 | " |-- quantity: long (nullable = true)\n", 531 | "\n", 532 | "+---------------+---+--------------------+------+--------+\n", 533 | "| category| id| name| price|quantity|\n", 534 | "+---------------+---+--------------------+------+--------+\n", 535 | "| Electronics| 1| iPhone 12|899.99| 10|\n", 536 | "| Clothing| 2| Nike Air Max 90|119.99| 25|\n", 537 | "|Home Appliances| 3|KitchenAid Stand ...|299.99| 5|\n", 538 | "| Books| 4| The Great Gatsby| 12.99| 50|\n", 539 | "| Beauty| 5|L'Oreal Paris Mas...| 9.99| 100|\n", 540 | "+---------------+---+--------------------+------+--------+\n", 541 | "only showing top 5 rows\n", 542 | "\n" 543 | ] 544 | } 545 | ], 546 | "source": [ 547 | "# Display schema of DataFrame\n", 548 | "df.printSchema()\n", 549 | "\n", 550 | "# Display content of DataFrame\n", 551 | "df.show(5)" 552 | ] 553 | }, 554 | { 555 | 
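
One caveat on `df.write.parquet(...)` as used above: it raises an error if the target path already exists. A save mode controls this behavior (a sketch, not in the notebook):

```python
# "overwrite" replaces an existing ./data/products.parquet directory;
# other modes are "append", "ignore", and the default "error".
df.write.mode("overwrite").parquet("./data/products.parquet")
```
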
"cell_type": "code", 556 | "execution_count": 23, 557 | "id": "27689ca9-304b-40b0-970d-7459545b4983", 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "spark.stop()" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "id": "1771b724-c7a5-4f5a-97c1-0dac5f1b03c9", 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "id": "a4657b0b-5d1e-4987-863a-bbbc9bc564b0", 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [] 579 | } 580 | ], 581 | "metadata": { 582 | "kernelspec": { 583 | "display_name": "Python 3 (ipykernel)", 584 | "language": "python", 585 | "name": "python3" 586 | }, 587 | "language_info": { 588 | "codemirror_mode": { 589 | "name": "ipython", 590 | "version": 3 591 | }, 592 | "file_extension": ".py", 593 | "mimetype": "text/x-python", 594 | "name": "python", 595 | "nbconvert_exporter": "python", 596 | "pygments_lexer": "ipython3", 597 | "version": "3.10.8" 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 5 602 | } 603 | -------------------------------------------------------------------------------- /07-DataFrame-Operations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 18, 6 | "id": "fa80df4e-9ddc-4db6-a801-717cd67ae883", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 19, 21 | "id": "01cd1819-fc7d-422e-916f-4ef8fc180bfb", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from pyspark.sql import SparkSession\n", 26 | "\n", 27 | "# Create a SparkSession\n", 28 | "spark = SparkSession.builder.appName(\"DataFrame-Operations\").getOrCreate()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 20, 34 | "id": "3a81039c-3491-4c8a-8ae4-5dab6b0af501", 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "id,name,category,quantity,price\n", 42 | "1,iPhone,Electronics,10,899.99\n", 43 | "2,Macbook,Electronics,5,1299.99\n", 44 | "3,iPad,Electronics,15,499.99\n", 45 | "4,Samsung TV,Electronics,8,799.99\n", 46 | "5,LG TV,Electronics,10,699.99\n", 47 | "6,Nike Shoes,Clothing,30,99.99\n", 48 | "7,Adidas Shoes,Clothing,25,89.99\n", 49 | "8,Sony Headphones,Electronics,12,149.99\n", 50 | "9,Beats Headphones,Electronics,20,199.99\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "%%bash\n", 56 | "head -10 data/stocks.txt" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 21, 62 | "id": "5b7e24a1-861d-4929-b3bd-5683f5bc5c1d", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# Load the synthetic data into a DataFrame\n", 67 | "data_file_path = \"./data/stocks.txt\"\n", 68 | "df = spark.read.csv(data_file_path, header=True, inferSchema=True)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 22, 74 | "id": "cdacf979-c105-4f65-9f28-4efc4c88c07a", 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "root\n", 82 | " |-- id: integer 
(nullable = true)\n", 83 | " |-- name: string (nullable = true)\n", 84 | " |-- category: string (nullable = true)\n", 85 | " |-- quantity: integer (nullable = true)\n", 86 | " |-- price: double (nullable = true)\n", 87 | "\n", 88 | "Initial DataFrame:\n", 89 | "+---+----------------+-----------+--------+-------+\n", 90 | "| id| name| category|quantity| price|\n", 91 | "+---+----------------+-----------+--------+-------+\n", 92 | "| 1| iPhone|Electronics| 10| 899.99|\n", 93 | "| 2| Macbook|Electronics| 5|1299.99|\n", 94 | "| 3| iPad|Electronics| 15| 499.99|\n", 95 | "| 4| Samsung TV|Electronics| 8| 799.99|\n", 96 | "| 5| LG TV|Electronics| 10| 699.99|\n", 97 | "| 6| Nike Shoes| Clothing| 30| 99.99|\n", 98 | "| 7| Adidas Shoes| Clothing| 25| 89.99|\n", 99 | "| 8| Sony Headphones|Electronics| 12| 149.99|\n", 100 | "| 9|Beats Headphones|Electronics| 20| 199.99|\n", 101 | "| 10| Dining Table| Furniture| 10| 249.99|\n", 102 | "+---+----------------+-----------+--------+-------+\n", 103 | "only showing top 10 rows\n", 104 | "\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "# Display schema of DataFrame\n", 110 | "df.printSchema()\n", 111 | "\n", 112 | "# Show the initial DataFrame\n", 113 | "print(\"Initial DataFrame:\")\n", 114 | "df.show(10)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "f43e6f84-16f0-4f6f-a7cf-a3de49d6ea53", 120 | "metadata": {}, 121 | "source": [ 122 | "### Select: Choose specific columns." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 23, 128 | "id": "7552160c-a792-4817-ab7c-117d5440a52c", 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "Selected Columns:\n", 136 | "+---+----------------+-------+\n", 137 | "| id| name| price|\n", 138 | "+---+----------------+-------+\n", 139 | "| 1| iPhone| 899.99|\n", 140 | "| 2| Macbook|1299.99|\n", 141 | "| 3| iPad| 499.99|\n", 142 | "| 4| Samsung TV| 799.99|\n", 143 | "| 5| LG TV| 699.99|\n", 144 | "| 6| Nike Shoes| 99.99|\n", 145 | "| 7| Adidas Shoes| 89.99|\n", 146 | "| 8| Sony Headphones| 149.99|\n", 147 | "| 9|Beats Headphones| 199.99|\n", 148 | "| 10| Dining Table| 249.99|\n", 149 | "+---+----------------+-------+\n", 150 | "only showing top 10 rows\n", 151 | "\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "# Select specific columns\n", 157 | "selected_columns = df.select(\"id\", \"name\", \"price\")\n", 158 | "print(\"Selected Columns:\")\n", 159 | "selected_columns.show(10)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "id": "1ba2c8d3-1e26-456d-94e3-6afac5a4a4a7", 165 | "metadata": {}, 166 | "source": [ 167 | "### Filter: Apply conditions to filter rows." 
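
The filter cell that follows uses an attribute condition (`df.quantity > 20`); equivalent conditions can be written with `col()` expressions or as SQL strings. A sketch (not in the notebook):

```python
from pyspark.sql.functions import col

# Both forms are equivalent to df.filter(df.quantity > 20) with an extra predicate.
df.filter((col("quantity") > 20) & (col("category") == "Clothing")).show()
df.filter("quantity > 20 AND category = 'Clothing'").show()
```
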
168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 24, 173 | "id": "0fc8c608-2fa3-4565-849b-ebc2b62025fe", 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "Filtered Data: 12\n", 181 | "+---+--------------+-----------+--------+-----+\n", 182 | "| id| name| category|quantity|price|\n", 183 | "+---+--------------+-----------+--------+-----+\n", 184 | "| 6| Nike Shoes| Clothing| 30|99.99|\n", 185 | "| 7| Adidas Shoes| Clothing| 25|89.99|\n", 186 | "| 12| Apples| Food| 100| 0.5|\n", 187 | "| 13| Bananas| Food| 150| 0.25|\n", 188 | "| 14| Oranges| Food| 120| 0.75|\n", 189 | "| 15|Chicken Breast| Food| 50| 3.99|\n", 190 | "| 16| Salmon Fillet| Food| 30| 5.99|\n", 191 | "| 24| Laptop Bag|Accessories| 25|29.99|\n", 192 | "| 25| Backpack|Accessories| 30|24.99|\n", 193 | "| 28| Jeans| Clothing| 30|59.99|\n", 194 | "| 29| T-shirt| Clothing| 50|14.99|\n", 195 | "| 30| Sneakers| Clothing| 40|79.99|\n", 196 | "+---+--------------+-----------+--------+-----+\n", 197 | "\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "# Filter rows based on a condition\n", 203 | "filtered_data = df.filter(df.quantity > 20)\n", 204 | "print(\"Filtered Data:\", filtered_data.count())\n", 205 | "filtered_data.show()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "18283acf-69eb-4140-a4dd-273c9eb5eafd", 211 | "metadata": {}, 212 | "source": [ 213 | "### GroupBy: Group data based on specific columns \n", 214 | "### Aggregations: Perform functions like sum, average, etc., on grouped data." 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 25, 220 | "id": "96d2db93-de0e-4707-81a8-3cfbb84dcf3d", 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "Grouped and Aggregated Data:\n", 228 | "+-----------+-------------+------------------+\n", 229 | "| category|sum(quantity)| avg(price)|\n", 230 | "+-----------+-------------+------------------+\n", 231 | "| Food| 450|2.2960000000000003|\n", 232 | "| Sports| 35| 34.99|\n", 233 | "|Electronics| 98| 586.6566666666665|\n", 234 | "| Clothing| 200| 99.2757142857143|\n", 235 | "| Furniture| 41| 141.99|\n", 236 | "|Accessories| 55| 27.49|\n", 237 | "+-----------+-------------+------------------+\n", 238 | "\n" 239 | ] 240 | } 241 | ], 242 | "source": [ 243 | "# GroupBy and Aggregations\n", 244 | "grouped_data = df.groupBy(\"category\").agg({\"quantity\": \"sum\", \"price\": \"avg\"})\n", 245 | "print(\"Grouped and Aggregated Data:\")\n", 246 | "grouped_data.show()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "id": "a233a823-73b1-404f-b8e0-90ee5e78c4e4", 252 | "metadata": {}, 253 | "source": [ 254 | "### Join: Combine multiple DataFrames based on specified columns." 
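
The join cell below keeps both frames' `category` columns (the output shows the name twice), which would make `joined_data["category"]` ambiguous. A sketch of avoiding that by renaming before the join (not in the notebook):

```python
# Rename the right-hand column so both survive unambiguously.
df2_renamed = df2.withColumnRenamed("category", "category_2")
joined_data = df.join(df2_renamed, "id", "inner")
joined_data.select("id", "name", "category", "category_2").show()
```
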
255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 26, 260 | "id": "494f996e-57ee-44c0-82d8-814ee777653e", 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "Joined Data:\n", 268 | "+---+----------------+-----------+--------+-------+-----------+\n", 269 | "| id| name| category|quantity| price| category|\n", 270 | "+---+----------------+-----------+--------+-------+-----------+\n", 271 | "| 1| iPhone|Electronics| 10| 899.99|Electronics|\n", 272 | "| 2| Macbook|Electronics| 5|1299.99|Electronics|\n", 273 | "| 3| iPad|Electronics| 15| 499.99|Electronics|\n", 274 | "| 4| Samsung TV|Electronics| 8| 799.99|Electronics|\n", 275 | "| 5| LG TV|Electronics| 10| 699.99|Electronics|\n", 276 | "| 6| Nike Shoes| Clothing| 30| 99.99| Clothing|\n", 277 | "| 7| Adidas Shoes| Clothing| 25| 89.99| Clothing|\n", 278 | "| 8| Sony Headphones|Electronics| 12| 149.99|Electronics|\n", 279 | "| 9|Beats Headphones|Electronics| 20| 199.99|Electronics|\n", 280 | "| 10| Dining Table| Furniture| 10| 249.99| Furniture|\n", 281 | "+---+----------------+-----------+--------+-------+-----------+\n", 282 | "\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "# Join with another DataFrame\n", 288 | "df2 = df.select(\"id\", \"category\").limit(10)\n", 289 | "joined_data = df.join(df2, \"id\", \"inner\")\n", 290 | "print(\"Joined Data:\")\n", 291 | "joined_data.show()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "id": "6e71e549-8194-4a95-ae3a-9db0a6afa5dc", 297 | "metadata": {}, 298 | "source": [ 299 | "### Sort: Arrange rows based on one or more columns." 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 27, 305 | "id": "133ab21b-84ca-48e7-a8bf-8c7e487e455c", 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "name": "stdout", 310 | "output_type": "stream", 311 | "text": [ 312 | "Sorted Data:\n", 313 | "+---+--------------+-----------+--------+-----+\n", 314 | "| id| name| category|quantity|price|\n", 315 | "+---+--------------+-----------+--------+-----+\n", 316 | "| 13| Bananas| Food| 150| 0.25|\n", 317 | "| 12| Apples| Food| 100| 0.5|\n", 318 | "| 14| Oranges| Food| 120| 0.75|\n", 319 | "| 15|Chicken Breast| Food| 50| 3.99|\n", 320 | "| 16| Salmon Fillet| Food| 30| 5.99|\n", 321 | "| 29| T-shirt| Clothing| 50|14.99|\n", 322 | "| 19| Yoga Mat| Sports| 20|19.99|\n", 323 | "| 25| Backpack|Accessories| 30|24.99|\n", 324 | "| 24| Laptop Bag|Accessories| 25|29.99|\n", 325 | "| 20| Dumbbell Set| Sports| 15|49.99|\n", 326 | "+---+--------------+-----------+--------+-----+\n", 327 | "only showing top 10 rows\n", 328 | "\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "# Sort by a column\n", 334 | "sorted_data = df.orderBy(\"price\")\n", 335 | "print(\"Sorted Data:\")\n", 336 | "sorted_data.show(10)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 28, 342 | "id": "a5a0a80e-dc5c-4569-ade9-e209560eccb8", 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "Sorted Data Descending:\n", 350 | "+---+----------------+-----------+--------+-------+\n", 351 | "| id| name| category|quantity| price|\n", 352 | "+---+----------------+-----------+--------+-------+\n", 353 | "| 2| Macbook|Electronics| 5|1299.99|\n", 354 | "| 1| iPhone|Electronics| 10| 899.99|\n", 355 | "| 4| Samsung TV|Electronics| 8| 799.99|\n", 356 | "| 5| LG TV|Electronics| 10| 699.99|\n", 357 | "| 26| Camera|Electronics| 
10| 599.99|\n", 358 | "| 3| iPad|Electronics| 15| 499.99|\n", 359 | "| 10| Dining Table| Furniture| 10| 249.99|\n", 360 | "| 17| Leather Jacket| Clothing| 15| 199.99|\n", 361 | "| 9|Beats Headphones|Electronics| 20| 199.99|\n", 362 | "| 18| Winter Coat| Clothing| 10| 149.99|\n", 363 | "+---+----------------+-----------+--------+-------+\n", 364 | "only showing top 10 rows\n", 365 | "\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "# Sort by a column desc\n", 371 | "from pyspark.sql.functions import col, desc\n", 372 | "sorted_data = df.orderBy(col(\"price\").desc(), col(\"id\").desc())\n", 373 | "print(\"Sorted Data Descending:\")\n", 374 | "sorted_data.show(10)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "id": "67580e00-93f8-4579-9972-fc64a0654366", 380 | "metadata": {}, 381 | "source": [ 382 | "### Distinct: Get unique rows." 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 29, 388 | "id": "744e6638-5689-4509-9df4-4abb03cd9e9b", 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "name": "stdout", 393 | "output_type": "stream", 394 | "text": [ 395 | "Distinct Product Categories:\n", 396 | "+-----------+\n", 397 | "| category|\n", 398 | "+-----------+\n", 399 | "| Food|\n", 400 | "| Sports|\n", 401 | "|Electronics|\n", 402 | "| Clothing|\n", 403 | "| Furniture|\n", 404 | "|Accessories|\n", 405 | "+-----------+\n", 406 | "\n" 407 | ] 408 | } 409 | ], 410 | "source": [ 411 | "# Get distinct product category\n", 412 | "distinct_rows = df.select(\"category\").distinct()\n", 413 | "print(\"Distinct Product Categories:\")\n", 414 | "distinct_rows.show()" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "id": "4c847aac-dcde-4589-aef7-c0ec95c2f80f", 420 | "metadata": {}, 421 | "source": [ 422 | "### Drop: Remove specified columns." 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 30, 428 | "id": "21d4afa3-20b5-4299-931a-0fba2655b509", 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "Dropped Columns:\n", 436 | "+---+----------------+-------+\n", 437 | "| id| name| price|\n", 438 | "+---+----------------+-------+\n", 439 | "| 1| iPhone| 899.99|\n", 440 | "| 2| Macbook|1299.99|\n", 441 | "| 3| iPad| 499.99|\n", 442 | "| 4| Samsung TV| 799.99|\n", 443 | "| 5| LG TV| 699.99|\n", 444 | "| 6| Nike Shoes| 99.99|\n", 445 | "| 7| Adidas Shoes| 89.99|\n", 446 | "| 8| Sony Headphones| 149.99|\n", 447 | "| 9|Beats Headphones| 199.99|\n", 448 | "| 10| Dining Table| 249.99|\n", 449 | "+---+----------------+-------+\n", 450 | "only showing top 10 rows\n", 451 | "\n" 452 | ] 453 | } 454 | ], 455 | "source": [ 456 | "# Drop columns\n", 457 | "dropped_columns = df.drop(\"quantity\", \"category\")\n", 458 | "print(\"Dropped Columns:\")\n", 459 | "dropped_columns.show(10)" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "id": "afc28820-f951-4c1e-99bc-e56ff434cc11", 465 | "metadata": {}, 466 | "source": [ 467 | "### WithColumn: Add new calculated columns." 
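A note on the Distinct and Drop cells above: `distinct()` deduplicates entire rows of whatever projection you give it, while `dropDuplicates` can deduplicate on a subset of columns and still keep the remaining ones. A minimal sketch with illustrative inline data (not the notebook's stocks dataset):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DedupSketch").getOrCreate()
df = spark.createDataFrame(
    [(1, "iPhone", "Electronics"), (2, "iPad", "Electronics"), (3, "Jeans", "Clothing")],
    ["id", "name", "category"],
)

# distinct() on a single-column projection: unique categories only
df.select("category").distinct().show()

# dropDuplicates on a subset: one full row (an arbitrary one) kept per category
df.dropDuplicates(["category"]).show()
```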
468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 31, 473 | "id": "fb391702-9e54-4e3d-822b-0d0f1d0d08e6", 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "DataFrame with New Column:\n", 481 | "+---+----------------+-----------+--------+-------+-------+\n", 482 | "| id| name| category|quantity| price|revenue|\n", 483 | "+---+----------------+-----------+--------+-------+-------+\n", 484 | "| 1| iPhone|Electronics| 10| 899.99| 8999.9|\n", 485 | "| 2| Macbook|Electronics| 5|1299.99|6499.95|\n", 486 | "| 3| iPad|Electronics| 15| 499.99|7499.85|\n", 487 | "| 4| Samsung TV|Electronics| 8| 799.99|6399.92|\n", 488 | "| 5| LG TV|Electronics| 10| 699.99| 6999.9|\n", 489 | "| 6| Nike Shoes| Clothing| 30| 99.99| 2999.7|\n", 490 | "| 7| Adidas Shoes| Clothing| 25| 89.99|2249.75|\n", 491 | "| 8| Sony Headphones|Electronics| 12| 149.99|1799.88|\n", 492 | "| 9|Beats Headphones|Electronics| 20| 199.99| 3999.8|\n", 493 | "| 10| Dining Table| Furniture| 10| 249.99| 2499.9|\n", 494 | "+---+----------------+-----------+--------+-------+-------+\n", 495 | "only showing top 10 rows\n", 496 | "\n" 497 | ] 498 | } 499 | ], 500 | "source": [ 501 | "# Add a new calculated column\n", 502 | "df_with_new_column = df.withColumn(\"revenue\", df.quantity * df.price)\n", 503 | "print(\"DataFrame with New Column:\")\n", 504 | "df_with_new_column.show(10)" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "id": "c2138074-68c4-403b-aac6-41fee4595417", 510 | "metadata": {}, 511 | "source": [ 512 | "### Alias: Rename columns for better readability." 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 32, 518 | "id": "669657f4-63a0-48f6-bc9e-a1bebeae466e", 519 | "metadata": {}, 520 | "outputs": [ 521 | { 522 | "name": "stdout", 523 | "output_type": "stream", 524 | "text": [ 525 | "DataFrame with Aliased Column:\n", 526 | "+---+----------------+-----------+--------+-------------+\n", 527 | "| id| name| category|quantity|product_price|\n", 528 | "+---+----------------+-----------+--------+-------------+\n", 529 | "| 1| iPhone|Electronics| 10| 899.99|\n", 530 | "| 2| Macbook|Electronics| 5| 1299.99|\n", 531 | "| 3| iPad|Electronics| 15| 499.99|\n", 532 | "| 4| Samsung TV|Electronics| 8| 799.99|\n", 533 | "| 5| LG TV|Electronics| 10| 699.99|\n", 534 | "| 6| Nike Shoes| Clothing| 30| 99.99|\n", 535 | "| 7| Adidas Shoes| Clothing| 25| 89.99|\n", 536 | "| 8| Sony Headphones|Electronics| 12| 149.99|\n", 537 | "| 9|Beats Headphones|Electronics| 20| 199.99|\n", 538 | "| 10| Dining Table| Furniture| 10| 249.99|\n", 539 | "+---+----------------+-----------+--------+-------------+\n", 540 | "only showing top 10 rows\n", 541 | "\n" 542 | ] 543 | } 544 | ], 545 | "source": [ 546 | "# Rename columns using withColumnRenamed\n", 547 | "df_with_alias = df.withColumnRenamed(\"price\", \"product_price\")\n", 548 | "print(\"DataFrame with Aliased Column:\")\n", 549 | "df_with_alias.show(10)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 33, 555 | "id": "f305239c-fa2b-4378-9de8-bfc58e5f244f", 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "# Stop the SparkSession\n", 560 | "spark.stop()" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "id": "b89aa6d1-157a-46da-b9a2-c09feeb2c82e", 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [] 570 | } 571 | ], 572 | "metadata": { 573 | "kernelspec": { 574 | "display_name": "Python 3
(ipykernel)", 575 | "language": "python", 576 | "name": "python3" 577 | }, 578 | "language_info": { 579 | "codemirror_mode": { 580 | "name": "ipython", 581 | "version": 3 582 | }, 583 | "file_extension": ".py", 584 | "mimetype": "text/x-python", 585 | "name": "python", 586 | "nbconvert_exporter": "python", 587 | "pygments_lexer": "ipython3", 588 | "version": "3.10.8" 589 | } 590 | }, 591 | "nbformat": 4, 592 | "nbformat_minor": 5 593 | } 594 | -------------------------------------------------------------------------------- /08-Spark-SQL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 52, 6 | "id": "7f41cf64-565f-48b4-a5e5-4d48fc404270", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 54, 21 | "id": "958a52c6-da3d-45b5-9c2e-27e579c6066d", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from pyspark.sql import SparkSession\n", 26 | "\n", 27 | "# Create a SparkSession\n", 28 | "spark = SparkSession.builder.appName(\"DataFrameSQL\").getOrCreate()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 55, 34 | "id": "d3a86cbb-583f-427d-970b-de58f9e7bcf4", 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "name,age,gender,salary\n", 42 | "John Doe,30,Male,50000\n", 43 | "Jane Smith,25,Female,45000\n", 44 | "David Johnson,35,Male,60000\n", 45 | "Emily Davis,28,Female,52000\n", 46 | "Michael Wilson,40,Male,75000\n", 47 | "Sarah Brown,32,Female,58000\n", 48 | "Robert Lee,29,Male,51000\n", 49 | "Lisa Garcia,27,Female,49000\n", 50 | "James Martinez,38,Male,70000\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "%%bash\n", 56 | "head -10 ./data/persons.csv" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "24c56d31-71d2-48c0-8d22-b7eb9c4c347f", 62 | "metadata": {}, 63 | "source": [ 64 | "### Load Data into a DataFrame" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 56, 70 | "id": "dcd9c1d7-ccc2-4cee-8547-c23f3bee42bb", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# Load the synthetic data into a DataFrame\n", 75 | "data_file_path = \"./data/persons.csv\"\n", 76 | "df = spark.read.csv(data_file_path, header=True, inferSchema=True)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 57, 82 | "id": "99b0d2db-5d0e-4e82-9d45-95029f614319", 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "root\n", 90 | " |-- name: string (nullable = true)\n", 91 | " |-- age: integer (nullable = true)\n", 92 | " |-- gender: string (nullable = true)\n", 93 | " |-- salary: integer (nullable = true)\n", 94 | "\n", 95 | "Initial DataFrame:\n", 96 | "+------------------+---+------+------+\n", 97 | "| name|age|gender|salary|\n", 98 | "+------------------+---+------+------+\n", 99 | "| John Doe| 30| Male| 50000|\n", 100 | "| Jane Smith| 25|Female| 45000|\n", 101 | "| David Johnson| 35| Male| 60000|\n", 102 | "| Emily Davis| 28|Female| 52000|\n", 103 | "| Michael Wilson| 40| Male| 75000|\n", 104 | "| 
Sarah Brown| 32|Female| 58000|\n", 105 | "| Robert Lee| 29| Male| 51000|\n", 106 | "| Lisa Garcia| 27|Female| 49000|\n", 107 | "| James Martinez| 38| Male| 70000|\n", 108 | "|Jennifer Rodriguez| 26|Female| 47000|\n", 109 | "+------------------+---+------+------+\n", 110 | "only showing top 10 rows\n", 111 | "\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "# Display schema of DataFrame\n", 117 | "df.printSchema()\n", 118 | "\n", 119 | "# Show the initial DataFrame\n", 120 | "print(\"Initial DataFrame:\")\n", 121 | "df.show(10)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "id": "1293db0e-3af1-40d3-9fed-15cb5c6d54f1", 127 | "metadata": {}, 128 | "source": [ 129 | "### Register the DataFrame as a Temporary Table" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 58, 135 | "id": "439040dd-8c13-48c9-9a72-acd313da407a", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "# Register the DataFrame as a Temporary Table\n", 140 | "df.createOrReplaceTempView(\"my_table\")" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "id": "3c0ed5db-3985-4251-935a-3c01edd47005", 146 | "metadata": {}, 147 | "source": [ 148 | "### Perform SQL-like Queries" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 59, 154 | "id": "ccf8aa7d-e77f-4fca-93d9-fd4f2581803d", 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "+------------------+---+------+------+\n", 162 | "| name|age|gender|salary|\n", 163 | "+------------------+---+------+------+\n", 164 | "| John Doe| 30| Male| 50000|\n", 165 | "| David Johnson| 35| Male| 60000|\n", 166 | "| Emily Davis| 28|Female| 52000|\n", 167 | "| Michael Wilson| 40| Male| 75000|\n", 168 | "| Sarah Brown| 32|Female| 58000|\n", 169 | "| Robert Lee| 29| Male| 51000|\n", 170 | "| Lisa Garcia| 27|Female| 49000|\n", 171 | "| James Martinez| 38| Male| 70000|\n", 172 | "|Jennifer Rodriguez| 26|Female| 47000|\n", 173 | "| William Anderson| 33| Male| 62000|\n", 174 | "| Karen Hernandez| 31|Female| 55000|\n", 175 | "|Christopher Taylor| 37| Male| 69000|\n", 176 | "| Matthew Davis| 36| Male| 67000|\n", 177 | "| Patricia White| 29|Female| 50000|\n", 178 | "| Daniel Miller| 34| Male| 64000|\n", 179 | "| Elizabeth Jackson| 30|Female| 52000|\n", 180 | "| Joseph Harris| 28| Male| 53000|\n", 181 | "| Linda Martin| 39|Female| 71000|\n", 182 | "+------------------+---+------+------+\n", 183 | "\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "# Select all rows where age is greater than 25\n", 189 | "result = spark.sql(\"SELECT * FROM my_table WHERE age > 25\")\n", 190 | "\n", 191 | "result.show()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 60, 197 | "id": "8b88f371-fd06-4aed-9fca-bdb4ce1362ff", 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "+------+----------+\n", 205 | "|gender|avg_salary|\n", 206 | "+------+----------+\n", 207 | "|Female| 52300.0|\n", 208 | "| Male| 62100.0|\n", 209 | "+------+----------+\n", 210 | "\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "# Compute the average salary by gender\n", 216 | "avg_salary_by_gender = spark.sql(\"SELECT gender, AVG(salary) as avg_salary FROM my_table GROUP BY gender\")\n", 217 | "avg_salary_by_gender.show()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "id": "62b21e82-65e2-4b09-b344-a5cc72d9d430", 223 | "metadata": {}, 224 | "source": [ 
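The aggregation in the `spark.sql` cell above can also be expressed through the DataFrame API; both routes compile to the same query plan, so the choice is largely a matter of style. A sketch assuming a DataFrame with `gender` and `salary` columns like `persons.csv` (the app name and inline rows are illustrative):

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("SqlVsDataFrame").getOrCreate()
df = spark.createDataFrame(
    [("Male", 50000), ("Female", 45000), ("Male", 60000)],
    ["gender", "salary"],
)

# DataFrame-API equivalent of:
#   SELECT gender, AVG(salary) AS avg_salary FROM my_table GROUP BY gender
df.groupBy("gender").agg(F.avg("salary").alias("avg_salary")).show()
```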
225 | "### Creating and managing temporary views." 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 61, 231 | "id": "6469c424-87cd-4dfa-b015-1c3f8e591de0", 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "# Create a temporary view\n", 236 | "df.createOrReplaceTempView(\"people\")" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 62, 242 | "id": "9198c861-e79a-4fac-80cc-747c94f062ce", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "+------------------+---+------+------+\n", 250 | "| name|age|gender|salary|\n", 251 | "+------------------+---+------+------+\n", 252 | "| John Doe| 30| Male| 50000|\n", 253 | "| David Johnson| 35| Male| 60000|\n", 254 | "| Emily Davis| 28|Female| 52000|\n", 255 | "| Michael Wilson| 40| Male| 75000|\n", 256 | "| Sarah Brown| 32|Female| 58000|\n", 257 | "| Robert Lee| 29| Male| 51000|\n", 258 | "| Lisa Garcia| 27|Female| 49000|\n", 259 | "| James Martinez| 38| Male| 70000|\n", 260 | "|Jennifer Rodriguez| 26|Female| 47000|\n", 261 | "| William Anderson| 33| Male| 62000|\n", 262 | "| Karen Hernandez| 31|Female| 55000|\n", 263 | "|Christopher Taylor| 37| Male| 69000|\n", 264 | "| Matthew Davis| 36| Male| 67000|\n", 265 | "| Patricia White| 29|Female| 50000|\n", 266 | "| Daniel Miller| 34| Male| 64000|\n", 267 | "| Elizabeth Jackson| 30|Female| 52000|\n", 268 | "| Joseph Harris| 28| Male| 53000|\n", 269 | "| Linda Martin| 39|Female| 71000|\n", 270 | "+------------------+---+------+------+\n", 271 | "\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "# Query the temporary view\n", 277 | "result = spark.sql(\"SELECT * FROM people WHERE age > 25\")\n", 278 | "\n", 279 | "result.show()" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 63, 285 | "id": "b66e2493-82c9-4a46-a2c5-049fe9572e7b", 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "True" 292 | ] 293 | }, 294 | "execution_count": 63, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "# Check if a temporary view exists\n", 301 | "view_exists = spark.catalog.tableExists(\"people\")\n", 302 | "view_exists" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 64, 308 | "id": "6d363610-7dc6-4a80-bc2d-c1269691e437", 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "data": { 313 | "text/plain": [ 314 | "True" 315 | ] 316 | }, 317 | "execution_count": 64, 318 | "metadata": {}, 319 | "output_type": "execute_result" 320 | } 321 | ], 322 | "source": [ 323 | "# Drop a temporary view\n", 324 | "spark.catalog.dropTempView(\"people\")" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 65, 330 | "id": "38f7e2b9-8d1a-4060-8791-7cbfed9d8b22", 331 | "metadata": {}, 332 | "outputs": [ 333 | { 334 | "data": { 335 | "text/plain": [ 336 | "False" 337 | ] 338 | }, 339 | "execution_count": 65, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "# Check if a temporary view exists\n", 346 | "view_exists = spark.catalog.tableExists(\"people\")\n", 347 | "view_exists" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "id": "8bcdf631-ee1d-496f-858b-a13d0b43cb18", 353 | "metadata": {}, 354 | "source": [ 355 | "### Subquries" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 66, 361 | "id": "5cc8db71-4862-4169-b2d0-7e38ccdf8331", 
362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stdout", 366 | "output_type": "stream", 367 | "text": [ 368 | "+---+-------+\n", 369 | "| id| name|\n", 370 | "+---+-------+\n", 371 | "| 1| John|\n", 372 | "| 2| Alice|\n", 373 | "| 3| Bob|\n", 374 | "| 4| Emily|\n", 375 | "| 5| David|\n", 376 | "| 6| Sarah|\n", 377 | "| 7|Michael|\n", 378 | "| 8| Lisa|\n", 379 | "| 9|William|\n", 380 | "+---+-------+\n", 381 | "\n", 382 | "+----------+---+------+\n", 383 | "|department| id|salary|\n", 384 | "+----------+---+------+\n", 385 | "| HR| 1| 60000|\n", 386 | "| HR| 2| 55000|\n", 387 | "| HR| 3| 58000|\n", 388 | "| IT| 4| 70000|\n", 389 | "| IT| 5| 72000|\n", 390 | "| IT| 6| 68000|\n", 391 | "| Sales| 7| 75000|\n", 392 | "| Sales| 8| 78000|\n", 393 | "| Sales| 9| 77000|\n", 394 | "+----------+---+------+\n", 395 | "\n" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "# Create DataFrames\n", 401 | "employee_data = [\n", 402 | " (1, \"John\"), (2, \"Alice\"), (3, \"Bob\"), (4, \"Emily\"),\n", 403 | " (5, \"David\"), (6, \"Sarah\"), (7, \"Michael\"), (8, \"Lisa\"),\n", 404 | " (9, \"William\")\n", 405 | "]\n", 406 | "employees = spark.createDataFrame(employee_data, [\"id\", \"name\"])\n", 407 | "\n", 408 | "salary_data = [\n", 409 | " (\"HR\", 1, 60000), (\"HR\", 2, 55000), (\"HR\", 3, 58000),\n", 410 | " (\"IT\", 4, 70000), (\"IT\", 5, 72000), (\"IT\", 6, 68000),\n", 411 | " (\"Sales\", 7, 75000), (\"Sales\", 8, 78000), (\"Sales\", 9, 77000)\n", 412 | "]\n", 413 | "salaries = spark.createDataFrame(salary_data, [\"department\", \"id\", \"salary\"])\n", 414 | "\n", 415 | "employees.show()\n", 416 | "\n", 417 | "salaries.show()" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 67, 423 | "id": "871a978f-82f9-4388-afae-77d4b0e7297b", 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "# Register as temporary views\n", 428 | "employees.createOrReplaceTempView(\"employees\")\n", 429 | "salaries.createOrReplaceTempView(\"salaries\")" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 68, 435 | "id": "c0a3c981-b8d4-409c-bd3a-aaff55ee3661", 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "name": "stdout", 440 | "output_type": "stream", 441 | "text": [ 442 | "+-------+\n", 443 | "| name|\n", 444 | "+-------+\n", 445 | "| Emily|\n", 446 | "| David|\n", 447 | "|Michael|\n", 448 | "| Lisa|\n", 449 | "|William|\n", 450 | "+-------+\n", 451 | "\n" 452 | ] 453 | } 454 | ], 455 | "source": [ 456 | "# Subquery to find employees with salaries above average\n", 457 | "result = spark.sql(\"\"\"\n", 458 | " SELECT name\n", 459 | " FROM employees\n", 460 | " WHERE id IN (\n", 461 | " SELECT id\n", 462 | " FROM salaries\n", 463 | " WHERE salary > (SELECT AVG(salary) FROM salaries)\n", 464 | " )\n", 465 | "\"\"\")\n", 466 | "\n", 467 | "result.show()" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "id": "5237a76b-d747-4096-a013-41cc913cd9c0", 473 | "metadata": {}, 474 | "source": [ 475 | "### Window Function" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 69, 481 | "id": "afd58076-cada-4552-a676-7919cb17ba2b", 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "from pyspark.sql.window import Window\n", 486 | "from pyspark.sql import functions as F" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": 70, 492 | "id": "5669387e-aa09-4830-a4c4-c32056e38d5f", 493 | "metadata": {}, 494 | "outputs": [ 495 | { 496 | "name": "stdout", 497 | "output_type": 
"stream", 498 | "text": [ 499 | "+----------+---+------+-------+\n", 500 | "|department| id|salary| name|\n", 501 | "+----------+---+------+-------+\n", 502 | "| HR| 1| 60000| John|\n", 503 | "| HR| 2| 55000| Alice|\n", 504 | "| HR| 3| 58000| Bob|\n", 505 | "| IT| 4| 70000| Emily|\n", 506 | "| IT| 5| 72000| David|\n", 507 | "| IT| 6| 68000| Sarah|\n", 508 | "| Sales| 7| 75000|Michael|\n", 509 | "| Sales| 8| 78000| Lisa|\n", 510 | "| Sales| 9| 77000|William|\n", 511 | "+----------+---+------+-------+\n", 512 | "\n" 513 | ] 514 | } 515 | ], 516 | "source": [ 517 | "employee_salary = spark.sql(\"\"\"\n", 518 | " select salaries.*, employees.name\n", 519 | " from salaries \n", 520 | " left join employees on salaries.id = employees.id\n", 521 | "\"\"\")\n", 522 | "\n", 523 | "employee_salary.show()" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 71, 529 | "id": "0f2fb287-69a4-4860-a13b-f76646e2f465", 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "# Create a window specification\n", 534 | "window_spec = Window.partitionBy(\"department\").orderBy(F.desc(\"salary\"))" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 72, 540 | "id": "dcf9ca4b-f1ce-43f1-ac72-bac31c32e34f", 541 | "metadata": {}, 542 | "outputs": [ 543 | { 544 | "name": "stdout", 545 | "output_type": "stream", 546 | "text": [ 547 | "+----------+---+------+-------+----+\n", 548 | "|department| id|salary| name|rank|\n", 549 | "+----------+---+------+-------+----+\n", 550 | "| HR| 1| 60000| John| 1|\n", 551 | "| HR| 3| 58000| Bob| 2|\n", 552 | "| HR| 2| 55000| Alice| 3|\n", 553 | "| IT| 5| 72000| David| 1|\n", 554 | "| IT| 4| 70000| Emily| 2|\n", 555 | "| IT| 6| 68000| Sarah| 3|\n", 556 | "| Sales| 8| 78000| Lisa| 1|\n", 557 | "| Sales| 9| 77000|William| 2|\n", 558 | "| Sales| 7| 75000|Michael| 3|\n", 559 | "+----------+---+------+-------+----+\n", 560 | "\n" 561 | ] 562 | } 563 | ], 564 | "source": [ 565 | "# Calculate the rank of employees within each department based on salary\n", 566 | "employee_salary.withColumn(\"rank\", F.rank().over(window_spec)).show()" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 73, 572 | "id": "7d85a9f5-3da2-4ef7-b201-7bb43ccc6bc2", 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [ 576 | "# Stop the SparkSession\n", 577 | "spark.stop()" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "id": "fea9d104-e4b3-4be0-9fae-43fdbc391691", 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [] 587 | } 588 | ], 589 | "metadata": { 590 | "kernelspec": { 591 | "display_name": "Python 3 (ipykernel)", 592 | "language": "python", 593 | "name": "python3" 594 | }, 595 | "language_info": { 596 | "codemirror_mode": { 597 | "name": "ipython", 598 | "version": 3 599 | }, 600 | "file_extension": ".py", 601 | "mimetype": "text/x-python", 602 | "name": "python", 603 | "nbconvert_exporter": "python", 604 | "pygments_lexer": "ipython3", 605 | "version": "3.10.8" 606 | } 607 | }, 608 | "nbformat": 4, 609 | "nbformat_minor": 5 610 | } 611 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 coder2j 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, 
including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySpark Tutorial for Beginners - Jupyter Notebooks 2 | 3 | Welcome to the PySpark Tutorial for Beginners GitHub repository! This repository contains a collection of Jupyter notebooks used in my comprehensive [YouTube video: PySpark tutorial for beginners](https://youtu.be/EB8lfdxpirM). These notebooks provide hands-on examples and code snippets to help you understand and practice PySpark concepts covered in the tutorial video. 4 | 5 | If you find this tutorial helpful, consider sharing this video with your friends and colleagues to help them discover the power of PySpark and unlock the following bonus videos. 6 | 7 | 🎁 Bonus Videos: 8 | - Hit **50,000 views** to unlock a video about building an **end-to-end machine-learning pipeline with PySpark**. 9 | - Hit **100,000 views** to unlock another video about **end-to-end Spark streaming**. 10 | 11 | Do you like this tutorial? Why not check out my other video, [Airflow Tutorial for Beginners](https://youtu.be/K9AnJ9_ZAXE), which has more than **350k views 👀** and around **7k likes 👍**. 12 | 13 | Don't forget to subscribe to my [YouTube channel](https://www.youtube.com/c/coder2j) and [my blog](https://coder2j.com/) for more exciting tutorials like this. And connect with me on [X/Twitter](https://twitter.com/coder2j) and [LinkedIn](https://www.linkedin.com/in/coder2j/); I post content there regularly too. Thank you for your support! ❤️ 14 | 15 | 16 | ## Table of Contents 17 | 18 | - [Introduction](#introduction) 19 | - [Getting Started](#getting-started) 20 | - [Notebook Descriptions](#notebook-descriptions) 21 | - [Prerequisites](#prerequisites) 22 | - [Usage](#usage) 23 | - [Contributing](#contributing) 24 | - [License](#license) 25 | 26 | ## Introduction 27 | 28 | In our [PySpark tutorial video](https://youtu.be/EB8lfdxpirM), we covered various topics, including Spark installation, SparkContext, SparkSession, RDD transformations and actions, Spark DataFrames, Spark SQL, and more. These Jupyter notebooks are designed to complement the video content, allowing you to follow along, experiment, and practice your PySpark skills. 29 | 30 | ## Getting Started 31 | 32 | To get started with the Jupyter notebooks, follow these steps: 33 | 34 | 1. Clone this GitHub repository to your local machine using the following command: 35 | 36 | ```bash 37 | git clone https://github.com/coder2j/pyspark-tutorial.git 38 | ``` 39 | 40 | 2. 
Ensure you have Python and Jupyter Notebook installed on your machine. 41 | 42 | 3. Follow part 2 of the YouTube video (Spark Installation) to make sure Spark is installed on your machine. 43 | 44 | 4. Launch Jupyter Notebook by running: 45 | 46 | ```bash 47 | jupyter notebook 48 | ``` 49 | 50 | 5. Open the notebook you want to work on and start experimenting with PySpark. 51 | 52 | ## Notebook Descriptions 53 | 54 | - **Notebook 1 - 01-PySpark-Get-Started.ipynb**: Instructions and commands for setting the PySpark environment variables to use Spark in a Jupyter notebook. 55 | 56 | - **Notebook 2 - 02-Create-SparkContext.ipynb**: Creating SparkContext objects in different PySpark versions. 57 | 58 | 59 | - **Notebook 3 - 03-Create-SparkSession.ipynb**: Creating SparkSession objects in PySpark. 60 | 61 | - **Notebook 4 - 04-RDD-Operations.ipynb**: Creating RDDs and demonstrating RDD transformations and actions. 62 | 63 | - **Notebook 5 - 05-DataFrame-Intro.ipynb**: Introduction to Spark DataFrames and how they differ from RDDs. 64 | 65 | - **Notebook 6 - 06-DataFrame-from-various-data-source.ipynb**: Creating Spark DataFrames from various data sources. 66 | 67 | - **Notebook 7 - 07-DataFrame-Operations.ipynb**: Performing Spark DataFrame operations such as filtering and aggregation. 68 | 69 | - **Notebook 8 - 08-Spark-SQL.ipynb**: Converting a Spark DataFrame to a temporary table or view and performing SQL operations using Spark SQL. 70 | 71 | Feel free to explore and run these notebooks at your own pace. 72 | 73 | ## Prerequisites 74 | 75 | To make the most of these notebooks, you should have the following prerequisites: 76 | 77 | - Basic knowledge of Python programming. 78 | 79 | - Understanding of data processing concepts (though no prior PySpark experience is required). 80 | 81 | ## Usage 82 | 83 | These notebooks are meant for self-learning and practice. Follow along with the [tutorial video](https://youtu.be/EB8lfdxpirM) to gain a deeper understanding of PySpark concepts. Experiment with the code, modify it, and try additional exercises to solidify your skills. 84 | 85 | ## Contributing 86 | 87 | If you'd like to contribute to this repository by adding more notebooks, improving documentation, or fixing issues, please feel free to fork the repository, make your changes, and submit a pull request. We welcome contributions from the community! 88 | 89 | ## License 90 | 91 | This project is licensed under the [MIT License](LICENSE). 92 | -------------------------------------------------------------------------------- /data/data.txt: -------------------------------------------------------------------------------- 1 | Apache Spark has its architectural foundation in the resilient distributed dataset (RDD), a read-only multiset of data items distributed over a cluster of machines, that is maintained in a fault-tolerant way. The Dataframe API was released as an abstraction on top of the RDD, followed by the Dataset API. In Spark 1.x, the RDD was the primary application programming interface (API), but as of Spark 2.x use of the Dataset API is encouraged even though the RDD API is not deprecated. The RDD technology still underlies the Dataset API. 2 | 3 | Spark and its RDDs were developed in 2012 in response to limitations in the MapReduce cluster computing paradigm, which forces a particular linear dataflow structure on distributed programs: MapReduce programs read input data from disk, map a function across the data, reduce the results of the map, and store reduction results on disk. 
Spark's RDDs function as a working set for distributed programs that offers a (deliberately) restricted form of distributed shared memory. -------------------------------------------------------------------------------- /data/persons.csv: -------------------------------------------------------------------------------- 1 | name,age,gender,salary 2 | John Doe,30,Male,50000 3 | Jane Smith,25,Female,45000 4 | David Johnson,35,Male,60000 5 | Emily Davis,28,Female,52000 6 | Michael Wilson,40,Male,75000 7 | Sarah Brown,32,Female,58000 8 | Robert Lee,29,Male,51000 9 | Lisa Garcia,27,Female,49000 10 | James Martinez,38,Male,70000 11 | Jennifer Rodriguez,26,Female,47000 12 | William Anderson,33,Male,62000 13 | Karen Hernandez,31,Female,55000 14 | Christopher Taylor,37,Male,69000 15 | Mary Gonzalez,24,Female,44000 16 | Matthew Davis,36,Male,67000 17 | Patricia White,29,Female,50000 18 | Daniel Miller,34,Male,64000 19 | Elizabeth Jackson,30,Female,52000 20 | Joseph Harris,28,Male,53000 21 | Linda Martin,39,Female,71000 22 | -------------------------------------------------------------------------------- /data/products.csv: -------------------------------------------------------------------------------- 1 | id,name,category,quantity,price 2 | 1,iPhone 12,Electronics,10,899.99 3 | 2,Nike Air Max 90,Clothing,25,119.99 4 | 3,KitchenAid Stand Mixer,Home Appliances,5,299.99 5 | 4,The Great Gatsby,Books,50,12.99 6 | 5,L'Oreal Paris Mascara,Beauty,100,9.99 7 | 6,Yoga Mat,Sports,30,29.99 8 | 7,Samsung 4K Smart TV,Electronics,8,799.99 9 | 8,Levi's Jeans,Clothing,15,49.99 10 | 9,Dyson Vacuum Cleaner,Home Appliances,3,399.99 11 | 10,Harry Potter Series,Books,20,15.99 12 | 11,MAC Lipstick,Beauty,75,16.99 13 | 12,Adidas Running Shoes,Sports,22,59.99 14 | 13,PlayStation 5,Electronics,12,499.99 15 | 14,Hooded Sweatshirt,Clothing,10,34.99 16 | 15,Coffee Maker,Home Appliances,7,89.99 17 | 16,To Kill a Mockingbird,Books,15,9.99 18 | 17,Skincare Set,Beauty,50,49.99 19 | 18,Yoga Ball,Sports,18,19.99 20 | 19,Sony Noise-Canceling Headphones,Electronics,6,299.99 21 | 20,Puma T-shirt,Clothing,40,19.99 22 | -------------------------------------------------------------------------------- /data/products.parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /data/products.parquet/.part-00000-1677f901-2d4d-4de5-b1e2-436dae4a802e-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/data/products.parquet/.part-00000-1677f901-2d4d-4de5-b1e2-436dae4a802e-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/products.parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/data/products.parquet/_SUCCESS -------------------------------------------------------------------------------- /data/products.parquet/part-00000-1677f901-2d4d-4de5-b1e2-436dae4a802e-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/data/products.parquet/part-00000-1677f901-2d4d-4de5-b1e2-436dae4a802e-c000.snappy.parquet 
-------------------------------------------------------------------------------- /data/products_multiline.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": 1, 4 | "name": "iPhone 12", 5 | "category": "Electronics", 6 | "quantity": 10, 7 | "price": 899.99 8 | }, 9 | { 10 | "id": 2, 11 | "name": "Nike Air Max 90", 12 | "category": "Clothing", 13 | "quantity": 25, 14 | "price": 119.99 15 | }, 16 | { 17 | "id": 3, 18 | "name": "KitchenAid Stand Mixer", 19 | "category": "Home Appliances", 20 | "quantity": 5, 21 | "price": 299.99 22 | }, 23 | { 24 | "id": 4, 25 | "name": "The Great Gatsby", 26 | "category": "Books", 27 | "quantity": 50, 28 | "price": 12.99 29 | }, 30 | { 31 | "id": 5, 32 | "name": "L'Oreal Paris Mascara", 33 | "category": "Beauty", 34 | "quantity": 100, 35 | "price": 9.99 36 | }, 37 | { 38 | "id": 6, 39 | "name": "Yoga Mat", 40 | "category": "Sports", 41 | "quantity": 30, 42 | "price": 29.99 43 | }, 44 | { 45 | "id": 7, 46 | "name": "Samsung 4K Smart TV", 47 | "category": "Electronics", 48 | "quantity": 8, 49 | "price": 799.99 50 | }, 51 | { 52 | "id": 8, 53 | "name": "Levi's Jeans", 54 | "category": "Clothing", 55 | "quantity": 15, 56 | "price": 49.99 57 | }, 58 | { 59 | "id": 9, 60 | "name": "Dyson Vacuum Cleaner", 61 | "category": "Home Appliances", 62 | "quantity": 3, 63 | "price": 399.99 64 | }, 65 | { 66 | "id": 10, 67 | "name": "Harry Potter Series", 68 | "category": "Books", 69 | "quantity": 20, 70 | "price": 15.99 71 | }, 72 | { 73 | "id": 11, 74 | "name": "MAC Lipstick", 75 | "category": "Beauty", 76 | "quantity": 75, 77 | "price": 16.99 78 | }, 79 | { 80 | "id": 12, 81 | "name": "Adidas Running Shoes", 82 | "category": "Sports", 83 | "quantity": 22, 84 | "price": 59.99 85 | }, 86 | { 87 | "id": 13, 88 | "name": "PlayStation 5", 89 | "category": "Electronics", 90 | "quantity": 12, 91 | "price": 499.99 92 | }, 93 | { 94 | "id": 14, 95 | "name": "Hooded Sweatshirt", 96 | "category": "Clothing", 97 | "quantity": 10, 98 | "price": 34.99 99 | }, 100 | { 101 | "id": 15, 102 | "name": "Coffee Maker", 103 | "category": "Home Appliances", 104 | "quantity": 7, 105 | "price": 89.99 106 | }, 107 | { 108 | "id": 16, 109 | "name": "To Kill a Mockingbird", 110 | "category": "Books", 111 | "quantity": 15, 112 | "price": 9.99 113 | }, 114 | { 115 | "id": 17, 116 | "name": "Skincare Set", 117 | "category": "Beauty", 118 | "quantity": 50, 119 | "price": 49.99 120 | }, 121 | { 122 | "id": 18, 123 | "name": "Yoga Ball", 124 | "category": "Sports", 125 | "quantity": 18, 126 | "price": 19.99 127 | }, 128 | { 129 | "id": 19, 130 | "name": "Sony Noise-Canceling Headphones", 131 | "category": "Electronics", 132 | "quantity": 6, 133 | "price": 299.99 134 | }, 135 | { 136 | "id": 20, 137 | "name": "Puma T-shirt", 138 | "category": "Clothing", 139 | "quantity": 40, 140 | "price": 19.99 141 | } 142 | ] 143 | -------------------------------------------------------------------------------- /data/products_singleline.json: -------------------------------------------------------------------------------- 1 | {"id":1,"name":"iPhone 12","category":"Electronics","quantity":10,"price":899.99} 2 | {"id":2,"name":"Nike Air Max 90","category":"Clothing","quantity":25,"price":119.99} 3 | {"id":3,"name":"KitchenAid Stand Mixer","category":"Home Appliances","quantity":5,"price":299.99} 4 | {"id":4,"name":"The Great Gatsby","category":"Books","quantity":50,"price":12.99} 5 | {"id":5,"name":"L'Oreal Paris 
Mascara","category":"Beauty","quantity":100,"price":9.99} 6 | {"id":6,"name":"Yoga Mat","category":"Sports","quantity":30,"price":29.99} 7 | {"id":7,"name":"Samsung 4K Smart TV","category":"Electronics","quantity":8,"price":799.99} 8 | {"id":8,"name":"Levi's Jeans","category":"Clothing","quantity":15,"price":49.99} 9 | {"id":9,"name":"Dyson Vacuum Cleaner","category":"Home Appliances","quantity":3,"price":399.99} 10 | {"id":10,"name":"Harry Potter Series","category":"Books","quantity":20,"price":15.99} 11 | {"id":11,"name":"MAC Lipstick","category":"Beauty","quantity":75,"price":16.99} 12 | {"id":12,"name":"Adidas Running Shoes","category":"Sports","quantity":22,"price":59.99} 13 | {"id":13,"name":"PlayStation 5","category":"Electronics","quantity":12,"price":499.99} 14 | {"id":14,"name":"Hooded Sweatshirt","category":"Clothing","quantity":10,"price":34.99} 15 | {"id":15,"name":"Coffee Maker","category":"Home Appliances","quantity":7,"price":89.99} 16 | {"id":16,"name":"To Kill a Mockingbird","category":"Books","quantity":15,"price":9.99} 17 | {"id":17,"name":"Skincare Set","category":"Beauty","quantity":50,"price":49.99} 18 | {"id":18,"name":"Yoga Ball","category":"Sports","quantity":18,"price":19.99} 19 | {"id":19,"name":"Sony Noise-Canceling Headphones","category":"Electronics","quantity":6,"price":299.99} 20 | {"id":20,"name":"Puma T-shirt","category":"Clothing","quantity":40,"price":19.99} -------------------------------------------------------------------------------- /data/stocks.txt: -------------------------------------------------------------------------------- 1 | id,name,category,quantity,price 2 | 1,iPhone,Electronics,10,899.99 3 | 2,Macbook,Electronics,5,1299.99 4 | 3,iPad,Electronics,15,499.99 5 | 4,Samsung TV,Electronics,8,799.99 6 | 5,LG TV,Electronics,10,699.99 7 | 6,Nike Shoes,Clothing,30,99.99 8 | 7,Adidas Shoes,Clothing,25,89.99 9 | 8,Sony Headphones,Electronics,12,149.99 10 | 9,Beats Headphones,Electronics,20,199.99 11 | 10,Dining Table,Furniture,10,249.99 12 | 11,Study Desk,Furniture,8,149.99 13 | 12,Apples,Food,100,0.5 14 | 13,Bananas,Food,150,0.25 15 | 14,Oranges,Food,120,0.75 16 | 15,Chicken Breast,Food,50,3.99 17 | 16,Salmon Fillet,Food,30,5.99 18 | 17,Leather Jacket,Clothing,15,199.99 19 | 18,Winter Coat,Clothing,10,149.99 20 | 19,Yoga Mat,Sports,20,19.99 21 | 20,Dumbbell Set,Sports,15,49.99 22 | 21,Coffee Table,Furniture,5,129.99 23 | 22,Office Chair,Furniture,8,79.99 24 | 23,Bookshelf,Furniture,10,99.99 25 | 24,Laptop Bag,Accessories,25,29.99 26 | 25,Backpack,Accessories,30,24.99 27 | 26,Camera,Electronics,10,599.99 28 | 27,Printer,Electronics,8,129.99 29 | 28,Jeans,Clothing,30,59.99 30 | 29,T-shirt,Clothing,50,14.99 31 | 30,Sneakers,Clothing,40,79.99 32 | -------------------------------------------------------------------------------- /output.txt/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00000.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00001.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00002.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/.part-00002.crc -------------------------------------------------------------------------------- /output.txt/.part-00003.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00004.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00005.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/.part-00005.crc -------------------------------------------------------------------------------- /output.txt/.part-00006.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00007.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00008.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/.part-00008.crc -------------------------------------------------------------------------------- /output.txt/.part-00009.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00010.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00011.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/.part-00011.crc -------------------------------------------------------------------------------- /output.txt/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/_SUCCESS -------------------------------------------------------------------------------- /output.txt/part-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00000 -------------------------------------------------------------------------------- /output.txt/part-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00001 -------------------------------------------------------------------------------- /output.txt/part-00002: -------------------------------------------------------------------------------- 1 | ('Alice', 25) 2 | -------------------------------------------------------------------------------- /output.txt/part-00003: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00003 -------------------------------------------------------------------------------- /output.txt/part-00004: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00004 -------------------------------------------------------------------------------- /output.txt/part-00005: -------------------------------------------------------------------------------- 1 | ('Bob', 30) 2 | -------------------------------------------------------------------------------- /output.txt/part-00006: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00006 -------------------------------------------------------------------------------- /output.txt/part-00007: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00007 -------------------------------------------------------------------------------- /output.txt/part-00008: -------------------------------------------------------------------------------- 1 | ('Charlie', 35) 2 | -------------------------------------------------------------------------------- /output.txt/part-00009: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00009 -------------------------------------------------------------------------------- /output.txt/part-00010: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00010 -------------------------------------------------------------------------------- /output.txt/part-00011: -------------------------------------------------------------------------------- 1 | ('Alice', 40) 2 | --------------------------------------------------------------------------------