├── .gitignore
├── 01-PySpark-Get-Started.ipynb
├── 02-Create-SparkContext.ipynb
├── 03-Create-SparkSession.ipynb
├── 04-RDD-Operations.ipynb
├── 05-DataFrame-Intro.ipynb
├── 06-DataFrame-from-various-data-source.ipynb
├── 07-DataFrame-Operations.ipynb
├── 08-Spark-SQL.ipynb
├── LICENSE
├── README.md
├── data
│   ├── data.txt
│   ├── persons.csv
│   ├── products.csv
│   ├── products.parquet
│   │   ├── ._SUCCESS.crc
│   │   ├── .part-00000-1677f901-2d4d-4de5-b1e2-436dae4a802e-c000.snappy.parquet.crc
│   │   ├── _SUCCESS
│   │   └── part-00000-1677f901-2d4d-4de5-b1e2-436dae4a802e-c000.snappy.parquet
│   ├── products_multiline.json
│   ├── products_singleline.json
│   └── stocks.txt
└── output.txt
    ├── ._SUCCESS.crc
    ├── .part-00000.crc
    ├── .part-00001.crc
    ├── .part-00002.crc
    ├── .part-00003.crc
    ├── .part-00004.crc
    ├── .part-00005.crc
    ├── .part-00006.crc
    ├── .part-00007.crc
    ├── .part-00008.crc
    ├── .part-00009.crc
    ├── .part-00010.crc
    ├── .part-00011.crc
    ├── _SUCCESS
    ├── part-00000
    ├── part-00001
    ├── part-00002
    ├── part-00003
    ├── part-00004
    ├── part-00005
    ├── part-00006
    ├── part-00007
    ├── part-00008
    ├── part-00009
    ├── part-00010
    └── part-00011

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ -------------------------------------------------------------------------------- /01-PySpark-Get-Started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "fe9fd1c0-db30-47b1-bbe2-0b1cbd97a9e2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "id": "4a915758-1498-4831-820b-a44fd888e87b", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Import PySpark\n", 26 | "from pyspark.sql import SparkSession" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "id": "bb53020b-1e79-4893-a13a-4968fa120fa3", 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stderr", 37 | "output_type": "stream", 38 | "text": [ 39 | "Setting default log level to \"WARN\".\n", 40 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 41 | "23/06/25 21:26:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "# Create a SparkSession\n", 47 | "spark = SparkSession.builder \\\n", 48 | " .appName(\"PySpark-Get-Started\") \\\n", 49 | " .getOrCreate()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "id": "dde43975-a1f5-4ad1-88a3-76eb84215f2b", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stderr", 60 | "output_type": "stream", 61 | "text": [ 62 | " \r" 63 | ] 64 | }, 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "+-------+---+\n", 70 | "| Name|Age|\n", 71 | "+-------+---+\n", 72 | "| Alice| 25|\n", 73 | "| Bob| 30|\n", 74 | "|Charlie| 35|\n", 75 | "+-------+---+\n", 76 | "\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "# Test the setup\n", 82 | "data = [(\"Alice\", 25), (\"Bob\", 30), (\"Charlie\", 35)]\n", 83 | "df = spark.createDataFrame(data, [\"Name\", \"Age\"])\n", 84 | "df.show()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "096c5a89-058c-488a-9d9e-146fdb6a44dd", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": "Python 3 (ipykernel)", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.10.8" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 5 117 | } 118 | -------------------------------------------------------------------------------- /02-Create-SparkContext.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "9b42a8f0-9a63-461c-95dc-27847f5b0a40", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "8f8e53e1-7a47-4dc4-9450-6be3c466f0b3", 21 | "metadata": {}, 22 | "source": [ 23 | "## Create SparkContext in Apache Spark version 1.x" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 4, 29 | "id": "287412ee-4384-4621-b7c0-ff50013f9785", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "from pyspark import SparkContext\n", 34 | "\n", 35 | "# Create a SparkContext object\n", 36 | "sc = SparkContext(appName=\"MySparkApplication\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 5, 42 | "id": "78f5b680-6bc0-42ef-99f6-83fd0da5e729", 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/html": [ 48 | "\n", 49 | "
\n", 50 | "

SparkContext

\n", 51 | "\n", 52 | "

Spark UI

\n", 53 | "\n", 54 | "
\n", 55 | "
Version
\n", 56 | "
v3.4.1
\n", 57 | "
Master
\n", 58 | "
local[*]
\n", 59 | "
AppName
\n", 60 | "
MySparkApplication
\n", 61 | "
\n", 62 | "
\n", 63 | " " 64 | ], 65 | "text/plain": [ 66 | "" 67 | ] 68 | }, 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "sc" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "id": "c57fa548-9522-4e51-8c2d-4c6937eb3b3e", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# Shut down the current active SparkContext\n", 86 | "sc.stop()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "6cbd0d0d-2d17-44b9-ae3b-76b413b760c3", 92 | "metadata": {}, 93 | "source": [ 94 | "## Create SparkContext in Apache Spark version 2.x and later" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "id": "0508898d-8b48-4b16-bae9-87cead0489b1", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "from pyspark.sql import SparkSession\n", 105 | "\n", 106 | "# Create a SparkSession\n", 107 | "spark = SparkSession.builder \\\n", 108 | " .appName(\"MySparkApplication\") \\\n", 109 | " .getOrCreate()\n", 110 | "\n", 111 | "# Get the SparkContext from the SparkSession\n", 112 | "sc = spark.sparkContext\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 8, 118 | "id": "82c7ca1e-dd73-4306-ab7f-01bc371eb94e", 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/html": [ 124 | "\n", 125 | "
\n", 126 | "

SparkContext

\n", 127 | "\n", 128 | "

Spark UI

\n", 129 | "\n", 130 | "
\n", 131 | "
Version
\n", 132 | "
v3.4.1
\n", 133 | "
Master
\n", 134 | "
local[*]
\n", 135 | "
AppName
\n", 136 | "
MySparkApplication
\n", 137 | "
\n", 138 | "
\n", 139 | " " 140 | ], 141 | "text/plain": [ 142 | "" 143 | ] 144 | }, 145 | "execution_count": 8, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "sc" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 9, 157 | "id": "9a3361a7-cb9d-49a3-b80b-820e490711e2", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "# Shut down the current active SparkContext\n", 162 | "sc.stop() #or spark.stop()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "10336d77-bc36-4101-8055-dd7a2496d4da", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 3 (ipykernel)", 177 | "language": "python", 178 | "name": "python3" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 3 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython3", 190 | "version": "3.10.8" 191 | } 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 5 195 | } 196 | -------------------------------------------------------------------------------- /03-Create-SparkSession.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "5f80ab89-af0e-4139-8f6a-0a382310f34c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "ee8d77ba-685b-4c20-8c09-ee2b2fc44abd", 21 | "metadata": {}, 22 | "source": [ 23 | "## Create SparkSession in Apache Spark" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "id": "1df25dc4-3ffc-490d-9c03-d022cb2e7235", 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stderr", 34 | "output_type": "stream", 35 | "text": [ 36 | "Setting default log level to \"WARN\".\n", 37 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 38 | "23/07/16 15:31:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "from pyspark.sql import SparkSession\n", 44 | "\n", 45 | "# Create a SparkSession\n", 46 | "spark = SparkSession.builder \\\n", 47 | " .appName(\"MySparkApplication\") \\\n", 48 | " .config(\"spark.executor.memory\", \"2g\") \\\n", 49 | " .config(\"spark.sql.shuffle.partitions\", \"4\") \\\n", 50 | " .getOrCreate()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "id": "db19ae45-2c01-408b-b807-f883a2d796c3", 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/html": [ 62 | "\n", 63 | "
\n", 64 | "

SparkSession - in-memory

\n", 65 | " \n", 66 | "
\n", 67 | "

SparkContext

\n", 68 | "\n", 69 | "

Spark UI

\n", 70 | "\n", 71 | "
\n", 72 | "
Version
\n", 73 | "
v3.4.1
\n", 74 | "
Master
\n", 75 | "
local[*]
\n", 76 | "
AppName
\n", 77 | "
MySparkApplication
\n", 78 | "
\n", 79 | "
\n", 80 | " \n", 81 | "
\n", 82 | " " 83 | ], 84 | "text/plain": [ 85 | "" 86 | ] 87 | }, 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "# Perform operations using the SparkSession\n", 95 | "spark" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 6, 101 | "id": "d0c09484-8c96-4c73-9dde-cd8587b0c80f", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# Shut down the current active SparkSession\n", 106 | "spark.stop()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "f11e547f-54bd-43ee-b051-6722802bd567", 112 | "metadata": {}, 113 | "source": [] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3 (ipykernel)", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.10.8" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 5 137 | } 138 | -------------------------------------------------------------------------------- /04-RDD-Operations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "5c0ddfc8-a1d1-4bb4-9cd5-180f11e4f3af", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 3, 21 | "id": "ff054b1d-f6f7-4c8f-9b50-56d3b5ed1ac9", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from pyspark.sql import SparkSession" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "id": "f4658324-c133-4921-b53f-dd6141558f98", 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stderr", 36 | "output_type": "stream", 37 | "text": [ 38 | "Setting default log level to \"WARN\".\n", 39 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 40 | "23/07/16 18:20:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "# Create a SparkSession\n", 46 | "spark = SparkSession.builder.appName(\"RDD-Demo\").getOrCreate()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "97a5a364-a829-4b89-8cb0-6872d0bdafb3", 52 | "metadata": {}, 53 | "source": [ 54 | "### How to create RDDs" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 5, 60 | "id": "0e31a7ea-d6fd-49f4-88cd-40c2bda5838a", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "numbers = [1, 2, 3, 4, 5]\n", 65 | "rdd = spark.sparkContext.parallelize(numbers)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "id": "09f84b37-e1a3-4d90-929e-a60c9063d669", 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "[1, 2, 3, 4, 5]" 78 | ] 79 | }, 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "# Collect action: Retrieve all elements of the RDD\n", 87 | "rdd.collect()" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 7, 93 | "id": "fd2ee436-1186-488f-8294-b46ce9c67cac", 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# Create an RDD from a list of tuples\n", 98 | "data = [(\"Alice\", 25), (\"Bob\", 30), (\"Charlie\", 35), (\"Alice\", 40)]\n", 99 | "rdd = spark.sparkContext.parallelize(data)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 8, 105 | "id": "adad76e0-2c10-4a41-b947-89547fe94d35", 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "All elements of the rdd: [('Alice', 25), ('Bob', 30), ('Charlie', 35), ('Alice', 40)]\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "# Collect action: Retrieve all elements of the RDD\n", 118 | "print(\"All elements of the rdd: \", rdd.collect())" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "44f98372-bc7a-43d7-b1c1-6d7d102bee29", 124 | "metadata": {}, 125 | "source": [ 126 | "### RDDs Operation: Actions " 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 9, 132 | "id": "60353b82-fd00-4e94-b11c-d31e8e005122", 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "name": "stderr", 137 | "output_type": "stream", 138 | "text": [ 139 | "[Stage 2:====================================================> (11 + 1) / 12]\r" 140 | ] 141 | }, 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "The total number of elements in rdd: 4\n" 147 | ] 148 | }, 149 | { 150 | "name": "stderr", 151 | "output_type": "stream", 152 | "text": [ 153 | " \r" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "# Count action: Count the number of elements in the RDD\n", 159 | "count = rdd.count()\n", 160 | "print(\"The total number of elements in rdd: \", count)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 10, 166 | "id": "456dbe75-0182-47ba-aaf9-34ad9ab06f55", 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "The first element of the rdd: ('Alice', 25)\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "# First action: Retrieve the first element of the RDD\n", 179 | "first_element = rdd.first()\n", 180 | "print(\"The first element of the rdd: \", first_element)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 11, 186 | 
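
A quick aside on why `collect()` and `count()` sit under "Actions": RDD transformations are lazy, and only an action forces Spark to run a job. A minimal sketch (not from the notebook, reusing the `rdd` of tuples defined above):

```python
# Transformations only build a lineage graph; no Spark job is launched here.
doubled_rdd = rdd.map(lambda x: (x[0], x[1] * 2))

# Actions such as collect() or count() trigger the actual computation.
print(doubled_rdd.collect())  # e.g. [('Alice', 50), ('Bob', 60), ...]
```
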
"id": "979ae249-efb6-4b8b-b8e5-0cddea496ff9", 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "The first two elements of the rdd: [('Alice', 25), ('Bob', 30)]\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "# Take action: Retrieve the n elements of the RDD\n", 199 | "taken_elements = rdd.take(2)\n", 200 | "print(\"The first two elements of the rdd: \", taken_elements)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 12, 206 | "id": "38bd8336-338e-4876-96e0-5e9aa19b5b36", 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stderr", 211 | "output_type": "stream", 212 | "text": [ 213 | "('Charlie', 35)\n", 214 | "('Alice', 25)\n", 215 | "('Bob', 30)\n", 216 | "('Alice', 40)\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "# Foreach action: Print each element of the RDD\n", 222 | "rdd.foreach(lambda x: print(x))" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "id": "c30f740c-f5fd-48d1-9e8b-a0f78caab408", 228 | "metadata": {}, 229 | "source": [ 230 | "### RDDs Operation: Transformations " 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 13, 236 | "id": "b3f7d23b-f246-4797-97ff-a56766657d53", 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "# Map transformation: Convert name to uppercase\n", 241 | "mapped_rdd = rdd.map(lambda x: (x[0].upper(), x[1]))" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 14, 247 | "id": "78d8ef21-d4d1-4361-b448-6c23e251e8f0", 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "rdd with uppercease name: [('ALICE', 25), ('BOB', 30), ('CHARLIE', 35), ('ALICE', 40)]\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "result = mapped_rdd.collect()\n", 260 | "print(\"rdd with uppercease name: \", result)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 15, 266 | "id": "2211dbf5-64be-4966-bcb4-e11fdbc9363f", 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "[('Charlie', 35), ('Alice', 40)]" 273 | ] 274 | }, 275 | "execution_count": 15, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "# Filter transformation: Filter records where age is greater than 30\n", 282 | "filtered_rdd = rdd.filter(lambda x: x[1] > 30)\n", 283 | "filtered_rdd.collect()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 16, 289 | "id": "325614c2-ede1-45f4-9818-6e89cb72e044", 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "[('Alice', 65), ('Bob', 30), ('Charlie', 35)]" 296 | ] 297 | }, 298 | "execution_count": 16, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "# ReduceByKey transformation: Calculate the total age for each name\n", 305 | "reduced_rdd = rdd.reduceByKey(lambda x, y: x + y)\n", 306 | "reduced_rdd.collect()" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 17, 312 | "id": "1a5e96bb-f8ce-4239-949f-184648b60ae7", 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "[('Alice', 40), ('Charlie', 35), ('Bob', 30), ('Alice', 25)]" 319 | ] 320 | }, 321 | "execution_count": 17, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "# SortBy 
transformation: Sort the RDD by age in descending order\n", 328 | "sorted_rdd = rdd.sortBy(lambda x: x[1], ascending=False)\n", 329 | "sorted_rdd.collect()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "id": "d51d11e9-260f-421d-94d9-350e5c6146bb", 335 | "metadata": {}, 336 | "source": [ 337 | "### Save RDDs to text file and read RDDs from text file" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 18, 343 | "id": "c8389f05-063a-4a52-beb2-efc4e50daa6a", 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "# Save action: Save the RDD to a text file\n", 348 | "rdd.saveAsTextFile(\"output.txt\")" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 19, 354 | "id": "2a557485-8d1b-431c-aba5-dcbe21f3970d", 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "[\"('Alice', 40)\", \"('Bob', 30)\", \"('Alice', 25)\", \"('Charlie', 35)\"]" 361 | ] 362 | }, 363 | "execution_count": 19, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "# create rdd from text file\n", 370 | "rdd_text = spark.sparkContext.textFile(\"output.txt\")\n", 371 | "rdd_text.collect()" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "id": "c799ec33-2a47-4d8e-b239-92f25c8e7a37", 377 | "metadata": {}, 378 | "source": [ 379 | "### Shut down Spark Session" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 20, 385 | "id": "b501edd4-5d9a-4ffe-8cbb-abcf077868c4", 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "spark.stop()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "id": "695cb724-4691-441e-97f1-7320c109a62f", 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [] 399 | } 400 | ], 401 | "metadata": { 402 | "kernelspec": { 403 | "display_name": "Python 3 (ipykernel)", 404 | "language": "python", 405 | "name": "python3" 406 | }, 407 | "language_info": { 408 | "codemirror_mode": { 409 | "name": "ipython", 410 | "version": 3 411 | }, 412 | "file_extension": ".py", 413 | "mimetype": "text/x-python", 414 | "name": "python", 415 | "nbconvert_exporter": "python", 416 | "pygments_lexer": "ipython3", 417 | "version": "3.10.8" 418 | } 419 | }, 420 | "nbformat": 4, 421 | "nbformat_minor": 5 422 | } 423 | -------------------------------------------------------------------------------- /05-DataFrame-Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 37, 6 | "id": "68a6d7a7-693c-4fae-804f-3d92a1a30e35", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 38, 21 | "id": "b9cb5875-598a-4d74-be55-3795530d9206", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from pyspark.sql import SparkSession\n", 26 | "from pyspark.sql.functions import desc" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 39, 32 | "id": "afd73f68-a07d-47df-952e-38f0e681320c", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Create a SparkSession\n", 37 | 
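
Worth noting about the round trip above: `saveAsTextFile("output.txt")` creates a directory with one part-file per partition (hence the `output.txt/part-00000` through `part-00011` entries in the tree at the top), and `textFile` reads those parts back as plain strings. A sketch of recovering the original tuples (the `ast.literal_eval` step is an assumption, not in the notebook):

```python
import ast

# Each line comes back as a string like "('Alice', 40)"; parse it back
# into a Python tuple on the executors.
parsed_rdd = rdd_text.map(ast.literal_eval)
print(parsed_rdd.collect())  # [('Alice', 40), ('Bob', 30), ...]
```
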
"spark = SparkSession.builder.appName(\"DataFrame-Demo\").getOrCreate()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "3b294de5-c4a9-4f03-8dbc-ee29caaf0b99", 43 | "metadata": {}, 44 | "source": [ 45 | "### Using RDDs" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 40, 51 | "id": "97c0daf5-9971-4e48-ae98-fc090a4edf14", 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stderr", 56 | "output_type": "stream", 57 | "text": [ 58 | " \r" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "rdd = spark.sparkContext.textFile(\"./data/data.txt\")\n", 64 | "result_rdd = rdd.flatMap(lambda line: line.split(\" \")) \\\n", 65 | " .map(lambda word: (word, 1)) \\\n", 66 | " .reduceByKey(lambda a, b: a + b) \\\n", 67 | " .sortBy(lambda x: x[1], ascending=False)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 41, 73 | "id": "96f3a503-e709-4248-91cf-5a55a1ee3549", 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "[('the', 12),\n", 80 | " ('of', 7),\n", 81 | " ('a', 7),\n", 82 | " ('in', 5),\n", 83 | " ('distributed', 5),\n", 84 | " ('Spark', 4),\n", 85 | " ('is', 3),\n", 86 | " ('as', 3),\n", 87 | " ('API', 3),\n", 88 | " ('on', 3)]" 89 | ] 90 | }, 91 | "execution_count": 41, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "result_rdd.take(10)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "a02b9bba-8a25-4e8e-aed9-edb6700b391b", 103 | "metadata": {}, 104 | "source": [ 105 | "### Using DataFrames" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 42, 111 | "id": "248e9131-53fb-4c39-b301-4d415dcc169d", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "df = spark.read.text(\"./data/data.txt\")\n", 116 | "\n", 117 | "result_df = df.selectExpr(\"explode(split(value, ' ')) as word\") \\\n", 118 | " .groupBy(\"word\").count().orderBy(desc(\"count\"))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 43, 124 | "id": "18231aa3-f162-4261-9a65-275f088f1675", 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "[Row(word='the', count=12),\n", 131 | " Row(word='of', count=7),\n", 132 | " Row(word='a', count=7),\n", 133 | " Row(word='in', count=5),\n", 134 | " Row(word='distributed', count=5),\n", 135 | " Row(word='Spark', count=4),\n", 136 | " Row(word='API', count=3),\n", 137 | " Row(word='RDD', count=3),\n", 138 | " Row(word='is', count=3),\n", 139 | " Row(word='on', count=3)]" 140 | ] 141 | }, 142 | "execution_count": 43, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "result_df.take(10)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 44, 154 | "id": "a4ece8d9-ed91-4b4e-802f-a6e260dc46b4", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "spark.stop()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "e509b04d-32b3-4b17-ab4e-6ed025b09762", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3 (ipykernel)", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": 
"python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.10.8" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 5 191 | } 192 | -------------------------------------------------------------------------------- /06-DataFrame-from-various-data-source.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "3d303751-5833-413a-8698-7d9cc74001cc", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "id": "fbcf8801-018a-4f78-b0ea-cb83e4660e96", 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stderr", 26 | "output_type": "stream", 27 | "text": [ 28 | "Setting default log level to \"WARN\".\n", 29 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 30 | "23/08/13 20:43:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "from pyspark.sql import SparkSession\n", 36 | "\n", 37 | "# Create a SparkSession\n", 38 | "spark = SparkSession.builder.appName(\"Create-DataFrame\").getOrCreate()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "id": "2d12d97d-4961-4b5e-b571-8df65be48d94", 44 | "metadata": {}, 45 | "source": [ 46 | "### Read CSV file into DataFrame" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "id": "b8446479-fc5d-4422-9a1e-129ece15f941", 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "id,name,category,quantity,price\n", 60 | "1,iPhone 12,Electronics,10,899.99\n", 61 | "2,Nike Air Max 90,Clothing,25,119.99\n", 62 | "3,KitchenAid Stand Mixer,Home Appliances,5,299.99\n", 63 | "4,The Great Gatsby,Books,50,12.99\n", 64 | "5,L'Oreal Paris Mascara,Beauty,100,9.99\n", 65 | "6,Yoga Mat,Sports,30,29.99\n", 66 | "7,Samsung 4K Smart TV,Electronics,8,799.99\n", 67 | "8,Levi's Jeans,Clothing,15,49.99\n", 68 | "9,Dyson Vacuum Cleaner,Home Appliances,3,399.99\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "%%bash \n", 74 | "head -10 ./data/products.csv" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "id": "65f9db29-2efb-4874-a512-9add235d292d", 80 | "metadata": {}, 81 | "source": [ 82 | "#### Read CSV with header" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "id": "7580ed5e-8cac-4af5-8955-1eea0ffe4c0a", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# Read CSV file into DataFrame\n", 93 | "csv_file_path = \"./data/products.csv\"\n", 94 | "df = spark.read.csv(csv_file_path, header=True)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "id": "e595ec04-655d-4caa-91ec-d8e298c3183b", 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "root\n", 108 | " |-- id: string (nullable = true)\n", 109 | " |-- name: string (nullable = true)\n", 110 | " |-- category: string (nullable = true)\n", 111 | " |-- quantity: string (nullable = true)\n", 112 | 
" |-- price: string (nullable = true)\n", 113 | "\n", 114 | "+---+--------------------+---------------+--------+------+\n", 115 | "| id| name| category|quantity| price|\n", 116 | "+---+--------------------+---------------+--------+------+\n", 117 | "| 1| iPhone 12| Electronics| 10|899.99|\n", 118 | "| 2| Nike Air Max 90| Clothing| 25|119.99|\n", 119 | "| 3|KitchenAid Stand ...|Home Appliances| 5|299.99|\n", 120 | "| 4| The Great Gatsby| Books| 50| 12.99|\n", 121 | "| 5|L'Oreal Paris Mas...| Beauty| 100| 9.99|\n", 122 | "+---+--------------------+---------------+--------+------+\n", 123 | "only showing top 5 rows\n", 124 | "\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "# Display schema of DataFrame\n", 130 | "df.printSchema()\n", 131 | "\n", 132 | "# Display content of DataFrame\n", 133 | "df.show(5)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "1f43038c-30e3-45fc-b927-f1c62c1cdf84", 139 | "metadata": {}, 140 | "source": [ 141 | "#### Read CSV with an explicit schema definition" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "id": "6a105df6-1196-4a1c-95df-a54fd699986d", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "# import necessary types\n", 152 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 9, 158 | "id": "cecfacfc-5488-40a8-b26e-fefc07c61d88", 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "# Define the schema\n", 163 | "schema = StructType([\n", 164 | " StructField(name=\"id\", dataType=IntegerType(), nullable=True),\n", 165 | " StructField(name=\"name\", dataType=StringType(), nullable=True),\n", 166 | " StructField(name=\"category\", dataType=StringType(), nullable=True),\n", 167 | " StructField(name=\"quantity\", dataType=IntegerType(), nullable=True),\n", 168 | " StructField(name=\"price\", dataType=DoubleType(), nullable=True)\n", 169 | "])" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 10, 175 | "id": "268f68db-1f89-4b94-979b-da0aa16b990f", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "# Read CSV file into DataFrame with schema definition\n", 180 | "csv_file_path = \"./data/products.csv\"\n", 181 | "df = spark.read.csv(csv_file_path, header=True, schema=schema)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 11, 187 | "id": "d1ffe583-5460-484c-b136-ee4715f4b0d4", 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "root\n", 195 | " |-- id: integer (nullable = true)\n", 196 | " |-- name: string (nullable = true)\n", 197 | " |-- category: string (nullable = true)\n", 198 | " |-- quantity: integer (nullable = true)\n", 199 | " |-- price: double (nullable = true)\n", 200 | "\n", 201 | "+---+--------------------+---------------+--------+------+\n", 202 | "| id| name| category|quantity| price|\n", 203 | "+---+--------------------+---------------+--------+------+\n", 204 | "| 1| iPhone 12| Electronics| 10|899.99|\n", 205 | "| 2| Nike Air Max 90| Clothing| 25|119.99|\n", 206 | "| 3|KitchenAid Stand ...|Home Appliances| 5|299.99|\n", 207 | "| 4| The Great Gatsby| Books| 50| 12.99|\n", 208 | "| 5|L'Oreal Paris Mas...| Beauty| 100| 9.99|\n", 209 | "+---+--------------------+---------------+--------+------+\n", 210 | "only showing top 5 rows\n", 211 | "\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | 
"# Display schema of DataFrame\n", 217 | "df.printSchema()\n", 218 | "\n", 219 | "# Display content of DataFrame\n", 220 | "df.show(5)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "id": "1f099209-1c77-4c2b-8142-944ad92d4723", 226 | "metadata": {}, 227 | "source": [ 228 | "#### Read CSV with inferSchema" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 12, 234 | "id": "13f37a98-6810-43f5-8229-267d6528cea5", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# Read CSV file into DataFrame with inferSchema\n", 239 | "csv_file_path = \"./data/products.csv\"\n", 240 | "df = spark.read.csv(csv_file_path, header=True, inferSchema=True)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 13, 246 | "id": "c53d37bd-6bc6-4eaf-b0c0-879d3302df8d", 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "root\n", 254 | " |-- id: integer (nullable = true)\n", 255 | " |-- name: string (nullable = true)\n", 256 | " |-- category: string (nullable = true)\n", 257 | " |-- quantity: integer (nullable = true)\n", 258 | " |-- price: double (nullable = true)\n", 259 | "\n", 260 | "+---+--------------------+---------------+--------+------+\n", 261 | "| id| name| category|quantity| price|\n", 262 | "+---+--------------------+---------------+--------+------+\n", 263 | "| 1| iPhone 12| Electronics| 10|899.99|\n", 264 | "| 2| Nike Air Max 90| Clothing| 25|119.99|\n", 265 | "| 3|KitchenAid Stand ...|Home Appliances| 5|299.99|\n", 266 | "| 4| The Great Gatsby| Books| 50| 12.99|\n", 267 | "| 5|L'Oreal Paris Mas...| Beauty| 100| 9.99|\n", 268 | "+---+--------------------+---------------+--------+------+\n", 269 | "only showing top 5 rows\n", 270 | "\n" 271 | ] 272 | } 273 | ], 274 | "source": [ 275 | "# Display schema of DataFrame\n", 276 | "df.printSchema()\n", 277 | "\n", 278 | "# Display content of DataFrame\n", 279 | "df.show(5)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "id": "16cd7777-ced0-4f55-9825-f6a868636d47", 285 | "metadata": {}, 286 | "source": [ 287 | "### Read JSON file into DataFrame" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "id": "4d339da0-ea17-474b-82e8-54785bc2ecf3", 293 | "metadata": {}, 294 | "source": [ 295 | "#### Single Line JSON" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 14, 301 | "id": "d3fb6fdc-da18-4edd-a815-3152cfc2dcfd", 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "{\"id\":1,\"name\":\"iPhone 12\",\"category\":\"Electronics\",\"quantity\":10,\"price\":899.99}\n", 309 | "{\"id\":2,\"name\":\"Nike Air Max 90\",\"category\":\"Clothing\",\"quantity\":25,\"price\":119.99}\n", 310 | "{\"id\":3,\"name\":\"KitchenAid Stand Mixer\",\"category\":\"Home Appliances\",\"quantity\":5,\"price\":299.99}\n", 311 | "{\"id\":4,\"name\":\"The Great Gatsby\",\"category\":\"Books\",\"quantity\":50,\"price\":12.99}\n", 312 | "{\"id\":5,\"name\":\"L'Oreal Paris Mascara\",\"category\":\"Beauty\",\"quantity\":100,\"price\":9.99}\n", 313 | "{\"id\":6,\"name\":\"Yoga Mat\",\"category\":\"Sports\",\"quantity\":30,\"price\":29.99}\n", 314 | "{\"id\":7,\"name\":\"Samsung 4K Smart TV\",\"category\":\"Electronics\",\"quantity\":8,\"price\":799.99}\n", 315 | "{\"id\":8,\"name\":\"Levi's Jeans\",\"category\":\"Clothing\",\"quantity\":15,\"price\":49.99}\n", 316 | "{\"id\":9,\"name\":\"Dyson Vacuum 
Cleaner\",\"category\":\"Home Appliances\",\"quantity\":3,\"price\":399.99}\n", 317 | "{\"id\":10,\"name\":\"Harry Potter Series\",\"category\":\"Books\",\"quantity\":20,\"price\":15.99}\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "%%bash\n", 323 | "head -10 data/products_singleline.json" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 15, 329 | "id": "6cfe5f2b-7720-403b-a8bf-aec931bca199", 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "# Read single line JSON\n", 334 | "# Each row is a JSON record, records are separated by new line\n", 335 | "json_file_path = \"./data/products_singleline.json\"\n", 336 | "df = spark.read.json(json_file_path)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 16, 342 | "id": "0cc72e44-36ca-44cb-8d31-36f473e52f9a", 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "root\n", 350 | " |-- category: string (nullable = true)\n", 351 | " |-- id: long (nullable = true)\n", 352 | " |-- name: string (nullable = true)\n", 353 | " |-- price: double (nullable = true)\n", 354 | " |-- quantity: long (nullable = true)\n", 355 | "\n", 356 | "+---------------+---+--------------------+------+--------+\n", 357 | "| category| id| name| price|quantity|\n", 358 | "+---------------+---+--------------------+------+--------+\n", 359 | "| Electronics| 1| iPhone 12|899.99| 10|\n", 360 | "| Clothing| 2| Nike Air Max 90|119.99| 25|\n", 361 | "|Home Appliances| 3|KitchenAid Stand ...|299.99| 5|\n", 362 | "| Books| 4| The Great Gatsby| 12.99| 50|\n", 363 | "| Beauty| 5|L'Oreal Paris Mas...| 9.99| 100|\n", 364 | "+---------------+---+--------------------+------+--------+\n", 365 | "only showing top 5 rows\n", 366 | "\n" 367 | ] 368 | } 369 | ], 370 | "source": [ 371 | "# Display schema of DataFrame\n", 372 | "df.printSchema()\n", 373 | "\n", 374 | "# Display content of DataFrame\n", 375 | "df.show(5)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "id": "6f7db26b-be18-4602-9cd8-780fc82294f6", 381 | "metadata": {}, 382 | "source": [ 383 | "#### Multi-lines JSON" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 17, 389 | "id": "c76efef7-a884-464e-8132-dabfde493dfb", 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "name": "stdout", 394 | "output_type": "stream", 395 | "text": [ 396 | "[\n", 397 | " {\n", 398 | " \"id\": 1,\n", 399 | " \"name\": \"iPhone 12\",\n", 400 | " \"category\": \"Electronics\",\n", 401 | " \"quantity\": 10,\n", 402 | " \"price\": 899.99\n", 403 | " },\n", 404 | " {\n", 405 | " \"id\": 2,\n", 406 | " \"name\": \"Nike Air Max 90\",\n", 407 | " \"category\": \"Clothing\",\n", 408 | " \"quantity\": 25,\n", 409 | " \"price\": 119.99\n", 410 | " },\n", 411 | " {\n", 412 | " \"id\": 3,\n", 413 | " \"name\": \"KitchenAid Stand Mixer\",\n", 414 | " \"category\": \"Home Appliances\",\n", 415 | " \"quantity\": 5,\n" 416 | ] 417 | } 418 | ], 419 | "source": [ 420 | "%%bash\n", 421 | "head -20 data/products_multiline.json" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 18, 427 | "id": "89755543-83fd-4f85-8829-e34ec16e9dec", 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "# Read multi-line JSON\n", 432 | "# JSON is an array of record, records are separated by a comma.\n", 433 | "# each record is defined in multiple lines\n", 434 | "json_file_path = \"./data/products_multiline.json\"\n", 435 | "df = 
spark.read.json(json_file_path, multiLine=True)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 19, 441 | "id": "c0aa892a-6784-424f-b40a-7469035cd891", 442 | "metadata": {}, 443 | "outputs": [ 444 | { 445 | "name": "stdout", 446 | "output_type": "stream", 447 | "text": [ 448 | "root\n", 449 | " |-- category: string (nullable = true)\n", 450 | " |-- id: long (nullable = true)\n", 451 | " |-- name: string (nullable = true)\n", 452 | " |-- price: double (nullable = true)\n", 453 | " |-- quantity: long (nullable = true)\n", 454 | "\n", 455 | "+---------------+---+--------------------+------+--------+\n", 456 | "| category| id| name| price|quantity|\n", 457 | "+---------------+---+--------------------+------+--------+\n", 458 | "| Electronics| 1| iPhone 12|899.99| 10|\n", 459 | "| Clothing| 2| Nike Air Max 90|119.99| 25|\n", 460 | "|Home Appliances| 3|KitchenAid Stand ...|299.99| 5|\n", 461 | "| Books| 4| The Great Gatsby| 12.99| 50|\n", 462 | "| Beauty| 5|L'Oreal Paris Mas...| 9.99| 100|\n", 463 | "+---------------+---+--------------------+------+--------+\n", 464 | "only showing top 5 rows\n", 465 | "\n" 466 | ] 467 | } 468 | ], 469 | "source": [ 470 | "# Display schema of DataFrame\n", 471 | "df.printSchema()\n", 472 | "\n", 473 | "# Display content of DataFrame\n", 474 | "df.show(5)" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 20, 480 | "id": "5f538b5b-c116-4a3a-8675-dffa5f59d047", 481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "name": "stderr", 485 | "output_type": "stream", 486 | "text": [ 487 | " \r" 488 | ] 489 | } 490 | ], 491 | "source": [ 492 | "# write dataframe into parquet file\n", 493 | "parquet_file_path = \"./data/products.parquet\"\n", 494 | "df.write.parquet(parquet_file_path)" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "id": "c0e3b6a8-6273-407c-b523-3b6a9b795d73", 500 | "metadata": {}, 501 | "source": [ 502 | "### Read parquet file into DataFrame" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 21, 508 | "id": "98025d25-3cd5-4ee4-9218-60d9b206cee0", 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "df = spark.read.parquet(parquet_file_path)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 22, 518 | "id": "394fea18-891c-4422-b484-bfb8cadb4b51", 519 | "metadata": {}, 520 | "outputs": [ 521 | { 522 | "name": "stdout", 523 | "output_type": "stream", 524 | "text": [ 525 | "root\n", 526 | " |-- category: string (nullable = true)\n", 527 | " |-- id: long (nullable = true)\n", 528 | " |-- name: string (nullable = true)\n", 529 | " |-- price: double (nullable = true)\n", 530 | " |-- quantity: long (nullable = true)\n", 531 | "\n", 532 | "+---------------+---+--------------------+------+--------+\n", 533 | "| category| id| name| price|quantity|\n", 534 | "+---------------+---+--------------------+------+--------+\n", 535 | "| Electronics| 1| iPhone 12|899.99| 10|\n", 536 | "| Clothing| 2| Nike Air Max 90|119.99| 25|\n", 537 | "|Home Appliances| 3|KitchenAid Stand ...|299.99| 5|\n", 538 | "| Books| 4| The Great Gatsby| 12.99| 50|\n", 539 | "| Beauty| 5|L'Oreal Paris Mas...| 9.99| 100|\n", 540 | "+---------------+---+--------------------+------+--------+\n", 541 | "only showing top 5 rows\n", 542 | "\n" 543 | ] 544 | } 545 | ], 546 | "source": [ 547 | "# Display schema of DataFrame\n", 548 | "df.printSchema()\n", 549 | "\n", 550 | "# Display content of DataFrame\n", 551 | "df.show(5)" 552 | ] 553 | }, 554 | { 555 | 
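
One caveat on `df.write.parquet(...)` as used above: it raises an error if the target path already exists. A save mode controls this behavior (a sketch, not in the notebook):

```python
# "overwrite" replaces an existing ./data/products.parquet directory;
# other modes are "append", "ignore", and the default "error".
df.write.mode("overwrite").parquet("./data/products.parquet")
```
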
"cell_type": "code", 556 | "execution_count": 23, 557 | "id": "27689ca9-304b-40b0-970d-7459545b4983", 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "spark.stop()" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "id": "1771b724-c7a5-4f5a-97c1-0dac5f1b03c9", 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "id": "a4657b0b-5d1e-4987-863a-bbbc9bc564b0", 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [] 579 | } 580 | ], 581 | "metadata": { 582 | "kernelspec": { 583 | "display_name": "Python 3 (ipykernel)", 584 | "language": "python", 585 | "name": "python3" 586 | }, 587 | "language_info": { 588 | "codemirror_mode": { 589 | "name": "ipython", 590 | "version": 3 591 | }, 592 | "file_extension": ".py", 593 | "mimetype": "text/x-python", 594 | "name": "python", 595 | "nbconvert_exporter": "python", 596 | "pygments_lexer": "ipython3", 597 | "version": "3.10.8" 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 5 602 | } 603 | -------------------------------------------------------------------------------- /07-DataFrame-Operations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 18, 6 | "id": "fa80df4e-9ddc-4db6-a801-717cd67ae883", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 19, 21 | "id": "01cd1819-fc7d-422e-916f-4ef8fc180bfb", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from pyspark.sql import SparkSession\n", 26 | "\n", 27 | "# Create a SparkSession\n", 28 | "spark = SparkSession.builder.appName(\"DataFrame-Operations\").getOrCreate()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 20, 34 | "id": "3a81039c-3491-4c8a-8ae4-5dab6b0af501", 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "id,name,category,quantity,price\n", 42 | "1,iPhone,Electronics,10,899.99\n", 43 | "2,Macbook,Electronics,5,1299.99\n", 44 | "3,iPad,Electronics,15,499.99\n", 45 | "4,Samsung TV,Electronics,8,799.99\n", 46 | "5,LG TV,Electronics,10,699.99\n", 47 | "6,Nike Shoes,Clothing,30,99.99\n", 48 | "7,Adidas Shoes,Clothing,25,89.99\n", 49 | "8,Sony Headphones,Electronics,12,149.99\n", 50 | "9,Beats Headphones,Electronics,20,199.99\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "%%bash\n", 56 | "head -10 data/stocks.txt" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 21, 62 | "id": "5b7e24a1-861d-4929-b3bd-5683f5bc5c1d", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# Load the synthetic data into a DataFrame\n", 67 | "data_file_path = \"./data/stocks.txt\"\n", 68 | "df = spark.read.csv(data_file_path, header=True, inferSchema=True)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 22, 74 | "id": "cdacf979-c105-4f65-9f28-4efc4c88c07a", 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "root\n", 82 | " |-- id: integer 
(nullable = true)\n", 83 | " |-- name: string (nullable = true)\n", 84 | " |-- category: string (nullable = true)\n", 85 | " |-- quantity: integer (nullable = true)\n", 86 | " |-- price: double (nullable = true)\n", 87 | "\n", 88 | "Initial DataFrame:\n", 89 | "+---+----------------+-----------+--------+-------+\n", 90 | "| id| name| category|quantity| price|\n", 91 | "+---+----------------+-----------+--------+-------+\n", 92 | "| 1| iPhone|Electronics| 10| 899.99|\n", 93 | "| 2| Macbook|Electronics| 5|1299.99|\n", 94 | "| 3| iPad|Electronics| 15| 499.99|\n", 95 | "| 4| Samsung TV|Electronics| 8| 799.99|\n", 96 | "| 5| LG TV|Electronics| 10| 699.99|\n", 97 | "| 6| Nike Shoes| Clothing| 30| 99.99|\n", 98 | "| 7| Adidas Shoes| Clothing| 25| 89.99|\n", 99 | "| 8| Sony Headphones|Electronics| 12| 149.99|\n", 100 | "| 9|Beats Headphones|Electronics| 20| 199.99|\n", 101 | "| 10| Dining Table| Furniture| 10| 249.99|\n", 102 | "+---+----------------+-----------+--------+-------+\n", 103 | "only showing top 10 rows\n", 104 | "\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "# Display schema of DataFrame\n", 110 | "df.printSchema()\n", 111 | "\n", 112 | "# Show the initial DataFrame\n", 113 | "print(\"Initial DataFrame:\")\n", 114 | "df.show(10)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "f43e6f84-16f0-4f6f-a7cf-a3de49d6ea53", 120 | "metadata": {}, 121 | "source": [ 122 | "### Select: Choose specific columns." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 23, 128 | "id": "7552160c-a792-4817-ab7c-117d5440a52c", 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "Selected Columns:\n", 136 | "+---+----------------+-------+\n", 137 | "| id| name| price|\n", 138 | "+---+----------------+-------+\n", 139 | "| 1| iPhone| 899.99|\n", 140 | "| 2| Macbook|1299.99|\n", 141 | "| 3| iPad| 499.99|\n", 142 | "| 4| Samsung TV| 799.99|\n", 143 | "| 5| LG TV| 699.99|\n", 144 | "| 6| Nike Shoes| 99.99|\n", 145 | "| 7| Adidas Shoes| 89.99|\n", 146 | "| 8| Sony Headphones| 149.99|\n", 147 | "| 9|Beats Headphones| 199.99|\n", 148 | "| 10| Dining Table| 249.99|\n", 149 | "+---+----------------+-------+\n", 150 | "only showing top 10 rows\n", 151 | "\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "# Select specific columns\n", 157 | "selected_columns = df.select(\"id\", \"name\", \"price\")\n", 158 | "print(\"Selected Columns:\")\n", 159 | "selected_columns.show(10)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "id": "1ba2c8d3-1e26-456d-94e3-6afac5a4a4a7", 165 | "metadata": {}, 166 | "source": [ 167 | "### Filter: Apply conditions to filter rows." 
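
The filter cell that follows uses an attribute condition (`df.quantity > 20`); equivalent conditions can be written with `col()` expressions or as SQL strings. A sketch (not in the notebook):

```python
from pyspark.sql.functions import col

# Both forms are equivalent to df.filter(df.quantity > 20) with an extra predicate.
df.filter((col("quantity") > 20) & (col("category") == "Clothing")).show()
df.filter("quantity > 20 AND category = 'Clothing'").show()
```
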
168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 24, 173 | "id": "0fc8c608-2fa3-4565-849b-ebc2b62025fe", 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "Filtered Data: 12\n", 181 | "+---+--------------+-----------+--------+-----+\n", 182 | "| id| name| category|quantity|price|\n", 183 | "+---+--------------+-----------+--------+-----+\n", 184 | "| 6| Nike Shoes| Clothing| 30|99.99|\n", 185 | "| 7| Adidas Shoes| Clothing| 25|89.99|\n", 186 | "| 12| Apples| Food| 100| 0.5|\n", 187 | "| 13| Bananas| Food| 150| 0.25|\n", 188 | "| 14| Oranges| Food| 120| 0.75|\n", 189 | "| 15|Chicken Breast| Food| 50| 3.99|\n", 190 | "| 16| Salmon Fillet| Food| 30| 5.99|\n", 191 | "| 24| Laptop Bag|Accessories| 25|29.99|\n", 192 | "| 25| Backpack|Accessories| 30|24.99|\n", 193 | "| 28| Jeans| Clothing| 30|59.99|\n", 194 | "| 29| T-shirt| Clothing| 50|14.99|\n", 195 | "| 30| Sneakers| Clothing| 40|79.99|\n", 196 | "+---+--------------+-----------+--------+-----+\n", 197 | "\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "# Filter rows based on a condition\n", 203 | "filtered_data = df.filter(df.quantity > 20)\n", 204 | "print(\"Filtered Data:\", filtered_data.count())\n", 205 | "filtered_data.show()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "18283acf-69eb-4140-a4dd-273c9eb5eafd", 211 | "metadata": {}, 212 | "source": [ 213 | "### GroupBy: Group data based on specific columns \n", 214 | "### Aggregations: Perform functions like sum, average, etc., on grouped data." 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 25, 220 | "id": "96d2db93-de0e-4707-81a8-3cfbb84dcf3d", 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "Grouped and Aggregated Data:\n", 228 | "+-----------+-------------+------------------+\n", 229 | "| category|sum(quantity)| avg(price)|\n", 230 | "+-----------+-------------+------------------+\n", 231 | "| Food| 450|2.2960000000000003|\n", 232 | "| Sports| 35| 34.99|\n", 233 | "|Electronics| 98| 586.6566666666665|\n", 234 | "| Clothing| 200| 99.2757142857143|\n", 235 | "| Furniture| 41| 141.99|\n", 236 | "|Accessories| 55| 27.49|\n", 237 | "+-----------+-------------+------------------+\n", 238 | "\n" 239 | ] 240 | } 241 | ], 242 | "source": [ 243 | "# GroupBy and Aggregations\n", 244 | "grouped_data = df.groupBy(\"category\").agg({\"quantity\": \"sum\", \"price\": \"avg\"})\n", 245 | "print(\"Grouped and Aggregated Data:\")\n", 246 | "grouped_data.show()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "id": "a233a823-73b1-404f-b8e0-90ee5e78c4e4", 252 | "metadata": {}, 253 | "source": [ 254 | "### Join: Combine multiple DataFrames based on specified columns." 
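
The join cell below keeps both frames' `category` columns (the output shows the name twice), which would make `joined_data["category"]` ambiguous. A sketch of avoiding that by renaming before the join (not in the notebook):

```python
# Rename the right-hand column so both survive unambiguously.
df2_renamed = df2.withColumnRenamed("category", "category_2")
joined_data = df.join(df2_renamed, "id", "inner")
joined_data.select("id", "name", "category", "category_2").show()
```
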
255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 26, 260 | "id": "494f996e-57ee-44c0-82d8-814ee777653e", 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "Joined Data:\n", 268 | "+---+----------------+-----------+--------+-------+-----------+\n", 269 | "| id| name| category|quantity| price| category|\n", 270 | "+---+----------------+-----------+--------+-------+-----------+\n", 271 | "| 1| iPhone|Electronics| 10| 899.99|Electronics|\n", 272 | "| 2| Macbook|Electronics| 5|1299.99|Electronics|\n", 273 | "| 3| iPad|Electronics| 15| 499.99|Electronics|\n", 274 | "| 4| Samsung TV|Electronics| 8| 799.99|Electronics|\n", 275 | "| 5| LG TV|Electronics| 10| 699.99|Electronics|\n", 276 | "| 6| Nike Shoes| Clothing| 30| 99.99| Clothing|\n", 277 | "| 7| Adidas Shoes| Clothing| 25| 89.99| Clothing|\n", 278 | "| 8| Sony Headphones|Electronics| 12| 149.99|Electronics|\n", 279 | "| 9|Beats Headphones|Electronics| 20| 199.99|Electronics|\n", 280 | "| 10| Dining Table| Furniture| 10| 249.99| Furniture|\n", 281 | "+---+----------------+-----------+--------+-------+-----------+\n", 282 | "\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "# Join with another DataFrame\n", 288 | "df2 = df.select(\"id\", \"category\").limit(10)\n", 289 | "joined_data = df.join(df2, \"id\", \"inner\")\n", 290 | "print(\"Joined Data:\")\n", 291 | "joined_data.show()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "id": "6e71e549-8194-4a95-ae3a-9db0a6afa5dc", 297 | "metadata": {}, 298 | "source": [ 299 | "### Sort: Arrange rows based on one or more columns." 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 27, 305 | "id": "133ab21b-84ca-48e7-a8bf-8c7e487e455c", 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "name": "stdout", 310 | "output_type": "stream", 311 | "text": [ 312 | "Sorted Data:\n", 313 | "+---+--------------+-----------+--------+-----+\n", 314 | "| id| name| category|quantity|price|\n", 315 | "+---+--------------+-----------+--------+-----+\n", 316 | "| 13| Bananas| Food| 150| 0.25|\n", 317 | "| 12| Apples| Food| 100| 0.5|\n", 318 | "| 14| Oranges| Food| 120| 0.75|\n", 319 | "| 15|Chicken Breast| Food| 50| 3.99|\n", 320 | "| 16| Salmon Fillet| Food| 30| 5.99|\n", 321 | "| 29| T-shirt| Clothing| 50|14.99|\n", 322 | "| 19| Yoga Mat| Sports| 20|19.99|\n", 323 | "| 25| Backpack|Accessories| 30|24.99|\n", 324 | "| 24| Laptop Bag|Accessories| 25|29.99|\n", 325 | "| 20| Dumbbell Set| Sports| 15|49.99|\n", 326 | "+---+--------------+-----------+--------+-----+\n", 327 | "only showing top 10 rows\n", 328 | "\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "# Sort by a column\n", 334 | "sorted_data = df.orderBy(\"price\")\n", 335 | "print(\"Sorted Data:\")\n", 336 | "sorted_data.show(10)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 28, 342 | "id": "a5a0a80e-dc5c-4569-ade9-e209560eccb8", 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "Sorted Data Descending:\n", 350 | "+---+----------------+-----------+--------+-------+\n", 351 | "| id| name| category|quantity| price|\n", 352 | "+---+----------------+-----------+--------+-------+\n", 353 | "| 2| Macbook|Electronics| 5|1299.99|\n", 354 | "| 1| iPhone|Electronics| 10| 899.99|\n", 355 | "| 4| Samsung TV|Electronics| 8| 799.99|\n", 356 | "| 5| LG TV|Electronics| 10| 699.99|\n", 357 | "| 26| Camera|Electronics| 
10| 599.99|\n", 358 | "| 3| iPad|Electronics| 15| 499.99|\n", 359 | "| 10| Dining Table| Furniture| 10| 249.99|\n", 360 | "| 17| Leather Jacket| Clothing| 15| 199.99|\n", 361 | "| 9|Beats Headphones|Electronics| 20| 199.99|\n", 362 | "| 18| Winter Coat| Clothing| 10| 149.99|\n", 363 | "+---+----------------+-----------+--------+-------+\n", 364 | "only showing top 10 rows\n", 365 | "\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "# Sort by a column desc\n", 371 | "from pyspark.sql.functions import col, desc\n", 372 | "sorted_data = df.orderBy(col(\"price\").desc(), col(\"id\").desc())\n", 373 | "print(\"Sorted Data Descending:\")\n", 374 | "sorted_data.show(10)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "id": "67580e00-93f8-4579-9972-fc64a0654366", 380 | "metadata": {}, 381 | "source": [ 382 | "### Distinct: Get unique rows." 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 29, 388 | "id": "744e6638-5689-4509-9df4-4abb03cd9e9b", 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "name": "stdout", 393 | "output_type": "stream", 394 | "text": [ 395 | "Distinct Product Categories:\n", 396 | "+-----------+\n", 397 | "| category|\n", 398 | "+-----------+\n", 399 | "| Food|\n", 400 | "| Sports|\n", 401 | "|Electronics|\n", 402 | "| Clothing|\n", 403 | "| Furniture|\n", 404 | "|Accessories|\n", 405 | "+-----------+\n", 406 | "\n" 407 | ] 408 | } 409 | ], 410 | "source": [ 411 | "# Get distinct product category\n", 412 | "distinct_rows = df.select(\"category\").distinct()\n", 413 | "print(\"Distinct Product Categories:\")\n", 414 | "distinct_rows.show()" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "id": "4c847aac-dcde-4589-aef7-c0ec95c2f80f", 420 | "metadata": {}, 421 | "source": [ 422 | "### Drop: Remove specified columns." 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 30, 428 | "id": "21d4afa3-20b5-4299-931a-0fba2655b509", 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "Dropped Columns:\n", 436 | "+---+----------------+-------+\n", 437 | "| id| name| price|\n", 438 | "+---+----------------+-------+\n", 439 | "| 1| iPhone| 899.99|\n", 440 | "| 2| Macbook|1299.99|\n", 441 | "| 3| iPad| 499.99|\n", 442 | "| 4| Samsung TV| 799.99|\n", 443 | "| 5| LG TV| 699.99|\n", 444 | "| 6| Nike Shoes| 99.99|\n", 445 | "| 7| Adidas Shoes| 89.99|\n", 446 | "| 8| Sony Headphones| 149.99|\n", 447 | "| 9|Beats Headphones| 199.99|\n", 448 | "| 10| Dining Table| 249.99|\n", 449 | "+---+----------------+-------+\n", 450 | "only showing top 10 rows\n", 451 | "\n" 452 | ] 453 | } 454 | ], 455 | "source": [ 456 | "# Drop columns\n", 457 | "dropped_columns = df.drop(\"quantity\", \"category\")\n", 458 | "print(\"Dropped Columns:\")\n", 459 | "dropped_columns.show(10)" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "id": "afc28820-f951-4c1e-99bc-e56ff434cc11", 465 | "metadata": {}, 466 | "source": [ 467 | "### WithColumn: Add new calculated columns." 
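A note on the Distinct and Drop cells above: `distinct()` deduplicates entire rows of whatever projection you give it, while `dropDuplicates` can deduplicate on a subset of columns and still keep the remaining ones. A minimal sketch with illustrative inline data (not the notebook's stocks dataset):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DedupSketch").getOrCreate()
df = spark.createDataFrame(
    [(1, "iPhone", "Electronics"), (2, "iPad", "Electronics"), (3, "Jeans", "Clothing")],
    ["id", "name", "category"],
)

# distinct() on a single-column projection: unique categories only
df.select("category").distinct().show()

# dropDuplicates on a subset: one full row (an arbitrary one) kept per category
df.dropDuplicates(["category"]).show()
```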
468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 31, 473 | "id": "fb391702-9e54-4e3d-822b-0d0f1d0d08e6", 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "DataFrame with New Column:\n", 481 | "+---+----------------+-----------+--------+-------+-------+\n", 482 | "| id| name| category|quantity| price|revenue|\n", 483 | "+---+----------------+-----------+--------+-------+-------+\n", 484 | "| 1| iPhone|Electronics| 10| 899.99| 8999.9|\n", 485 | "| 2| Macbook|Electronics| 5|1299.99|6499.95|\n", 486 | "| 3| iPad|Electronics| 15| 499.99|7499.85|\n", 487 | "| 4| Samsung TV|Electronics| 8| 799.99|6399.92|\n", 488 | "| 5| LG TV|Electronics| 10| 699.99| 6999.9|\n", 489 | "| 6| Nike Shoes| Clothing| 30| 99.99| 2999.7|\n", 490 | "| 7| Adidas Shoes| Clothing| 25| 89.99|2249.75|\n", 491 | "| 8| Sony Headphones|Electronics| 12| 149.99|1799.88|\n", 492 | "| 9|Beats Headphones|Electronics| 20| 199.99| 3999.8|\n", 493 | "| 10| Dining Table| Furniture| 10| 249.99| 2499.9|\n", 494 | "+---+----------------+-----------+--------+-------+-------+\n", 495 | "only showing top 10 rows\n", 496 | "\n" 497 | ] 498 | } 499 | ], 500 | "source": [ 501 | "# Add a new calculated column\n", 502 | "df_with_new_column = df.withColumn(\"revenue\", df.quantity * df.price)\n", 503 | "print(\"DataFrame with New Column:\")\n", 504 | "df_with_new_column.show(10)" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "id": "c2138074-68c4-403b-aac6-41fee4595417", 510 | "metadata": {}, 511 | "source": [ 512 | "### Alias: Rename columns for better readability." 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 32, 518 | "id": "669657f4-63a0-48f6-bc9e-a1bebeae466e", 519 | "metadata": {}, 520 | "outputs": [ 521 | { 522 | "name": "stdout", 523 | "output_type": "stream", 524 | "text": [ 525 | "DataFrame with Aliased Column:\n", 526 | "+---+----------------+-----------+--------+-------------+\n", 527 | "| id| name| category|quantity|product_price|\n", 528 | "+---+----------------+-----------+--------+-------------+\n", 529 | "| 1| iPhone|Electronics| 10| 899.99|\n", 530 | "| 2| Macbook|Electronics| 5| 1299.99|\n", 531 | "| 3| iPad|Electronics| 15| 499.99|\n", 532 | "| 4| Samsung TV|Electronics| 8| 799.99|\n", 533 | "| 5| LG TV|Electronics| 10| 699.99|\n", 534 | "| 6| Nike Shoes| Clothing| 30| 99.99|\n", 535 | "| 7| Adidas Shoes| Clothing| 25| 89.99|\n", 536 | "| 8| Sony Headphones|Electronics| 12| 149.99|\n", 537 | "| 9|Beats Headphones|Electronics| 20| 199.99|\n", 538 | "| 10| Dining Table| Furniture| 10| 249.99|\n", 539 | "+---+----------------+-----------+--------+-------------+\n", 540 | "only showing top 10 rows\n", 541 | "\n" 542 | ] 543 | } 544 | ], 545 | "source": [ 546 | "# Rename columns using withColumnRenamed\n", 547 | "df_with_alias = df.withColumnRenamed(\"price\", \"product_price\")\n", 548 | "print(\"DataFrame with Aliased Column:\")\n", 549 | "df_with_alias.show(10)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 33, 555 | "id": "f305239c-fa2b-4378-9de8-bfc58e5f244f", 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "# Stop the SparkSession\n", 560 | "spark.stop()" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "id": "b89aa6d1-157a-46da-b9a2-c09feeb2c82e", 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [] 570 | } 571 | ], 572 | "metadata": { 573 | "kernelspec": { 574 | "display_name": "Python 3
(ipykernel)", 575 | "language": "python", 576 | "name": "python3" 577 | }, 578 | "language_info": { 579 | "codemirror_mode": { 580 | "name": "ipython", 581 | "version": 3 582 | }, 583 | "file_extension": ".py", 584 | "mimetype": "text/x-python", 585 | "name": "python", 586 | "nbconvert_exporter": "python", 587 | "pygments_lexer": "ipython3", 588 | "version": "3.10.8" 589 | } 590 | }, 591 | "nbformat": 4, 592 | "nbformat_minor": 5 593 | } 594 | -------------------------------------------------------------------------------- /08-Spark-SQL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 52, 6 | "id": "7f41cf64-565f-48b4-a5e5-4d48fc404270", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Set the PySpark environment variables\n", 11 | "import os\n", 12 | "os.environ['SPARK_HOME'] = \"/Users/coder2j/Apps/Spark\"\n", 13 | "os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'\n", 14 | "os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'\n", 15 | "os.environ['PYSPARK_PYTHON'] = 'python'" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 54, 21 | "id": "958a52c6-da3d-45b5-9c2e-27e579c6066d", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from pyspark.sql import SparkSession\n", 26 | "\n", 27 | "# Create a SparkSession\n", 28 | "spark = SparkSession.builder.appName(\"DataFrameSQL\").getOrCreate()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 55, 34 | "id": "d3a86cbb-583f-427d-970b-de58f9e7bcf4", 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "name,age,gender,salary\n", 42 | "John Doe,30,Male,50000\n", 43 | "Jane Smith,25,Female,45000\n", 44 | "David Johnson,35,Male,60000\n", 45 | "Emily Davis,28,Female,52000\n", 46 | "Michael Wilson,40,Male,75000\n", 47 | "Sarah Brown,32,Female,58000\n", 48 | "Robert Lee,29,Male,51000\n", 49 | "Lisa Garcia,27,Female,49000\n", 50 | "James Martinez,38,Male,70000\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "%%bash\n", 56 | "head -10 ./data/persons.csv" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "24c56d31-71d2-48c0-8d22-b7eb9c4c347f", 62 | "metadata": {}, 63 | "source": [ 64 | "### Load Data into a DataFrame" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 56, 70 | "id": "dcd9c1d7-ccc2-4cee-8547-c23f3bee42bb", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# Load the synthetic data into a DataFrame\n", 75 | "data_file_path = \"./data/persons.csv\"\n", 76 | "df = spark.read.csv(data_file_path, header=True, inferSchema=True)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 57, 82 | "id": "99b0d2db-5d0e-4e82-9d45-95029f614319", 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "root\n", 90 | " |-- name: string (nullable = true)\n", 91 | " |-- age: integer (nullable = true)\n", 92 | " |-- gender: string (nullable = true)\n", 93 | " |-- salary: integer (nullable = true)\n", 94 | "\n", 95 | "Initial DataFrame:\n", 96 | "+------------------+---+------+------+\n", 97 | "| name|age|gender|salary|\n", 98 | "+------------------+---+------+------+\n", 99 | "| John Doe| 30| Male| 50000|\n", 100 | "| Jane Smith| 25|Female| 45000|\n", 101 | "| David Johnson| 35| Male| 60000|\n", 102 | "| Emily Davis| 28|Female| 52000|\n", 103 | "| Michael Wilson| 40| Male| 75000|\n", 104 | "| 
Sarah Brown| 32|Female| 58000|\n", 105 | "| Robert Lee| 29| Male| 51000|\n", 106 | "| Lisa Garcia| 27|Female| 49000|\n", 107 | "| James Martinez| 38| Male| 70000|\n", 108 | "|Jennifer Rodriguez| 26|Female| 47000|\n", 109 | "+------------------+---+------+------+\n", 110 | "only showing top 10 rows\n", 111 | "\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "# Display schema of DataFrame\n", 117 | "df.printSchema()\n", 118 | "\n", 119 | "# Show the initial DataFrame\n", 120 | "print(\"Initial DataFrame:\")\n", 121 | "df.show(10)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "id": "1293db0e-3af1-40d3-9fed-15cb5c6d54f1", 127 | "metadata": {}, 128 | "source": [ 129 | "### Register the DataFrame as a Temporary Table" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 58, 135 | "id": "439040dd-8c13-48c9-9a72-acd313da407a", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "# Register the DataFrame as a Temporary Table\n", 140 | "df.createOrReplaceTempView(\"my_table\")" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "id": "3c0ed5db-3985-4251-935a-3c01edd47005", 146 | "metadata": {}, 147 | "source": [ 148 | "### Perform SQL-like Queries" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 59, 154 | "id": "ccf8aa7d-e77f-4fca-93d9-fd4f2581803d", 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "+------------------+---+------+------+\n", 162 | "| name|age|gender|salary|\n", 163 | "+------------------+---+------+------+\n", 164 | "| John Doe| 30| Male| 50000|\n", 165 | "| David Johnson| 35| Male| 60000|\n", 166 | "| Emily Davis| 28|Female| 52000|\n", 167 | "| Michael Wilson| 40| Male| 75000|\n", 168 | "| Sarah Brown| 32|Female| 58000|\n", 169 | "| Robert Lee| 29| Male| 51000|\n", 170 | "| Lisa Garcia| 27|Female| 49000|\n", 171 | "| James Martinez| 38| Male| 70000|\n", 172 | "|Jennifer Rodriguez| 26|Female| 47000|\n", 173 | "| William Anderson| 33| Male| 62000|\n", 174 | "| Karen Hernandez| 31|Female| 55000|\n", 175 | "|Christopher Taylor| 37| Male| 69000|\n", 176 | "| Matthew Davis| 36| Male| 67000|\n", 177 | "| Patricia White| 29|Female| 50000|\n", 178 | "| Daniel Miller| 34| Male| 64000|\n", 179 | "| Elizabeth Jackson| 30|Female| 52000|\n", 180 | "| Joseph Harris| 28| Male| 53000|\n", 181 | "| Linda Martin| 39|Female| 71000|\n", 182 | "+------------------+---+------+------+\n", 183 | "\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "# Select all rows where age is greater than 25\n", 189 | "result = spark.sql(\"SELECT * FROM my_table WHERE age > 25\")\n", 190 | "\n", 191 | "result.show()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 60, 197 | "id": "8b88f371-fd06-4aed-9fca-bdb4ce1362ff", 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "+------+----------+\n", 205 | "|gender|avg_salary|\n", 206 | "+------+----------+\n", 207 | "|Female| 52300.0|\n", 208 | "| Male| 62100.0|\n", 209 | "+------+----------+\n", 210 | "\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "# Compute the average salary by gender\n", 216 | "avg_salary_by_gender = spark.sql(\"SELECT gender, AVG(salary) as avg_salary FROM my_table GROUP BY gender\")\n", 217 | "avg_salary_by_gender.show()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "id": "62b21e82-65e2-4b09-b344-a5cc72d9d430", 223 | "metadata": {}, 224 | "source": [ 
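The aggregation in the `spark.sql` cell above can also be expressed through the DataFrame API; both routes compile to the same query plan, so the choice is largely a matter of style. A sketch assuming a DataFrame with `gender` and `salary` columns like `persons.csv` (the app name and inline rows are illustrative):

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("SqlVsDataFrame").getOrCreate()
df = spark.createDataFrame(
    [("Male", 50000), ("Female", 45000), ("Male", 60000)],
    ["gender", "salary"],
)

# DataFrame-API equivalent of:
#   SELECT gender, AVG(salary) AS avg_salary FROM my_table GROUP BY gender
df.groupBy("gender").agg(F.avg("salary").alias("avg_salary")).show()
```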
225 | "### Creating and managing temporary views." 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 61, 231 | "id": "6469c424-87cd-4dfa-b015-1c3f8e591de0", 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "# Create a temporary view\n", 236 | "df.createOrReplaceTempView(\"people\")" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 62, 242 | "id": "9198c861-e79a-4fac-80cc-747c94f062ce", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "+------------------+---+------+------+\n", 250 | "| name|age|gender|salary|\n", 251 | "+------------------+---+------+------+\n", 252 | "| John Doe| 30| Male| 50000|\n", 253 | "| David Johnson| 35| Male| 60000|\n", 254 | "| Emily Davis| 28|Female| 52000|\n", 255 | "| Michael Wilson| 40| Male| 75000|\n", 256 | "| Sarah Brown| 32|Female| 58000|\n", 257 | "| Robert Lee| 29| Male| 51000|\n", 258 | "| Lisa Garcia| 27|Female| 49000|\n", 259 | "| James Martinez| 38| Male| 70000|\n", 260 | "|Jennifer Rodriguez| 26|Female| 47000|\n", 261 | "| William Anderson| 33| Male| 62000|\n", 262 | "| Karen Hernandez| 31|Female| 55000|\n", 263 | "|Christopher Taylor| 37| Male| 69000|\n", 264 | "| Matthew Davis| 36| Male| 67000|\n", 265 | "| Patricia White| 29|Female| 50000|\n", 266 | "| Daniel Miller| 34| Male| 64000|\n", 267 | "| Elizabeth Jackson| 30|Female| 52000|\n", 268 | "| Joseph Harris| 28| Male| 53000|\n", 269 | "| Linda Martin| 39|Female| 71000|\n", 270 | "+------------------+---+------+------+\n", 271 | "\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "# Query the temporary view\n", 277 | "result = spark.sql(\"SELECT * FROM people WHERE age > 25\")\n", 278 | "\n", 279 | "result.show()" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 63, 285 | "id": "b66e2493-82c9-4a46-a2c5-049fe9572e7b", 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "True" 292 | ] 293 | }, 294 | "execution_count": 63, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "# Check if a temporary view exists\n", 301 | "view_exists = spark.catalog.tableExists(\"people\")\n", 302 | "view_exists" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 64, 308 | "id": "6d363610-7dc6-4a80-bc2d-c1269691e437", 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "data": { 313 | "text/plain": [ 314 | "True" 315 | ] 316 | }, 317 | "execution_count": 64, 318 | "metadata": {}, 319 | "output_type": "execute_result" 320 | } 321 | ], 322 | "source": [ 323 | "# Drop a temporary view\n", 324 | "spark.catalog.dropTempView(\"people\")" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 65, 330 | "id": "38f7e2b9-8d1a-4060-8791-7cbfed9d8b22", 331 | "metadata": {}, 332 | "outputs": [ 333 | { 334 | "data": { 335 | "text/plain": [ 336 | "False" 337 | ] 338 | }, 339 | "execution_count": 65, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "# Check if a temporary view exists\n", 346 | "view_exists = spark.catalog.tableExists(\"people\")\n", 347 | "view_exists" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "id": "8bcdf631-ee1d-496f-858b-a13d0b43cb18", 353 | "metadata": {}, 354 | "source": [ 355 | "### Subquries" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 66, 361 | "id": "5cc8db71-4862-4169-b2d0-7e38ccdf8331", 
362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stdout", 366 | "output_type": "stream", 367 | "text": [ 368 | "+---+-------+\n", 369 | "| id| name|\n", 370 | "+---+-------+\n", 371 | "| 1| John|\n", 372 | "| 2| Alice|\n", 373 | "| 3| Bob|\n", 374 | "| 4| Emily|\n", 375 | "| 5| David|\n", 376 | "| 6| Sarah|\n", 377 | "| 7|Michael|\n", 378 | "| 8| Lisa|\n", 379 | "| 9|William|\n", 380 | "+---+-------+\n", 381 | "\n", 382 | "+----------+---+------+\n", 383 | "|department| id|salary|\n", 384 | "+----------+---+------+\n", 385 | "| HR| 1| 60000|\n", 386 | "| HR| 2| 55000|\n", 387 | "| HR| 3| 58000|\n", 388 | "| IT| 4| 70000|\n", 389 | "| IT| 5| 72000|\n", 390 | "| IT| 6| 68000|\n", 391 | "| Sales| 7| 75000|\n", 392 | "| Sales| 8| 78000|\n", 393 | "| Sales| 9| 77000|\n", 394 | "+----------+---+------+\n", 395 | "\n" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "# Create DataFrames\n", 401 | "employee_data = [\n", 402 | " (1, \"John\"), (2, \"Alice\"), (3, \"Bob\"), (4, \"Emily\"),\n", 403 | " (5, \"David\"), (6, \"Sarah\"), (7, \"Michael\"), (8, \"Lisa\"),\n", 404 | " (9, \"William\")\n", 405 | "]\n", 406 | "employees = spark.createDataFrame(employee_data, [\"id\", \"name\"])\n", 407 | "\n", 408 | "salary_data = [\n", 409 | " (\"HR\", 1, 60000), (\"HR\", 2, 55000), (\"HR\", 3, 58000),\n", 410 | " (\"IT\", 4, 70000), (\"IT\", 5, 72000), (\"IT\", 6, 68000),\n", 411 | " (\"Sales\", 7, 75000), (\"Sales\", 8, 78000), (\"Sales\", 9, 77000)\n", 412 | "]\n", 413 | "salaries = spark.createDataFrame(salary_data, [\"department\", \"id\", \"salary\"])\n", 414 | "\n", 415 | "employees.show()\n", 416 | "\n", 417 | "salaries.show()" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 67, 423 | "id": "871a978f-82f9-4388-afae-77d4b0e7297b", 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "# Register as temporary views\n", 428 | "employees.createOrReplaceTempView(\"employees\")\n", 429 | "salaries.createOrReplaceTempView(\"salaries\")" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 68, 435 | "id": "c0a3c981-b8d4-409c-bd3a-aaff55ee3661", 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "name": "stdout", 440 | "output_type": "stream", 441 | "text": [ 442 | "+-------+\n", 443 | "| name|\n", 444 | "+-------+\n", 445 | "| Emily|\n", 446 | "| David|\n", 447 | "|Michael|\n", 448 | "| Lisa|\n", 449 | "|William|\n", 450 | "+-------+\n", 451 | "\n" 452 | ] 453 | } 454 | ], 455 | "source": [ 456 | "# Subquery to find employees with salaries above average\n", 457 | "result = spark.sql(\"\"\"\n", 458 | " SELECT name\n", 459 | " FROM employees\n", 460 | " WHERE id IN (\n", 461 | " SELECT id\n", 462 | " FROM salaries\n", 463 | " WHERE salary > (SELECT AVG(salary) FROM salaries)\n", 464 | " )\n", 465 | "\"\"\")\n", 466 | "\n", 467 | "result.show()" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "id": "5237a76b-d747-4096-a013-41cc913cd9c0", 473 | "metadata": {}, 474 | "source": [ 475 | "### Window Function" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 69, 481 | "id": "afd58076-cada-4552-a676-7919cb17ba2b", 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "from pyspark.sql.window import Window\n", 486 | "from pyspark.sql import functions as F" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": 70, 492 | "id": "5669387e-aa09-4830-a4c4-c32056e38d5f", 493 | "metadata": {}, 494 | "outputs": [ 495 | { 496 | "name": "stdout", 497 | "output_type": 
"stream", 498 | "text": [ 499 | "+----------+---+------+-------+\n", 500 | "|department| id|salary| name|\n", 501 | "+----------+---+------+-------+\n", 502 | "| HR| 1| 60000| John|\n", 503 | "| HR| 2| 55000| Alice|\n", 504 | "| HR| 3| 58000| Bob|\n", 505 | "| IT| 4| 70000| Emily|\n", 506 | "| IT| 5| 72000| David|\n", 507 | "| IT| 6| 68000| Sarah|\n", 508 | "| Sales| 7| 75000|Michael|\n", 509 | "| Sales| 8| 78000| Lisa|\n", 510 | "| Sales| 9| 77000|William|\n", 511 | "+----------+---+------+-------+\n", 512 | "\n" 513 | ] 514 | } 515 | ], 516 | "source": [ 517 | "employee_salary = spark.sql(\"\"\"\n", 518 | " select salaries.*, employees.name\n", 519 | " from salaries \n", 520 | " left join employees on salaries.id = employees.id\n", 521 | "\"\"\")\n", 522 | "\n", 523 | "employee_salary.show()" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 71, 529 | "id": "0f2fb287-69a4-4860-a13b-f76646e2f465", 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "# Create a window specification\n", 534 | "window_spec = Window.partitionBy(\"department\").orderBy(F.desc(\"salary\"))" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 72, 540 | "id": "dcf9ca4b-f1ce-43f1-ac72-bac31c32e34f", 541 | "metadata": {}, 542 | "outputs": [ 543 | { 544 | "name": "stdout", 545 | "output_type": "stream", 546 | "text": [ 547 | "+----------+---+------+-------+----+\n", 548 | "|department| id|salary| name|rank|\n", 549 | "+----------+---+------+-------+----+\n", 550 | "| HR| 1| 60000| John| 1|\n", 551 | "| HR| 3| 58000| Bob| 2|\n", 552 | "| HR| 2| 55000| Alice| 3|\n", 553 | "| IT| 5| 72000| David| 1|\n", 554 | "| IT| 4| 70000| Emily| 2|\n", 555 | "| IT| 6| 68000| Sarah| 3|\n", 556 | "| Sales| 8| 78000| Lisa| 1|\n", 557 | "| Sales| 9| 77000|William| 2|\n", 558 | "| Sales| 7| 75000|Michael| 3|\n", 559 | "+----------+---+------+-------+----+\n", 560 | "\n" 561 | ] 562 | } 563 | ], 564 | "source": [ 565 | "# Calculate the rank of employees within each department based on salary\n", 566 | "employee_salary.withColumn(\"rank\", F.rank().over(window_spec)).show()" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 73, 572 | "id": "7d85a9f5-3da2-4ef7-b201-7bb43ccc6bc2", 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [ 576 | "# Stop the SparkSession\n", 577 | "spark.stop()" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "id": "fea9d104-e4b3-4be0-9fae-43fdbc391691", 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [] 587 | } 588 | ], 589 | "metadata": { 590 | "kernelspec": { 591 | "display_name": "Python 3 (ipykernel)", 592 | "language": "python", 593 | "name": "python3" 594 | }, 595 | "language_info": { 596 | "codemirror_mode": { 597 | "name": "ipython", 598 | "version": 3 599 | }, 600 | "file_extension": ".py", 601 | "mimetype": "text/x-python", 602 | "name": "python", 603 | "nbconvert_exporter": "python", 604 | "pygments_lexer": "ipython3", 605 | "version": "3.10.8" 606 | } 607 | }, 608 | "nbformat": 4, 609 | "nbformat_minor": 5 610 | } 611 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 coder2j 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, 
including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySpark Tutorial for Beginners - Jupyter Notebooks 2 | 3 | Welcome to the PySpark Tutorial for Beginners GitHub repository! This repository contains a collection of Jupyter notebooks used in my comprehensive [YouTube video: PySpark tutorial for beginners](https://youtu.be/EB8lfdxpirM). These notebooks provide hands-on examples and code snippets to help you understand and practice PySpark concepts covered in the tutorial video. 4 | 5 | If you find this tutorial helpful, consider sharing this video with your friends and colleagues to help them discover the power of PySpark and unlock the following bonus videos. 6 | 7 | 🎁 Bonus Videos: 8 | - Hit **50,000 views** to unlock a video about building an **end-to-end machine-learning pipeline with PySpark**. 9 | - Hit **100,000 views** to unlock another video about **end-to-end Spark streaming**. 10 | 11 | Do you like this tutorial? Why not check out my other video, [Airflow Tutorial for Beginners](https://youtu.be/K9AnJ9_ZAXE), which has more than **350k views 👀** and around **7k likes 👍**. 12 | 13 | Don't forget to subscribe to my [YouTube channel](https://www.youtube.com/c/coder2j) and [my blog](https://coder2j.com/) for more exciting tutorials like this. And connect with me on [X/Twitter](https://twitter.com/coder2j) and [LinkedIn](https://www.linkedin.com/in/coder2j/); I post content there regularly too. Thank you for your support! ❤️ 14 | 15 | 16 | ## Table of Contents 17 | 18 | - [Introduction](#introduction) 19 | - [Getting Started](#getting-started) 20 | - [Notebook Descriptions](#notebook-descriptions) 21 | - [Prerequisites](#prerequisites) 22 | - [Usage](#usage) 23 | - [Contributing](#contributing) 24 | - [License](#license) 25 | 26 | ## Introduction 27 | 28 | In our [PySpark tutorial video](https://youtu.be/EB8lfdxpirM), we covered various topics, including Spark installation, SparkContext, SparkSession, RDD transformations and actions, Spark DataFrames, Spark SQL, and more. These Jupyter notebooks are designed to complement the video content, allowing you to follow along, experiment, and practice your PySpark skills. 29 | 30 | ## Getting Started 31 | 32 | To get started with the Jupyter notebooks, follow these steps: 33 | 34 | 1. Clone this GitHub repository to your local machine using the following command: 35 | 36 | ```bash 37 | git clone https://github.com/coder2j/pyspark-tutorial.git 38 | ``` 39 | 40 | 2. 
Ensure you have Python and Jupyter Notebook installed on your machine. 41 | 42 | 3. Follow part 2 of the YouTube video (Spark Installation) to make sure Spark is installed on your machine. 43 | 44 | 4. Launch Jupyter Notebook by running: 45 | 46 | ```bash 47 | jupyter notebook 48 | ``` 49 | 50 | 5. Open the notebook you want to work on and start experimenting with PySpark. 51 | 52 | ## Notebook Descriptions 53 | 54 | - **Notebook 1 - 01-PySpark-Get-Started.ipynb**: Instructions and commands for setting the PySpark environment variables to use Spark in a Jupyter notebook. 55 | 56 | - **Notebook 2 - 02-Create-SparkContext.ipynb**: Creating SparkContext objects in different PySpark versions. 57 | 58 | 59 | - **Notebook 3 - 03-Create-SparkSession.ipynb**: Creating SparkSession objects in PySpark. 60 | 61 | - **Notebook 4 - 04-RDD-Operations.ipynb**: Creating RDDs and demonstrating RDD transformations and actions. 62 | 63 | - **Notebook 5 - 05-DataFrame-Intro.ipynb**: Introduction to Spark DataFrames and how they differ from RDDs. 64 | 65 | - **Notebook 6 - 06-DataFrame-from-various-data-source.ipynb**: Creating Spark DataFrames from various data sources. 66 | 67 | - **Notebook 7 - 07-DataFrame-Operations.ipynb**: Performing Spark DataFrame operations such as filtering and aggregation. 68 | 69 | - **Notebook 8 - 08-Spark-SQL.ipynb**: Converting a Spark DataFrame to a temporary table or view and performing SQL operations using Spark SQL. 70 | 71 | Feel free to explore and run these notebooks at your own pace. 72 | 73 | ## Prerequisites 74 | 75 | To make the most of these notebooks, you should have the following prerequisites: 76 | 77 | - Basic knowledge of Python programming. 78 | 79 | - Understanding of data processing concepts (though no prior PySpark experience is required). 80 | 81 | ## Usage 82 | 83 | These notebooks are meant for self-learning and practice. Follow along with the [tutorial video](https://youtu.be/EB8lfdxpirM) to gain a deeper understanding of PySpark concepts. Experiment with the code, modify it, and try additional exercises to solidify your skills. 84 | 85 | ## Contributing 86 | 87 | If you'd like to contribute to this repository by adding more notebooks, improving documentation, or fixing issues, please feel free to fork the repository, make your changes, and submit a pull request. We welcome contributions from the community! 88 | 89 | ## License 90 | 91 | This project is licensed under the [MIT License](LICENSE). 92 | -------------------------------------------------------------------------------- /data/data.txt: -------------------------------------------------------------------------------- 1 | Apache Spark has its architectural foundation in the resilient distributed dataset (RDD), a read-only multiset of data items distributed over a cluster of machines, that is maintained in a fault-tolerant way. The Dataframe API was released as an abstraction on top of the RDD, followed by the Dataset API. In Spark 1.x, the RDD was the primary application programming interface (API), but as of Spark 2.x use of the Dataset API is encouraged even though the RDD API is not deprecated. The RDD technology still underlies the Dataset API. 2 | 3 | Spark and its RDDs were developed in 2012 in response to limitations in the MapReduce cluster computing paradigm, which forces a particular linear dataflow structure on distributed programs: MapReduce programs read input data from disk, map a function across the data, reduce the results of the map, and store reduction results on disk. 
Spark's RDDs function as a working set for distributed programs that offers a (deliberately) restricted form of distributed shared memory. -------------------------------------------------------------------------------- /data/persons.csv: -------------------------------------------------------------------------------- 1 | name,age,gender,salary 2 | John Doe,30,Male,50000 3 | Jane Smith,25,Female,45000 4 | David Johnson,35,Male,60000 5 | Emily Davis,28,Female,52000 6 | Michael Wilson,40,Male,75000 7 | Sarah Brown,32,Female,58000 8 | Robert Lee,29,Male,51000 9 | Lisa Garcia,27,Female,49000 10 | James Martinez,38,Male,70000 11 | Jennifer Rodriguez,26,Female,47000 12 | William Anderson,33,Male,62000 13 | Karen Hernandez,31,Female,55000 14 | Christopher Taylor,37,Male,69000 15 | Mary Gonzalez,24,Female,44000 16 | Matthew Davis,36,Male,67000 17 | Patricia White,29,Female,50000 18 | Daniel Miller,34,Male,64000 19 | Elizabeth Jackson,30,Female,52000 20 | Joseph Harris,28,Male,53000 21 | Linda Martin,39,Female,71000 22 | -------------------------------------------------------------------------------- /data/products.csv: -------------------------------------------------------------------------------- 1 | id,name,category,quantity,price 2 | 1,iPhone 12,Electronics,10,899.99 3 | 2,Nike Air Max 90,Clothing,25,119.99 4 | 3,KitchenAid Stand Mixer,Home Appliances,5,299.99 5 | 4,The Great Gatsby,Books,50,12.99 6 | 5,L'Oreal Paris Mascara,Beauty,100,9.99 7 | 6,Yoga Mat,Sports,30,29.99 8 | 7,Samsung 4K Smart TV,Electronics,8,799.99 9 | 8,Levi's Jeans,Clothing,15,49.99 10 | 9,Dyson Vacuum Cleaner,Home Appliances,3,399.99 11 | 10,Harry Potter Series,Books,20,15.99 12 | 11,MAC Lipstick,Beauty,75,16.99 13 | 12,Adidas Running Shoes,Sports,22,59.99 14 | 13,PlayStation 5,Electronics,12,499.99 15 | 14,Hooded Sweatshirt,Clothing,10,34.99 16 | 15,Coffee Maker,Home Appliances,7,89.99 17 | 16,To Kill a Mockingbird,Books,15,9.99 18 | 17,Skincare Set,Beauty,50,49.99 19 | 18,Yoga Ball,Sports,18,19.99 20 | 19,Sony Noise-Canceling Headphones,Electronics,6,299.99 21 | 20,Puma T-shirt,Clothing,40,19.99 22 | -------------------------------------------------------------------------------- /data/products.parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /data/products.parquet/.part-00000-1677f901-2d4d-4de5-b1e2-436dae4a802e-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/data/products.parquet/.part-00000-1677f901-2d4d-4de5-b1e2-436dae4a802e-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /data/products.parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/data/products.parquet/_SUCCESS -------------------------------------------------------------------------------- /data/products.parquet/part-00000-1677f901-2d4d-4de5-b1e2-436dae4a802e-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/data/products.parquet/part-00000-1677f901-2d4d-4de5-b1e2-436dae4a802e-c000.snappy.parquet 
-------------------------------------------------------------------------------- /data/products_multiline.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": 1, 4 | "name": "iPhone 12", 5 | "category": "Electronics", 6 | "quantity": 10, 7 | "price": 899.99 8 | }, 9 | { 10 | "id": 2, 11 | "name": "Nike Air Max 90", 12 | "category": "Clothing", 13 | "quantity": 25, 14 | "price": 119.99 15 | }, 16 | { 17 | "id": 3, 18 | "name": "KitchenAid Stand Mixer", 19 | "category": "Home Appliances", 20 | "quantity": 5, 21 | "price": 299.99 22 | }, 23 | { 24 | "id": 4, 25 | "name": "The Great Gatsby", 26 | "category": "Books", 27 | "quantity": 50, 28 | "price": 12.99 29 | }, 30 | { 31 | "id": 5, 32 | "name": "L'Oreal Paris Mascara", 33 | "category": "Beauty", 34 | "quantity": 100, 35 | "price": 9.99 36 | }, 37 | { 38 | "id": 6, 39 | "name": "Yoga Mat", 40 | "category": "Sports", 41 | "quantity": 30, 42 | "price": 29.99 43 | }, 44 | { 45 | "id": 7, 46 | "name": "Samsung 4K Smart TV", 47 | "category": "Electronics", 48 | "quantity": 8, 49 | "price": 799.99 50 | }, 51 | { 52 | "id": 8, 53 | "name": "Levi's Jeans", 54 | "category": "Clothing", 55 | "quantity": 15, 56 | "price": 49.99 57 | }, 58 | { 59 | "id": 9, 60 | "name": "Dyson Vacuum Cleaner", 61 | "category": "Home Appliances", 62 | "quantity": 3, 63 | "price": 399.99 64 | }, 65 | { 66 | "id": 10, 67 | "name": "Harry Potter Series", 68 | "category": "Books", 69 | "quantity": 20, 70 | "price": 15.99 71 | }, 72 | { 73 | "id": 11, 74 | "name": "MAC Lipstick", 75 | "category": "Beauty", 76 | "quantity": 75, 77 | "price": 16.99 78 | }, 79 | { 80 | "id": 12, 81 | "name": "Adidas Running Shoes", 82 | "category": "Sports", 83 | "quantity": 22, 84 | "price": 59.99 85 | }, 86 | { 87 | "id": 13, 88 | "name": "PlayStation 5", 89 | "category": "Electronics", 90 | "quantity": 12, 91 | "price": 499.99 92 | }, 93 | { 94 | "id": 14, 95 | "name": "Hooded Sweatshirt", 96 | "category": "Clothing", 97 | "quantity": 10, 98 | "price": 34.99 99 | }, 100 | { 101 | "id": 15, 102 | "name": "Coffee Maker", 103 | "category": "Home Appliances", 104 | "quantity": 7, 105 | "price": 89.99 106 | }, 107 | { 108 | "id": 16, 109 | "name": "To Kill a Mockingbird", 110 | "category": "Books", 111 | "quantity": 15, 112 | "price": 9.99 113 | }, 114 | { 115 | "id": 17, 116 | "name": "Skincare Set", 117 | "category": "Beauty", 118 | "quantity": 50, 119 | "price": 49.99 120 | }, 121 | { 122 | "id": 18, 123 | "name": "Yoga Ball", 124 | "category": "Sports", 125 | "quantity": 18, 126 | "price": 19.99 127 | }, 128 | { 129 | "id": 19, 130 | "name": "Sony Noise-Canceling Headphones", 131 | "category": "Electronics", 132 | "quantity": 6, 133 | "price": 299.99 134 | }, 135 | { 136 | "id": 20, 137 | "name": "Puma T-shirt", 138 | "category": "Clothing", 139 | "quantity": 40, 140 | "price": 19.99 141 | } 142 | ] 143 | -------------------------------------------------------------------------------- /data/products_singleline.json: -------------------------------------------------------------------------------- 1 | {"id":1,"name":"iPhone 12","category":"Electronics","quantity":10,"price":899.99} 2 | {"id":2,"name":"Nike Air Max 90","category":"Clothing","quantity":25,"price":119.99} 3 | {"id":3,"name":"KitchenAid Stand Mixer","category":"Home Appliances","quantity":5,"price":299.99} 4 | {"id":4,"name":"The Great Gatsby","category":"Books","quantity":50,"price":12.99} 5 | {"id":5,"name":"L'Oreal Paris 
Mascara","category":"Beauty","quantity":100,"price":9.99} 6 | {"id":6,"name":"Yoga Mat","category":"Sports","quantity":30,"price":29.99} 7 | {"id":7,"name":"Samsung 4K Smart TV","category":"Electronics","quantity":8,"price":799.99} 8 | {"id":8,"name":"Levi's Jeans","category":"Clothing","quantity":15,"price":49.99} 9 | {"id":9,"name":"Dyson Vacuum Cleaner","category":"Home Appliances","quantity":3,"price":399.99} 10 | {"id":10,"name":"Harry Potter Series","category":"Books","quantity":20,"price":15.99} 11 | {"id":11,"name":"MAC Lipstick","category":"Beauty","quantity":75,"price":16.99} 12 | {"id":12,"name":"Adidas Running Shoes","category":"Sports","quantity":22,"price":59.99} 13 | {"id":13,"name":"PlayStation 5","category":"Electronics","quantity":12,"price":499.99} 14 | {"id":14,"name":"Hooded Sweatshirt","category":"Clothing","quantity":10,"price":34.99} 15 | {"id":15,"name":"Coffee Maker","category":"Home Appliances","quantity":7,"price":89.99} 16 | {"id":16,"name":"To Kill a Mockingbird","category":"Books","quantity":15,"price":9.99} 17 | {"id":17,"name":"Skincare Set","category":"Beauty","quantity":50,"price":49.99} 18 | {"id":18,"name":"Yoga Ball","category":"Sports","quantity":18,"price":19.99} 19 | {"id":19,"name":"Sony Noise-Canceling Headphones","category":"Electronics","quantity":6,"price":299.99} 20 | {"id":20,"name":"Puma T-shirt","category":"Clothing","quantity":40,"price":19.99} -------------------------------------------------------------------------------- /data/stocks.txt: -------------------------------------------------------------------------------- 1 | id,name,category,quantity,price 2 | 1,iPhone,Electronics,10,899.99 3 | 2,Macbook,Electronics,5,1299.99 4 | 3,iPad,Electronics,15,499.99 5 | 4,Samsung TV,Electronics,8,799.99 6 | 5,LG TV,Electronics,10,699.99 7 | 6,Nike Shoes,Clothing,30,99.99 8 | 7,Adidas Shoes,Clothing,25,89.99 9 | 8,Sony Headphones,Electronics,12,149.99 10 | 9,Beats Headphones,Electronics,20,199.99 11 | 10,Dining Table,Furniture,10,249.99 12 | 11,Study Desk,Furniture,8,149.99 13 | 12,Apples,Food,100,0.5 14 | 13,Bananas,Food,150,0.25 15 | 14,Oranges,Food,120,0.75 16 | 15,Chicken Breast,Food,50,3.99 17 | 16,Salmon Fillet,Food,30,5.99 18 | 17,Leather Jacket,Clothing,15,199.99 19 | 18,Winter Coat,Clothing,10,149.99 20 | 19,Yoga Mat,Sports,20,19.99 21 | 20,Dumbbell Set,Sports,15,49.99 22 | 21,Coffee Table,Furniture,5,129.99 23 | 22,Office Chair,Furniture,8,79.99 24 | 23,Bookshelf,Furniture,10,99.99 25 | 24,Laptop Bag,Accessories,25,29.99 26 | 25,Backpack,Accessories,30,24.99 27 | 26,Camera,Electronics,10,599.99 28 | 27,Printer,Electronics,8,129.99 29 | 28,Jeans,Clothing,30,59.99 30 | 29,T-shirt,Clothing,50,14.99 31 | 30,Sneakers,Clothing,40,79.99 32 | -------------------------------------------------------------------------------- /output.txt/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00000.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00001.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00002.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/.part-00002.crc -------------------------------------------------------------------------------- /output.txt/.part-00003.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00004.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00005.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/.part-00005.crc -------------------------------------------------------------------------------- /output.txt/.part-00006.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00007.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00008.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/.part-00008.crc -------------------------------------------------------------------------------- /output.txt/.part-00009.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00010.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /output.txt/.part-00011.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/.part-00011.crc -------------------------------------------------------------------------------- /output.txt/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/_SUCCESS -------------------------------------------------------------------------------- /output.txt/part-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00000 -------------------------------------------------------------------------------- /output.txt/part-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00001 -------------------------------------------------------------------------------- /output.txt/part-00002: -------------------------------------------------------------------------------- 1 | ('Alice', 25) 2 | -------------------------------------------------------------------------------- /output.txt/part-00003: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00003 -------------------------------------------------------------------------------- /output.txt/part-00004: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00004 -------------------------------------------------------------------------------- /output.txt/part-00005: -------------------------------------------------------------------------------- 1 | ('Bob', 30) 2 | -------------------------------------------------------------------------------- /output.txt/part-00006: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00006 -------------------------------------------------------------------------------- /output.txt/part-00007: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00007 -------------------------------------------------------------------------------- /output.txt/part-00008: -------------------------------------------------------------------------------- 1 | ('Charlie', 35) 2 | -------------------------------------------------------------------------------- /output.txt/part-00009: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00009 -------------------------------------------------------------------------------- /output.txt/part-00010: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder2j/pyspark-tutorial/860be30c06bbe1fb8f58aacfc118ab8757d06f88/output.txt/part-00010 -------------------------------------------------------------------------------- /output.txt/part-00011: -------------------------------------------------------------------------------- 1 | ('Alice', 40) 2 | --------------------------------------------------------------------------------