├── gitbook ├── src │ ├── cover.jpg │ ├── README.md │ ├── Chapter 1 - Basics │ │ ├── Section 1 - Useful Material.md │ │ ├── Section 2 - Creating your First Data Object.md │ │ ├── Section 4 - More Comfortable with SQL.md │ │ └── Section 3 - Reading your First Dataset.md │ ├── Chapter 2 - Exploring the Spark APIs │ │ ├── Section 1.1 - Struct Types.md │ │ ├── Section 2 - Performing your First Transformations.md │ │ ├── Section 2.5 - Casting Columns to Different Type.md │ │ ├── Section 1.2 - Arrays and Lists.md │ │ ├── Section 2.6 - Filtering Data.md │ │ ├── Section 2.8 - Case Statements.md │ │ ├── Section 1.3 - Maps and Dictionaries.md │ │ ├── Section 2.2 - Selecting a Subset of Columns.md │ │ ├── Section 2.3 - Creating New Columns and Transforming Data.md │ │ ├── Section 2.1 - Looking at Your Data.md │ │ ├── Section 2.4 - Constant Values and Column Expressions.md │ │ ├── Section 3.2 - Range Join Conditions .md │ │ ├── Section 2.12 - Performing Joins .md │ │ └── Section 2.7 - Equality Statements in Spark and Comparison with Nulls.md │ ├── SUMMARY.md │ ├── Chapter 3 - Aggregates │ │ ├── Section 1 - Clean Aggregations.md │ │ └── Section 2 - Non Deterministic Ordering for GroupBys.md │ ├── Chapter 4 - Window Objects │ │ ├── Section 2 - Ordering High Frequency Data with a Window Object.md │ │ └── Section 1 - Default Behaviour of a Window Object.md │ ├── Chapter 6 - Tuning & Spark Parameters │ │ └── Section 1.1 - Understanding how Spark Works.md │ └── Chapter 7 - High Performance Code │ │ └── Section 1.4 - Joins on Skewed Data .md ├── .bookignore ├── book.json ├── Vagrantfile ├── Dockerfile ├── package.json ├── Makefile ├── convert-ipynb2markdown.py └── gitbook-auto-summary.py ├── src ├── images │ ├── key-terms.png │ ├── mapreduce.png │ ├── master-slave.png │ └── ieee-floating-point-representation.png ├── data │ └── pets.csv ├── Random │ └── understanding-error-logs.ipynb ├── Chapter 1 - Basics │ ├── Section 1 - Useful Material.md │ ├── Section 2 - Creating your First Data Object.ipynb │ ├── Section 4 - More Comfortable with SQL?.ipynb │ └── Section 3 - Reading your First Dataset.ipynb ├── Chapter 2 - Exploring the Spark APIs │ ├── Section 1.1 - Struct Types.ipynb │ ├── Section 1.2 - Arrays and Lists.ipynb │ ├── Section 1.3 - Maps and Dictionaries.ipynb │ ├── Section 2.5 - Casting Columns to Different Type.ipynb │ └── Section 2 - Performing your First Transformations.ipynb └── Chapter 6 - Tuning & Spark Parameters │ └── Section 1.1 - Understanding how Spark Works.md ├── .github └── ISSUE_TEMPLATE │ └── feature-topic-request.md └── .gitignore /gitbook/src/cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericxiao251/spark-syntax/HEAD/gitbook/src/cover.jpg -------------------------------------------------------------------------------- /src/images/key-terms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericxiao251/spark-syntax/HEAD/src/images/key-terms.png -------------------------------------------------------------------------------- /src/images/mapreduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericxiao251/spark-syntax/HEAD/src/images/mapreduce.png -------------------------------------------------------------------------------- /src/images/master-slave.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ericxiao251/spark-syntax/HEAD/src/images/master-slave.png -------------------------------------------------------------------------------- /src/images/ieee-floating-point-representation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ericxiao251/spark-syntax/HEAD/src/images/ieee-floating-point-representation.png -------------------------------------------------------------------------------- /gitbook/.bookignore: -------------------------------------------------------------------------------- 1 | .bookignore 2 | .gitignore 3 | *.py 4 | .vagrant/ 5 | Makefile 6 | Vagrantfile 7 | Dockerfile 8 | package.json 9 | node_modules/ 10 | tmp*/ 11 | -------------------------------------------------------------------------------- /src/data/pets.csv: -------------------------------------------------------------------------------- 1 | id,breed_id,nickname,birthday,age,color,weight 2 | 1,1,"King",2014-11-22 12:30:31,5,"brown",10.0 3 | 2,3,"Argus",2016-11-22 10:05:10,10,,5.5 4 | 3,1,"Chewie",2016-11-22 10:05:10,15,,12 5 | 3,2,"Maple",2018-11-22 10:05:10,17,"white",3.4 6 | 4,2,,2019-01-01 10:05:10,13,,10 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-topic-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature/Topic request 3 | about: Suggest a topic that you would like me to cover 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the topic:** 11 | 12 | 13 | **Describe the format of the answer/section:** 14 | 15 | 16 | **Additional Context/Information:** 17 | 18 | -------------------------------------------------------------------------------- /gitbook/book.json: -------------------------------------------------------------------------------- 1 | { 2 | "gitbook": "3.2.x", 3 | "root": "./src", 4 | "pdf": { 5 | "toc": true, 6 | "pageNumbers": true, 7 | "fontSize": 14, 8 | "paperSize": "a4", 9 | "margin": { 10 | "right": 62, 11 | "left": 62, 12 | "top": 56, 13 | "bottom": 56 14 | } 15 | }, 16 | "plugins": [ 17 | "-sharing", 18 | "katex@1.1.3", 19 | "edit-link@2.0.x", 20 | "github@2.0.x" 21 | ], 22 | "pluginsConfig": { 23 | "edit-link": { 24 | "base": "https://github.com/ericxiao251/spark-syntax/edit/master", 25 | "label": "Edit" 26 | }, 27 | "github": { 28 | "url": "https://github.com/ericxiao251/spark-syntax" 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /gitbook/Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | Vagrant.configure("2") do |config| 5 | 6 | # activate x11 for publishing pdf, epub and mobi ebooks 7 | config.ssh.forward_x11 = true 8 | config.ssh.forward_agent = true 9 | 10 | config.vm.box = "ubuntu/trusty64" 11 | 12 | config.vm.network "forwarded_port", guest: 4000, host: 4000 13 | 14 | config.vm.provider "virtualbox" do |vb| 15 | vb.customize ["modifyvm", :id, "--ioapic", "on"] 16 | vb.customize ["modifyvm", :id, "--memory", "1024"] 17 | vb.customize ["modifyvm", :id, "--cpus", "2"] 18 | end 19 | 20 | config.vm.provision "shell", inline: <<-SHELL 21 | sudo apt-get update 22 | sudo apt-get install -y git 23 | curl -sL https://deb.nodesource.com/setup_4.x | sudo -E bash - 24 | sudo apt-get install -y nodejs 25 | sudo -v && wget -nv -O- 
https://download.calibre-ebook.com/linux-installer.sh | sudo sh /dev/stdin 26 | cd /vagrant 27 | make install 28 | make build 29 | SHELL 30 | end 31 | -------------------------------------------------------------------------------- /gitbook/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | ENV LANG C.UTF-8 4 | 5 | ARG DEBIAN_FRONTEND=noninteractive 6 | 7 | RUN apt-get update && apt-get install -y --no-install-recommends \ 8 | apt-utils software-properties-common nodejs npm git make sudo 9 | 10 | RUN ln -s /usr/bin/nodejs /usr/bin/node 11 | 12 | # ignore gpg key exit status 13 | RUN add-apt-repository -y ppa:jonathonf/calibre; exit 0 14 | 15 | # install calibre v3.29 16 | RUN apt-get update && apt-get install -y calibre 17 | 18 | RUN npm install -g gitbook-cli@2.3.0 19 | 20 | # Replace 1000 with your user / group id 21 | RUN export uid=1000 gid=1000 && \ 22 | mkdir -p /app/gitbook && \ 23 | echo "docker:x:${uid}:${gid}:Docker,,,:/app:/bin/bash" >> /etc/passwd && \ 24 | echo "docker:x:${uid}:" >> /etc/group && \ 25 | echo "docker ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/docker && \ 26 | chmod 0440 /etc/sudoers.d/docker && \ 27 | chown ${uid}:${gid} -R /app 28 | 29 | USER docker 30 | 31 | RUN gitbook fetch 3.2.x 32 | 33 | WORKDIR /app/gitbook 34 | 35 | EXPOSE 4000 36 | -------------------------------------------------------------------------------- /gitbook/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "spark-syntax", 3 | "version": "0.2.0", 4 | "description": "spark-syntax", 5 | "main": "index.html", 6 | "dependencies": { 7 | "gitbook-cli": "2.3.0", 8 | "gitbook-plugin-edit-link": "^2.0.2", 9 | "gitbook-plugin-github": "^2.0.0", 10 | "gitbook-plugin-katex": "^1.1.3" 11 | }, 12 | "devDependencies": {}, 13 | "scripts": { 14 | "gitbook:prepare": "gitbook install", 15 | "gitbook:watch": "npm run gitbook:prepare && gitbook serve", 16 | "gitbook:build": "npm run gitbook:prepare && gitbook build", 17 | "gitbook:pdf": "gitbook pdf . ./_book/spark-syntax.pdf", 18 | "gitbook:epub": "gitbook epub . ./_book/spark-syntax.epub", 19 | "gitbook:mobi": "gitbook mobi . ./_book/spark-syntax.mobi" 20 | }, 21 | "repository": { 22 | "type": "git", 23 | "url": "git+https://github.com/ericxiao251/spark-syntax.git" 24 | }, 25 | "keywords": [ 26 | "golang" 27 | ], 28 | "author": "Jan Newmarch", 29 | "license": "CC-BY-NC-SA-3.0", 30 | "bugs": { 31 | "url": "https://github.com/ericxiao251/spark-syntax/issues" 32 | }, 33 | "homepage": "https://github.com/ericxiao251/spark-syntax#readme" 34 | } 35 | -------------------------------------------------------------------------------- /gitbook/src/README.md: -------------------------------------------------------------------------------- 1 | # Spark-Syntax 2 | 3 | This is a public repo documenting all of the "best practices" of writing PySpark code from what I have learnt from working with `PySpark` for 3 years. This will mainly focus on the `Spark DataFrames and SQL` library. 4 | 5 | # Contributing/Topic Requests 6 | 7 | If you notice an improvements in terms of typos, spellings, grammar, etc. feel free to create a PR and I'll review it 😁, you'll most likely be right. 8 | 9 | If you have any topics that I could potentially go over, please create an **issue** and describe the topic. You can create an issue [here](https://github.com/ericxiao251/spark-syntax/issues). I'll try my best to address it 😁. 
10 | 11 | # Acknowledgement 12 | 13 | If you found this book helpful, please give a star on the [github repo](https://github.com/ericxiao251/spark-syntax) to show some love! 14 | 15 | Huge thanks to Levon for turning everything into a gitbook. You can follow his github at https://github.com/tumregels. 16 | 17 | # Other Formats 18 | 19 | Ebook can be downloaded as: 20 | * [pdf](https://github.com/ericxiao251/spark-syntax/raw/gh-pages/spark-syntax.pdf) 21 | * [epub](https://github.com/ericxiao251/spark-syntax/raw/gh-pages/spark-syntax.epub) 22 | * [mobi](https://github.com/ericxiao251/spark-syntax/raw/gh-pages/spark-syntax.mobi) 23 | -------------------------------------------------------------------------------- /gitbook/Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/bash 2 | 3 | .PHONY: all install prepare build watch publish pdf epub mobi clean dockbuild dockrun 4 | 5 | all: install build 6 | 7 | install: # install gitbook-cli 8 | npm install 9 | 10 | prepare: 11 | npm run gitbook:prepare 12 | 13 | build: 14 | npm run gitbook:build 15 | 16 | watch: 17 | npm run gitbook:watch 18 | 19 | pdf: 20 | npm run gitbook:pdf 21 | 22 | epub: 23 | npm run gitbook:epub 24 | 25 | mobi: 26 | npm run gitbook:mobi 27 | 28 | publish: build pdf epub mobi 29 | cd _book && \ 30 | git config --global user.name "publisher" && \ 31 | git config --global user.email "publisher@git.hub" && \ 32 | git init && \ 33 | git commit --allow-empty -m 'update gh-pages' && \ 34 | git checkout -b gh-pages && \ 35 | git add . && \ 36 | git commit -am 'update gh-pages' && \ 37 | git push https://github.com/ericxiao251/spark-syntax gh-pages --force 38 | 39 | generate: 40 | python3 convert-ipynb2markdown.py && \ 41 | python3 gitbook-auto-summary.py && \ 42 | make publish 43 | 44 | clean: 45 | rm -rf _book 46 | rm -rf node_modules 47 | rm -rf tmp* 48 | 49 | # build docker image 50 | dockbuild: 51 | docker build -t gitbook . 52 | 53 | # use x11 for publishing pdf, epub and mobi ebooks, tested on ubuntu 16.04. \ 54 | `make dockrun` will create a container, build the gitbook and attach a terminal, \ 55 | to run other commands such as `make watch`, `make publish`. 56 | dockrun: 57 | docker run -ti --rm -e DISPLAY=${DISPLAY} \ 58 | -v /tmp/.X11-unix:/tmp/.X11-unix \ 59 | -v ${HOME}/.Xauthority:/root/.Xauthority \ 60 | -v ${shell pwd}:/app/gitbook \ 61 | --net=host gitbook /bin/bash -c "make build && bash" 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore spark stuff 2 | metastore_db/* 3 | .DS_Store 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | .venv 91 | venv/ 92 | ENV/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | .vagrant/ 108 | _book/ 109 | node_modules/ 110 | tmp*/ -------------------------------------------------------------------------------- /gitbook/convert-ipynb2markdown.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import os.path as op 4 | import sys 5 | from shutil import copyfile 6 | 7 | try: 8 | assert sys.version_info.major == 3 9 | assert sys.version_info.minor > 5 10 | except AssertionError: 11 | raise RuntimeError('converter requires Python 3.6+!') 12 | 13 | basedir = op.abspath(op.dirname(__file__)) 14 | 15 | markdown_dir = op.join(basedir, 'src') 16 | ipynb_dir = op.abspath(op.join(basedir, os.pardir, 'src')) 17 | 18 | os.system(f'rm -rf {markdown_dir}/*/') # delete chapter folders only 19 | 20 | # convert ipynb to md 21 | files_ipynb = glob.glob(f'{ipynb_dir}/**/*.ipynb', recursive=True) 22 | for file_ipynb in files_ipynb: 23 | file_ipynb = op.abspath(file_ipynb) 24 | if 'Random' in file_ipynb: 25 | continue 26 | 27 | file_md = file_ipynb.replace('src', 'gitbook/src') \ 28 | .replace('.ipynb', '.md') \ 29 | .replace('(', '<').replace(')', '>').replace('?', '') 30 | os.makedirs(op.dirname(file_md), exist_ok=True) 31 | cmd = f'jupyter nbconvert --to markdown "{file_ipynb}" --output "{file_md}"' 32 | os.system(cmd) 33 | 34 | # copy md to md 35 | files_md = glob.glob(f'{ipynb_dir}/**/*.md', recursive=True) 36 | for file_md in files_md: 37 | file_md = op.abspath(file_md) 38 | cp_file_md = file_md.replace('src', 'gitbook/src') 39 | os.makedirs(op.dirname(cp_file_md), exist_ok=True) 40 | copyfile(file_md, cp_file_md) 41 | 42 | style = """\ 43 | 56 | """ 57 | 58 | # cleanup 59 | files = glob.glob(f'{markdown_dir}/**/*.md', recursive=True) 60 | for file in files: 61 | with open(file, 'r') as f: 62 | content = f.read() 63 | content_new = content.replace(style, '') 64 | with open(file, 'w') as f: 65 | f.write(content_new) 66 | -------------------------------------------------------------------------------- /src/Random/understanding-error-logs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Error Log #1\n", 8 | "```\n", 9 | "Stdoutput Caused by: \n", 10 | "org.apache.spark.SparkException: Job aborted due to stage failure: ShuffleMapStage 32 (parquet at NativeMethodAccessorImpl.java:0) has failed the maximum allowable 
number of times: 4. \n", 11 | "...\n", 12 | "Most recent failure reason: org.apache.spark.shuffle.FetchFailedException: Too large frame: 4410995563\n", 13 | "```\n", 14 | "### What does this mean?\n", 15 | "* When performing a `join` on multiple `DataFrame`s, data is usually shuffled in smaller chunks called `partitions`. \n", 16 | "* This errors indicates that the **partitions shuffled are too large**.\n", 17 | "\n", 18 | "### Solution\n", 19 | "* The default partition size is `200`. \n", 20 | "* Try to playing around with the `spark.sql.shuffle.partitions` parameter, use values that are a power of 2, ie. 2^11 = `2048`.\n", 21 | "* Increasing the number of partitions will decrease the size of each partition." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Error Log #2\n", 29 | "```\n", 30 | "Stdoutput py4j.protocol.Py4JJavaError: An error occurred while calling o3092.freqItems.\n", 31 | "Stdoutput : org.apache.spark.SparkException: Job aborted due to stage failure: Total size of serialized results of 43334 tasks (5.4 GB) is bigger than spark.driver.maxResultSize (5.4 GB)\n", 32 | "```\n", 33 | "### What does this mean?\n", 34 | "* The amount of data that you are pulling back to the driver to is large!\n", 35 | "* This is the result of performing some sort of `collect` which brings all the data to one processor, the driver.\n", 36 | "\n", 37 | "### Solution\n", 38 | "* If this is really what you want, then increasing the driver's heap might help.\n", 39 | "* Alternately if this isn't what you want, try instead of a `collect` use a `head`, `take`, etc. This will only take a collect a couple of rows to the driver." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [] 48 | } 49 | ], 50 | "metadata": { 51 | "kernelspec": { 52 | "display_name": "Python 3", 53 | "language": "python", 54 | "name": "python3" 55 | }, 56 | "language_info": { 57 | "codemirror_mode": { 58 | "name": "ipython", 59 | "version": 2 60 | }, 61 | "file_extension": ".py", 62 | "mimetype": "text/x-python", 63 | "name": "python", 64 | "nbconvert_exporter": "python", 65 | "pygments_lexer": "ipython2", 66 | "version": "2.7.15" 67 | } 68 | }, 69 | "nbformat": 4, 70 | "nbformat_minor": 2 71 | } 72 | -------------------------------------------------------------------------------- /src/Chapter 1 - Basics/Section 1 - Useful Material.md: -------------------------------------------------------------------------------- 1 | ### Spark API Documents 2 | I always find myself referencing the `PySpark API` documentation and have it opened as a seperate browser at work. A majority of your Spark application will be written with the functions found in the document. 3 | 4 | It can be found with this link (I suggest you bookmark it 😀): 5 | 6 | [PySpark latest API docs](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame) 7 | 8 | ### Ask Google 9 | When in doubt ask Google, there are a lot of crowdsourced questions and answers on Stack Overflow. 10 | 11 | ### Companies that Contribute to Spark 12 | **Databricks** and **Cloudera** contribute heavily to Spark and they provide a lot of good blogs about writting performant Spark code. 13 | 14 | **Note:** The author of Spark, Matei Zaharia also cofounded Databricks the company. 
15 | 16 | ### Conference Talks 17 | There are a lot of Spark conferences throughout the year where speakers from the companies above or the big tech companies come speak about their advances and experiences with Spark at scale. I find these talks very insightful into writing "real big data" applications. These talks also cover a broader subject matter like how to manage a large spark cluster, etc. 18 | 19 | Example: [Apache Spark - Spark + AI Summit San Francisco 2018](https://www.youtube.com/watch?v=MKJq2CrzCno&list=PLTPXxbhUt-YXXCAsjdkts-r_cJnOV7AG2) 20 | 21 | ### Spark Books 22 | The O'reilly books on Spark is how I got into Spark. They are either written by some highly profiled people in the Spark community (Holden Karau) or the original members that created Spark back in the AmpLabs days (Matei Zaharia). 23 | 24 | The two that I would recommend are: 25 | * [Learning Spark: Lightning-Fast Big Data Analysis](https://www.amazon.ca/Learning-Spark-Lightning-Fast-Data-Analysis/dp/1449358624) 26 | * Back in the early days of Spark, this was the only book out there. I started off with this book. It gives a nice overview of everything in Spark. 27 | * It might be a bit outdated but none-the-less it will give you an appreciation for how far Spark has come. 28 | * This is written by Holden Karau and Matei Zahario most noticably. 29 | 30 | * [Spark: The Definitive Guide: Big Data Processing Made Simple](https://www.amazon.ca/Spark-Definitive-Guide-Processing-Simple/dp/1491912219/ref=pd_lpo_sbs_14_t_0?_encoding=UTF8&psc=1&refRID=KDD7QV3DP5X6RRM4HR14) 31 | * This book is more up-to-date as it talks more in-depth about `Spark SQL` and the `DataFrame`s API. 32 | * This book is written by Matei Zahario most noticably. 33 | 34 | ### Spark Release Docs 35 | Spark is an open-source project under Apache, and releases new features regularly. If you want to be up-to-date with the newest features I recommend following their releases/news: 36 | 37 | [Spark Releases](https://spark.apache.org/releases/) 38 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 1 - Basics/Section 1 - Useful Material.md: -------------------------------------------------------------------------------- 1 | ### Spark API Documents 2 | I always find myself referencing the `PySpark API` documentation and have it opened as a seperate browser at work. A majority of your Spark application will be written with the functions found in the document. 3 | 4 | It can be found with this link (I suggest you bookmark it 😀): 5 | 6 | [PySpark latest API docs](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame) 7 | 8 | ### Ask Google 9 | When in doubt ask Google, there are a lot of crowdsourced questions and answers on Stack Overflow. 10 | 11 | ### Companies that Contribute to Spark 12 | **Databricks** and **Cloudera** contribute heavily to Spark and they provide a lot of good blogs about writting performant Spark code. 13 | 14 | **Note:** The author of Spark, Matei Zaharia also cofounded Databricks the company. 15 | 16 | ### Conference Talks 17 | There are a lot of Spark conferences throughout the year where speakers from the companies above or the big tech companies come speak about their advances and experiences with Spark at scale. I find these talks very insightful into writing "real big data" applications. These talks also cover a broader subject matter like how to manage a large spark cluster, etc. 
18 | 19 | Example: [Apache Spark - Spark + AI Summit San Francisco 2018](https://www.youtube.com/watch?v=MKJq2CrzCno&list=PLTPXxbhUt-YXXCAsjdkts-r_cJnOV7AG2) 20 | 21 | ### Spark Books 22 | The O'reilly books on Spark is how I got into Spark. They are either written by some highly profiled people in the Spark community (Holden Karau) or the original members that created Spark back in the AmpLabs days (Matei Zaharia). 23 | 24 | The two that I would recommend are: 25 | * [Learning Spark: Lightning-Fast Big Data Analysis](https://www.amazon.ca/Learning-Spark-Lightning-Fast-Data-Analysis/dp/1449358624) 26 | * Back in the early days of Spark, this was the only book out there. I started off with this book. It gives a nice overview of everything in Spark. 27 | * It might be a bit outdated but none-the-less it will give you an appreciation for how far Spark has come. 28 | * This is written by Holden Karau and Matei Zahario most noticably. 29 | 30 | * [Spark: The Definitive Guide: Big Data Processing Made Simple](https://www.amazon.ca/Spark-Definitive-Guide-Processing-Simple/dp/1491912219/ref=pd_lpo_sbs_14_t_0?_encoding=UTF8&psc=1&refRID=KDD7QV3DP5X6RRM4HR14) 31 | * This book is more up-to-date as it talks more in-depth about `Spark SQL` and the `DataFrame`s API. 32 | * This book is written by Matei Zahario most noticably. 33 | 34 | ### Spark Release Docs 35 | Spark is an open-source project under Apache, and releases new features regularly. If you want to be up-to-date with the newest features I recommend following their releases/news: 36 | 37 | [Spark Releases](https://spark.apache.org/releases/) 38 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 1 - Basics/Section 2 - Creating your First Data Object.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | ``` 9 | 10 | ### Template 11 | 12 | 13 | ```python 14 | spark = ( 15 | SparkSession.builder 16 | .master("local") 17 | .appName("Exploring Joins") 18 | .config("spark.some.config.option", "some-value") 19 | .getOrCreate() 20 | ) 21 | 22 | sc = spark.sparkContext 23 | ``` 24 | 25 | ### Create a DataFrame 26 | 27 | 28 | ```python 29 | schema = T.StructType([ 30 | T.StructField("pet_id", T.IntegerType(), False), 31 | T.StructField("name", T.StringType(), True), 32 | T.StructField("age", T.IntegerType(), True), 33 | ]) 34 | 35 | data = [ 36 | (1, "Bear", 13), 37 | (2, "Chewie", 12), 38 | (2, "Roger", 1), 39 | ] 40 | 41 | pet_df = spark.createDataFrame( 42 | data=data, 43 | schema=schema 44 | ) 45 | 46 | pet_df.toPandas() 47 | ``` 48 | 49 | 50 | 51 | 52 |
53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 |
|   | pet_id | name   | age |
|---|--------|--------|-----|
| 0 | 1      | Bear   | 13  |
| 1 | 2      | Chewie | 12  |
| 2 | 2      | Roger  | 1   |
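If you want to double-check the schema that got attached, a quick sanity check (a small sketch, assuming the `pet_df` created above) is:

```python
# Print the column names, types and nullability that were declared in the schema.
pet_df.printSchema()

# Or get them back as a list of (column, type) tuples.
pet_df.dtypes
```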
84 | 85 | 86 | 87 | ### Background 88 | There are 3 datatypes in spark `RDD`, `DataFrame` and `Dataset`. As mentioned before, we will focus on the `DataFrame` datatype. 89 | 90 | * This is most performant and commonly used datatype. 91 | * `RDD`s are a thing of the past and you should refrain from using them unless you can't do the transformation in `DataFrame`s. 92 | * `Dataset`s are a thing in `Spark scala`. 93 | 94 | If you have used a `DataFrame` in Pandas, this is the same thing. If you haven't, a dataframe is similar to a `csv` or `excel` file. There are columns and rows that you can perform transformations on. You can search online for better descriptions of what a `DataFrame` is. 95 | 96 | ### What Happened? 97 | For any `DataFrame (df)` that you work with in Spark you should provide it with 2 things: 98 | 1. a `schema` for the data. Providing a `schema` explicitly makes it clearer to the reader and sometimes even more performant, if we can know that a column is `nullable`. This means providing 3 things: 99 | * the `name` of the column 100 | * the `datatype` of the column 101 | * the `nullability` of the column 102 | 2. the data. Normally you would read data stored in `gcs`, `aws` etc and store it in a `df`, but there will be the off-times that you will need to create one. 103 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 1.1 - Struct Types.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | 9 | from pyspark.sql import functions as F 10 | 11 | from datetime import datetime 12 | from decimal import Decimal 13 | ``` 14 | 15 | ### Template 16 | 17 | 18 | ```python 19 | spark = ( 20 | SparkSession.builder 21 | .master("local") 22 | .appName("Section 1.1 - Struct Types") 23 | .config("spark.some.config.option", "some-value") 24 | .getOrCreate() 25 | ) 26 | 27 | sc = spark.sparkContext 28 | 29 | import os 30 | 31 | data_path = "/data/pets.csv" 32 | base_path = os.path.dirname(os.getcwd()) 33 | path = base_path + data_path 34 | ``` 35 | 36 | 37 | ```python 38 | pets = spark.read.csv(path, header=True) 39 | pets.show() 40 | ``` 41 | 42 | +---+--------+--------+-------------------+---+-----+------+ 43 | | id|breed_id|nickname| birthday|age|color|weight| 44 | +---+--------+--------+-------------------+---+-----+------+ 45 | | 1| 1| King|2014-11-22 12:30:31| 5|brown| 10.0| 46 | | 2| 3| Argus|2016-11-22 10:05:10| 10| null| 5.5| 47 | | 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12| 48 | | 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3.4| 49 | | 4| 2| null|2019-01-01 10:05:10| 13| null| 10| 50 | +---+--------+--------+-------------------+---+-----+------+ 51 | 52 | 53 | 54 | ### Struct Types 55 | 56 | What are they used for? 
TODO 57 | 58 | 59 | ```python 60 | ( 61 | pets 62 | .withColumn('struct_col', F.struct('nickname', 'birthday', 'age', 'color')) 63 | .withColumn('nickname_from_struct', F.col('struct_col').nickname) 64 | .show() 65 | ) 66 | ``` 67 | 68 | +---+--------+--------+-------------------+---+-----+------+--------------------+--------------------+ 69 | | id|breed_id|nickname| birthday|age|color|weight| struct_col|nickname_from_struct| 70 | +---+--------+--------+-------------------+---+-----+------+--------------------+--------------------+ 71 | | 1| 1| King|2014-11-22 12:30:31| 5|brown| 10.0|[King, 2014-11-22...| King| 72 | | 2| 3| Argus|2016-11-22 10:05:10| 10| null| 5.5|[Argus, 2016-11-2...| Argus| 73 | | 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12|[Chewie, 2016-11-...| Chewie| 74 | | 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3.4|[Maple, 2018-11-2...| Maple| 75 | | 4| 2| null|2019-01-01 10:05:10| 13| null| 10|[, 2019-01-01 10:...| null| 76 | +---+--------+--------+-------------------+---+-----+------+--------------------+--------------------+ 77 | 78 | 79 | 80 | **What Happened?** 81 | 82 | We created a `struct` type column consisting of the columns `'nickname', 'birthday', 'age', 'color'`. Then we accessed a member `nickname` from the struct. 83 | 84 | ### Summary 85 | 86 | * TODO: Fix a use-case. 87 | * It is pretty easy creating and accessing `struct` datatypes. 88 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 1 - Basics/Section 4 - More Comfortable with SQL.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | ``` 9 | 10 | ### Template 11 | 12 | 13 | ```python 14 | spark = ( 15 | SparkSession.builder 16 | .master("local") 17 | .appName("Section 4 - More Comfortable with SQL?") 18 | .config("spark.some.config.option", "some-value") 19 | .getOrCreate() 20 | ) 21 | 22 | sc = spark.sparkContext 23 | 24 | import os 25 | 26 | data_path = "/data/pets.csv" 27 | base_path = os.path.dirname(os.getcwd()) 28 | path = base_path + data_path 29 | 30 | df = spark.read.csv(path, header=True) 31 | df.toPandas() 32 | ``` 33 | 34 | 35 | 36 | 37 |
38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 |
|   | id | species_id | name  | birthday            | color |
|---|----|------------|-------|---------------------|-------|
| 0 | 1  | 1          | King  | 2014-11-22 12:30:31 | brown |
| 1 | 2  | 3          | Argus | 2016-11-22 10:05:10 | None  |
69 | 70 | 71 | 72 | ### Register DataFrame as a SQL Table 73 | 74 | 75 | ```python 76 | df.createOrReplaceTempView("pets") 77 | ``` 78 | 79 | ### What Happened? 80 | The first step in making a `df` queryable with `SQL` is to **register** the table as a sql table. 81 | 82 | This particular function will **replace** any previously registered **local** table named `pets` as a result. There are other functions that will register a dataframe with slightly different behavior. You can check the reference docs if this isn't the desired behavior: [docs](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.createGlobalTempView) 83 | 84 | ### Let Write a SQL Query! 85 | 86 | 87 | ```python 88 | df_2 = spark.sql(""" 89 | SELECT 90 | * 91 | FROM pets 92 | WHERE name = 'Argus' 93 | """) 94 | 95 | df_2.toPandas() 96 | ``` 97 | 98 | 99 | 100 | 101 |
102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 |
|   | id | species_id | name  | birthday            | color |
|---|----|------------|-------|---------------------|-------|
| 0 | 2  | 3          | Argus | 2016-11-22 10:05:10 | None  |
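Since `spark.sql` returns an ordinary `DataFrame`, you can keep mixing SQL with the DataFrame API. A small sketch against the same registered `pets` view (the aggregate query and the column name `num_pets` are just for illustration):

```python
# Aggregate with SQL, then keep working with the result like any other DataFrame.
counts = spark.sql("""
    SELECT
        species_id,
        count(*) AS num_pets
    FROM pets
    GROUP BY species_id
""")

counts.orderBy('num_pets', ascending=False).show()
```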
125 | 126 | 127 | 128 | ### What Happened? 129 | Once your `df` is registered, call the spark `sc` function on your `spark session` object. It takes a `sql string` as an input and outputs a new `df`. 130 | 131 | ### Conclusion? 132 | If you're more comfortable with writing `sql` than python/spark code, then you can do so with a spark `df`! We do this by: 133 | 1. Register the `df` with `df.createOrReplaceTempView('table')`. 134 | 2. Call the `sql` function on your `spark session` with a `sql string` as an input. 135 | 3. You're done! 136 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 2 - Performing your First Transformations.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | 9 | from pyspark.sql import functions as F 10 | 11 | from datetime import datetime 12 | from decimal import Decimal 13 | ``` 14 | 15 | ### Template 16 | 17 | 18 | ```python 19 | spark = ( 20 | SparkSession.builder 21 | .master("local") 22 | .appName("Section 2 - Performing your First Transformations") 23 | .config("spark.some.config.option", "some-value") 24 | .getOrCreate() 25 | ) 26 | 27 | sc = spark.sparkContext 28 | 29 | import os 30 | 31 | data_path = "/data/pets.csv" 32 | base_path = os.path.dirname(os.getcwd()) 33 | path = base_path + data_path 34 | ``` 35 | 36 | 37 | ```python 38 | pets = spark.read.csv(path, header=True) 39 | pets.toPandas() 40 | ``` 41 | 42 | 43 | 44 | 45 |
46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 |
|   | id | breed_id | nickname | birthday            | age | color |
|---|----|----------|----------|---------------------|-----|-------|
| 0 | 1  | 1        | King     | 2014-11-22 12:30:31 | 5   | brown |
| 1 | 2  | 3        | Argus    | 2016-11-22 10:05:10 | 10  | None  |
| 2 | 3  | 1        | Chewie   | 2016-11-22 10:05:10 | 15  | None  |
89 | 90 | 91 | 92 | ### Transformation 93 | 94 | 95 | ```python 96 | ( 97 | pets 98 | .withColumn('birthday_date', F.col('birthday').cast('date')) 99 | .withColumn('owned_by', F.lit('me')) 100 | .withColumnRenamed('id', 'pet_id') 101 | .where(F.col('birthday_date') > datetime(2015,1,1)) 102 | ).toPandas() 103 | ``` 104 | 105 | 106 | 107 | 108 |
109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 |
|   | pet_id | breed_id | nickname | birthday            | age | color | birthday_date | owned_by |
|---|--------|----------|----------|---------------------|-----|-------|---------------|----------|
| 0 | 2      | 3        | Argus    | 2016-11-22 10:05:10 | 10  | None  | 2016-11-22    | me       |
| 1 | 3      | 1        | Chewie   | 2016-11-22 10:05:10 | 15  | None  | 2016-11-22    | me       |
149 | 150 | 151 | 152 | **What Happened?** 153 | * We renamed the `primary key` of our `df` 154 | * We truncated the precision of our date types. 155 | * we filtered our dataset to a smaller subset. 156 | * We created a new column describing who own these pets. 157 | 158 | #### Summary 159 | We performed a variety of spark transformations to transform our data, we will go through these transformations in detailed in the following section. 160 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 2.5 - Casting Columns to Different Type.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | 9 | from pyspark.sql import functions as F 10 | 11 | from datetime import datetime 12 | from decimal import Decimal 13 | ``` 14 | 15 | ### Template 16 | 17 | 18 | ```python 19 | spark = ( 20 | SparkSession.builder 21 | .master("local") 22 | .appName("Section 2.5 - Casting Columns to Different Type") 23 | .config("spark.some.config.option", "some-value") 24 | .getOrCreate() 25 | ) 26 | 27 | sc = spark.sparkContext 28 | 29 | import os 30 | 31 | data_path = "/data/pets.csv" 32 | base_path = os.path.dirname(os.getcwd()) 33 | path = base_path + data_path 34 | ``` 35 | 36 | 37 | ```python 38 | pets = spark.read.csv(path, header=True) 39 | pets.toPandas() 40 | ``` 41 | 42 | 43 | 44 | 45 |
46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 |
|   | id | breed_id | nickname | birthday            | age | color |
|---|----|----------|----------|---------------------|-----|-------|
| 0 | 1  | 1        | King     | 2014-11-22 12:30:31 | 5   | brown |
| 1 | 2  | 3        | Argus    | 2016-11-22 10:05:10 | 10  | None  |
| 2 | 3  | 1        | Chewie   | 2016-11-22 10:05:10 | 15  | None  |
89 | 90 | 91 | 92 | ### Casting Columns in Different Types 93 | 94 | Sometimes your data can be read in as all `unicode`/`string` in which you will need to cast them to the correct type. Or Simply you want to change the type of a column as a part of your transformation. 95 | 96 | ### Option 1 - `cast()` 97 | 98 | 99 | ```python 100 | ( 101 | pets 102 | .select('birthday') 103 | .withColumn('birthday_date', F.col('birthday').cast('date')) 104 | .withColumn('birthday_date_2', F.col('birthday').cast(T.DateType())) 105 | .toPandas() 106 | ) 107 | ``` 108 | 109 | 110 | 111 | 112 |
113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 |
|   | birthday            | birthday_date | birthday_date_2 |
|---|---------------------|---------------|-----------------|
| 0 | 2014-11-22 12:30:31 | 2014-11-22    | 2014-11-22      |
| 1 | 2016-11-22 10:05:10 | 2016-11-22    | 2016-11-22      |
| 2 | 2016-11-22 10:05:10 | 2016-11-22    | 2016-11-22      |
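The same two styles work for any of the Spark types. A quick additional sketch on the same `pets` DataFrame (the new column names here are just for illustration):

```python
(
    pets
    # string shorthand
    .withColumn('age_int', F.col('age').cast('int'))
    # explicit Spark type object
    .withColumn('breed_id_int', F.col('breed_id').cast(T.IntegerType()))
    .select('age', 'age_int', 'breed_id', 'breed_id_int')
    .show()
)
```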
144 | 145 | 146 | 147 | **What Happened?** 148 | 149 | There are 2 ways that you can `cast` a column. 150 | 1. Use a string (`cast('date')`). 151 | 2. Use the spark types (`cast(T.DateType())`). 152 | 153 | I tend to use a string as it's shorter, one less import and in more editors there will be syntax highlighting for the string. 154 | 155 | ### Summary 156 | 157 | * We learnt about two ways of casting a column. 158 | * The first way is a bit more cleaner IMO. 159 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 1.2 - Arrays and Lists.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | 9 | from pyspark.sql import functions as F 10 | 11 | from datetime import datetime 12 | from decimal import Decimal 13 | ``` 14 | 15 | ### Template 16 | 17 | 18 | ```python 19 | spark = ( 20 | SparkSession.builder 21 | .master("local") 22 | .appName("Section 2.1.2 - Arrays and Lists") 23 | .config("spark.some.config.option", "some-value") 24 | .getOrCreate() 25 | ) 26 | 27 | sc = spark.sparkContext 28 | 29 | def get_csv_schema(*args): 30 | return T.StructType([ 31 | T.StructField(*arg) 32 | for arg in args 33 | ]) 34 | 35 | def read_csv(fname, schema): 36 | return spark.read.csv( 37 | path=fname, 38 | header=True, 39 | schema=get_csv_schema(*schema) 40 | ) 41 | 42 | import os 43 | 44 | data_path = "/data/pets.csv" 45 | base_path = os.path.dirname(os.getcwd()) 46 | path = base_path + data_path 47 | ``` 48 | 49 | 50 | ```python 51 | pets = read_csv( 52 | fname=path, 53 | schema=[ 54 | ("id", T.LongType(), False), 55 | ("breed_id", T.LongType(), True), 56 | ("nickname", T.StringType(), True), 57 | ("birthday", T.TimestampType(), True), 58 | ("age", T.LongType(), True), 59 | ("color", T.StringType(), True), 60 | ("weight", T.DecimalType(), True), 61 | ] 62 | ) 63 | pets.show() 64 | ``` 65 | 66 | +---+--------+--------+-------------------+---+-----+------+ 67 | | id|breed_id|nickname| birthday|age|color|weight| 68 | +---+--------+--------+-------------------+---+-----+------+ 69 | | 1| 1| King|2014-11-22 12:30:31| 5|brown| 10| 70 | | 2| 3| Argus|2016-11-22 10:05:10| 10| null| 6| 71 | | 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12| 72 | | 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3| 73 | | 4| 2| null|2019-01-01 10:05:10| 13| null| 10| 74 | +---+--------+--------+-------------------+---+-----+------+ 75 | 76 | 77 | 78 | ### Arrays and Lists 79 | 80 | ### Case 1: Reading in Data that contains `Arrays` 81 | 82 | TODO 83 | 84 | ### Case 2: Creating Arrays 85 | 86 | 87 | ```python 88 | ( 89 | pets 90 | .withColumn('array column', F.array([ 91 | F.lit(1), 92 | F.lit("Bob"), 93 | F.lit(datetime(2019,2,1)), 94 | ])) 95 | .show() 96 | ) 97 | ``` 98 | 99 | +---+--------+--------+-------------------+---+-----+------+--------------------+ 100 | | id|breed_id|nickname| birthday|age|color|weight| array column| 101 | +---+--------+--------+-------------------+---+-----+------+--------------------+ 102 | | 1| 1| King|2014-11-22 12:30:31| 5|brown| 10|[1, Bob, 2019-02-...| 103 | | 2| 3| Argus|2016-11-22 10:05:10| 10| null| 6|[1, Bob, 2019-02-...| 104 | | 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12|[1, Bob, 2019-02-...| 105 | | 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3|[1, Bob, 2019-02-...| 106 | | 4| 2| null|2019-01-01 10:05:10| 13| null| 10|[1, Bob, 2019-02-...| 107 
| +---+--------+--------+-------------------+---+-----+------+--------------------+ 108 | 109 | 110 | 111 | **What Happened?** 112 | 113 | We will explain in the later chapter what the `F.lit()` function does, but for now understand that in order to create an array type you need to call the `F.array()` function and for each array element call `F.lit()` on. 114 | 115 | ### Summary 116 | 117 | * It's pretty simple to create an array in Spark, you will need to call 2 functions: `F.array()` and `F.lit()`. 118 | * Each element of the array needs to be of type `F.lit()` 119 | -------------------------------------------------------------------------------- /gitbook/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | - [README](./README.md) 4 | - Chapter 1 - Basics 5 | - [Section 1 - Useful Material](Chapter 1 - Basics/Section 1 - Useful Material.md) 6 | - [Section 2 - Creating your First Data Object](Chapter 1 - Basics/Section 2 - Creating your First Data Object.md) 7 | - [Section 3 - Reading your First Dataset](Chapter 1 - Basics/Section 3 - Reading your First Dataset.md) 8 | - [Section 4 - More Comfortable with SQL](Chapter 1 - Basics/Section 4 - More Comfortable with SQL.md) 9 | - Chapter 2 - Exploring the Spark APIs 10 | - [Section 1.1 - Struct Types](Chapter 2 - Exploring the Spark APIs/Section 1.1 - Struct Types.md) 11 | - [Section 1.2 - Arrays and Lists](Chapter 2 - Exploring the Spark APIs/Section 1.2 - Arrays and Lists.md) 12 | - [Section 1.3 - Maps and Dictionaries](Chapter 2 - Exploring the Spark APIs/Section 1.3 - Maps and Dictionaries.md) 13 | - [Section 1.4 - Decimals and Why did my Decimals Overflow](Chapter 2 - Exploring the Spark APIs/Section 1.4 - Decimals and Why did my Decimals Overflow.md) 14 | - [Section 2 - Performing your First Transformations](Chapter 2 - Exploring the Spark APIs/Section 2 - Performing your First Transformations.md) 15 | - [Section 2.1 - Looking at Your Data](Chapter 2 - Exploring the Spark APIs/Section 2.1 - Looking at Your Data.md) 16 | - [Section 2.2 - Selecting a Subset of Columns](Chapter 2 - Exploring the Spark APIs/Section 2.2 - Selecting a Subset of Columns.md) 17 | - [Section 2.3 - Creating New Columns and Transforming Data](Chapter 2 - Exploring the Spark APIs/Section 2.3 - Creating New Columns and Transforming Data.md) 18 | - [Section 2.4 - Constant Values and Column Expressions](Chapter 2 - Exploring the Spark APIs/Section 2.4 - Constant Values and Column Expressions.md) 19 | - [Section 2.5 - Casting Columns to Different Type](Chapter 2 - Exploring the Spark APIs/Section 2.5 - Casting Columns to Different Type.md) 20 | - [Section 2.6 - Filtering Data](Chapter 2 - Exploring the Spark APIs/Section 2.6 - Filtering Data.md) 21 | - [Section 2.7 - Equality Statements in Spark and Comparison with Nulls](Chapter 2 - Exploring the Spark APIs/Section 2.7 - Equality Statements in Spark and Comparison with Nulls.md) 22 | - [Section 2.8 - Case Statements](Chapter 2 - Exploring the Spark APIs/Section 2.8 - Case Statements.md) 23 | - [Section 2.9 - Filling in Null Values](Chapter 2 - Exploring the Spark APIs/Section 2.9 - Filling in Null Values.md) 24 | - [Section 2.10 - Spark Functions aren't Enough, I Need my Own!](Chapter 2 - Exploring the Spark APIs/Section 2.10 - Spark Functions aren't Enough, I Need my Own!.md) 25 | - [Section 2.11 - Unionizing Multiple Dataframes](Chapter 2 - Exploring the Spark APIs/Section 2.11 - Unionizing Multiple Dataframes.md) 26 | - [Section 2.12 - Performing 
Joins ](Chapter 2 - Exploring the Spark APIs/Section 2.12 - Performing Joins .md) 27 | - [Section 3.1 - One to Many Rows](Chapter 2 - Exploring the Spark APIs/Section 3.1 - One to Many Rows.md) 28 | - [Section 3.2 - Range Join Conditions ](Chapter 2 - Exploring the Spark APIs/Section 3.2 - Range Join Conditions .md) 29 | - Chapter 3 - Aggregates 30 | - [Section 1 - Clean Aggregations](Chapter 3 - Aggregates/Section 1 - Clean Aggregations.md) 31 | - [Section 2 - Non Deterministic Ordering for GroupBys](Chapter 3 - Aggregates/Section 2 - Non Deterministic Ordering for GroupBys.md) 32 | - Chapter 4 - Window Objects 33 | - [Section 1 - Default Behaviour of a Window Object](Chapter 4 - Window Objects/Section 1 - Default Behaviour of a Window Object.md) 34 | - [Section 2 - Ordering High Frequency Data with a Window Object](Chapter 4 - Window Objects/Section 2 - Ordering High Frequency Data with a Window Object.md) 35 | - Chapter 6 - Tuning & Spark Parameters 36 | - [Section 1.1 - Understanding how Spark Works](Chapter 6 - Tuning & Spark Parameters/Section 1.1 - Understanding how Spark Works.md) 37 | - Chapter 7 - High Performance Code 38 | - [Section 1.1 - Filter Pushdown](Chapter 7 - High Performance Code/Section 1.1 - Filter Pushdown.md) 39 | - [Section 1.2 - Joins on Skewed Data ](Chapter 7 - High Performance Code/Section 1.2 - Joins on Skewed Data .md) 40 | - [Section 1.3 - Joins on Skewed Data ](Chapter 7 - High Performance Code/Section 1.3 - Joins on Skewed Data .md) 41 | - [Section 1.4 - Joins on Skewed Data ](Chapter 7 - High Performance Code/Section 1.4 - Joins on Skewed Data .md) 42 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 1 - Basics/Section 3 - Reading your First Dataset.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | ``` 9 | 10 | The above also shows you the "best practices" for importing these components into your program. 11 | 12 | *some of the above imports will be explained later, just know this is how you should import these functions into your Spark application. 13 | 14 | These are the essential `imports` that you will need for any `PySpark` program. 15 | 16 | **`SparkSession`** 17 | The `SparkSession` is how you begin a Spark application. This is where you provide some configuration for your Spark program. 18 | 19 | **`pyspark.sql.functions`** 20 | You will find that all your data wrangling/analysis will mostly be done by chaining together multiple `functions`. If you find that you get your desired transformations with the base functions, you should: 21 | 1. Look through the API docs again. 22 | 2. Ask Google. 23 | 3. Write a `user defined function` (`udf`). 24 | 25 | **`pyspark.sql.types`** 26 | When working with spark, you will need to define the type of data for each column you are working with. 27 | 28 | The possible types that Spark accepts are listed here: [Spark types](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.types) 29 | 30 | ### Hello World 31 | 32 | 33 | ```python 34 | spark = ( 35 | SparkSession.builder 36 | .master("local") 37 | .appName("Section 3 - Reading your First Dataset") 38 | .config("spark.some.config.option", "some-value") 39 | .getOrCreate() 40 | ) 41 | 42 | sc = spark.sparkContext 43 | ``` 44 | 45 | Create a `SparkSession`. 
No need to create `SparkContext` as you automatically get it as part of the `SparkSession`. 46 | 47 | ### Read in Data (CSV) 48 | 49 | 50 | ```python 51 | # define the structure of your data inside the CSV file 52 | def get_csv_schema(*args): 53 | return T.StructType([ 54 | T.StructField(*arg) 55 | for arg in args 56 | ]) 57 | 58 | # read in your csv file with enforcing a schema 59 | def read_csv(fname, schema): 60 | return spark.read.csv( 61 | path=fname, 62 | header=True, 63 | schema=get_csv_schema(*schema) 64 | ) 65 | ``` 66 | 67 | 68 | ```python 69 | import os 70 | 71 | data_path = "/data" 72 | pets_path = "/pets.csv" 73 | base_path = os.path.dirname(os.getcwd()) 74 | 75 | path = base_path + data_path + pets_path 76 | df = read_csv( 77 | fname=path, 78 | schema=[ 79 | ("id", T.LongType(), False), 80 | ("breed_id", T.LongType(), True), 81 | ("name", T.StringType(), True), 82 | ("birthday", T.TimestampType(), True), 83 | ("color", T.StringType(), True) 84 | ] 85 | ) 86 | ``` 87 | 88 | 89 | ```python 90 | df.toPandas() 91 | ``` 92 | 93 | 94 | 95 | 96 |
97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 |
|   | id | species_id | name  | birthday            | color |
|---|----|------------|-------|---------------------|-------|
| 0 | 1  | 1          | King  | 2014-11-22 12:30:31 | brown |
| 1 | 2  | 3          | Argus | 2016-11-22 10:05:10 | None  |
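For contrast with the explicit schema used above, this is roughly what schema inference looks like (a sketch only; as discussed below, declaring the schema yourself is usually the better choice):

```python
# Let Spark guess the column types by scanning the file, then compare the schemas.
inferred_df = spark.read.csv(path, header=True, inferSchema=True)
inferred_df.printSchema()
```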
128 | 129 | 130 | 131 | ### What Happened? 132 | Here we read in a `csv` file and put it into a `DataFrame (DF)`: this is one of the three datasets that Spark allows you to use. The other two are `Resilient Distributed Dataset (RDD)` and `Dataset`. `DF`s have replaced `RDD`s as more features have been brought out in version `2.x` of Spark. You should be able to perform anything with `DataFrames` now, if not you will have to work with `RDD`s, which I will not cover. 133 | 134 | Spark gives you the option to automatically infer the schema and types of columns in your dataset. But you should always specify a `schema` for the data that you're reading in. For each column in the `csv` file we specified: 135 | * the `name` of the column 136 | * the `data type` of the column 137 | * if `null` values can appear in the column 138 | 139 | ### Conclusion 140 | Congratulations! You've read in your first dataset in Spark. Next we'll look at how you can perform transformations on this dataset :). 141 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 2.6 - Filtering Data.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | 9 | from pyspark.sql import functions as F 10 | 11 | from datetime import datetime 12 | from decimal import Decimal 13 | ``` 14 | 15 | ### Template 16 | 17 | 18 | ```python 19 | spark = ( 20 | SparkSession.builder 21 | .master("local") 22 | .appName("Section 2.6 - Filtering Data") 23 | .config("spark.some.config.option", "some-value") 24 | .getOrCreate() 25 | ) 26 | 27 | sc = spark.sparkContext 28 | 29 | import os 30 | 31 | data_path = "/data/pets.csv" 32 | base_path = os.path.dirname(os.getcwd()) 33 | path = base_path + data_path 34 | ``` 35 | 36 | 37 | ```python 38 | pets = spark.read.csv(path, header=True) 39 | pets.toPandas() 40 | ``` 41 | 42 | 43 | 44 | 45 |
46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 |
|   | id | breed_id | nickname | birthday            | age | color |
|---|----|----------|----------|---------------------|-----|-------|
| 0 | 1  | 1        | King     | 2014-11-22 12:30:31 | 5   | brown |
| 1 | 2  | 3        | Argus    | 2016-11-22 10:05:10 | 10  | None  |
| 2 | 3  | 1        | Chewie   | 2016-11-22 10:05:10 | 15  | None  |
89 | 90 | 91 | 92 | ### Filtering Data 93 | 94 | Again another commonly used function in data analysis, filtering out unwanted rows. 95 | 96 | ### Option 1 - `where()` 97 | 98 | 99 | ```python 100 | ( 101 | pets 102 | .where(F.col('breed_id') == 1) 103 | .filter(F.col('color') == 'brown') 104 | .toPandas() 105 | ) 106 | ``` 107 | 108 | 109 | 110 | 111 |
112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 |
|   | id | breed_id | nickname | birthday            | age | color |
|---|----|----------|----------|---------------------|-----|-------|
| 0 | 1  | 1        | King     | 2014-11-22 12:30:31 | 5   | brown |
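The two chained calls above can also be written as a single `where` with the conditions combined explicitly (combining conditions is covered in more detail later on); a small sketch:

```python
# One `where` with both conditions joined by `&` (note the parentheses).
(
    pets
    .where((F.col('breed_id') == 1) & (F.col('color') == 'brown'))
    .toPandas()
)
```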
137 | 138 | 139 | 140 | **What Happened?** 141 | 142 | Similar to the functions we have seen so far, there are multiple functioned that get `alias` to different names that perform the same transformation. IMO I perfor `where` as it's a bit more intuitive and closer to the `sql` syntax. 143 | 144 | **Note:** 145 | 146 | Notice how we don't have to wrap `1` or `brown` in a `F.lit()` function as these conditions are columnary expressions. 147 | 148 | We will look into how to perform more complex conditions in `2.1.7` that contain more than 1 condition. 149 | 150 | ### Option 2 - `isin()` 151 | 152 | 153 | ```python 154 | ( 155 | pets 156 | .where(F.col('nickname').isin('King', 'Argus')) 157 | .toPandas() 158 | ) 159 | ``` 160 | 161 | 162 | 163 | 164 |
165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 |
|   | id | breed_id | nickname | birthday            | age | color |
|---|----|----------|----------|---------------------|-----|-------|
| 0 | 1  | 1        | King     | 2014-11-22 12:30:31 | 5   | brown |
| 1 | 2  | 3        | Argus    | 2016-11-22 10:05:10 | 10  | None  |
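`isin()` also accepts a Python list, and the check can be negated with `~`. A small sketch on the same `pets` DataFrame:

```python
wanted = ['King', 'Argus']

# Same filter as above, but passing a list instead of comma separated values.
pets.where(F.col('nickname').isin(wanted)).show()

# Everything *except* those nicknames.
pets.where(~F.col('nickname').isin(wanted)).show()
```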
199 | 200 | 201 | 202 | **What Happened?** 203 | 204 | If you want to know if a column can be of many values then you can use the `isin()` function. This function takes in both a list of values of comma seperated values. This is again very similar to `sql` syntax. 205 | 206 | ### Summary 207 | 208 | * We learnt of two filter functions in Spark `where()` and `isin()`. 209 | * Using `isin` you can see if a column can contain multiple values. 210 | * These functions are named similarly to a `sql` language. 211 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 3 - Aggregates/Section 1 - Clean Aggregations.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from datetime import datetime 7 | 8 | from pyspark.sql import SparkSession 9 | from pyspark.sql import functions as F 10 | ``` 11 | 12 | ### Template 13 | 14 | 15 | ```python 16 | spark = ( 17 | SparkSession.builder 18 | .master("local") 19 | .appName("Exploring Joins") 20 | .config("spark.some.config.option", "some-value") 21 | .getOrCreate() 22 | ) 23 | 24 | sc = spark.sparkContext 25 | ``` 26 | 27 | ### Initial Datasets 28 | 29 | 30 | ```python 31 | pets = spark.createDataFrame( 32 | [ 33 | (1, 1, 'Bear', 5), 34 | (2, 1, 'Chewie', 10), 35 | (3, 2, 'Roger', 15), 36 | ], ['id', 'breed_id', 'nickname', 'age'] 37 | ) 38 | 39 | pets.toPandas() 40 | ``` 41 | 42 | 43 | 44 | 45 |
46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 |
idbreed_idnicknameage
011Bear5
121Chewie10
232Roger15
80 |
81 | 82 | 83 | 84 | 85 | ```python 86 | groupby_columns = ['breed_id'] 87 | ``` 88 | 89 | ### Option 1: Using a Dictionary 90 | 91 | 92 | ```python 93 | df_1 = ( 94 | pets 95 | .groupby(groupby_columns) 96 | .agg({ 97 | "*": "count", 98 | "age": "sum", 99 | }) 100 | ) 101 | 102 | df_1.toPandas() 103 | ``` 104 | 105 | 106 | 107 | 108 |
109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 |
breed_idcount(1)sum(age)
01215
12115
133 |
134 | 135 | 136 | 137 | **What Happened:** 138 | * Very similar to `pandas` `agg` function. 139 | * The resultant column names are a bit awkward to use after the fact. 140 | 141 | ### Option 2: Using List of Columns 142 | 143 | 144 | ```python 145 | df_2 = ( 146 | pets 147 | .groupby(groupby_columns) 148 | .agg( 149 | F.count("*"), 150 | F.sum("age"), 151 | ) 152 | ) 153 | 154 | df_2.toPandas() 155 | ``` 156 | 157 | 158 | 159 | 160 |
161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 |
breed_idcount(1)sum(age)
01215
12115
185 |
186 | 187 | 188 | 189 | **What Happened:** 190 | * Here we use the Spark `agg` functions. 191 | * Again, the resultant column names are a bit awkward to use after the fact. 192 | 193 | ### Option 3: Using List of Columns, with Aliases 194 | 195 | 196 | ```python 197 | df_3 = ( 198 | pets 199 | .groupby(groupby_columns) 200 | .agg( 201 | F.count("*").alias("count_of_breeds"), 202 | F.sum("age").alias("total_age_of_breeds"), 203 | ) 204 | ) 205 | 206 | df_3.toPandas() 207 | ``` 208 | 209 | 210 | 211 | 212 |
213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 |
breed_idcount_of_breedstotal_age_of_breeds
01215
12115
237 |
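As a quick hedged sketch of why the aliases matter downstream, the result above (`df_3`) can be filtered and sorted by referring to the new names directly:

```python
(
    df_3
    # the aliased column names read naturally in later transformations
    .where(F.col('count_of_breeds') > 1)
    .orderBy(F.col('total_age_of_breeds').desc())
    .toPandas()
)
```
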
238 | 239 | 240 | 241 | **What Happened:** 242 | * Here we use the Spark `agg` functions and `alias`ed the resultant columns to new names. 243 | * This provides cleaner column names that we can use later on. 244 | 245 | ### Summary 246 | 247 | **I encourage using option #3.** 248 | 249 | This creates more elegant and meaningful names for the new aggregate columns. 250 | 251 | A `withColumnRenamed` can be performed after the aggregates, but why not do it with an `alias`? It's easier as well. 252 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 2.8 - Case Statements.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | 9 | from pyspark.sql import functions as F 10 | 11 | from datetime import datetime 12 | from decimal import Decimal 13 | ``` 14 | 15 | ### Template 16 | 17 | 18 | ```python 19 | spark = ( 20 | SparkSession.builder 21 | .master("local") 22 | .appName("Section 2.8 - Case Statements") 23 | .config("spark.some.config.option", "some-value") 24 | .getOrCreate() 25 | ) 26 | 27 | sc = spark.sparkContext 28 | 29 | import os 30 | 31 | data_path = "/data/pets.csv" 32 | base_path = os.path.dirname(os.getcwd()) 33 | path = base_path + data_path 34 | ``` 35 | 36 | 37 | ```python 38 | pets = spark.read.csv(path, header=True) 39 | pets.toPandas() 40 | ``` 41 | 42 | 43 | 44 | 45 | <div>
46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 |
idbreed_idnicknamebirthdayagecolor
011King2014-11-22 12:30:315brown
123Argus2016-11-22 10:05:1010None
231Chewie2016-11-22 10:05:1015None
332Maple2018-11-22 10:05:1017white
442None2019-01-01 10:05:1013None
106 |
107 | 108 | 109 | 110 | ### Case Statements 111 | 112 | Case statements are usually used for performing stateful calculations. 113 | 114 | ie. 115 | - if `x` then `a` 116 | - if `y` then `b` 117 | - everything else `c` 118 | 119 | ### Using Switch/Case Statements in `Spark` 120 | 121 | 122 | ```python 123 | ( 124 | pets 125 | .withColumn( 126 | 'oldness_value', 127 | F.when(F.col('age') <= 5, 'young') 128 | .when((F.col('age') > 5) & (F.col('age') <= 10), 'middle age') 129 | .otherwise('old') 130 | ) 131 | .toPandas() 132 | ) 133 | ``` 134 | 135 | 136 | 137 | 138 |
139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 |
idbreed_idnicknamebirthdayagecoloroldness_value
011King2014-11-22 12:30:315brownyoung
123Argus2016-11-22 10:05:1010Nonemiddle age
231Chewie2016-11-22 10:05:1015Noneold
332Maple2018-11-22 10:05:1017whiteold
442None2019-01-01 10:05:1013Noneold
205 |
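If you are more comfortable with `sql`, the same mapping can also be written as a `CASE` expression inside `F.expr()`. This is a small sketch, equivalent in spirit to the `F.when()` chain above; since `age` was read from the csv as a string, it is cast before the comparisons:

```python
(
    pets
    .withColumn(
        'oldness_value',
        # SQL-style CASE statement evaluated by Spark SQL
        F.expr("""
            CASE
                WHEN CAST(age AS INT) <= 5  THEN 'young'
                WHEN CAST(age AS INT) <= 10 THEN 'middle age'
                ELSE 'old'
            END
        """)
    )
    .toPandas()
)
```
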
206 | 207 | 208 | 209 | **What Happened?** 210 | 211 | Based on the age of the pet, we classified if they are either `young`, `middle age` or `old`. **Please don't take offense, this is merely an example. ** 212 | 213 | We mapped the logic of: 214 | - If their age is younger than or equal to 5, then they are considered `young`. 215 | - If their age is greater than 5 but younger than or equal to 10 , then they are considered `middle age`. 216 | - Anyone older is considered `old`. 217 | 218 | ### Summary 219 | 220 | * We learned how to map values based on case statements and a deafult value if all conditions are not satified. 221 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 1.3 - Maps and Dictionaries.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | 9 | from pyspark.sql import functions as F 10 | 11 | from datetime import datetime 12 | from decimal import Decimal 13 | ``` 14 | 15 | ### Template 16 | 17 | 18 | ```python 19 | spark = ( 20 | SparkSession.builder 21 | .master("local") 22 | .appName("Section 1.3 - Maps and Dictionaries") 23 | .config("spark.some.config.option", "some-value") 24 | .getOrCreate() 25 | ) 26 | 27 | sc = spark.sparkContext 28 | 29 | def get_csv_schema(*args): 30 | return T.StructType([ 31 | T.StructField(*arg) 32 | for arg in args 33 | ]) 34 | 35 | def read_csv(fname, schema): 36 | return spark.read.csv( 37 | path=fname, 38 | header=True, 39 | schema=get_csv_schema(*schema) 40 | ) 41 | 42 | import os 43 | 44 | data_path = "/data/pets.csv" 45 | base_path = os.path.dirname(os.getcwd()) 46 | path = base_path + data_path 47 | ``` 48 | 49 | 50 | ```python 51 | pets = spark.read.csv(path, header=True) 52 | pets.show() 53 | ``` 54 | 55 | +---+--------+--------+-------------------+---+-----+------+ 56 | | id|breed_id|nickname| birthday|age|color|weight| 57 | +---+--------+--------+-------------------+---+-----+------+ 58 | | 1| 1| King|2014-11-22 12:30:31| 5|brown| 10.0| 59 | | 2| 3| Argus|2016-11-22 10:05:10| 10| null| 5.5| 60 | | 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12| 61 | | 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3.4| 62 | | 4| 2| null|2019-01-01 10:05:10| 13| null| 10| 63 | +---+--------+--------+-------------------+---+-----+------+ 64 | 65 | 66 | 67 | ### Maps and Dictionaries 68 | 69 | 70 | ### Case 1: Creating a Mapping from Existing Columns 71 | 72 | 73 | ```python 74 | ( 75 | pets 76 | .fillna({ 77 | 'nickname': 'Unknown Name', 78 | 'age': 'Unknown Age', 79 | }) 80 | .withColumn('{nickname:age}', F.create_map(F.col('nickname'), F.col('age'))) 81 | .withColumn('{nickname:age} 2', F.create_map('nickname', 'age')) 82 | .show() 83 | ) 84 | ``` 85 | 86 | +---+--------+------------+-------------------+---+-----+------+--------------------+--------------------+ 87 | | id|breed_id| nickname| birthday|age|color|weight| {nickname:age}| {nickname:age} 2| 88 | +---+--------+------------+-------------------+---+-----+------+--------------------+--------------------+ 89 | | 1| 1| King|2014-11-22 12:30:31| 5|brown| 10.0| [King -> 5]| [King -> 5]| 90 | | 2| 3| Argus|2016-11-22 10:05:10| 10| null| 5.5| [Argus -> 10]| [Argus -> 10]| 91 | | 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12| [Chewie -> 15]| [Chewie -> 15]| 92 | | 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3.4| [Maple -> 17]| [Maple -> 17]| 93 | | 4| 
2|Unknown Name|2019-01-01 10:05:10| 13| null| 10|[Unknown Name -> 13]|[Unknown Name -> 13]| 94 | +---+--------+------------+-------------------+---+-----+------+--------------------+--------------------+ 95 | 96 | 97 | 98 | **What Happened?** 99 | 100 | You can create a column of map types using either `columnary expressions` (we'll learn what column expressions are later) or column names. 101 | 102 | ### Case 2: Creating a Mapping from Constant Values 103 | 104 | 105 | ```python 106 | ( 107 | pets 108 | .fillna({ 109 | 'nickname': 'Unknown Name', 110 | 'age': 'Unknown Age', 111 | }) 112 | .withColumn('{nickname:age}', F.create_map(F.lit('key'), F.lit('value'))) 113 | .show() 114 | ) 115 | ``` 116 | 117 | +---+--------+------------+-------------------+---+-----+------+--------------+ 118 | | id|breed_id| nickname| birthday|age|color|weight|{nickname:age}| 119 | +---+--------+------------+-------------------+---+-----+------+--------------+ 120 | | 1| 1| King|2014-11-22 12:30:31| 5|brown| 10.0|[key -> value]| 121 | | 2| 3| Argus|2016-11-22 10:05:10| 10| null| 5.5|[key -> value]| 122 | | 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12|[key -> value]| 123 | | 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3.4|[key -> value]| 124 | | 4| 2|Unknown Name|2019-01-01 10:05:10| 13| null| 10|[key -> value]| 125 | +---+--------+------------+-------------------+---+-----+------+--------------+ 126 | 127 | 128 | 129 | **What Happened?** 130 | 131 | You can create a column of map types of literals using the `columnary expression` `F.lit()`, we will learn this later on. Notice that each key/value needs to be a `columnal expression`? This will be a common theme throughout Spark. 132 | 133 | ### Summary 134 | 135 | * It is very simple to create map data in Spark. 136 | * You can do so with both existing columns or constant values. 137 | * If constant values are used, then each value must be a `columnary expression`. 
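Another common pattern worth sketching is turning a plain Python dictionary into a map column and using it as a lookup table. The `breed_names` dictionary below is made up for illustration and is not part of `pets.csv`:

```python
from itertools import chain

# hypothetical lookup table: breed_id -> breed name
breed_names = {1: 'Corgi', 2: 'Husky', 3: 'Great Dane'}

# flatten the dict into alternating key/value literals for F.create_map()
breed_map = F.create_map(*chain.from_iterable(
    (F.lit(k), F.lit(v)) for k, v in breed_names.items()
))

(
    pets
    # breed_id was read as a string, so cast it to match the integer keys
    .withColumn('breed_name', breed_map[F.col('breed_id').cast('int')])
    .show()
)
```
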
138 | -------------------------------------------------------------------------------- /src/Chapter 2 - Exploring the Spark APIs/Section 1.1 - Struct Types.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Library Imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from pyspark.sql import SparkSession\n", 17 | "from pyspark.sql import types as T\n", 18 | "\n", 19 | "from pyspark.sql import functions as F\n", 20 | "\n", 21 | "from datetime import datetime\n", 22 | "from decimal import Decimal" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Template" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "spark = (\n", 39 | " SparkSession.builder\n", 40 | " .master(\"local\")\n", 41 | " .appName(\"Section 1.1 - Struct Types\")\n", 42 | " .config(\"spark.some.config.option\", \"some-value\")\n", 43 | " .getOrCreate()\n", 44 | ")\n", 45 | "\n", 46 | "sc = spark.sparkContext\n", 47 | "\n", 48 | "import os\n", 49 | "\n", 50 | "data_path = \"/data/pets.csv\"\n", 51 | "base_path = os.path.dirname(os.getcwd())\n", 52 | "path = base_path + data_path" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "+---+--------+--------+-------------------+---+-----+------+\n", 65 | "| id|breed_id|nickname| birthday|age|color|weight|\n", 66 | "+---+--------+--------+-------------------+---+-----+------+\n", 67 | "| 1| 1| King|2014-11-22 12:30:31| 5|brown| 10.0|\n", 68 | "| 2| 3| Argus|2016-11-22 10:05:10| 10| null| 5.5|\n", 69 | "| 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12|\n", 70 | "| 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3.4|\n", 71 | "| 4| 2| null|2019-01-01 10:05:10| 13| null| 10|\n", 72 | "+---+--------+--------+-------------------+---+-----+------+\n", 73 | "\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "pets = spark.read.csv(path, header=True)\n", 79 | "pets.show()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "### Struct Types\n", 87 | "\n", 88 | "What are they used for? 
TODO" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "+---+--------+--------+-------------------+---+-----+------+--------------------+--------------------+\n", 101 | "| id|breed_id|nickname| birthday|age|color|weight| struct_col|nickname_from_struct|\n", 102 | "+---+--------+--------+-------------------+---+-----+------+--------------------+--------------------+\n", 103 | "| 1| 1| King|2014-11-22 12:30:31| 5|brown| 10.0|[King, 2014-11-22...| King|\n", 104 | "| 2| 3| Argus|2016-11-22 10:05:10| 10| null| 5.5|[Argus, 2016-11-2...| Argus|\n", 105 | "| 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12|[Chewie, 2016-11-...| Chewie|\n", 106 | "| 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3.4|[Maple, 2018-11-2...| Maple|\n", 107 | "| 4| 2| null|2019-01-01 10:05:10| 13| null| 10|[, 2019-01-01 10:...| null|\n", 108 | "+---+--------+--------+-------------------+---+-----+------+--------------------+--------------------+\n", 109 | "\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "(\n", 115 | " pets\n", 116 | " .withColumn('struct_col', F.struct('nickname', 'birthday', 'age', 'color'))\n", 117 | " .withColumn('nickname_from_struct', F.col('struct_col').nickname)\n", 118 | " .show()\n", 119 | ")" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "**What Happened?**\n", 127 | "\n", 128 | "We created a `struct` type column consisting of the columns `'nickname', 'birthday', 'age', 'color'`. Then we accessed a member `nickname` from the struct." 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Summary\n", 136 | "\n", 137 | "* TODO: Fix a use-case.\n", 138 | "* It is pretty easy creating and accessing `struct` datatypes." 139 | ] 140 | } 141 | ], 142 | "metadata": { 143 | "kernelspec": { 144 | "display_name": "Python 2", 145 | "language": "python", 146 | "name": "python2" 147 | }, 148 | "language_info": { 149 | "codemirror_mode": { 150 | "name": "ipython", 151 | "version": 2 152 | }, 153 | "file_extension": ".py", 154 | "mimetype": "text/x-python", 155 | "name": "python", 156 | "nbconvert_exporter": "python", 157 | "pygments_lexer": "ipython2", 158 | "version": "2.7.15" 159 | } 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 2 163 | } 164 | -------------------------------------------------------------------------------- /gitbook/gitbook-auto-summary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Author Frank Hu 3 | # GitBook auto summary 4 | # summary all .md files in a GitBook folder 5 | 6 | import os 7 | import re 8 | 9 | 10 | def atoi(text): 11 | return int(text) if text.isdigit() else text 12 | 13 | 14 | def output_markdown(dire, base_dir, output_file, append, iter_depth=0): 15 | """Main iterator for get information from every file/folder 16 | 17 | i: directory, base directory(to calulate relative path), 18 | output file name, iter depth. 19 | p: Judge is directory or is file, then process .md/.markdown files. 20 | o: write .md information (with identation) to output_file. 
21 | """ 22 | 23 | dir_list = os.listdir(dire) 24 | 25 | # sort by numbers in directories 26 | dir_list.sort(key=lambda x: [atoi(c) for c in re.split('(\d+)', x)]) 27 | 28 | for filename in sort_dir_file(dir_list, base_dir): 29 | # add list and sort 30 | print('Processing ', filename) # output log 31 | file_or_path = os.path.join(dire, filename) 32 | if os.path.isdir(file_or_path): # is dir 33 | if mdfile_in_dir(file_or_path): 34 | # if there is .md files in the folder, output folder name 35 | output_file.write(' ' * iter_depth + '- ' + filename + '\n') 36 | output_markdown(file_or_path, base_dir, output_file, append, 37 | iter_depth + 1) # iteration 38 | else: # is file 39 | if is_markdown_file(filename): 40 | # re to find target markdown files, $ for matching end of filename 41 | if (filename not in ['SUMMARY.md', 42 | 'SUMMARY-GitBook-auto-summary.md'] 43 | or iter_depth != 0): # escape SUMMARY.md at base directory 44 | output_file.write(' ' * iter_depth + 45 | '- [{}]({})\n'.format(write_md_filename(filename, 46 | append), 47 | os.path.join(os.path.relpath(dire, base_dir), 48 | filename))) 49 | # iter depth for indent, relpath and join to write link. 50 | 51 | 52 | def mdfile_in_dir(dire): 53 | """Judge if there is .md file in the directory 54 | 55 | i: input directory 56 | o: return Ture if there is .md file; False if not. 57 | """ 58 | for root, dirs, files in os.walk(dire): 59 | for filename in files: 60 | if re.search('.md$|.markdown$', filename): 61 | return True 62 | return False 63 | 64 | 65 | def is_markdown_file(filename): 66 | """ Judge if the filename is a markdown filename 67 | 68 | i: filename 69 | o: filename without '.md' or '.markdown' 70 | """ 71 | match = re.search('.md$|.markdown$', filename) 72 | if not match: 73 | return False 74 | elif len(match.group()) is len('.md'): 75 | return filename[:-3] 76 | elif len(match.group()) is len('.markdown'): 77 | return filename[:-9] 78 | 79 | 80 | def sort_dir_file(listdir, dire): 81 | # sort dirs and files, first files a-z, then dirs a-z 82 | list_of_file = [] 83 | list_of_dir = [] 84 | for filename in listdir: 85 | if os.path.isdir(os.path.join(dire, filename)): 86 | list_of_dir.append(filename) 87 | else: 88 | list_of_file.append(filename) 89 | for dire in list_of_dir: 90 | list_of_file.append(dire) 91 | return list_of_file 92 | 93 | 94 | def write_md_filename(filename, append): 95 | """ write markdown filename 96 | 97 | i: filename and append 98 | p: if append: find former list name and return 99 | else: write filename 100 | """ 101 | if append: 102 | for line in former_summary_list: 103 | if re.search(filename, line): 104 | s = re.search('\[.*\]\(', line) 105 | return s.group()[1:-2] 106 | else: 107 | return is_markdown_file(filename) 108 | else: 109 | return is_markdown_file(filename) 110 | 111 | 112 | def main(): 113 | overwrite = True 114 | append = False 115 | dir_input = 'src' 116 | 117 | # print information 118 | print('GitBook auto summary:', dir_input, end=' ') 119 | 120 | if append and os.path.exists(os.path.join(dir_input, 'SUMMARY.md')): 121 | # append: read former SUMMARY.md 122 | print('--append', end=' ') 123 | global former_summary_list 124 | with open(os.path.join(dir_input, 'SUMMARY.md')) as f: 125 | former_summary_list = f.readlines() 126 | f.close() 127 | print() 128 | # output to flie 129 | if (overwrite == False and 130 | os.path.exists(os.path.join(dir_input, 'SUMMARY.md'))): 131 | # overwrite logic 132 | filename = 'SUMMARY-GitBook-auto-summary.md' 133 | else: 134 | filename = 'SUMMARY.md' 135 | 
output = open(os.path.join(dir_input, filename), 'w') 136 | output.write('# Summary\n\n') 137 | output_markdown(dir_input, dir_input, output, append) 138 | 139 | print('GitBook auto summary finished:) ') 140 | return 0 141 | 142 | 143 | if __name__ == '__main__': 144 | main() 145 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 2.2 - Selecting a Subset of Columns.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | 9 | from pyspark.sql import functions as F 10 | 11 | from datetime import datetime 12 | from decimal import Decimal 13 | ``` 14 | 15 | ### Template 16 | 17 | 18 | ```python 19 | spark = ( 20 | SparkSession.builder 21 | .master("local") 22 | .appName("Section 2.1 - Looking at Your Data") 23 | .config("spark.some.config.option", "some-value") 24 | .getOrCreate() 25 | ) 26 | 27 | sc = spark.sparkContext 28 | 29 | import os 30 | 31 | data_path = "/data/pets.csv" 32 | base_path = os.path.dirname(os.getcwd()) 33 | path = base_path + data_path 34 | ``` 35 | 36 | 37 | ```python 38 | pets = spark.read.csv(path, header=True) 39 | pets.toPandas() 40 | ``` 41 | 42 | 43 | 44 | 45 |
46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 |
idbreed_idnicknamebirthdayagecolor
011King2014-11-22 12:30:315brown
123Argus2016-11-22 10:05:1010None
231Chewie2016-11-22 10:05:1015None
88 |
89 | 90 | 91 | 92 | ### Selecting a Subset of Columns 93 | 94 | When you're working with raw data, you are usually only interested in a subset of columns. This means you should get into the habit of only selecting the columns you need before any spark transformations. 95 | 96 | **Why?** 97 | 98 | If you do not, and you are working with a wide dataset, this will cause your spark application to do more work than it should. This is because all the extra columns will be `shuffled` between the workers during the execution of the transformations. 99 | 100 | **This will really hurt performance if you have string columns that contain very large amounts of text.** 101 | 102 | **Note** 103 | 104 | Spark is sometimes smart enough to know which columns aren't being used and perform a `Projection Pushdown` (column pruning) to drop the unneeded columns. But it's better practice to do the selection first. 105 | 106 | ### Option 1 - `select()` 107 | 108 | 109 | ```python 110 | ( 111 | pets 112 | .select("id", "nickname", "color") 113 | .toPandas() 114 | ) 115 | ``` 116 | 117 | 118 | 119 | 120 | <div>
121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 |
idnicknamecolor
01Kingbrown
12ArgusNone
23ChewieNone
151 |
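A small sketch of the same selection driven by a Python list, which is handy when the column set is computed elsewhere (the `keep` name is just for illustration):

```python
keep = ['id', 'nickname', 'color']

(
    pets
    # unpack the list into select(); passing the list itself also works
    .select(*keep)
    .toPandas()
)
```
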
152 | 153 | 154 | 155 | **What Happened?** 156 | 157 | Similar to a `sql select` statement, it will only keep the columns specified in the arguments in the resulting `df`. a `list` object can be passed as the argument as well. 158 | 159 | If you have a wide dataset and only want to work with a small number of columns, a `select` would be less lines of code. 160 | 161 | **Note** 162 | 163 | If the argument `*` is provided, all the columns will be selected. 164 | 165 | ### Option 2 - `drop()` 166 | 167 | 168 | ```python 169 | ( 170 | pets 171 | .drop("breed_id", "birthday", "age") 172 | .toPandas() 173 | ) 174 | ``` 175 | 176 | 177 | 178 | 179 |
180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 |
idnicknamecolor
01Kingbrown
12ArgusNone
23ChewieNone
210 |
211 | 212 | 213 | 214 | **What Happened?** 215 | 216 | This is the opposite of a `select` statement it will drop an of the columns specified. 217 | 218 | If you have a wide dataset and will need a majority of the columns, a `drop` would be less lines of code. 219 | 220 | ### Summary 221 | 222 | * Work with only the subset of columns required for your spark application, there is no need do extra work. 223 | * Depending on the number of columns you are going to work with, a `select` over a `drop` would be better and vice-versa. 224 | -------------------------------------------------------------------------------- /src/Chapter 1 - Basics/Section 2 - Creating your First Data Object.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Library Imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from pyspark.sql import SparkSession\n", 17 | "from pyspark.sql import types as T" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### Template" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "spark = (\n", 34 | " SparkSession.builder\n", 35 | " .master(\"local\")\n", 36 | " .appName(\"Exploring Joins\")\n", 37 | " .config(\"spark.some.config.option\", \"some-value\")\n", 38 | " .getOrCreate()\n", 39 | ")\n", 40 | "\n", 41 | "sc = spark.sparkContext" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "### Create a DataFrame" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/html": [ 59 | "
\n", 60 | "\n", 73 | "\n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | "
pet_idnameage
01Bear13
12Chewie12
22Roger1
\n", 103 | "
" 104 | ], 105 | "text/plain": [ 106 | " pet_id name age\n", 107 | "0 1 Bear 13\n", 108 | "1 2 Chewie 12\n", 109 | "2 2 Roger 1" 110 | ] 111 | }, 112 | "execution_count": 3, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "schema = T.StructType([\n", 119 | " T.StructField(\"pet_id\", T.IntegerType(), False),\n", 120 | " T.StructField(\"name\", T.StringType(), True),\n", 121 | " T.StructField(\"age\", T.IntegerType(), True),\n", 122 | "])\n", 123 | "\n", 124 | "data = [\n", 125 | " (1, \"Bear\", 13), \n", 126 | " (2, \"Chewie\", 12), \n", 127 | " (2, \"Roger\", 1), \n", 128 | "]\n", 129 | "\n", 130 | "pet_df = spark.createDataFrame(\n", 131 | " data=data,\n", 132 | " schema=schema\n", 133 | ")\n", 134 | "\n", 135 | "pet_df.toPandas()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "### Background\n", 143 | "There are 3 datatypes in spark `RDD`, `DataFrame` and `Dataset`. As mentioned before, we will focus on the `DataFrame` datatype. \n", 144 | "\n", 145 | "* This is most performant and commonly used datatype. \n", 146 | "* `RDD`s are a thing of the past and you should refrain from using them unless you can't do the transformation in `DataFrame`s.\n", 147 | "* `Dataset`s are a thing in `Spark scala`.\n", 148 | "\n", 149 | "If you have used a `DataFrame` in Pandas, this is the same thing. If you haven't, a dataframe is similar to a `csv` or `excel` file. There are columns and rows that you can perform transformations on. You can search online for better descriptions of what a `DataFrame` is.\n", 150 | "\n", 151 | "### What Happened?\n", 152 | "For any `DataFrame (df)` that you work with in Spark you should provide it with 2 things:\n", 153 | "1. a `schema` for the data. Providing a `schema` explicitly makes it clearer to the reader and sometimes even more performant, if we can know that a column is `nullable`. This means providing 3 things:\n", 154 | " * the `name` of the column\n", 155 | " * the `datatype` of the column\n", 156 | " * the `nullability` of the column\n", 157 | "2. the data. Normally you would read data stored in `gcs`, `aws` etc and store it in a `df`, but there will be the off-times that you will need to create one. 
" 158 | ] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 2", 164 | "language": "python", 165 | "name": "python2" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 2 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython2", 177 | "version": "2.7.15" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 2 182 | } 183 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 2.3 - Creating New Columns and Transforming Data.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | 9 | from pyspark.sql import functions as F 10 | 11 | from datetime import datetime 12 | from decimal import Decimal 13 | ``` 14 | 15 | ### Template 16 | 17 | 18 | ```python 19 | spark = ( 20 | SparkSession.builder 21 | .master("local") 22 | .appName("Section 2.3 - Creating New Columns") 23 | .config("spark.some.config.option", "some-value") 24 | .getOrCreate() 25 | ) 26 | 27 | sc = spark.sparkContext 28 | 29 | import os 30 | 31 | data_path = "/data/pets.csv" 32 | base_path = os.path.dirname(os.getcwd()) 33 | path = base_path + data_path 34 | ``` 35 | 36 | 37 | ```python 38 | pets = spark.read.csv(path, header=True) 39 | pets.toPandas() 40 | ``` 41 | 42 | 43 | 44 | 45 |
46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 |
idbreed_idnicknamebirthdayagecolor
011King2014-11-22 12:30:315brown
123Argus2016-11-22 10:05:1010None
231Chewie2016-11-22 10:05:1015None
88 |
89 | 90 | 91 | 92 | ### Creating New Columns and Transforming Data 93 | 94 | When we are data wrangling and transforming data, we will usually assign the result to a new column. We will explore the `withColumn()` function and other transformation functions to achieve our end results. 95 | 96 | We will also look into how we can rename a column with `withColumnRenamed()`; this is useful for making a join on the same `column`, etc. 97 | 98 | ### Case 1: New Columns - `withColumn()` 99 | 100 | 101 | ```python 102 | ( 103 | pets 104 | .withColumn('nickname_copy', F.col('nickname')) 105 | .withColumn('nickname_capitalized', F.upper(F.col('nickname'))) 106 | .toPandas() 107 | ) 108 | ``` 109 | 110 | 111 | 112 | 113 | <div>
114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 |
idbreed_idnicknamebirthdayagecolornickname_copynickname_capitalized
011King2014-11-22 12:30:315brownKingKING
123Argus2016-11-22 10:05:1010NoneArgusARGUS
231Chewie2016-11-22 10:05:1015NoneChewieCHEWIE
164 |
165 | 166 | 167 | 168 | **What Happened?** 169 | 170 | We duplicated the `nickname` column as `nickname_copy` using the `withColumn()` function. We also created a new column where all the letters of the `nickname` are `capitalized` by chaining multiple spark functions together. 171 | 172 | We will look into more advanced column creation in the next section. There we will go into more detail about what a `column expression` is and what the purpose of `F.col()` is. 173 | 174 | ### Case 2: Renaming Columns - `withColumnRenamed()` 175 | 176 | 177 | ```python 178 | ( 179 | pets 180 | .withColumnRenamed('id', 'pet_id') 181 | .toPandas() 182 | ) 183 | ``` 184 | 185 | 186 | 187 | 188 | <div>
189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 |
pet_idbreed_idnicknamebirthdayagecolor
011King2014-11-22 12:30:315brown
123Argus2016-11-22 10:05:1010None
231Chewie2016-11-22 10:05:1015None
231 |
232 | 233 | 234 | 235 | **What Happened?** 236 | 237 | We renamed and replaced the `id` column with `pet_id`. 238 | 239 | ### Summary 240 | 241 | * We learned how to create new columns from old ones by chaining spark functions and using `withColumn()`. 242 | * We learned how to rename columns using `withColumnRenamed()`. 243 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 3 - Aggregates/Section 2 - Non Deterministic Ordering for GroupBys.md: -------------------------------------------------------------------------------- 1 | 2 | ### Introduction 3 | 4 | There are use cases where we would like to get the `first` or `last` of something within a `group` or particular `grain`. 5 | 6 | It is natural to do something in SQL like: 7 | 8 | ```sql 9 | select 10 | col_1, 11 | first(col_2) as first_something, 12 | last(col_2) as last_something 13 | from table 14 | group by 1 15 | order by 1 16 | ``` 17 | 18 | Which leads us to writing spark code like this `df.orderBy().groupBy().agg()`. This has unexpected behaviour in spark and the result can differ from run to run. 19 | 20 | ### Library Imports 21 | 22 | 23 | ```python 24 | from datetime import datetime 25 | 26 | from pyspark.sql import SparkSession 27 | from pyspark.sql import functions as F, Window 28 | ``` 29 | 30 | Create a `SparkSession`. No need to create `SparkContext` as you automatically get it as part of the `SparkSession`. 31 | 32 | 33 | ```python 34 | spark = ( 35 | SparkSession.builder 36 | .master("local") 37 | .appName("Exploring Joins") 38 | .config("spark.some.config.option", "some-value") 39 | .getOrCreate() 40 | ) 41 | 42 | sc = spark.sparkContext 43 | ``` 44 | 45 | ### Initial Datasets 46 | 47 | 48 | ```python 49 | pets = spark.createDataFrame( 50 | [ 51 | (1, 1, datetime(2018, 1, 1, 1 ,1, 1), 'Bear', 5), 52 | (2, 1, datetime(2010, 1, 1, 1 ,1, 1), 'Chewie', 15), 53 | (3, 1, datetime(2015, 1, 1, 1 ,1, 1), 'Roger', 10), 54 | ], ['id', 'breed_id', 'birthday', 'nickname', 'age'] 55 | ) 56 | 57 | pets.toPandas() 58 | ``` 59 | 60 | 61 | 62 | 63 | <div>
64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 |
idbreed_idbirthdaynicknameage
0112018-01-01 01:01:01Bear5
1212010-01-01 01:01:01Chewie15
2312015-01-01 01:01:01Roger10
102 |
103 | 104 | 105 | 106 | ### Option 1: Wrong Way 107 | 108 | #### Result 1 109 | 110 | 111 | ```python 112 | df_1 = ( 113 | pets 114 | .orderBy('birthday') 115 | .groupBy('breed_id') 116 | .agg(F.first('nickname').alias('first_breed')) 117 | ) 118 | 119 | df_1.toPandas() 120 | ``` 121 | 122 | 123 | 124 | 125 |
126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 |
breed_idfirst_breed
01Chewie
142 |
143 | 144 | 145 | 146 | #### Result 2 147 | 148 | 149 | ```python 150 | df_2 = ( 151 | pets 152 | .orderBy('birthday') 153 | .groupBy('breed_id') 154 | .agg(F.first('nickname').alias('first_breed')) 155 | ) 156 | 157 | df_2.toPandas() 158 | ``` 159 | 160 | 161 | 162 | 163 |
164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 |
breed_idfirst_breed
01Chewie
180 |
181 | 182 | 183 | 184 | ### Option 2: Window Object, Right Way 185 | 186 | 187 | ```python 188 | window = Window.partitionBy('breed_id').orderBy('birthday') 189 | 190 | df_3 = ( 191 | pets 192 | .withColumn('first_breed', F.first('nickname').over(window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing))) 193 | .withColumn('rn', F.row_number().over(window.rowsBetween(Window.unboundedPreceding, Window.currentRow))) 194 | ) 195 | 196 | df_3.toPandas() 197 | ``` 198 | 199 | 200 | 201 | 202 |
203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 |
idbreed_idbirthdaynicknameagefirst_breedrn
0212010-01-01 01:01:01Chewie15Chewie1
1312015-01-01 01:01:01Roger10Chewie2
2112018-01-01 01:01:01Bear5Chewie3
249 |
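Building on `df_3` above, a minimal sketch of a common follow-up: keep only the first row of each group by filtering on the row number, which gives a deterministic "first" however the input happened to be ordered.

```python
(
    df_3
    # rn == 1 is the earliest row per breed_id, as ordered by the window
    .where(F.col('rn') == 1)
    .select('breed_id', 'nickname', 'birthday', 'first_breed')
    .toPandas()
)
```
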
250 | 251 | 252 | 253 | ### Summary 254 | 255 | Ok so my example didn't work locally lol, but trust me it that `orderBy()` in a statement like this: `orderBy().groupBy()` doesn't maintain it's order! 256 | 257 | reference: https://stackoverflow.com/a/50012355 258 | 259 | For anything aggregation that needs an ordering performed (ie. `first`, `last`, etc.), we should avoid using `groupby()`s and instead we should use a `window` object. 260 | -------------------------------------------------------------------------------- /src/Chapter 2 - Exploring the Spark APIs/Section 1.2 - Arrays and Lists.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Library Imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 5, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from pyspark.sql import SparkSession\n", 17 | "from pyspark.sql import types as T\n", 18 | "\n", 19 | "from pyspark.sql import functions as F\n", 20 | "\n", 21 | "from datetime import datetime\n", 22 | "from decimal import Decimal" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Template" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 14, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "spark = (\n", 39 | " SparkSession.builder\n", 40 | " .master(\"local\")\n", 41 | " .appName(\"Section 2.1.2 - Arrays and Lists\")\n", 42 | " .config(\"spark.some.config.option\", \"some-value\")\n", 43 | " .getOrCreate()\n", 44 | ")\n", 45 | "\n", 46 | "sc = spark.sparkContext\n", 47 | "\n", 48 | "def get_csv_schema(*args):\n", 49 | " return T.StructType([\n", 50 | " T.StructField(*arg)\n", 51 | " for arg in args\n", 52 | " ])\n", 53 | "\n", 54 | "def read_csv(fname, schema):\n", 55 | " return spark.read.csv(\n", 56 | " path=fname,\n", 57 | " header=True,\n", 58 | " schema=get_csv_schema(*schema)\n", 59 | " )\n", 60 | "\n", 61 | "import os\n", 62 | "\n", 63 | "data_path = \"/data/pets.csv\"\n", 64 | "base_path = os.path.dirname(os.getcwd())\n", 65 | "path = base_path + data_path" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 26, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "+---+--------+--------+-------------------+---+-----+------+\n", 78 | "| id|breed_id|nickname| birthday|age|color|weight|\n", 79 | "+---+--------+--------+-------------------+---+-----+------+\n", 80 | "| 1| 1| King|2014-11-22 12:30:31| 5|brown| 10|\n", 81 | "| 2| 3| Argus|2016-11-22 10:05:10| 10| null| 6|\n", 82 | "| 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12|\n", 83 | "| 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3|\n", 84 | "| 4| 2| null|2019-01-01 10:05:10| 13| null| 10|\n", 85 | "+---+--------+--------+-------------------+---+-----+------+\n", 86 | "\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "pets = read_csv(\n", 92 | " fname=path,\n", 93 | " schema=[\n", 94 | " (\"id\", T.LongType(), False),\n", 95 | " (\"breed_id\", T.LongType(), True),\n", 96 | " (\"nickname\", T.StringType(), True),\n", 97 | " (\"birthday\", T.TimestampType(), True),\n", 98 | " (\"age\", T.LongType(), True),\n", 99 | " (\"color\", T.StringType(), True),\n", 100 | " (\"weight\", T.DecimalType(), True),\n", 101 | " ]\n", 102 | ")\n", 103 | "pets.show()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | 
"source": [ 110 | "### Arrays and Lists" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "### Case 1: Reading in Data that contains `Arrays`\n", 118 | "\n", 119 | "TODO" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### Case 2: Creating Arrays" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 29, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "+---+--------+--------+-------------------+---+-----+------+--------------------+\n", 139 | "| id|breed_id|nickname| birthday|age|color|weight| array column|\n", 140 | "+---+--------+--------+-------------------+---+-----+------+--------------------+\n", 141 | "| 1| 1| King|2014-11-22 12:30:31| 5|brown| 10|[1, Bob, 2019-02-...|\n", 142 | "| 2| 3| Argus|2016-11-22 10:05:10| 10| null| 6|[1, Bob, 2019-02-...|\n", 143 | "| 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12|[1, Bob, 2019-02-...|\n", 144 | "| 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3|[1, Bob, 2019-02-...|\n", 145 | "| 4| 2| null|2019-01-01 10:05:10| 13| null| 10|[1, Bob, 2019-02-...|\n", 146 | "+---+--------+--------+-------------------+---+-----+------+--------------------+\n", 147 | "\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "(\n", 153 | " pets\n", 154 | " .withColumn('array column', F.array([\n", 155 | " F.lit(1),\n", 156 | " F.lit(\"Bob\"),\n", 157 | " F.lit(datetime(2019,2,1)),\n", 158 | " ]))\n", 159 | " .show()\n", 160 | ")" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "**What Happened?**\n", 168 | "\n", 169 | "We will explain in the later chapter what the `F.lit()` function does, but for now understand that in order to create an array type you need to call the `F.array()` function and for each array element call `F.lit()` on." 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "### Summary\n", 177 | "\n", 178 | "* It's pretty simple to create an array in Spark, you will need to call 2 functions: `F.array()` and `F.lit()`.\n", 179 | "* Each element of the array needs to be of type `F.lit()`" 180 | ] 181 | } 182 | ], 183 | "metadata": { 184 | "kernelspec": { 185 | "display_name": "Python 2", 186 | "language": "python", 187 | "name": "python2" 188 | }, 189 | "language_info": { 190 | "codemirror_mode": { 191 | "name": "ipython", 192 | "version": 2 193 | }, 194 | "file_extension": ".py", 195 | "mimetype": "text/x-python", 196 | "name": "python", 197 | "nbconvert_exporter": "python", 198 | "pygments_lexer": "ipython2", 199 | "version": "2.7.15" 200 | } 201 | }, 202 | "nbformat": 4, 203 | "nbformat_minor": 2 204 | } 205 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 4 - Window Objects/Section 2 - Ordering High Frequency Data with a Window Object.md: -------------------------------------------------------------------------------- 1 | 2 | When dealing with big data, some datasets will have a much higher frequent of "events" than others. 3 | 4 | An example table could be a table that tracks each pageview, it's not uncommon for someone to visit a site at the same time as someone else, espically a very popular site such as google. 5 | 6 | I will illustrate how you can deal with these types of events, when you need to order by time. 
7 | 8 | ### Library Imports 9 | 10 | 11 | ```python 12 | from datetime import datetime 13 | 14 | from pyspark.sql import SparkSession 15 | from pyspark.sql import functions as F, Window 16 | ``` 17 | 18 | ### Template 19 | 20 | 21 | ```python 22 | spark = ( 23 | SparkSession.builder 24 | .master("local") 25 | .appName("Exploring Joins") 26 | .config("spark.some.config.option", "some-value") 27 | .getOrCreate() 28 | ) 29 | 30 | sc = spark.sparkContext 31 | ``` 32 | 33 | ### Option 1: Only ordering by date column 34 | 35 | 36 | ```python 37 | window = ( 38 | Window 39 | .partitionBy('breed_id') 40 | .orderBy('birthday') 41 | .rowsBetween(Window.unboundedPreceding, Window.currentRow) 42 | ) 43 | ``` 44 | 45 | 46 | ```python 47 | pets = spark.createDataFrame( 48 | [ 49 | (1, 1, datetime(2018, 1, 1, 1 ,1, 1), 45), 50 | (2, 1, datetime(2018, 1, 1, 1 ,1, 1), 20), 51 | ], ['id', 'breed_id', 'birthday', 'age'] 52 | ) 53 | 54 | pets.withColumn('first_pet_of_breed', F.first('id').over(window)).toPandas() 55 | ``` 56 | 57 | 58 | 59 | 60 |
61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 |
idbreed_idbirthdayagefirst_pet_of_breed
0112018-01-01 01:01:01451
1212018-01-01 01:01:01201
91 |
92 | 93 | 94 | 95 | 96 | ```python 97 | pets = spark.createDataFrame( 98 | [ 99 | (2, 1, datetime(2018, 1, 1, 1 ,1, 1), 20), 100 | (1, 1, datetime(2018, 1, 1, 1 ,1, 1), 45), 101 | ], ['id', 'breed_id', 'birthday', 'age'] 102 | ) 103 | 104 | pets.withColumn('first_pet_of_breed', F.first('id').over(window)).toPandas() 105 | ``` 106 | 107 | 108 | 109 | 110 |
111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 |
idbreed_idbirthdayagefirst_pet_of_breed
0212018-01-01 01:01:01202
1112018-01-01 01:01:01452
141 |
142 | 143 | 144 | 145 | **What Happened:** 146 | * By changing the order of rows (this would happen with larger amounts of data stored on different partitions), we got a different value for "first" value. 147 | * `datetime`s can only be accurate to the second and if data is coming in faster than that, it is ambiguous to order by the date column. 148 | 149 | ### Option 2: Order by `date` and `id` Column 150 | 151 | ### Window Object 152 | 153 | 154 | ```python 155 | window_2 = ( 156 | Window 157 | .partitionBy('breed_id') 158 | .orderBy('birthday', 'id') 159 | .rowsBetween(Window.unboundedPreceding, Window.currentRow) 160 | ) 161 | ``` 162 | 163 | 164 | ```python 165 | pets = spark.createDataFrame( 166 | [ 167 | (1, 1, datetime(2018, 1, 1, 1 ,1, 1), 45), 168 | (2, 1, datetime(2018, 1, 1, 1 ,1, 1), 20), 169 | ], ['id', 'breed_id', 'birthday', 'age'] 170 | ) 171 | 172 | pets.withColumn('first_pet_of_breed', F.first('id').over(window_2)).toPandas() 173 | ``` 174 | 175 | 176 | 177 | 178 |
179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 |
idbreed_idbirthdayagefirst_pet_of_breed
0112018-01-01 01:01:01451
1212018-01-01 01:01:01201
209 |
210 | 211 | 212 | 213 | 214 | ```python 215 | pets = spark.createDataFrame( 216 | [ 217 | (2, 1, datetime(2018, 1, 1, 1 ,1, 1), 20), 218 | (1, 1, datetime(2018, 1, 1, 1 ,1, 1), 45), 219 | ], ['id', 'breed_id', 'birthday', 'age'] 220 | ) 221 | 222 | pets.withColumn('first_pet_of_breed', F.first('id').over(window_2)).toPandas() 223 | ``` 224 | 225 | 226 | 227 | 228 |
229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 |
idbreed_idbirthdayagefirst_pet_of_breed
0112018-01-01 01:01:01451
1212018-01-01 01:01:01201
259 |
260 | 261 | 262 | 263 | **What Happened**: 264 | * We get the same "first" value in both incidents, which is what we expect. 265 | 266 | # TL;DR 267 | 268 | In databases, the `id` (primary key) column of a table is usually monotonically increasing. Therefore if we are dealing with frequently arriving data we can additionally sort by `id` along the `date` column. 269 | -------------------------------------------------------------------------------- /src/Chapter 6 - Tuning & Spark Parameters/Section 1.1 - Understanding how Spark Works.md: -------------------------------------------------------------------------------- 1 | # Section 1.1 - Understanding Distributed Systems and how Spark Works 2 | 3 | Working with Spark requires a different kind of thinking. Your code isn't executing in a sequential manor anymore, it's being executing in parallel. To write performant parallel code, you will need to think about how you can perform as different/same tasks at the same while minimizing the blocking of other tasks. Hopefully by understanding how spark and distributed systems work you can get into the right mindset and write clean parallel spark code. 4 | 5 | ## Understanding Distributed Systems/Computing 6 | 7 | **Sequential Applications** 8 | 9 | In a generic application the code path is run in a sequential order. As in the code will execute from the top of the file to the bottom of the file, line by line. 10 | 11 | **Multiprocess/threaded Applications** 12 | 13 | In a `multi-processed/threaded` application the code path will diverge. The application will assign a portions of the code to `threads/processes` which will handle these tasks in an `asynchronous` manner. Once these tasks are completed the `threads/processes` will signal the main application and the code path will conform again. 14 | 15 | These `threads/processes` are allocated a certain amount of memory and processing power on a single machine where the main application is running. 16 | 17 | Think about how your computer can handle multiple applications at once. This is multiple processes running on a single machine. 18 | 19 | **Distributed Computing (Clusters and Nodes)** 20 | 21 | Spark is a distributed computing library that can be used in either Python, Scala, Java or R. When we say "distributed computing" we mean that our application runs on multiple machines/computers called `Nodes` which are all part of a single `Cluster`. 22 | 23 | This is very similar to how a `multi-processed` application would work, just with more processing juice. Each `Node` is essentially a computer running multiple process running at once. 24 | 25 | ![](https://github.com/ericxiao251/spark-syntax/blob/master/src/images/master-slave.png) 26 | 27 | **Master/Slave Architecture** 28 | 29 | From the image in the last section we can see there are 2 types of `Nodes`, a single `Driver/Master Node` and multiple `Worker Nodes`. 30 | 31 | Each machine is assigned a portion of the overall work in the application. Through communication and message passing the machines attempt to compute each portion of the work in parallel. When the work is dependent on another portion of work, then it will have to wait until that work is computed and passed to the worker. When every portion of the work is done, it is all sent back to the `Master Node`. The coordination of the communication and message passing is done by the `Master Node` talking to the `Worker Nodes`. 32 | 33 | You can think of it as a team lead with multiple engineers, UX, designers, etc. 
The lead assigns tasks to everyone. The designers and UX collaborate to create an effective interface that is user friendly, once they are done they report back to the lead. The lead will pass this information to engineers and they can start coding the interface, etc. 34 | 35 | ## Lazy Execution 36 | 37 | When you're writing a spark application, no work is actually being done until you perform a `collect` action. As seen in some examples in the previous chapters, a `collect` action is when you want to see the results of your spark transformations in the form of a `toPandas()`, `show()`, etc. This triggers the `Driver` to start the distribution of work, etc. 38 | 39 | For now this is all you need to know, we will look into why Spark works this way and why it's a desired design decision. 40 | 41 | ## MapReduce 42 | 43 | When the `Driver Node` actually starts to do some work, it communications and distributes work using a technique called "MapReduce". There are two essential behaviors of a MapReduce application, `map` and `reduce`. 44 | 45 | ![](https://github.com/ericxiao251/spark-syntax/blob/master/src/images/mapreduce.png) 46 | 47 | **Map** 48 | 49 | When the `Driver Node` is distributing the work it `maps` the 1) data and 2) transformations to each `Worker Node`. This allows the `Worker Nodes` to perform the work (transformations) on the associated data in parallel. 50 | 51 | Ex. 52 | 53 | ```python 54 | # initialize your variables 55 | x = 5 56 | y = 10 57 | 58 | # do some transformations to your variables 59 | x = x * 2 60 | y = y + 2 61 | ``` 62 | 63 | Here we can see that the arithmetic operations performed on `x` and `y` can be done independently, so we can do those 2 operations in parallel on two different `Worker Nodes`. So Spark will `map` `x` and the operation `x * 2` to a `Worker Node` and `y` and `y + 2` to another `Worker Node`. 64 | 65 | Think about this but on a larger scale, we have 1 billion rows of numbers that we want to increment by 1. We will map portions of the data to each `Worker Node` and the operation `+ 1` to each `Worker Node`. 66 | 67 | **Reduce** 68 | 69 | When work can't be done in parallel and is dependent on some previous work, the post transformed `data` is sent back to the `Driver Node` from all the `Worker Nodes`. There the new data may be redistributed to the `Worker Nodes` to resume execution or execution is done on the `Driver Node` depending on the type of work. 70 | 71 | Ex. 72 | 73 | ```python 74 | # initialize your variables 75 | x = 5 76 | y = 10 77 | 78 | # do some transformations to your variables 79 | x = x * 2 80 | y = y + 2 81 | 82 | # do some more transformations 83 | z = x + y 84 | ``` 85 | 86 | Similar to the example above, but here we see that the last transformation `z = x + y` depends on the previous transformations. So we will need to collect all the work done on the `Worker Nodes` to the `Driver Node` and perform the final transformation. 87 | 88 | ## Key Terms 89 | 90 | **Driver Node** 91 | 92 | Learnt above. 93 | 94 | **Worker** 95 | 96 | Learnt above. 97 | 98 | **Executor** 99 | 100 | A process launched from an application on a `Worker Node`, that runs tasks and keeps data in memory or disk storage across them. Each application has its own executors. 101 | 102 | **Jobs** 103 | 104 | Job A parallel computation consisting of multiple tasks that gets spawned in response to a Spark action. 105 | 106 | **Stages** 107 | 108 | Smaller set of tasks inside any job. 
109 | 110 | **Tasks** 111 | 112 | Unit of work that will be sent to one executor. 113 | 114 | [Source](https://www.slideshare.net/DatioBD/apache-spark-ii-sparksql) 115 | 116 | ![](https://github.com/ericxiao251/spark-syntax/blob/master/src/images/key-terms.png) 117 | 118 | [Source](https://www.oreilly.com/library/view/learning-spark/9781449359034/) 119 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 6 - Tuning & Spark Parameters/Section 1.1 - Understanding how Spark Works.md: -------------------------------------------------------------------------------- 1 | # Section 1.1 - Understanding Distributed Systems and how Spark Works 2 | 3 | Working with Spark requires a different kind of thinking. Your code isn't executing in a sequential manor anymore, it's being executing in parallel. To write performant parallel code, you will need to think about how you can perform as different/same tasks at the same while minimizing the blocking of other tasks. Hopefully by understanding how spark and distributed systems work you can get into the right mindset and write clean parallel spark code. 4 | 5 | ## Understanding Distributed Systems/Computing 6 | 7 | **Sequential Applications** 8 | 9 | In a generic application the code path is run in a sequential order. As in the code will execute from the top of the file to the bottom of the file, line by line. 10 | 11 | **Multiprocess/threaded Applications** 12 | 13 | In a `multi-processed/threaded` application the code path will diverge. The application will assign a portions of the code to `threads/processes` which will handle these tasks in an `asynchronous` manner. Once these tasks are completed the `threads/processes` will signal the main application and the code path will conform again. 14 | 15 | These `threads/processes` are allocated a certain amount of memory and processing power on a single machine where the main application is running. 16 | 17 | Think about how your computer can handle multiple applications at once. This is multiple processes running on a single machine. 18 | 19 | **Distributed Computing (Clusters and Nodes)** 20 | 21 | Spark is a distributed computing library that can be used in either Python, Scala, Java or R. When we say "distributed computing" we mean that our application runs on multiple machines/computers called `Nodes` which are all part of a single `Cluster`. 22 | 23 | This is very similar to how a `multi-processed` application would work, just with more processing juice. Each `Node` is essentially a computer running multiple process running at once. 24 | 25 | ![](https://github.com/ericxiao251/spark-syntax/blob/master/src/images/master-slave.png) 26 | 27 | **Master/Slave Architecture** 28 | 29 | From the image in the last section we can see there are 2 types of `Nodes`, a single `Driver/Master Node` and multiple `Worker Nodes`. 30 | 31 | Each machine is assigned a portion of the overall work in the application. Through communication and message passing the machines attempt to compute each portion of the work in parallel. When the work is dependent on another portion of work, then it will have to wait until that work is computed and passed to the worker. When every portion of the work is done, it is all sent back to the `Master Node`. The coordination of the communication and message passing is done by the `Master Node` talking to the `Worker Nodes`. 32 | 33 | You can think of it as a team lead with multiple engineers, UX, designers, etc. 
The lead assigns tasks to everyone. The designers and UX collaborate to create an effective interface that is user friendly; once they are done, they report back to the lead. The lead will pass this information to the engineers and they can start coding the interface, etc. 34 | 35 | ## Lazy Execution 36 | 37 | When you're writing a Spark application, no work is actually being done until you perform a `collect` action. As seen in some examples in the previous chapters, a `collect` action is when you want to see the results of your Spark transformations, in the form of a `toPandas()`, `show()`, etc. This triggers the `Driver` to start the distribution of work, etc. 38 | 39 | For now this is all you need to know; later we will look into why Spark works this way and why it's a desirable design decision. 40 | 41 | ## MapReduce 42 | 43 | When the `Driver Node` actually starts to do some work, it communicates and distributes work using a technique called "MapReduce". There are two essential behaviors in a MapReduce application: `map` and `reduce`. 44 | 45 | ![](https://github.com/ericxiao251/spark-syntax/blob/master/src/images/mapreduce.png) 46 | 47 | **Map** 48 | 49 | When the `Driver Node` is distributing the work, it `maps` the 1) data and 2) transformations to each `Worker Node`. This allows the `Worker Nodes` to perform the work (transformations) on the associated data in parallel. 50 | 51 | Ex. 52 | 53 | ```python 54 | # initialize your variables 55 | x = 5 56 | y = 10 57 | 58 | # do some transformations to your variables 59 | x = x * 2 60 | y = y + 2 61 | ``` 62 | 63 | Here we can see that the arithmetic operations performed on `x` and `y` can be done independently, so we can do those 2 operations in parallel on two different `Worker Nodes`. So Spark will `map` `x` and the operation `x * 2` to one `Worker Node`, and `y` and `y + 2` to another `Worker Node`. 64 | 65 | Think about this on a larger scale: we have 1 billion rows of numbers that we want to increment by 1. We will map portions of the data to each `Worker Node` and the operation `+ 1` to each `Worker Node`. 66 | 67 | **Reduce** 68 | 69 | When work can't be done in parallel and is dependent on some previous work, the transformed `data` is sent back to the `Driver Node` from all the `Worker Nodes`. There, depending on the type of work, the new data may be redistributed to the `Worker Nodes` to resume execution, or the remaining execution is done on the `Driver Node`. 70 | 71 | Ex. 72 | 73 | ```python 74 | # initialize your variables 75 | x = 5 76 | y = 10 77 | 78 | # do some transformations to your variables 79 | x = x * 2 80 | y = y + 2 81 | 82 | # do some more transformations 83 | z = x + y 84 | ``` 85 | 86 | This is similar to the example above, but here the last transformation `z = x + y` depends on the previous transformations. So we will need to collect all the work done on the `Worker Nodes` back to the `Driver Node` and perform the final transformation there. 87 | 88 | ## Key Terms 89 | 90 | **Driver Node** 91 | 92 | Covered above. 93 | 94 | **Worker** 95 | 96 | Covered above. 97 | 98 | **Executor** 99 | 100 | A process launched for an application on a `Worker Node`, which runs tasks and keeps data in memory or disk storage across them. Each application has its own executors. 101 | 102 | **Jobs** 103 | 104 | A parallel computation consisting of multiple tasks that gets spawned in response to a Spark action. 105 | 106 | **Stages** 107 | 108 | A smaller set of tasks inside a job.
109 | 110 | **Tasks** 111 | 112 | Unit of work that will be sent to one executor. 113 | 114 | [Source](https://www.slideshare.net/DatioBD/apache-spark-ii-sparksql) 115 | 116 | ![](https://github.com/ericxiao251/spark-syntax/blob/master/src/images/key-terms.png) 117 | 118 | [Source](https://www.oreilly.com/library/view/learning-spark/9781449359034/) 119 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 2.1 - Looking at Your Data.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | 9 | from pyspark.sql import functions as F 10 | 11 | from datetime import datetime 12 | from decimal import Decimal 13 | ``` 14 | 15 | ### Template 16 | 17 | 18 | ```python 19 | spark = ( 20 | SparkSession.builder 21 | .master("local") 22 | .appName("Section 2.1 - Looking at Your Data") 23 | .config("spark.some.config.option", "some-value") 24 | .getOrCreate() 25 | ) 26 | 27 | sc = spark.sparkContext 28 | 29 | import os 30 | 31 | data_path = "/data/pets.csv" 32 | base_path = os.path.dirname(os.getcwd()) 33 | path = base_path + data_path 34 | ``` 35 | 36 | 37 | ```python 38 | pets = spark.read.csv(path, header=True) 39 | pets.toPandas() 40 | ``` 41 | 42 | 43 | 44 | 45 |
46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 |
idbreed_idnicknamebirthdayagecolor
011King2014-11-22 12:30:315brown
123Argus2016-11-22 10:05:1010None
231Chewie2016-11-22 10:05:1015None
88 |
89 | 90 | 91 | 92 | ### Looking at Your Data 93 | 94 | Spark is lazily evaluated. To look at your data you must perform a `take` operation to trigger your transformations to be evaluated. There are a couple of ways to perform a `take` operation; we'll go through each of them here, along with their performance characteristics. 95 | 96 | For example, `toPandas()` is a `take` operation, which you've already seen in many places. 97 | 98 | ### Option 1 - `collect()` 99 | 100 | 101 | ```python 102 | pets.collect() 103 | ``` 104 | 105 | 106 | 107 | 108 | [Row(id=u'1', breed_id=u'1', nickname=u'King', birthday=u'2014-11-22 12:30:31', age=u'5', color=u'brown'), 109 | Row(id=u'2', breed_id=u'3', nickname=u'Argus', birthday=u'2016-11-22 10:05:10', age=u'10', color=None), 110 | Row(id=u'3', breed_id=u'1', nickname=u'Chewie', birthday=u'2016-11-22 10:05:10', age=u'15', color=None)] 111 | 112 | 113 | 114 | **What Happened?** 115 | 116 | When you call `collect` on a `dataframe`, it will trigger a `take` operation, bring all the data to the driver node and then return all rows as a list of `Row` objects. 117 | 118 | **Note** 119 | 120 | This is not advised unless you **have to** look at all the rows of your dataset; you should usually sample a subset of the data instead. This call will execute **all** of the transformations that you have specified on **all** of the data. 121 | 122 | ### Option 2 - `head()/take()/first()` 123 | 124 | 125 | ```python 126 | pets.head(n=1) 127 | ``` 128 | 129 | 130 | 131 | 132 | [Row(id=u'1', breed_id=u'1', nickname=u'King', birthday=u'2014-11-22 12:30:31', age=u'5', color=u'brown')] 133 | 134 | 135 | 136 | **What Happened?** 137 | 138 | When you call `head(n)` on a `dataframe`, it will trigger a `take` operation and return the first `n` rows of the result dataset. The different operations return different numbers of rows. 139 | 140 | **Note** 141 | 142 | * If the data is **unsorted**, Spark will perform **all** the transformations on only as many partitions as are needed to satisfy the requested number of rows. This is more optimal, especially for large datasets. 143 | * If the data is **sorted**, Spark will behave the same as a `collect` and perform `all` of the `transformations` on `all` of the data. 144 | 145 | By `sorted` we mean that some "sorting of the data" is done during the transformations, such as `sort()`, `orderBy()`, etc. 146 | 147 | ### Option 3 - `toPandas()` 148 | 149 | 150 | ```python 151 | pets.toPandas() 152 | ``` 153 | 154 | 155 | 156 | 157 |
158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 |
idbreed_idnicknamebirthdayagecolor
011King2014-11-22 12:30:315brown
123Argus2016-11-22 10:05:1010None
231Chewie2016-11-22 10:05:1015None
200 |
201 | 202 | 203 | 204 | **What Happened?** 205 | 206 | When you call `toPandas()` on a `dataframe`, it will trigger a `take` operation and return all of the rows. 207 | 208 | This is as performant as the `collect()` function, but the most readable in my opinion. 209 | 210 | ### Option 4 - `show()` 211 | 212 | 213 | ```python 214 | pets.show() 215 | ``` 216 | 217 | +---+--------+--------+-------------------+---+-----+ 218 | | id|breed_id|nickname| birthday|age|color| 219 | +---+--------+--------+-------------------+---+-----+ 220 | | 1| 1| King|2014-11-22 12:30:31| 5|brown| 221 | | 2| 3| Argus|2016-11-22 10:05:10| 10| null| 222 | | 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 223 | +---+--------+--------+-------------------+---+-----+ 224 | 225 | 226 | 227 | **What Happened?** 228 | 229 | When you call `show()` on a `dataframe`, it will trigger a `take` operation and return up to 20 rows. 230 | 231 | This is as performant as the `head()` function and more readable. (I still prefer `toPandas()` 😀). 232 | 233 | ### Summary 234 | 235 | * We learnt about various functions that allow you to look at your data. 236 | * Some functions are less performant than others, based on whether the resultant data is sorted or not. 237 | * Try to refrain from looking at all the data, unless you are required to. 238 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 2.4 - Constant Values and Column Expressions.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import functions as F 8 | 9 | from datetime import date 10 | ``` 11 | 12 | ### Template 13 | 14 | 15 | ```python 16 | spark = ( 17 | SparkSession.builder 18 | .master("local") 19 | .appName("Section 2.4 - Constant Values") 20 | .config("spark.some.config.option", "some-value") 21 | .getOrCreate() 22 | ) 23 | 24 | sc = spark.sparkContext 25 | 26 | import os 27 | 28 | data_path = "/data/pets.csv" 29 | base_path = os.path.dirname(os.getcwd()) 30 | path = base_path + data_path 31 | ``` 32 | 33 | 34 | ```python 35 | pets = spark.read.csv(path, header=True) 36 | pets.toPandas() 37 | ``` 38 | 39 | 40 | 41 | 42 |
43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 |
idbreed_idnicknamebirthdayagecolor
011King2014-11-22 12:30:315brown
123Argus2016-11-22 10:05:1010None
231Chewie2016-11-22 10:05:1015None
85 |
86 | 87 | 88 | 89 | ### Constant Values 90 | 91 | There are many instances where you will need to create a `column` expression or use a constant value to perform some of the spark transformations. We'll explore some of these. 92 | 93 | ### Case 1: Creating a Column with a constant value (`withColumn()`) (wrong) 94 | 95 | 96 | ```python 97 | pets.withColumn('todays_date', date.today()).toPandas() 98 | ``` 99 | 100 | 101 | --------------------------------------------------------------------------- 102 | 103 | AssertionError Traceback (most recent call last) 104 | 105 | in () 106 | ----> 1 pets.withColumn('todays_date', date.today()).toPandas() 107 | 108 | 109 | /usr/local/lib/python2.7/site-packages/pyspark/sql/dataframe.pyc in withColumn(self, colName, col) 110 | 1846 111 | 1847 """ 112 | -> 1848 assert isinstance(col, Column), "col should be Column" 113 | 1849 return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx) 114 | 1850 115 | 116 | 117 | AssertionError: col should be Column 118 | 119 | 120 | **What Happened?** 121 | 122 | Spark functions that have a `col` as an argument will usually require you to pass in a `Column` expression. As seen in the previous section, `withColumn()` worked fine when we gave it a column from the current `df`. But this isn't the case when we want set a column to a constant value. 123 | 124 | If you get an ```AssertionError: col should be Column``` that is usually the case, we'll look into how to fix this. 125 | 126 | ### Case 1: Creating a Column with a constant value (`withColumn()`) (correct) 127 | 128 | 129 | ```python 130 | pets.withColumn('todays_date', F.lit(date.today())).toPandas() 131 | ``` 132 | 133 | 134 | 135 | 136 |
137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 |
idbreed_idnicknamebirthdayagecolortodays_date
011King2014-11-22 12:30:315brown2019-02-14
123Argus2016-11-22 10:05:1010None2019-02-14
231Chewie2016-11-22 10:05:1015None2019-02-14
183 |
184 | 185 | 186 | 187 | **What Happened?** 188 | 189 | With `F.lit()` you can create a `column` expression that you can now assign to a new column in your dataframe. 190 | 191 | ### More Examples 192 | 193 | 194 | ```python 195 | ( 196 | pets 197 | .withColumn('age_greater_than_5', F.col("age") > 5) 198 | .withColumn('height', F.lit(150)) 199 | .where(F.col('breed_id') == 1) 200 | .where(F.col('breed_id') == F.lit(1)) 201 | .toPandas() 202 | ) 203 | ``` 204 | 205 | 206 | 207 | 208 |
209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 |
idbreed_idnicknamebirthdayagecolorage_greater_than_5height
011King2014-11-22 12:30:315brownFalse150
131Chewie2016-11-22 10:05:1015NoneTrue150
248 |
249 | 250 | 251 | 252 | **What Happened?** 253 | 254 | (We will look into equality statements later.) 255 | 256 | The above contains both constant values (the `height` column, created with `F.lit(150)`) and column expressions (comparisons using `F.col()`). When a constant is compared against a column expression, as in `F.col('breed_id') == 1`, it is automatically treated as a literal, so an explicit `F.lit()` is not required. 257 | 258 | ### Summary 259 | 260 | * You need to use `F.lit()` to assign constant values to columns. 261 | * Equality expressions with `F.col()` are another way to create column expressions. 262 | * When in doubt, wrap constant values in `F.lit()`. 263 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 7 - High Performance Code/Section 1.4 - Joins on Skewed Data .md: -------------------------------------------------------------------------------- 1 | 2 | A `skewed dataset` is a dataset that has a class imbalance; this leads to poor or failing Spark jobs that often hit an `OOM` (out of memory) error. 3 | 4 | When performing a `join` onto a `skewed dataset`, it's usually the case that there is an imbalance on the `key`(s) on which the join is performed. This results in the majority of the data falling onto a single partition, which will take longer to complete than the other partitions. 5 | 6 | Some hints for detecting skewness are: 7 | 1. The `key`(s) consist mainly of `null` values, which fall onto a single partition. 8 | 2. A subset of values for the `key`(s) makes up a high percentage of the total keys, and these all fall onto a single partition. 9 | 10 | We go through both of these cases and see how we can combat them. (After building the example data below, we also show a quick way to inspect the key distribution.) 11 | 12 | ### Library Imports 13 | 14 | 15 | ```python 16 | from pyspark.sql import SparkSession 17 | from pyspark.sql import functions as F 18 | ``` 19 | 20 | ### Template 21 | 22 | 23 | ```python 24 | spark = ( 25 | SparkSession.builder 26 | .master("local") 27 | .appName("Exploring Joins") 28 | .config("spark.some.config.option", "some-value") 29 | .getOrCreate() 30 | ) 31 | 32 | sc = spark.sparkContext 33 | ``` 34 | 35 | ### Situation 2: High Frequency Keys 36 | 37 | Initial Datasets 38 | 39 | 40 | ```python 41 | customers = spark.createDataFrame([ 42 | (1, "John"), 43 | (2, "Bob"), 44 | ], ["customer_id", "first_name"]) 45 | 46 | customers.toPandas() 47 | ``` 48 | 49 | 50 | 51 | 52 |
53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 |
customer_idfirst_name
01John
12Bob
74 |
75 | 76 | 77 | 78 | 79 | ```python 80 | orders = spark.createDataFrame([ 81 | (i, 1 if i < 95 else 2, "order #{}".format(i)) for i in range(100) 82 | ], ["id", "customer_id", "order_name"]) 83 | 84 | orders.toPandas().tail(6) 85 | ``` 86 | 87 | 88 | 89 | 90 |
91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 |
idcustomer_idorder_name
94941order #94
95952order #95
96962order #96
97972order #97
98982order #98
99992order #99
139 |
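
Before joining, it can help to confirm the suspected skew by counting how many rows each join key owns. This is a minimal sketch using only the `orders` dataframe defined above (the `F` alias comes from the imports at the top of this section; `key_counts` is just an illustrative name):

```python
# Count the rows per join key and surface the heaviest keys first.
# One key owning most of the rows is the "high frequency" skew described earlier.
key_counts = (
    orders
    .groupBy("customer_id")
    .count()
    .orderBy(F.desc("count"))
)

key_counts.show()
```

With the data above, `customer_id = 1` owns 95 of the 100 orders, so roughly 95% of the join work will land on the partition that holds that key.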
140 | 141 | 142 | 143 | ### Option 1: Inner Join 144 | 145 | 146 | ```python 147 | df = customers.join(orders, "customer_id") 148 | 149 | df.toPandas().tail(10) 150 | ``` 151 | 152 | 153 | 154 | 155 |
156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 |
customer_idfirst_nameidorder_name
901John90order #90
911John91order #91
921John92order #92
931John93order #93
941John94order #94
952Bob95order #95
962Bob96order #96
972Bob97order #97
982Bob98order #98
992Bob99order #99
239 |
240 | 241 | 242 | 243 | 244 | ```python 245 | df.explain() 246 | ``` 247 | 248 | == Physical Plan == 249 | *(5) Project [customer_id#122L, first_name#123, id#126L, order_name#128] 250 | +- *(5) SortMergeJoin [customer_id#122L], [customer_id#127L], Inner 251 | :- *(2) Sort [customer_id#122L ASC NULLS FIRST], false, 0 252 | : +- Exchange hashpartitioning(customer_id#122L, 200) 253 | : +- *(1) Filter isnotnull(customer_id#122L) 254 | : +- Scan ExistingRDD[customer_id#122L,first_name#123] 255 | +- *(4) Sort [customer_id#127L ASC NULLS FIRST], false, 0 256 | +- Exchange hashpartitioning(customer_id#127L, 200) 257 | +- *(3) Filter isnotnull(customer_id#127L) 258 | +- Scan ExistingRDD[id#126L,customer_id#127L,order_name#128] 259 | 260 | 261 | **What Happened**: 262 | * We want to find what `order`s each `customer` made, so we will be `join`ing the `customer`s table to the `order`s table. 263 | * When performing the join, we perform a `hashpartitioning` on `customer_id`. 264 | * From our data creation, this means 95% of the data landed onto a single partition. 265 | 266 | **Results**: 267 | * Similar to the `Null Skew` case, this means that single task/partition will take a lot longer than the others, and most likely erroring out. 268 | 269 | ### Option 2: Split the DataFrame in 2 Sections, High Frequency and Non-High Frequency values 270 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 3.2 - Range Join Conditions .md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import functions as F 8 | ``` 9 | 10 | ### Template 11 | 12 | 13 | ```python 14 | spark = ( 15 | SparkSession.builder 16 | .master("local") 17 | .appName("Section 3.2 - Range Join Conditions (WIP)") 18 | .config("spark.some.config.option", "some-value") 19 | .getOrCreate() 20 | ) 21 | 22 | 23 | sc = spark.sparkContext 24 | ``` 25 | 26 | 27 | ```python 28 | geo_loc_table = spark.createDataFrame([ 29 | (1, 10, "foo"), 30 | (11, 36, "bar"), 31 | (37, 59, "baz"), 32 | ], ["ipstart", "ipend", "loc"]) 33 | 34 | geo_loc_table.toPandas() 35 | ``` 36 | 37 | 38 | 39 | 40 |
41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 |
ipstartipendloc
0110foo
11136bar
23759baz
71 |
72 | 73 | 74 | 75 | 76 | ```python 77 | records_table = spark.createDataFrame([ 78 | (1, 11), 79 | (2, 38), 80 | (3, 50), 81 | ],["id", "inet"]) 82 | 83 | records_table.toPandas() 84 | ``` 85 | 86 | 87 | 88 | 89 |
90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 |
idinet
0111
1238
2350
116 |
117 | 118 | 119 | 120 | ### Range Join Conditions 121 | 122 | > A naive approach (just specifying this as the range condition) would result in a full cartesian product and a filter that enforces the condition (tested using Spark 2.0). This has a horrible effect on performance, especially if DataFrames are more than a few hundred thousands records. 123 | 124 | source: http://zachmoshe.com/2016/09/26/efficient-range-joins-with-spark.html 125 | 126 | > The source of the problem is pretty simple. When you execute join and join condition is not equality based the only thing that Spark can do right now is expand it to Cartesian product followed by filter what is pretty much what happens inside `BroadcastNestedLoopJoin` 127 | 128 | source: https://stackoverflow.com/questions/37953830/spark-sql-performance-join-on-value-between-min-and-max?answertab=active#tab-top 129 | 130 | ### Option #1 131 | 132 | 133 | ```python 134 | join_condition = [ 135 | records_table['inet'] >= geo_loc_table['ipstart'], 136 | records_table['inet'] <= geo_loc_table['ipend'], 137 | ] 138 | 139 | df = records_table.join(geo_loc_table, join_condition, "left") 140 | 141 | df.toPandas() 142 | ``` 143 | 144 | 145 | 146 | 147 |
148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 |
idinetipstartipendloc
01111136bar
12383759baz
23503759baz
186 |
187 | 188 | 189 | 190 | 191 | ```python 192 | df.explain() 193 | ``` 194 | 195 | == Physical Plan == 196 | BroadcastNestedLoopJoin BuildRight, LeftOuter, ((inet#252L >= ipstart#245L) && (inet#252L <= ipend#246L)) 197 | :- Scan ExistingRDD[id#251L,inet#252L] 198 | +- BroadcastExchange IdentityBroadcastMode 199 | +- Scan ExistingRDD[ipstart#245L,ipend#246L,loc#247] 200 | 201 | 202 | ### Option #2 203 | 204 | 205 | ```python 206 | from bisect import bisect_right 207 | from pyspark.sql.functions import udf 208 | from pyspark.sql.types import LongType 209 | 210 | geo_start_bd = spark.sparkContext.broadcast(map(lambda x: x.ipstart, geo_loc_table 211 | .select("ipstart") 212 | .orderBy("ipstart") 213 | .collect() 214 | )) 215 | 216 | def find_le(x): 217 | 'Find rightmost value less than or equal to x' 218 | i = bisect_right(geo_start_bd.value, x) 219 | if i: 220 | return geo_start_bd.value[i-1] 221 | return None 222 | 223 | records_table_with_ipstart = records_table.withColumn( 224 | "ipstart", udf(find_le, LongType())("inet") 225 | ) 226 | 227 | df = records_table_with_ipstart.join(geo_loc_table, ["ipstart"], "left") 228 | 229 | df.toPandas() 230 | ``` 231 | 232 | 233 | 234 | 235 |
236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 |
ipstartidinetipendloc
03723859baz
13735059baz
21111136bar
274 |
275 | 276 | 277 | 278 | 279 | ```python 280 | df.explain() 281 | ``` 282 | 283 | == Physical Plan == 284 | *(4) Project [ipstart#272L, id#251L, inet#252L, ipend#246L, loc#247] 285 | +- SortMergeJoin [ipstart#272L], [ipstart#245L], LeftOuter 286 | :- *(2) Sort [ipstart#272L ASC NULLS FIRST], false, 0 287 | : +- Exchange hashpartitioning(ipstart#272L, 200) 288 | : +- *(1) Project [id#251L, inet#252L, pythonUDF0#281L AS ipstart#272L] 289 | : +- BatchEvalPython [find_le(inet#252L)], [id#251L, inet#252L, pythonUDF0#281L] 290 | : +- Scan ExistingRDD[id#251L,inet#252L] 291 | +- *(3) Sort [ipstart#245L ASC NULLS FIRST], false, 0 292 | +- Exchange hashpartitioning(ipstart#245L, 200) 293 | +- Scan ExistingRDD[ipstart#245L,ipend#246L,loc#247] 294 | 295 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 2.12 - Performing Joins .md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import functions as F 8 | ``` 9 | 10 | ### Template 11 | 12 | 13 | ```python 14 | spark = ( 15 | SparkSession.builder 16 | .master("local") 17 | .appName("Section 2.12 - Performing Joins (clean one)") 18 | .config("spark.some.config.option", "some-value") 19 | .getOrCreate() 20 | ) 21 | 22 | sc = spark.sparkContext 23 | ``` 24 | 25 | 26 | ```python 27 | pets = spark.createDataFrame( 28 | [ 29 | (1, 1, 'Bear'), 30 | (2, 1, 'Chewie'), 31 | (3, 2, 'Roger'), 32 | ], ['id', 'breed_id', 'nickname'] 33 | ) 34 | 35 | pets.toPandas() 36 | ``` 37 | 38 | 39 | 40 | 41 |
42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |
idbreed_idnickname
011Bear
121Chewie
232Roger
72 |
73 | 74 | 75 | 76 | 77 | ```python 78 | breeds = spark.createDataFrame( 79 | [ 80 | (1, 'Pitbull', 10), 81 | (2, 'Corgie', 20), 82 | ], ['id', 'name', 'average_height'] 83 | ) 84 | 85 | breeds.toPandas() 86 | ``` 87 | 88 | 89 | 90 | 91 |
92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 |
idnameaverage_height
01Pitbull10
12Corgie20
116 |
117 | 118 | 119 | 120 | ### Performing Joins 121 | There are typically two types of joins in sql: 122 | 1. `Inner Join` is where 2 tables are joined on the basis of common columns mentioned in the ON clause. 123 | 124 | ie. `left.join(right, left[lkey] == right[rkey])` 125 | 126 | 127 | 2. `Natural Join` is where 2 tables are joined on the basis of all common columns. 128 | 129 | ie. `left.join(right, 'key')` 130 | 131 | source: https://stackoverflow.com/a/8696402 132 | 133 | **Question:** 134 | Which is better? Is it just a style choice? 135 | 136 | ### Option 1: Inner Join (w/Different Keys) 137 | 138 | 139 | ```python 140 | join_condition = pets['breed_id'] == breeds['id'] 141 | 142 | df = pets.join(breeds, join_condition) 143 | 144 | df.toPandas() 145 | ``` 146 | 147 | 148 | 149 | 150 |
151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 |
idbreed_idnicknameidnameaverage_height
011Bear1Pitbull10
121Chewie1Pitbull10
232Roger2Corgie20
193 |
194 | 195 | 196 | 197 | **What Happened**: 198 | * We have 2 columns named `id`, but they refer to different things. 199 | * We can't uniquely reference these 2 columns (easily, still possible). 200 | * Pretty long `join expression`. 201 | 202 | This is not ideal. Let's try `renaming` it before the join? 203 | 204 | ### Option 2: Inner Join (w/Same Keys) 205 | 206 | 207 | ```python 208 | breeds = breeds.withColumnRenamed('id', 'breed_id') 209 | join_condition = pets['breed_id'] == breeds['breed_id'] 210 | 211 | df = pets.join(breeds, join_condition) 212 | 213 | df.toPandas() 214 | ``` 215 | 216 | 217 | 218 | 219 |
220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 |
idbreed_idnicknamebreed_idnameaverage_height
011Bear1Pitbull10
121Chewie1Pitbull10
232Roger2Corgie20
262 |
263 | 264 | 265 | 266 | **What Happened**: 267 | * We have 2 columns named `breed_id` which mean the same thing! 268 | * Duplicate columns appear in the result. 269 | * Still pretty long `join expression`. 270 | 271 | This is again not ideal. 272 | 273 | ### Option 3: Natural Join 274 | 275 | 276 | ```python 277 | df = pets.join(breeds, 'breed_id') 278 | df.toPandas() 279 | ``` 280 | 281 | 282 | 283 | 284 |
285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 |
breed_ididnicknamenameaverage_height
011BearPitbull10
112ChewiePitbull10
223RogerCorgie20
323 |
324 | 325 | 326 | 327 | **What Happened**: 328 | * No duplicated column! 329 | * No extra column! 330 | * Only a single string is required for the `join expression` (or a list of columns/keys, if joining on multiple columns/keys). 331 | 332 | ### Summary 333 | 334 | Performing a `natural join` was the most elegant solution, in terms of both the `join expression` and the resulting `df`. 335 | 336 | **NOTE:** These rules also apply to the other join types (i.e. `left` and `right`). 337 | 338 | *Some might argue that you will need both join keys in the result for further transformations, such as filtering on only the left or right key, but I would recommend doing that filtering before the join, as it is more performant.* 339 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 2 - Exploring the Spark APIs/Section 2.7 - Equality Statements in Spark and Comparison with Nulls.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql import types as T 8 | 9 | from pyspark.sql import functions as F 10 | 11 | from datetime import datetime 12 | from decimal import Decimal 13 | ``` 14 | 15 | ### Template 16 | 17 | 18 | ```python 19 | spark = ( 20 | SparkSession.builder 21 | .master("local") 22 | .appName("Section 2.7 - Equality Statements in Spark and Comparison with Nulls") 23 | .config("spark.some.config.option", "some-value") 24 | .getOrCreate() 25 | ) 26 | 27 | sc = spark.sparkContext 28 | 29 | import os 30 | 31 | data_path = "/data/pets.csv" 32 | base_path = os.path.dirname(os.getcwd()) 33 | path = base_path + data_path 34 | ``` 35 | 36 | 37 | ```python 38 | pets = spark.read.csv(path, header=True) 39 | pets.toPandas() 40 | ``` 41 | 42 | 43 | 44 | 45 |
46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 |
idbreed_idnicknamebirthdayagecolor
011King2014-11-22 12:30:315brown
123Argus2016-11-22 10:05:1010None
231Chewie2016-11-22 10:05:1015None
332Maple2018-11-22 10:05:1017white
97 |
98 | 99 | 100 | 101 | ### Filtering Data 102 | 103 | When you want to filter data with more than just one expression, there are a couple of gotchas that you will need to be careful of. 104 | 105 | ### Case 1: Multiple Conditions 106 | 107 | 108 | ```python 109 | ( 110 | pets 111 | .where( 112 | (F.col('breed_id') == 1) & 113 | (F.col('color') == 'brown') & 114 | F.col('color').isin('brown') 115 | ) 116 | .toPandas() 117 | ) 118 | ``` 119 | 120 | 121 | 122 | 123 |
124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 |
idbreed_idnicknamebirthdayagecolor
011King2014-11-22 12:30:315brown
148 |
149 | 150 | 151 | 152 | **What Happened?** 153 | 154 | When there is more than 1 condition, you will need to wrap each condition in `()` brackets, and you must use Python's [bitwise operators](https://www.tutorialspoint.com/python/bitwise_operators_example.htm) instead of its [logical operators](https://www.tutorialspoint.com/python/logical_operators_example.htm). 155 | 156 | **Why?** 157 | 158 | Python does not let libraries overload the `logical operators` (`and`, `or`, `not`), so Spark's `Column` class overloads the `bitwise operators` (`&`, `|`, `~`) instead. The `()` brackets are needed because the bitwise operators bind more tightly than comparison operators such as `==`. 159 | 160 | ### Case 2: Nested Conditions 161 | 162 | 163 | ```python 164 | ( 165 | pets 166 | .where( 167 | ( 168 | F.col('breed_id').isin([1, 2]) & 169 | F.col('breed_id').isNotNull() 170 | ) | 171 | (F.col('color') == 'white') 172 | ) 173 | .toPandas() 174 | ) 175 | ``` 176 | 177 | 178 | 179 | 180 |
181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 |
idbreed_idnicknamebirthdayagecolor
011King2014-11-22 12:30:315brown
131Chewie2016-11-22 10:05:1015None
232Maple2018-11-22 10:05:1017white
223 |
224 | 225 | 226 | 227 | **What Happened?** 228 | 229 | Similar to before, nested conditions will need to be wrapped with `()` as well. 230 | 231 | ### Case 3: Equality Statements with `Null` Values, (use `isNotNull()` and `isNull()`) 232 | 233 | 234 | ```python 235 | ( 236 | pets 237 | .withColumn('result', F.col('color') != 'white') 238 | .withColumn( 239 | 'result_2', 240 | (F.col('color') != 'white') & 241 | (F.col('color').isNotNull()) 242 | ) 243 | .toPandas() 244 | ) 245 | ``` 246 | 247 | 248 | 249 | 250 |
251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 |
idbreed_idnicknamebirthdayagecolorresultresult_2
011King2014-11-22 12:30:315brownTrueTrue
123Argus2016-11-22 10:05:1010NoneNoneFalse
231Chewie2016-11-22 10:05:1015NoneNoneFalse
332Maple2018-11-22 10:05:1017whiteFalseFalse
312 |
313 | 314 | 315 | 316 | **What Happened?** 317 | 318 | If you do not come from a `sql` background any comparison with `Null` will also be `Null`, unless you specifically use the `Null` comparisons. 319 | 320 | The 2 `Null` comparisons are `isNotNull()` and `isNull()`. 321 | 322 | ### Summary 323 | 324 | * In spark when using a more involved conditional expression, you will need to wrap each condition with `()` brackets and use the **bitwise operations** in Python. 325 | 326 | * Be explicit with you're performing conditional transformations on columns that can be `Null`. 327 | -------------------------------------------------------------------------------- /src/Chapter 1 - Basics/Section 4 - More Comfortable with SQL?.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Library Imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from pyspark.sql import SparkSession\n", 17 | "from pyspark.sql import types as T" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### Template" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/html": [ 35 | "
\n", 36 | "\n", 49 | "\n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | "
idspecies_idnamebirthdaycolor
011King2014-11-22 12:30:31brown
123Argus2016-11-22 10:05:10None
\n", 79 | "
" 80 | ], 81 | "text/plain": [ 82 | " id species_id name birthday color\n", 83 | "0 1 1 King 2014-11-22 12:30:31 brown\n", 84 | "1 2 3 Argus 2016-11-22 10:05:10 None" 85 | ] 86 | }, 87 | "execution_count": 2, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "spark = (\n", 94 | " SparkSession.builder\n", 95 | " .master(\"local\")\n", 96 | " .appName(\"Section 4 - More Comfortable with SQL?\")\n", 97 | " .config(\"spark.some.config.option\", \"some-value\")\n", 98 | " .getOrCreate()\n", 99 | ")\n", 100 | "\n", 101 | "sc = spark.sparkContext\n", 102 | "\n", 103 | "import os\n", 104 | "\n", 105 | "data_path = \"/data/pets.csv\"\n", 106 | "base_path = os.path.dirname(os.getcwd())\n", 107 | "path = base_path + data_path\n", 108 | "\n", 109 | "df = spark.read.csv(path, header=True)\n", 110 | "df.toPandas()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "### Register DataFrame as a SQL Table" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 3, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "df.createOrReplaceTempView(\"pets\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "### What Happened?\n", 134 | "The first step in making a `df` queryable with `SQL` is to **register** the table as a sql table.\n", 135 | "\n", 136 | "This particular function will **replace** any previously registered **local** table named `pets` as a result. There are other functions that will register a dataframe with slightly different behavior. You can check the reference docs if this isn't the desired behavior: [docs](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.createGlobalTempView)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "### Let Write a SQL Query!" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 4, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/html": [ 154 | "
\n", 155 | "\n", 168 | "\n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | "
idspecies_idnamebirthdaycolor
023Argus2016-11-22 10:05:10None
\n", 190 | "
" 191 | ], 192 | "text/plain": [ 193 | " id species_id name birthday color\n", 194 | "0 2 3 Argus 2016-11-22 10:05:10 None" 195 | ] 196 | }, 197 | "execution_count": 4, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "df_2 = spark.sql(\"\"\"\n", 204 | "SELECT \n", 205 | " *\n", 206 | "FROM pets\n", 207 | "WHERE name = 'Argus'\n", 208 | "\"\"\")\n", 209 | "\n", 210 | "df_2.toPandas()" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "### What Happened?\n", 218 | "Once your `df` is registered, call the spark `sc` function on your `spark session` object. It takes a `sql string` as an input and outputs a new `df`." 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "### Conclusion?\n", 226 | "If you're more comfortable with writing `sql` than python/spark code, then you can do so with a spark `df`! We do this by:\n", 227 | "1. Register the `df` with `df.createOrReplaceTempView('table')`.\n", 228 | "2. Call the `sql` function on your `spark session` with a `sql string` as an input.\n", 229 | "3. You're done!" 230 | ] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 2", 236 | "language": "python", 237 | "name": "python2" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 2 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython2", 249 | "version": "2.7.15" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 2 254 | } 255 | -------------------------------------------------------------------------------- /gitbook/src/Chapter 4 - Window Objects/Section 1 - Default Behaviour of a Window Object.md: -------------------------------------------------------------------------------- 1 | 2 | ### Library Imports 3 | 4 | 5 | ```python 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql.window import Window 8 | from pyspark.sql import functions as F 9 | 10 | from datetime import datetime 11 | ``` 12 | 13 | ### Template 14 | 15 | 16 | ```python 17 | spark = ( 18 | SparkSession.builder 19 | .master("local") 20 | .appName("Exploring Joins") 21 | .config("spark.some.config.option", "some-value") 22 | .getOrCreate() 23 | ) 24 | 25 | sc = spark.sparkContext 26 | ``` 27 | 28 | ### Initial Datasets 29 | 30 | 31 | ```python 32 | pets = spark.createDataFrame( 33 | [ 34 | (1, 1, datetime(2018, 1, 1, 1 ,1, 1), 'Bear', 5), 35 | (2, 1, datetime(2015, 1, 1, 1 ,1, 1), 'Chewie', 10), 36 | (3, 1, datetime(2015, 1, 1, 1 ,1, 1), 'Roger', 15), 37 | ], ['id', 'breed_id', 'birthday', 'nickname', 'age'] 38 | ) 39 | 40 | pets.toPandas() 41 | ``` 42 | 43 | 44 | 45 | 46 |
47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 |
idbreed_idnicknameage
011Bear5
121Chewie10
231Roger15
81 |
82 | 83 | 84 | 85 | ### Scenario #1 86 | 87 | No `orderBy` specified for `window` object. 88 | 89 | 90 | ```python 91 | window_1 = Window.partitionBy('breed_id') 92 | 93 | df_1 = pets.withColumn('foo', (F.sum(F.col('age')).over(window_1))) 94 | 95 | df_1.toPandas() 96 | ``` 97 | 98 | 99 | 100 | 101 |
102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 |
idbreed_idnicknameagefoo
011Bear530
121Chewie1030
231Roger1530
140 |
141 | 142 | 143 | 144 | ### Scenario #2 145 | 146 | `orderBy` with no `rowsBetween` specified for `window` object. 147 | 148 | 149 | ```python 150 | window_2 = ( 151 | Window 152 | .partitionBy('breed_id') 153 | .orderBy(F.col('id')) 154 | ) 155 | 156 | df_2 = pets.withColumn('foo', (F.sum(F.col('age')).over(window_2))) 157 | 158 | df_2.toPandas() 159 | ``` 160 | 161 | 162 | 163 | 164 |
165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 |
idbreed_idnicknameagefoo
011Bear55
121Chewie1015
231Roger1530
203 |
204 | 205 | 206 | 207 | ### Scenario #3 208 | 209 | `orderBy` with a `rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)` specified for `window` object. 210 | 211 | 212 | ```python 213 | window_3 = ( 214 | Window 215 | .partitionBy('breed_id') 216 | .orderBy(F.col('id')) 217 | .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) 218 | ) 219 | 220 | df_3 = pets.withColumn('foo', (F.sum(F.col('age')).over(window_3))) 221 | 222 | df_3.toPandas() 223 | ``` 224 | 225 | 226 | 227 | 228 |
229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 |
idbreed_idnicknameagefoo
011Bear530
121Chewie1030
231Roger1530
267 |
268 | 269 | 270 | 271 | ## Why is This? 272 | 273 | 274 | ```python 275 | df_1.explain() 276 | ``` 277 | 278 | == Physical Plan == 279 | Window [sum(age#3L) windowspecdefinition(breed_id#1L, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS foo#9L], [breed_id#1L] 280 | +- *(1) Sort [breed_id#1L ASC NULLS FIRST], false, 0 281 | +- Exchange hashpartitioning(breed_id#1L, 200) 282 | +- Scan ExistingRDD[id#0L,breed_id#1L,nickname#2,age#3L] 283 | 284 | 285 | 286 | ```python 287 | df_2.explain() 288 | ``` 289 | 290 | == Physical Plan == 291 | Window [sum(age#3L) windowspecdefinition(breed_id#1L, id#0L ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS foo#216L], [breed_id#1L], [id#0L ASC NULLS FIRST] 292 | +- *(1) Sort [breed_id#1L ASC NULLS FIRST, id#0L ASC NULLS FIRST], false, 0 293 | +- Exchange hashpartitioning(breed_id#1L, 200) 294 | +- Scan ExistingRDD[id#0L,breed_id#1L,nickname#2,age#3L] 295 | 296 | 297 | 298 | ```python 299 | df_3.explain() 300 | ``` 301 | 302 | == Physical Plan == 303 | Window [sum(age#3L) windowspecdefinition(breed_id#1L, id#0L ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS foo#423L], [breed_id#1L], [id#0L ASC NULLS FIRST] 304 | +- *(1) Sort [breed_id#1L ASC NULLS FIRST, id#0L ASC NULLS FIRST], false, 0 305 | +- Exchange hashpartitioning(breed_id#1L, 200) 306 | +- Scan ExistingRDD[id#0L,breed_id#1L,nickname#2,age#3L] 307 | 308 | 309 | ### TL;DR 310 | 311 | By looking at the **Physical Plan**, the default behaviour for `Window.partitionBy('col_1').orderBy('col_2')` without a `.rowsBetween()` is to do `.rowsBetween(Window.unboundedPreceding, Window.currentRow)`. 312 | 313 | Looking at the scala code we can see that this is indeed the default and intended behavior, https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala#L36-L38. 314 | 315 | ```scala 316 | * @note When ordering is not defined, an unbounded window frame (rowFrame, unboundedPreceding, 317 | * unboundedFollowing) is used by default. When ordering is defined, a growing window frame 318 | * (rangeFrame, unboundedPreceding, currentRow) is used by default. 319 | ``` 320 | 321 | **Problem:** 322 | This will cause problems if you're care about all the rows in the partitions. 
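
A minimal sketch of the usual fix, mirroring Scenario #3 above: whenever you add an `orderBy` to a window but still want the aggregate to see every row in the partition, spell the frame out explicitly instead of relying on the default growing frame (`full_partition_window` and `foo_total` are just illustrative names):

```python
# Explicit frame: the sum covers the whole partition for every row,
# instead of the default running (unboundedPreceding -> currentRow) frame.
full_partition_window = (
    Window
    .partitionBy('breed_id')
    .orderBy(F.col('id'))
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

pets.withColumn('foo_total', F.sum(F.col('age')).over(full_partition_window)).toPandas()
```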
323 | 324 | -------------------------------------------------------------------------------- /src/Chapter 2 - Exploring the Spark APIs/Section 1.3 - Maps and Dictionaries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Library Imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from pyspark.sql import SparkSession\n", 17 | "from pyspark.sql import types as T\n", 18 | "\n", 19 | "from pyspark.sql import functions as F\n", 20 | "\n", 21 | "from datetime import datetime\n", 22 | "from decimal import Decimal" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Template" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "spark = (\n", 39 | " SparkSession.builder\n", 40 | " .master(\"local\")\n", 41 | " .appName(\"Section 1.3 - Maps and Dictionaries\")\n", 42 | " .config(\"spark.some.config.option\", \"some-value\")\n", 43 | " .getOrCreate()\n", 44 | ")\n", 45 | "\n", 46 | "sc = spark.sparkContext\n", 47 | "\n", 48 | "def get_csv_schema(*args):\n", 49 | " return T.StructType([\n", 50 | " T.StructField(*arg)\n", 51 | " for arg in args\n", 52 | " ])\n", 53 | "\n", 54 | "def read_csv(fname, schema):\n", 55 | " return spark.read.csv(\n", 56 | " path=fname,\n", 57 | " header=True,\n", 58 | " schema=get_csv_schema(*schema)\n", 59 | " )\n", 60 | "\n", 61 | "import os\n", 62 | "\n", 63 | "data_path = \"/data/pets.csv\"\n", 64 | "base_path = os.path.dirname(os.getcwd())\n", 65 | "path = base_path + data_path" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "+---+--------+--------+-------------------+---+-----+------+\n", 78 | "| id|breed_id|nickname| birthday|age|color|weight|\n", 79 | "+---+--------+--------+-------------------+---+-----+------+\n", 80 | "| 1| 1| King|2014-11-22 12:30:31| 5|brown| 10.0|\n", 81 | "| 2| 3| Argus|2016-11-22 10:05:10| 10| null| 5.5|\n", 82 | "| 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12|\n", 83 | "| 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3.4|\n", 84 | "| 4| 2| null|2019-01-01 10:05:10| 13| null| 10|\n", 85 | "+---+--------+--------+-------------------+---+-----+------+\n", 86 | "\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "pets = spark.read.csv(path, header=True)\n", 92 | "pets.show()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Maps and Dictionaries\n" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "### Case 1: Creating a Mapping from Existing Columns" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "+---+--------+------------+-------------------+---+-----+------+--------------------+--------------------+\n", 119 | "| id|breed_id| nickname| birthday|age|color|weight| {nickname:age}| {nickname:age} 2|\n", 120 | "+---+--------+------------+-------------------+---+-----+------+--------------------+--------------------+\n", 121 | "| 1| 1| King|2014-11-22 12:30:31| 5|brown| 10.0| [King -> 5]| 
[King -> 5]|\n", 122 | "| 2| 3| Argus|2016-11-22 10:05:10| 10| null| 5.5| [Argus -> 10]| [Argus -> 10]|\n", 123 | "| 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12| [Chewie -> 15]| [Chewie -> 15]|\n", 124 | "| 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3.4| [Maple -> 17]| [Maple -> 17]|\n", 125 | "| 4| 2|Unknown Name|2019-01-01 10:05:10| 13| null| 10|[Unknown Name -> 13]|[Unknown Name -> 13]|\n", 126 | "+---+--------+------------+-------------------+---+-----+------+--------------------+--------------------+\n", 127 | "\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "(\n", 133 | " pets\n", 134 | " .fillna({\n", 135 | " 'nickname': 'Unknown Name',\n", 136 | " 'age': 'Unknown Age',\n", 137 | " })\n", 138 | " .withColumn('{nickname:age}', F.create_map(F.col('nickname'), F.col('age')))\n", 139 | " .withColumn('{nickname:age} 2', F.create_map('nickname', 'age'))\n", 140 | " .show()\n", 141 | ")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "**What Happened?**\n", 149 | "\n", 150 | "You can create a column of map types using either `columnary expressions` (we'll learn what column expressions are later) or column names." 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "### Case 2: Creating a Mapping from Constant Values" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 5, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "+---+--------+------------+-------------------+---+-----+------+--------------+\n", 170 | "| id|breed_id| nickname| birthday|age|color|weight|{nickname:age}|\n", 171 | "+---+--------+------------+-------------------+---+-----+------+--------------+\n", 172 | "| 1| 1| King|2014-11-22 12:30:31| 5|brown| 10.0|[key -> value]|\n", 173 | "| 2| 3| Argus|2016-11-22 10:05:10| 10| null| 5.5|[key -> value]|\n", 174 | "| 3| 1| Chewie|2016-11-22 10:05:10| 15| null| 12|[key -> value]|\n", 175 | "| 3| 2| Maple|2018-11-22 10:05:10| 17|white| 3.4|[key -> value]|\n", 176 | "| 4| 2|Unknown Name|2019-01-01 10:05:10| 13| null| 10|[key -> value]|\n", 177 | "+---+--------+------------+-------------------+---+-----+------+--------------+\n", 178 | "\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "(\n", 184 | " pets\n", 185 | " .fillna({\n", 186 | " 'nickname': 'Unknown Name',\n", 187 | " 'age': 'Unknown Age',\n", 188 | " })\n", 189 | " .withColumn('{nickname:age}', F.create_map(F.lit('key'), F.lit('value')))\n", 190 | " .show()\n", 191 | ")" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "**What Happened?**\n", 199 | "\n", 200 | "You can create a column of map types of literals using the `columnary expression` `F.lit()`, we will learn this later on. Notice that each key/value needs to be a `columnal expression`? This will be a common theme throughout Spark." 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### Summary\n", 208 | "\n", 209 | "* It is very simple to create map data in Spark.\n", 210 | "* You can do so with both existing columns or constant values.\n", 211 | "* If constant values are used, then each value must be a `columnary expression`." 
212 | ] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 2", 218 | "language": "python", 219 | "name": "python2" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 2 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython2", 231 | "version": "2.7.15" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 2 236 | } 237 | -------------------------------------------------------------------------------- /src/Chapter 1 - Basics/Section 3 - Reading your First Dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Library Imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from pyspark.sql import SparkSession\n", 17 | "from pyspark.sql import types as T" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "The above also shows you the \"best practices\" for importing these components into your program.\n", 25 | "\n", 26 | "*some of the above imports will be explained later, just know this is how you should import these functions into your Spark application." 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "These are the essential `imports` that you will need for any `PySpark` program. \n", 34 | "\n", 35 | "**`SparkSession`** \n", 36 | "The `SparkSession` is how you begin a Spark application. This is where you provide some configuration for your Spark program.\n", 37 | "\n", 38 | "**`pyspark.sql.functions`** \n", 39 | "You will find that all your data wrangling/analysis will mostly be done by chaining together multiple `functions`. If you find that you get your desired transformations with the base functions, you should:\n", 40 | "1. Look through the API docs again.\n", 41 | "2. Ask Google.\n", 42 | "3. Write a `user defined function` (`udf`).\n", 43 | "\n", 44 | "**`pyspark.sql.types`** \n", 45 | "When working with spark, you will need to define the type of data for each column you are working with. \n", 46 | "\n", 47 | "The possible types that Spark accepts are listed here: [Spark types](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.types)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Hello World" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "spark = (\n", 64 | " SparkSession.builder\n", 65 | " .master(\"local\")\n", 66 | " .appName(\"Section 3 - Reading your First Dataset\")\n", 67 | " .config(\"spark.some.config.option\", \"some-value\")\n", 68 | " .getOrCreate()\n", 69 | ")\n", 70 | "\n", 71 | "sc = spark.sparkContext" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Create a `SparkSession`. No need to create `SparkContext` as you automatically get it as part of the `SparkSession`." 
79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### Read in Data (CSV)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# define the structure of your data inside the CSV file\n", 95 | "def get_csv_schema(*args):\n", 96 | " return T.StructType([\n", 97 | " T.StructField(*arg)\n", 98 | " for arg in args\n", 99 | " ])\n", 100 | "\n", 101 | "# read in your csv file with enforcing a schema\n", 102 | "def read_csv(fname, schema):\n", 103 | " return spark.read.csv(\n", 104 | " path=fname,\n", 105 | " header=True,\n", 106 | " schema=get_csv_schema(*schema)\n", 107 | " )" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "import os\n", 117 | "\n", 118 | "data_path = \"/data\"\n", 119 | "pets_path = \"/pets.csv\"\n", 120 | "base_path = os.path.dirname(os.getcwd())\n", 121 | "\n", 122 | "path = base_path + data_path + pets_path\n", 123 | "df = read_csv(\n", 124 | " fname=path,\n", 125 | " schema=[\n", 126 | " (\"id\", T.LongType(), False),\n", 127 | " (\"breed_id\", T.LongType(), True),\n", 128 | " (\"name\", T.StringType(), True),\n", 129 | " (\"birthday\", T.TimestampType(), True),\n", 130 | " (\"color\", T.StringType(), True)\n", 131 | " ]\n", 132 | ")" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 5, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/html": [ 143 | "
\n", 144 | "\n", 157 | "\n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | "
idspecies_idnamebirthdaycolor
011King2014-11-22 12:30:31brown
123Argus2016-11-22 10:05:10None
\n", 187 | "
" 188 | ], 189 | "text/plain": [ 190 | " id species_id name birthday color\n", 191 | "0 1 1 King 2014-11-22 12:30:31 brown\n", 192 | "1 2 3 Argus 2016-11-22 10:05:10 None" 193 | ] 194 | }, 195 | "execution_count": 5, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "df.toPandas()" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "### What Happened?\n", 209 | "Here we read in a `csv` file and put it into a `DataFrame (DF)`: this is one of the three datasets that Spark allows you to use. The other two are `Resilient Distributed Dataset (RDD)` and `Dataset`. `DF`s have replaced `RDD`s as more features have been brought out in version `2.x` of Spark. You should be able to perform anything with `DataFrames` now, if not you will have to work with `RDD`s, which I will not cover.\n", 210 | "\n", 211 | "Spark gives you the option to automatically infer the schema and types of columns in your dataset. But you should always specify a `schema` for the data that you're reading in. For each column in the `csv` file we specified:\n", 212 | "* the `name` of the column\n", 213 | "* the `data type` of the column\n", 214 | "* if `null` values can appear in the column" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "### Conclusion\n", 222 | "Congratulations! You've read in your first dataset in Spark. Next we'll look at how you can perform transformations on this dataset :)." 223 | ] 224 | } 225 | ], 226 | "metadata": { 227 | "kernelspec": { 228 | "display_name": "Python 2", 229 | "language": "python", 230 | "name": "python2" 231 | }, 232 | "language_info": { 233 | "codemirror_mode": { 234 | "name": "ipython", 235 | "version": 2 236 | }, 237 | "file_extension": ".py", 238 | "mimetype": "text/x-python", 239 | "name": "python", 240 | "nbconvert_exporter": "python", 241 | "pygments_lexer": "ipython2", 242 | "version": "2.7.15" 243 | } 244 | }, 245 | "nbformat": 4, 246 | "nbformat_minor": 2 247 | } 248 | -------------------------------------------------------------------------------- /src/Chapter 2 - Exploring the Spark APIs/Section 2.5 - Casting Columns to Different Type.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Library Imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from pyspark.sql import SparkSession\n", 17 | "from pyspark.sql import types as T\n", 18 | "\n", 19 | "from pyspark.sql import functions as F\n", 20 | "\n", 21 | "from datetime import datetime\n", 22 | "from decimal import Decimal" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Template" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "spark = (\n", 39 | " SparkSession.builder\n", 40 | " .master(\"local\")\n", 41 | " .appName(\"Section 2.5 - Casting Columns to Different Type\")\n", 42 | " .config(\"spark.some.config.option\", \"some-value\")\n", 43 | " .getOrCreate()\n", 44 | ")\n", 45 | "\n", 46 | "sc = spark.sparkContext\n", 47 | "\n", 48 | "import os\n", 49 | "\n", 50 | "data_path = \"/data/pets.csv\"\n", 51 | "base_path = os.path.dirname(os.getcwd())\n", 52 | "path = base_path + data_path" 53 | ] 54 | }, 55 
| { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/html": [ 63 | "
\n", 64 | "\n", 77 | "\n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | "
idbreed_idnicknamebirthdayagecolor
011King2014-11-22 12:30:315brown
123Argus2016-11-22 10:05:1010None
231Chewie2016-11-22 10:05:1015None
\n", 119 | "
" 120 | ], 121 | "text/plain": [ 122 | " id breed_id nickname birthday age color\n", 123 | "0 1 1 King 2014-11-22 12:30:31 5 brown\n", 124 | "1 2 3 Argus 2016-11-22 10:05:10 10 None\n", 125 | "2 3 1 Chewie 2016-11-22 10:05:10 15 None" 126 | ] 127 | }, 128 | "execution_count": 3, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "pets = spark.read.csv(path, header=True)\n", 135 | "pets.toPandas()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "### Casting Columns in Different Types\n", 143 | "\n", 144 | "Sometimes your data can be read in as all `unicode`/`string` in which you will need to cast them to the correct type. Or Simply you want to change the type of a column as a part of your transformation." 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "### Option 1 - `cast()`" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 4, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/html": [ 162 | "
\n", 163 | "\n", 176 | "\n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | "
birthdaybirthday_datebirthday_date_2
02014-11-22 12:30:312014-11-222014-11-22
12016-11-22 10:05:102016-11-222016-11-22
22016-11-22 10:05:102016-11-222016-11-22
\n", 206 | "
" 207 | ], 208 | "text/plain": [ 209 | " birthday birthday_date birthday_date_2\n", 210 | "0 2014-11-22 12:30:31 2014-11-22 2014-11-22\n", 211 | "1 2016-11-22 10:05:10 2016-11-22 2016-11-22\n", 212 | "2 2016-11-22 10:05:10 2016-11-22 2016-11-22" 213 | ] 214 | }, 215 | "execution_count": 4, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "(\n", 222 | " pets\n", 223 | " .select('birthday')\n", 224 | " .withColumn('birthday_date', F.col('birthday').cast('date'))\n", 225 | " .withColumn('birthday_date_2', F.col('birthday').cast(T.DateType()))\n", 226 | " .toPandas()\n", 227 | ")" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "**What Happened?**\n", 235 | "\n", 236 | "There are 2 ways that you can `cast` a column.\n", 237 | "1. Use a string (`cast('date')`).\n", 238 | "2. Use the spark types (`cast(T.DateType())`).\n", 239 | "\n", 240 | "I tend to use a string as it's shorter, one less import and in more editors there will be syntax highlighting for the string. " 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "### Summary\n", 248 | "\n", 249 | "* We learnt about two ways of casting a column.\n", 250 | "* The first way is a bit more cleaner IMO." 251 | ] 252 | } 253 | ], 254 | "metadata": { 255 | "kernelspec": { 256 | "display_name": "Python 2", 257 | "language": "python", 258 | "name": "python2" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 2 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython2", 270 | "version": "2.7.15" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 2 275 | } 276 | -------------------------------------------------------------------------------- /src/Chapter 2 - Exploring the Spark APIs/Section 2 - Performing your First Transformations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Library Imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from pyspark.sql import SparkSession\n", 17 | "from pyspark.sql import types as T\n", 18 | "\n", 19 | "from pyspark.sql import functions as F\n", 20 | "\n", 21 | "from datetime import datetime\n", 22 | "from decimal import Decimal" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Template" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "spark = (\n", 39 | " SparkSession.builder\n", 40 | " .master(\"local\")\n", 41 | " .appName(\"Section 2 - Performing your First Transformations\")\n", 42 | " .config(\"spark.some.config.option\", \"some-value\")\n", 43 | " .getOrCreate()\n", 44 | ")\n", 45 | "\n", 46 | "sc = spark.sparkContext\n", 47 | "\n", 48 | "import os\n", 49 | "\n", 50 | "data_path = \"/data/pets.csv\"\n", 51 | "base_path = os.path.dirname(os.getcwd())\n", 52 | "path = base_path + data_path" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/html": [ 63 | "
\n", 64 | "\n", 77 | "\n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | "
idbreed_idnicknamebirthdayagecolor
011King2014-11-22 12:30:315brown
123Argus2016-11-22 10:05:1010None
231Chewie2016-11-22 10:05:1015None
\n", 119 | "
" 120 | ], 121 | "text/plain": [ 122 | " id breed_id nickname birthday age color\n", 123 | "0 1 1 King 2014-11-22 12:30:31 5 brown\n", 124 | "1 2 3 Argus 2016-11-22 10:05:10 10 None\n", 125 | "2 3 1 Chewie 2016-11-22 10:05:10 15 None" 126 | ] 127 | }, 128 | "execution_count": 3, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "pets = spark.read.csv(path, header=True)\n", 135 | "pets.toPandas()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "### Transformation" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 4, 148 | "metadata": { 149 | "scrolled": true 150 | }, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/html": [ 155 | "
\n", 156 | "\n", 169 | "\n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | "
pet_idbreed_idnicknamebirthdayagecolorbirthday_dateowned_by
023Argus2016-11-22 10:05:1010None2016-11-22me
131Chewie2016-11-22 10:05:1015None2016-11-22me
\n", 208 | "
" 209 | ], 210 | "text/plain": [ 211 | " pet_id breed_id nickname birthday age color birthday_date \\\n", 212 | "0 2 3 Argus 2016-11-22 10:05:10 10 None 2016-11-22 \n", 213 | "1 3 1 Chewie 2016-11-22 10:05:10 15 None 2016-11-22 \n", 214 | "\n", 215 | " owned_by \n", 216 | "0 me \n", 217 | "1 me " 218 | ] 219 | }, 220 | "execution_count": 4, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "(\n", 227 | " pets\n", 228 | " .withColumn('birthday_date', F.col('birthday').cast('date'))\n", 229 | " .withColumn('owned_by', F.lit('me'))\n", 230 | " .withColumnRenamed('id', 'pet_id')\n", 231 | " .where(F.col('birthday_date') > datetime(2015,1,1))\n", 232 | ").toPandas()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "**What Happened?**\n", 240 | "* We renamed the `primary key` of our `df`\n", 241 | "* We truncated the precision of our date types.\n", 242 | "* we filtered our dataset to a smaller subset.\n", 243 | "* We created a new column describing who own these pets." 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "#### Summary\n", 251 | "We performed a variety of spark transformations to transform our data, we will go through these transformations in detailed in the following section." 252 | ] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": "Python 2", 258 | "language": "python", 259 | "name": "python2" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "ipython", 264 | "version": 2 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "python", 269 | "nbconvert_exporter": "python", 270 | "pygments_lexer": "ipython2", 271 | "version": "2.7.15" 272 | } 273 | }, 274 | "nbformat": 4, 275 | "nbformat_minor": 2 276 | } 277 | --------------------------------------------------------------------------------