├── .gitignore
├── LICENSE
├── README.md
├── docker_nbextensions
├── Dockerfile
└── docker.txt
├── stack-nbext.yml
├── stack.yml
└── work
├── 01_simple_script.py
├── 02_pyspark_job.py
├── 03_load_sql.py
├── 04_notebook.ipynb
├── 05_notebook.ipynb
├── BreadBasket_DMS.csv
├── bakery.sql
├── bootstrap_jupyter.sh
├── log4j.properties
├── postgresql-42.2.10.jar
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | venv/
2 | .idea/
3 | work/spark-warehouse/
4 | work/.ipynb_checkpoints/
5 | work/output/
6 | work/.empty/
7 | work/postgresql-42.2.8.jar
8 | work/.env
9 | **/.DS_Store
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Gary A. Stafford
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Jupyter Notebook PySpark Demo
2 |
3 | Demo of [PySpark](http://spark.apache.org/docs/2.4.0/api/python/pyspark.html) and [Jupyter Notebook](http://jupyter.org/) with the [Jupyter Docker Stacks](https://jupyter-docker-stacks.readthedocs.io/en/latest/). Complete information for this project can be found by reading the related blog post, [Getting Started with PySpark for Big Data Analytics, using Jupyter Notebooks and Docker
4 | ](https://wp.me/p1RD28-6Fj)
5 |
6 | ## Architecture
7 |
8 | 
9 |
10 | ## Set-up
11 |
12 | 1. Clone this project from GitHub:
13 |
14 | ```bash
15 | git clone \
16 | --branch v2 --single-branch --depth 1 --no-tags \
17 | https://github.com/garystafford/pyspark-setup-demo.git
18 | ```
19 |
20 | 2. Create `$HOME/data/postgres` directory for PostgreSQL files: `mkdir -p ~/data/postgres`
21 | 3. Optional, for local development, install Python packages: `python3 -m pip install -r work/requirements.txt`
22 | 4. Optional, pull docker images first:
23 |
24 | ```bash
25 | docker pull jupyter/all-spark-notebook:latest
26 | docker pull postgres:12-alpine
27 | docker pull adminer:latest
28 | ```
29 |
30 | 5. Deploy Docker Stack: `docker stack deploy -c stack.yml jupyter`
31 | 6. Retrieve the token to log into Jupyter: `docker logs $(docker ps | grep jupyter_spark | awk '{print $NF}')`
32 | 7. From the Jupyter terminal, run the install script: `sh bootstrap_jupyter.sh`
33 | 8. Save your Plotly username and API key to a `.env` file:
34 |
35 | ```bash
36 | echo "PLOTLY_USERNAME=your-username" >> .env
37 | echo "PLOTLY_API_KEY=your-api-key" >> .env
38 | ```
39 |
40 | ## Demo
41 |
42 | From a Jupyter terminal window:
43 |
44 | 1. Sample Python script, run `python3 01_simple_script.py` from Jupyter terminal
45 | 2. Sample PySpark job, run `$SPARK_HOME/bin/spark-submit 02_pyspark_job.py` from Jupyter terminal
46 | 3. Load PostgreSQL sample data, run `python3 03_load_sql.py` from Jupyter terminal
47 | 4. Sample Jupyter Notebook, open `04_notebook.ipynb` from Jupyter Console
48 | 5. Sample Jupyter Notebook, open `05_notebook.ipynb` from Jupyter Console
49 | 6. Try the alternate Jupyter stack with [nbextensions](https://jupyter-contrib-nbextensions.readthedocs.io/en/latest/install.html) pre-installed, first `cd docker_nbextensions/`, then run `docker build -t garystafford/all-spark-notebook-nbext:latest .` to build the new image
50 | 7. Then, to delete the previous stack, run `docker stack rm jupyter`, followed by creating the new stack, run `cd -` and `docker stack deploy -c stack-nbext.yml jupyter`
51 |
52 |
53 |
54 | ## References
55 |
56 | - [PostgreSQL JDBC Driver](https://jdbc.postgresql.org/download.html)
57 | - [Spark SQL, DataFrames and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html#jdbc-to-other-databases)
58 | - [Lesson 42: Interactive plotting with Bokeh](http://justinbois.github.io/bootcamp/2017/lessons/l42_bokeh.html)
59 | - [How to use SparkSession in Apache Spark 2.0](https://databricks.com/blog/2016/08/15/how-to-use-sparksession-in-apache-spark-2-0.html)
60 | - [Advanced Jupyter Notebook Tricks — Part I](https://blog.dominodatalab.com/lesser-known-ways-of-using-notebooks/)
61 |
--------------------------------------------------------------------------------
/docker_nbextensions/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM jupyter/all-spark-notebook:latest
2 |
3 | # Switch to root so the extensions can be installed with --system.
4 | USER root
5 |
6 | # Install the contrib nbextensions and the nbextensions configurator, then
7 | # yapf (code prettifier). --no-cache-dir keeps pip's download cache out of
8 | # the image layer.
9 | RUN pip install --no-cache-dir jupyter_contrib_nbextensions \
10 |     && jupyter contrib nbextension install --system \
11 |     && pip install --no-cache-dir jupyter_nbextensions_configurator \
12 |     && jupyter nbextensions_configurator enable --system \
13 |     && pip install --no-cache-dir yapf
14 |
15 | # Drop root again; $NB_UID is expected to be provided by the base image.
16 | USER $NB_UID
--------------------------------------------------------------------------------
/docker_nbextensions/docker.txt:
--------------------------------------------------------------------------------
1 | docker build -t garystafford/all-spark-notebook-nbext:latest .
--------------------------------------------------------------------------------
/stack-nbext.yml:
--------------------------------------------------------------------------------
1 | # docker stack deploy -c stack-nbext.yml jupyter
2 | # optional pgadmin container
3 | version: "3.7"
4 | networks:
5 | demo-net:
6 | services:
7 | spark:
8 | image: garystafford/all-spark-notebook-nbext:latest
9 | ports:
10 | - "8888:8888/tcp"
11 | - "4040:4040/tcp"
12 | networks:
13 | - demo-net
14 | working_dir: /home/$USER/work
15 | environment:
16 | CHOWN_HOME: "yes"
17 | GRANT_SUDO: "yes"
18 | NB_UID: 1000
19 | NB_GID: 100
20 | NB_USER: $USER
21 | NB_GROUP: staff
22 | user: root
23 | deploy:
24 | replicas: 1
25 | restart_policy:
26 | condition: on-failure
27 | volumes:
28 | - $PWD/work:/home/$USER/work
29 | postgres:
30 | image: postgres:12-alpine
31 | environment:
32 | POSTGRES_USER: postgres # the official postgres image reads POSTGRES_USER, not POSTGRES_USERNAME
33 | POSTGRES_PASSWORD: postgres1234
34 | POSTGRES_DB: bakery
35 | ports:
36 | - "5432:5432/tcp"
37 | networks:
38 | - demo-net
39 | volumes:
40 | - $HOME/data/postgres:/var/lib/postgresql/data
41 | deploy:
42 | restart_policy:
43 | condition: on-failure
44 | adminer:
45 | image: adminer:latest
46 | ports:
47 | - "8080:8080/tcp"
48 | networks:
49 | - demo-net
50 | deploy:
51 | restart_policy:
52 | condition: on-failure
53 | # pgadmin:
54 | # image: dpage/pgadmin4:latest
55 | # environment:
56 | # PGADMIN_DEFAULT_EMAIL: user@domain.com
57 | # PGADMIN_DEFAULT_PASSWORD: 5up3rS3cr3t!
58 | # ports:
59 | # - "8180:80/tcp"
60 | # networks:
61 | # - demo-net
62 | # deploy:
63 | # restart_policy:
64 | # condition: on-failure
65 |
--------------------------------------------------------------------------------
/stack.yml:
--------------------------------------------------------------------------------
1 | # docker stack deploy -c stack.yml jupyter
2 | # optional pgadmin container
3 | version: "3.7"
4 | networks:
5 | demo-net:
6 | services:
7 | spark:
8 | image: jupyter/all-spark-notebook:latest
9 | ports:
10 | - "8888:8888/tcp"
11 | - "4040:4040/tcp"
12 | networks:
13 | - demo-net
14 | working_dir: /home/$USER/work
15 | environment:
16 | CHOWN_HOME: "yes"
17 | GRANT_SUDO: "yes"
18 | NB_UID: 1000
19 | NB_GID: 100
20 | NB_USER: $USER
21 | NB_GROUP: staff
22 | user: root
23 | deploy:
24 | replicas: 1
25 | restart_policy:
26 | condition: on-failure
27 | volumes:
28 | - $PWD/work:/home/$USER/work
29 | postgres:
30 | image: postgres:12-alpine
31 | environment:
32 | POSTGRES_USER: postgres # the official postgres image reads POSTGRES_USER, not POSTGRES_USERNAME
33 | POSTGRES_PASSWORD: postgres1234
34 | POSTGRES_DB: bakery
35 | ports:
36 | - "5432:5432/tcp"
37 | networks:
38 | - demo-net
39 | volumes:
40 | - $HOME/data/postgres:/var/lib/postgresql/data
41 | deploy:
42 | restart_policy:
43 | condition: on-failure
44 | adminer:
45 | image: adminer:latest
46 | ports:
47 | - "8080:8080/tcp"
48 | networks:
49 | - demo-net
50 | deploy:
51 | restart_policy:
52 | condition: on-failure
53 | # pgadmin:
54 | # image: dpage/pgadmin4:latest
55 | # environment:
56 | # PGADMIN_DEFAULT_EMAIL: user@domain.com
57 | # PGADMIN_DEFAULT_PASSWORD: 5up3rS3cr3t!
58 | # ports:
59 | # - "8180:80/tcp"
60 | # networks:
61 | # - demo-net
62 | # deploy:
63 | # restart_policy:
64 | # condition: on-failure
65 |
--------------------------------------------------------------------------------
/work/01_simple_script.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | import random
4 |
5 | technologies = [ 'PySpark', 'Python', 'Spark', 'Scala', 'Java', 'Project Jupyter', 'R' ]
6 |
7 | print(f"Technologies: {technologies}")
8 |
9 | technologies.sort()
10 | print(f"Sorted: {technologies}")
11 |
12 | print(f"I'm interested in learning about {random.choice(technologies)}.")
13 |
--------------------------------------------------------------------------------
/work/02_pyspark_job.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | from pyspark.sql import SparkSession
4 |
5 | spark = SparkSession \
6 | .builder \
7 | .appName('spark-demo') \
8 | .getOrCreate()
9 |
10 | df_bakery = spark.read \
11 | .format('csv') \
12 | .option('header', 'true') \
13 | .option('delimiter', ',') \
14 | .option('inferSchema', 'true') \
15 | .load('BreadBasket_DMS.csv')
16 |
17 | df_sorted = df_bakery.cube('item').count() \
18 | .filter('item NOT LIKE \'NONE\'') \
19 | .filter('item NOT LIKE \'Adjustment\'') \
20 | .orderBy(['count', 'item'], ascending=[False, True])
21 |
22 | df_sorted.show(10, False)
23 |
24 | df_sorted.coalesce(1) \
25 | .write.format('csv') \
26 | .option('header', 'true') \
27 | .save('output/items-sold.csv', mode='overwrite')
28 |
--------------------------------------------------------------------------------
/work/03_load_sql.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | import psycopg2
4 |
5 | # connect to database
6 | connect_str = 'host=postgres port=5432 dbname=bakery user=postgres password=postgres1234'
7 | conn = psycopg2.connect(connect_str)
8 | conn.autocommit = True
9 | cursor = conn.cursor()
10 |
11 | # execute sql script
12 | sql_file = open('bakery.sql', 'r')
13 | sqlFile = sql_file.read()
14 | sql_file.close()
15 | sqlCommands = sqlFile.split(';')
16 | for command in sqlCommands:
17 | print(command)
18 | if command.strip() != '':
19 | cursor.execute(command)
20 |
21 | # import data from csv file
22 | with open('BreadBasket_DMS.csv', 'r') as f:
23 | next(f) # Skip the header row.
24 | cursor.copy_from(
25 | f,
26 | 'transactions',
27 | sep=',',
28 | columns=('date', 'time', 'transaction', 'item')
29 | )
30 | conn.commit()
31 |
32 | # confirm by selecting record
33 | command = 'SELECT COUNT(*) FROM public.transactions;'
34 | cursor.execute(command)
35 | recs = cursor.fetchall()
36 | print('Row count: %d' % recs[0])
37 |
--------------------------------------------------------------------------------
/work/04_notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# PySpark Demo Notebook 4\n",
8 | "\n",
9 | "## Contents\n",
10 | "\n",
11 | "1. [Read CSV-Format File](#Read-CSV-Format-File)\n",
12 | "2. [Run PostgreSQL Script](#Run-PostgreSQL-Script)\n",
13 | "3. [Load PostgreSQL Data](#Load-PostgreSQL-Data)\n",
14 | "4. [Create a New Record](#Create-a-New-Record)\n",
15 | "5. [Append Record to Database Table](#Append-Record-to-Database-Table)\n",
16 | "6. [Overwrite Data to Database Table](#Overwrite-Data-to-Database-Table)\n",
17 | "7. [Analyze and Graph Data with BokehJS](#Analyze-and-Graph-Data-with-BokehJS)\n",
18 | "8. [Read and Write Data to Parquet](#Read-and-Write-Data-to-Parquet)\n",
19 | "\n",
20 | "## Background\n",
21 | "\n",
22 | "_Prepared by: [Gary A. Stafford](https://twitter.com/GaryStafford) \n",
23 | "Associated article: [Getting Started with Data Analytics using Jupyter Notebooks, PySpark, and Docker](https://wp.me/p1RD28-6Fj)_"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "## Read CSV-Format File\n",
31 | "Read CSV-format data file into a Spark DataFrame."
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 1,
37 | "metadata": {
38 | "ExecuteTime": {
39 | "end_time": "2019-12-06T03:45:29.626016Z",
40 | "start_time": "2019-12-06T03:45:29.416677Z"
41 | },
42 | "pycharm": {
43 | "is_executing": false
44 | }
45 | },
46 | "outputs": [],
47 | "source": [
48 | "from pyspark.sql import SparkSession\n",
49 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 2,
55 | "metadata": {
56 | "ExecuteTime": {
57 | "end_time": "2019-12-06T03:45:34.343019Z",
58 | "start_time": "2019-12-06T03:45:29.628615Z"
59 | },
60 | "pycharm": {
61 | "is_executing": false
62 | }
63 | },
64 | "outputs": [],
65 | "source": [
66 | "spark = SparkSession \\\n",
67 | " .builder \\\n",
68 | " .appName('04_notebook') \\\n",
69 | " .config('spark.driver.extraClassPath', 'postgresql-42.2.10.jar') \\\n",
70 | " .getOrCreate()"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 3,
76 | "metadata": {
77 | "ExecuteTime": {
78 | "end_time": "2019-12-06T03:45:34.354352Z",
79 | "start_time": "2019-12-06T03:45:34.346118Z"
80 | },
81 | "pycharm": {
82 | "is_executing": false
83 | }
84 | },
85 | "outputs": [],
86 | "source": [
87 | "bakery_schema = StructType([\n",
88 | " StructField('date', StringType(), True),\n",
89 | " StructField('time', StringType(), True),\n",
90 | " StructField('transaction', IntegerType(), True),\n",
91 | " StructField('item', StringType(), True)\n",
92 | "])"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 4,
98 | "metadata": {
99 | "ExecuteTime": {
100 | "end_time": "2019-12-06T03:45:36.751894Z",
101 | "start_time": "2019-12-06T03:45:34.357930Z"
102 | },
103 | "pycharm": {
104 | "is_executing": false
105 | }
106 | },
107 | "outputs": [],
108 | "source": [
109 | "df1 = spark.read \\\n",
110 | " .format('csv') \\\n",
111 | " .option('header', 'true') \\\n",
112 | " .load('BreadBasket_DMS.csv', schema=bakery_schema)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 5,
118 | "metadata": {
119 | "ExecuteTime": {
120 | "end_time": "2019-12-06T03:45:39.950006Z",
121 | "start_time": "2019-12-06T03:45:36.753623Z"
122 | },
123 | "pycharm": {
124 | "is_executing": false
125 | }
126 | },
127 | "outputs": [
128 | {
129 | "name": "stdout",
130 | "output_type": "stream",
131 | "text": [
132 | "DataFrame rows: 21293\n",
133 | "DataFrame schema: DataFrame[date: string, time: string, transaction: int, item: string]\n",
134 | "+----------+--------+-----------+-------------+\n",
135 | "|date |time |transaction|item |\n",
136 | "+----------+--------+-----------+-------------+\n",
137 | "|2016-10-30|09:58:11|1 |Bread |\n",
138 | "|2016-10-30|10:05:34|2 |Scandinavian |\n",
139 | "|2016-10-30|10:05:34|2 |Scandinavian |\n",
140 | "|2016-10-30|10:07:57|3 |Hot chocolate|\n",
141 | "|2016-10-30|10:07:57|3 |Jam |\n",
142 | "|2016-10-30|10:07:57|3 |Cookies |\n",
143 | "|2016-10-30|10:08:41|4 |Muffin |\n",
144 | "|2016-10-30|10:13:03|5 |Coffee |\n",
145 | "|2016-10-30|10:13:03|5 |Pastry |\n",
146 | "|2016-10-30|10:13:03|5 |Bread |\n",
147 | "+----------+--------+-----------+-------------+\n",
148 | "only showing top 10 rows\n",
149 | "\n"
150 | ]
151 | }
152 | ],
153 | "source": [
154 | "print('DataFrame rows: %d' % df1.count())\n",
155 | "print('DataFrame schema: %s' % df1)\n",
156 | "df1.show(10, False)"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | "## Run PostgreSQL Script\n",
164 | "Run the sql script to create the database schema and import data from CSV file."
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 6,
170 | "metadata": {
171 | "ExecuteTime": {
172 | "end_time": "2019-12-06T03:45:40.268164Z",
173 | "start_time": "2019-12-06T03:45:39.954014Z"
174 | },
175 | "pycharm": {
176 | "is_executing": false
177 | },
178 | "scrolled": true
179 | },
180 | "outputs": [
181 | {
182 | "name": "stdout",
183 | "output_type": "stream",
184 | "text": [
185 | "DROP TABLE IF EXISTS \"transactions\"\n",
186 | "\n",
187 | "DROP SEQUENCE IF EXISTS transactions_id_seq\n",
188 | "\n",
189 | "CREATE SEQUENCE transactions_id_seq INCREMENT 1 MINVALUE 1 MAXVALUE 2147483647 START 1 CACHE 1\n",
190 | "\n",
191 | "\n",
192 | "CREATE TABLE \"public\".\"transactions\"\n",
193 | "(\n",
194 | " \"id\" integer DEFAULT nextval('transactions_id_seq') NOT NULL,\n",
195 | " \"date\" character varying(10) NOT NULL,\n",
196 | " \"time\" character varying(8) NOT NULL,\n",
197 | " \"transaction\" integer NOT NULL,\n",
198 | " \"item\" character varying(50) NOT NULL\n",
199 | ") WITH (oids = false)\n",
200 | "\n",
201 | "Row count: 21293\n"
202 | ]
203 | }
204 | ],
205 | "source": [
206 | "%run -i '03_load_sql.py'"
207 | ]
208 | },
209 | {
210 | "cell_type": "markdown",
211 | "metadata": {},
212 | "source": [
213 | "## Load PostgreSQL Data\n",
214 | "Load the PostgreSQL 'transactions' table's contents into a Spark DataFrame."
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 7,
220 | "metadata": {
221 | "ExecuteTime": {
222 | "end_time": "2019-12-06T03:45:40.276617Z",
223 | "start_time": "2019-12-06T03:45:40.270872Z"
224 | },
225 | "pycharm": {
226 | "is_executing": false
227 | }
228 | },
229 | "outputs": [],
230 | "source": [
231 | "properties = {\n",
232 | " 'driver': 'org.postgresql.Driver',\n",
233 | " 'url': 'jdbc:postgresql://postgres:5432/bakery',\n",
234 | " 'user': 'postgres',\n",
235 | " 'password': 'postgres1234',\n",
236 | " 'dbtable': 'transactions',\n",
237 | "}"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 8,
243 | "metadata": {
244 | "ExecuteTime": {
245 | "end_time": "2019-12-06T03:45:40.600010Z",
246 | "start_time": "2019-12-06T03:45:40.278813Z"
247 | },
248 | "pycharm": {
249 | "is_executing": false
250 | }
251 | },
252 | "outputs": [],
253 | "source": [
254 | "df2 = spark.read \\\n",
255 | " .format('jdbc') \\\n",
256 | " .option('driver', properties['driver']) \\\n",
257 | " .option('url', properties['url']) \\\n",
258 | " .option('user', properties['user']) \\\n",
259 | " .option('password', properties['password']) \\\n",
260 | " .option('dbtable', properties['dbtable']) \\\n",
261 | " .load()"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 9,
267 | "metadata": {
268 | "ExecuteTime": {
269 | "end_time": "2019-12-06T03:45:40.979256Z",
270 | "start_time": "2019-12-06T03:45:40.602206Z"
271 | },
272 | "pycharm": {
273 | "is_executing": false
274 | },
275 | "scrolled": true
276 | },
277 | "outputs": [
278 | {
279 | "name": "stdout",
280 | "output_type": "stream",
281 | "text": [
282 | "DataFrame rows: 21293\n",
283 | "DataFrame schema: DataFrame[id: int, date: string, time: string, transaction: int, item: string]\n",
284 | "+---+----------+--------+-----------+-------------+\n",
285 | "|id |date |time |transaction|item |\n",
286 | "+---+----------+--------+-----------+-------------+\n",
287 | "|1 |2016-10-30|09:58:11|1 |Bread |\n",
288 | "|2 |2016-10-30|10:05:34|2 |Scandinavian |\n",
289 | "|3 |2016-10-30|10:05:34|2 |Scandinavian |\n",
290 | "|4 |2016-10-30|10:07:57|3 |Hot chocolate|\n",
291 | "|5 |2016-10-30|10:07:57|3 |Jam |\n",
292 | "|6 |2016-10-30|10:07:57|3 |Cookies |\n",
293 | "|7 |2016-10-30|10:08:41|4 |Muffin |\n",
294 | "|8 |2016-10-30|10:13:03|5 |Coffee |\n",
295 | "|9 |2016-10-30|10:13:03|5 |Pastry |\n",
296 | "|10 |2016-10-30|10:13:03|5 |Bread |\n",
297 | "+---+----------+--------+-----------+-------------+\n",
298 | "only showing top 10 rows\n",
299 | "\n"
300 | ]
301 | }
302 | ],
303 | "source": [
304 | "print('DataFrame rows: %d' % df2.count())  # df2 is the DataFrame read back from PostgreSQL\n",
305 | "print('DataFrame schema: %s' % df2)\n",
306 | "df2.show(10, False)"
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {},
312 | "source": [
313 | "## Create a New Record\n",
314 | "Create a new bakery record and load into a Spark DataFrame."
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 10,
320 | "metadata": {
321 | "ExecuteTime": {
322 | "end_time": "2019-12-06T03:45:41.359691Z",
323 | "start_time": "2019-12-06T03:45:40.980775Z"
324 | },
325 | "pycharm": {
326 | "is_executing": false
327 | }
328 | },
329 | "outputs": [],
330 | "source": [
331 | "data = [('2016-10-30', '10:13:27', 2, 'Pastry')]\n",
332 | "df3 = spark.createDataFrame(data, bakery_schema)"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": 11,
338 | "metadata": {
339 | "ExecuteTime": {
340 | "end_time": "2019-12-06T03:45:42.269428Z",
341 | "start_time": "2019-12-06T03:45:41.363867Z"
342 | },
343 | "pycharm": {
344 | "is_executing": false
345 | }
346 | },
347 | "outputs": [
348 | {
349 | "name": "stdout",
350 | "output_type": "stream",
351 | "text": [
352 | "DataFrame rows: 1\n",
353 | "DataFrame schema: DataFrame[date: string, time: string, transaction: int, item: string]\n",
354 | "+----------+--------+-----------+------+\n",
355 | "|date |time |transaction|item |\n",
356 | "+----------+--------+-----------+------+\n",
357 | "|2016-10-30|10:13:27|2 |Pastry|\n",
358 | "+----------+--------+-----------+------+\n",
359 | "\n"
360 | ]
361 | }
362 | ],
363 | "source": [
364 | "print('DataFrame rows: %d' % df3.count())\n",
365 | "print('DataFrame schema: %s' % df3)\n",
366 | "df3.show(10, False)"
367 | ]
368 | },
369 | {
370 | "cell_type": "markdown",
371 | "metadata": {},
372 | "source": [
373 | "## Append Record to Database Table\n",
374 | "Append the contents of the DataFrame to the bakery PostgreSQL database's 'transactions' table."
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": 12,
380 | "metadata": {
381 | "ExecuteTime": {
382 | "end_time": "2019-12-06T03:45:42.600551Z",
383 | "start_time": "2019-12-06T03:45:42.270651Z"
384 | },
385 | "pycharm": {
386 | "is_executing": false
387 | }
388 | },
389 | "outputs": [],
390 | "source": [
391 | "df3.write \\\n",
392 | " .format('jdbc') \\\n",
393 | " .option('driver', properties['driver']) \\\n",
394 | " .option('url', properties['url']) \\\n",
395 | " .option('user', properties['user']) \\\n",
396 | " .option('password', properties['password']) \\\n",
397 | " .option('dbtable', properties['dbtable']) \\\n",
398 | " .mode('append') \\\n",
399 | " .save()"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 13,
405 | "metadata": {
406 | "ExecuteTime": {
407 | "end_time": "2019-12-06T03:45:42.748246Z",
408 | "start_time": "2019-12-06T03:45:42.602245Z"
409 | },
410 | "pycharm": {
411 | "is_executing": false
412 | }
413 | },
414 | "outputs": [
415 | {
416 | "name": "stdout",
417 | "output_type": "stream",
418 | "text": [
419 | "DataFrame rows: 21294\n"
420 | ]
421 | }
422 | ],
423 | "source": [
424 | "# should now contain one additional row of data\n",
425 | "print('DataFrame rows: %d' % df2.count())"
426 | ]
427 | },
428 | {
429 | "cell_type": "markdown",
430 | "metadata": {},
431 | "source": [
432 | "## Overwrite Data to Database Table\n",
433 | "Overwrite the contents of the CSV file-based DataFrame to the 'transactions' table."
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": 14,
439 | "metadata": {
440 | "ExecuteTime": {
441 | "end_time": "2019-12-06T03:45:44.441462Z",
442 | "start_time": "2019-12-06T03:45:42.750237Z"
443 | },
444 | "pycharm": {
445 | "is_executing": false
446 | }
447 | },
448 | "outputs": [],
449 | "source": [
450 | "df1.write \\\n",
451 | " .format('jdbc') \\\n",
452 | " .option('driver', properties['driver']) \\\n",
453 | " .option('url', properties['url']) \\\n",
454 | " .option('user', properties['user']) \\\n",
455 | " .option('password', properties['password']) \\\n",
456 | " .option('dbtable', properties['dbtable']) \\\n",
457 | " .option('truncate', 'true') \\\n",
458 | " .mode('overwrite') \\\n",
459 | " .save()"
460 | ]
461 | },
462 | {
463 | "cell_type": "markdown",
464 | "metadata": {},
465 | "source": [
466 | "## Analyze and Graph Data with BokehJS\n",
467 | "Perform some simple analysis of the bakery data and plot the results with [BokehJS](https://docs.bokeh.org/en/latest/index.html).\n",
468 | "### Business Questions\n",
469 | "1. What are the busiest days of the week?\n",
470 | "2. What are the busiest times of the day?\n",
471 | "3. What are the top selling bakery items?\n",
472 | "4. How many items do customers usually buy?"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": 15,
478 | "metadata": {
479 | "ExecuteTime": {
480 | "end_time": "2019-12-06T03:45:44.682664Z",
481 | "start_time": "2019-12-06T03:45:44.445670Z"
482 | },
483 | "pycharm": {
484 | "is_executing": false
485 | }
486 | },
487 | "outputs": [
488 | {
489 | "data": {
490 | "text/html": [
491 | "\n",
492 | "
\n",
493 | "
\n",
494 | "
Loading BokehJS ...\n",
495 | "
"
496 | ]
497 | },
498 | "metadata": {},
499 | "output_type": "display_data"
500 | },
501 | {
502 | "data": {
503 | "application/javascript": [
504 | "\n",
505 | "(function(root) {\n",
506 | " function now() {\n",
507 | " return new Date();\n",
508 | " }\n",
509 | "\n",
510 | " var force = true;\n",
511 | "\n",
512 | " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n",
513 | " root._bokeh_onload_callbacks = [];\n",
514 | " root._bokeh_is_loading = undefined;\n",
515 | " }\n",
516 | "\n",
517 | " var JS_MIME_TYPE = 'application/javascript';\n",
518 | " var HTML_MIME_TYPE = 'text/html';\n",
519 | " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n",
520 | " var CLASS_NAME = 'output_bokeh rendered_html';\n",
521 | "\n",
522 | " /**\n",
523 | " * Render data to the DOM node\n",
524 | " */\n",
525 | " function render(props, node) {\n",
526 | " var script = document.createElement(\"script\");\n",
527 | " node.appendChild(script);\n",
528 | " }\n",
529 | "\n",
530 | " /**\n",
531 | " * Handle when an output is cleared or removed\n",
532 | " */\n",
533 | " function handleClearOutput(event, handle) {\n",
534 | " var cell = handle.cell;\n",
535 | "\n",
536 | " var id = cell.output_area._bokeh_element_id;\n",
537 | " var server_id = cell.output_area._bokeh_server_id;\n",
538 | " // Clean up Bokeh references\n",
539 | " if (id != null && id in Bokeh.index) {\n",
540 | " Bokeh.index[id].model.document.clear();\n",
541 | " delete Bokeh.index[id];\n",
542 | " }\n",
543 | "\n",
544 | " if (server_id !== undefined) {\n",
545 | " // Clean up Bokeh references\n",
546 | " var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n",
547 | " cell.notebook.kernel.execute(cmd, {\n",
548 | " iopub: {\n",
549 | " output: function(msg) {\n",
550 | " var id = msg.content.text.trim();\n",
551 | " if (id in Bokeh.index) {\n",
552 | " Bokeh.index[id].model.document.clear();\n",
553 | " delete Bokeh.index[id];\n",
554 | " }\n",
555 | " }\n",
556 | " }\n",
557 | " });\n",
558 | " // Destroy server and session\n",
559 | " var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n",
560 | " cell.notebook.kernel.execute(cmd);\n",
561 | " }\n",
562 | " }\n",
563 | "\n",
564 | " /**\n",
565 | " * Handle when a new output is added\n",
566 | " */\n",
567 | " function handleAddOutput(event, handle) {\n",
568 | " var output_area = handle.output_area;\n",
569 | " var output = handle.output;\n",
570 | "\n",
571 | " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n",
572 | " if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n",
573 | " return\n",
574 | " }\n",
575 | "\n",
576 | " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n",
577 | "\n",
578 | " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n",
579 | " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n",
580 | " // store reference to embed id on output_area\n",
581 | " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n",
582 | " }\n",
583 | " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n",
584 | " var bk_div = document.createElement(\"div\");\n",
585 | " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n",
586 | " var script_attrs = bk_div.children[0].attributes;\n",
587 | " for (var i = 0; i < script_attrs.length; i++) {\n",
588 | " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n",
589 | " }\n",
590 | " // store reference to server id on output_area\n",
591 | " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n",
592 | " }\n",
593 | " }\n",
594 | "\n",
595 | " function register_renderer(events, OutputArea) {\n",
596 | "\n",
597 | " function append_mime(data, metadata, element) {\n",
598 | " // create a DOM node to render to\n",
599 | " var toinsert = this.create_output_subarea(\n",
600 | " metadata,\n",
601 | " CLASS_NAME,\n",
602 | " EXEC_MIME_TYPE\n",
603 | " );\n",
604 | " this.keyboard_manager.register_events(toinsert);\n",
605 | " // Render to node\n",
606 | " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n",
607 | " render(props, toinsert[toinsert.length - 1]);\n",
608 | " element.append(toinsert);\n",
609 | " return toinsert\n",
610 | " }\n",
611 | "\n",
612 | " /* Handle when an output is cleared or removed */\n",
613 | " events.on('clear_output.CodeCell', handleClearOutput);\n",
614 | " events.on('delete.Cell', handleClearOutput);\n",
615 | "\n",
616 | " /* Handle when a new output is added */\n",
617 | " events.on('output_added.OutputArea', handleAddOutput);\n",
618 | "\n",
619 | " /**\n",
620 | " * Register the mime type and append_mime function with output_area\n",
621 | " */\n",
622 | " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n",
623 | " /* Is output safe? */\n",
624 | " safe: true,\n",
625 | " /* Index of renderer in `output_area.display_order` */\n",
626 | " index: 0\n",
627 | " });\n",
628 | " }\n",
629 | "\n",
630 | " // register the mime type if in Jupyter Notebook environment and previously unregistered\n",
631 | " if (root.Jupyter !== undefined) {\n",
632 | " var events = require('base/js/events');\n",
633 | " var OutputArea = require('notebook/js/outputarea').OutputArea;\n",
634 | "\n",
635 | " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n",
636 | " register_renderer(events, OutputArea);\n",
637 | " }\n",
638 | " }\n",
639 | "\n",
640 | " \n",
641 | " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n",
642 | " root._bokeh_timeout = Date.now() + 5000;\n",
643 | " root._bokeh_failed_load = false;\n",
644 | " }\n",
645 | "\n",
646 | " var NB_LOAD_WARNING = {'data': {'text/html':\n",
647 | " \"\\n\"+\n",
648 | " \"
\\n\"+\n",
649 | " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n",
650 | " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n",
651 | " \"
\\n\"+\n",
652 | " \"
\\n\"+\n",
653 | " \"- re-rerun `output_notebook()` to attempt to load from CDN again, or
\\n\"+\n",
654 | " \"- use INLINE resources instead, as so:
\\n\"+\n",
655 | " \"
\\n\"+\n",
656 | " \"
\\n\"+\n",
657 | " \"from bokeh.resources import INLINE\\n\"+\n",
658 | " \"output_notebook(resources=INLINE)\\n\"+\n",
659 | " \"
\\n\"+\n",
660 | " \"
\"}};\n",
661 | "\n",
662 | " function display_loaded() {\n",
663 | " var el = document.getElementById(\"1001\");\n",
664 | " if (el != null) {\n",
665 | " el.textContent = \"BokehJS is loading...\";\n",
666 | " }\n",
667 | " if (root.Bokeh !== undefined) {\n",
668 | " if (el != null) {\n",
669 | " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n",
670 | " }\n",
671 | " } else if (Date.now() < root._bokeh_timeout) {\n",
672 | " setTimeout(display_loaded, 100)\n",
673 | " }\n",
674 | " }\n",
675 | "\n",
676 | "\n",
677 | " function run_callbacks() {\n",
678 | " try {\n",
679 | " root._bokeh_onload_callbacks.forEach(function(callback) {\n",
680 | " if (callback != null)\n",
681 | " callback();\n",
682 | " });\n",
683 | " } finally {\n",
684 | " delete root._bokeh_onload_callbacks\n",
685 | " }\n",
686 | " console.debug(\"Bokeh: all callbacks have finished\");\n",
687 | " }\n",
688 | "\n",
689 | " function load_libs(css_urls, js_urls, callback) {\n",
690 | " if (css_urls == null) css_urls = [];\n",
691 | " if (js_urls == null) js_urls = [];\n",
692 | "\n",
693 | " root._bokeh_onload_callbacks.push(callback);\n",
694 | " if (root._bokeh_is_loading > 0) {\n",
695 | " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n",
696 | " return null;\n",
697 | " }\n",
698 | " if (js_urls == null || js_urls.length === 0) {\n",
699 | " run_callbacks();\n",
700 | " return null;\n",
701 | " }\n",
702 | " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n",
703 | " root._bokeh_is_loading = css_urls.length + js_urls.length;\n",
704 | "\n",
705 | " function on_load() {\n",
706 | " root._bokeh_is_loading--;\n",
707 | " if (root._bokeh_is_loading === 0) {\n",
708 | " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n",
709 | " run_callbacks()\n",
710 | " }\n",
711 | " }\n",
712 | "\n",
713 | " function on_error() {\n",
714 | " console.error(\"failed to load \" + url);\n",
715 | " }\n",
716 | "\n",
717 | " for (var i = 0; i < css_urls.length; i++) {\n",
718 | " var url = css_urls[i];\n",
719 | " const element = document.createElement(\"link\");\n",
720 | " element.onload = on_load;\n",
721 | " element.onerror = on_error;\n",
722 | " element.rel = \"stylesheet\";\n",
723 | " element.type = \"text/css\";\n",
724 | " element.href = url;\n",
725 | " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n",
726 | " document.body.appendChild(element);\n",
727 | " }\n",
728 | "\n",
729 | " for (var i = 0; i < js_urls.length; i++) {\n",
730 | " var url = js_urls[i];\n",
731 | " var element = document.createElement('script');\n",
732 | " element.onload = on_load;\n",
733 | " element.onerror = on_error;\n",
734 | " element.async = false;\n",
735 | " element.src = url;\n",
736 | " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n",
737 | " document.head.appendChild(element);\n",
738 | " }\n",
739 | " };var element = document.getElementById(\"1001\");\n",
740 | " if (element == null) {\n",
741 | " console.error(\"Bokeh: ERROR: autoload.js configured with elementid '1001' but no matching script tag was found. \")\n",
742 | " return false;\n",
743 | " }\n",
744 | "\n",
745 | " function inject_raw_css(css) {\n",
746 | " const element = document.createElement(\"style\");\n",
747 | " element.appendChild(document.createTextNode(css));\n",
748 | " document.body.appendChild(element);\n",
749 | " }\n",
750 | "\n",
751 | " \n",
752 | " var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.4.0.min.js\"];\n",
753 | " var css_urls = [];\n",
754 | " \n",
755 | "\n",
756 | " var inline_js = [\n",
757 | " function(Bokeh) {\n",
758 | " Bokeh.set_log_level(\"info\");\n",
759 | " },\n",
760 | " function(Bokeh) {\n",
761 | " \n",
762 | " \n",
763 | " }\n",
764 | " ];\n",
765 | "\n",
766 | " function run_inline_js() {\n",
767 | " \n",
768 | " if (root.Bokeh !== undefined || force === true) {\n",
769 | " \n",
770 | " for (var i = 0; i < inline_js.length; i++) {\n",
771 | " inline_js[i].call(root, root.Bokeh);\n",
772 | " }\n",
773 | " if (force === true) {\n",
774 | " display_loaded();\n",
775 | " }} else if (Date.now() < root._bokeh_timeout) {\n",
776 | " setTimeout(run_inline_js, 100);\n",
777 | " } else if (!root._bokeh_failed_load) {\n",
778 | " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n",
779 | " root._bokeh_failed_load = true;\n",
780 | " } else if (force !== true) {\n",
781 | " var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n",
782 | " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n",
783 | " }\n",
784 | "\n",
785 | " }\n",
786 | "\n",
787 | " if (root._bokeh_is_loading === 0) {\n",
788 | " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n",
789 | " run_inline_js();\n",
790 | " } else {\n",
791 | " load_libs(css_urls, js_urls, function() {\n",
792 | " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n",
793 | " run_inline_js();\n",
794 | " });\n",
795 | " }\n",
796 | "}(window));"
797 | ],
798 | "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"\\n\"+\n \"
\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"
\\n\"+\n \"- re-rerun `output_notebook()` to attempt to load from CDN again, or
\\n\"+\n \"- use INLINE resources instead, as so:
\\n\"+\n \"
\\n\"+\n \"
\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"
\"}};\n\n function display_loaded() {\n var el = document.getElementById(\"1001\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error() {\n console.error(\"failed to load \" + url);\n }\n\n for (var i = 0; i < css_urls.length; i++) {\n var url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error;\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n var element = document.createElement('script');\n element.onload = 
on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };var element = document.getElementById(\"1001\");\n if (element == null) {\n console.error(\"Bokeh: ERROR: autoload.js configured with elementid '1001' but no matching script tag was found. \")\n return false;\n }\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n \n var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.4.0.min.js\"];\n var css_urls = [];\n \n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n function(Bokeh) {\n \n \n }\n ];\n\n function run_inline_js() {\n \n if (root.Bokeh !== undefined || force === true) {\n \n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n if (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));"
799 | },
800 | "metadata": {},
801 | "output_type": "display_data"
802 | }
803 | ],
804 | "source": [
805 | "from math import pi\n",
806 | "from bokeh.io import output_notebook, show\n",
807 | "from bokeh.plotting import figure\n",
808 | "from bokeh.models import ColumnDataSource\n",
809 | "from bokeh.transform import factor_cmap, cumsum\n",
810 | "from bokeh.palettes import Paired12\n",
811 | "\n",
812 | "output_notebook()"
813 | ]
814 | },
815 | {
816 | "cell_type": "markdown",
817 | "metadata": {},
818 | "source": [
819 | "### Pie Chart\n",
820 | "What are the busiest days of the week?"
821 | ]
822 | },
823 | {
824 | "cell_type": "code",
825 | "execution_count": 16,
826 | "metadata": {
827 | "ExecuteTime": {
828 | "end_time": "2019-12-06T03:45:45.072814Z",
829 | "start_time": "2019-12-06T03:45:44.684274Z"
830 | }
831 | },
832 | "outputs": [],
833 | "source": [
834 | "df1.createOrReplaceTempView('tmp_bakery')"
835 | ]
836 | },
837 | {
838 | "cell_type": "code",
839 | "execution_count": 17,
840 | "metadata": {
841 | "ExecuteTime": {
842 | "end_time": "2019-12-06T03:45:48.126136Z",
843 | "start_time": "2019-12-06T03:45:45.073935Z"
844 | }
845 | },
846 | "outputs": [
847 | {
848 | "name": "stdout",
849 | "output_type": "stream",
850 | "text": [
851 | "+---------+-----+\n",
852 | "|day |count|\n",
853 | "+---------+-----+\n",
854 | "|Wednesday|2320 |\n",
855 | "|Monday |2324 |\n",
856 | "|Tuesday |2392 |\n",
857 | "|Thursday |2646 |\n",
858 | "|Sunday |3095 |\n",
859 | "|Friday |3124 |\n",
860 | "|Saturday |4605 |\n",
861 | "+---------+-----+\n",
862 | "\n"
863 | ]
864 | }
865 | ],
866 | "source": [
867 | "sql_query = \"SELECT date_format(date, 'EEEE') as day, count(*) as count \" \\\n",
868 | " \"FROM tmp_bakery \" \\\n",
869 | " \"WHERE item NOT LIKE 'NONE' AND item NOT LIKE 'Adjustment' \" \\\n",
870 | " \"GROUP BY day \" \\\n",
871 | " \"ORDER BY count ASC \" \\\n",
872 | " \"LIMIT 10\"\n",
873 | "df4 = spark.sql(sql_query)\n",
874 | "df4.show(10, False)"
875 | ]
876 | },
877 | {
878 | "cell_type": "code",
879 | "execution_count": 18,
880 | "metadata": {
881 | "ExecuteTime": {
882 | "end_time": "2019-12-06T03:45:49.952748Z",
883 | "start_time": "2019-12-06T03:45:48.133574Z"
884 | },
885 | "pycharm": {
886 | "is_executing": false
887 | }
888 | },
889 | "outputs": [
890 | {
891 | "data": {
892 | "text/html": [
893 | "\n",
894 | "\n",
895 | "\n",
896 | "\n",
897 | "\n",
898 | "\n",
899 | " \n"
900 | ]
901 | },
902 | "metadata": {},
903 | "output_type": "display_data"
904 | },
905 | {
906 | "data": {
907 | "application/javascript": [
908 | "(function(root) {\n",
909 | " function embed_document(root) {\n",
910 | " \n",
911 | " var docs_json = {\"6f764be0-f527-4346-b57f-fb478ac469ea\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1014\",\"type\":\"LinearAxis\"}],\"center\":[{\"id\":\"1018\",\"type\":\"Grid\"},{\"id\":\"1023\",\"type\":\"Grid\"},{\"id\":\"1053\",\"type\":\"Legend\"}],\"left\":[{\"id\":\"1019\",\"type\":\"LinearAxis\"}],\"plot_height\":450,\"plot_width\":700,\"renderers\":[{\"id\":\"1045\",\"type\":\"GlyphRenderer\"}],\"title\":{\"id\":\"1004\",\"type\":\"Title\"},\"toolbar\":{\"id\":\"1031\",\"type\":\"Toolbar\"},\"x_range\":{\"id\":\"1006\",\"type\":\"Range1d\"},\"x_scale\":{\"id\":\"1010\",\"type\":\"LinearScale\"},\"y_range\":{\"id\":\"1008\",\"type\":\"DataRange1d\"},\"y_scale\":{\"id\":\"1012\",\"type\":\"LinearScale\"}},\"id\":\"1003\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"callback\":null},\"id\":\"1008\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1050\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"dimension\":1,\"grid_line_color\":null,\"ticker\":{\"id\":\"1020\",\"type\":\"BasicTicker\"}},\"id\":\"1023\",\"type\":\"Grid\"},{\"attributes\":{\"items\":[{\"id\":\"1054\",\"type\":\"LegendItem\"}]},\"id\":\"1053\",\"type\":\"Legend\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1024\",\"type\":\"PanTool\"},{\"id\":\"1025\",\"type\":\"WheelZoomTool\"},{\"id\":\"1026\",\"type\":\"BoxZoomTool\"},{\"id\":\"1027\",\"type\":\"SaveTool\"},{\"id\":\"1028\",\"type\":\"ResetTool\"},{\"id\":\"1029\",\"type\":\"HelpTool\"},{\"id\":\"1030\",\"type\":\"HoverTool\"}]},\"id\":\"1031\",\"type\":\"Toolbar\"},{\"attributes\":{\"field\":\"angle\",\"include_zero\":true},\"id\":\"1039\",\"type\":\"CumSum\"},{\"attributes\":{\"label\":{\"field\":\"day\"},\"renderers\":[{\"id\":\"1045\",\"type\":\"GlyphRenderer\"}]},\"id\":\"1054\",\"type\":\"LegendItem\"},{\"attributes\":{\"field\":\"angle\"},\"id\":\"1040\"
,\"type\":\"CumSum\"},{\"attributes\":{\"callback\":null,\"data\":{\"angle\":{\"__ndarray__\":\"lS49KWe/5j8XEZ19ccnmP7kb+xchdOc/+srDBbHx6T993M6EuFjuP6sGhmiDoe4/nK8Z+HmT9j8=\",\"dtype\":\"float64\",\"shape\":[7]},\"count\":[2320,2324,2392,2646,3095,3124,4605],\"day\":[\"Wednesday\",\"Monday\",\"Tuesday\",\"Thursday\",\"Sunday\",\"Friday\",\"Saturday\"],\"index\":[0,1,2,3,4,5,6]},\"selected\":{\"id\":\"1061\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1062\",\"type\":\"UnionRenderers\"}},\"id\":\"1041\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"callback\":null,\"start\":-0.5},\"id\":\"1006\",\"type\":\"Range1d\"},{\"attributes\":{},\"id\":\"1061\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1062\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1024\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1025\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"text\":\"Items Sold/Day\"},\"id\":\"1004\",\"type\":\"Title\"},{\"attributes\":{\"overlay\":{\"id\":\"1052\",\"type\":\"BoxAnnotation\"}},\"id\":\"1026\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1020\",\"type\":\"BasicTicker\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"lightgrey\"},\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":{\"value\":1.0},\"line_color\":{\"value\":\"black\"},\"line_dash\":[4,4],\"line_width\":{\"value\":2},\"render_mode\":\"css\",\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1052\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"1027\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1028\",\"type\":\"ResetTool\"},{\"attributes\":{\"axis_label\":null,\"formatter\":{\"id\":\"1048\",\"type\":\"BasicTickFormatter\"},\"ticker\":{\"id\":\"1020\",\"type\":\"BasicTicker\"},\"visible\":false},\"id\":\"1019\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1010\",\"type\":\"LinearScale\"},{\"attributes\":{},\"id\":\"1029\",\"type\":\"HelpTool\"},{\"a
ttributes\":{\"callback\":null,\"tooltips\":[[\"day\",\"@day\"],[\"count\",\"@{count}{,}\"]]},\"id\":\"1030\",\"type\":\"HoverTool\"},{\"attributes\":{\"end_angle\":{\"expr\":{\"id\":\"1040\",\"type\":\"CumSum\"},\"units\":\"rad\"},\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"radius\":{\"units\":\"data\",\"value\":0.4},\"start_angle\":{\"expr\":{\"id\":\"1039\",\"type\":\"CumSum\"},\"units\":\"rad\"},\"x\":{\"value\":0},\"y\":{\"value\":1}},\"id\":\"1044\",\"type\":\"Wedge\"},{\"attributes\":{},\"id\":\"1012\",\"type\":\"LinearScale\"},{\"attributes\":{\"end_angle\":{\"expr\":{\"id\":\"1040\",\"type\":\"CumSum\"},\"units\":\"rad\"},\"fill_color\":{\"field\":\"day\",\"transform\":{\"id\":\"1002\",\"type\":\"CategoricalColorMapper\"}},\"line_color\":{\"value\":\"white\"},\"radius\":{\"units\":\"data\",\"value\":0.4},\"start_angle\":{\"expr\":{\"id\":\"1039\",\"type\":\"CumSum\"},\"units\":\"rad\"},\"x\":{\"value\":0},\"y\":{\"value\":1}},\"id\":\"1043\",\"type\":\"Wedge\"},{\"attributes\":{},\"id\":\"1015\",\"type\":\"BasicTicker\"},{\"attributes\":{\"factors\":[\"Wednesday\",\"Monday\",\"Tuesday\",\"Thursday\",\"Sunday\",\"Friday\",\"Saturday\"],\"palette\":[\"#a6cee3\",\"#1f78b4\",\"#b2df8a\",\"#33a02c\",\"#fb9a99\",\"#e31a1c\",\"#fdbf6f\",\"#ff7f00\",\"#cab2d6\",\"#6a3d9a\",\"#ffff99\",\"#b15928\"]},\"id\":\"1002\",\"type\":\"CategoricalColorMapper\"},{\"attributes\":{\"data_source\":{\"id\":\"1041\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1043\",\"type\":\"Wedge\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1044\",\"type\":\"Wedge\"},\"selection_glyph\":null,\"view\":{\"id\":\"1046\",\"type\":\"CDSView\"}},\"id\":\"1045\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"grid_line_color\":null,\"ticker\":{\"id\":\"1015\",\"type\":\"BasicTicker\"}},\"id\":\"1018\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"1041\",\"t
ype\":\"ColumnDataSource\"}},\"id\":\"1046\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1048\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"axis_label\":null,\"formatter\":{\"id\":\"1050\",\"type\":\"BasicTickFormatter\"},\"ticker\":{\"id\":\"1015\",\"type\":\"BasicTicker\"},\"visible\":false},\"id\":\"1014\",\"type\":\"LinearAxis\"}],\"root_ids\":[\"1003\"]},\"title\":\"Bokeh Application\",\"version\":\"1.4.0\"}};\n",
912 | " var render_items = [{\"docid\":\"6f764be0-f527-4346-b57f-fb478ac469ea\",\"roots\":{\"1003\":\"ef7bb50b-4ecb-4a86-951a-cb2c294ac99b\"}}];\n",
913 | " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n",
914 | "\n",
915 | " }\n",
916 | " if (root.Bokeh !== undefined) {\n",
917 | " embed_document(root);\n",
918 | " } else {\n",
919 | " var attempts = 0;\n",
920 | " var timer = setInterval(function(root) {\n",
921 | " if (root.Bokeh !== undefined) {\n",
922 | " clearInterval(timer);\n",
923 | " embed_document(root);\n",
924 | " } else {\n",
925 | " attempts++;\n",
926 | " if (attempts > 100) {\n",
927 | " clearInterval(timer);\n",
928 | " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n",
929 | " }\n",
930 | " }\n",
931 | " }, 10, root)\n",
932 | " }\n",
933 | "})(window);"
934 | ],
935 | "application/vnd.bokehjs_exec.v0+json": ""
936 | },
937 | "metadata": {
938 | "application/vnd.bokehjs_exec.v0+json": {
939 | "id": "1003"
940 | }
941 | },
942 | "output_type": "display_data"
943 | }
944 | ],
945 | "source": [
946 | "data = df4.toPandas()\n",
947 | "tooltips = [('day', '@day'), ('count', '@{count}{,}')]\n",
948 | "days = data['day'].tolist()\n",
949 | "color_map = factor_cmap(field_name='day', palette=Paired12, factors=days)\n",
950 | "\n",
951 | "data['angle'] = data['count'] / data['count'].sum() * 2 * pi\n",
952 | "plot = figure(plot_height=450,\n",
953 | " plot_width=700,\n",
954 | " title='Items Sold/Day',\n",
955 | " tooltips=tooltips,\n",
956 | " x_range=(-0.5, 1.0))\n",
957 | "plot.wedge(x=0,\n",
958 | " y=1,\n",
959 | " radius=0.4,\n",
960 | " start_angle=cumsum('angle', include_zero=True),\n",
961 | " end_angle=cumsum('angle'),\n",
962 | " line_color='white',\n",
963 | " fill_color=color_map,\n",
964 | " legend_field='day',\n",
965 | " source=data)\n",
966 | "plot.axis.axis_label = None\n",
967 | "plot.axis.visible = False\n",
968 | "plot.grid.grid_line_color = None\n",
969 | "\n",
970 | "show(plot)"
971 | ]
972 | },
973 | {
974 | "cell_type": "markdown",
975 | "metadata": {},
976 | "source": [
977 | "### Vertical Bar Chart\n",
978 | "What are the busiest times of the day?"
979 | ]
980 | },
981 | {
982 | "cell_type": "code",
983 | "execution_count": 19,
984 | "metadata": {
985 | "ExecuteTime": {
986 | "end_time": "2019-12-06T03:45:53.354117Z",
987 | "start_time": "2019-12-06T03:45:49.956857Z"
988 | },
989 | "scrolled": true
990 | },
991 | "outputs": [
992 | {
993 | "name": "stdout",
994 | "output_type": "stream",
995 | "text": [
996 | "+------+-----+\n",
997 | "|period|count|\n",
998 | "+------+-----+\n",
999 | "|07:00 |1 |\n",
1000 | "|07:30 |23 |\n",
1001 | "|08:00 |209 |\n",
1002 | "|08:30 |436 |\n",
1003 | "|09:00 |960 |\n",
1004 | "|09:30 |1006 |\n",
1005 | "|10:00 |1238 |\n",
1006 | "|10:30 |1428 |\n",
1007 | "|11:00 |1628 |\n",
1008 | "|11:30 |1474 |\n",
1009 | "+------+-----+\n",
1010 | "only showing top 10 rows\n",
1011 | "\n"
1012 | ]
1013 | }
1014 | ],
1015 | "source": [
1016 | "def time_increment(h, m):\n",
1017 | " \"\"\"Buckets a time of day into a 30-minute increment (':00' or ':30' of the given hour)\n",
1018 | "\n",
1019 | " Parameters:\n",
1020 | " h (str): hours, '0' or '00' to '23'\n",
1021 | " m (str): minutes, '0' or '00' to '59'\n",
1022 | "\n",
1023 | " Returns:\n",
1024 | " str: 30-minute time increment, e.g. '07:30', '23:00', or '12:00'\n",
1025 | "\n",
1026 | " \"\"\"\n",
1027 | "\n",
1028 | " increment = (int(m) * (100 / 60)) / 100 # 0.0000 - 0.9833\n",
1029 | " increment = round(increment, 0) # 0.0 or 1.0; NOTE(review): Python 3 rounds half to even, so m == 30 (0.5) yields 0.0 - confirm intended\n",
1030 | " increment = int(increment) * 30 # 0 or 30\n",
1031 | " increment = str(h).rjust(2, '0') + ':' + str(increment).rjust(2, '0')\n",
1032 | "\n",
1033 | " return increment # e.g. '07:30' or '23:00'\n",
1034 | "\n",
1035 | "\n",
1036 | "spark.udf.register(\"udfTimeIncrement\", time_increment, StringType())\n",
1037 | "\n",
1038 | "\n",
1039 | "sql_query = \"WITH tmp_table AS (\" \\\n",
1040 | " \" SELECT udfTimeIncrement(date_format(time, 'HH'), date_format(time, 'mm')) as period, count(*) as count \" \\\n",
1041 | " \" FROM tmp_bakery \" \\\n",
1042 | " \" WHERE item NOT LIKE 'NONE' AND item NOT LIKE 'Adjustment' \" \\\n",
1043 | " \" GROUP BY period \" \\\n",
1044 | " \" ORDER BY period ASC\" \\\n",
1045 | " \") \" \\\n",
1046 | " \"SELECT period, count \" \\\n",
1047 | " \"FROM tmp_table \" \\\n",
1048 | " \"WHERE period BETWEEN '07:00' AND '19:00'\"\n",
1049 | "\n",
1050 | "df5 = spark.sql(sql_query)\n",
1051 | "df5.show(10, False)"
1052 | ]
1053 | },
1054 | {
1055 | "cell_type": "code",
1056 | "execution_count": 20,
1057 | "metadata": {
1058 | "ExecuteTime": {
1059 | "end_time": "2019-12-06T03:45:57.039172Z",
1060 | "start_time": "2019-12-06T03:45:53.355547Z"
1061 | },
1062 | "pycharm": {
1063 | "is_executing": false
1064 | }
1065 | },
1066 | "outputs": [
1067 | {
1068 | "data": {
1069 | "text/html": [
1070 | "\n",
1071 | "\n",
1072 | "\n",
1073 | "\n",
1074 | "\n",
1075 | "\n",
1076 | " \n"
1077 | ]
1078 | },
1079 | "metadata": {},
1080 | "output_type": "display_data"
1081 | },
1082 | {
1083 | "data": {
1084 | "application/javascript": [
1085 | "(function(root) {\n",
1086 | " function embed_document(root) {\n",
1087 | " \n",
1088 | " var docs_json = {\"8383c3d3-ce27-4680-adaa-5c98f981259b\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1121\",\"type\":\"CategoricalAxis\"}],\"center\":[{\"id\":\"1124\",\"type\":\"Grid\"},{\"id\":\"1129\",\"type\":\"Grid\"}],\"left\":[{\"id\":\"1125\",\"type\":\"LinearAxis\"}],\"min_border\":0,\"plot_height\":450,\"plot_width\":900,\"renderers\":[{\"id\":\"1148\",\"type\":\"GlyphRenderer\"}],\"title\":{\"id\":\"1150\",\"type\":\"Title\"},\"toolbar\":{\"id\":\"1137\",\"type\":\"Toolbar\"},\"x_range\":{\"id\":\"1113\",\"type\":\"FactorRange\"},\"x_scale\":{\"id\":\"1117\",\"type\":\"CategoricalScale\"},\"y_range\":{\"id\":\"1115\",\"type\":\"DataRange1d\"},\"y_scale\":{\"id\":\"1119\",\"type\":\"LinearScale\"}},\"id\":\"1112\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"axis_label\":\"Hour of the Day\",\"formatter\":{\"id\":\"1163\",\"type\":\"CategoricalTickFormatter\"},\"ticker\":{\"id\":\"1122\",\"type\":\"CategoricalTicker\"}},\"id\":\"1121\",\"type\":\"CategoricalAxis\"},{\"attributes\":{\"data_source\":{\"id\":\"1111\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1146\",\"type\":\"VBar\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1147\",\"type\":\"VBar\"},\"selection_glyph\":null,\"view\":{\"id\":\"1149\",\"type\":\"CDSView\"}},\"id\":\"1148\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"callback\":null,\"data\":{\"count\":[1,23,209,436,960,1006,1238,1428,1628,1474,1474,1380,1341,1276,1472,1168,1062,1053,824,519,300,68,42,40,32],\"index\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24],\"period\":[\"07:00\",\"07:30\",\"08:00\",\"08:30\",\"09:00\",\"09:30\",\"10:00\",\"10:30\",\"11:00\",\"11:30\",\"12:00\",\"12:30\",\"13:00\",\"13:30\",\"14:00\",\"14:30\",\"15:00\",\"15:30\",\"16:00\",\"16:30\",\"17:00\",\"17:30\",\"18:00\",\"18:30\",\"19:00\"]},\"selected\":{\"id\":\"1166\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1167\",\"type\":\"UnionRende
rers\"}},\"id\":\"1111\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"callback\":null,\"tooltips\":[[\"period\",\"@period\"],[\"count\",\"@{count}{,}\"]]},\"id\":\"1136\",\"type\":\"HoverTool\"},{\"attributes\":{},\"id\":\"1119\",\"type\":\"LinearScale\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1130\",\"type\":\"PanTool\"},{\"id\":\"1131\",\"type\":\"WheelZoomTool\"},{\"id\":\"1132\",\"type\":\"BoxZoomTool\"},{\"id\":\"1133\",\"type\":\"SaveTool\"},{\"id\":\"1134\",\"type\":\"ResetTool\"},{\"id\":\"1135\",\"type\":\"HelpTool\"},{\"id\":\"1136\",\"type\":\"HoverTool\"}]},\"id\":\"1137\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1166\",\"type\":\"Selection\"},{\"attributes\":{\"callback\":null},\"id\":\"1115\",\"type\":\"DataRange1d\"},{\"attributes\":{\"overlay\":{\"id\":\"1165\",\"type\":\"BoxAnnotation\"}},\"id\":\"1132\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"source\":{\"id\":\"1111\",\"type\":\"ColumnDataSource\"}},\"id\":\"1149\",\"type\":\"CDSView\"},{\"attributes\":{\"dimension\":1,\"ticker\":{\"id\":\"1126\",\"type\":\"BasicTicker\"}},\"id\":\"1129\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1163\",\"type\":\"CategoricalTickFormatter\"},{\"attributes\":{},\"id\":\"1161\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"ticker\":{\"id\":\"1122\",\"type\":\"CategoricalTicker\"}},\"id\":\"1124\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1133\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1117\",\"type\":\"CategoricalScale\"},{\"attributes\":{},\"id\":\"1134\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1122\",\"type\":\"CategoricalTicker\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"lightgrey\"},\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":{\"value\":1.0},\"line_color\":{\"value\":\"black\"},\"line_dash\":[4,4],\"line
_width\":{\"value\":2},\"render_mode\":\"css\",\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1165\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"text\":\"Items Sold/Hour\"},\"id\":\"1150\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1167\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"axis_label\":\"Total Items Sold\",\"formatter\":{\"id\":\"1161\",\"type\":\"BasicTickFormatter\"},\"ticker\":{\"id\":\"1126\",\"type\":\"BasicTicker\"}},\"id\":\"1125\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1135\",\"type\":\"HelpTool\"},{\"attributes\":{\"callback\":null,\"factors\":[\"07:00\",\"07:30\",\"08:00\",\"08:30\",\"09:00\",\"09:30\",\"10:00\",\"10:30\",\"11:00\",\"11:30\",\"12:00\",\"12:30\",\"13:00\",\"13:30\",\"14:00\",\"14:30\",\"15:00\",\"15:30\",\"16:00\",\"16:30\",\"17:00\",\"17:30\",\"18:00\",\"18:30\",\"19:00\"]},\"id\":\"1113\",\"type\":\"FactorRange\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"top\":{\"field\":\"count\"},\"width\":{\"value\":0.9},\"x\":{\"field\":\"period\"}},\"id\":\"1147\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"1126\",\"type\":\"BasicTicker\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"line_color\":{\"value\":\"#1f77b4\"},\"top\":{\"field\":\"count\"},\"width\":{\"value\":0.9},\"x\":{\"field\":\"period\"}},\"id\":\"1146\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"1131\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1130\",\"type\":\"PanTool\"}],\"root_ids\":[\"1112\"]},\"title\":\"Bokeh Application\",\"version\":\"1.4.0\"}};\n",
1089 | " var render_items = [{\"docid\":\"8383c3d3-ce27-4680-adaa-5c98f981259b\",\"roots\":{\"1112\":\"1bc6866d-29a0-4b79-87d3-d01eee51b3b7\"}}];\n",
1090 | " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n",
1091 | "\n",
1092 | " }\n",
1093 | " if (root.Bokeh !== undefined) {\n",
1094 | " embed_document(root);\n",
1095 | " } else {\n",
1096 | " var attempts = 0;\n",
1097 | " var timer = setInterval(function(root) {\n",
1098 | " if (root.Bokeh !== undefined) {\n",
1099 | " clearInterval(timer);\n",
1100 | " embed_document(root);\n",
1101 | " } else {\n",
1102 | " attempts++;\n",
1103 | " if (attempts > 100) {\n",
1104 | " clearInterval(timer);\n",
1105 | " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n",
1106 | " }\n",
1107 | " }\n",
1108 | " }, 10, root)\n",
1109 | " }\n",
1110 | "})(window);"
1111 | ],
1112 | "application/vnd.bokehjs_exec.v0+json": ""
1113 | },
1114 | "metadata": {
1115 | "application/vnd.bokehjs_exec.v0+json": {
1116 | "id": "1112"
1117 | }
1118 | },
1119 | "output_type": "display_data"
1120 | }
1121 | ],
1122 | "source": [
1123 | "source = ColumnDataSource(data=df5.toPandas())\n",
1124 | "tooltips = [('period', '@period'), ('count', '@{count}{,}')]\n",
1125 | "periods = source.data['period'].tolist()\n",
1126 | "plot = figure(x_range=periods,\n",
1127 | " plot_width=900,\n",
1128 | " plot_height=450,\n",
1129 | " min_border=0,\n",
1130 | " tooltips=tooltips)\n",
1131 | "plot.vbar(x='period', bottom=0, top='count', source=source, width=0.9)\n",
1132 | "plot.title.text = 'Items Sold/Hour'\n",
1133 | "plot.xaxis.axis_label = 'Hour of the Day'\n",
1134 | "plot.yaxis.axis_label = 'Total Items Sold'\n",
1135 | "\n",
1136 | "show(plot)"
1137 | ]
1138 | },
1139 | {
1140 | "cell_type": "markdown",
1141 | "metadata": {},
1142 | "source": [
1143 | "### Horizontal Bar Chart\n",
1144 | "What are the top selling bakery items?"
1145 | ]
1146 | },
1147 | {
1148 | "cell_type": "code",
1149 | "execution_count": 21,
1150 | "metadata": {
1151 | "ExecuteTime": {
1152 | "end_time": "2019-12-06T03:45:58.169964Z",
1153 | "start_time": "2019-12-06T03:45:57.042381Z"
1154 | },
1155 | "pycharm": {
1156 | "is_executing": false
1157 | }
1158 | },
1159 | "outputs": [
1160 | {
1161 | "name": "stdout",
1162 | "output_type": "stream",
1163 | "text": [
1164 | "+-------------+-----+\n",
1165 | "|item |count|\n",
1166 | "+-------------+-----+\n",
1167 | "|Coffee |5471 |\n",
1168 | "|Bread |3325 |\n",
1169 | "|Tea |1435 |\n",
1170 | "|Cake |1025 |\n",
1171 | "|Pastry |856 |\n",
1172 | "|Sandwich |771 |\n",
1173 | "|Medialuna |616 |\n",
1174 | "|Hot chocolate|590 |\n",
1175 | "|Cookies |540 |\n",
1176 | "|Brownie |379 |\n",
1177 | "+-------------+-----+\n",
1178 | "\n"
1179 | ]
1180 | }
1181 | ],
1182 | "source": [
1183 | "sql_query = \"SELECT item, count(*) as count \" \\\n",
1184 | " \"FROM tmp_bakery \" \\\n",
1185 | " \"WHERE item NOT LIKE 'NONE' AND item NOT LIKE 'Adjustment' \" \\\n",
1186 | " \"GROUP BY item \" \\\n",
1187 | " \"ORDER BY count DESC \" \\\n",
1188 | " \"LIMIT 10\"\n",
1189 | "\n",
1190 | "df6 = spark.sql(sql_query)\n",
1191 | "df6.show(10, False)"
1192 | ]
1193 | },
1194 | {
1195 | "cell_type": "code",
1196 | "execution_count": 22,
1197 | "metadata": {
1198 | "ExecuteTime": {
1199 | "end_time": "2019-12-06T03:45:59.120361Z",
1200 | "start_time": "2019-12-06T03:45:58.171342Z"
1201 | },
1202 | "pycharm": {
1203 | "is_executing": false
1204 | }
1205 | },
1206 | "outputs": [
1207 | {
1208 | "data": {
1209 | "text/html": [
1210 | "\n",
1211 | "\n",
1212 | "\n",
1213 | "\n",
1214 | "\n",
1215 | "\n",
1216 | " \n"
1217 | ]
1218 | },
1219 | "metadata": {},
1220 | "output_type": "display_data"
1221 | },
1222 | {
1223 | "data": {
1224 | "application/javascript": [
1225 | "(function(root) {\n",
1226 | " function embed_document(root) {\n",
1227 | " \n",
1228 | " var docs_json = {\"d3112e97-deb3-4dea-90a8-8ab0648c6a45\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1232\",\"type\":\"LinearAxis\"}],\"center\":[{\"id\":\"1236\",\"type\":\"Grid\"},{\"id\":\"1240\",\"type\":\"Grid\"}],\"left\":[{\"id\":\"1237\",\"type\":\"CategoricalAxis\"}],\"min_border\":0,\"plot_height\":375,\"plot_width\":750,\"renderers\":[{\"id\":\"1259\",\"type\":\"GlyphRenderer\"}],\"title\":{\"id\":\"1261\",\"type\":\"Title\"},\"toolbar\":{\"id\":\"1248\",\"type\":\"Toolbar\"},\"x_range\":{\"id\":\"1224\",\"type\":\"DataRange1d\"},\"x_scale\":{\"id\":\"1228\",\"type\":\"LinearScale\"},\"y_range\":{\"id\":\"1226\",\"type\":\"FactorRange\"},\"y_scale\":{\"id\":\"1230\",\"type\":\"CategoricalScale\"}},\"id\":\"1223\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"text\":\"Top 10 Bakery Items\"},\"id\":\"1261\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1228\",\"type\":\"LinearScale\"},{\"attributes\":{\"ticker\":{\"id\":\"1233\",\"type\":\"BasicTicker\"}},\"id\":\"1236\",\"type\":\"Grid\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"value\":0.9},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"right\":{\"field\":\"count\"},\"y\":{\"field\":\"item\"}},\"id\":\"1258\",\"type\":\"HBar\"},{\"attributes\":{\"dimension\":1,\"ticker\":{\"id\":\"1238\",\"type\":\"CategoricalTicker\"}},\"id\":\"1240\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"1222\",\"type\":\"ColumnDataSource\"}},\"id\":\"1260\",\"type\":\"CDSView\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1241\",\"type\":\"PanTool\"},{\"id\":\"1242\",\"type\":\"WheelZoomTool\"},{\"id\":\"1243\",\"type\":\"BoxZoomTool\"},{\"id\":\"1244\",\"type\":\"SaveTool\"},{\"id\":\"1245\",\"type\":\"ResetTool\"},{\"id\":\"1246\",\"type\":\"HelpTool\"},{\"id\":\"12
47\",\"type\":\"HoverTool\"}]},\"id\":\"1248\",\"type\":\"Toolbar\"},{\"attributes\":{\"data_source\":{\"id\":\"1222\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1257\",\"type\":\"HBar\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1258\",\"type\":\"HBar\"},\"selection_glyph\":null,\"view\":{\"id\":\"1260\",\"type\":\"CDSView\"}},\"id\":\"1259\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis_label\":\"Items\",\"formatter\":{\"id\":\"1281\",\"type\":\"CategoricalTickFormatter\"},\"ticker\":{\"id\":\"1238\",\"type\":\"CategoricalTicker\"}},\"id\":\"1237\",\"type\":\"CategoricalAxis\"},{\"attributes\":{\"callback\":null,\"tooltips\":[[\"item\",\"@item\"],[\"count\",\"@{count}{,}\"]]},\"id\":\"1247\",\"type\":\"HoverTool\"},{\"attributes\":{},\"id\":\"1238\",\"type\":\"CategoricalTicker\"},{\"attributes\":{},\"id\":\"1246\",\"type\":\"HelpTool\"},{\"attributes\":{\"callback\":null,\"data\":{\"count\":[5471,3325,1435,1025,856,771,616,590,540,379],\"index\":[0,1,2,3,4,5,6,7,8,9],\"item\":[\"Coffee\",\"Bread\",\"Tea\",\"Cake\",\"Pastry\",\"Sandwich\",\"Medialuna\",\"Hot 
chocolate\",\"Cookies\",\"Brownie\"]},\"selected\":{\"id\":\"1286\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1287\",\"type\":\"UnionRenderers\"}},\"id\":\"1222\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1245\",\"type\":\"ResetTool\"},{\"attributes\":{\"callback\":null},\"id\":\"1224\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1244\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1283\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"overlay\":{\"id\":\"1285\",\"type\":\"BoxAnnotation\"}},\"id\":\"1243\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"lightgrey\"},\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":{\"value\":1.0},\"line_color\":{\"value\":\"black\"},\"line_dash\":[4,4],\"line_width\":{\"value\":2},\"render_mode\":\"css\",\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1285\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"1281\",\"type\":\"CategoricalTickFormatter\"},{\"attributes\":{},\"id\":\"1242\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1286\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1230\",\"type\":\"CategoricalScale\"},{\"attributes\":{},\"id\":\"1287\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1241\",\"type\":\"PanTool\"},{\"attributes\":{\"callback\":null,\"factors\":[\"Brownie\",\"Cookies\",\"Hot chocolate\",\"Medialuna\",\"Sandwich\",\"Pastry\",\"Cake\",\"Tea\",\"Bread\",\"Coffee\"]},\"id\":\"1226\",\"type\":\"FactorRange\"},{\"attributes\":{},\"id\":\"1233\",\"type\":\"BasicTicker\"},{\"attributes\":{\"axis_label\":\"Total Items 
Sold\",\"formatter\":{\"id\":\"1283\",\"type\":\"BasicTickFormatter\"},\"ticker\":{\"id\":\"1233\",\"type\":\"BasicTicker\"}},\"id\":\"1232\",\"type\":\"LinearAxis\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"height\":{\"value\":0.9},\"line_color\":{\"value\":\"#1f77b4\"},\"right\":{\"field\":\"count\"},\"y\":{\"field\":\"item\"}},\"id\":\"1257\",\"type\":\"HBar\"}],\"root_ids\":[\"1223\"]},\"title\":\"Bokeh Application\",\"version\":\"1.4.0\"}};\n",
1229 | " var render_items = [{\"docid\":\"d3112e97-deb3-4dea-90a8-8ab0648c6a45\",\"roots\":{\"1223\":\"53185ccd-0d4a-4acb-9a6b-81521f9da94d\"}}];\n",
1230 | " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n",
1231 | "\n",
1232 | " }\n",
1233 | " if (root.Bokeh !== undefined) {\n",
1234 | " embed_document(root);\n",
1235 | " } else {\n",
1236 | " var attempts = 0;\n",
1237 | " var timer = setInterval(function(root) {\n",
1238 | " if (root.Bokeh !== undefined) {\n",
1239 | " clearInterval(timer);\n",
1240 | " embed_document(root);\n",
1241 | " } else {\n",
1242 | " attempts++;\n",
1243 | " if (attempts > 100) {\n",
1244 | " clearInterval(timer);\n",
1245 | " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n",
1246 | " }\n",
1247 | " }\n",
1248 | " }, 10, root)\n",
1249 | " }\n",
1250 | "})(window);"
1251 | ],
1252 | "application/vnd.bokehjs_exec.v0+json": ""
1253 | },
1254 | "metadata": {
1255 | "application/vnd.bokehjs_exec.v0+json": {
1256 | "id": "1223"
1257 | }
1258 | },
1259 | "output_type": "display_data"
1260 | }
1261 | ],
1262 | "source": [
1263 | "source = ColumnDataSource(data=df6.toPandas())\n",
1264 | "tooltips = [('item', '@item'), ('count', '@{count}{,}')]\n",
1265 | "items = source.data['item'].tolist()\n",
1266 | "items.reverse()\n",
1267 | "plot = figure(y_range=items,\n",
1268 | " plot_width=750,\n",
1269 | " plot_height=375,\n",
1270 | " min_border=0,\n",
1271 | " tooltips=tooltips)\n",
1272 | "plot.hbar(y='item', right='count', height=.9, source=source)\n",
1273 | "plot.title.text = 'Top 10 Bakery Items'\n",
1274 | "plot.yaxis.axis_label = 'Items'\n",
1275 | "plot.xaxis.axis_label = 'Total Items Sold'\n",
1276 | "\n",
1277 | "show(plot)"
1278 | ]
1279 | },
1280 | {
1281 | "cell_type": "markdown",
1282 | "metadata": {},
1283 | "source": [
1284 | "### Vertical Bar Chart\n",
1285 | "How many items do customers usually buy?"
1286 | ]
1287 | },
1288 | {
1289 | "cell_type": "code",
1290 | "execution_count": 23,
1291 | "metadata": {
1292 | "ExecuteTime": {
1293 | "end_time": "2019-12-06T03:46:01.838780Z",
1294 | "start_time": "2019-12-06T03:45:59.122373Z"
1295 | }
1296 | },
1297 | "outputs": [
1298 | {
1299 | "name": "stdout",
1300 | "output_type": "stream",
1301 | "text": [
1302 | "+----------+-----+\n",
1303 | "|order_size|count|\n",
1304 | "+----------+-----+\n",
1305 | "|1 |3630 |\n",
1306 | "|2 |2908 |\n",
1307 | "|3 |1528 |\n",
1308 | "|4 |850 |\n",
1309 | "|5 |341 |\n",
1310 | "|6 |135 |\n",
1311 | "|7 |38 |\n",
1312 | "|8 |21 |\n",
1313 | "|9 |7 |\n",
1314 | "|10 |2 |\n",
1315 | "|11 |4 |\n",
1316 | "+----------+-----+\n",
1317 | "\n"
1318 | ]
1319 | }
1320 | ],
1321 | "source": [
1322 | "sql_query = \"WITH tmp_table AS (\" \\\n",
1323 | " \" SELECT transaction, count(*) as order_size \" \\\n",
1324 | " \" FROM tmp_bakery \" \\\n",
1325 | " \" WHERE item NOT LIKE 'NONE' AND item NOT LIKE 'Adjustment' \" \\\n",
1326 | " \" GROUP BY transaction \" \\\n",
1327 | " \" ORDER BY order_size DESC\" \\\n",
1328 | " \") \" \\\n",
1329 | " \"SELECT order_size, count(*) as count \" \\\n",
1330 | " \"FROM tmp_table \" \\\n",
1331 | " \"GROUP BY order_size \" \\\n",
1332 | " \"ORDER BY order_size ASC\" \\\n",
1333 | "\n",
1334 | "df7 = spark.sql(sql_query)\n",
1335 | "df7.show(24, False)"
1336 | ]
1337 | },
1338 | {
1339 | "cell_type": "code",
1340 | "execution_count": 24,
1341 | "metadata": {
1342 | "ExecuteTime": {
1343 | "end_time": "2019-12-06T03:46:04.794879Z",
1344 | "start_time": "2019-12-06T03:46:01.842464Z"
1345 | },
1346 | "pycharm": {
1347 | "is_executing": false
1348 | }
1349 | },
1350 | "outputs": [
1351 | {
1352 | "data": {
1353 | "text/html": [
1354 | "\n",
1355 | "\n",
1356 | "\n",
1357 | "\n",
1358 | "\n",
1359 | "\n",
1360 | " \n"
1361 | ]
1362 | },
1363 | "metadata": {},
1364 | "output_type": "display_data"
1365 | },
1366 | {
1367 | "data": {
1368 | "application/javascript": [
1369 | "(function(root) {\n",
1370 | " function embed_document(root) {\n",
1371 | " \n",
1372 | " var docs_json = {\"e530c1b2-2392-434a-886f-b501dd29c1bd\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1352\",\"type\":\"CategoricalAxis\"}],\"center\":[{\"id\":\"1355\",\"type\":\"Grid\"},{\"id\":\"1360\",\"type\":\"Grid\"}],\"left\":[{\"id\":\"1356\",\"type\":\"LinearAxis\"}],\"min_border\":0,\"plot_height\":375,\"plot_width\":750,\"renderers\":[{\"id\":\"1379\",\"type\":\"GlyphRenderer\"},{\"id\":\"1384\",\"type\":\"GlyphRenderer\"}],\"title\":{\"id\":\"1386\",\"type\":\"Title\"},\"toolbar\":{\"id\":\"1368\",\"type\":\"Toolbar\"},\"x_range\":{\"id\":\"1344\",\"type\":\"FactorRange\"},\"x_scale\":{\"id\":\"1348\",\"type\":\"CategoricalScale\"},\"y_range\":{\"id\":\"1346\",\"type\":\"DataRange1d\"},\"y_scale\":{\"id\":\"1350\",\"type\":\"LinearScale\"}},\"id\":\"1343\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1417\",\"type\":\"CategoricalTickFormatter\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"top\":{\"field\":\"count\"},\"width\":{\"value\":0.9},\"x\":{\"field\":\"order_size\"}},\"id\":\"1378\",\"type\":\"VBar\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1361\",\"type\":\"PanTool\"},{\"id\":\"1362\",\"type\":\"WheelZoomTool\"},{\"id\":\"1363\",\"type\":\"BoxZoomTool\"},{\"id\":\"1364\",\"type\":\"SaveTool\"},{\"id\":\"1365\",\"type\":\"ResetTool\"},{\"id\":\"1366\",\"type\":\"HelpTool\"},{\"id\":\"1367\",\"type\":\"HoverTool\"}]},\"id\":\"1368\",\"type\":\"Toolbar\"},{\"attributes\":{\"callback\":null},\"id\":\"1346\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1361\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1350\",\"type\":\"LinearScale\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"lightgre
y\"},\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":{\"value\":1.0},\"line_color\":{\"value\":\"black\"},\"line_dash\":[4,4],\"line_width\":{\"value\":2},\"render_mode\":\"css\",\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1419\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"1362\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1420\",\"type\":\"Selection\"},{\"attributes\":{\"overlay\":{\"id\":\"1419\",\"type\":\"BoxAnnotation\"}},\"id\":\"1363\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"callback\":null,\"factors\":[\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"10\",\"11\"]},\"id\":\"1344\",\"type\":\"FactorRange\"},{\"attributes\":{},\"id\":\"1421\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"line_color\":{\"value\":\"#1f77b4\"},\"top\":{\"field\":\"count\"},\"width\":{\"value\":0.9},\"x\":{\"field\":\"order_size\"}},\"id\":\"1377\",\"type\":\"VBar\"},{\"attributes\":{},\"id\":\"1364\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1353\",\"type\":\"CategoricalTicker\"},{\"attributes\":{},\"id\":\"1365\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1366\",\"type\":\"HelpTool\"},{\"attributes\":{\"callback\":null,\"tooltips\":[[\"order_size\",\"@order_size\"],[\"count\",\"@count\"]]},\"id\":\"1367\",\"type\":\"HoverTool\"},{\"attributes\":{\"callback\":null,\"data\":{\"count\":[3630,2908,1528,850,341,135,38,21,7,2,4],\"index\":[0,1,2,3,4,5,6,7,8,9,10],\"order_size\":[1,2,3,4,5,6,7,8,9,10,11]},\"selected\":{\"id\":\"1420\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1421\",\"type\":\"UnionRenderers\"}},\"id\":\"1342\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data_source\":{\"id\":\"1342\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1377\",\"type\":\"VBar\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1378\",\"type\":\"VBar\"},\"selection_glyph\":null,\"view\":{\"id\":\"1380\",\"type\":\"CDSVi
ew\"}},\"id\":\"1379\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"source\":{\"id\":\"1342\",\"type\":\"ColumnDataSource\"}},\"id\":\"1380\",\"type\":\"CDSView\"},{\"attributes\":{\"text\":\"Transaction Size\"},\"id\":\"1386\",\"type\":\"Title\"},{\"attributes\":{\"dimension\":1,\"ticker\":{\"id\":\"1357\",\"type\":\"BasicTicker\"}},\"id\":\"1360\",\"type\":\"Grid\"},{\"attributes\":{\"line_color\":\"red\",\"line_width\":2,\"x\":{\"field\":\"order_size\"},\"y\":{\"field\":\"count\"}},\"id\":\"1382\",\"type\":\"Line\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"order_size\"},\"y\":{\"field\":\"count\"}},\"id\":\"1383\",\"type\":\"Line\"},{\"attributes\":{},\"id\":\"1357\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1348\",\"type\":\"CategoricalScale\"},{\"attributes\":{\"data_source\":{\"id\":\"1342\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1382\",\"type\":\"Line\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1383\",\"type\":\"Line\"},\"selection_glyph\":null,\"view\":{\"id\":\"1385\",\"type\":\"CDSView\"}},\"id\":\"1384\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"axis_label\":\"Total Transactions\",\"formatter\":{\"id\":\"1415\",\"type\":\"BasicTickFormatter\"},\"ticker\":{\"id\":\"1357\",\"type\":\"BasicTicker\"}},\"id\":\"1356\",\"type\":\"LinearAxis\"},{\"attributes\":{\"ticker\":{\"id\":\"1353\",\"type\":\"CategoricalTicker\"}},\"id\":\"1355\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"1342\",\"type\":\"ColumnDataSource\"}},\"id\":\"1385\",\"type\":\"CDSView\"},{\"attributes\":{\"axis_label\":\"Items/Transaction\",\"formatter\":{\"id\":\"1417\",\"type\":\"CategoricalTickFormatter\"},\"ticker\":{\"id\":\"1353\",\"type\":\"CategoricalTicker\"}},\"id\":\"1352\",\"type\":\"CategoricalAxis\"},{\"attributes\":{},\"id\":\"1415\",\"type\":\"BasicTickFormatter\"}],\"root_ids\":[\"1343\"]},\"title\":\"Bokeh 
Application\",\"version\":\"1.4.0\"}};\n",
1373 | " var render_items = [{\"docid\":\"e530c1b2-2392-434a-886f-b501dd29c1bd\",\"roots\":{\"1343\":\"2e6f7fd3-d357-4ab4-9c76-c15c1c682ccb\"}}];\n",
1374 | " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n",
1375 | "\n",
1376 | " }\n",
1377 | " if (root.Bokeh !== undefined) {\n",
1378 | " embed_document(root);\n",
1379 | " } else {\n",
1380 | " var attempts = 0;\n",
1381 | " var timer = setInterval(function(root) {\n",
1382 | " if (root.Bokeh !== undefined) {\n",
1383 | " clearInterval(timer);\n",
1384 | " embed_document(root);\n",
1385 | " } else {\n",
1386 | " attempts++;\n",
1387 | " if (attempts > 100) {\n",
1388 | " clearInterval(timer);\n",
1389 | " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n",
1390 | " }\n",
1391 | " }\n",
1392 | " }, 10, root)\n",
1393 | " }\n",
1394 | "})(window);"
1395 | ],
1396 | "application/vnd.bokehjs_exec.v0+json": ""
1397 | },
1398 | "metadata": {
1399 | "application/vnd.bokehjs_exec.v0+json": {
1400 | "id": "1343"
1401 | }
1402 | },
1403 | "output_type": "display_data"
1404 | }
1405 | ],
1406 | "source": [
1407 | "source = ColumnDataSource(data=df7.toPandas())\n",
1408 | "tooltips = [('order_size', '@order_size'), ('count', '@count')]\n",
1409 | "items = source.data['order_size'].tolist()\n",
1410 | "items = list(map(str, items))\n",
1411 | "plot = figure(x_range=items,\n",
1412 | " plot_width=750,\n",
1413 | " plot_height=375,\n",
1414 | " min_border=0,\n",
1415 | " tooltips=tooltips)\n",
1416 | "plot.vbar(x='order_size', bottom=0, top='count', source=source, width=0.9)\n",
1417 | "plot.line(x='order_size',\n",
1418 | " y='count',\n",
1419 | " source=source,\n",
1420 | " line_color='red',\n",
1421 | " line_width=2)\n",
1422 | "plot.title.text = 'Transaction Size'\n",
1423 | "plot.xaxis.axis_label = 'Items/Transaction'\n",
1424 | "plot.yaxis.axis_label = 'Total Transactions'\n",
1425 | "\n",
1426 | "show(plot)"
1427 | ]
1428 | },
1429 | {
1430 | "cell_type": "markdown",
1431 | "metadata": {},
1432 | "source": [
1433 | "## Read and Write Data to Parquet\n",
1434 | "Perform basic analysis of the bakery data using Spark SQL. Write the resulting DataFrame contents to [Apache Parquet](https://parquet.apache.org/) format, then read them back."
1435 | ]
1436 | },
1437 | {
1438 | "cell_type": "code",
1439 | "execution_count": 25,
1440 | "metadata": {
1441 | "ExecuteTime": {
1442 | "end_time": "2019-12-06T03:46:07.263463Z",
1443 | "start_time": "2019-12-06T03:46:04.800474Z"
1444 | },
1445 | "pycharm": {
1446 | "is_executing": false
1447 | }
1448 | },
1449 | "outputs": [
1450 | {
1451 | "name": "stdout",
1452 | "output_type": "stream",
1453 | "text": [
1454 | "DataFrame rows: 20506\n",
1455 | "DataFrame schema: DataFrame[transaction: int, timestamp: timestamp, item: string]\n",
1456 | "+-----------+-------------------+-------------+\n",
1457 | "|transaction|timestamp |item |\n",
1458 | "+-----------+-------------------+-------------+\n",
1459 | "|1 |2016-10-30 09:58:11|Bread |\n",
1460 | "|2 |2016-10-30 10:05:34|Scandinavian |\n",
1461 | "|2 |2016-10-30 10:05:34|Scandinavian |\n",
1462 | "|3 |2016-10-30 10:07:57|Cookies |\n",
1463 | "|3 |2016-10-30 10:07:57|Hot chocolate|\n",
1464 | "|3 |2016-10-30 10:07:57|Jam |\n",
1465 | "|4 |2016-10-30 10:08:41|Muffin |\n",
1466 | "|5 |2016-10-30 10:13:03|Bread |\n",
1467 | "|5 |2016-10-30 10:13:03|Coffee |\n",
1468 | "|5 |2016-10-30 10:13:03|Pastry |\n",
1469 | "+-----------+-------------------+-------------+\n",
1470 | "only showing top 10 rows\n",
1471 | "\n"
1472 | ]
1473 | }
1474 | ],
1475 | "source": [
1476 | "sql_query = \"SELECT transaction, CAST(CONCAT(date,' ',time) as timestamp) as timestamp, item \" \\\n",
1477 | " \"FROM tmp_bakery \" \\\n",
1478 | " \"WHERE item NOT LIKE 'NONE' AND item NOT LIKE 'Adjustment' \" \\\n",
1479 | " \"ORDER BY transaction ASC, item ASC\"\n",
1480 | "\n",
1481 | "df8 = spark.sql(sql_query)\n",
1482 | "print('DataFrame rows: %d' % df8.count())\n",
1483 | "print('DataFrame schema: %s' % df8)\n",
1484 | "df8.show(10, False)"
1485 | ]
1486 | },
1487 | {
1488 | "cell_type": "code",
1489 | "execution_count": 26,
1490 | "metadata": {
1491 | "ExecuteTime": {
1492 | "end_time": "2019-12-06T03:46:20.183778Z",
1493 | "start_time": "2019-12-06T03:46:07.264751Z"
1494 | },
1495 | "pycharm": {
1496 | "is_executing": false
1497 | }
1498 | },
1499 | "outputs": [],
1500 | "source": [
1501 | "df8.write.parquet('output/bakery_parquet', mode='overwrite')"
1502 | ]
1503 | },
1504 | {
1505 | "cell_type": "code",
1506 | "execution_count": 27,
1507 | "metadata": {
1508 | "ExecuteTime": {
1509 | "end_time": "2019-12-06T03:46:21.330630Z",
1510 | "start_time": "2019-12-06T03:46:20.184990Z"
1511 | },
1512 | "pycharm": {
1513 | "is_executing": false
1514 | }
1515 | },
1516 | "outputs": [
1517 | {
1518 | "name": "stdout",
1519 | "output_type": "stream",
1520 | "text": [
1521 | "total 800K\n",
1522 | "-rw-r--r-- 1 garystaf users 1.9K Dec 6 03:46 part-00000-50c8ea60-bdf4-4213-a6cd-78c9df626246-c000.snappy.parquet\n",
1523 | "-rw-r--r-- 1 garystaf users 2.0K Dec 6 03:46 part-00001-50c8ea60-bdf4-4213-a6cd-78c9df626246-c000.snappy.parquet\n",
1524 | "-rw-r--r-- 1 garystaf users 1.8K Dec 6 03:46 part-00002-50c8ea60-bdf4-4213-a6cd-78c9df626246-c000.snappy.parquet\n",
1525 | "-rw-r--r-- 1 garystaf users 2.0K Dec 6 03:46 part-00003-50c8ea60-bdf4-4213-a6cd-78c9df626246-c000.snappy.parquet\n",
1526 | "-rw-r--r-- 1 garystaf users 1.9K Dec 6 03:46 part-00004-50c8ea60-bdf4-4213-a6cd-78c9df626246-c000.snappy.parquet\n",
1527 | "-rw-r--r-- 1 garystaf users 1.9K Dec 6 03:46 part-00005-50c8ea60-bdf4-4213-a6cd-78c9df626246-c000.snappy.parquet\n",
1528 | "-rw-r--r-- 1 garystaf users 2.0K Dec 6 03:46 part-00006-50c8ea60-bdf4-4213-a6cd-78c9df626246-c000.snappy.parquet\n",
1529 | "-rw-r--r-- 1 garystaf users 1.9K Dec 6 03:46 part-00007-50c8ea60-bdf4-4213-a6cd-78c9df626246-c000.snappy.parquet\n",
1530 | "-rw-r--r-- 1 garystaf users 2.1K Dec 6 03:46 part-00008-50c8ea60-bdf4-4213-a6cd-78c9df626246-c000.snappy.parquet\n",
1531 | "Parquet Files: 13\n"
1532 | ]
1533 | }
1534 | ],
1535 | "source": [
1536 | "! ls 2>&1 -lh output/bakery_parquet | head -10\n",
1537 | "! echo 'Parquet Files:' $(ls | wc -l)"
1538 | ]
1539 | },
1540 | {
1541 | "cell_type": "code",
1542 | "execution_count": 28,
1543 | "metadata": {
1544 | "ExecuteTime": {
1545 | "end_time": "2019-12-06T03:46:30.259680Z",
1546 | "start_time": "2019-12-06T03:46:21.339399Z"
1547 | },
1548 | "pycharm": {
1549 | "is_executing": false
1550 | }
1551 | },
1552 | "outputs": [
1553 | {
1554 | "name": "stdout",
1555 | "output_type": "stream",
1556 | "text": [
1557 | "DataFrame rows: 20506\n",
1558 | "DataFrame schema: DataFrame[transaction: int, timestamp: timestamp, item: string]\n",
1559 | "+-----------+-------------------+-------------+\n",
1560 | "|transaction|timestamp |item |\n",
1561 | "+-----------+-------------------+-------------+\n",
1562 | "|1 |2016-10-30 09:58:11|Bread |\n",
1563 | "|2 |2016-10-30 10:05:34|Scandinavian |\n",
1564 | "|2 |2016-10-30 10:05:34|Scandinavian |\n",
1565 | "|3 |2016-10-30 10:07:57|Cookies |\n",
1566 | "|3 |2016-10-30 10:07:57|Hot chocolate|\n",
1567 | "|3 |2016-10-30 10:07:57|Jam |\n",
1568 | "|4 |2016-10-30 10:08:41|Muffin |\n",
1569 | "|5 |2016-10-30 10:13:03|Bread |\n",
1570 | "|5 |2016-10-30 10:13:03|Coffee |\n",
1571 | "|5 |2016-10-30 10:13:03|Pastry |\n",
1572 | "+-----------+-------------------+-------------+\n",
1573 | "only showing top 10 rows\n",
1574 | "\n"
1575 | ]
1576 | }
1577 | ],
1578 | "source": [
1579 | "df9 = spark.read.parquet('output/bakery_parquet')\n",
1580 | "print('DataFrame rows: %d' % df9.count())\n",
1581 | "print('DataFrame schema: %s' % df9)\n",
1582 | "df9.select('transaction', 'timestamp', 'item') \\\n",
1583 | " .sort('transaction', 'item') \\\n",
1584 | " .show(10, False)"
1585 | ]
1586 | }
1587 | ],
1588 | "metadata": {
1589 | "kernelspec": {
1590 | "display_name": "Python 3",
1591 | "language": "python",
1592 | "name": "python3"
1593 | },
1594 | "language_info": {
1595 | "codemirror_mode": {
1596 | "name": "ipython",
1597 | "version": 3
1598 | },
1599 | "file_extension": ".py",
1600 | "mimetype": "text/x-python",
1601 | "name": "python",
1602 | "nbconvert_exporter": "python",
1603 | "pygments_lexer": "ipython3",
1604 | "version": "3.7.3"
1605 | },
1606 | "pycharm": {
1607 | "stem_cell": {
1608 | "cell_type": "raw",
1609 | "source": [],
1610 | "metadata": {
1611 | "collapsed": false
1612 | }
1613 | }
1614 | },
1615 | "toc": {
1616 | "base_numbering": 1,
1617 | "nav_menu": {},
1618 | "number_sections": false,
1619 | "sideBar": true,
1620 | "skip_h1_title": false,
1621 | "title_cell": "Table of Contents",
1622 | "title_sidebar": "Contents",
1623 | "toc_cell": false,
1624 | "toc_position": {},
1625 | "toc_section_display": true,
1626 | "toc_window_display": true
1627 | }
1628 | },
1629 | "nbformat": 4,
1630 | "nbformat_minor": 2
1631 | }
1632 |
--------------------------------------------------------------------------------
/work/05_notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# PySpark Demo Notebook 5\n",
8 | "\n",
9 | "## Contents\n",
10 | "\n",
11 | "1. [Setup Spark](#Setup-Spark)\n",
12 | "2. [Load Kaggle Data](#Load-Kaggle-Dataset)\n",
13 | "3. [Analyze Data with Spark SQL](#Analyze-Data-with-Spark-SQL)\n",
14 | "4. [Graph Data with Plotly](#Graph-Data-with-Plotly)\n",
15 | "\n",
16 | "## Requirements\n",
17 | "\n",
18 | "1. Create a free [Plotly Chart Studio](https://chart-studio.plot.ly) account\n",
19 | "2. Generate a Plotly API key\n",
20 | "3. Add your Plotly username and API key to the .env file\n",
21 | "\n",
22 | "## Background\n",
23 | "\n",
24 | "_Prepared by: [Gary A. Stafford](https://twitter.com/GaryStafford) \n",
25 | "Associated article: [Getting Started with Data Analytics using Jupyter Notebooks, PySpark, and Docker](https://wp.me/p1RD28-6Fj)_"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "### Setup Spark\n",
33 | "Set up the SparkSession, the entry point to programming Spark with the Dataset and DataFrame API."
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 1,
39 | "metadata": {
40 | "ExecuteTime": {
41 | "end_time": "2019-12-06T22:31:06.448034Z",
42 | "start_time": "2019-12-06T22:31:06.070195Z"
43 | }
44 | },
45 | "outputs": [],
46 | "source": [
47 | "from pyspark.sql import SparkSession"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 2,
53 | "metadata": {
54 | "ExecuteTime": {
55 | "end_time": "2019-12-06T22:31:12.549446Z",
56 | "start_time": "2019-12-06T22:31:06.450315Z"
57 | }
58 | },
59 | "outputs": [],
60 | "source": [
61 | "# reference: https://spark.apache.org/docs/latest/configuration.html#viewing-spark-properties\n",
62 | "spark = SparkSession \\\n",
63 | " .builder \\\n",
64 | " .appName('05_notebook') \\\n",
65 | " .getOrCreate()"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 3,
71 | "metadata": {
72 | "ExecuteTime": {
73 | "end_time": "2019-12-06T22:31:12.607373Z",
74 | "start_time": "2019-12-06T22:31:12.554880Z"
75 | }
76 | },
77 | "outputs": [
78 | {
79 | "data": {
80 | "text/plain": [
81 | "[('spark.driver.host', '87ee13b37142'),\n",
82 | " ('spark.app.id', 'local-1577921437466'),\n",
83 | " ('spark.app.name', '05_notebook'),\n",
84 | " ('spark.driver.port', '33709'),\n",
85 | " ('spark.rdd.compress', 'True'),\n",
86 | " ('spark.serializer.objectStreamReset', '100'),\n",
87 | " ('spark.master', 'local[*]'),\n",
88 | " ('spark.executor.id', 'driver'),\n",
89 | " ('spark.submit.deployMode', 'client'),\n",
90 | " ('spark.ui.showConsoleProgress', 'true')]"
91 | ]
92 | },
93 | "execution_count": 3,
94 | "metadata": {},
95 | "output_type": "execute_result"
96 | }
97 | ],
98 | "source": [
99 | "spark.sparkContext.getConf().getAll()"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 5,
105 | "metadata": {
106 | "ExecuteTime": {
107 | "end_time": "2019-12-06T22:31:20.048786Z",
108 | "start_time": "2019-12-06T22:31:12.608668Z"
109 | },
110 | "scrolled": true
111 | },
112 | "outputs": [
113 | {
114 | "name": "stdout",
115 | "output_type": "stream",
116 | "text": [
117 | "DataFrame rows: 21293\n",
118 | "DataFrame schema: DataFrame[Date: timestamp, Time: string, Transaction: int, Item: string]\n",
119 | "+-------------------+--------+-----------+-------------+\n",
120 | "|Date |Time |Transaction|Item |\n",
121 | "+-------------------+--------+-----------+-------------+\n",
122 | "|2016-10-30 00:00:00|09:58:11|1 |Bread |\n",
123 | "|2016-10-30 00:00:00|10:05:34|2 |Scandinavian |\n",
124 | "|2016-10-30 00:00:00|10:05:34|2 |Scandinavian |\n",
125 | "|2016-10-30 00:00:00|10:07:57|3 |Hot chocolate|\n",
126 | "|2016-10-30 00:00:00|10:07:57|3 |Jam |\n",
127 | "|2016-10-30 00:00:00|10:07:57|3 |Cookies |\n",
128 | "|2016-10-30 00:00:00|10:08:41|4 |Muffin |\n",
129 | "|2016-10-30 00:00:00|10:13:03|5 |Coffee |\n",
130 | "|2016-10-30 00:00:00|10:13:03|5 |Pastry |\n",
131 | "|2016-10-30 00:00:00|10:13:03|5 |Bread |\n",
132 | "+-------------------+--------+-----------+-------------+\n",
133 | "only showing top 10 rows\n",
134 | "\n"
135 | ]
136 | }
137 | ],
138 | "source": [
139 | "df1 = spark.read \\\n",
140 | " .format('csv') \\\n",
141 | " .option('header', 'true') \\\n",
142 | " .option('delimiter', ',') \\\n",
143 | " .option('inferSchema', True) \\\n",
144 | " .load('BreadBasket_DMS.csv')\n",
145 | "\n",
146 | "print('DataFrame rows: %d' % df1.count())\n",
147 | "print('DataFrame schema: %s' % df1)\n",
148 | "df1.show(10, False)"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "## Analyze Data with Spark SQL\n",
156 | "Analyze the DataFrame's bakery data using Spark SQL."
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 26,
162 | "metadata": {
163 | "ExecuteTime": {
164 | "end_time": "2019-12-06T22:31:27.157886Z",
165 | "start_time": "2019-12-06T22:31:20.050988Z"
166 | }
167 | },
168 | "outputs": [
169 | {
170 | "name": "stdout",
171 | "output_type": "stream",
172 | "text": [
173 | "DataFrame rows: 159\n",
174 | "+-------------------+-----+-------------+\n",
175 | "| date|count|hourly_period|\n",
176 | "+-------------------+-----+-------------+\n",
177 | "|2016-10-30 00:00:00| 180| 20|\n",
178 | "|2016-10-31 00:00:00| 205| 20|\n",
179 | "|2016-11-01 00:00:00| 154| 20|\n",
180 | "|2016-11-02 00:00:00| 169| 20|\n",
181 | "|2016-11-03 00:00:00| 195| 20|\n",
182 | "|2016-11-04 00:00:00| 192| 20|\n",
183 | "|2016-11-05 00:00:00| 283| 20|\n",
184 | "|2016-11-06 00:00:00| 203| 20|\n",
185 | "|2016-11-07 00:00:00| 149| 20|\n",
186 | "|2016-11-08 00:00:00| 147| 20|\n",
187 | "+-------------------+-----+-------------+\n",
188 | "only showing top 10 rows\n",
189 | "\n",
190 | "None\n"
191 | ]
192 | }
193 | ],
194 | "source": [
195 | "df1.createOrReplaceTempView('tmp_bakery')\n",
196 | "\n",
197 | "df2 = spark.sql(\"SELECT date, count(*) as count \" + \"FROM tmp_bakery \" +\n",
198 | " \"GROUP BY date \" + \"ORDER BY date\")\n",
199 | "\n",
200 | "print('DataFrame rows: %d' % df2.count())\n",
201 | "# NOTE(review): substr(1, 2) of the date yields its first two characters ('20'), not an hour - confirm intent\n",
202 | "df3 = df2.withColumn(\"hourly_period\", df2['date'].substr(1, 2))\n",
203 | "# show() prints the table itself and returns None, so it must not be wrapped in print()\n",
204 | "df3.show(10)\n"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "## Graph Data with Plotly\n",
212 | "Use [Plotly](https://plot.ly/python/) to create a chart showing bakery items sold over time. Demonstrates linear fit and data smoothing.\n",
213 | "* [Plotly Python Open Source Graphing Library](https://plot.ly/python/)\n",
214 | "* [Smoothing in Python](https://plot.ly/python/smoothing/)\n",
215 | "* [Linear Fit in Python](https://plot.ly/python/linear-fits/)"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "## Import Plotly Dependencies\n",
223 | "Import the libraries used to chart the data with Plotly, then load the Plotly credentials from the environment."
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 6,
229 | "metadata": {
230 | "ExecuteTime": {
231 | "end_time": "2019-12-06T22:31:28.708003Z",
232 | "start_time": "2019-12-06T22:31:27.163677Z"
233 | }
234 | },
235 | "outputs": [],
236 | "source": [
237 | "import os\n",
238 | "from dotenv import load_dotenv\n",
239 | "import chart_studio.tools\n",
240 | "import chart_studio.plotly as py\n",
241 | "import plotly.graph_objs as go\n",
242 | "from numpy import arange\n",
243 | "from scipy import stats, signal\n",
244 | "import warnings\n",
245 | "\n",
246 | "warnings.filterwarnings('ignore')"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 7,
252 | "metadata": {
253 | "ExecuteTime": {
254 | "end_time": "2019-12-06T22:31:28.750965Z",
255 | "start_time": "2019-12-06T22:31:28.721229Z"
256 | }
257 | },
258 | "outputs": [],
259 | "source": [
260 | "# load your plotly credentials\n",
261 | "load_dotenv()\n",
262 | "chart_studio.tools.set_credentials_file(username=os.getenv('PLOTLY_USERNAME'),\n",
263 | " api_key=os.getenv('PLOTLY_API_KEY'))"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 8,
269 | "metadata": {
270 | "ExecuteTime": {
271 | "end_time": "2019-12-06T22:31:44.229096Z",
272 | "start_time": "2019-12-06T22:31:28.757479Z"
273 | }
274 | },
275 | "outputs": [
276 | {
277 | "data": {
278 | "text/html": [
279 | "\n",
280 | " \n",
287 | " "
288 | ],
289 | "text/plain": [
290 | ""
291 | ]
292 | },
293 | "execution_count": 8,
294 | "metadata": {},
295 | "output_type": "execute_result"
296 | }
297 | ],
298 | "source": [
299 | "# convert the Spark DataFrame into a pandas DataFrame\n",
300 | "pdf = df2.toPandas()\n",
301 | "\n",
302 | "# calculates a linear least-squares regression using scipy\n",
303 | "xi = arange(0, len(pdf.index))\n",
304 | "slope, intercept, r_value, p_value, std_err = stats.linregress(\n",
305 | " xi, pdf['count'])\n",
306 | "line = slope * xi + intercept\n",
307 | "\n",
308 | "layout = dict(title='Bakery Sales',\n",
309 | " xaxis=dict(title='Month',\n",
310 | " showgrid=True,\n",
311 | " zeroline=True,\n",
312 | " showline=True,\n",
313 | " ticks='outside',\n",
314 | " tickangle=45,\n",
315 | " showticklabels=True),\n",
316 | " yaxis=dict(title='Items Sold/Day',\n",
317 | " showgrid=True,\n",
318 | " zeroline=True,\n",
319 | " showline=True,\n",
320 | " ticks='outside',\n",
321 | " showticklabels=True))\n",
322 | "\n",
323 | "trace1 = go.Bar(x=pdf['date'], y=pdf['count'], name='Items Sold')\n",
324 | "trace2 = go.Scatter(x=pdf['date'], y=line, mode='lines', name='Linear Fit')\n",
325 | "trace3 = go.Scatter(x=pdf['date'],\n",
326 | " y=signal.savgol_filter(pdf['count'], 53, 3),\n",
327 | " mode='lines',\n",
328 | " name='Savitzky-Golay')\n",
329 | "data = [trace1, trace2, trace3]\n",
330 | "fig = dict(data=data, layout=layout)\n",
331 | "py.iplot(fig, filename='jupyter-basic_bar.html')"
332 | ]
333 | }
334 | ],
335 | "metadata": {
336 | "kernelspec": {
337 | "display_name": "Python 3",
338 | "language": "python",
339 | "name": "python3"
340 | },
341 | "language_info": {
342 | "codemirror_mode": {
343 | "name": "ipython",
344 | "version": 3
345 | },
346 | "file_extension": ".py",
347 | "mimetype": "text/x-python",
348 | "name": "python",
349 | "nbconvert_exporter": "python",
350 | "pygments_lexer": "ipython3",
351 | "version": "3.7.3"
352 | },
353 | "pycharm": {
354 | "stem_cell": {
355 | "cell_type": "raw",
356 | "metadata": {
357 | "collapsed": false
358 | },
359 | "source": []
360 | }
361 | },
362 | "toc": {
363 | "base_numbering": 1,
364 | "nav_menu": {},
365 | "number_sections": false,
366 | "sideBar": true,
367 | "skip_h1_title": false,
368 | "title_cell": "Table of Contents",
369 | "title_sidebar": "Contents",
370 | "toc_cell": false,
371 | "toc_position": {
372 | "height": "calc(100% - 180px)",
373 | "left": "10px",
374 | "top": "150px",
375 | "width": "260px"
376 | },
377 | "toc_section_display": true,
378 | "toc_window_display": true
379 | }
380 | },
381 | "nbformat": 4,
382 | "nbformat_minor": 4
383 | }
384 |
--------------------------------------------------------------------------------
/work/bakery.sql:
--------------------------------------------------------------------------------
1 | -- Rebuilds the "transactions" table together with the sequence that supplies its "id" values.
2 | -- All objects are qualified with the "public" schema so the DROP and CREATE statements
3 | -- target the same objects regardless of the session's search_path.
4 |
5 | -- the table goes first: its "id" default references the sequence
6 | DROP TABLE IF EXISTS "public"."transactions";
7 | DROP SEQUENCE IF EXISTS "public"."transactions_id_seq";
8 | CREATE SEQUENCE "public"."transactions_id_seq" INCREMENT 1 MINVALUE 1 MAXVALUE 2147483647 START 1 CACHE 1;
9 |
10 | CREATE TABLE "public"."transactions"
11 | (
12 |     "id" integer DEFAULT nextval('public.transactions_id_seq') NOT NULL,
13 |     "date" character varying(10) NOT NULL,
14 |     "time" character varying(8) NOT NULL,
15 |     "transaction" integer NOT NULL,
16 |     "item" character varying(50) NOT NULL
17 | ) WITH (oids = false);
18 |
--------------------------------------------------------------------------------
/work/bootstrap_jupyter.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Prepares the Jupyter environment: upgrades system packages, installs htop,
4 | # installs the Python packages from requirements.txt, downloads the PostgreSQL
5 | # JDBC driver when it is not already present, and copies log4j.properties into
6 | # Spark's conf directory. Run it from the directory holding those two files.
7 |
8 | # -e: stop at the first failing command; -x: echo each command before running it
9 | set -ex
10 |
11 | # update/upgrade and install htop
12 | # every apt-get call passes -y so an unattended run never stops at a prompt
13 | sudo apt-get update -y && sudo apt-get upgrade -y
14 | sudo apt-get install -y htop
15 |
16 | # install required python packages
17 | python3 -m pip install --user --upgrade pip
18 | python3 -m pip install -r requirements.txt --upgrade
19 |
20 | # download the pinned postgres driver jar unless it is already present
21 | POSTGRES_JAR="postgresql-42.2.10.jar"
22 | if [ -f "$POSTGRES_JAR" ]; then
23 |     echo "$POSTGRES_JAR exist"
24 | else
25 |     wget -nv "https://jdbc.postgresql.org/download/${POSTGRES_JAR}"
26 | fi
27 |
28 | # spark-submit logging level from INFO to WARN
29 | sudo cp log4j.properties /usr/local/spark/conf/log4j.properties
30 |
--------------------------------------------------------------------------------
/work/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.rootCategory=WARN, console
2 | log4j.appender.console=org.apache.log4j.ConsoleAppender
3 | log4j.appender.console.target=System.err
4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
--------------------------------------------------------------------------------
/work/postgresql-42.2.10.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/garystafford/pyspark-setup-demo/2fa2143eaf32032de8c72cd921d790751848d315/work/postgresql-42.2.10.jar
--------------------------------------------------------------------------------
/work/requirements.txt:
--------------------------------------------------------------------------------
1 | psycopg2-binary
2 | bokeh
3 | plotly
4 | chart_studio
5 | numpy
6 | scipy
7 | python-dotenv
--------------------------------------------------------------------------------