├── .DS_Store
├── .markdownlint.json
├── .vscode
│   └── settings.json
├── 1_Data_Modelling
│   ├── exercises
│   │   ├── L1_Exercise_1_Creating_a_Table_with_Postgres.ipynb
│   │   ├── L1_Exercise_2_Creating_a_Table_with_Apache_Cassandra.ipynb
│   │   ├── L2_Exercise_1_Creating_Normalized_Tables.ipynb
│   │   ├── L2_Exercise_2_Creating_Denormalized_Tables.ipynb
│   │   ├── L2_Exercise_3_Dimension_Tables_with_Star_Schema.ipynb
│   │   ├── L3_Exercise_1_Three_Queries_Three_Tables.ipynb
│   │   ├── L3_Exercise_2_Primary_Key.ipynb
│   │   ├── L3_Exercise_3_Clustering_Column.ipynb
│   │   └── L3_Exercise_4_Using_the_WHERE_Clause.ipynb
│   ├── images
│   │   ├── basics_of_cassandra.png
│   │   ├── dimension_fact_tables.png
│   │   └── music_store_database_with_star_schema.png
│   ├── notes.md
│   └── project
│       ├── data_modelling_project.ipynb
│       ├── event_data
│       │   ├── 2018-11-01-events.csv
│       │   ├── 2018-11-02-events.csv
│       │   ├── 2018-11-03-events.csv
│       │   ├── 2018-11-04-events.csv
│       │   ├── 2018-11-05-events.csv
│       │   ├── 2018-11-06-events.csv
│       │   ├── 2018-11-07-events.csv
│       │   ├── 2018-11-08-events.csv
│       │   ├── 2018-11-09-events.csv
│       │   ├── 2018-11-10-events.csv
│       │   ├── 2018-11-11-events.csv
│       │   ├── 2018-11-12-events.csv
│       │   ├── 2018-11-13-events.csv
│       │   ├── 2018-11-14-events.csv
│       │   ├── 2018-11-15-events.csv
│       │   ├── 2018-11-16-events.csv
│       │   ├── 2018-11-17-events.csv
│       │   ├── 2018-11-18-events.csv
│       │   ├── 2018-11-19-events.csv
│       │   ├── 2018-11-20-events.csv
│       │   ├── 2018-11-21-events.csv
│       │   ├── 2018-11-22-events.csv
│       │   ├── 2018-11-23-events.csv
│       │   ├── 2018-11-24-events.csv
│       │   ├── 2018-11-25-events.csv
│       │   ├── 2018-11-26-events.csv
│       │   ├── 2018-11-27-events.csv
│       │   ├── 2018-11-28-events.csv
│       │   ├── 2018-11-29-events.csv
│       │   └── 2018-11-30-events.csv
│       ├── event_datafile_new.csv
│       └── images
│           └── image_event_datafile_new.jpg
├── 2_Cloud_Data_Warehouses
│   ├── .DS_Store
│   ├── exercises
│   │   ├── 10_Parallel_ETL.ipynb
│   │   ├── 11_Optimizing_Redshift_Table_Design.ipynb
│   │   ├── 1_Exploratory_Data_Analysis.ipynb
│   │   ├── 2_Dimensional_Modeling.ipynb
│   │   ├── 3_ETL.ipynb
│   │   ├── 4_Roll_up_and_Drill_Down.ipynb
│   │   ├── 5_Slicing_and_Dicing.ipynb
│   │   ├── 6_Grouping_Sets.ipynb
│   │   ├── 7_Cube.ipynb
│   │   ├── 8_OLAP_Cubes_all.ipynb
│   │   ├── 9_Infra_as_a_Code.ipynb
│   │   └── dwh.cfg
│   ├── images
│   │   ├── 1-introduction-to-cloud-data-warehouses.jpg
│   │   ├── 2-introduction-to-datawarehousing.png
│   │   ├── 3-DW_ETL_Design.png
│   │   ├── 4-Kimballs_Bus_Architecture.png
│   │   ├── 5-DWH_Tech_Perspective.png
│   │   ├── 6-pagila_star_schema.png
│   │   ├── 7_pagila-3nf.png
│   │   ├── all_distribution.png
│   │   ├── amazon_redshift.png
│   │   ├── dwh_etl.png
│   │   ├── even_distribution.png
│   │   ├── ingesting_with_manifest_example.png
│   │   ├── ingesting_with_prefix_example.png
│   │   ├── key_distribution.png
│   │   ├── olap_cube.png
│   │   ├── redshift_architecture.png
│   │   ├── redshift_etl.png
│   │   ├── redshift_etl_dataflow.png
│   │   ├── redshift_node_types.png
│   │   ├── sorting_and_dist_key_syntax.png
│   │   ├── sorting_key_distribution.png
│   │   └── tutorial-optimize-tables-ssb-data-model.png
│   ├── notes.md
│   └── project
│       ├── README.md
│       ├── create_tables.py
│       ├── dwh.cfg
│       ├── etl.py
│       ├── images
│       │   └── sparkify-s3-to-redshift-etl.png
│       ├── main.ipynb
│       └── sql_queries.py
├── 3_Spark_and_Data_Lakes
│   ├── .DS_Store
│   ├── exercises
│   │   ├── 1_mapreduce_practice.ipynb
│   │   ├── 2_rdd_song_lower_case.py
│   │   ├── 3_data_inputs_and_outputs.py
│   │   ├── 4_data_wrangling.py
│   │   ├── 5_data_wrangling_quiz.py
│   │   ├── 6_data_wrangling_with_spark_sql.py
│   │   ├── 7_accelerometer_landing_to_trusted.py
│   │   ├── 8_customer_urated.py
│   │   └── data
│   │       ├── songplays.txt
│   │       └── sparkify_log_small.json
│   ├── images
│   │   ├── Running_Spark_scripts_at_a_time_interval.png
│   │   ├── aws_glue_configuration.jpeg
│   │   ├── datalake_zones.png
│   │   ├── evolution_of_bigdata_ecosystem.png
│   │   ├── glue_job_diagram.png
│   │   ├── glue_job_using_s3_vpc_gateway.jpeg
│   │   ├── hadoop_to_data_lakehouse.png
│   │   ├── ingesting_and_organizing_data_in_a_lakehouse.jpeg
│   │   ├── spark_catalyst.png
│   │   ├── spark_dag.png
│   │   ├── spark_job_using_glue_studio.jpeg
│   │   ├── spark_modes.png
│   │   ├── spark_version_rdd_mapping.png
│   │   └── streaming_data.jpeg
│   ├── notes.md
│   └── project
│       ├── .DS_Store
│       ├── README.md
│       ├── images
│       │   ├── accelerometer_landing.png
│       │   ├── customer_landing.png
│       │   └── customer_trusted.png
│       └── scripts
│           ├── accelerometer_landing.sql
│           ├── accelerometer_landing_to_trusted.py
│           ├── customer_landing.sql
│           ├── customer_landing_to_trusted.py
│           ├── customer_trusted_to_curated.py
│           ├── step_trainer_landing_to_trusted.py
│           └── trainer_trusted_to_curated.py
├── 4_Automate_Data_Pipelines
│   ├── exercises
│   │   ├── .DS_Store
│   │   ├── airflow_dags.py
│   │   ├── airflow_official_tutorials
│   │   │   ├── reuse_tasks.py
│   │   │   ├── tutorial.py
│   │   │   ├── tutorial_dag.py
│   │   │   └── tutorial_taskflow_api.py
│   │   ├── build_full_dag.py
│   │   ├── connections_hooks.py
│   │   ├── context_templating.py
│   │   ├── convert_airflow1.py
│   │   ├── custom_operators.py
│   │   ├── custom_operators
│   │   │   ├── facts_calculator.py
│   │   │   ├── has_rows.py
│   │   │   └── s3_to_redshift.py
│   │   ├── data_lineage.py
│   │   ├── data_partitioning.py
│   │   ├── data_quality.py
│   │   ├── refactor_dag.py
│   │   ├── run_the_schedules.py
│   │   ├── s3_to_redshift.py
│   │   ├── schedule_backfills.py
│   │   ├── sql_statements.py
│   │   └── task_dependencies.py
│   ├── images
│   │   ├── airflow_aws_all_together.png
│   │   ├── airflow_component_diagram.jpeg
│   │   ├── airflow_data_lineage.jpeg
│   │   ├── airflow_instrumentation.png
│   │   ├── bikeshare_dag_example.png
│   │   ├── bikeshare_data_lineage.jpeg
│   │   ├── directed_acyclic_graph.png
│   │   ├── example_pipeline.png
│   │   ├── how-airflow-works.png
│   │   ├── project_dag_sample.png
│   │   └── scheduling_in_airflow.jpeg
│   ├── notes.md
│   └── project
│       ├── README.md
│       ├── dags
│       │   ├── common
│       │   │   ├── create_tables.sql
│       │   │   └── sql_statements.py
│       │   └── project_dag.py
│       ├── images
│       │   └── airflow_project_dag.png
│       └── plugins
│           └── operators
│               ├── data_quality.py
│               ├── load_dimension.py
│               ├── load_fact.py
│               └── stage_redshift.py
└── README.md
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/.DS_Store
--------------------------------------------------------------------------------
/.markdownlint.json:
--------------------------------------------------------------------------------
1 | {
2 | "MD033": {
3 | "allowed_elements": [
4 | "figure",
5 | "img",
6 | "hr"
7 | ]
8 | },
9 | "MD013": {
10 | "line_length": 500
11 | }
12 | }
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "cSpell.words": [
3 | "HDFS",
4 | "lakehouse"
5 | ]
6 | }
--------------------------------------------------------------------------------
/1_Data_Modelling/exercises/L1_Exercise_1_Creating_a_Table_with_Postgres.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Lesson 1 Exercise 1: Creating a Table with PostgreSQL\n",
8 | "\n",
9 | "
"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "### Walk through the basics of PostgreSQL. You will need to complete the following tasks:
Create a table in PostgreSQL, Insert rows of data Run a simple SQL query to validate the information.
\n",
17 | "`#####` denotes where the code needs to be completed. "
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "#### Import the library \n",
25 | "*Note:* An error might popup after this command has executed. If it does, read it carefully before ignoring. "
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 1,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "import psycopg2"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 2,
40 | "metadata": {},
41 | "outputs": [
42 | {
43 | "name": "stdout",
44 | "output_type": "stream",
45 | "text": [
46 | "ALTER ROLE\r\n"
47 | ]
48 | }
49 | ],
50 | "source": [
51 | "!echo \"alter user student createdb;\" | sudo -u postgres psql"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "### Create a connection to the database"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 3,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "try: \n",
68 | " conn = psycopg2.connect(\"host=127.0.0.1 dbname=studentdb user=student password=student\")\n",
69 | "except psycopg2.Error as e: \n",
70 | " print(\"Error: Could not make connection to the Postgres database\")\n",
71 | " print(e)"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "### Use the connection to get a cursor that can be used to execute queries."
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 4,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "try: \n",
88 | " cur = conn.cursor()\n",
89 | "except psycopg2.Error as e: \n",
90 | " print(\"Error: Could not get curser to the Database\")\n",
91 | " print(e)"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "### TO-DO: Set automatic commit to be true so that each action is committed without having to call conn.commit() after each command. "
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 5,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "# TO-DO: set automatic commit to be true\n",
108 | "conn.set_session(autocommit=True)"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "### TO-DO: Create a database to do the work in. "
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 6,
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "name": "stdout",
125 | "output_type": "stream",
126 | "text": [
127 | "database \"album\" already exists\n",
128 | "\n"
129 | ]
130 | }
131 | ],
132 | "source": [
133 | "## TO-DO: Add the database name within the CREATE DATABASE statement. You can choose your own db name.\n",
134 | "try: \n",
135 | " cur.execute(\"create database album\")\n",
136 | "except psycopg2.Error as e:\n",
137 | " print(e)"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "#### TO-DO: Add the database name in the connect statement. Let's close our connection to the default database, reconnect to the Udacity database, and get a new cursor."
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 7,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "## TO-DO: Add the database name within the connect statement\n",
154 | "try: \n",
155 | " conn.close()\n",
156 | "except psycopg2.Error as e:\n",
157 | " print(e)\n",
158 | " \n",
159 | "try: \n",
160 | " conn = psycopg2.connect(\"host=127.0.0.1 dbname=album user=student password=student\")\n",
161 | "except psycopg2.Error as e: \n",
162 | " print(\"Error: Could not make connection to the Postgres database\")\n",
163 | " print(e)\n",
164 | " \n",
165 | "try: \n",
166 | " cur = conn.cursor()\n",
167 | "except psycopg2.Error as e: \n",
168 | " print(\"Error: Could not get curser to the Database\")\n",
169 | " print(e)\n",
170 | "\n",
171 | "conn.set_session(autocommit=True)"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {},
177 | "source": [
178 | "### Create a Song Library that contains a list of songs, including the song name, artist name, year, album it was from, and if it was a single. \n",
179 | "\n",
180 | "`song_title\n",
181 | "artist_name\n",
182 | "year\n",
183 | "album_name\n",
184 | "single`\n"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 8,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "## TO-DO: Finish writing the CREATE TABLE statement with the correct arguments\n",
194 | "try: \n",
195 | " cur.execute(\"CREATE TABLE IF NOT EXISTS music_ibrary (song_title varchar, artist_name varchar, year int, album_name varchar, single boolean);\")\n",
196 | "except psycopg2.Error as e: \n",
197 | " print(\"Error: Issue creating table\")\n",
198 | " print (e)"
199 | ]
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "metadata": {},
204 | "source": [
205 | "### TO-DO: Insert the following two rows in the table\n",
206 | "`First Row: \"Across The Universe\", \"The Beatles\", \"1970\", \"Let It Be\", \"False\"`\n",
207 | "\n",
208 | "`Second Row: \"Think For Yourself\", \"The Beatles\", \"1965\", \"Rubber Soul\", \"False\"`"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 9,
214 | "metadata": {},
215 | "outputs": [],
216 | "source": [
217 | "## TO-DO: Finish the INSERT INTO statement with the correct arguments\n",
218 | "\n",
219 | "try: \n",
220 | " cur.execute(\"INSERT INTO music_ibrary (song_title, artist_name, year, album_name, single) \\\n",
221 | " VALUES (%s, %s, %s, %s, %s)\", \\\n",
222 | " (\"Across The Universe\", \"The Beatles\", \"1970\", \"Let It Be\", \"False\"))\n",
223 | "except psycopg2.Error as e: \n",
224 | " print(\"Error: Inserting Rows\")\n",
225 | " print (e)\n",
226 | " \n",
227 | "try: \n",
228 | " cur.execute(\"INSERT INTO music_ibrary (song_title, artist_name, year, album_name, single) \\\n",
229 | " VALUES (%s, %s, %s, %s, %s)\",\n",
230 | " (\"Think For Yourself\", \"The Beatles\", \"1965\", \"Rubber Soul\", \"False\"))\n",
231 | "except psycopg2.Error as e: \n",
232 | " print(\"Error: Inserting Rows\")\n",
233 | " print (e)"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "### TO-DO: Validate your data was inserted into the table. \n"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 10,
246 | "metadata": {},
247 | "outputs": [
248 | {
249 | "name": "stdout",
250 | "output_type": "stream",
251 | "text": [
252 | "('Across The Universe', 'The Beatles', 1970, 'Let It Be', False)\n",
253 | "('Think For Yourself', 'The Beatles', 1965, 'Rubber Soul', False)\n"
254 | ]
255 | }
256 | ],
257 | "source": [
258 | "## TO-DO: Finish the SELECT * Statement \n",
259 | "try: \n",
260 | " cur.execute(\"SELECT * FROM music_ibrary;\")\n",
261 | "except psycopg2.Error as e: \n",
262 | " print(\"Error: select *\")\n",
263 | " print (e)\n",
264 | "\n",
265 | "row = cur.fetchone()\n",
266 | "while row:\n",
267 | " print(row)\n",
268 | " row = cur.fetchone()"
269 | ]
270 | },
271 | {
272 | "cell_type": "markdown",
273 | "metadata": {},
274 | "source": [
275 | "### And finally close your cursor and connection. "
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 11,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "cur.close()\n",
285 | "conn.close()"
286 | ]
287 | }
288 | ],
289 | "metadata": {
290 | "kernelspec": {
291 | "display_name": "Python 3",
292 | "language": "python",
293 | "name": "python3"
294 | },
295 | "language_info": {
296 | "codemirror_mode": {
297 | "name": "ipython",
298 | "version": 3
299 | },
300 | "file_extension": ".py",
301 | "mimetype": "text/x-python",
302 | "name": "python",
303 | "nbconvert_exporter": "python",
304 | "pygments_lexer": "ipython3",
305 | "version": "3.9.6 (default, Oct 18 2022, 12:41:40) \n[Clang 14.0.0 (clang-1400.0.29.202)]"
306 | },
307 | "vscode": {
308 | "interpreter": {
309 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
310 | }
311 | }
312 | },
313 | "nbformat": 4,
314 | "nbformat_minor": 4
315 | }
316 |
--------------------------------------------------------------------------------
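
The notebook above follows the standard psycopg2 pattern: connect, switch on autocommit, create a table, insert with parameterized queries, then read the rows back. A condensed, hedged sketch of that same flow, assuming the local Postgres instance, student/student credentials, and the album database used in the exercise:

```python
# A condensed sketch of the flow above (psycopg2): connect, autocommit,
# create table, parameterized insert, select. Assumes the same local
# Postgres instance, student/student credentials, and album database.
import psycopg2

conn = psycopg2.connect("host=127.0.0.1 dbname=album user=student password=student")
conn.set_session(autocommit=True)   # every statement commits immediately
cur = conn.cursor()

cur.execute("""
    CREATE TABLE IF NOT EXISTS music_library (
        song_title  varchar,
        artist_name varchar,
        year        int,
        album_name  varchar,
        single      boolean
    );
""")

# %s placeholders let psycopg2 bind the values safely (no string formatting).
cur.execute(
    "INSERT INTO music_library (song_title, artist_name, year, album_name, single) "
    "VALUES (%s, %s, %s, %s, %s)",
    ("Across The Universe", "The Beatles", 1970, "Let It Be", False),
)

cur.execute("SELECT * FROM music_library;")
for row in cur.fetchall():
    print(row)

cur.close()
conn.close()
```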
/1_Data_Modelling/exercises/L1_Exercise_2_Creating_a_Table_with_Apache_Cassandra.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Lesson 1 Exercise 2: Creating a Table with Apache Cassandra\n",
8 | "
"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "### Walk through the basics of Apache Cassandra. Complete the following tasks: Create a table in Apache Cassandra, Insert rows of data, Run a simple SQL query to validate the information.
\n",
16 | "`#####` denotes where the code needs to be completed."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "#### Import Apache Cassandra python package"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 1,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "import cassandra"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "### Create a connection to the database"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 2,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "from cassandra.cluster import Cluster\n",
49 | "try: \n",
50 | " cluster = Cluster(['127.0.0.1']) #If you have a locally installed Apache Cassandra instance\n",
51 | " session = cluster.connect()\n",
52 | "except Exception as e:\n",
53 | " print(e)\n",
54 | " "
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "### TO-DO: Create a keyspace to do the work in "
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 3,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "## TO-DO: Create the keyspace\n",
71 | "try:\n",
72 | " session.execute(\"\"\"\n",
73 | " CREATE KEYSPACE IF NOT EXISTS music \n",
74 | " WITH REPLICATION = \n",
75 | " { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }\"\"\"\n",
76 | ")\n",
77 | "\n",
78 | "except Exception as e:\n",
79 | " print(e)"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "### TO-DO: Connect to the Keyspace"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 4,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "## To-Do: Add in the keyspace you created\n",
96 | "try:\n",
97 | " session.set_keyspace('music')\n",
98 | "except Exception as e:\n",
99 | " print(e)"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "### Create a Song Library that contains a list of songs, including the song name, artist name, year, album it was from, and if it was a single. \n",
107 | "\n",
108 | "`song_title\n",
109 | "artist_name\n",
110 | "year\n",
111 | "album_name\n",
112 | "single`"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "### TO-DO: You need to create a table to be able to run the following query: \n",
120 | "`select * from songs WHERE year=1970 AND artist_name=\"The Beatles\"`"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 5,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "## TO-DO: Complete the query below\n",
130 | "query = \"CREATE TABLE IF NOT EXISTS music_library\"\n",
131 | "query = query + \"(year int, song_title text, artist_name text, album_name text, single boolean, PRIMARY KEY (year, artist_name))\"\n",
132 | "try:\n",
133 | " session.execute(query)\n",
134 | "except Exception as e:\n",
135 | " print(e)\n"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "### TO-DO: Insert the following two rows in your table\n",
143 | "`First Row: \"1970\", \"Let It Be\", \"The Beatles\", \"Across The Universe\", \"False\", `\n",
144 | "\n",
145 | "`Second Row: \"1965\", \"Think For Yourself\", \"The Beatles\", \"Rubber Soul\", \"False\"`"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 6,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "## Add in query and then run the insert statement\n",
155 | "query = \"INSERT INTO music_library(year, song_title, artist_name, album_name, single)\" \n",
156 | "query = query + \" VALUES (%s, %s, %s, %s, %s)\"\n",
157 | "\n",
158 | "try:\n",
159 | " session.execute(query, (1970, \"Let It Be\", \"The Beatles\", \"Across The Universe\", False))\n",
160 | "except Exception as e:\n",
161 | " print(e)\n",
162 | " \n",
163 | "try:\n",
164 | " session.execute(query, (1965, \"Think For Yourself\", \"The Beatles\", \"Rubber Soul\", False))\n",
165 | "except Exception as e:\n",
166 | " print(e)"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "### TO-DO: Validate your data was inserted into the table."
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 7,
179 | "metadata": {
180 | "scrolled": true
181 | },
182 | "outputs": [
183 | {
184 | "name": "stdout",
185 | "output_type": "stream",
186 | "text": [
187 | "1965 Rubber Soul The Beatles\n",
188 | "1970 Across The Universe The Beatles\n"
189 | ]
190 | }
191 | ],
192 | "source": [
193 | "## TO-DO: Complete and then run the select statement to validate the data was inserted into the table\n",
194 | "query = 'SELECT * FROM music_library'\n",
195 | "try:\n",
196 | " rows = session.execute(query)\n",
197 | "except Exception as e:\n",
198 | " print(e)\n",
199 | " \n",
200 | "for row in rows:\n",
201 | " print (row.year, row.album_name, row.artist_name)"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "### TO-DO: Validate the Data Model with the original query.\n",
209 | "\n",
210 | "`select * from songs WHERE YEAR=1970 AND artist_name=\"The Beatles\"`"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 10,
216 | "metadata": {},
217 | "outputs": [
218 | {
219 | "name": "stdout",
220 | "output_type": "stream",
221 | "text": [
222 | "1970 Across The Universe The Beatles\n"
223 | ]
224 | }
225 | ],
226 | "source": [
227 | "##TO-DO: Complete the select statement to run the query \n",
228 | "query = \"select * from music_library WHERE year=1970 AND artist_name='The Beatles'\"\n",
229 | "try:\n",
230 | " rows = session.execute(query)\n",
231 | "except Exception as e:\n",
232 | " print(e)\n",
233 | " \n",
234 | "for row in rows:\n",
235 | " print (row.year, row.album_name, row.artist_name)"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "### And Finally close the session and cluster connection"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 11,
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "session.shutdown()\n",
252 | "cluster.shutdown()"
253 | ]
254 | }
255 | ],
256 | "metadata": {
257 | "kernelspec": {
258 | "display_name": "Python 3",
259 | "language": "python",
260 | "name": "python3"
261 | },
262 | "language_info": {
263 | "codemirror_mode": {
264 | "name": "ipython",
265 | "version": 3
266 | },
267 | "file_extension": ".py",
268 | "mimetype": "text/x-python",
269 | "name": "python",
270 | "nbconvert_exporter": "python",
271 | "pygments_lexer": "ipython3",
272 | "version": "3.9.6 (default, Oct 18 2022, 12:41:40) \n[Clang 14.0.0 (clang-1400.0.29.202)]"
273 | },
274 | "vscode": {
275 | "interpreter": {
276 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
277 | }
278 | }
279 | },
280 | "nbformat": 4,
281 | "nbformat_minor": 4
282 | }
283 |
--------------------------------------------------------------------------------
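
The exercise above binds values with %s placeholders on plain CQL strings. A small variant, not part of the exercise, is to use a prepared statement, which the driver parses once and then binds with ? markers. A sketch under the same assumptions as the notebook (local node, music keyspace, and the music_library table created above):

```python
# Variant of the insert step above using a prepared statement (not part of
# the exercise). Assumes a local Cassandra node plus the music keyspace and
# music_library table created in the notebook.
from cassandra.cluster import Cluster

cluster = Cluster(['127.0.0.1'])
session = cluster.connect('music')

# The driver parses the CQL once; values are then bound to the ? markers.
insert = session.prepare(
    "INSERT INTO music_library (year, song_title, artist_name, album_name, single) "
    "VALUES (?, ?, ?, ?, ?)"
)
session.execute(insert, (1965, "Think For Yourself", "The Beatles", "Rubber Soul", False))

# Reads must restrict the partition key (year) before any clustering column.
rows = session.execute(
    "SELECT * FROM music_library WHERE year = 1965 AND artist_name = 'The Beatles'"
)
for row in rows:
    print(row.year, row.album_name, row.artist_name)

session.shutdown()
cluster.shutdown()
```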
/1_Data_Modelling/exercises/L3_Exercise_2_Primary_Key.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Lesson 3 Exercise 2: Focus on Primary Key\n",
8 | "
"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "### Walk through the basics of creating a table with a good Primary Key in Apache Cassandra, inserting rows of data, and doing a simple CQL query to validate the information. \n",
16 | "\n",
17 | "### Replace ##### with your own answers. "
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "#### We will use a python wrapper/ python driver called cassandra to run the Apache Cassandra queries. This library should be preinstalled but in the future to install this library you can run this command in a notebook to install locally: \n",
25 | "! pip install cassandra-driver\n",
26 | "#### More documentation can be found here: https://datastax.github.io/python-driver/"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "#### Import Apache Cassandra python package"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 1,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import cassandra"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "### Create a connection to the database"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 2,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "from cassandra.cluster import Cluster\n",
59 | "try: \n",
60 | " cluster = Cluster(['127.0.0.1']) #If you have a locally installed Apache Cassandra instance\n",
61 | " session = cluster.connect()\n",
62 | "except Exception as e:\n",
63 | " print(e)"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "### Create a keyspace to work in "
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 3,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "try:\n",
80 | " session.execute(\"\"\"\n",
81 | " CREATE KEYSPACE IF NOT EXISTS udacity \n",
82 | " WITH REPLICATION = \n",
83 | " { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }\"\"\"\n",
84 | ")\n",
85 | "\n",
86 | "except Exception as e:\n",
87 | " print(e)"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "#### Connect to the Keyspace. Compare this to how we had to create a new session in PostgreSQL. "
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 4,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "try:\n",
104 | " session.set_keyspace('udacity')\n",
105 | "except Exception as e:\n",
106 | " print(e)"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "### Imagine you need to create a new Music Library of albums \n",
114 | "\n",
115 | "### Here is the information asked of the data:\n",
116 | "#### 1. Give every album in the music library that was created by a given artist\n",
117 | "`select * from music_library WHERE artist_name=\"The Beatles\"`\n"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "### Here is the collection of data\n",
125 | "
"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "#### Practice by making the PRIMARY KEY only 1 Column (not 2 or more)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 5,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "query = \"CREATE TABLE IF NOT EXISTS music_library \"\n",
142 | "query = query + \"(year int, city text, artist_name text, album_name text, PRIMARY KEY(artist_name))\"\n",
143 | "try:\n",
144 | " session.execute(query)\n",
145 | "except Exception as e:\n",
146 | " print(e)"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "### Let's insert the data into the table"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 6,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "query = \"INSERT INTO music_library (year, artist_name, album_name, city)\"\n",
163 | "query = query + \" VALUES (%s, %s, %s, %s)\"\n",
164 | "\n",
165 | "try:\n",
166 | " session.execute(query, (1970, \"The Beatles\", \"Let it Be\", \"Liverpool\"))\n",
167 | "except Exception as e:\n",
168 | " print(e)\n",
169 | " \n",
170 | "try:\n",
171 | " session.execute(query, (1965, \"The Beatles\", \"Rubber Soul\", \"Oxford\"))\n",
172 | "except Exception as e:\n",
173 | " print(e)\n",
174 | " \n",
175 | "try:\n",
176 | " session.execute(query, (1965, \"The Who\", \"My Generation\", \"London\"))\n",
177 | "except Exception as e:\n",
178 | " print(e)\n",
179 | "\n",
180 | "try:\n",
181 | " session.execute(query, (1966, \"The Monkees\", \"The Monkees\", \"Los Angeles\"))\n",
182 | "except Exception as e:\n",
183 | " print(e)\n",
184 | "\n",
185 | "try:\n",
186 | " session.execute(query, (1970, \"The Carpenters\", \"Close To You\", \"San Diego\"))\n",
187 | "except Exception as e:\n",
188 | " print(e)"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "### Validate the Data Model -- Does it give you two rows?"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 8,
201 | "metadata": {},
202 | "outputs": [
203 | {
204 | "name": "stdout",
205 | "output_type": "stream",
206 | "text": [
207 | "1965 The Beatles Rubber Soul Oxford\n"
208 | ]
209 | }
210 | ],
211 | "source": [
212 | "query = \"SELECT * FROM music_library WHERE artist_name='The Beatles'\"\n",
213 | "try:\n",
214 | " rows = session.execute(query)\n",
215 | "except Exception as e:\n",
216 | " print(e)\n",
217 | " \n",
218 | "for row in rows:\n",
219 | " print (row.year, row.artist_name, row.album_name, row.city)"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "### If you used just one column as your PRIMARY KEY, your output should be:\n",
227 | "1965 The Beatles Rubber Soul Oxford\n",
228 | "\n",
229 | "\n",
230 | "### That didn't work out as planned! Why is that? Did you create a unique primary key?"
231 | ]
232 | },
233 | {
234 | "cell_type": "markdown",
235 | "metadata": {},
236 | "source": [
237 | "### Try again - Create a new table with a composite key this time"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 9,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "query = \"CREATE TABLE IF NOT EXISTS music_library_new \"\n",
247 | "query = query + \"(year int, city text, artist_name text, album_name text, PRIMARY KEY (artist_name, year))\"\n",
248 | "try:\n",
249 | " session.execute(query)\n",
250 | "except Exception as e:\n",
251 | " print(e)"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 10,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "## You can opt to change the sequence of columns to match your composite key. \\ \n",
261 | "## Make sure to match the values in the INSERT statement\n",
262 | "\n",
263 | "query = \"INSERT INTO music_library_new (year, artist_name, album_name, city)\"\n",
264 | "query = query + \" VALUES (%s, %s, %s, %s)\"\n",
265 | "\n",
266 | "try:\n",
267 | " session.execute(query, (1970, \"The Beatles\", \"Let it Be\", \"Liverpool\"))\n",
268 | "except Exception as e:\n",
269 | " print(e)\n",
270 | " \n",
271 | "try:\n",
272 | " session.execute(query, (1965, \"The Beatles\", \"Rubber Soul\", \"Oxford\"))\n",
273 | "except Exception as e:\n",
274 | " print(e)\n",
275 | " \n",
276 | "try:\n",
277 | " session.execute(query, (1965, \"The Who\", \"My Generation\", \"London\"))\n",
278 | "except Exception as e:\n",
279 | " print(e)\n",
280 | "\n",
281 | "try:\n",
282 | " session.execute(query, (1966, \"The Monkees\", \"The Monkees\", \"Los Angeles\"))\n",
283 | "except Exception as e:\n",
284 | " print(e)\n",
285 | "\n",
286 | "try:\n",
287 | " session.execute(query, (1970, \"The Carpenters\", \"Close To You\", \"San Diego\"))\n",
288 | "except Exception as e:\n",
289 | " print(e)"
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "metadata": {},
295 | "source": [
296 | "### Validate the Data Model -- Did it work?"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 11,
302 | "metadata": {},
303 | "outputs": [
304 | {
305 | "name": "stdout",
306 | "output_type": "stream",
307 | "text": [
308 | "1965 The Beatles Rubber Soul Oxford\n",
309 | "1970 The Beatles Let it Be Liverpool\n"
310 | ]
311 | }
312 | ],
313 | "source": [
314 | "query = \"SELECT * FROM music_library_new WHERE artist_name='The Beatles'\"\n",
315 | "try:\n",
316 | " rows = session.execute(query)\n",
317 | "except Exception as e:\n",
318 | " print(e)\n",
319 | " \n",
320 | "for row in rows:\n",
321 | " print (row.year, row.artist_name, row.album_name, row.city)"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "metadata": {},
327 | "source": [
328 | "### Your output should be:\n",
329 | "1970 The Beatles Let it Be Liverpool
\n",
330 | "1965 The Beatles Rubber Soul Oxford"
331 | ]
332 | },
333 | {
334 | "cell_type": "markdown",
335 | "metadata": {},
336 | "source": [
337 | "### Drop the tables"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 12,
343 | "metadata": {},
344 | "outputs": [],
345 | "source": [
346 | "query = \"drop table music_library\"\n",
347 | "try:\n",
348 | " rows = session.execute(query)\n",
349 | "except Exception as e:\n",
350 | " print(e)\n",
351 | "\n",
352 | "query = \"drop table music_library_new\"\n",
353 | "try:\n",
354 | " rows = session.execute(query)\n",
355 | "except Exception as e:\n",
356 | " print(e)"
357 | ]
358 | },
359 | {
360 | "cell_type": "markdown",
361 | "metadata": {},
362 | "source": [
363 | "### Close the session and cluster connection"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": 13,
369 | "metadata": {},
370 | "outputs": [],
371 | "source": [
372 | "session.shutdown()\n",
373 | "cluster.shutdown()"
374 | ]
375 | }
376 | ],
377 | "metadata": {
378 | "kernelspec": {
379 | "display_name": "Python 3",
380 | "language": "python",
381 | "name": "python3"
382 | },
383 | "language_info": {
384 | "codemirror_mode": {
385 | "name": "ipython",
386 | "version": 3
387 | },
388 | "file_extension": ".py",
389 | "mimetype": "text/x-python",
390 | "name": "python",
391 | "nbconvert_exporter": "python",
392 | "pygments_lexer": "ipython3",
393 | "version": "3.11.1 (main, Dec 23 2022, 09:39:26) [Clang 14.0.0 (clang-1400.0.29.202)]"
394 | },
395 | "vscode": {
396 | "interpreter": {
397 | "hash": "1a1af0ee75eeea9e2e1ee996c87e7a2b11a0bebd85af04bb136d915cefc0abce"
398 | }
399 | }
400 | },
401 | "nbformat": 4,
402 | "nbformat_minor": 2
403 | }
404 |
--------------------------------------------------------------------------------
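
The behaviour this exercise demonstrates comes from the fact that Cassandra writes are upserts: with artist_name alone as the primary key, the 1965 insert silently replaces the 1970 row. A minimal sketch of that effect, assuming the same local node and udacity keyspace; the upsert_demo table name is made up for illustration:

```python
# Why the single-column key lost a row: Cassandra INSERTs are upserts, so two
# writes with the same primary key collapse into one. Assumes a local node and
# the udacity keyspace; the upsert_demo table name is made up for illustration.
from cassandra.cluster import Cluster

cluster = Cluster(['127.0.0.1'])
session = cluster.connect('udacity')

session.execute(
    "CREATE TABLE IF NOT EXISTS upsert_demo "
    "(artist_name text, year int, album_name text, PRIMARY KEY (artist_name))"
)

insert = "INSERT INTO upsert_demo (artist_name, year, album_name) VALUES (%s, %s, %s)"
session.execute(insert, ("The Beatles", 1970, "Let it Be"))
session.execute(insert, ("The Beatles", 1965, "Rubber Soul"))  # same key: overwrites the 1970 row

rows = session.execute("SELECT * FROM upsert_demo WHERE artist_name = 'The Beatles'")
print(list(rows))   # one row remains, holding the last write

session.execute("DROP TABLE upsert_demo")
session.shutdown()
cluster.shutdown()
```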
/1_Data_Modelling/exercises/L3_Exercise_3_Clustering_Column.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Lesson 3 Exercise 3: Focus on Clustering Columns\n",
8 | "
"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "### Walk through the basics of creating a table with a good Primary Key and Clustering Columns in Apache Cassandra, inserting rows of data, and doing a simple CQL query to validate the information. \n",
16 | "\n",
17 | "### Remember, replace ##### with your own code."
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "#### We will use a python wrapper/ python driver called cassandra to run the Apache Cassandra queries. This library should be preinstalled but in the future to install this library you can run this command in a notebook to install locally: \n",
25 | "! pip install cassandra-driver\n",
26 | "#### More documentation can be found here: https://datastax.github.io/python-driver/"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "#### Import Apache Cassandra python package"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 1,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import cassandra"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "### Create a connection to the database"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 2,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "from cassandra.cluster import Cluster\n",
59 | "try: \n",
60 | " cluster = Cluster(['127.0.0.1']) #If you have a locally installed Apache Cassandra instance\n",
61 | " session = cluster.connect()\n",
62 | "except Exception as e:\n",
63 | " print(e)"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "### Create a keyspace to work in "
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 3,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "try:\n",
80 | " session.execute(\"\"\"\n",
81 | " CREATE KEYSPACE IF NOT EXISTS udacity \n",
82 | " WITH REPLICATION = \n",
83 | " { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }\"\"\"\n",
84 | ")\n",
85 | "\n",
86 | "except Exception as e:\n",
87 | " print(e)"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "#### Connect to the Keyspace. Compare this to how we had to create a new session in PostgreSQL. "
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 4,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "try:\n",
104 | " session.set_keyspace('udacity')\n",
105 | "except Exception as e:\n",
106 | " print(e)"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "### Imagine we would like to start creating a new Music Library of albums. \n",
114 | "\n",
115 | "### We want to ask 1 question of our data:\n",
116 | "### 1. Give me all the information from the music library about a given album\n",
117 | "`select * from album_library WHERE album_name=\"Close To You\"`"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "### Here is the data:\n",
125 | "
"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "### How should we model this data? What should be our Primary Key and Partition Key? "
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 5,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "query = \"CREATE TABLE IF NOT EXISTS album_library \"\n",
142 | "query = query + \"(year int, artist_name text, album_name text, city text, PRIMARY KEY(album_name, artist_name))\"\n",
143 | "try:\n",
144 | " session.execute(query)\n",
145 | "except Exception as e:\n",
146 | " print(e)"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "### Insert data into the table"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 6,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "## You can opt to change the sequence of columns to match your composite key. \\ \n",
163 | "## If you do, make sure to match the values in the INSERT statement\n",
164 | "\n",
165 | "query = \"INSERT INTO album_library (year, artist_name, album_name, city)\"\n",
166 | "query = query + \" VALUES (%s, %s, %s, %s)\"\n",
167 | "\n",
168 | "try:\n",
169 | " session.execute(query, (1970, \"The Beatles\", \"Let it Be\", \"Liverpool\"))\n",
170 | "except Exception as e:\n",
171 | " print(e)\n",
172 | " \n",
173 | "try:\n",
174 | " session.execute(query, (1965, \"The Beatles\", \"Rubber Soul\", \"Oxford\"))\n",
175 | "except Exception as e:\n",
176 | " print(e)\n",
177 | " \n",
178 | "try:\n",
179 | " session.execute(query, (1964, \"The Beatles\", \"Beatles For Sale\", \"London\"))\n",
180 | "except Exception as e:\n",
181 | " print(e)\n",
182 | "\n",
183 | "try:\n",
184 | " session.execute(query, (1966, \"The Monkees\", \"The Monkees\", \"Los Angeles\"))\n",
185 | "except Exception as e:\n",
186 | " print(e)\n",
187 | "\n",
188 | "try:\n",
189 | " session.execute(query, (1970, \"The Carpenters\", \"Close To You\", \"San Diego\"))\n",
190 | "except Exception as e:\n",
191 | " print(e)"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {},
197 | "source": [
198 | "### Validate the Data Model -- Did it work? \n",
199 | "`select * from album_library WHERE album_name=\"Close To You\"`"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 7,
205 | "metadata": {},
206 | "outputs": [
207 | {
208 | "name": "stdout",
209 | "output_type": "stream",
210 | "text": [
211 | "The Carpenters Close To You San Diego 1970\n"
212 | ]
213 | }
214 | ],
215 | "source": [
216 | "query = \"select * from album_library WHERE album_name='Close To You'\"\n",
217 | "try:\n",
218 | " rows = session.execute(query)\n",
219 | "except Exception as e:\n",
220 | " print(e)\n",
221 | " \n",
222 | "for row in rows:\n",
223 | " print (row.artist_name, row.album_name, row.city, row.year)"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "### Your output should be:\n",
231 | "('The Carpenters', 'Close to You', 'San Diego', 1970)\n",
232 | "\n",
233 | "### OR\n",
234 | "('The Carpenters', 'Close to You', 1970, 'San Diego') "
235 | ]
236 | },
237 | {
238 | "cell_type": "markdown",
239 | "metadata": {},
240 | "source": [
241 | "### Drop the table"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 8,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "query = \"drop table album_library\"\n",
251 | "try:\n",
252 | " rows = session.execute(query)\n",
253 | "except Exception as e:\n",
254 | " print(e)\n"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {},
260 | "source": [
261 | "### Close the session and cluster connection"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 9,
267 | "metadata": {},
268 | "outputs": [],
269 | "source": [
270 | "session.shutdown()\n",
271 | "cluster.shutdown()"
272 | ]
273 | }
274 | ],
275 | "metadata": {
276 | "kernelspec": {
277 | "display_name": "Python 3",
278 | "language": "python",
279 | "name": "python3"
280 | },
281 | "language_info": {
282 | "codemirror_mode": {
283 | "name": "ipython",
284 | "version": 3
285 | },
286 | "file_extension": ".py",
287 | "mimetype": "text/x-python",
288 | "name": "python",
289 | "nbconvert_exporter": "python",
290 | "pygments_lexer": "ipython3",
291 | "version": "3.6.3"
292 | }
293 | },
294 | "nbformat": 4,
295 | "nbformat_minor": 2
296 | }
297 |
--------------------------------------------------------------------------------
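
A related point the exercise relies on but does not show: a clustering column also fixes the sort order of rows inside a partition, and that order can be set explicitly with CLUSTERING ORDER BY. A sketch under the same local-node and udacity-keyspace assumptions; album_library_ordered is an illustrative table name, not part of the exercise:

```python
# The clustering column also sets the sort order of rows inside a partition;
# CLUSTERING ORDER BY makes that order explicit. Assumes a local node and the
# udacity keyspace; album_library_ordered is an illustrative table name.
from cassandra.cluster import Cluster

cluster = Cluster(['127.0.0.1'])
session = cluster.connect('udacity')

session.execute("""
    CREATE TABLE IF NOT EXISTS album_library_ordered (
        album_name text,
        artist_name text,
        year int,
        city text,
        PRIMARY KEY (album_name, artist_name)
    ) WITH CLUSTERING ORDER BY (artist_name DESC)
""")

insert = ("INSERT INTO album_library_ordered (album_name, artist_name, year, city) "
          "VALUES (%s, %s, %s, %s)")
session.execute(insert, ("Close To You", "The Carpenters", 1970, "San Diego"))

# The partition is still addressed by album_name; rows within it come back
# sorted by artist_name, descending.
for row in session.execute(
        "SELECT * FROM album_library_ordered WHERE album_name = 'Close To You'"):
    print(row.artist_name, row.album_name, row.city, row.year)

session.execute("DROP TABLE album_library_ordered")
session.shutdown()
cluster.shutdown()
```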
/1_Data_Modelling/exercises/L3_Exercise_4_Using_the_WHERE_Clause.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Lesson 3 Demo 4: Using the WHERE Clause\n",
8 | "
"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "### In this exercise we are going to walk through the basics of using the WHERE clause in Apache Cassandra.\n",
16 | "\n",
17 | "##### denotes where the code needs to be completed."
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "#### We will use a python wrapper/ python driver called cassandra to run the Apache Cassandra queries. This library should be preinstalled but in the future to install this library you can run this command in a notebook to install locally: \n",
25 | "! pip install cassandra-driver\n",
26 | "#### More documentation can be found here: https://datastax.github.io/python-driver/"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "#### Import Apache Cassandra python package"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 1,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import cassandra"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "### First let's create a connection to the database"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 2,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "from cassandra.cluster import Cluster\n",
59 | "try: \n",
60 | " cluster = Cluster(['127.0.0.1']) #If you have a locally installed Apache Cassandra instance\n",
61 | " session = cluster.connect()\n",
62 | "except Exception as e:\n",
63 | " print(e)"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "### Let's create a keyspace to do our work in "
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 3,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "try:\n",
80 | " session.execute(\"\"\"\n",
81 | " CREATE KEYSPACE IF NOT EXISTS udacity \n",
82 | " WITH REPLICATION = \n",
83 | " { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }\"\"\"\n",
84 | ")\n",
85 | "\n",
86 | "except Exception as e:\n",
87 | " print(e)"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "#### Connect to our Keyspace. Compare this to how we had to create a new session in PostgreSQL. "
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 4,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "try:\n",
104 | " session.set_keyspace('udacity')\n",
105 | "except Exception as e:\n",
106 | " print(e)"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "### Let's imagine we would like to start creating a new Music Library of albums. \n",
114 | "### We want to ask 4 question of our data\n",
115 | "#### 1. Give me every album in my music library that was released in a 1965 year\n",
116 | "#### 2. Give me the album that is in my music library that was released in 1965 by \"The Beatles\"\n",
117 | "#### 3. Give me all the albums released in a given year that was made in London \n",
118 | "#### 4. Give me the city that the album \"Rubber Soul\" was recorded"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "### Here is our Collection of Data\n",
126 | "
"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "### How should we model this data? What should be our Primary Key and Partition Key? Since our data is looking for the YEAR let's start with that. From there we will add clustering columns on Artist Name and Album Name."
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 5,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "query = \"CREATE TABLE IF NOT EXISTS music_library \"\n",
143 | "query = query + \"(year int, artist_name text, album_name text, city text, PRIMARY KEY (year, artist_name, album_name))\"\n",
144 | "try:\n",
145 | " session.execute(query)\n",
146 | "except Exception as e:\n",
147 | " print(e)"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 | "### Let's insert our data into of table"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 6,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "query = \"INSERT INTO music_library (year, artist_name, album_name, city)\"\n",
164 | "query = query + \" VALUES (%s, %s, %s, %s)\"\n",
165 | "\n",
166 | "try:\n",
167 | " session.execute(query, (1970, \"The Beatles\", \"Let it Be\", \"Liverpool\"))\n",
168 | "except Exception as e:\n",
169 | " print(e)\n",
170 | " \n",
171 | "try:\n",
172 | " session.execute(query, (1965, \"The Beatles\", \"Rubber Soul\", \"Oxford\"))\n",
173 | "except Exception as e:\n",
174 | " print(e)\n",
175 | " \n",
176 | "try:\n",
177 | " session.execute(query, (1965, \"The Who\", \"My Generation\", \"London\"))\n",
178 | "except Exception as e:\n",
179 | " print(e)\n",
180 | "\n",
181 | "try:\n",
182 | " session.execute(query, (1966, \"The Monkees\", \"The Monkees\", \"Los Angeles\"))\n",
183 | "except Exception as e:\n",
184 | " print(e)\n",
185 | "\n",
186 | "try:\n",
187 | " session.execute(query, (1970, \"The Carpenters\", \"Close To You\", \"San Diego\"))\n",
188 | "except Exception as e:\n",
189 | " print(e)"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "### Let's Validate our Data Model with our 4 queries.\n",
197 | "\n",
198 | "Query 1: "
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 7,
204 | "metadata": {},
205 | "outputs": [
206 | {
207 | "name": "stdout",
208 | "output_type": "stream",
209 | "text": [
210 | "1965 The Beatles Rubber Soul Oxford\n",
211 | "1965 The Who My Generation London\n"
212 | ]
213 | }
214 | ],
215 | "source": [
216 | "query = \"SELECT * from music_library WHERE year = 1965\"\n",
217 | "try:\n",
218 | " rows = session.execute(query)\n",
219 | "except Exception as e:\n",
220 | " print(e)\n",
221 | " \n",
222 | "for row in rows:\n",
223 | " print (row.year, row.artist_name, row.album_name, row.city)"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | " Let's try the 2nd query.\n",
231 | " Query 2: "
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 9,
237 | "metadata": {},
238 | "outputs": [
239 | {
240 | "name": "stdout",
241 | "output_type": "stream",
242 | "text": [
243 | "1965 The Beatles Rubber Soul Oxford\n"
244 | ]
245 | }
246 | ],
247 | "source": [
248 | "query = \"SELECT * from music_library WHERE year = 1965 AND artist_name = 'The Beatles'\"\n",
249 | "try:\n",
250 | " rows = session.execute(query)\n",
251 | "except Exception as e:\n",
252 | " print(e)\n",
253 | " \n",
254 | "for row in rows:\n",
255 | " print (row.year, row.artist_name, row.album_name, row.city)"
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "metadata": {},
261 | "source": [
262 | "### Let's try the 3rd query.\n",
263 | "Query 3: "
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 10,
269 | "metadata": {},
270 | "outputs": [
271 | {
272 | "name": "stdout",
273 | "output_type": "stream",
274 | "text": [
275 | "Error from server: code=2200 [Invalid query] message=\"Cannot execute this query as it might involve data filtering and thus may have unpredictable performance. If you want to execute this query despite the performance unpredictability, use ALLOW FILTERING\"\n"
276 | ]
277 | }
278 | ],
279 | "source": [
280 | "query = \"SELECT * from music_library WHERE year = 1965 AND city = 'London'\"\n",
281 | "try:\n",
282 | " rows = session.execute(query)\n",
283 | "except Exception as e:\n",
284 | " print(e)\n",
285 | " \n",
286 | "for row in rows:\n",
287 | " print (row.year, row.artist_name, row.album_name, row.city)"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "metadata": {},
293 | "source": [
294 | "### Did you get an error? You can not try to access a column or a clustering column if you have not used the other defined clustering column. Let's see if we can try it a different way. \n",
295 | "Try Query 4: \n",
296 | "\n"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 11,
302 | "metadata": {},
303 | "outputs": [
304 | {
305 | "name": "stdout",
306 | "output_type": "stream",
307 | "text": [
308 | "Oxford\n"
309 | ]
310 | }
311 | ],
312 | "source": [
313 | "query = \"SELECT city FROM music_library WHERE year = 1965 AND artist_name = 'The Beatles' AND album_name = 'Rubber Soul'\"\n",
314 | "try:\n",
315 | " rows = session.execute(query)\n",
316 | "except Exception as e:\n",
317 | " print(e)\n",
318 | " \n",
319 | "for row in rows:\n",
320 | " print (row.city)"
321 | ]
322 | },
323 | {
324 | "cell_type": "markdown",
325 | "metadata": {},
326 | "source": [
327 | "### And Finally close the session and cluster connection"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 12,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "session.shutdown()\n",
337 | "cluster.shutdown()"
338 | ]
339 | }
340 | ],
341 | "metadata": {
342 | "kernelspec": {
343 | "display_name": "Python 3",
344 | "language": "python",
345 | "name": "python3"
346 | },
347 | "language_info": {
348 | "codemirror_mode": {
349 | "name": "ipython",
350 | "version": 3
351 | },
352 | "file_extension": ".py",
353 | "mimetype": "text/x-python",
354 | "name": "python",
355 | "nbconvert_exporter": "python",
356 | "pygments_lexer": "ipython3",
357 | "version": "3.6.3"
358 | }
359 | },
360 | "nbformat": 4,
361 | "nbformat_minor": 2
362 | }
363 |
--------------------------------------------------------------------------------
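
Query 3 above failed because city is a regular column, so Cassandra would have to scan and filter. The usual fix, not shown in the exercise, is a second table whose partition key matches the query rather than falling back to ALLOW FILTERING. A sketch assuming the same local node and udacity keyspace; music_library_by_city is an illustrative table name:

```python
# Query 3 failed because city is a regular column. The usual Cassandra answer
# (not shown in the exercise) is a second table whose partition key matches the
# query, instead of ALLOW FILTERING. Assumes a local node and the udacity
# keyspace; music_library_by_city is an illustrative table name.
from cassandra.cluster import Cluster

cluster = Cluster(['127.0.0.1'])
session = cluster.connect('udacity')

session.execute("""
    CREATE TABLE IF NOT EXISTS music_library_by_city (
        year int,
        city text,
        artist_name text,
        album_name text,
        PRIMARY KEY ((year, city), artist_name, album_name)
    )
""")

insert = ("INSERT INTO music_library_by_city (year, city, artist_name, album_name) "
          "VALUES (%s, %s, %s, %s)")
session.execute(insert, (1965, "London", "The Who", "My Generation"))

# 'Albums released in a given year and recorded in London' now hits exactly
# one partition.
for row in session.execute(
        "SELECT * FROM music_library_by_city WHERE year = 1965 AND city = 'London'"):
    print(row.year, row.artist_name, row.album_name, row.city)

session.execute("DROP TABLE music_library_by_city")
session.shutdown()
cluster.shutdown()
```

Duplicating data into one table per query pattern is the normal trade-off in Cassandra modelling: storage is spent to keep every read a single-partition lookup.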
/1_Data_Modelling/images/basics_of_cassandra.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/1_Data_Modelling/images/basics_of_cassandra.png
--------------------------------------------------------------------------------
/1_Data_Modelling/images/dimension_fact_tables.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/1_Data_Modelling/images/dimension_fact_tables.png
--------------------------------------------------------------------------------
/1_Data_Modelling/images/music_store_database_with_star_schema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/1_Data_Modelling/images/music_store_database_with_star_schema.png
--------------------------------------------------------------------------------
/1_Data_Modelling/project/event_data/2018-11-01-events.csv:
--------------------------------------------------------------------------------
1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId
2 | ,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54092E+12,38,,200,1.54111E+12,39
3 | ,Logged In,Kaylee,F,0,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1.54034E+12,139,,200,1.54111E+12,8
4 | Des'ree,Logged In,Kaylee,F,1,Summers,246.30812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,You Gotta Be,200,1.54111E+12,8
5 | ,Logged In,Kaylee,F,2,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Upgrade,1.54034E+12,139,,200,1.54111E+12,8
6 | Mr Oizo,Logged In,Kaylee,F,3,Summers,144.03873,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Flat 55,200,1.54111E+12,8
7 | Tamba Trio,Logged In,Kaylee,F,4,Summers,177.18812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Quem Quiser Encontrar O Amor,200,1.54111E+12,8
8 | The Mars Volta,Logged In,Kaylee,F,5,Summers,380.42077,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Eriatarka,200,1.54111E+12,8
9 | Infected Mushroom,Logged In,Kaylee,F,6,Summers,440.2673,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Becoming Insane,200,1.54111E+12,8
10 | Blue October / Imogen Heap,Logged In,Kaylee,F,7,Summers,241.3971,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Congratulations,200,1.54111E+12,8
11 | Girl Talk,Logged In,Kaylee,F,8,Summers,160.15628,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Once again,200,1.54111E+12,8
12 | Black Eyed Peas,Logged In,Sylvie,F,0,Cruz,214.93506,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",PUT,NextSong,1.54027E+12,9,Pump It,200,1.54111E+12,10
13 | ,Logged In,Ryan,M,0,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,169,,200,1.54111E+12,26
14 | Fall Out Boy,Logged In,Ryan,M,1,Smith,200.72444,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,169,Nobody Puts Baby In The Corner,200,1.54111E+12,26
15 | M.I.A.,Logged In,Ryan,M,2,Smith,233.7171,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,169,Mango Pickle Down River (With The Wilcannia Mob),200,1.54111E+12,26
16 | Survivor,Logged In,Jayden,M,0,Fox,245.36771,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,100,Eye Of The Tiger,200,1.54111E+12,101
17 |
--------------------------------------------------------------------------------
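
The data modelling project combines these daily logs into the event_datafile_new.csv listed in the tree above. A rough pandas sketch of that kind of preprocessing, assuming the files sit under 1_Data_Modelling/project/event_data/; the column selection here is illustrative rather than the project's exact list:

```python
# Rough sketch of combining the daily event logs into one file, in the spirit
# of the project's event_datafile_new.csv. The path and column selection here
# are illustrative, not the project's exact choices.
import glob
import pandas as pd

files = sorted(glob.glob("1_Data_Modelling/project/event_data/*-events.csv"))
events = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)

# Keep only rows that represent an actual song play, plus a few useful columns.
songplays = events.loc[
    events["page"] == "NextSong",
    ["artist", "firstName", "lastName", "song", "length", "sessionId", "userId"],
]
songplays.to_csv("event_datafile_new.csv", index=False)
```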
/1_Data_Modelling/project/event_data/2018-11-25-events.csv:
--------------------------------------------------------------------------------
1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId
2 | matchbox twenty,Logged In,Jayden,F,0,Duffy,177.65832,free,"Seattle-Tacoma-Bellevue, WA",PUT,NextSong,1.54015E+12,846,Argue (LP Version),200,1.54311E+12,76
3 | The Lonely Island / T-Pain,Logged In,Jayden,F,1,Duffy,156.23791,free,"Seattle-Tacoma-Bellevue, WA",PUT,NextSong,1.54015E+12,846,I'm On A Boat,200,1.54311E+12,76
4 | ,Logged In,Jayden,F,2,Duffy,,free,"Seattle-Tacoma-Bellevue, WA",GET,Home,1.54015E+12,846,,200,1.54311E+12,76
5 | ,Logged In,Jayden,F,3,Duffy,,free,"Seattle-Tacoma-Bellevue, WA",GET,Settings,1.54015E+12,846,,200,1.54311E+12,76
6 | ,Logged In,Jayden,F,4,Duffy,,free,"Seattle-Tacoma-Bellevue, WA",PUT,Save Settings,1.54015E+12,846,,307,1.54311E+12,76
7 | John Mayer,Logged In,Wyatt,M,0,Scott,275.27791,free,"Eureka-Arcata-Fortuna, CA",PUT,NextSong,1.54087E+12,856,All We Ever Do Is Say Goodbye,200,1.54311E+12,9
8 | ,Logged In,Wyatt,M,1,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1.54087E+12,856,,200,1.54311E+12,9
9 | 10_000 Maniacs,Logged In,Wyatt,M,2,Scott,251.8722,free,"Eureka-Arcata-Fortuna, CA",PUT,NextSong,1.54087E+12,856,Gun Shy (LP Version),200,1.54311E+12,9
10 | Leona Lewis,Logged In,Chloe,F,0,Cuevas,203.88526,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Forgive Me,200,1.54312E+12,49
11 | Nine Inch Nails,Logged In,Chloe,F,1,Cuevas,277.83791,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,La Mer,200,1.54312E+12,49
12 | Audioslave,Logged In,Chloe,F,2,Cuevas,334.91546,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I Am The Highway,200,1.54312E+12,49
13 | Kid Rock,Logged In,Chloe,F,3,Cuevas,296.95955,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,All Summer Long (Album Version),200,1.54312E+12,49
14 | The Jets,Logged In,Chloe,F,4,Cuevas,220.89098,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I Do You,200,1.54312E+12,49
15 | The Gerbils,Logged In,Chloe,F,5,Cuevas,27.01016,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,(iii),200,1.54312E+12,49
16 | Damian Marley / Stephen Marley / Yami Bolo,Logged In,Chloe,F,6,Cuevas,304.69179,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Still Searching,200,1.54312E+12,49
17 | ,Logged In,Chloe,F,7,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,916,,200,1.54312E+12,49
18 | The Bloody Beetroots,Logged In,Chloe,F,8,Cuevas,201.97832,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Warp 1.9 (feat. Steve Aoki),200,1.54312E+12,49
19 | ,Logged In,Chloe,F,9,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,916,,200,1.54313E+12,49
20 | The Specials,Logged In,Chloe,F,10,Cuevas,188.81261,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Rat Race,200,1.54313E+12,49
21 | The Lively Ones,Logged In,Chloe,F,11,Cuevas,142.52363,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Walkin' The Board (LP Version),200,1.54313E+12,49
22 | Katie Melua,Logged In,Chloe,F,12,Cuevas,252.78649,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Blues In The Night,200,1.54313E+12,49
23 | Jason Mraz,Logged In,Chloe,F,13,Cuevas,243.48689,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I'm Yours (Album Version),200,1.54313E+12,49
24 | Fisher,Logged In,Chloe,F,14,Cuevas,133.98159,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Rianna,200,1.54313E+12,49
25 | Zee Avi,Logged In,Chloe,F,15,Cuevas,160.62649,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,No Christmas For Me,200,1.54313E+12,49
26 | Black Eyed Peas,Logged In,Chloe,F,16,Cuevas,289.12281,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I Gotta Feeling,200,1.54313E+12,49
27 | Emiliana Torrini,Logged In,Chloe,F,17,Cuevas,184.29342,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Sunny Road,200,1.54313E+12,49
28 | ,Logged In,Chloe,F,18,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,916,,200,1.54313E+12,49
29 | Days Of The New,Logged In,Chloe,F,19,Cuevas,258.5073,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,The Down Town,200,1.54313E+12,49
30 | Julio Iglesias duet with Willie Nelson,Logged In,Chloe,F,20,Cuevas,212.16608,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,To All The Girls I've Loved Before (With Julio Iglesias),200,1.54313E+12,49
31 | ,Logged In,Jacqueline,F,0,Lynch,,paid,"Atlanta-Sandy Springs-Roswell, GA",GET,Home,1.54022E+12,914,,200,1.54313E+12,29
32 | Jason Mraz & Colbie Caillat,Logged In,Chloe,F,0,Roth,189.6224,free,"Indianapolis-Carmel-Anderson, IN",PUT,NextSong,1.5407E+12,704,Lucky (Album Version),200,1.54314E+12,78
33 | ,Logged In,Anabelle,F,0,Simpson,,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",GET,Home,1.54104E+12,901,,200,1.54315E+12,69
34 | R. Kelly,Logged In,Anabelle,F,1,Simpson,234.39628,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,901,The World's Greatest,200,1.54315E+12,69
35 | ,Logged In,Kynnedi,F,0,Sanchez,,free,"Cedar Rapids, IA",GET,Home,1.54108E+12,804,,200,1.54315E+12,89
36 | Jacky Terrasson,Logged In,Marina,F,0,Sutton,342.7522,free,"Salinas, CA",PUT,NextSong,1.54106E+12,373,Le Jardin d'Hiver,200,1.54315E+12,48
37 | Papa Roach,Logged In,Theodore,M,0,Harris,202.1873,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,813,Alive,200,1.54316E+12,14
38 | Burt Bacharach,Logged In,Theodore,M,1,Harris,156.96934,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,813,Casino Royale Theme (Main Title),200,1.54316E+12,14
39 | ,Logged In,Chloe,F,0,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,923,,200,1.54316E+12,49
40 | Floetry,Logged In,Chloe,F,1,Cuevas,254.48444,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,Sunshine,200,1.54316E+12,49
41 | The Rakes,Logged In,Chloe,F,2,Cuevas,225.2273,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,Leave The City And Come Home,200,1.54316E+12,49
42 | Dwight Yoakam,Logged In,Chloe,F,3,Cuevas,239.3073,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,You're The One,200,1.54316E+12,49
43 | Ween,Logged In,Chloe,F,4,Cuevas,228.10077,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,Voodoo Lady,200,1.54316E+12,49
44 | Café Quijano,Logged In,Chloe,F,5,Cuevas,197.32853,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,La Lola,200,1.54316E+12,49
45 | ,Logged In,Chloe,F,0,Roth,,free,"Indianapolis-Carmel-Anderson, IN",GET,Home,1.5407E+12,925,,200,1.54317E+12,78
46 | Parov Stelar,Logged In,Chloe,F,1,Roth,203.65016,free,"Indianapolis-Carmel-Anderson, IN",PUT,NextSong,1.5407E+12,925,Good Bye Emily (feat. Gabriella Hanninen),200,1.54317E+12,78
47 | ,Logged In,Chloe,F,2,Roth,,free,"Indianapolis-Carmel-Anderson, IN",GET,Home,1.5407E+12,925,,200,1.54317E+12,78
48 | ,Logged In,Tegan,F,0,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.54079E+12,915,,200,1.54317E+12,80
49 | Bryan Adams,Logged In,Tegan,F,1,Levine,166.29506,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,I Will Always Return,200,1.54317E+12,80
50 | KT Tunstall,Logged In,Tegan,F,2,Levine,192.31302,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,White Bird,200,1.54317E+12,80
51 | Technicolour,Logged In,Tegan,F,3,Levine,235.12771,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,Turn Away,200,1.54317E+12,80
52 | The Dears,Logged In,Tegan,F,4,Levine,289.95873,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,Lost In The Plot,200,1.54317E+12,80
53 | Go West,Logged In,Tegan,F,5,Levine,259.49995,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,Never Let Them See You Sweat,200,1.54317E+12,80
54 | ,Logged In,Tegan,F,6,Levine,,paid,"Portland-South Portland, ME",PUT,Logout,1.54079E+12,915,,307,1.54317E+12,80
55 | ,Logged In,Sylvie,F,0,Cruz,,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",GET,Home,1.54027E+12,912,,200,1.54317E+12,10
56 | ,Logged Out,,,7,,,paid,,GET,Home,,915,,200,1.54317E+12,
57 | Gondwana,Logged In,Jordan,F,0,Hicks,262.5824,free,"Salinas, CA",PUT,NextSong,1.54001E+12,814,Mi Princesa,200,1.54319E+12,37
58 | ,Logged In,Kevin,M,0,Arellano,,free,"Harrisburg-Carlisle, PA",GET,Home,1.54001E+12,855,,200,1.54319E+12,66
59 | Ella Fitzgerald,Logged In,Jordan,F,1,Hicks,427.15383,free,"Salinas, CA",PUT,NextSong,1.54001E+12,814,On Green Dolphin Street (Medley) (1999 Digital Remaster),200,1.54319E+12,37
60 | Creedence Clearwater Revival,Logged In,Jordan,F,2,Hicks,184.73751,free,"Salinas, CA",PUT,NextSong,1.54001E+12,814,Run Through The Jungle,200,1.54319E+12,37
61 |
--------------------------------------------------------------------------------
/1_Data_Modelling/project/images/image_event_datafile_new.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/1_Data_Modelling/project/images/image_event_datafile_new.jpg
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/.DS_Store
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/exercises/10_Parallel_ETL.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Exercise 3: Parallel ETL"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "%load_ext sql"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "import boto3\n",
26 | "import configparser\n",
27 | "import matplotlib.pyplot as plt\n",
28 | "import pandas as pd\n",
29 | "from time import time"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "# STEP 1: Get the params of the created redshift cluster \n",
37 | "- We need:\n",
38 | " - The redshift cluster endpoint\n",
39 | " - The IAM role ARN that give access to Redshift to read from S3"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 3,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "config = configparser.ConfigParser()\n",
49 | "config.read_file(open('dwh.cfg'))\n",
50 | "KEY=config.get('AWS','key')\n",
51 | "SECRET= config.get('AWS','secret')\n",
52 | "\n",
53 | "DWH_DB= config.get(\"DWH\",\"DWH_DB\")\n",
54 | "DWH_DB_USER= config.get(\"DWH\",\"DWH_DB_USER\")\n",
55 | "DWH_DB_PASSWORD= config.get(\"DWH\",\"DWH_DB_PASSWORD\")\n",
56 | "DWH_PORT = config.get(\"DWH\",\"DWH_PORT\")"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 4,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "# FILL IN THE REDSHIFT ENPOINT HERE\n",
66 | "DWH_ENDPOINT=\"dwhcluster.cartrrecbcpi.us-west-2.redshift.amazonaws.com\" \n",
67 | " \n",
68 | "#FILL IN THE IAM ROLE ARN you got in step 2.2 of the previous exercise\n",
69 | "DWH_ROLE_ARN=\"arn:aws:iam::261476836151:role/dwhRole\""
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "# STEP 2: Connect to the Redshift Cluster"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 5,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "name": "stdout",
86 | "output_type": "stream",
87 | "text": [
88 | "postgresql://dwhuser:Passw0rd@dwhcluster.cartrrecbcpi.us-west-2.redshift.amazonaws.com:5439/dwh\n"
89 | ]
90 | },
91 | {
92 | "data": {
93 | "text/plain": [
94 | "'Connected: dwhuser@dwh'"
95 | ]
96 | },
97 | "execution_count": 5,
98 | "metadata": {},
99 | "output_type": "execute_result"
100 | }
101 | ],
102 | "source": [
103 | "conn_string=\"postgresql://{}:{}@{}:{}/{}\".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)\n",
104 | "print(conn_string)\n",
105 | "%sql $conn_string"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 6,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "s3 = boto3.resource(\n",
115 | " 's3',\n",
116 | " region_name=\"us-west-2\",\n",
117 | " aws_access_key_id=KEY,\n",
118 | " aws_secret_access_key=SECRET\n",
119 | ")\n",
120 | "# TODO: Create S3 cient\n",
121 | "\n",
122 | "sampleDbBucket = s3.Bucket(\"udacity-labs\") # TODO: Create udacity-labs bucket"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 7,
128 | "metadata": {},
129 | "outputs": [
130 | {
131 | "name": "stdout",
132 | "output_type": "stream",
133 | "text": [
134 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/')\n",
135 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/full/')\n",
136 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/full/full.csv.gz')\n",
137 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/')\n",
138 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00000-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')\n",
139 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00001-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')\n",
140 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00002-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')\n",
141 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00003-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')\n",
142 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00004-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')\n",
143 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00005-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')\n",
144 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00006-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')\n",
145 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00007-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')\n",
146 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00008-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')\n",
147 | "s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00009-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')\n"
148 | ]
149 | }
150 | ],
151 | "source": [
152 | "for obj in sampleDbBucket.objects.filter(Prefix=\"tickets\"):\n",
153 | " print(obj)"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "# STEP 3: Create Tables"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 8,
166 | "metadata": {},
167 | "outputs": [
168 | {
169 | "name": "stdout",
170 | "output_type": "stream",
171 | "text": [
172 | " * postgresql://dwhuser:***@dwhcluster.cartrrecbcpi.us-west-2.redshift.amazonaws.com:5439/dwh\n",
173 | "Done.\n",
174 | "Done.\n"
175 | ]
176 | },
177 | {
178 | "data": {
179 | "text/plain": [
180 | "[]"
181 | ]
182 | },
183 | "execution_count": 8,
184 | "metadata": {},
185 | "output_type": "execute_result"
186 | }
187 | ],
188 | "source": [
189 | "%%sql \n",
190 | "DROP TABLE IF EXISTS \"sporting_event_ticket\";\n",
191 | "CREATE TABLE \"sporting_event_ticket\" (\n",
192 | " \"id\" double precision DEFAULT nextval('sporting_event_ticket_seq') NOT NULL,\n",
193 | " \"sporting_event_id\" double precision NOT NULL,\n",
194 | " \"sport_location_id\" double precision NOT NULL,\n",
195 | " \"seat_level\" numeric(1,0) NOT NULL,\n",
196 | " \"seat_section\" character varying(15) NOT NULL,\n",
197 | " \"seat_row\" character varying(10) NOT NULL,\n",
198 | " \"seat\" character varying(10) NOT NULL,\n",
199 | " \"ticketholder_id\" double precision,\n",
200 | " \"ticket_price\" numeric(8,2) NOT NULL\n",
201 | ");"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "# STEP 4: Load Partitioned data into the cluster\n",
209 | "Use the COPY command to load data from `s3://udacity-labs/tickets/split/part` using your iam role credentials. Use gzip delimiter `;`."
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 9,
215 | "metadata": {},
216 | "outputs": [
217 | {
218 | "name": "stdout",
219 | "output_type": "stream",
220 | "text": [
221 | " * postgresql://dwhuser:***@dwhcluster.cartrrecbcpi.us-west-2.redshift.amazonaws.com:5439/dwh\n",
222 | "Done.\n",
223 | "CPU times: user 4.46 ms, sys: 62 µs, total: 4.52 ms\n",
224 | "Wall time: 12.1 s\n"
225 | ]
226 | }
227 | ],
228 | "source": [
229 | "%%time\n",
230 | "qry = \"\"\"\n",
231 | " copy sporting_event_ticket from 's3://udacity-labs/tickets/split/part'\n",
232 | " credentials 'aws_iam_role={}'\n",
233 | " gzip delimiter ';' compupdate off region 'us-west-2';\n",
234 | "\"\"\".format(DWH_ROLE_ARN)\n",
235 | "\n",
236 | "%sql $qry"
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {},
242 | "source": [
243 | "# STEP 5: Create Tables for the non-partitioned data"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 10,
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "name": "stdout",
253 | "output_type": "stream",
254 | "text": [
255 | " * postgresql://dwhuser:***@dwhcluster.cartrrecbcpi.us-west-2.redshift.amazonaws.com:5439/dwh\n",
256 | "Done.\n",
257 | "Done.\n"
258 | ]
259 | },
260 | {
261 | "data": {
262 | "text/plain": [
263 | "[]"
264 | ]
265 | },
266 | "execution_count": 10,
267 | "metadata": {},
268 | "output_type": "execute_result"
269 | }
270 | ],
271 | "source": [
272 | "%%sql\n",
273 | "DROP TABLE IF EXISTS \"sporting_event_ticket_full\";\n",
274 | "CREATE TABLE \"sporting_event_ticket_full\" (\n",
275 | " \"id\" double precision DEFAULT nextval('sporting_event_ticket_seq') NOT NULL,\n",
276 | " \"sporting_event_id\" double precision NOT NULL,\n",
277 | " \"sport_location_id\" double precision NOT NULL,\n",
278 | " \"seat_level\" numeric(1,0) NOT NULL,\n",
279 | " \"seat_section\" character varying(15) NOT NULL,\n",
280 | " \"seat_row\" character varying(10) NOT NULL,\n",
281 | " \"seat\" character varying(10) NOT NULL,\n",
282 | " \"ticketholder_id\" double precision,\n",
283 | " \"ticket_price\" numeric(8,2) NOT NULL\n",
284 | ");"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "# STEP 6: Load non-partitioned data into the cluster\n",
292 | "Use the COPY command to load data from `s3://udacity-labs/tickets/full/full.csv.gz` using your iam role credentials. Use gzip delimiter `;`.\n",
293 | "\n",
294 | "- Note how it's slower than loading partitioned data"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 11,
300 | "metadata": {},
301 | "outputs": [
302 | {
303 | "name": "stdout",
304 | "output_type": "stream",
305 | "text": [
306 | " * postgresql://dwhuser:***@dwhcluster.cartrrecbcpi.us-west-2.redshift.amazonaws.com:5439/dwh\n",
307 | "Done.\n",
308 | "CPU times: user 4.32 ms, sys: 0 ns, total: 4.32 ms\n",
309 | "Wall time: 22.2 s\n"
310 | ]
311 | }
312 | ],
313 | "source": [
314 | "%%time\n",
315 | "\n",
316 | "qry = \"\"\"\n",
317 | " copy sporting_event_ticket_full from 's3://udacity-labs/tickets/full/full.csv.gz' \n",
318 | " credentials 'aws_iam_role={}' \n",
319 | " gzip delimiter ';' compupdate off region 'us-west-2';\n",
320 | "\"\"\".format(DWH_ROLE_ARN)\n",
321 | "\n",
322 | "%sql $qry"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {},
329 | "outputs": [],
330 | "source": []
331 | }
332 | ],
333 | "metadata": {
334 | "kernelspec": {
335 | "display_name": "Python 3",
336 | "language": "python",
337 | "name": "python3"
338 | },
339 | "language_info": {
340 | "codemirror_mode": {
341 | "name": "ipython",
342 | "version": 3
343 | },
344 | "file_extension": ".py",
345 | "mimetype": "text/x-python",
346 | "name": "python",
347 | "nbconvert_exporter": "python",
348 | "pygments_lexer": "ipython3",
349 | "version": "3.11.1 (main, Dec 23 2022, 09:39:26) [Clang 14.0.0 (clang-1400.0.29.202)]"
350 | },
351 | "vscode": {
352 | "interpreter": {
353 | "hash": "1a1af0ee75eeea9e2e1ee996c87e7a2b11a0bebd85af04bb136d915cefc0abce"
354 | }
355 | }
356 | },
357 | "nbformat": 4,
358 | "nbformat_minor": 2
359 | }
360 |
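A hedged sketch accompanying the notebook above: the same parallel COPY from STEP 4, issued through psycopg2 instead of the `%sql` magic. The cluster endpoint and IAM role ARN are placeholders; the bucket, prefix, and COPY options are the ones used in the notebook.

```python
# Illustrative only: load the partitioned ticket files with a single COPY,
# letting Redshift read the part files under the prefix in parallel.
import configparser
import psycopg2

config = configparser.ConfigParser()
config.read("dwh.cfg")

conn = psycopg2.connect(
    host="<your-redshift-endpoint>",                  # placeholder, not a real endpoint
    dbname=config.get("DWH", "DWH_DB"),
    user=config.get("DWH", "DWH_DB_USER"),
    password=config.get("DWH", "DWH_DB_PASSWORD"),
    port=config.get("DWH", "DWH_PORT"),
)

copy_partitioned = """
    COPY sporting_event_ticket
    FROM 's3://udacity-labs/tickets/split/part'
    CREDENTIALS 'aws_iam_role=<your-iam-role-arn>'    -- placeholder ARN
    GZIP DELIMITER ';' COMPUPDATE OFF REGION 'us-west-2';
"""

with conn.cursor() as cur:
    cur.execute(copy_partitioned)
conn.commit()
conn.close()
```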
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/exercises/dwh.cfg:
--------------------------------------------------------------------------------
1 | [AWS]
2 | KEY=
3 | SECRET=
4 |
5 | [DWH]
6 | DWH_CLUSTER_TYPE=multi-node
7 | DWH_NUM_NODES=4
8 | DWH_NODE_TYPE=dc2.large
9 |
10 | DWH_IAM_ROLE_NAME=dwhRole
11 | DWH_CLUSTER_IDENTIFIER=dwhCluster
12 | DWH_DB=dwh
13 | DWH_DB_USER=dwhuser
14 | DWH_DB_PASSWORD=Passw0rd
15 | DWH_PORT=5439
16 |
17 |
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/1-introduction-to-cloud-data-warehouses.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/1-introduction-to-cloud-data-warehouses.jpg
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/2-introduction-to-datawarehousing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/2-introduction-to-datawarehousing.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/3-DW_ETL_Design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/3-DW_ETL_Design.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/4-Kimballs_Bus_Architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/4-Kimballs_Bus_Architecture.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/5-DWH_Tech_Perspective.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/5-DWH_Tech_Perspective.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/6-pagila_star_schema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/6-pagila_star_schema.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/7_pagila-3nf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/7_pagila-3nf.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/all_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/all_distribution.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/amazon_redshift.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/amazon_redshift.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/dwh_etl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/dwh_etl.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/even_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/even_distribution.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/ingesting_with_manifest_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/ingesting_with_manifest_example.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/ingesting_with_prefix_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/ingesting_with_prefix_example.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/key_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/key_distribution.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/olap_cube.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/olap_cube.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/redshift_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/redshift_architecture.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/redshift_etl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/redshift_etl.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/redshift_etl_dataflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/redshift_etl_dataflow.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/redshift_node_types.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/redshift_node_types.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/sorting_and_dist_key_syntax.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/sorting_and_dist_key_syntax.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/sorting_key_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/sorting_key_distribution.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/images/tutorial-optimize-tables-ssb-data-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/images/tutorial-optimize-tables-ssb-data-model.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/project/README.md:
--------------------------------------------------------------------------------
1 | # Project: Data Warehouse
2 |
3 | ## Introduction
4 |
5 | A music streaming startup, Sparkify, has grown its user base and song database and wants to move its processes and data onto the cloud. Its data resides in S3, in a directory of JSON logs of user activity on the app, as well as a directory of JSON metadata on the songs in the app.
6 |
7 | As their data engineer, you are tasked with building an ETL pipeline that extracts their data from S3, stages them in Redshift, and transforms data into a set of dimensional tables for their analytics team to continue finding insights into what songs their users are listening to.
8 |
9 |
10 |
11 |
12 |
13 | ## Database schema design
14 |
15 | ### Staging tables
16 |
17 | - **staging_events**: stores data extracted from JSON logs on user activity. Columns: *artist, auth, firstName, gender, itemInSession, lastName, length, level, location, method, page, registration, sessionId, song, status, ts, userAgent, userId*
18 | - **staging_songs**: stores data extracted from JSON metadata on the songs in the app. Columns: *num_songs, artist_id, artist_latitude, artist_longitude, artist_location, artist_name, song_id, title, duration, year*
19 |
20 | ### Analytical tables
21 |
22 | - **Fact Table**
23 | - **songplays**: records in event data associated with song plays i.e. records with page NextSong. Columns: *songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent*
24 | - **Dimension Tables**
25 | - **users**: users in the app. Columns: *user_id, first_name, last_name, gender, level*
26 | - **songs**: songs in music database. Columns: *song_id, title, artist_id, year, duration*
27 | - **artists**: artists in music database. Columns: *artist_id, name, location, latitude, longitude*
28 | - **time**: timestamps of records in songplays broken down into specific units. Columns: *start_time, hour, day, week, month, year, weekday*
29 |
30 | ## ETL pipeline
31 |
32 | - [create_tables.py](create_tables.py) drops all existing tables and recreates them as per the queries defined in [sql_queries.py](sql_queries.py).
33 | - [etl.py](etl.py) copies data from S3 into the staging tables and then populates the *fact* and *dimension* tables.
34 |
35 | Run [main.ipynb](main.ipynb) to execute the complete project flow: setting up the AWS resources, running the aforementioned ETL pipeline, and cleaning up the resources.
36 |
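A minimal illustration (not part of the project code) of how the star schema above supports the stated analytics goal: the query below joins the `songplays` fact table to its dimensions to list the most-played songs. It assumes the `[CLUSTER]` settings in `dwh.cfg` and reuses the same connection pattern as `etl.py`.

```python
# Illustrative sketch only: query the star schema for the most-played songs.
import configparser
import psycopg2

config = configparser.ConfigParser()
config.read("dwh.cfg")

# Same positional connection pattern used by create_tables.py and etl.py.
conn = psycopg2.connect(
    "host={} dbname={} user={} password={} port={}".format(*config["CLUSTER"].values())
)
cur = conn.cursor()

cur.execute("""
    SELECT s.title, a.name AS artist, COUNT(*) AS plays
    FROM songplays sp
    JOIN songs   s ON sp.song_id   = s.song_id
    JOIN artists a ON sp.artist_id = a.artist_id
    GROUP BY s.title, a.name
    ORDER BY plays DESC
    LIMIT 5;
""")
for title, artist, plays in cur.fetchall():
    print(title, artist, plays)

cur.close()
conn.close()
```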
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/project/create_tables.py:
--------------------------------------------------------------------------------
1 | import configparser
2 | import psycopg2
3 | from sql_queries import create_table_queries, drop_table_queries
4 |
5 |
6 | def drop_tables(cur, conn):
7 | """
8 | Drop all existing staging, fact, dimension tables
9 |
10 | Args:
11 | conn: (connection) instance of connection class
12 | cur: (cursor) instance of cursor class
13 |
14 | Returns:
15 | none
16 | """
17 | for query in drop_table_queries:
18 | cur.execute(query)
19 | conn.commit()
20 |
21 |
22 | def create_tables(cur, conn):
23 | """
24 | Create staging, fact, dimension tables
25 |
26 | Args:
27 | conn: (connection) instance of connection class
28 | cur: (cursor) instance of cursor class
29 |
30 | Returns:
31 | none
32 | """
33 | for query in create_table_queries:
34 | cur.execute(query)
35 | conn.commit()
36 |
37 |
38 | def main():
39 | config = configparser.ConfigParser()
40 | config.read('dwh.cfg')
41 |
42 | # Establish the connection to database
43 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
44 |
45 | # Create cursor object
46 | cur = conn.cursor()
47 |
48 | drop_tables(cur, conn)
49 | create_tables(cur, conn)
50 |
51 | conn.close()
52 |
53 | if __name__ == "__main__":
54 | main()
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/project/dwh.cfg:
--------------------------------------------------------------------------------
1 | [AWS]
2 | region = us-west-2
3 |
4 | [CLUSTER]
5 | host = dwhcluster.cartrrecbcpi.us-west-2.redshift.amazonaws.com
6 | db_name = dwh
7 | db_user = dwhuser
8 | db_password = Passw0rd
9 | db_port = 5439
10 |
11 | [DWH]
12 | dwh_cluster_type = multi-node
13 | dwh_num_nodes = 4
14 | dwh_node_type = dc2.large
15 | dwh_cluster_identifier = dwhCluster
16 |
17 | [IAM]
18 | role_name = dwhRole
19 | role_arn = arn:aws:iam::261476836151:role/dwhRole
20 |
21 | [S3]
22 | log_data = 's3://udacity-dend/log_data'
23 | log_jsonpath = 's3://udacity-dend/log_json_path.json'
24 | song_data = 's3://udacity-dend/song_data'
25 |
26 |
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/project/etl.py:
--------------------------------------------------------------------------------
1 | import configparser
2 | import psycopg2
3 | from sql_queries import copy_table_queries, insert_table_queries
4 |
5 |
6 | def load_staging_tables(cur, conn):
7 | """
8 | Populate staging tables from S3
9 |
10 | Args:
11 | conn: (connection) instance of connection class
12 | cur: (cursor) instance of cursor class
13 |
14 | Returns:
15 | none
16 | """
17 | for query in copy_table_queries:
18 | cur.execute(query)
19 | conn.commit()
20 |
21 |
22 | def insert_tables(cur, conn):
23 | """
24 | Populate fact and dimension tables from staging tables
25 |
26 | Args:
27 | conn: (connection) instance of connection class
28 | cur: (cursor) instance of cursor class
29 |
30 | Returns:
31 | none
32 | """
33 | for query in insert_table_queries:
34 | cur.execute(query)
35 | conn.commit()
36 |
37 | def main():
38 | config = configparser.ConfigParser()
39 | config.read('dwh.cfg')
40 |
41 | # Establish the connection to database
42 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
43 |
44 | # Create cursor object
45 | cur = conn.cursor()
46 |
47 | # Load the staging tables from S3, then populate the fact and dimension tables
48 | load_staging_tables(cur, conn)
49 | insert_tables(cur, conn)
50 |
51 | conn.close()
52 |
53 |
54 | if __name__ == "__main__":
55 | main()
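A small side note on the connection string used above: it depends on the key order of the `[CLUSTER]` section in `dwh.cfg`. A hedged alternative (sketch only, not how the project scripts do it) reads each key explicitly, using the key names shown in the project's `dwh.cfg`.

```python
# Illustrative only: build the same connection with explicit config keys,
# so the result does not depend on the order of keys in dwh.cfg.
import configparser
import psycopg2

config = configparser.ConfigParser()
config.read("dwh.cfg")
cluster = config["CLUSTER"]

conn = psycopg2.connect(
    host=cluster["host"],
    dbname=cluster["db_name"],
    user=cluster["db_user"],
    password=cluster["db_password"],
    port=cluster["db_port"],
)
conn.close()
```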
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/project/images/sparkify-s3-to-redshift-etl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/2_Cloud_Data_Warehouses/project/images/sparkify-s3-to-redshift-etl.png
--------------------------------------------------------------------------------
/2_Cloud_Data_Warehouses/project/sql_queries.py:
--------------------------------------------------------------------------------
1 | import configparser
2 |
3 | # CONFIG
4 | config = configparser.ConfigParser()
5 | config.read('dwh.cfg')
6 |
7 | DWH_ROLE_ARN = config.get("IAM","ROLE_ARN")
8 | LOG_DATA = config.get("S3","LOG_DATA")
9 | LOG_JSONPATH = config.get("S3", "LOG_JSONPATH")
10 | SONG_DATA = config.get("S3", "SONG_DATA")
11 |
12 | # DROP TABLES
13 | staging_events_table_drop = "DROP TABLE IF EXISTS staging_events"
14 | staging_songs_table_drop = "DROP TABLE IF EXISTS staging_songs"
15 | songplay_table_drop = "DROP TABLE IF EXISTS songplays"
16 | user_table_drop = "DROP TABLE IF EXISTS users"
17 | song_table_drop = "DROP TABLE IF EXISTS songs"
18 | artist_table_drop = "DROP TABLE IF EXISTS artists"
19 | time_table_drop = "DROP TABLE IF EXISTS time"
20 |
21 | # CREATE TABLES
22 | staging_events_table_create= ("""
23 | CREATE TABLE IF NOT EXISTS staging_events (
24 | artist VARCHAR,
25 | auth VARCHAR,
26 | firstName VARCHAR,
27 | gender CHAR(1),
28 | itemInSession INTEGER,
29 | lastName VARCHAR,
30 | length FLOAT,
31 | level VARCHAR,
32 | location TEXT,
33 | method VARCHAR,
34 | page VARCHAR,
35 | registration FLOAT,
36 | sessionId INTEGER,
37 | song VARCHAR,
38 | status INTEGER,
39 | ts BIGINT,
40 | userAgent TEXT,
41 | userId INTEGER);
42 | """)
43 |
44 | staging_songs_table_create = ("""
45 | CREATE TABLE IF NOT EXISTS staging_songs (
46 | num_songs INTEGER,
47 | artist_id VARCHAR,
48 | artist_latitude FLOAT,
49 | artist_longitude FLOAT,
50 | artist_location TEXT,
51 | artist_name VARCHAR,
52 | song_id VARCHAR,
53 | title VARCHAR,
54 | duration FLOAT,
55 | year INTEGER);
56 | """)
57 |
58 | songplay_table_create = ("""
59 | CREATE TABLE IF NOT EXISTS songplays (
60 | songplay_id INTEGER IDENTITY(0,1) NOT NULL PRIMARY KEY,
61 | start_time TIMESTAMP,
62 | user_id INTEGER,
63 | level VARCHAR,
64 | song_id VARCHAR,
65 | artist_id VARCHAR,
66 | session_id INTEGER,
67 | location TEXT,
68 | user_agent TEXT);
69 | """)
70 |
71 | user_table_create = ("""
72 | CREATE TABLE IF NOT EXISTS users (
73 | user_id INTEGER NOT NULL PRIMARY KEY,
74 | first_name VARCHAR,
75 | last_name VARCHAR,
76 | gender CHAR(1),
77 | level VARCHAR);
78 | """)
79 |
80 | song_table_create = ("""
81 | CREATE TABLE IF NOT EXISTS songs (
82 | song_id VARCHAR NOT NULL PRIMARY KEY,
83 | title VARCHAR,
84 | artist_id VARCHAR,
85 | year INT,
86 | duration FLOAT);
87 | """)
88 |
89 | artist_table_create = ("""
90 | CREATE TABLE IF NOT EXISTS artists (
91 | artist_id VARCHAR NOT NULL PRIMARY KEY,
92 | name VARCHAR,
93 | location TEXT ,
94 | latitude FLOAT ,
95 | longitude FLOAT);
96 | """)
97 |
98 | time_table_create = ("""
99 | CREATE TABLE IF NOT EXISTS time (
100 | start_time TIMESTAMP NOT NULL PRIMARY KEY,
101 | hour INTEGER,
102 | day INTEGER,
103 | week INTEGER,
104 | month INTEGER,
105 | year INTEGER,
106 | weekday VARCHAR);
107 | """)
108 |
109 | # STAGING TABLES
110 | staging_events_copy = ("""
111 | copy staging_events
112 | from {}
113 | credentials 'aws_iam_role={}'
114 | format as json {}
115 | compupdate off
116 | region 'us-west-2';
117 | """).format(LOG_DATA, DWH_ROLE_ARN, LOG_JSONPATH)
118 |
119 | staging_songs_copy = ("""
120 | copy staging_songs
121 | from {}
122 | credentials 'aws_iam_role={}'
123 | format as json 'auto'
124 | compupdate off
125 | region 'us-west-2';
126 | """).format(SONG_DATA, DWH_ROLE_ARN)
127 |
128 | # FINAL TABLES
129 | songplay_table_insert = ("""
130 | INSERT INTO songplays (
131 | start_time,
132 | user_id,
133 | level,
134 | song_id,
135 | artist_id,
136 | session_id,
137 | location,
138 | user_agent
139 | )
140 | SELECT
141 | timestamp 'epoch' + se.ts/1000 * interval '1 second',
142 | se.userId,
143 | se.level,
144 | ss.song_id,
145 | ss.artist_id,
146 | se.sessionId,
147 | se.location,
148 | se.userAgent
149 | FROM staging_events se
150 | JOIN staging_songs ss ON (se.song = ss.title AND se.artist = ss.artist_name)
151 | WHERE se.page = 'NextSong';
152 | """)
153 |
154 | user_table_insert = ("""
155 | INSERT INTO users (
156 | user_id,
157 | first_name,
158 | last_name,
159 | gender,
160 | level
161 | )
162 | SELECT
163 | DISTINCT userId,
164 | firstName,
165 | lastName,
166 | gender,
167 | level
168 | FROM staging_events
169 | WHERE page = 'NextSong' AND userId IS NOT NULL
170 | """)
171 |
172 | song_table_insert = ("""
173 | INSERT INTO songs (
174 | song_id,
175 | title,
176 | artist_id,
177 | year,
178 | duration
179 | )
180 | SELECT
181 | DISTINCT song_id,
182 | title,
183 | artist_id,
184 | year,
185 | duration
186 | FROM staging_songs
187 | WHERE song_id IS NOT NULL
188 | """)
189 |
190 | artist_table_insert = ("""
191 | INSERT INTO artists (
192 | artist_id,
193 | name,
194 | location,
195 | latitude,
196 | longitude
197 | )
198 | SELECT
199 | DISTINCT artist_id,
200 | artist_name,
201 | artist_location,
202 | artist_latitude,
203 | artist_longitude
204 | FROM staging_songs
205 | WHERE artist_id IS NOT NULL
206 | """)
207 |
208 | time_table_insert = ("""
209 | INSERT INTO time (
210 | start_time,
211 | hour,
212 | day,
213 | week,
214 | month,
215 | year,
216 | weekday
217 | )
218 | SELECT
219 | DISTINCT start_time,
220 | EXTRACT(hour from start_time),
221 | EXTRACT(day from start_time),
222 | EXTRACT(week from start_time),
223 | EXTRACT(month from start_time),
224 | EXTRACT(year from start_time),
225 | EXTRACT(weekday from start_time)
226 | FROM songplays
227 | """)
228 |
229 | # QUERY LISTS
230 | create_table_queries = [staging_events_table_create, staging_songs_table_create, songplay_table_create, user_table_create, song_table_create, artist_table_create, time_table_create]
231 | drop_table_queries = [staging_events_table_drop, staging_songs_table_drop, songplay_table_drop, user_table_drop, song_table_drop, artist_table_drop, time_table_drop]
232 | copy_table_queries = [staging_events_copy, staging_songs_copy]
233 | insert_table_queries = [songplay_table_insert, user_table_insert, song_table_insert, artist_table_insert, time_table_insert]
234 |
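The `songplay_table_insert` above converts the millisecond epoch column `ts` with `timestamp 'epoch' + se.ts/1000 * interval '1 second'`. A small sketch (illustration only, not used by the pipeline) of the same conversion in Python, for a sample `ts` value of the kind seen in the event logs:

```python
# Illustrative only: mirror the SQL expression
#   timestamp 'epoch' + se.ts/1000 * interval '1 second'
# for one sample millisecond-epoch value.
from datetime import datetime, timezone

ts_ms = 1543110000000                                   # sample value, ~2018-11-25
start_time = datetime.fromtimestamp(ts_ms / 1000, tz=timezone.utc)
print(start_time)                                       # 2018-11-25 01:40:00+00:00

# time_table_insert then derives hour, day, week, month, year, weekday from it.
print(start_time.hour, start_time.day, start_time.month, start_time.year)
```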
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/.DS_Store
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/exercises/1_mapreduce_practice.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MapReduce\n",
8 | "\n",
9 | "The MapReduce programming technique was designed to analyze massive data sets across a cluster. In this Jupyter notebook, you'll get a sense for how Hadoop MapReduce works; however, this notebook will run locally rather than on a cluster.\n",
10 | "\n",
11 | "The biggest difference between Hadoop and Spark is that Spark tries to do as many calculations as possible in memory, which avoids moving data back and forth across a cluster. Hadoop writes intermediate calculations out to disk, which can be less efficient. Hadoop is an older technology than Spark and one of the cornerstone big data technologies.\n",
12 | "\n",
13 | "\n",
14 | "# MapReduce versus Hadoop MapReduce\n",
15 | "\n",
16 | "Don't get confused by the terminology! MapReduce is a programming technique. Hadoop MapReduce is a specific implementation of the programming technique.\n",
17 | "\n",
18 | "Some of the syntax will look a bit funny, so be sure to read the explanation and comments for each section. You'll learn more about the syntax in later lessons. \n",
19 | "\n",
20 | "Run each of the code cells below to see the output."
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 1,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdout",
30 | "output_type": "stream",
31 | "text": [
32 | "Collecting mrjob\n",
33 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/8e/58/fc28ab743aba16e90736ad4e29694bd2adaf7b879376ff149306d50c4e90/mrjob-0.7.4-py2.py3-none-any.whl (439kB)\n",
34 | "\u001b[K 100% |████████████████████████████████| 440kB 17.1MB/s ta 0:00:01\n",
35 | "\u001b[?25hRequirement already satisfied: PyYAML>=3.10 in /opt/conda/lib/python3.6/site-packages (from mrjob) (3.12)\n",
36 | "Installing collected packages: mrjob\n",
37 | "Successfully installed mrjob-0.7.4\n"
38 | ]
39 | }
40 | ],
41 | "source": [
42 | "# Install mrjob library. This package is for running MapReduce jobs with Python\n",
43 | "# In Jupyter notebooks, \"!\" runs terminal commands from inside notebooks \n",
44 | "\n",
45 | "! pip install mrjob"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 2,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "name": "stdout",
55 | "output_type": "stream",
56 | "text": [
57 | "Overwriting wordcount.py\n"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "%%file wordcount.py\n",
63 | "# %%file is an Ipython magic function that saves the code cell as a file\n",
64 | "\n",
65 | "from mrjob.job import MRJob # import the mrjob library\n",
66 | "\n",
67 | "class MRSongCount(MRJob):\n",
68 | " \n",
69 | " # the map step: each line in the txt file is read as a key, value pair\n",
70 | " # in this case, each line in the txt file only contains a value but no key\n",
71 | " # _ means that in this case, there is no key for each line\n",
72 | " def mapper(self, _, song):\n",
73 | " # output each line as a tuple of (song_names, 1) \n",
74 | " yield (song, 1)\n",
75 | "\n",
76 | " # the reduce step: combine all tuples with the same key\n",
77 | " # in this case, the key is the song name\n",
78 | " # then sum all the values of the tuple, which will give the total song plays\n",
79 | " def reducer(self, key, values):\n",
80 | " yield (key, sum(values))\n",
81 | " \n",
82 | "if __name__ == \"__main__\":\n",
83 | " MRSongCount.run()"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 3,
89 | "metadata": {},
90 | "outputs": [
91 | {
92 | "name": "stdout",
93 | "output_type": "stream",
94 | "text": [
95 | "No configs found; falling back on auto-configuration\n",
96 | "No configs specified for inline runner\n",
97 | "Creating temp directory /tmp/wordcount.root.20230121.121623.573819\n",
98 | "Running step 1 of 1...\n",
99 | "job output is in /tmp/wordcount.root.20230121.121623.573819/output\n",
100 | "Streaming final output from /tmp/wordcount.root.20230121.121623.573819/output...\n",
101 | "\"Broken Networks\"\t510\n",
102 | "\"Data House Rock\"\t828\n",
103 | "\"Deep Dreams\"\t1131\n",
104 | "Removing temp directory /tmp/wordcount.root.20230121.121623.573819...\n"
105 | ]
106 | }
107 | ],
108 | "source": [
109 | "# run the code as a terminal command\n",
110 | "! python wordcount.py data/songplays.txt"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "# Summary of what happens in the code.\n",
118 | "\n",
119 | "There is a list of songs in songplays.txt that looks like the following:\n",
120 | "\n",
121 | "Deep Dreams\n",
122 | "Data House Rock\n",
123 | "Deep Dreams\n",
124 | "Data House Rock\n",
125 | "Broken Networks\n",
126 | "Data House Rock\n",
127 | "etc.....\n",
128 | "\n",
129 | "During the map step, the code reads in the txt file one line at a time. The map steps outputs a set of tuples that look like this:\n",
130 | "\n",
131 | "(Deep Dreams, 1) \n",
132 | "(Data House Rock, 1) \n",
133 | "(Deep Dreams, 1) \n",
134 | "(Data House Rock, 1) \n",
135 | "(Broken Networks, 1) \n",
136 | "(Data House Rock, 1) \n",
137 | "etc.....\n",
138 | "\n",
139 | "Finally, the reduce step combines all of the values by keys and sums the values: \n",
140 | "\n",
141 | "(Deep Dreams, \\[1, 1, 1, 1, 1, 1, ... \\]) \n",
142 | "(Data House Rock, \\[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\\]) \n",
143 | "(Broken Networks, \\[1, 1, 1, ...\\] \n",
144 | "\n",
145 | "With the output \n",
146 | "\n",
147 | "(Deep Dreams, 1131) \n",
148 | "(Data House Rock, 510) \n",
149 | "(Broken Networks, 828) "
150 | ]
151 | }
152 | ],
153 | "metadata": {
154 | "kernelspec": {
155 | "display_name": "Python 3",
156 | "language": "python",
157 | "name": "python3"
158 | },
159 | "language_info": {
160 | "codemirror_mode": {
161 | "name": "ipython",
162 | "version": 3
163 | },
164 | "file_extension": ".py",
165 | "mimetype": "text/x-python",
166 | "name": "python",
167 | "nbconvert_exporter": "python",
168 | "pygments_lexer": "ipython3",
169 | "version": "3.11.1 (main, Dec 23 2022, 09:39:26) [Clang 14.0.0 (clang-1400.0.29.202)]"
170 | },
171 | "vscode": {
172 | "interpreter": {
173 | "hash": "1a1af0ee75eeea9e2e1ee996c87e7a2b11a0bebd85af04bb136d915cefc0abce"
174 | }
175 | }
176 | },
177 | "nbformat": 4,
178 | "nbformat_minor": 2
179 | }
180 |
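A minimal pure-Python sketch (not from the notebook above) of the map / shuffle / reduce flow it summarises, without mrjob or Hadoop; the short song list stands in for `data/songplays.txt`.

```python
# Illustrative only: word-count-style MapReduce in plain Python.
from collections import defaultdict

songplays = ["Deep Dreams", "Data House Rock", "Deep Dreams",
             "Data House Rock", "Broken Networks", "Data House Rock"]

# Map: emit a (song, 1) pair for every line.
mapped = [(song, 1) for song in songplays]

# Shuffle: group the pairs by key (the song name).
grouped = defaultdict(list)
for song, one in mapped:
    grouped[song].append(one)

# Reduce: sum the grouped values to get total plays per song.
totals = {song: sum(ones) for song, ones in grouped.items()}
print(totals)   # {'Deep Dreams': 2, 'Data House Rock': 3, 'Broken Networks': 1}
```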
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/exercises/2_rdd_song_lower_case.py:
--------------------------------------------------------------------------------
1 | ###
2 | # You might have noticed this code in the screencast.
3 | #
4 | # import findspark
5 | # findspark.init('spark-2.3.2-bin-hadoop2.7')
6 | #
7 | # The findspark Python module makes it easier to install
8 | # Spark in local mode on your computer. This is convenient
9 | # for practicing Spark syntax locally.
10 | # However, the workspaces already have Spark installed and you do not
11 | # need to use the findspark module
12 | #
13 | ###
14 |
15 | from pyspark.sql import SparkSession
16 |
17 | # Because we aren't running on a spark cluster, the session is just for development
18 | spark = SparkSession \
19 | .builder \
20 | .appName("Maps and Lazy Evaluation Example") \
21 | .getOrCreate()
22 |
23 |
24 | # Starting off with a regular python list
25 | log_of_songs = [
26 | "Despacito",
27 | "Nice for what",
28 | "No tears left to cry",
29 | "Despacito",
30 | "Havana",
31 | "In my feelings",
32 | "Nice for what",
33 | "despacito",
34 | "All the stars"
35 | ]
36 |
37 | # parallelize the log_of_songs to use with Spark
38 | # distributed_song_log_rdd is an RDD (Resilient Distributed Dataset)
39 | distributed_song_log_rdd = spark.sparkContext.parallelize(log_of_songs)
40 |
41 | # notice we DO NOT use the .collect() method. What is the difference between
42 | # .collect() and .foreach() ?
43 | # .collect() forces all the data from the entire RDD to be gathered from ALL the
44 | # nodes onto the driver, which is expensive and can crash the driver for large datasets
45 | # .foreach() allows the data to stay on each of the independent nodes
46 |
47 | print("Show the original input data is preserved")
48 |
49 | distributed_song_log_rdd.foreach(print)
50 |
51 | print("-" * 30)
52 |
53 | def convert_song_to_lowercase(song):
54 | return song.lower()
55 |
56 | print("Converting sample string 'Havana' to lowercase")
57 | print(convert_song_to_lowercase("Havana"))
58 |
59 | print("-" * 30)
60 |
61 | print("Show the converted data")
62 | lower_case_songs=distributed_song_log_rdd.map(convert_song_to_lowercase)
63 | lower_case_songs.foreach(print)
64 |
65 | print("-" * 30)
66 |
67 | # Show the original input data is still mixed case
68 | print("Show the original input data is still mixed case")
69 | distributed_song_log_rdd.foreach(print)
70 |
71 | print("-" * 30)
72 |
73 | # Use lambda functions instead of named functions to do the same map operation
74 | print("Using lambda function")
75 | distributed_song_log_rdd.map(lambda song: song.lower()).foreach(print)
76 |
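A small, self-contained sketch (assuming a local Spark installation, as in the script above) contrasting `.foreach(print)` with the `.collect()` call the comments warn about: `.collect()` returns the whole RDD to the driver as an ordinary Python list.

```python
# Illustrative only: .collect() vs .foreach() on a tiny RDD.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Collect vs foreach example").getOrCreate()

rdd = spark.sparkContext.parallelize(["Despacito", "Havana", "All the stars"])

# .collect() materialises every element on the driver -- fine here,
# risky for a genuinely large RDD.
print(rdd.map(lambda s: s.lower()).collect())   # ['despacito', 'havana', 'all the stars']

spark.stop()
```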
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/exercises/3_data_inputs_and_outputs.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 |
3 |
4 | # Because we aren't running on a spark cluster, the session is just for development
5 | spark = SparkSession \
6 | .builder \
7 | .appName("Our first Python Spark SQL example") \
8 | .getOrCreate()
9 |
10 |
11 | # This should print the default configuration
12 | print(
13 | spark.sparkContext.getConf().getAll()
14 | )
15 |
16 | # This path resides on your computer or workspace, not in HDFS
17 | path = "data/sparkify_log_small.json"
18 | user_log_df = spark.read.json(path)
19 |
20 | # See how Spark inferred the schema from the JSON file
21 | user_log_df.printSchema()
22 | print(
23 | user_log_df.describe()
24 | )
25 |
26 | user_log_df.show(n=1)
27 | print(
28 | user_log_df.take(5)
29 | )
30 |
31 | # We are changing file formats: the data is written out as CSV even though the path ends in .json
32 | out_path = "data/sparkify_log_small_2.json"
33 |
34 |
35 | # The filename alone didn't tell Spark the actual format, we need to do it here
36 | user_log_df.write.mode("overwrite").save(out_path, format="csv", header=True)
37 |
38 | # Notice we have created another dataframe here
39 | # We wouldn't usually read the data that we just wrote
40 | # This does show, however, that the read method works with
41 | # Different data types
42 | user_log_2_df = spark.read.csv(out_path, header=True)
43 | user_log_2_df.printSchema()
44 |
45 | # Choose two records from the CSV file
46 | print(
47 | user_log_2_df.take(2)
48 | )
49 |
50 | # Show the userID column for the first several rows
51 | user_log_2_df.select("userID").show()
52 |
53 | # Take the first record from the dataframe read back from CSV
54 | print(
55 | user_log_2_df.take(1)
56 | )
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/exercises/4_data_wrangling.py:
--------------------------------------------------------------------------------
1 | # # Data Wrangling with Spark
2 | #
3 | # This is the code used in the previous screencast. Run each code cell to understand what the code does and how it works.
4 | #
5 | # These first three cells import libraries, instantiate a SparkSession, and then read in the data set
6 |
7 | from pyspark.sql import SparkSession
8 | from pyspark.sql.functions import udf
9 | from pyspark.sql.types import IntegerType
10 | from pyspark.sql.functions import desc
11 | from pyspark.sql.functions import sum as Fsum
12 |
13 | import datetime
14 |
15 | import numpy as np
16 | import pandas as pd
17 | import matplotlib.pyplot as plt
18 |
19 | spark = SparkSession \
20 | .builder \
21 | .appName("Wrangling Data") \
22 | .getOrCreate()
23 |
24 | path = "data/sparkify_log_small.json"
25 |
26 | user_log_df = spark.read.json(path)
27 |
28 |
29 | # # Data Exploration
30 | #
31 | # # Explore the data set.
32 |
33 |
34 | # View 5 records
35 | print(
36 | user_log_df.take(5)
37 | )
38 | # Print the schema
39 | user_log_df.printSchema()
40 |
41 | # Describe the dataframe
42 | user_log_df.describe().show()
43 |
44 | # Describe the statistics for the song length column
45 | user_log_df.describe("length").show()
46 |
47 | # Count the rows in the dataframe
48 | print(
49 | user_log_df.count()
50 | )
51 |
52 | # Select the page column, drop the duplicates, and sort by page
53 | user_log_df.select("page").dropDuplicates().sort("page").show()
54 |
55 | # Select data for all pages where userId is 1046
56 | user_log_df.select(["userId", "firstname", "page", "song"]) \
57 | .where(user_log_df.userId == "1046") \
58 | .show()
59 |
60 |
61 | # # Calculating Statistics by Hour
62 | get_hour = udf(lambda x: datetime.datetime.fromtimestamp(x / 1000.0).hour)
63 |
64 | user_log_df = user_log_df.withColumn("hour", get_hour(user_log_df.ts))
65 |
66 | print(
67 | # Get the first row
68 | user_log_df.head(1)
69 | )
70 |
71 | # Select just the NextSong page
72 | songs_in_hour_df = user_log_df.filter(user_log_df.page == "NextSong") \
73 | .groupby(user_log_df.hour) \
74 | .count() \
75 | .orderBy(user_log_df.hour.cast("float"))
76 |
77 | songs_in_hour_df.show()
78 |
79 | songs_in_hour_pd = songs_in_hour_df.toPandas()
80 | songs_in_hour_pd.hour = pd.to_numeric(songs_in_hour_pd.hour)
81 |
82 | plt.scatter(songs_in_hour_pd["hour"], songs_in_hour_pd["count"])
83 | plt.xlim(-1, 24)
84 | plt.ylim(0, 1.2 * max(songs_in_hour_pd["count"]))
85 | plt.xlabel("Hour")
86 | plt.ylabel("Songs played")
87 | plt.show()
88 |
89 |
90 | # # Drop Rows with Missing Values
91 | #
92 | # As you'll see, it turns out there are no missing values in the userID or session columns. But there are userID values that are empty strings.
93 | # how = 'any' or 'all'. If 'any', drop a row if it contains any nulls. If 'all', drop a row only if all its values are null.
94 | # subset = list of columns to consider
95 | user_log_valid_df = user_log_df.dropna(how = "any", subset = ["userId", "sessionId"])
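# For comparison (a hedged aside, not executed here): how = "all" would drop a row
# only if BOTH userId and sessionId were null, so it would typically remove far fewer rows.
# user_log_df.dropna(how = "all", subset = ["userId", "sessionId"])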
96 |
97 | # How many are there now that we dropped rows with null userId or sessionId?
98 | print(
99 | user_log_valid_df.count()
100 | )
101 |
102 | # select all unique user ids into a dataframe
103 | user_log_df.select("userId") \
104 | .dropDuplicates() \
105 | .sort("userId").show()
106 |
107 | # Select only rows where the userId column isn't an empty string (which is different from null)
108 | user_log_valid_df = user_log_valid_df.filter(user_log_valid_df["userId"] != "")
109 |
110 | # Notice the count has dropped after dropping rows with empty userId
111 | print(
112 | user_log_valid_df.count()
113 | )
114 |
115 | # # Users Downgrade Their Accounts
116 | #
117 | # Find when users downgrade their accounts and then show those log entries.
118 |
119 | user_log_valid_df.filter("page = 'Submit Downgrade'") \
120 | .show()
121 |
122 | user_log_df.select(["userId", "firstname", "page", "level", "song"]) \
123 | .where(user_log_df.userId == "1138") \
124 | .show()
125 |
126 | # Create a user defined function to return a 1 if the record contains a downgrade
127 | flag_downgrade_event = udf(lambda x: 1 if x == "Submit Downgrade" else 0, IntegerType())
128 |
129 | # Select data including the user defined function
130 | user_log_valid_df = user_log_valid_df \
131 | .withColumn("downgraded", flag_downgrade_event("page"))
132 |
133 | print(
134 | user_log_valid_df.head()
135 | )
136 |
137 | from pyspark.sql import Window
138 |
139 | # Partition by user id
140 | # Then use a window function and cumulative sum to distinguish each user's data as either pre or post downgrade events.
141 | windowval = Window.partitionBy("userId") \
142 | .orderBy(desc("ts")) \
143 | .rangeBetween(Window.unboundedPreceding, 0)
144 |
145 | # Fsum is a cumulative sum over a window - in this case a window showing all events for a user
146 | # Add a column called phase, 0 if the user hasn't downgraded yet, 1 if they have
147 | user_log_valid_df = user_log_valid_df \
148 | .withColumn("phase", Fsum("downgraded") \
149 | .over(windowval))
150 |
151 | user_log_valid_df.show()
152 |
153 | # Show the phases for user 1138
154 | user_log_valid_df \
155 | .select(["userId", "firstname", "ts", "page", "level", "phase"]) \
156 | .where(user_log_df.userId == "1138") \
157 | .sort("ts") \
158 | .show()
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/exercises/5_data_wrangling_quiz.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Answer Key to the Data Wrangling with DataFrames Coding Quiz
5 | #
6 | # Helpful resources:
7 | # https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html
8 |
9 |
10 | from pyspark.sql import SparkSession
11 | from pyspark.sql.functions import col, desc, udf
12 | from pyspark.sql.functions import sum as Fsum
13 | from pyspark.sql.window import Window
14 | from pyspark.sql.types import IntegerType
15 |
16 | # 1) import any other libraries you might need
17 | # 2) instantiate a Spark session
18 | # 3) read in the data set located at the path "../../data/sparkify_log_small.json"
19 | # 4) write code to answer the quiz questions
20 |
21 | spark = SparkSession \
22 | .builder \
23 | .appName("Data Frames practice") \
24 | .getOrCreate()
25 |
26 | logs_df = spark.read.json("data/sparkify_log_small.json")
27 |
28 |
29 | # # Question 1
30 | #
31 | # Which page did user id "" (empty string) NOT visit?
32 |
33 |
34 | logs_df.printSchema()
35 |
36 |
37 | # filter for users with blank user id
38 | blank_pages_df = logs_df.filter(logs_df.userId == '') \
39 | .select(col('page') \
40 | .alias('blank_pages')) \
41 | .dropDuplicates()
42 |
43 | # get a list of possible pages that could be visited
44 | all_pages_df = logs_df.select('page').dropDuplicates()
45 |
46 | # find values in all_pages that are not in blank_pages
47 | # these are the pages that the blank user did not go to
48 | # NOTE: avoid using .collect() on large datasets (> 100 MB)
49 | for row in set(all_pages_df.collect()) - set(blank_pages_df.collect()):
50 | print(row.page)
51 |
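# A hedged alternative (not part of the original quiz answer): DataFrame.subtract computes
# the same set difference inside Spark, so only the final small result reaches the driver.
# logs_df.select('page').dropDuplicates() \
#     .subtract(logs_df.filter(logs_df.userId == '').select('page').dropDuplicates()) \
#     .show()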
52 |
53 | # # Question 2 - Reflect
54 | #
55 | # What type of user does the empty string user id most likely refer to?
56 | #
57 |
58 | # Perhaps it represents users who have not signed up yet or who are signed out and are about to log in.
59 |
60 | # # Question 3
61 | #
62 | # How many female users do we have in the data set?
63 |
64 | print(
65 | logs_df.filter(logs_df.gender == 'F') \
66 | .select('userId', 'gender') \
67 | .dropDuplicates() \
68 | .count()
69 | )
70 |
71 | # # Question 4
72 | #
73 | # How many songs were played from the most played artist?
74 |
75 |
76 | logs_df.filter(logs_df.page == 'NextSong') \
77 | .select('Artist') \
78 | .groupBy('Artist') \
79 | .agg({'Artist':'count'}) \
80 | .withColumnRenamed('count(Artist)', 'Playcount') \
81 | .sort(desc('Playcount')) \
82 | .show(1)
83 |
84 |
85 | # # Question 5 (challenge)
86 | #
87 | # How many songs do users listen to on average between visiting our home page? Please round your answer to the closest integer.
88 | #
89 | #
90 |
91 | # TODO: filter out 0 sum and max sum to get more exact answer
92 |
93 | user_window = Window \
94 | .partitionBy('userID') \
95 | .orderBy(desc('ts')) \
96 | .rangeBetween(Window.unboundedPreceding, 0)
97 |
98 | ishome = udf(lambda page: int(page == 'Home'), IntegerType())
99 |
100 | # Filter only NextSong and Home pages, add 1 for each time they visit Home
101 | # Adding a column called period which is a specific interval between Home visits
102 | cusum = logs_df.filter((logs_df.page == 'NextSong') | (logs_df.page == 'Home')) \
103 | .select('userID', 'page', 'ts') \
104 | .withColumn('homevisit', ishome(col('page'))) \
105 | .withColumn('period', Fsum('homevisit') \
106 | .over(user_window))
107 |
108 | # This will only show 'Home' in the first several rows due to default sorting
109 |
110 | cusum.show(300)
111 |
112 |
113 | # See how many songs were listened to on average during each period
114 | cusum.filter((cusum.page == 'NextSong')) \
115 | .groupBy('userID', 'period') \
116 | .agg({'period':'count'}) \
117 | .agg({'count(period)':'avg'}) \
118 | .show()
119 |
120 |
121 |
122 |
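# A minimal follow-up sketch (an assumption, not part of the original answer key): the quiz
# asks for the closest integer, so the single average value could be collected and rounded.
# avg_songs = cusum.filter(cusum.page == 'NextSong') \
#     .groupBy('userID', 'period') \
#     .agg({'period': 'count'}) \
#     .agg({'count(period)': 'avg'}) \
#     .collect()[0][0]
# print(round(avg_songs))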
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/exercises/6_data_wrangling_with_spark_sql.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Spark SQL Examples
5 | #
6 |
7 | from pyspark.sql import SparkSession
8 |
9 | import datetime
10 |
11 | spark = SparkSession \
12 | .builder \
13 | .appName("Data wrangling with Spark SQL") \
14 | .getOrCreate()
15 |
16 |
17 | path = "data/sparkify_log_small.json"
18 | user_log_df = spark.read.json(path)
19 |
20 | user_log_df.take(1)
21 |
22 | user_log_df.printSchema()
23 |
24 |
25 | # # Create a View And Run Queries
26 | #
27 | # The code below creates a temporary view against which you can run SQL queries.
28 |
29 | user_log_df.createOrReplaceTempView("user_log_table")
30 |
31 |
32 | spark.sql('''
33 | SELECT *
34 | FROM user_log_table
35 | LIMIT 2
36 | '''
37 | ).show()
38 |
39 | spark.sql('''
40 | SELECT COUNT(*)
41 | FROM user_log_table
42 | '''
43 | ).show()
44 |
45 | spark.sql('''
46 | SELECT userID, firstname, page, song
47 | FROM user_log_table
48 | WHERE userID == '1046'
49 | '''
50 | ).show()
51 |
52 | spark.sql('''
53 | SELECT DISTINCT page
54 | FROM user_log_table
55 | ORDER BY page ASC
56 | '''
57 | ).show()
58 |
59 |
60 | # # User Defined Functions
61 |
62 | spark.udf.register("get_hour", lambda x: int(datetime.datetime.fromtimestamp(x / 1000.0).hour))
63 |
64 | spark.sql('''
65 | SELECT *, get_hour(ts) AS hour
66 | FROM user_log_table
67 | LIMIT 1
68 | '''
69 | ).show()
70 |
71 | songs_in_hour_df = spark.sql('''
72 | SELECT get_hour(ts) AS hour, COUNT(*) as plays_per_hour
73 | FROM user_log_table
74 | WHERE page = "NextSong"
75 | GROUP BY hour
76 | ORDER BY cast(hour as int) ASC
77 | '''
78 | )
79 |
80 | songs_in_hour_df.show()
81 |
82 | # # Converting Results to Pandas
83 |
84 | songs_in_hour_pd = songs_in_hour_df.toPandas()
85 |
86 |
87 | print(songs_in_hour_pd)
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/exercises/7_accelerometer_landing_to_trusted.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from awsglue.transforms import *
3 | from awsglue.utils import getResolvedOptions
4 | from pyspark.context import SparkContext
5 | from awsglue.context import GlueContext
6 | from awsglue.job import Job
7 |
8 | args = getResolvedOptions(sys.argv, ["JOB_NAME"])
9 | sc = SparkContext()
10 | glueContext = GlueContext(sc)
11 | spark = glueContext.spark_session
12 | job = Job(glueContext)
13 | job.init(args["JOB_NAME"], args)
14 |
15 | # Script generated for node Customer Trusted Zone
16 | CustomerTrustedZone_node1675623468647 = glueContext.create_dynamic_frame.from_options(
17 | format_options={"multiline": False},
18 | connection_type="s3",
19 | format="json",
20 | connection_options={
21 | "paths": ["s3://udacity-glue-spark-bucket/customer/trusted/"],
22 | "recurse": True,
23 | },
24 | transformation_ctx="CustomerTrustedZone_node1675623468647",
25 | )
26 |
27 | # Script generated for node Accelerometer Landing
28 | AccelerometerLanding_node1 = glueContext.create_dynamic_frame.from_catalog(
29 | database="stedi",
30 | table_name="accelerometer_landing",
31 | transformation_ctx="AccelerometerLanding_node1",
32 | )
33 |
34 | # Script generated for node Join Customer
35 | JoinCustomer_node2 = Join.apply(
36 | frame1=AccelerometerLanding_node1,
37 | frame2=CustomerTrustedZone_node1675623468647,
38 | keys1=["user"],
39 | keys2=["email"],
40 | transformation_ctx="JoinCustomer_node2",
41 | )
42 |
43 | # Script generated for node Drop Fields
44 | DropFields_node1675625653291 = DropFields.apply(
45 | frame=JoinCustomer_node2,
46 | paths=[
47 | "timestamp",
48 | "serialNumber",
49 | "shareWithPublicAsOfDate",
50 | "birthDay",
51 | "registrationDate",
52 | "shareWithResearchAsOfDate",
53 | "customerName",
54 | "email",
55 | "lastUpdateDate",
56 | "phone",
57 | "shareWithFriendsAsOfDate",
58 | ],
59 | transformation_ctx="DropFields_node1675625653291",
60 | )
61 |
62 | # Script generated for node Accelerometer Trusted
63 | AccelerometerTrusted_node3 = glueContext.write_dynamic_frame.from_options(
64 | frame=DropFields_node1675625653291,
65 | connection_type="s3",
66 | format="json",
67 | connection_options={
68 | "path": "s3://udacity-glue-spark-bucket/accelerometer/trusted/",
69 | "partitionKeys": [],
70 | },
71 | transformation_ctx="AccelerometerTrusted_node3",
72 | )
73 |
74 | job.commit()
75 |
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/exercises/8_customer_urated.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from awsglue.transforms import *
3 | from awsglue.utils import getResolvedOptions
4 | from pyspark.context import SparkContext
5 | from awsglue.context import GlueContext
6 | from awsglue.job import Job
7 |
8 | args = getResolvedOptions(sys.argv, ["JOB_NAME"])
9 | sc = SparkContext()
10 | glueContext = GlueContext(sc)
11 | spark = glueContext.spark_session
12 | job = Job(glueContext)
13 | job.init(args["JOB_NAME"], args)
14 |
15 | # Script generated for node Customer Trusted Zone
16 | CustomerTrustedZone_node1675623468647 = glueContext.create_dynamic_frame.from_options(
17 | format_options={"multiline": False},
18 | connection_type="s3",
19 | format="json",
20 | connection_options={
21 | "paths": ["s3://udacity-glue-spark-bucket/customer/trusted/"],
22 | "recurse": True,
23 | },
24 | transformation_ctx="CustomerTrustedZone_node1675623468647",
25 | )
26 |
27 | # Script generated for node Accelerometer Landing
28 | AccelerometerLanding_node1 = glueContext.create_dynamic_frame.from_catalog(
29 | database="stedi",
30 | table_name="accelerometer_landing",
31 | transformation_ctx="AccelerometerLanding_node1",
32 | )
33 |
34 | # Script generated for node Join Customer
35 | JoinCustomer_node2 = Join.apply(
36 | frame1=AccelerometerLanding_node1,
37 | frame2=CustomerTrustedZone_node1675623468647,
38 | keys1=["user"],
39 | keys2=["email"],
40 | transformation_ctx="JoinCustomer_node2",
41 | )
42 |
43 | # Script generated for node Drop Fields
44 | DropFields_node1675625653291 = DropFields.apply(
45 | frame=JoinCustomer_node2,
46 | paths=["user", "timestamp", "x", "y", "z"],
47 | transformation_ctx="DropFields_node1675625653291",
48 | )
49 |
50 | # Script generated for node Customer Curated
51 | CustomerCurated_node3 = glueContext.write_dynamic_frame.from_options(
52 | frame=DropFields_node1675625653291,
53 | connection_type="s3",
54 | format="json",
55 | connection_options={
56 | "path": "s3://udacity-glue-spark-bucket/customer/curated/",
57 | "partitionKeys": [],
58 | },
59 | transformation_ctx="CustomerCurated_node3",
60 | )
61 |
62 | job.commit()
63 |
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/Running_Spark_scripts_at_a_time_interval.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/Running_Spark_scripts_at_a_time_interval.png
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/aws_glue_configuration.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/aws_glue_configuration.jpeg
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/datalake_zones.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/datalake_zones.png
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/evolution_of_bigdata_ecosystem.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/evolution_of_bigdata_ecosystem.png
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/glue_job_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/glue_job_diagram.png
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/glue_job_using_s3_vpc_gateway.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/glue_job_using_s3_vpc_gateway.jpeg
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/hadoop_to_data_lakehouse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/hadoop_to_data_lakehouse.png
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/ingesting_and_organizing_data_in_a_lakehouse.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/ingesting_and_organizing_data_in_a_lakehouse.jpeg
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/spark_catalyst.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/spark_catalyst.png
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/spark_dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/spark_dag.png
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/spark_job_using_glue_studio.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/spark_job_using_glue_studio.jpeg
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/spark_modes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/spark_modes.png
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/spark_version_rdd_mapping.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/spark_version_rdd_mapping.png
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/images/streaming_data.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/images/streaming_data.jpeg
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/project/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/project/.DS_Store
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/project/README.md:
--------------------------------------------------------------------------------
1 | # Project: STEDI Human Balance Analytics
2 |
3 | ## Introduction
4 |
5 | Spark and AWS Glue allow you to process data from multiple sources, categorize the data, and curate it to be queried in the future for multiple purposes. As a data engineer on the STEDI Step Trainer team, you'll need to extract the data produced by the STEDI Step Trainer sensors and the mobile app, and curate them into a data lakehouse solution on AWS so that Data Scientists can train the learning model.
6 |
7 | ## Project Details
8 |
9 | The STEDI Team has been hard at work developing a hardware STEDI Step Trainer that:
10 |
11 | - trains the user to do a STEDI balance exercise;
12 | - has sensors on the device that collect data to train a machine-learning algorithm to detect steps;
13 | - has a companion mobile app that collects customer data and interacts with the device sensors.
14 |
15 | STEDI has heard from millions of early adopters who are willing to purchase the STEDI Step Trainers and use them.
16 |
17 | Several customers have already received their Step Trainers, installed the mobile application, and begun using them together to test their balance. The Step Trainer is just a motion sensor that records the distance of the object detected. The app uses a mobile phone accelerometer to detect motion in the X, Y, and Z directions.
18 |
19 | The STEDI team wants to use the motion sensor data to train a machine learning model to detect steps accurately in real-time. Privacy will be a primary consideration in deciding what data can be used.
20 |
21 | Some of the early adopters have agreed to share their data for research purposes. Only these customers’ Step Trainer and accelerometer data should be used in the training data for the machine learning model.
22 |
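In practice, that consent rule is a single filter on `shareWithResearchAsOfDate` in the landing-to-trusted Glue job. A minimal sketch of that step, with frame and field names taken from [customer_landing_to_trusted.py](scripts/customer_landing_to_trusted.py), looks like this:

```python
# Hedged sketch: only customers who opted in to research (non-zero
# shareWithResearchAsOfDate) are promoted from the landing zone to the trusted zone.
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.transforms import Filter

glueContext = GlueContext(SparkContext())

# Read raw customer records from the landing zone
customer_landing = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    format="json",
    connection_options={
        "paths": ["s3://udacity-glue-spark-bucket/project/customers/landing/"],
        "recurse": True,
    },
)

# Keep only consenting customers
customer_trusted = Filter.apply(
    frame=customer_landing,
    f=lambda row: row["shareWithResearchAsOfDate"] != 0,
)
```
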
23 | ## Implementation
24 |
25 | ### Landing Zone
26 |
27 | **Glue Tables**:
28 |
29 | - [customer_landing.sql](scripts/customer_landing.sql)
30 | - [accelerometer_landing.sql](scripts/accelerometer_landing.sql)
31 |
32 | **Athena**:
33 | Landing Zone data query results
34 |
35 | *Customer Landing*:
36 |
37 | ![Customer Landing](images/customer_landing.png)
38 |
39 |
40 |
41 | *Accelerometer Landing*:
42 |
43 | ![Accelerometer Landing](images/accelerometer_landing.png)
44 |
45 |
46 |
47 | ### Trusted Zone
48 |
49 | **Glue job scripts**:
50 |
51 | - [customer_landing_to_trusted.py](scripts/customer_landing_to_trusted.py)
52 | - [accelerometer_landing_to_trusted.py](scripts/accelerometer_landing_to_trusted.py)
53 |
54 | **Athena**:
55 | Trusted Zone Query results:
56 |
57 | ![Customer Trusted](images/customer_trusted.png)
58 |
59 |
60 |
61 | ### Curated Zone
62 |
63 | **Glue job scripts**:
64 |
65 | - [customer_trusted_to_curated.py](scripts/customer_trusted_to_curated.py)
66 | - [trainer_trusted_to_curated.py](scripts/trainer_trusted_to_curated.py)
67 |
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/project/images/accelerometer_landing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/project/images/accelerometer_landing.png
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/project/images/customer_landing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/project/images/customer_landing.png
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/project/images/customer_trusted.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/3_Spark_and_Data_Lakes/project/images/customer_trusted.png
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/project/scripts/accelerometer_landing.sql:
--------------------------------------------------------------------------------
1 | CREATE EXTERNAL TABLE IF NOT EXISTS `project`.`accelerometer_landing` (
2 | `user` string,
3 | `timestamp` bigint,
4 | `x` float,
5 | `y` float,
6 | `z` float
7 | )
8 | ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
9 | WITH SERDEPROPERTIES (
10 | 'ignore.malformed.json' = 'FALSE',
11 | 'dots.in.keys' = 'FALSE',
12 | 'case.insensitive' = 'TRUE',
13 | 'mapping' = 'TRUE'
14 | )
15 | STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
16 | LOCATION 's3://udacity-glue-spark-bucket/project/accelerometer/landing/'
17 | TBLPROPERTIES ('classification' = 'json');
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/project/scripts/accelerometer_landing_to_trusted.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from awsglue.transforms import *
3 | from awsglue.utils import getResolvedOptions
4 | from pyspark.context import SparkContext
5 | from awsglue.context import GlueContext
6 | from awsglue.job import Job
7 |
8 | args = getResolvedOptions(sys.argv, ["JOB_NAME"])
9 | sc = SparkContext()
10 | glueContext = GlueContext(sc)
11 | spark = glueContext.spark_session
12 | job = Job(glueContext)
13 | job.init(args["JOB_NAME"], args)
14 |
15 | # Script generated for node Accelerometer Landing
16 | AccelerometerLanding_node1676402494135 = glueContext.create_dynamic_frame.from_catalog(
17 | database="project",
18 | table_name="accelerometer_landing",
19 | transformation_ctx="AccelerometerLanding_node1676402494135",
20 | )
21 |
22 | # Script generated for node Customer Trusted Zone
23 | CustomerTrustedZone_node1 = glueContext.create_dynamic_frame.from_options(
24 | format_options={"multiline": False},
25 | connection_type="s3",
26 | format="json",
27 | connection_options={
28 | "paths": ["s3://udacity-glue-spark-bucket/project/customers/trusted/"],
29 | "recurse": True,
30 | },
31 | transformation_ctx="CustomerTrustedZone_node1",
32 | )
33 |
34 | # Script generated for node Join Customer
35 | JoinCustomer_node1676402624725 = Join.apply(
36 | frame1=CustomerTrustedZone_node1,
37 | frame2=AccelerometerLanding_node1676402494135,
38 | keys1=["email"],
39 | keys2=["user"],
40 | transformation_ctx="JoinCustomer_node1676402624725",
41 | )
42 |
43 | # Script generated for node Drop Fields
44 | DropFields_node1676402768067 = DropFields.apply(
45 | frame=JoinCustomer_node1676402624725,
46 | paths=[
47 | "serialNumber",
48 | "shareWithPublicAsOfDate",
49 | "birthDay",
50 | "registrationDate",
51 | "shareWithResearchAsOfDate",
52 | "customerName",
53 | "email",
54 | "lastUpdateDate",
55 | "phone",
56 | "shareWithFriendsAsOfDate",
57 | "timestamp",
58 | ],
59 | transformation_ctx="DropFields_node1676402768067",
60 | )
61 |
62 | # Script generated for node AWS Glue Data Catalog
63 | AWSGlueDataCatalog_node1676574482997 = glueContext.write_dynamic_frame.from_catalog(
64 | frame=DropFields_node1676402768067,
65 | database="project",
66 | table_name="accelerometer_trusted",
67 | transformation_ctx="AWSGlueDataCatalog_node1676574482997",
68 | )
69 |
70 | job.commit()
71 |
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/project/scripts/customer_landing.sql:
--------------------------------------------------------------------------------
1 | CREATE EXTERNAL TABLE IF NOT EXISTS `project`.`customer_landing` (
2 | `customername` string,
3 | `email` string,
4 | `phone` string,
5 | `birthday` string,
6 | `serialnumber` string,
7 | `registrationdate` bigint,
8 | `lastupdatedate` bigint,
9 | `sharewithresearchasofdate` bigint,
10 | `sharewithfriendsasofdate` bigint,
11 | `sharewithpublicasofdate` bigint
12 | )
13 | ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
14 | WITH SERDEPROPERTIES (
15 | 'ignore.malformed.json' = 'FALSE',
16 | 'dots.in.keys' = 'FALSE',
17 | 'case.insensitive' = 'TRUE',
18 | 'mapping' = 'TRUE'
19 | )
20 | STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
21 | LOCATION 's3://udacity-glue-spark-bucket/project/customers/landing/'
22 | TBLPROPERTIES ('classification' = 'json');
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/project/scripts/customer_landing_to_trusted.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from awsglue.transforms import *
3 | from awsglue.utils import getResolvedOptions
4 | from pyspark.context import SparkContext
5 | from awsglue.context import GlueContext
6 | from awsglue.job import Job
7 | import re
8 |
9 | args = getResolvedOptions(sys.argv, ["JOB_NAME"])
10 | sc = SparkContext()
11 | glueContext = GlueContext(sc)
12 | spark = glueContext.spark_session
13 | job = Job(glueContext)
14 | job.init(args["JOB_NAME"], args)
15 |
16 | # Script generated for node S3 bucket
17 | S3bucket_node1 = glueContext.create_dynamic_frame.from_options(
18 | format_options={"multiline": False},
19 | connection_type="s3",
20 | format="json",
21 | connection_options={
22 | "paths": ["s3://udacity-glue-spark-bucket/project/customers/landing/"],
23 | "recurse": True,
24 | },
25 | transformation_ctx="S3bucket_node1",
26 | )
27 |
28 | # Script generated for node ApplyMapping
29 | ApplyMapping_node2 = Filter.apply(
30 | frame=S3bucket_node1,
31 | f=lambda row: (not (row["shareWithResearchAsOfDate"] == 0)),
32 | transformation_ctx="ApplyMapping_node2",
33 | )
34 |
35 | # Script generated for node AWS Glue Data Catalog
36 | AWSGlueDataCatalog_node1676574309040 = glueContext.write_dynamic_frame.from_catalog(
37 | frame=ApplyMapping_node2,
38 | database="project",
39 | table_name="customer_trusted",
40 | transformation_ctx="AWSGlueDataCatalog_node1676574309040",
41 | )
42 |
43 | job.commit()
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/project/scripts/customer_trusted_to_curated.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from awsglue.transforms import *
3 | from awsglue.utils import getResolvedOptions
4 | from pyspark.context import SparkContext
5 | from awsglue.context import GlueContext
6 | from awsglue.job import Job
7 |
8 | args = getResolvedOptions(sys.argv, ["JOB_NAME"])
9 | sc = SparkContext()
10 | glueContext = GlueContext(sc)
11 | spark = glueContext.spark_session
12 | job = Job(glueContext)
13 | job.init(args["JOB_NAME"], args)
14 |
15 | # Script generated for node Accelerometer Landing
16 | AccelerometerLanding_node1676402494135 = glueContext.create_dynamic_frame.from_catalog(
17 | database="project",
18 | table_name="accelerometer_landing",
19 | transformation_ctx="AccelerometerLanding_node1676402494135",
20 | )
21 |
22 | # Script generated for node Customer Trusted Zone
23 | CustomerTrustedZone_node1 = glueContext.create_dynamic_frame.from_options(
24 | format_options={"multiline": False},
25 | connection_type="s3",
26 | format="json",
27 | connection_options={
28 | "paths": ["s3://udacity-glue-spark-bucket/project/customers/trusted/"],
29 | "recurse": True,
30 | },
31 | transformation_ctx="CustomerTrustedZone_node1",
32 | )
33 |
34 | # Script generated for node Join Customer
35 | JoinCustomer_node1676402624725 = Join.apply(
36 | frame1=CustomerTrustedZone_node1,
37 | frame2=AccelerometerLanding_node1676402494135,
38 | keys1=["email"],
39 | keys2=["user"],
40 | transformation_ctx="JoinCustomer_node1676402624725",
41 | )
42 |
43 | # Script generated for node Drop Fields
44 | DropFields_node1676402768067 = DropFields.apply(
45 | frame=JoinCustomer_node1676402624725,
46 | paths=["x", "y", "z", "user", "timestamp"],
47 | transformation_ctx="DropFields_node1676402768067",
48 | )
49 |
50 | # Script generated for node Customer Curated
51 | CustomerCurated_node1676576584339 = glueContext.write_dynamic_frame.from_catalog(
52 | frame=DropFields_node1676402768067,
53 | database="project",
54 | table_name="customer_curated",
55 | transformation_ctx="CustomerCurated_node1676576584339",
56 | )
57 |
58 | job.commit()
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/project/scripts/step_trainer_landing_to_trusted.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from awsglue.transforms import *
3 | from awsglue.utils import getResolvedOptions
4 | from pyspark.context import SparkContext
5 | from awsglue.context import GlueContext
6 | from awsglue.job import Job
7 |
8 | args = getResolvedOptions(sys.argv, ["JOB_NAME"])
9 | sc = SparkContext()
10 | glueContext = GlueContext(sc)
11 | spark = glueContext.spark_session
12 | job = Job(glueContext)
13 | job.init(args["JOB_NAME"], args)
14 |
15 | # Script generated for node Customer Curated
16 | CustomerCurated_node1676402494135 = glueContext.create_dynamic_frame.from_catalog(
17 | database="project",
18 | table_name="customer_curated",
19 | transformation_ctx="CustomerCurated_node1676402494135",
20 | )
21 |
22 | # Script generated for node Step Trainer Landing
23 | StepTrainerLanding_node1 = glueContext.create_dynamic_frame.from_options(
24 | format_options={"multiline": False},
25 | connection_type="s3",
26 | format="json",
27 | connection_options={
28 | "paths": ["s3://udacity-glue-spark-bucket/project/step_trainer/landing/"],
29 | "recurse": True,
30 | },
31 | transformation_ctx="StepTrainerLanding_node1",
32 | )
33 |
34 | # Script generated for node Join Customer and Step Trainer
35 | JoinCustomerandStepTrainer_node1676402624725 = Join.apply(
36 | frame1=StepTrainerLanding_node1,
37 | frame2=CustomerCurated_node1676402494135,
38 | keys1=["serialNumber"],
39 | keys2=["serialnumber"],
40 | transformation_ctx="JoinCustomerandStepTrainer_node1676402624725",
41 | )
42 |
43 | # Script generated for node Drop Fields
44 | DropFields_node1676402768067 = DropFields.apply(
45 | frame=JoinCustomerandStepTrainer_node1676402624725,
46 | paths=[
47 | "customername",
48 | "email",
49 | "phone",
50 | "birthday",
51 | "serialnumber",
52 | "registrationdate",
53 | "lastupdatedate",
54 | "sharewithresearchasofdate",
55 | "sharewithfriendsasofdate",
56 | "sharewithpublicasofdate",
57 | ],
58 | transformation_ctx="DropFields_node1676402768067",
59 | )
60 |
61 | # Script generated for node Step Trainer Trusted
62 | StepTrainerTrusted_node1676576584339 = glueContext.write_dynamic_frame.from_catalog(
63 | frame=DropFields_node1676402768067,
64 | database="project",
65 | table_name="step_trainer_trusted",
66 | transformation_ctx="StepTrainerTrusted_node1676576584339",
67 | )
68 |
69 | job.commit()
--------------------------------------------------------------------------------
/3_Spark_and_Data_Lakes/project/scripts/trainer_trusted_to_curated.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from awsglue.transforms import *
3 | from awsglue.utils import getResolvedOptions
4 | from pyspark.context import SparkContext
5 | from awsglue.context import GlueContext
6 | from awsglue.job import Job
7 |
8 | args = getResolvedOptions(sys.argv, ["JOB_NAME"])
9 | sc = SparkContext()
10 | glueContext = GlueContext(sc)
11 | spark = glueContext.spark_session
12 | job = Job(glueContext)
13 | job.init(args["JOB_NAME"], args)
14 |
15 | # Script generated for node Step Trainer Trusted
16 | StepTrainerTrusted_node1676402494135 = glueContext.create_dynamic_frame.from_catalog(
17 | database="project",
18 | table_name="step_trainer_trusted",
19 | transformation_ctx="StepTrainerTrusted_node1676402494135",
20 | )
21 |
22 | # Script generated for node Accelerometer Trusted
23 | AccelerometerTrusted_node1 = glueContext.create_dynamic_frame.from_catalog(
24 | database="project",
25 | table_name="accelerometer_trusted",
26 | transformation_ctx="AccelerometerTrusted_node1",
27 | )
28 |
29 | # Script generated for node Join Customer and Step Trainer
30 | JoinCustomerandStepTrainer_node1676402624725 = Join.apply(
31 | frame1=AccelerometerTrusted_node1,
32 | frame2=StepTrainerTrusted_node1676402494135,
33 | keys1=["timestamp"],
34 | keys2=["sensorreadingtime"],
35 | transformation_ctx="JoinCustomerandStepTrainer_node1676402624725",
36 | )
37 |
38 | # Script generated for node Drop Fields
39 | DropFields_node1676402768067 = DropFields.apply(
40 | frame=JoinCustomerandStepTrainer_node1676402624725,
41 | paths=["user"],
42 | transformation_ctx="DropFields_node1676402768067",
43 | )
44 |
45 | # Script generated for node Step Trainer Trusted
46 | StepTrainerTrusted_node1676578616395 = glueContext.write_dynamic_frame.from_catalog(
47 | frame=DropFields_node1676402768067,
48 | database="project",
49 | table_name="machine_learning_curated",
50 | transformation_ctx="StepTrainerTrusted_node1676578616395",
51 | )
52 |
53 | job.commit()
54 |
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/exercises/.DS_Store
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/airflow_dags.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pendulum
3 |
4 | from airflow.decorators import dag, task
5 |
6 | # @dag decorates the greet_flow_dag function to denote it's the main function of the DAG
7 | @dag(
8 | start_date=pendulum.now()
9 | )
10 | def greet_flow_dag():
11 |
12 | # @task decorates the re-usable hello_world_task - it can be called as often as needed in the DAG
13 | @task
14 | def hello_world_task():
15 | logging.info("Hello World!")
16 |
17 | # hello_world represents a discrete invocation of the hello_world_task
18 | hello_world=hello_world_task()
19 |
20 | # greet_dag represents the invocation of the greet_flow_dag
21 | greet_dag=greet_flow_dag()
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/airflow_official_tutorials/reuse_tasks.py:
--------------------------------------------------------------------------------
1 | from airflow.decorators import task, dag
2 | from datetime import datetime
3 |
4 |
5 | @task
6 | def add_task(x, y):
7 | print(f"Task args: x={x}, y={y}")
8 | return x + y
9 |
10 |
11 | @dag(start_date=datetime(2022, 1, 1))
12 | def mydag():
13 | start = add_task.override(task_id="start")(1, 2)
14 | for i in range(3):
15 | start >> add_task.override(task_id=f"add_start_{i}")(start, i)
16 |
17 |
18 | @dag(start_date=datetime(2022, 1, 1))
19 | def mydag2():
20 | start = add_task(1, 2)
21 | for i in range(3):
22 | start >> add_task.override(task_id=f"new_add_task_{i}")(start, i)
23 |
24 |
25 | first_dag = mydag()
26 | second_dag = mydag2()
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/airflow_official_tutorials/tutorial.py:
--------------------------------------------------------------------------------
1 | import os
2 | from datetime import datetime, timedelta
3 | from textwrap import dedent
4 |
5 | # The DAG object; we'll need this to instantiate a DAG
6 | from airflow import DAG
7 |
8 | # Operators; we need this to operate!
9 | from airflow.operators.bash import BashOperator
10 |
11 | os.environ["SQLALCHEMY_SILENCE_UBER_WARNING"] = "1"
12 |
13 | with DAG(
14 | "tutorial",
15 | # These args will get passed on to each operator
16 | # You can override them on a per-task basis during operator initialization
17 | default_args={
18 | "depends_on_past": False,
19 | "email": ["airflow@example.com"],
20 | "email_on_failure": False,
21 | "email_on_retry": False,
22 | "retries": 1,
23 | "retry_delay": timedelta(minutes=5),
24 | # 'queue': 'bash_queue',
25 | # 'pool': 'backfill',
26 | # 'priority_weight': 10,
27 | # 'end_date': datetime(2016, 1, 1),
28 | # 'wait_for_downstream': False,
29 | # 'sla': timedelta(hours=2),
30 | # 'execution_timeout': timedelta(seconds=300),
31 | # 'on_failure_callback': some_function,
32 | # 'on_success_callback': some_other_function,
33 | # 'on_retry_callback': another_function,
34 | # 'sla_miss_callback': yet_another_function,
35 | # 'trigger_rule': 'all_success'
36 | },
37 | description="A simple tutorial DAG",
38 | schedule=timedelta(days=1),
39 | start_date=datetime(2021, 1, 1),
40 | catchup=False,
41 | tags=["example"],
42 | ) as dag:
43 |
44 | # t1, t2 and t3 are examples of tasks created by instantiating operators
45 | t1 = BashOperator(
46 | task_id="print_date",
47 | bash_command="date",
48 | )
49 |
50 | t2 = BashOperator(
51 | task_id="sleep",
52 | depends_on_past=False,
53 | bash_command="sleep 5",
54 | retries=3,
55 | )
56 | t1.doc_md = dedent(
57 | """\
58 | #### Task Documentation
59 | You can document your task using the attributes `doc_md` (markdown),
60 | `doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml` which gets
61 | rendered in the UI's Task Instance Details page.
62 | 
63 | **Image Credit:** Randall Munroe, [XKCD](https://xkcd.com/license.html)
64 | """
65 | )
66 |
67 |     dag.doc_md = __doc__  # provided that you have a docstring at the beginning of the DAG; OR
68 | dag.doc_md = """
69 | This is a documentation placed anywhere
70 | """ # otherwise, type it like this
71 | templated_command = dedent(
72 | """
73 | {% for i in range(5) %}
74 | echo "{{ ds }}"
75 | echo "{{ macros.ds_add(ds, 7)}}"
76 | {% endfor %}
77 | """
78 | )
79 |
80 | t3 = BashOperator(
81 | task_id="templated",
82 | depends_on_past=False,
83 | bash_command=templated_command,
84 | )
85 |
86 | t1 >> [t2, t3]
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/airflow_official_tutorials/tutorial_dag.py:
--------------------------------------------------------------------------------
1 |
2 | import json
3 | from textwrap import dedent
4 |
5 | import pendulum
6 |
7 | # The DAG object; we'll need this to instantiate a DAG
8 | from airflow import DAG
9 |
10 | # Operators; we need this to operate!
11 | from airflow.operators.python import PythonOperator
12 | with DAG(
13 | "tutorial_dag",
14 | # These args will get passed on to each operator
15 | # You can override them on a per-task basis during operator initialization
16 | default_args={"retries": 2},
17 | description="DAG tutorial",
18 | schedule=None,
19 | start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
20 | catchup=False,
21 | tags=["example"],
22 | ) as dag:
23 | dag.doc_md = __doc__
24 | def extract(**kwargs):
25 | ti = kwargs["ti"]
26 | data_string = '{"1001": 301.27, "1002": 433.21, "1003": 502.22}'
27 | ti.xcom_push("order_data", data_string)
28 | def transform(**kwargs):
29 | ti = kwargs["ti"]
30 | extract_data_string = ti.xcom_pull(task_ids="extract", key="order_data")
31 | order_data = json.loads(extract_data_string)
32 |
33 | total_order_value = 0
34 | for value in order_data.values():
35 | total_order_value += value
36 |
37 | total_value = {"total_order_value": total_order_value}
38 | total_value_json_string = json.dumps(total_value)
39 | ti.xcom_push("total_order_value", total_value_json_string)
40 | def load(**kwargs):
41 | ti = kwargs["ti"]
42 | total_value_string = ti.xcom_pull(task_ids="transform", key="total_order_value")
43 | total_order_value = json.loads(total_value_string)
44 |
45 | print(total_order_value)
46 | extract_task = PythonOperator(
47 | task_id="extract",
48 | python_callable=extract,
49 | )
50 | extract_task.doc_md = dedent(
51 | """\
52 | #### Extract task
53 | A simple Extract task to get data ready for the rest of the data pipeline.
54 | In this case, getting data is simulated by reading from a hardcoded JSON string.
55 | This data is then put into xcom, so that it can be processed by the next task.
56 | """
57 | )
58 |
59 | transform_task = PythonOperator(
60 | task_id="transform",
61 | python_callable=transform,
62 | )
63 | transform_task.doc_md = dedent(
64 | """\
65 | #### Transform task
66 | A simple Transform task which takes in the collection of order data from xcom
67 | and computes the total order value.
68 | This computed value is then put into xcom, so that it can be processed by the next task.
69 | """
70 | )
71 |
72 | load_task = PythonOperator(
73 | task_id="load",
74 | python_callable=load,
75 | )
76 | load_task.doc_md = dedent(
77 | """\
78 | #### Load task
79 | A simple Load task which takes in the result of the Transform task, by reading it
80 | from xcom and instead of saving it to end user review, just prints it out.
81 | """
82 | )
83 |
84 | extract_task >> transform_task >> load_task
85 |
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/airflow_official_tutorials/tutorial_taskflow_api.py:
--------------------------------------------------------------------------------
1 |
2 | import json
3 | import pendulum
4 | from airflow.decorators import dag, task
5 |
6 | @dag(
7 | schedule=None,
8 | start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
9 | catchup=False,
10 | tags=["example"],
11 | )
12 | def tutorial_taskflow_api():
13 | """
14 | ### TaskFlow API Tutorial Documentation
15 | This is a simple data pipeline example which demonstrates the use of
16 | the TaskFlow API using three simple tasks for Extract, Transform, and Load.
17 | Documentation that goes along with the Airflow TaskFlow API tutorial is
18 | located
19 | [here](https://airflow.apache.org/docs/apache-airflow/stable/tutorial_taskflow_api.html)
20 | """
21 | @task()
22 | def extract():
23 | """
24 | #### Extract task
25 | A simple Extract task to get data ready for the rest of the data
26 | pipeline. In this case, getting data is simulated by reading from a
27 | hardcoded JSON string.
28 | """
29 | data_string = '{"1001": 301.27, "1002": 433.21, "1003": 502.22}'
30 |
31 | order_data_dict = json.loads(data_string)
32 | return order_data_dict
33 | @task(multiple_outputs=True)
34 | def transform(order_data_dict: dict):
35 | """
36 | #### Transform task
37 | A simple Transform task which takes in the collection of order data and
38 | computes the total order value.
39 | """
40 | total_order_value = 0
41 |
42 | for value in order_data_dict.values():
43 | total_order_value += value
44 |
45 | return {"total_order_value": total_order_value}
46 | @task()
47 | def load(total_order_value: float):
48 | """
49 | #### Load task
50 | A simple Load task which takes in the result of the Transform task and
51 | instead of saving it to end user review, just prints it out.
52 | """
53 |
54 | print(f"Total order value is: {total_order_value:.2f}")
55 | order_data = extract()
56 | order_summary = transform(order_data)
57 | load(order_summary["total_order_value"])
58 | tutorial_taskflow_api()
59 |
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/build_full_dag.py:
--------------------------------------------------------------------------------
1 | import pendulum
2 |
3 | from airflow.decorators import dag,task
4 |
5 | from custom_operators.facts_calculator import FactsCalculatorOperator
6 | from custom_operators.has_rows import HasRowsOperator
7 | from custom_operators.s3_to_redshift import S3ToRedshiftOperator
8 | from airflow.operators.empty import EmptyOperator
9 |
10 | #
11 | # The following DAG performs the following functions:
12 | #
13 | # 1. Loads Trip data from S3 to RedShift
14 | # 2. Performs a data quality check on the Trips table in RedShift
15 | # 3. Uses the FactsCalculatorOperator to create a Facts table in Redshift
16 | #   a. **NOTE**: to complete this step you must complete the FactsCalculatorOperator
17 | # skeleton defined in plugins/operators/facts_calculator.py
18 | #
19 | @dag(start_date=pendulum.now())
20 | def full_pipeline():
21 | #
22 | # The following code will load trips data from S3 to RedShift. Use the s3_key
23 | # "data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv"
24 | # and the s3_bucket "sean-murdock"
25 | #
26 | copy_trips_task = S3ToRedshiftOperator(
27 | task_id="load_trips_from_s3_to_redshift",
28 | table="trips",
29 | redshift_conn_id="redshift",
30 | aws_credentials_id="aws_credentials",
31 | s3_bucket="sean-murdock",
32 | s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv"
33 | )
34 |
35 | #
36 | # Data quality check on the Trips table
37 | #
38 | check_trips = HasRowsOperator(
39 | task_id="check_trips_data",
40 | redshift_conn_id="redshift",
41 | table="trips"
42 | )
43 |
44 | #
45 | # We use the FactsCalculatorOperator to create a Facts table in RedShift. The fact column is
46 | # `tripduration` and the groupby_column is `bikeid`
47 | #
48 | calculate_facts = FactsCalculatorOperator(
49 | task_id="calculate_facts_trips",
50 | redshift_conn_id="redshift",
51 | origin_table="trips",
52 | destination_table="trips_facts",
53 | fact_column="tripduration",
54 | groupby_column="bikeid"
55 | )
56 |
57 | #
58 | # Task ordering for the DAG tasks
59 | #
60 | copy_trips_task >> check_trips
61 | check_trips >> calculate_facts
62 |
63 | full_pipeline_dag = full_pipeline()
64 |
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/connections_hooks.py:
--------------------------------------------------------------------------------
1 | # Remember to run the "/opt/airflow/start.sh" command to start the web server. Once the Airflow web server is ready, open the Airflow UI using the "Access Airflow" button. Turn your DAG “On”, and then run your DAG. If you get stuck, you can take a look at the solution file in the workspace/airflow/dags folder and the video walkthrough on the next page.
2 |
3 | import pendulum
4 | import logging
5 |
6 | from airflow.decorators import dag, task
7 | from airflow.models import Variable
8 | from airflow.operators.python_operator import PythonOperator
9 | from airflow.hooks.S3_hook import S3Hook
10 |
11 |
12 | @dag(start_date=pendulum.now())
13 | def list_keys():
14 |
15 | @task
16 | def list_keys():
17 | hook = S3Hook(aws_conn_id='aws_credentials')
18 | bucket = Variable.get('s3_bucket')
19 | prefix = Variable.get('s3_prefix')
20 | logging.info(f"Listing Keys from {bucket}/{prefix}")
21 | keys = hook.list_keys(bucket, prefix=prefix)
22 | for key in keys:
23 | logging.info(f"- s3://{bucket}/{key}")
24 | list_keys()
25 |
26 | list_keys_dag = list_keys()
27 |
28 |
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/context_templating.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pendulum
3 |
4 | from airflow.decorators import dag, task
5 | from airflow.models import Variable
6 |
7 | @dag(
8 | start_date=pendulum.now(),
9 | schedule_interval="@daily"
10 | )
11 | def log_details():
12 |
13 | @task
14 | def log_execution_date(**kwargs):
15 | logging.info(f"Execution date is {kwargs['ds']}")
16 |
17 | @task
18 | def log_run_id(**kwargs):
19 | logging.info(f"My run id is {kwargs['run_id']}")
20 |
21 | @task
22 | def log_previous_run(**kwargs):
23 | logging.info(f"My previous run was on {kwargs['prev_start_date_success']}")
24 |
25 | @task
26 | def log_next_run(**kwargs):
27 | logging.info(f"My next run will be {kwargs['next_execution_date']}")
28 |
29 |
30 | log_execution_date_task=log_execution_date()
31 | log_run_id_task=log_run_id()
32 | log_previous_run_task=log_previous_run()
33 | log_next_run_task=log_next_run()
34 |
35 | log_details_dag = log_details()
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/convert_airflow1.py:
--------------------------------------------------------------------------------
1 | import pendulum
2 | import datetime
3 | import logging
4 |
5 | from airflow.decorators import dag,task
6 | from airflow.secrets.metastore import MetastoreBackend
7 | from airflow.hooks.postgres_hook import PostgresHook
8 | from airflow.operators.postgres_operator import PostgresOperator
9 | from airflow.operators.python_operator import PythonOperator
10 |
11 | from udacity.common import sql_statements
12 |
13 | @dag(
14 | start_date=pendulum.datetime(2018, 1, 1, 0, 0, 0, 0),
15 | end_date=pendulum.datetime(2018, 12, 1, 0, 0, 0, 0),
16 | schedule_interval='@monthly',
17 | max_active_runs=1
18 | )
19 | def data_quality():
20 |
21 | @task(sla=datetime.timedelta(hours=1))
22 | def load_trip_data_to_redshift(*args, **kwargs):
23 | metastoreBackend = MetastoreBackend()
24 | aws_connection=metastoreBackend.get_connection("aws_credentials")
25 | redshift_hook = PostgresHook("redshift")
26 | execution_date = kwargs["execution_date"]
27 | sql_stmt = sql_statements.COPY_MONTHLY_TRIPS_SQL.format(
28 | aws_connection.login,
29 | aws_connection.password,
30 | year=execution_date.year,
31 | month=execution_date.month
32 | )
33 | redshift_hook.run(sql_stmt)
34 |
35 | @task()
36 | def load_station_data_to_redshift(*args, **kwargs):
37 | metastoreBackend = MetastoreBackend()
38 | aws_connection=metastoreBackend.get_connection("aws_credentials")
39 | redshift_hook = PostgresHook("redshift")
40 | sql_stmt = sql_statements.COPY_STATIONS_SQL.format(
41 | aws_connection.login,
42 | aws_connection.password,
43 | )
44 | redshift_hook.run(sql_stmt)
45 |
46 | @task()
47 | def check_greater_than_zero(*args, **kwargs):
48 | table = kwargs["params"]["table"]
49 | redshift_hook = PostgresHook("redshift")
50 | records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {table}")
51 | if len(records) < 1 or len(records[0]) < 1:
52 | raise ValueError(f"Data quality check failed. {table} returned no results")
53 | num_records = records[0][0]
54 | if num_records < 1:
55 | raise ValueError(f"Data quality check failed. {table} contained 0 rows")
56 | logging.info(f"Data quality on table {table} check passed with {records[0][0]} records")
57 |
58 |
59 | create_trips_table = PostgresOperator(
60 | task_id="create_trips_table",
61 | postgres_conn_id="redshift",
62 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL
63 | )
64 |
65 | load_trips_task = load_trip_data_to_redshift()
66 |
67 |
68 | check_trips_task = check_greater_than_zero(
69 | params={
70 | 'table':'trips'
71 | }
72 | )
73 |
74 | create_stations_table = PostgresOperator(
75 | task_id="create_stations_table",
76 | postgres_conn_id="redshift",
77 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
78 | )
79 |
80 | load_stations_task = load_station_data_to_redshift()
81 |
82 | check_stations_task = check_greater_than_zero(
83 | params={
84 | 'table': 'stations',
85 | }
86 | )
87 |
88 |
89 | create_trips_table >> load_trips_task
90 | create_stations_table >> load_stations_task
91 | load_stations_task >> check_stations_task
92 | load_trips_task >> check_trips_task
93 |
94 | data_quality_dag = data_quality()
95 |
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/custom_operators.py:
--------------------------------------------------------------------------------
1 | # Instructions
2 | # In this exercise, we’ll consolidate repeated code into Operator Plugins
3 | # 1 - Replace both uses of the check_greater_than_zero function with calls to the HasRowsOperator
4 | # 2 - Execute the DAG
5 |
6 | # Remember to run the "/opt/airflow/start.sh" command to start the web server. Once the Airflow web server is ready, open the Airflow UI using the "Access Airflow" button. Turn your DAG “On”, and then run your DAG. If you get stuck, you can take a look at the solution file in the workspace/airflow/dags folder and the video walkthrough on the next page.
7 |
8 | import pendulum
9 | import logging
10 |
11 | from airflow.decorators import dag, task
12 | from airflow.hooks.postgres_hook import PostgresHook
13 |
14 | from airflow.operators.postgres_operator import PostgresOperator
15 | from custom_operators.s3_to_redshift import S3ToRedshiftOperator
16 | from custom_operators.has_rows import HasRowsOperator
17 |
18 |
19 | from udacity.common import sql_statements
20 |
21 | @dag(
22 | start_date=pendulum.now(),
23 | max_active_runs=1
24 | )
25 |
26 | def demonstrate_custom_operators():
27 |
28 | @task()
29 | def check_greater_than_zero(*args, **kwargs):
30 | table = kwargs["params"]["table"]
31 | redshift_hook = PostgresHook("redshift")
32 | records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {table}")
33 | if len(records) < 1 or len(records[0]) < 1:
34 | raise ValueError(f"Data quality check failed. {table} returned no results")
35 | num_records = records[0][0]
36 | if num_records < 1:
37 | raise ValueError(f"Data quality check failed. {table} contained 0 rows")
38 | logging.info(f"Data quality on table {table} check passed with {records[0][0]} records")
39 |
40 |
41 |
42 | create_trips_table = PostgresOperator(
43 | task_id="create_trips_table",
44 | postgres_conn_id="redshift",
45 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL
46 | )
47 |
48 | copy_trips_task = S3ToRedshiftOperator(
49 | task_id="load_trips_from_s3_to_redshift",
50 | table="trips",
51 | redshift_conn_id="redshift",
52 | aws_credentials_id="aws_credentials",
53 | s3_bucket="sean-murdock",
54 | s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv"
55 | )
56 |
57 | #
58 | # Data quality check implemented with the HasRowsOperator
59 | #
60 | check_trips_task = HasRowsOperator(
61 | task_id="count_trips",
62 | table="trips",
63 | redshift_conn_id="redshift",
64 | )
65 |
66 |
67 |
68 | create_stations_table = PostgresOperator(
69 | task_id="create_stations_table",
70 | postgres_conn_id="redshift",
71 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
72 | )
73 |
74 | copy_stations_task = S3ToRedshiftOperator(
75 | task_id="load_stations_from_s3_to_redshift",
76 | redshift_conn_id="redshift",
77 | aws_credentials_id="aws_credentials",
78 | s3_bucket="sean-murdock",
79 | s3_key="data-pipelines/divvy/unpartitioned/divvy_stations_2017.csv",
80 | table="stations"
81 | )
82 |
83 | #
84 | # Data quality check implemented with the HasRowsOperator
85 | #
86 | check_stations_task = HasRowsOperator(
87 | task_id="count_stations",
88 | table="stations",
89 | redshift_conn_id="redshift",
90 | )
91 |
92 | create_trips_table >> copy_trips_task
93 | create_stations_table >> copy_stations_task
94 | copy_stations_task >> check_stations_task
95 | copy_trips_task >> check_trips_task
96 |
97 | custom_operators_dag = demonstrate_custom_operators()
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/custom_operators/facts_calculator.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow.hooks.postgres_hook import PostgresHook
4 | from airflow.models import BaseOperator
5 | from airflow.utils.decorators import apply_defaults
6 |
7 |
8 | class FactsCalculatorOperator(BaseOperator):
9 | facts_sql_template = """
10 | DROP TABLE IF EXISTS {destination_table};
11 | CREATE TABLE {destination_table} AS
12 | SELECT
13 | {groupby_column},
14 | MAX({fact_column}) AS max_{fact_column},
15 | MIN({fact_column}) AS min_{fact_column},
16 | AVG({fact_column}) AS average_{fact_column}
17 | FROM {origin_table}
18 | GROUP BY {groupby_column};
19 | """
20 |
21 | @apply_defaults
22 | def __init__(self,
23 | redshift_conn_id="",
24 | origin_table="",
25 | destination_table="",
26 | fact_column="",
27 | groupby_column="",
28 | *args, **kwargs):
29 |
30 | super(FactsCalculatorOperator, self).__init__(*args, **kwargs)
31 | self.redshift_conn_id = redshift_conn_id
32 | self.origin_table = origin_table
33 | self.destination_table = destination_table
34 | self.fact_column = fact_column
35 | self.groupby_column = groupby_column
36 |
37 | def execute(self, context):
38 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
39 | facts_sql = FactsCalculatorOperator.facts_sql_template.format(
40 | origin_table=self.origin_table,
41 | destination_table=self.destination_table,
42 | fact_column=self.fact_column,
43 | groupby_column=self.groupby_column
44 | )
45 | redshift.run(facts_sql)
--------------------------------------------------------------------------------
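`FactsCalculatorOperator` is defined above but never instantiated in the exercises shown here. A minimal usage sketch, intended to sit inside a DAG body, assuming the `trips` table from `sql_statements.py` (the task id and destination table name are illustrative placeholders):

```python
# Hypothetical instantiation inside a DAG body; column names come from the
# trips table defined in sql_statements.py, other names are placeholders.
from custom_operators.facts_calculator import FactsCalculatorOperator

calculate_trip_facts = FactsCalculatorOperator(
    task_id="calculate_trip_facts",
    redshift_conn_id="redshift",
    origin_table="trips",
    destination_table="trip_facts",
    fact_column="tripduration",
    groupby_column="bikeid",
)
```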
/4_Automate_Data_Pipelines/exercises/custom_operators/has_rows.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow.hooks.postgres_hook import PostgresHook
4 | from airflow.models import BaseOperator
5 | from airflow.utils.decorators import apply_defaults
6 |
7 |
8 | class HasRowsOperator(BaseOperator):
9 |
10 | @apply_defaults
11 | def __init__(self,
12 | redshift_conn_id="",
13 | table="",
14 | *args, **kwargs):
15 |
16 | super(HasRowsOperator, self).__init__(*args, **kwargs)
17 | self.table = table
18 | self.redshift_conn_id = redshift_conn_id
19 |
20 | def execute(self, context):
21 | redshift_hook = PostgresHook(self.redshift_conn_id)
22 | records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {self.table}")
23 | if len(records) < 1 or len(records[0]) < 1:
24 | raise ValueError(f"Data quality check failed. {self.table} returned no results")
25 | num_records = records[0][0]
26 | if num_records < 1:
27 | raise ValueError(f"Data quality check failed. {self.table} contained 0 rows")
28 | logging.info(f"Data quality on table {self.table} check passed with {records[0][0]} records")
29 |
30 |
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/custom_operators/s3_to_redshift.py:
--------------------------------------------------------------------------------
1 | from airflow.secrets.metastore import MetastoreBackend
2 | from airflow.hooks.postgres_hook import PostgresHook
3 | from airflow.models import BaseOperator
4 | from airflow.utils.decorators import apply_defaults
5 |
6 |
7 | class S3ToRedshiftOperator(BaseOperator):
8 | template_fields = ("s3_key",)
9 | copy_sql = """
10 | COPY {}
11 | FROM '{}'
12 | ACCESS_KEY_ID '{}'
13 | SECRET_ACCESS_KEY '{}'
14 | IGNOREHEADER {}
15 | DELIMITER '{}'
16 | """
17 |
18 |
19 | @apply_defaults
20 | def __init__(self,
21 | redshift_conn_id="",
22 | aws_credentials_id="",
23 | table="",
24 | s3_bucket="",
25 | s3_key="",
26 | delimiter=",",
27 | ignore_headers=1,
28 | *args, **kwargs):
29 |
30 | super(S3ToRedshiftOperator, self).__init__(*args, **kwargs)
31 | self.table = table
32 | self.redshift_conn_id = redshift_conn_id
33 | self.s3_bucket = s3_bucket
34 | self.s3_key = s3_key
35 | self.delimiter = delimiter
36 | self.ignore_headers = ignore_headers
37 | self.aws_credentials_id = aws_credentials_id
38 |
39 | def execute(self, context):
40 | metastoreBackend = MetastoreBackend()
41 | aws_connection=metastoreBackend.get_connection(self.aws_credentials_id)
42 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
43 |
44 | self.log.info("Clearing data from destination Redshift table")
45 | redshift.run("DELETE FROM {}".format(self.table))
46 |
47 | self.log.info("Copying data from S3 to Redshift")
48 | rendered_key = self.s3_key.format(**context)
49 | s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
50 | formatted_sql = S3ToRedshiftOperator.copy_sql.format(
51 | self.table,
52 | s3_path,
53 | aws_connection.login,
54 | aws_connection.password,
55 | self.ignore_headers,
56 | self.delimiter
57 | )
58 | redshift.run(formatted_sql)
59 |
--------------------------------------------------------------------------------
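Because `s3_key` is expanded in `execute()` via `self.s3_key.format(**context)` (and is also listed in `template_fields`), the key can be parameterised per run. A minimal sketch assuming a run-partitioned key layout; the bucket name matches the exercise above, but the key path itself is illustrative:

```python
# Hypothetical instantiation with a run-partitioned s3_key; the placeholders
# are filled from the task context (execution_date) when execute() runs.
from custom_operators.s3_to_redshift import S3ToRedshiftOperator

copy_monthly_trips_task = S3ToRedshiftOperator(
    task_id="load_monthly_trips_from_s3_to_redshift",
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="sean-murdock",
    s3_key="data-pipelines/divvy/partitioned/"
           "{execution_date.year}/{execution_date.month}/divvy_trips.csv",
)
```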
/4_Automate_Data_Pipelines/exercises/data_lineage.py:
--------------------------------------------------------------------------------
1 | import pendulum
2 |
3 |
4 | from airflow.decorators import dag, task
5 | from airflow.secrets.metastore import MetastoreBackend
6 | from airflow.hooks.postgres_hook import PostgresHook
7 | from airflow.operators.postgres_operator import PostgresOperator
8 |
9 | import sql_statements
10 |
11 | @dag(
12 | start_date=pendulum.now()
13 | )
14 | def data_lineage():
15 |
16 |
17 | @task()
18 | def load_trip_data_to_redshift():
19 | metastoreBackend = MetastoreBackend()
20 | aws_connection=metastoreBackend.get_connection("aws_credentials")
21 | redshift_hook = PostgresHook("redshift")
22 | sql_stmt = sql_statements.COPY_ALL_TRIPS_SQL.format(
23 | aws_connection.login,
24 | aws_connection.password,
25 | )
26 | redshift_hook.run(sql_stmt)
27 |
28 | load_trip_data_to_redshift_task= load_trip_data_to_redshift()
29 |
30 | @task()
31 | def load_station_data_to_redshift():
32 | metastoreBackend = MetastoreBackend()
33 | aws_connection=metastoreBackend.get_connection("aws_credentials")
34 | redshift_hook = PostgresHook("redshift")
35 | sql_stmt = sql_statements.COPY_STATIONS_SQL.format(
36 | aws_connection.login,
37 | aws_connection.password,
38 | )
39 | redshift_hook.run(sql_stmt)
40 |
41 | load_station_data_to_redshift_task = load_station_data_to_redshift()
42 |
43 | create_trips_table = PostgresOperator(
44 | task_id="create_trips_table",
45 | postgres_conn_id="redshift",
46 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL
47 | )
48 |
49 |
50 | create_stations_table = PostgresOperator(
51 | task_id="create_stations_table",
52 | postgres_conn_id="redshift",
53 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
54 | )
55 |
56 |
57 | calculate_traffic_task = PostgresOperator(
58 | task_id='calculate_location_traffic',
59 | postgres_conn_id="redshift",
60 | sql=sql_statements.LOCATION_TRAFFIC_SQL,
61 | )
62 |
63 | create_trips_table >> load_trip_data_to_redshift_task >> calculate_traffic_task
64 | create_stations_table >> load_station_data_to_redshift_task
65 | data_lineage_dag = data_lineage()
66 |
67 |
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/data_partitioning.py:
--------------------------------------------------------------------------------
1 | import pendulum
2 |
3 |
4 | from airflow.decorators import dag, task
5 | from airflow.secrets.metastore import MetastoreBackend
6 | from airflow.hooks.postgres_hook import PostgresHook
7 | from airflow.operators.postgres_operator import PostgresOperator
8 |
9 | from udacity.common import sql_statements
10 |
11 | @dag(
12 | start_date=pendulum.datetime(2018, 1, 1, 0, 0, 0, 0),
13 | end_date=pendulum.datetime(2018, 2, 1, 0, 0, 0, 0),
14 | schedule_interval='@monthly',
15 | max_active_runs=1
16 | )
17 | def data_partitioning():
18 |
19 |
20 | @task()
21 | def load_trip_data_to_redshift(*args, **kwargs):
22 | metastoreBackend = MetastoreBackend()
23 | aws_connection=metastoreBackend.get_connection("aws_credentials")
24 | redshift_hook = PostgresHook("redshift")
25 | execution_date = kwargs["execution_date"]
26 |
27 | sql_stmt = sql_statements.COPY_MONTHLY_TRIPS_SQL.format(
28 | aws_connection.login,
29 | aws_connection.password,
30 | year=execution_date.year,
31 | month=execution_date.month
32 | )
33 | redshift_hook.run(sql_stmt)
34 |
35 | load_trip_data_to_redshift_task= load_trip_data_to_redshift()
36 |
37 | @task()
38 | def load_station_data_to_redshift():
39 | metastoreBackend = MetastoreBackend()
40 | aws_connection=metastoreBackend.get_connection("aws_credentials")
41 | redshift_hook = PostgresHook("redshift")
42 | sql_stmt = sql_statements.COPY_STATIONS_SQL.format(
43 | aws_connection.login,
44 | aws_connection.password,
45 | )
46 | redshift_hook.run(sql_stmt)
47 |
48 | load_station_data_to_redshift_task = load_station_data_to_redshift()
49 |
50 | create_trips_table = PostgresOperator(
51 | task_id="create_trips_table",
52 | postgres_conn_id="redshift",
53 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL
54 | )
55 |
56 |
57 | create_stations_table = PostgresOperator(
58 | task_id="create_stations_table",
59 | postgres_conn_id="redshift",
60 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
61 | )
62 |
63 | create_trips_table >> load_trip_data_to_redshift_task
64 | create_stations_table >> load_station_data_to_redshift_task
65 |
66 | data_partitioning_dag = data_partitioning()
67 |
68 |
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/data_quality.py:
--------------------------------------------------------------------------------
1 | import pendulum
2 | import datetime
3 | import logging
4 |
5 | from airflow.decorators import dag,task
6 | from airflow.secrets.metastore import MetastoreBackend
7 | from airflow.hooks.postgres_hook import PostgresHook
8 | from airflow.operators.postgres_operator import PostgresOperator
9 | from airflow.operators.python_operator import PythonOperator
10 |
11 | from udacity.common import sql_statements
12 |
13 | @dag(
14 | start_date=pendulum.now(),
15 | max_active_runs=1
16 | )
17 | def data_quality():
18 |
19 | @task(sla=datetime.timedelta(hours=1))
20 | def load_trip_data_to_redshift(*args, **kwargs):
21 | metastoreBackend = MetastoreBackend()
22 | aws_connection=metastoreBackend.get_connection("aws_credentials")
23 | redshift_hook = PostgresHook("redshift")
24 | sql_stmt = sql_statements.COPY_ALL_TRIPS_SQL.format(
25 | aws_connection.login,
26 | aws_connection.password
27 | )
28 | redshift_hook.run(sql_stmt)
29 |
30 | @task()
31 | def load_station_data_to_redshift(*args, **kwargs):
32 | metastoreBackend = MetastoreBackend()
33 | aws_connection=metastoreBackend.get_connection("aws_credentials")
34 | redshift_hook = PostgresHook("redshift")
35 | sql_stmt = sql_statements.COPY_STATIONS_SQL.format(
36 | aws_connection.login,
37 | aws_connection.password,
38 | )
39 | redshift_hook.run(sql_stmt)
40 |
41 | @task()
42 | def check_greater_than_zero(*args, **kwargs):
43 | table = kwargs["params"]["table"]
44 | redshift_hook = PostgresHook("redshift")
45 | records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {table}")
46 | if len(records) < 1 or len(records[0]) < 1:
47 | raise ValueError(f"Data quality check failed. {table} returned no results")
48 | num_records = records[0][0]
49 | if num_records < 1:
50 | raise ValueError(f"Data quality check failed. {table} contained 0 rows")
51 | logging.info(f"Data quality on table {table} check passed with {records[0][0]} records")
52 |
53 |
54 | create_trips_table = PostgresOperator(
55 | task_id="create_trips_table",
56 | postgres_conn_id="redshift",
57 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL
58 | )
59 |
60 | load_trips_task = load_trip_data_to_redshift()
61 |
62 |
63 | check_trips_task = check_greater_than_zero(
64 | params={
65 | 'table':'trips'
66 | }
67 | )
68 |
69 | create_stations_table = PostgresOperator(
70 | task_id="create_stations_table",
71 | postgres_conn_id="redshift",
72 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
73 | )
74 |
75 | load_stations_task = load_station_data_to_redshift()
76 |
77 | check_stations_task = check_greater_than_zero(
78 | params={
79 | 'table': 'stations',
80 | }
81 | )
82 |
83 |
84 | create_trips_table >> load_trips_task
85 | create_stations_table >> load_stations_task
86 | load_stations_task >> check_stations_task
87 | load_trips_task >> check_trips_task
88 |
89 | data_quality_dag = data_quality()
90 |
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/refactor_dag.py:
--------------------------------------------------------------------------------
1 | #Instructions
2 | #In this exercise, we’ll refactor a DAG with a single overloaded task into a DAG with several tasks with well-defined boundaries
3 | #1 - Read through the DAG and identify points in the DAG that could be split apart
4 | #2 - Split the DAG into multiple tasks
5 | #3 - Run the DAG
6 |
7 | # Remember to run the "/opt/airflow/start.sh" command to start the web server. Once the Airflow web server is ready, open the Airflow UI using the "Access Airflow" button. Turn your DAG “On”, and then run your DAG. If you get stuck, you can take a look at the solution file in the workspace/airflow/dags folder and the video walkthrough on the next page.
8 |
9 | import pendulum
10 | import logging
11 |
12 | from airflow.decorators import dag, task
13 | from airflow.hooks.postgres_hook import PostgresHook
14 |
15 | from airflow.operators.postgres_operator import PostgresOperator
16 |
17 | @dag (
18 | start_date=pendulum.now()
19 | )
20 | def demonstrating_refactoring():
21 |
22 | #
23 | # TODO: Finish refactoring this function into the appropriate set of tasks,
24 | # instead of keeping this one large task.
25 | #
26 | @task()
27 | def find_riders_under_18(*args, **kwargs):
28 | redshift_hook = PostgresHook("redshift")
29 |
30 | # Find all trips where the rider was under 18
31 | redshift_hook.run("""
32 | BEGIN;
33 | DROP TABLE IF EXISTS younger_riders;
34 | CREATE TABLE younger_riders AS (
35 | SELECT * FROM trips WHERE birthyear > 2000
36 | );
37 | COMMIT;
38 | """)
39 | records = redshift_hook.get_records("""
40 | SELECT birthyear FROM younger_riders ORDER BY birthyear DESC LIMIT 1
41 | """)
42 | if len(records) > 0 and len(records[0]) > 0:
43 | logging.info(f"Youngest rider was born in {records[0][0]}")
44 |
45 | @task()
46 | def how_often_bikes_ridden(*args, **kwargs):
47 | redshift_hook = PostgresHook("redshift")
48 |
49 | # Find out how often each bike is ridden
50 | redshift_hook.run("""
51 | BEGIN;
52 | DROP TABLE IF EXISTS lifetime_rides;
53 | CREATE TABLE lifetime_rides AS (
54 | SELECT bikeid, COUNT(bikeid)
55 | FROM trips
56 | GROUP BY bikeid
57 | );
58 | COMMIT;
59 | """)
60 |
61 | @task()
62 | def create_station_count(*args, **kwargs):
63 | redshift_hook = PostgresHook("redshift")
64 | # Count the number of stations by city
65 | redshift_hook.run("""
66 | BEGIN;
67 | DROP TABLE IF EXISTS city_station_counts;
68 | CREATE TABLE city_station_counts AS(
69 | SELECT city, COUNT(city)
70 | FROM stations
71 | GROUP BY city
72 | );
73 | COMMIT;
74 | """)
75 |
76 | @task()
77 | def log_oldest():
78 | redshift_hook = PostgresHook("redshift")
79 | records = redshift_hook.get_records("""
80 | SELECT birthyear FROM older_riders ORDER BY birthyear ASC LIMIT 1
81 | """)
82 | if len(records) > 0 and len(records[0]) > 0:
83 | logging.info(f"Oldest rider was born in {records[0][0]}")
84 |
85 | find_riders_under_18_task = find_riders_under_18()
86 | how_often_bikes_ridden_task = how_often_bikes_ridden()
87 | create_station_count_task=create_station_count()
88 |
89 |
90 | create_oldest_task = PostgresOperator(
91 | task_id="create_oldest",
92 | sql="""
93 | BEGIN;
94 | DROP TABLE IF EXISTS older_riders;
95 | CREATE TABLE older_riders AS (
96 | SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
97 | );
98 | COMMIT;
99 | """,
100 | postgres_conn_id="redshift"
101 | )
102 |
103 | log_oldest_task = log_oldest()
104 |
105 | find_riders_under_18_task >> create_oldest_task
106 | how_often_bikes_ridden_task >> create_oldest_task
107 | create_station_count_task >> create_oldest_task
108 |
109 | create_oldest_task >> log_oldest_task
110 |
111 | demonstrating_refactoring_dag = demonstrating_refactoring()
112 |
--------------------------------------------------------------------------------
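The TODO above asks for `find_riders_under_18` to be split further. A minimal sketch of one possible split, mirroring the existing `create_oldest_task` / `log_oldest` pair; it belongs inside the `demonstrating_refactoring()` DAG body and reuses the same imports as `refactor_dag.py` (the task ids and the split itself are illustrative, not the official solution):

```python
# Illustrative split of find_riders_under_18 into a table-creation operator
# and a logging task (place inside the demonstrating_refactoring() DAG body).
create_younger_riders_task = PostgresOperator(
    task_id="create_younger_riders",
    postgres_conn_id="redshift",
    sql="""
        BEGIN;
        DROP TABLE IF EXISTS younger_riders;
        CREATE TABLE younger_riders AS (
            SELECT * FROM trips WHERE birthyear > 2000
        );
        COMMIT;
    """,
)

@task()
def log_youngest():
    # Log the birth year of the youngest rider found in younger_riders
    redshift_hook = PostgresHook("redshift")
    records = redshift_hook.get_records(
        "SELECT birthyear FROM younger_riders ORDER BY birthyear DESC LIMIT 1"
    )
    if len(records) > 0 and len(records[0]) > 0:
        logging.info(f"Youngest rider was born in {records[0][0]}")

log_youngest_task = log_youngest()
create_younger_riders_task >> log_youngest_task
```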
/4_Automate_Data_Pipelines/exercises/run_the_schedules.py:
--------------------------------------------------------------------------------
1 | import pendulum
2 | import logging
3 |
4 | from airflow.decorators import dag, task
5 |
6 | @dag(
7 | # schedule to run daily
8 | # once it is enabled in Airflow
9 | schedule_interval='@daily',
10 | start_date=pendulum.now()
11 | )
12 | def greet_flow():
13 |
14 | @task
15 | def hello_world():
16 | logging.info("Hello World!")
17 |
18 | # hello_world represents the invocation of the only task in this DAG
19 | # it will run by itself, without any sequence before or after another task
20 | hello_world_task=hello_world()
21 |
22 | greet_flow_dag=greet_flow()
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/s3_to_redshift.py:
--------------------------------------------------------------------------------
1 | import pendulum
2 | import logging
3 |
4 | from airflow.decorators import dag, task
5 | from airflow.secrets.metastore import MetastoreBackend
6 | from airflow.hooks.postgres_hook import PostgresHook
7 | from airflow.operators.postgres_operator import PostgresOperator
8 | from airflow.operators.python_operator import PythonOperator
9 |
10 | from udacity.common import sql_statements
11 |
12 | @dag(
13 | start_date=pendulum.now()
14 | )
15 | def load_data_to_redshift():
16 |
17 |
18 | @task
19 | def load_task():
20 | metastoreBackend = MetastoreBackend()
21 | aws_connection=metastoreBackend.get_connection("aws_credentials")
22 | redshift_hook = PostgresHook("redshift")
23 | redshift_hook.run(sql_statements.COPY_ALL_TRIPS_SQL.format(aws_connection.login, aws_connection.password))
24 |
25 |
26 | create_table_task=PostgresOperator(
27 | task_id="create_table",
28 | postgres_conn_id="redshift",
29 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL
30 | )
31 |
32 | location_traffic_task = PostgresOperator(
33 | task_id="calculate_location_traffic",
34 | postgres_conn_id="redshift",
35 | sql=sql_statements.LOCATION_TRAFFIC_SQL
36 | )
37 |
38 | load_data = load_task()
39 | create_table_task >> load_data
40 | load_data >> location_traffic_task
41 |
42 | s3_to_redshift_dag = load_data_to_redshift()
43 |
44 |
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/schedule_backfills.py:
--------------------------------------------------------------------------------
1 | import pendulum
2 |
3 |
4 | from airflow.decorators import dag, task
5 | from airflow.secrets.metastore import MetastoreBackend
6 | from airflow.hooks.postgres_hook import PostgresHook
7 | from airflow.operators.postgres_operator import PostgresOperator
8 |
9 | import sql_statements
10 |
11 | @dag(
12 | start_date=pendulum.datetime(2018, 1, 1, 0, 0, 0, 0),
13 | end_date=pendulum.datetime(2018, 2, 1, 0, 0, 0, 0),
14 | schedule_interval='@monthly',
15 | max_active_runs=1
16 | )
17 | def schedule_backfills():
18 |
19 |
20 | @task()
21 | def load_trip_data_to_redshift():
22 | metastoreBackend = MetastoreBackend()
23 | aws_connection=metastoreBackend.get_connection("aws_credentials")
24 | redshift_hook = PostgresHook("redshift")
25 | sql_stmt = sql_statements.COPY_ALL_TRIPS_SQL.format(
26 | aws_connection.login,
27 | aws_connection.password,
28 | )
29 | redshift_hook.run(sql_stmt)
30 |
31 | load_trip_data_to_redshift_task= load_trip_data_to_redshift()
32 |
33 | @task()
34 | def load_station_data_to_redshift():
35 | metastoreBackend = MetastoreBackend()
36 | aws_connection=metastoreBackend.get_connection("aws_credentials")
37 | redshift_hook = PostgresHook("redshift")
38 | sql_stmt = sql_statements.COPY_STATIONS_SQL.format(
39 | aws_connection.login,
40 | aws_connection.password,
41 | )
42 | redshift_hook.run(sql_stmt)
43 |
44 | load_station_data_to_redshift_task = load_station_data_to_redshift()
45 |
46 | create_trips_table = PostgresOperator(
47 | task_id="create_trips_table",
48 | postgres_conn_id="redshift",
49 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL
50 | )
51 |
52 |
53 | create_stations_table = PostgresOperator(
54 | task_id="create_stations_table",
55 | postgres_conn_id="redshift",
56 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
57 | )
58 |
59 | create_trips_table >> load_trip_data_to_redshift_task
60 | create_stations_table >> load_station_data_to_redshift_task
61 |
62 | schedule_backfills_dag = schedule_backfills()
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/exercises/sql_statements.py:
--------------------------------------------------------------------------------
1 | CREATE_TRIPS_TABLE_SQL = """
2 | CREATE TABLE IF NOT EXISTS trips (
3 | trip_id INTEGER NOT NULL,
4 | start_time TIMESTAMP NOT NULL,
5 | end_time TIMESTAMP NOT NULL,
6 | bikeid INTEGER NOT NULL,
7 | tripduration DECIMAL(16,2) NOT NULL,
8 | from_station_id INTEGER NOT NULL,
9 | from_station_name VARCHAR(100) NOT NULL,
10 | to_station_id INTEGER NOT NULL,
11 | to_station_name VARCHAR(100) NOT NULL,
12 | usertype VARCHAR(20),
13 | gender VARCHAR(6),
14 | birthyear INTEGER,
15 | PRIMARY KEY(trip_id))
16 | DISTSTYLE ALL;
17 | """
18 |
19 | CREATE_STATIONS_TABLE_SQL = """
20 | CREATE TABLE IF NOT EXISTS stations (
21 | id INTEGER NOT NULL,
22 | name VARCHAR(250) NOT NULL,
23 | city VARCHAR(100) NOT NULL,
24 | latitude DECIMAL(9, 6) NOT NULL,
25 | longitude DECIMAL(9, 6) NOT NULL,
26 | dpcapacity INTEGER NOT NULL,
27 | online_date TIMESTAMP NOT NULL,
28 | PRIMARY KEY(id))
29 | DISTSTYLE ALL;
30 | """
31 |
32 | COPY_SQL = """
33 | COPY {}
34 | FROM '{}'
35 | ACCESS_KEY_ID '{{}}'
36 | SECRET_ACCESS_KEY '{{}}'
37 | IGNOREHEADER 1
38 | DELIMITER ','
39 | """
40 |
41 | COPY_MONTHLY_TRIPS_SQL = COPY_SQL.format(
42 | "trips",
43 | "s3://udacity-airflow-bkt/data-pipelines/divvy/partitioned/{year}/{month}/divvy_trips.csv"
44 | )
45 |
46 | COPY_ALL_TRIPS_SQL = COPY_SQL.format(
47 | "trips",
48 | "s3://udacity-airflow-bkt/data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv"
49 | )
50 |
51 | COPY_STATIONS_SQL = COPY_SQL.format(
52 | "stations",
53 | "s3://udacity-airflow-bkt/data-pipelines/divvy/unpartitioned/divvy_stations_2017.csv"
54 | )
55 |
56 | LOCATION_TRAFFIC_SQL = """
57 | BEGIN;
58 | DROP TABLE IF EXISTS station_traffic;
59 | CREATE TABLE station_traffic AS
60 | SELECT
61 | DISTINCT(t.from_station_id) AS station_id,
62 | t.from_station_name AS station_name,
63 | num_departures,
64 | num_arrivals
65 | FROM trips t
66 | JOIN (
67 | SELECT
68 | from_station_id,
69 | COUNT(from_station_id) AS num_departures
70 | FROM trips
71 | GROUP BY from_station_id
72 | ) AS fs ON t.from_station_id = fs.from_station_id
73 | JOIN (
74 | SELECT
75 | to_station_id,
76 | COUNT(to_station_id) AS num_arrivals
77 | FROM trips
78 | GROUP BY to_station_id
79 | ) AS ts ON t.from_station_id = ts.to_station_id
80 | """
81 |
--------------------------------------------------------------------------------
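The doubled braces in `COPY_SQL` exist because the statement is formatted twice: the first `.format()` fills in the table and S3 path and collapses `'{{}}'` to `'{}'`, while the second call (made in the DAG task) fills in the credentials, plus `year`/`month` for the monthly variant. A small standalone illustration of that two-stage `str.format` (the credentials below are placeholders, not real keys):

```python
# Two-stage str.format over COPY_SQL, as used by the exercise DAGs.
COPY_SQL = """
COPY {}
FROM '{}'
ACCESS_KEY_ID '{{}}'
SECRET_ACCESS_KEY '{{}}'
IGNOREHEADER 1
DELIMITER ','
"""

# Stage 1: table and S3 path; '{{}}' survives as '{}', and the '{year}'/'{month}'
# placeholders live in the path *argument*, so they pass through untouched.
COPY_MONTHLY_TRIPS_SQL = COPY_SQL.format(
    "trips",
    "s3://udacity-airflow-bkt/data-pipelines/divvy/partitioned/{year}/{month}/divvy_trips.csv",
)

# Stage 2: credentials fill the remaining positional '{}' slots and the
# partition values are supplied by keyword (mirrors data_partitioning.py).
sql_stmt = COPY_MONTHLY_TRIPS_SQL.format(
    "MY_ACCESS_KEY_PLACEHOLDER", "MY_SECRET_KEY_PLACEHOLDER", year=2018, month=1
)
print(sql_stmt)
```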
/4_Automate_Data_Pipelines/exercises/task_dependencies.py:
--------------------------------------------------------------------------------
1 | import pendulum
2 | import logging
3 |
4 | from airflow.decorators import dag, task
5 |
6 | @dag(
7 | schedule_interval='@hourly',
8 | start_date=pendulum.now()
9 | )
10 | def task_dependencies():
11 |
12 | # the hello_world task doesn't accept parameters
13 | @task()
14 | def hello_world():
15 | logging.info("Hello World")
16 |
17 | # the addition task accepts two parameters and adds them together
18 | # logs the result, and returns it
19 | @task()
20 | def addition(first,second):
21 | logging.info(f"{first} + {second} = {first+second}")
22 | return first+second
23 |
24 | # the subtraction task accepts two parameters, and subtracts the
25 | # second from the first, then logs and returns the result
26 | @task()
27 | def subtraction(first,second):
28 | logging.info(f"{first} - {second} = {first-second}")
29 | return first-second
30 |
31 | # the division task accepts two parameters, and divides the first
32 | # by the second, logs and returns the result
33 | @task()
34 | def division(first,second):
35 | logging.info(f"{first} / {second} = {int(first/second)}")
36 | return int(first/second)
37 |
38 | # hello represents a discrete invocation of hello world
39 | hello=hello_world()
40 |
41 | # two_plus_two represents the invocation of addition with 2 and 2
42 | two_plus_two=addition(2,2)
43 |
44 | # two_from_six represents the invocation of subtraction with 6 and 2
45 | two_from_six=subtraction(6,2)
46 |
47 | # eight_divided_by_two represents the invocation of division with 8 and 2
48 | eight_divided_by_two = division(8,2)
49 |
50 | # sum represents the invocation of addition with 5 and 5
51 | sum= addition(5,5)
52 |
53 | # difference represents the invocation of subtraction with 6 and 4
54 | difference = subtraction(6,4)
55 |
56 | # sum_divided_by_difference represents the invocation of division with the sum and the difference
57 | sum_divided_by_difference = division(sum,difference)
58 |
59 | # hello to run before two_plus_two and two_from_six
60 | hello >> two_plus_two
61 | hello >> two_from_six
62 |
63 | # Notice, addition and subtraction can run at the same time
64 |
65 | # two_plus_two to run before eight_divided_by_two
66 | two_plus_two >> eight_divided_by_two
67 |
68 | # two_from_six to run before eight_divided_by_two
69 | two_from_six >> eight_divided_by_two
70 |
71 |
72 | # Notice division waits for subtraction and addition to run
73 |
74 | # sum to run before sum_divided_by_difference
75 | sum >> sum_divided_by_difference
76 |
77 | # difference to run before sum_divided_by_difference
78 | difference >> sum_divided_by_difference
79 |
80 | task_dependencies_dag=task_dependencies()
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/images/airflow_aws_all_together.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/images/airflow_aws_all_together.png
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/images/airflow_component_diagram.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/images/airflow_component_diagram.jpeg
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/images/airflow_data_lineage.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/images/airflow_data_lineage.jpeg
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/images/airflow_instrumentation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/images/airflow_instrumentation.png
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/images/bikeshare_dag_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/images/bikeshare_dag_example.png
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/images/bikeshare_data_lineage.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/images/bikeshare_data_lineage.jpeg
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/images/directed_acyclic_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/images/directed_acyclic_graph.png
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/images/example_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/images/example_pipeline.png
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/images/how-airflow-works.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/images/how-airflow-works.png
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/images/project_dag_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/images/project_dag_sample.png
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/images/scheduling_in_airflow.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/images/scheduling_in_airflow.jpeg
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/project/README.md:
--------------------------------------------------------------------------------
1 | # Project: Data Pipelines with Airflow
2 |
3 | ## Introduction
4 |
5 | A music streaming company, Sparkify, has decided that it is time to introduce more automation and monitoring to its data warehouse ETL pipelines, and has concluded that the best tool to achieve this is Apache Airflow.
6 |
7 | They have decided to bring you into the project and expect you to create high-grade data pipelines that are dynamic, built from reusable tasks, can be monitored, and allow easy backfills. They have also noted that data quality plays a big part when analyses are executed on top of the data warehouse, and they want to run tests against their datasets after the ETL steps have been executed to catch any discrepancies.
8 |
9 | The source data resides in S3 and needs to be processed into Sparkify's data warehouse in Amazon Redshift. The source datasets consist of JSON logs of user activity in the application and JSON metadata about the songs the users listen to.
10 |
11 | ## Project Details
12 |
13 | ### Datasets
14 |
15 | - Log data: ```s3://udacity-dend/log_data```
16 | - Song data: ```s3://udacity-dend/song_data```
17 |
18 | ### Data pipeline with Airflow
19 |
20 | Create custom operators to perform tasks such as staging the data, filling the data warehouse, and running checks on the data as the final step.
21 |
22 | ## Implementation
23 |
24 | ### Copy S3 Data
25 |
26 | - Create S3 bucket
27 |
28 | ```bash
29 | aws s3 mb s3://uc-de-airflow-aws/
30 | ```
31 |
32 | - Copy the data from the Udacity bucket to the home CloudShell directory:
33 |
34 | ```bash
35 | aws s3 cp s3://udacity-dend/log-data/ ~/log-data/ --recursive
36 | aws s3 cp s3://udacity-dend/song-data/ ~/song-data/ --recursive
37 | ```
38 |
39 | - Copy the data from the home CloudShell directory to the required bucket:
40 |
41 | ```bash
42 | aws s3 cp ~/log-data/ s3://uc-de-airflow-aws/log-data/ --recursive
43 | aws s3 cp ~/song-data/ s3://uc-de-airflow-aws/song-data/ --recursive
44 | ```
45 |
46 | - List the data in your own bucket to be sure it copied over
47 |
48 | ```bash
49 | aws s3 ls s3://uc-de-airflow-aws/log-data/
50 | aws s3 ls s3://uc-de-airflow-aws/song-data/
51 | ```
52 |
53 | ### Airflow DAG
54 |
55 |
56 |
57 |
58 |
59 | **Operators**:
60 |
61 | - ```Begin_execution``` and ```End_execution```: dummy operators marking the start and end of the DAG.
62 | - ```Create_tables```: creates the required database tables.
63 | - ```Stage_events``` and ```Stage_songs```: extract data from S3 into Redshift staging tables.
64 | - ```Load_songplays_fact_table```: loads data from the staging tables into the fact table.
65 | - ```Load_user_dim_table```, ```Load_song_dim_table```, ```Load_artist_dim_table``` and ```Load_time_dim_table```: load data from the staging tables into the dimension tables.
66 | - ```Run_data_quality_checks```: runs data quality checks on the loaded tables.
67 |
68 | ## Execution
69 |
70 | 1. Create an S3 bucket and copy the data from the source bucket.
71 | 2. Add the AWS connection info in Airflow via the UI.
72 | 3. Create a Redshift Serverless workgroup and store its connection information in Airflow via the UI.
73 | 4. Run the project DAG and monitor the execution via the Airflow UI.
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/project/dags/common/create_tables.sql:
--------------------------------------------------------------------------------
1 | DROP TABLE IF EXISTS staging_events;
2 | DROP TABLE IF EXISTS staging_songs;
3 | DROP TABLE IF EXISTS songplays;
4 | DROP TABLE IF EXISTS users;
5 | DROP TABLE IF EXISTS songs;
6 | DROP TABLE IF EXISTS artists;
7 | DROP TABLE IF EXISTS time;
8 |
9 | CREATE TABLE IF NOT EXISTS staging_events (
10 | artist VARCHAR,
11 | auth VARCHAR,
12 | firstName VARCHAR,
13 | gender CHAR(1),
14 | itemInSession INTEGER,
15 | lastName VARCHAR,
16 | length FLOAT,
17 | level VARCHAR,
18 | location TEXT,
19 | method VARCHAR,
20 | page VARCHAR,
21 | registration FLOAT,
22 | sessionId INTEGER,
23 | song VARCHAR,
24 | status INTEGER,
25 | ts BIGINT,
26 | userAgent TEXT,
27 | userId INTEGER
28 | );
29 |
30 | CREATE TABLE IF NOT EXISTS staging_songs (
31 | num_songs INTEGER,
32 | artist_id VARCHAR,
33 | artist_name VARCHAR,
34 | artist_latitude FLOAT,
35 | artist_longitude FLOAT,
36 | artist_location TEXT,
37 | song_id VARCHAR,
38 | title VARCHAR,
39 | duration FLOAT,
40 | year INTEGER
41 | );
42 |
43 | CREATE TABLE IF NOT EXISTS songplays (
44 | songplay_id INTEGER IDENTITY(0,1) NOT NULL PRIMARY KEY,
45 | start_time TIMESTAMP,
46 | user_id INTEGER,
47 | level VARCHAR,
48 | song_id VARCHAR,
49 | artist_id VARCHAR,
50 | session_id INTEGER,
51 | location TEXT,
52 | user_agent TEXT
53 | );
54 |
55 | CREATE TABLE IF NOT EXISTS users (
56 | user_id INTEGER NOT NULL PRIMARY KEY,
57 | first_name VARCHAR,
58 | last_name VARCHAR,
59 | gender CHAR(1),
60 | level VARCHAR
61 | );
62 |
63 | CREATE TABLE IF NOT EXISTS songs (
64 | song_id VARCHAR NOT NULL PRIMARY KEY,
65 | title VARCHAR,
66 | artist_id VARCHAR,
67 | year INT,
68 | duration FLOAT
69 | );
70 |
71 | CREATE TABLE IF NOT EXISTS artists (
72 | artist_id VARCHAR NOT NULL PRIMARY KEY,
73 | name VARCHAR,
74 | location TEXT ,
75 | latitude FLOAT ,
76 | longitude FLOAT
77 | );
78 |
79 | CREATE TABLE IF NOT EXISTS time (
80 | start_time TIMESTAMP NOT NULL PRIMARY KEY,
81 | hour INTEGER,
82 | day INTEGER,
83 | week INTEGER,
84 | month INTEGER,
85 | year INTEGER,
86 | weekday VARCHAR
87 | );
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/project/dags/common/sql_statements.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/project/dags/common/sql_statements.py
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/project/dags/project_dag.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pendulum
3 | from datetime import datetime, timedelta
4 | from airflow.decorators import dag
5 | from airflow.operators.dummy_operator import DummyOperator
6 | from airflow.operators.postgres_operator import PostgresOperator
7 | from operators.stage_redshift import StageToRedshiftOperator
8 | from operators.load_fact import LoadFactOperator
9 | from operators.load_dimension import LoadDimensionOperator
10 | from operators.data_quality import DataQualityOperator
11 | from common.sql_statements import SqlQueries
12 |
13 | start_date = datetime(2018, 11, 1)
14 | end_date = datetime(2018, 11, 30)
15 |
16 | s3_bucket = "uc-de-airflow-aws"
17 | events_s3_key = "log-data"
18 | songs_s3_key = "song-data/A/A/"
19 | log_json_file = 'log_json_path.json'
20 |
21 | default_args = {
22 | 'owner': 'Hari',
23 | 'start_date': pendulum.now(),
24 | 'depends_on_past': False,
25 | 'retries': 3,
26 | 'retry_delay': timedelta(minutes=5),
27 | 'catchup': False,
28 | 'email_on_retry': False
29 | }
30 |
31 | @dag(
32 | default_args=default_args,
33 | description='Load and transform data in Redshift with Airflow',
34 | schedule_interval='0 * * * *'
35 | )
36 | def final_project():
37 |
38 | start_operator = DummyOperator(task_id='Begin_execution')
39 |
40 | create_redshift_tables = PostgresOperator(
41 | task_id='Create_tables',
42 | postgres_conn_id="redshift",
43 | sql='common/create_tables.sql'
44 | )
45 |
46 | stage_events_to_redshift = StageToRedshiftOperator(
47 | task_id='Stage_events',
48 | table="staging_events",
49 | redshift_conn_id="redshift",
50 | aws_credentials_id="aws_credentials",
51 | s3_bucket=s3_bucket,
52 | s3_key=events_s3_key,
53 | log_json_file=log_json_file
54 | )
55 |
56 | # s3_key="{events_s3_key}/{execution_date.year}/{execution_date.month}/{ds}-events.json"
57 |
58 | stage_songs_to_redshift = StageToRedshiftOperator(
59 | task_id='Stage_songs',
60 | table="staging_songs",
61 | redshift_conn_id="redshift",
62 | aws_credentials_id="aws_credentials",
63 | s3_bucket=s3_bucket,
64 | s3_key=songs_s3_key
65 | )
66 |
67 | load_songplays_table = LoadFactOperator(
68 | task_id='Load_songplays_fact_table',
69 | redshift_conn_id="redshift",
70 | sql_query=SqlQueries.songplay_table_insert
71 | )
72 |
73 | load_user_dimension_table = LoadDimensionOperator(
74 | task_id='Load_user_dim_table',
75 | redshift_conn_id="redshift",
76 | sql_query=SqlQueries.user_table_insert
77 | )
78 |
79 | load_song_dimension_table = LoadDimensionOperator(
80 | task_id='Load_song_dim_table',
81 | redshift_conn_id="redshift",
82 | sql_query=SqlQueries.song_table_insert
83 | )
84 |
85 | load_artist_dimension_table = LoadDimensionOperator(
86 | task_id='Load_artist_dim_table',
87 | redshift_conn_id="redshift",
88 | sql_query=SqlQueries.artist_table_insert
89 | )
90 |
91 | load_time_dimension_table = LoadDimensionOperator(
92 | task_id='Load_time_dim_table',
93 | redshift_conn_id="redshift",
94 | sql_query=SqlQueries.time_table_insert
95 | )
96 |
97 | run_quality_checks = DataQualityOperator(
98 | task_id='Run_data_quality_checks',
99 | redshift_conn_id = "redshift",
100 | tables = ["songplays", "users", "songs", "artists", "time"]
101 | )
102 |
103 | end_operator = DummyOperator(task_id='End_execution')
104 |
105 | start_operator >> create_redshift_tables >> [stage_events_to_redshift, stage_songs_to_redshift] >> \
106 | load_songplays_table >> [load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table] >> \
107 | run_quality_checks >> end_operator
108 |
109 | final_project_dag = final_project()
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/project/images/airflow_project_dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lal4Tech/Data-Engineering-With-AWS/bb7950be668f1f62ac14ae42707875fa1286ef6c/4_Automate_Data_Pipelines/project/images/airflow_project_dag.png
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/project/plugins/operators/data_quality.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.models import BaseOperator
3 | from airflow.utils.decorators import apply_defaults
4 |
5 | class DataQualityOperator(BaseOperator):
6 |
7 | ui_color = '#89DA59'
8 |
9 | @apply_defaults
10 | def __init__(self,
11 | redshift_conn_id,
12 | tables,
13 | *args, **kwargs):
14 |
15 | super(DataQualityOperator, self).__init__(*args, **kwargs)
16 | self.redshift_conn_id = redshift_conn_id
17 | self.tables = tables
18 |
19 | def execute(self, context):
20 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
21 | for table in self.tables:
22 | self.log.info(f"Running Data Quality checks on table: {table}")
23 | records = redshift.get_records(f"SELECT COUNT(*) FROM {table};")
24 | #self.log.info(f"len(records): {len(records)}")
25 | #self.log.info(f"len(records[0]): {len(records[0])}")
26 | #self.log.info(f"len(records[0][0] ): {records[0][0]}")
27 | if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1:
28 | raise ValueError(f"{table} contained 0 rows")
29 |
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/project/plugins/operators/load_dimension.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.models import BaseOperator
3 | from airflow.utils.decorators import apply_defaults
4 |
5 | class LoadDimensionOperator(BaseOperator):
6 |
7 | ui_color = '#80BD9E'
8 |
9 | @apply_defaults
10 | def __init__(self,
11 | redshift_conn_id="",
12 | sql_query="",
13 | *args, **kwargs):
14 |
15 | super(LoadDimensionOperator, self).__init__(*args, **kwargs)
16 | self.redshift_conn_id = redshift_conn_id
17 | self.sql_query = sql_query
18 |
19 | def execute(self, context):
20 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
21 | self.log.info("Load data from staging to dimension table")
22 | redshift.run(self.sql_query)
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/project/plugins/operators/load_fact.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.models import BaseOperator
3 | from airflow.utils.decorators import apply_defaults
4 |
5 | class LoadFactOperator(BaseOperator):
6 |
7 | ui_color = '#F98866'
8 |
9 | @apply_defaults
10 | def __init__(self,
11 | redshift_conn_id="",
12 | sql_query="",
13 | *args, **kwargs):
14 |
15 | super(LoadFactOperator, self).__init__(*args, **kwargs)
16 | self.redshift_conn_id = redshift_conn_id
17 | self.sql_query = sql_query
18 |
19 | def execute(self, context):
20 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
21 | self.log.info("Load data from staging to Fact table")
22 | redshift.run(self.sql_query)
23 |
--------------------------------------------------------------------------------
/4_Automate_Data_Pipelines/project/plugins/operators/stage_redshift.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.models import BaseOperator
3 | from airflow.utils.decorators import apply_defaults
4 | from airflow.secrets.metastore import MetastoreBackend
5 |
6 | class StageToRedshiftOperator(BaseOperator):
7 | ui_color = '#358140'
8 |
9 | template_fields = ('s3_key',)
10 |
11 | copy_sql = """
12 | COPY {}
13 | FROM '{}'
14 | ACCESS_KEY_ID '{}'
15 | SECRET_ACCESS_KEY '{}'
16 | FORMAT AS json '{}';
17 | """
18 |
19 | @apply_defaults
20 | def __init__(self,
21 | redshift_conn_id="",
22 | aws_credentials_id="",
23 | table="",
24 | s3_bucket="",
25 | s3_key="",
26 | log_json_file="",
27 | *args, **kwargs):
28 |
29 | super(StageToRedshiftOperator, self).__init__(*args, **kwargs)
30 | self.table = table
31 | self.redshift_conn_id = redshift_conn_id
32 | self.s3_bucket = s3_bucket
33 | self.s3_key = s3_key
34 | self.log_json_file = log_json_file
35 | self.aws_credentials_id = aws_credentials_id
36 | self.execution_date = kwargs.get('execution_date')
37 |
38 |
39 | def execute(self, context):
40 | metastoreBackend = MetastoreBackend()
41 | aws_connection=metastoreBackend.get_connection(self.aws_credentials_id)
42 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
43 |
44 | self.log.info("Clearing data from destination Redshift table")
45 | redshift.run("DELETE FROM {}".format(self.table))
46 |
47 | self.log.info("Copying data from S3 to Redshift")
48 | rendered_key = self.s3_key.format(**context)
49 | s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
50 |
51 |
52 | if self.log_json_file != "":
53 | self.log_json_file = "s3://{}/{}".format(self.s3_bucket, self.log_json_file)
54 | formatted_sql = StageToRedshiftOperator.copy_sql.format(
55 | self.table,
56 | s3_path,
57 | aws_connection.login,
58 | aws_connection.password,
59 | self.log_json_file
60 | )
61 | else:
62 | formatted_sql = StageToRedshiftOperator.copy_sql.format(
63 | self.table,
64 | s3_path,
65 | aws_connection.login,
66 | aws_connection.password,
67 | 'auto'
68 | )
69 |
70 | redshift.run(formatted_sql)
71 | self.log.info(f"Successfully copied to Redshift table {self.table}")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data-Engineering-With-AWS
2 | Resources and projects from the Udacity Data Engineering with AWS Nanodegree program.
3 |
4 | ## Projects
5 |
6 | ### Data Modelling
7 |
8 | [Data modeling with Apache Cassandra](1_Data_Modelling/project/data_modelling_project.ipynb)
9 |
10 | In this project,
11 |
12 | - Apply concepts learned on data modeling with Apache Cassandra and complete an ETL pipeline using Python.
13 | - Model the data by creating tables in Apache Cassandra to run queries.
14 |
15 | ### Cloud Data Warehouses
16 |
17 | [Data warehousing with AWS Redshift](2_Cloud_Data_Warehouses/project/README.md)
18 |
19 | In this project,
20 |
21 | - Apply concepts on data warehouses and AWS to build an ETL pipeline for a database hosted on Redshift.
22 | - To complete the project, you need to load data from S3 into staging tables on Redshift and execute SQL statements that create the analytics tables from these staging tables.
23 |
24 | ### Spark and Data Lakes
25 |
26 | [STEDI Human Balance Analytics](3_Spark_and_Data_Lakes/project/README.md)
27 |
28 | In this project,
29 |
30 | - Use Spark and AWS Glue to process data from multiple sources, categorize the data, and curate it to be queried in the future for multiple purposes.
31 | - Build a data lakehouse solution for sensor data that trains a machine learning model.
32 |
33 | ### Automate Data Pipelines
34 |
35 | [Data Pipelines with Airflow](4_Automate_Data_Pipelines/project/README.md)
36 |
37 | In this project,
38 |
39 | - Use Airflow to create high-grade data pipelines that are dynamic, built from reusable tasks, can be monitored, and allow easy backfills.
40 | - Create custom operators to perform tasks such as staging the data, filling the data warehouse, and running checks on the data as the final step.
41 |
--------------------------------------------------------------------------------