├── Data Model Exercise Part 1.ipynb
├── README.md
└── datawarehouse.sql
/Data Model Exercise Part 1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "56dc665d",
6 | "metadata": {},
7 | "source": [
8 | "## Lesson 1 Exercise 1: Creating a Table with PostgreSQL"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "id": "17f92a9f",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "!pip install psycopg2"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "id": "46004eee",
24 | "metadata": {},
25 | "source": [
26 | "## Import the library"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 1,
32 | "id": "8a148d09",
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "import psycopg2"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "id": "00f7a6a2",
42 | "metadata": {},
43 | "source": [
44 | "## Create a connection to the database"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "id": "c2191466",
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "try: \n",
55 | " conn = psycopg2.connect(\"host=127.0.0.1 dbname=postgres user=postgres password=root\")\n",
56 | "except psycopg2.Error as e: \n",
57 | " print(\"Error: Could not make connection to the Postgres database\")\n",
58 | " print(e)"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "id": "1055bff6",
64 | "metadata": {},
65 | "source": [
66 | "## Use the connection to get a cursor that can be used to execute queries."
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 4,
72 | "id": "2a893200",
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "try: \n",
77 | " cur = conn.cursor()\n",
78 | "except psycopg2.Error as e: \n",
79 | " print(\"Error: Could not get curser to the Database\")\n",
80 | " print(e)"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "id": "d779c966",
86 | "metadata": {},
87 | "source": [
88 | "## Set automatic commit to be true so that each action is committed without having to call conn.commit() after each command."
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 5,
94 | "id": "13305c08",
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "conn.set_session(autocommit=True)"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "id": "c8e46fa3",
104 | "metadata": {},
105 | "source": [
106 | "## Create a database to do the work in"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 6,
112 | "id": "7063598d",
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "name": "stdout",
117 | "output_type": "stream",
118 | "text": [
119 | "database \"myfirstdb\" already exists\n",
120 | "\n"
121 | ]
122 | }
123 | ],
124 | "source": [
125 | "try: \n",
126 | " cur.execute(\"create database myfirstdb\")\n",
127 | "except psycopg2.Error as e:\n",
128 | " print(e)"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "id": "7fb0a512",
134 | "metadata": {},
135 | "source": [
136 | "## Add the database name in the connect statement. Let's close our connection to the default database, reconnect to the Udacity database, and get a new cursor."
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 7,
142 | "id": "9fd39615",
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "try: \n",
147 | " conn.close()\n",
148 | "except psycopg2.Error as e:\n",
149 | " print(e)\n",
150 | " \n",
151 | "try: \n",
152 | " conn = psycopg2.connect(\"host=127.0.0.1 dbname=myfirstdb user=postgres password=root\")\n",
153 | "except psycopg2.Error as e: \n",
154 | " print(\"Error: Could not make connection to the Postgres database\")\n",
155 | " print(e)\n",
156 | " \n",
157 | "try: \n",
158 | " cur = conn.cursor()\n",
159 | "except psycopg2.Error as e: \n",
160 | " print(\"Error: Could not get curser to the Database\")\n",
161 | " print(e)\n",
162 | "\n",
163 | "conn.set_session(autocommit=True)"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "id": "530a7702",
169 | "metadata": {},
170 | "source": [
171 | "## Create Table for students which includes below columns\n",
172 | "\n",
173 | "student_id
\n",
174 | "name
\n",
175 | "age
\n",
176 | "gender
\n",
177 | "subject
\n",
178 | "marks"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 8,
184 | "id": "25549b7f",
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "try: \n",
189 | " cur.execute(\"CREATE TABLE IF NOT EXISTS students (student_id int, name varchar,\\\n",
190 | " age int, gender varchar, subject varchar, marks int);\")\n",
191 | "except psycopg2.Error as e: \n",
192 | " print(\"Error: Issue creating table\")\n",
193 | " print (e)"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "id": "807ee3b5",
199 | "metadata": {},
200 | "source": [
201 | "### Insert the following two rows in the table\n",
202 | "First Row: 1, \"Raj\", 23, \"Male\", \"Python\", 85\n",
203 | "\n",
204 | "Second Row: 2, \"Priya\", 22, \"Female\", \"Python\", 86"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 9,
210 | "id": "b06a6be8",
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "try: \n",
215 | " cur.execute(\"INSERT INTO students (student_id, name, age, gender, subject, marks) \\\n",
216 | " VALUES (%s, %s, %s, %s, %s, %s)\", \\\n",
217 | " (1, \"Raj\", 23, \"Male\", \"Python\", 85))\n",
218 | "except psycopg2.Error as e: \n",
219 | " print(\"Error: Inserting Rows\")\n",
220 | " print (e)\n",
221 | " \n",
222 | "try: \n",
223 | " cur.execute(\"INSERT INTO students (student_id, name, age, gender, subject, marks) \\\n",
224 | " VALUES (%s, %s, %s, %s, %s, %s)\",\n",
225 | " ( 2, \"Priya\", 22, \"Female\", \"Python\", 86))\n",
226 | "except psycopg2.Error as e: \n",
227 | " print(\"Error: Inserting Rows\")\n",
228 | " print (e)"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "id": "830f62a8",
234 | "metadata": {},
235 | "source": [
236 | "## Validate your data was inserted into the table."
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 10,
242 | "id": "39875456",
243 | "metadata": {},
244 | "outputs": [
245 | {
246 | "name": "stdout",
247 | "output_type": "stream",
248 | "text": [
249 | "(1, 'Raj', 23, 'Male', 'Python', 85)\n",
250 | "(2, 'Priya', 22, 'Female', 'Python', 86)\n"
251 | ]
252 | }
253 | ],
254 | "source": [
255 | "try: \n",
256 | " cur.execute(\"SELECT * FROM students;\")\n",
257 | "except psycopg2.Error as e: \n",
258 | " print(\"Error: select *\")\n",
259 | " print (e)\n",
260 | "\n",
261 | "row = cur.fetchone()\n",
262 | "while row:\n",
263 | " print(row)\n",
264 | " row = cur.fetchone()"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "id": "7dc5483a",
270 | "metadata": {},
271 | "source": [
272 | "## And finally close your cursor and connection."
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 11,
278 | "id": "a95e6d61",
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "cur.close()\n",
283 | "conn.close()"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": null,
289 | "id": "25622897",
290 | "metadata": {},
291 | "outputs": [],
292 | "source": []
293 | }
294 | ],
295 | "metadata": {
296 | "kernelspec": {
297 | "display_name": "Python 3 (ipykernel)",
298 | "language": "python",
299 | "name": "python3"
300 | },
301 | "language_info": {
302 | "codemirror_mode": {
303 | "name": "ipython",
304 | "version": 3
305 | },
306 | "file_extension": ".py",
307 | "mimetype": "text/x-python",
308 | "name": "python",
309 | "nbconvert_exporter": "python",
310 | "pygments_lexer": "ipython3",
311 | "version": "3.7.9"
312 | }
313 | },
314 | "nbformat": 4,
315 | "nbformat_minor": 5
316 | }
317 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data-Engineer-Tutorial-Series
--------------------------------------------------------------------------------
/datawarehouse.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE dimDate
2 | (
3 | date_key integer NOT NULL PRIMARY KEY,
4 | date date NOT NULL,
5 | year smallint NOT NULL,
6 | quarter smallint NOT NULL,
7 | month smallint NOT NULL,
8 | day smallint NOT NULL,
9 | week smallint NOT NULL,
10 | is_weekend boolean
11 | );
12 |
13 |
14 | INSERT INTO dimDate (date_key, date, year, quarter, month, day, week, is_weekend)
15 | SELECT DISTINCT(TO_CHAR(payment_date :: DATE, 'yyyyMMDD')::integer) AS date_key,
16 | date(payment_date) AS date,
17 | EXTRACT(year FROM payment_date) AS year,
18 | EXTRACT(quarter FROM payment_date) AS quarter,
19 | EXTRACT(month FROM payment_date) AS month,
20 | EXTRACT(day FROM payment_date) AS day,
21 | EXTRACT(week FROM payment_date) AS week,
22 | CASE WHEN EXTRACT(ISODOW FROM payment_date) IN (6, 7) THEN true ELSE false END AS is_weekend
23 | FROM payment;
24 |
25 | DROP TABLE IF EXISTS dimCustomer;
26 | CREATE TABLE dimCustomer
27 | (
28 | customer_key SERIAL PRIMARY KEY,
29 | customer_id smallint NOT NULL,
30 | first_name varchar(45) NOT NULL,
31 | last_name varchar(45) NOT NULL,
32 | email varchar(50),
33 | address varchar(50) NOT NULL,
34 | address2 varchar(50),
35 | district varchar(20) NOT NULL,
36 | city varchar(50) NOT NULL,
37 | country varchar(50) NOT NULL,
38 | postal_code varchar(10),
39 | phone varchar(20) NOT NULL,
40 | active smallint NOT NULL,
41 | create_date timestamp NOT NULL,
42 | start_date date NOT NULL,
43 | end_date date NOT NULL
44 | );
45 |
46 | INSERT INTO dimCustomer (customer_key, customer_id, first_name, last_name, email, address,
47 | address2, district, city, country, postal_code, phone, active,
48 | create_date, start_date, end_date)
49 | SELECT c.customer_id as customer_key,
50 | c.customer_id,
51 | c.first_name,
52 | c.last_name,
53 | c.email,
54 | a.address,
55 | a.address2,
56 | a.district,
57 | ci.city,
58 | co.country,
59 | postal_code,
60 | a.phone,
61 | c.active,
62 | c.create_date,
63 | now() AS start_date,
64 | now() AS end_date
65 | FROM customer c
66 | JOIN address a ON (c.address_id = a.address_id)
67 | JOIN city ci ON (a.city_id = ci.city_id)
68 | JOIN country co ON (ci.country_id = co.country_id);
69 |
70 |
71 | CREATE TABLE dimMovie
72 | (
73 | movie_key SERIAL PRIMARY KEY,
74 | film_id smallint NOT NULL,
75 | title varchar(255) NOT NULL,
76 | description text,
77 | release_year year,
78 | language varchar(20) NOT NULL,
79 | original_language varchar(20),
80 | rental_duration smallint NOT NULL,
81 | length smallint NOT NULL,
82 | rating varchar(5) NOT NULL,
83 | special_features varchar(60) NOT NULL
84 | );
85 |
86 |
87 |
88 | INSERT INTO dimMovie (movie_key, film_id, title, description, release_year, language, original_language, rental_duration, length, rating, special_features)
89 | SELECT
90 | f.film_id as movie_key,
91 | f.film_id,
92 | f.title,
93 | f.description,
94 | f.release_year,
95 | l.name as language,
96 | orig_lang.name AS original_language,
97 | f.rental_duration,
98 | f.length,
99 | f.rating,
100 | f.special_features
101 | FROM film f
102 | JOIN language l ON (f.language_id=l.language_id)
103 | LEFT JOIN language orig_lang ON (f.language_id = orig_lang.language_id);
104 |
105 | CREATE TABLE dimStore
106 | (
107 | store_key SERIAL PRIMARY KEY,
108 | store_id smallint NOT NULL,
109 | address varchar(50) NOT NULL,
110 | address2 varchar(50),
111 | district varchar(20) NOT NULL,
112 | city varchar(50) NOT NULL,
113 | country varchar(50) NOT NULL,
114 | postal_code varchar(10),
115 | manager_first_name varchar(45) NOT NULL,
116 | manager_last_name varchar(45) NOT NULL,
117 | start_date date NOT NULL,
118 | end_date date NOT NULL
119 | );
120 |
121 | INSERT INTO dimStore (store_key, store_id, address, address2, district, city, country, postal_code, manager_first_name, manager_last_name, start_date, end_date)
122 | SELECT
123 | s.store_id as store_key,
124 | s.store_id,
125 | a.address,
126 | a.address2,
127 | a.district,
128 | c.city,
129 | co.country,
130 | a.postal_code,
131 | st.first_name as manager_first_name,
132 | st.last_name as manager_last_name,
133 | now() as start_date,
134 | now() as end_date
135 | FROM store s
136 | JOIN staff st ON (s.manager_staff_id = st.staff_id)
137 | JOIN address a ON (s.address_id = a.address_id)
138 | JOIN city c ON (a.city_id = c.city_id)
139 | JOIN country co ON (c.country_id = co.country_id);
140 |
141 |
142 | DROP TABLE IF EXISTS factSales;
143 | CREATE TABLE factSales
144 | (
145 | sales_key SERIAL PRIMARY KEY,
146 | date_key integer REFERENCES dimDate (date_key),
147 | customer_key integer REFERENCES dimCustomer (customer_key),
148 | movie_key integer REFERENCES dimMovie (movie_key),
149 | store_key integer REFERENCES dimStore (store_key),
150 | sales_amount numeric
151 | );
152 |
153 | INSERT INTO factSales (date_key, customer_key, movie_key, store_key, sales_amount)
154 | SELECT
155 | TO_CHAR(payment_date :: DATE, 'yyyyMMDD')::integer AS date_key,
156 | p.customer_id as customer_key,
157 | i.film_id as movie_key,
158 | i.store_id as store_key,
159 | p.amount as sales_amount
160 | FROM payment p
161 | JOIN rental r ON (p.rental_id = r.rental_id)
162 | JOIN inventory i ON (r.inventory_id = i.inventory_id);
163 |
164 |
165 |
166 |
167 |
168 |
169 | -- start schema
170 | SELECT dimMovie.title, dimDate.month, dimCustomer.city, sum(sales_amount) as revenue
171 | FROM factSales
172 | JOIN dimMovie on (dimMovie.movie_key = factSales.movie_key)
173 | JOIN dimDate on (dimDate.date_key = factSales.date_key)
174 | JOIN dimCustomer on (dimCustomer.customer_key = factSales.customer_key)
175 | group by (dimMovie.title, dimDate.month, dimCustomer.city)
176 | order by dimMovie.title, dimDate.month, dimCustomer.city, revenue desc;
177 |
178 |
179 | -- 3nf
180 | SELECT f.title, EXTRACT(month FROM p.payment_date) as month, ci.city, sum(p.amount) as revenue
181 | FROM payment p
182 | JOIN rental r ON ( p.rental_id = r.rental_id )
183 | JOIN inventory i ON ( r.inventory_id = i.inventory_id )
184 | JOIN film f ON ( i.film_id = f.film_id)
185 | JOIN customer c ON ( p.customer_id = c.customer_id )
186 | JOIN address a ON ( c.address_id = a.address_id )
187 | JOIN city ci ON ( a.city_id = ci.city_id )
188 | group by (f.title, month, ci.city)
189 | order by f.title, month, ci.city, revenue desc;
--------------------------------------------------------------------------------