├── Data Model Exercise Part 1.ipynb ├── README.md └── datawarehouse.sql /Data Model Exercise Part 1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "56dc665d", 6 | "metadata": {}, 7 | "source": [ 8 | "## Lesson 1 Exercise 1: Creating a Table with PostgreSQL" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "17f92a9f", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "!pip install psycopg2" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "46004eee", 24 | "metadata": {}, 25 | "source": [ 26 | "## Import the library" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "id": "8a148d09", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import psycopg2" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "00f7a6a2", 42 | "metadata": {}, 43 | "source": [ 44 | "## Create a connection to the database" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "id": "c2191466", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "try: \n", 55 | " conn = psycopg2.connect(\"host=127.0.0.1 dbname=postgres user=postgres password=root\")\n", 56 | "except psycopg2.Error as e: \n", 57 | " print(\"Error: Could not make connection to the Postgres database\")\n", 58 | " print(e)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "1055bff6", 64 | "metadata": {}, 65 | "source": [ 66 | "## Use the connection to get a cursor that can be used to execute queries." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "id": "2a893200", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "try: \n", 77 | " cur = conn.cursor()\n", 78 | "except psycopg2.Error as e: \n", 79 | " print(\"Error: Could not get curser to the Database\")\n", 80 | " print(e)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "d779c966", 86 | "metadata": {}, 87 | "source": [ 88 | "## Set automatic commit to be true so that each action is committed without having to call conn.commit() after each command." 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "id": "13305c08", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "conn.set_session(autocommit=True)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "c8e46fa3", 104 | "metadata": {}, 105 | "source": [ 106 | "## Create a database to do the work in" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 6, 112 | "id": "7063598d", 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "database \"myfirstdb\" already exists\n", 120 | "\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "try: \n", 126 | " cur.execute(\"create database myfirstdb\")\n", 127 | "except psycopg2.Error as e:\n", 128 | " print(e)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "7fb0a512", 134 | "metadata": {}, 135 | "source": [ 136 | "## Add the database name in the connect statement. Let's close our connection to the default database, reconnect to the Udacity database, and get a new cursor." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 7, 142 | "id": "9fd39615", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "try: \n", 147 | " conn.close()\n", 148 | "except psycopg2.Error as e:\n", 149 | " print(e)\n", 150 | " \n", 151 | "try: \n", 152 | " conn = psycopg2.connect(\"host=127.0.0.1 dbname=myfirstdb user=postgres password=root\")\n", 153 | "except psycopg2.Error as e: \n", 154 | " print(\"Error: Could not make connection to the Postgres database\")\n", 155 | " print(e)\n", 156 | " \n", 157 | "try: \n", 158 | " cur = conn.cursor()\n", 159 | "except psycopg2.Error as e: \n", 160 | " print(\"Error: Could not get curser to the Database\")\n", 161 | " print(e)\n", 162 | "\n", 163 | "conn.set_session(autocommit=True)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "530a7702", 169 | "metadata": {}, 170 | "source": [ 171 | "## Create Table for students which includes below columns\n", 172 | "\n", 173 | "student_id
\n", 174 | "name
\n", 175 | "age
\n", 176 | "gender
\n", 177 | "subject
\n", 178 | "marks" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 8, 184 | "id": "25549b7f", 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "try: \n", 189 | " cur.execute(\"CREATE TABLE IF NOT EXISTS students (student_id int, name varchar,\\\n", 190 | " age int, gender varchar, subject varchar, marks int);\")\n", 191 | "except psycopg2.Error as e: \n", 192 | " print(\"Error: Issue creating table\")\n", 193 | " print (e)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "id": "807ee3b5", 199 | "metadata": {}, 200 | "source": [ 201 | "### Insert the following two rows in the table\n", 202 | "First Row: 1, \"Raj\", 23, \"Male\", \"Python\", 85\n", 203 | "\n", 204 | "Second Row: 2, \"Priya\", 22, \"Female\", \"Python\", 86" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 9, 210 | "id": "b06a6be8", 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "try: \n", 215 | " cur.execute(\"INSERT INTO students (student_id, name, age, gender, subject, marks) \\\n", 216 | " VALUES (%s, %s, %s, %s, %s, %s)\", \\\n", 217 | " (1, \"Raj\", 23, \"Male\", \"Python\", 85))\n", 218 | "except psycopg2.Error as e: \n", 219 | " print(\"Error: Inserting Rows\")\n", 220 | " print (e)\n", 221 | " \n", 222 | "try: \n", 223 | " cur.execute(\"INSERT INTO students (student_id, name, age, gender, subject, marks) \\\n", 224 | " VALUES (%s, %s, %s, %s, %s, %s)\",\n", 225 | " ( 2, \"Priya\", 22, \"Female\", \"Python\", 86))\n", 226 | "except psycopg2.Error as e: \n", 227 | " print(\"Error: Inserting Rows\")\n", 228 | " print (e)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "id": "830f62a8", 234 | "metadata": {}, 235 | "source": [ 236 | "## Validate your data was inserted into the table." 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 10, 242 | "id": "39875456", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "(1, 'Raj', 23, 'Male', 'Python', 85)\n", 250 | "(2, 'Priya', 22, 'Female', 'Python', 86)\n" 251 | ] 252 | } 253 | ], 254 | "source": [ 255 | "try: \n", 256 | " cur.execute(\"SELECT * FROM students;\")\n", 257 | "except psycopg2.Error as e: \n", 258 | " print(\"Error: select *\")\n", 259 | " print (e)\n", 260 | "\n", 261 | "row = cur.fetchone()\n", 262 | "while row:\n", 263 | " print(row)\n", 264 | " row = cur.fetchone()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "id": "7dc5483a", 270 | "metadata": {}, 271 | "source": [ 272 | "## And finally close your cursor and connection." 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 11, 278 | "id": "a95e6d61", 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "cur.close()\n", 283 | "conn.close()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "id": "25622897", 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [] 293 | } 294 | ], 295 | "metadata": { 296 | "kernelspec": { 297 | "display_name": "Python 3 (ipykernel)", 298 | "language": "python", 299 | "name": "python3" 300 | }, 301 | "language_info": { 302 | "codemirror_mode": { 303 | "name": "ipython", 304 | "version": 3 305 | }, 306 | "file_extension": ".py", 307 | "mimetype": "text/x-python", 308 | "name": "python", 309 | "nbconvert_exporter": "python", 310 | "pygments_lexer": "ipython3", 311 | "version": "3.7.9" 312 | } 313 | }, 314 | "nbformat": 4, 315 | "nbformat_minor": 5 316 | } 317 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data-Engineer-Tutorial-Series -------------------------------------------------------------------------------- /datawarehouse.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE dimDate 2 | ( 3 | date_key integer NOT NULL PRIMARY KEY, 4 | date date NOT NULL, 5 | year smallint NOT NULL, 6 | quarter smallint NOT NULL, 7 | month smallint NOT NULL, 8 | day smallint NOT NULL, 9 | week smallint NOT NULL, 10 | is_weekend boolean 11 | ); 12 | 13 | 14 | INSERT INTO dimDate (date_key, date, year, quarter, month, day, week, is_weekend) 15 | SELECT DISTINCT(TO_CHAR(payment_date :: DATE, 'yyyyMMDD')::integer) AS date_key, 16 | date(payment_date) AS date, 17 | EXTRACT(year FROM payment_date) AS year, 18 | EXTRACT(quarter FROM payment_date) AS quarter, 19 | EXTRACT(month FROM payment_date) AS month, 20 | EXTRACT(day FROM payment_date) AS day, 21 | EXTRACT(week FROM payment_date) AS week, 22 | CASE WHEN EXTRACT(ISODOW FROM payment_date) IN (6, 7) THEN true ELSE false END AS is_weekend 23 | FROM payment; 24 | 25 | DROP TABLE IF EXISTS dimCustomer; 26 | CREATE TABLE dimCustomer 27 | ( 28 | customer_key SERIAL PRIMARY KEY, 29 | customer_id smallint NOT NULL, 30 | first_name varchar(45) NOT NULL, 31 | last_name varchar(45) NOT NULL, 32 | email varchar(50), 33 | address varchar(50) NOT NULL, 34 | address2 varchar(50), 35 | district varchar(20) NOT NULL, 36 | city varchar(50) NOT NULL, 37 | country varchar(50) NOT NULL, 38 | postal_code varchar(10), 39 | phone varchar(20) NOT NULL, 40 | active smallint NOT NULL, 41 | create_date timestamp NOT NULL, 42 | start_date date NOT NULL, 43 | end_date date NOT NULL 44 | ); 45 | 46 | INSERT INTO dimCustomer (customer_key, customer_id, first_name, last_name, email, address, 47 | address2, district, city, country, postal_code, phone, active, 48 | create_date, start_date, end_date) 49 | SELECT c.customer_id as customer_key, 50 | c.customer_id, 51 | c.first_name, 52 | c.last_name, 53 | c.email, 54 | a.address, 55 | a.address2, 56 | a.district, 57 | ci.city, 58 | co.country, 59 | postal_code, 60 | a.phone, 61 | c.active, 62 | c.create_date, 63 | now() AS start_date, 64 | now() AS end_date 65 | FROM customer c 66 | JOIN address a ON (c.address_id = a.address_id) 67 | JOIN city ci ON (a.city_id = ci.city_id) 68 | JOIN country co ON (ci.country_id = co.country_id); 69 | 70 | 71 | CREATE TABLE dimMovie 72 | ( 73 | movie_key SERIAL PRIMARY KEY, 74 | film_id smallint NOT NULL, 75 | title varchar(255) NOT NULL, 76 | description text, 77 | release_year year, 78 | language varchar(20) NOT NULL, 79 | original_language varchar(20), 80 | rental_duration smallint NOT NULL, 81 | length smallint NOT NULL, 82 | rating varchar(5) NOT NULL, 83 | special_features varchar(60) NOT NULL 84 | ); 85 | 86 | 87 | 88 | INSERT INTO dimMovie (movie_key, film_id, title, description, release_year, language, original_language, rental_duration, length, rating, special_features) 89 | SELECT 90 | f.film_id as movie_key, 91 | f.film_id, 92 | f.title, 93 | f.description, 94 | f.release_year, 95 | l.name as language, 96 | orig_lang.name AS original_language, 97 | f.rental_duration, 98 | f.length, 99 | f.rating, 100 | f.special_features 101 | FROM film f 102 | JOIN language l ON (f.language_id=l.language_id) 103 | LEFT JOIN language orig_lang ON (f.language_id = orig_lang.language_id); 104 | 105 | CREATE TABLE dimStore 106 | ( 107 | store_key SERIAL PRIMARY KEY, 108 | store_id smallint NOT NULL, 109 | address varchar(50) NOT NULL, 110 | address2 varchar(50), 111 | district varchar(20) NOT NULL, 112 | city varchar(50) NOT NULL, 113 | country varchar(50) NOT NULL, 114 | postal_code varchar(10), 115 | manager_first_name varchar(45) NOT NULL, 116 | manager_last_name varchar(45) NOT NULL, 117 | start_date date NOT NULL, 118 | end_date date NOT NULL 119 | ); 120 | 121 | INSERT INTO dimStore (store_key, store_id, address, address2, district, city, country, postal_code, manager_first_name, manager_last_name, start_date, end_date) 122 | SELECT 123 | s.store_id as store_key, 124 | s.store_id, 125 | a.address, 126 | a.address2, 127 | a.district, 128 | c.city, 129 | co.country, 130 | a.postal_code, 131 | st.first_name as manager_first_name, 132 | st.last_name as manager_last_name, 133 | now() as start_date, 134 | now() as end_date 135 | FROM store s 136 | JOIN staff st ON (s.manager_staff_id = st.staff_id) 137 | JOIN address a ON (s.address_id = a.address_id) 138 | JOIN city c ON (a.city_id = c.city_id) 139 | JOIN country co ON (c.country_id = co.country_id); 140 | 141 | 142 | DROP TABLE IF EXISTS factSales; 143 | CREATE TABLE factSales 144 | ( 145 | sales_key SERIAL PRIMARY KEY, 146 | date_key integer REFERENCES dimDate (date_key), 147 | customer_key integer REFERENCES dimCustomer (customer_key), 148 | movie_key integer REFERENCES dimMovie (movie_key), 149 | store_key integer REFERENCES dimStore (store_key), 150 | sales_amount numeric 151 | ); 152 | 153 | INSERT INTO factSales (date_key, customer_key, movie_key, store_key, sales_amount) 154 | SELECT 155 | TO_CHAR(payment_date :: DATE, 'yyyyMMDD')::integer AS date_key, 156 | p.customer_id as customer_key, 157 | i.film_id as movie_key, 158 | i.store_id as store_key, 159 | p.amount as sales_amount 160 | FROM payment p 161 | JOIN rental r ON (p.rental_id = r.rental_id) 162 | JOIN inventory i ON (r.inventory_id = i.inventory_id); 163 | 164 | 165 | 166 | 167 | 168 | 169 | -- start schema 170 | SELECT dimMovie.title, dimDate.month, dimCustomer.city, sum(sales_amount) as revenue 171 | FROM factSales 172 | JOIN dimMovie on (dimMovie.movie_key = factSales.movie_key) 173 | JOIN dimDate on (dimDate.date_key = factSales.date_key) 174 | JOIN dimCustomer on (dimCustomer.customer_key = factSales.customer_key) 175 | group by (dimMovie.title, dimDate.month, dimCustomer.city) 176 | order by dimMovie.title, dimDate.month, dimCustomer.city, revenue desc; 177 | 178 | 179 | -- 3nf 180 | SELECT f.title, EXTRACT(month FROM p.payment_date) as month, ci.city, sum(p.amount) as revenue 181 | FROM payment p 182 | JOIN rental r ON ( p.rental_id = r.rental_id ) 183 | JOIN inventory i ON ( r.inventory_id = i.inventory_id ) 184 | JOIN film f ON ( i.film_id = f.film_id) 185 | JOIN customer c ON ( p.customer_id = c.customer_id ) 186 | JOIN address a ON ( c.address_id = a.address_id ) 187 | JOIN city ci ON ( a.city_id = ci.city_id ) 188 | group by (f.title, month, ci.city) 189 | order by f.title, month, ci.city, revenue desc; --------------------------------------------------------------------------------