├── project4 ├── README.md ├── dl.cfg ├── __MACOSX │ ├── ._dl.cfg │ ├── ._etl.py │ └── ._README.md ├── data-lake-project-resources.zip ├── etl.py └── etl.ipynb ├── Thumbs.db ├── projectNinja.png ├── projectPlanner.png ├── levelup-udacity.gif ├── projectBeginner.png ├── projectInnovator.png ├── .ipynb_checkpoints ├── Lesson1 Demo 1 Creating a Table with PostGres-Muthyala-checkpoint.ipynb └── Untitled-checkpoint.ipynb ├── project3 ├── dwh.cfg ├── etl.py ├── create_tables.py ├── README.md └── sql_queries.py ├── README.md ├── project1 ├── create_tables.py ├── sql_queries.py ├── etl.py ├── readme.md └── test.ipynb ├── RDBMS data modeling ├── Lesson1 Demo 1 Creating a Table with PostGres-Muthyala.ipynb ├── l1-exercise-1-solution-creating-a-table-with-postgres.ipynb ├── l1-demo-0-creating-a-table-with-postgres.ipynb └── l1-demo-1-creating-a-table-with-postgres.ipynb ├── Data wrangling with spark ├── 1_procedural_vs_functional_in_python.ipynb ├── 8_spark_sql_quiz.ipynb ├── 2_spark_maps_and_lazy_evaluation.ipynb ├── 6_dataframe_quiz_solution.ipynb └── 9_spark_sql_quiz_solution.ipynb ├── Intro to Data Warehouses └── L3 Exercise 3 - Parallel ETL - Solution.ipynb ├── NoSQL data modeling ├── Lesson 3 Exercise 3 Clustering Column.ipynb ├── Lesson 3 Exercise 4 Using the WHERE Clause.ipynb ├── Lesson 3 Exercise 2 Primary Key.ipynb └── Lesson 3 Exercise 1 Three Queries Three Tables.ipynb ├── Power of spark └── mapreduce_practice.ipynb └── Intro to Data Lake └── Exercise 3 - Data Lake on S3.ipynb /project4/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /project4/dl.cfg: -------------------------------------------------------------------------------- 1 | AWS_ACCESS_KEY_ID='' 2 | AWS_SECRET_ACCESS_KEY='' -------------------------------------------------------------------------------- /Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/Data-Engineer-Nano-Degree/HEAD/Thumbs.db -------------------------------------------------------------------------------- /projectNinja.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/Data-Engineer-Nano-Degree/HEAD/projectNinja.png -------------------------------------------------------------------------------- /projectPlanner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/Data-Engineer-Nano-Degree/HEAD/projectPlanner.png -------------------------------------------------------------------------------- /levelup-udacity.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/Data-Engineer-Nano-Degree/HEAD/levelup-udacity.gif -------------------------------------------------------------------------------- /projectBeginner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/Data-Engineer-Nano-Degree/HEAD/projectBeginner.png -------------------------------------------------------------------------------- /projectInnovator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/Data-Engineer-Nano-Degree/HEAD/projectInnovator.png 
-------------------------------------------------------------------------------- /project4/__MACOSX/._dl.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/Data-Engineer-Nano-Degree/HEAD/project4/__MACOSX/._dl.cfg -------------------------------------------------------------------------------- /project4/__MACOSX/._etl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/Data-Engineer-Nano-Degree/HEAD/project4/__MACOSX/._etl.py -------------------------------------------------------------------------------- /project4/__MACOSX/._README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/Data-Engineer-Nano-Degree/HEAD/project4/__MACOSX/._README.md -------------------------------------------------------------------------------- /project4/data-lake-project-resources.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/Data-Engineer-Nano-Degree/HEAD/project4/data-lake-project-resources.zip -------------------------------------------------------------------------------- /.ipynb_checkpoints/Lesson1 Demo 1 Creating a Table with PostGres-Muthyala-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /project3/dwh.cfg: -------------------------------------------------------------------------------- 1 | [CLUSTER] 2 | HOST='' 3 | DB_NAME= 4 | DB_USER= 5 | DB_PASSWORD= 6 | DB_PORT= 7 | 8 | [IAM_ROLE] 9 | ARN='' 10 | 11 | [S3] 12 | LOG_DATA='s3://udacity-dend/log_data' 13 | LOG_JSONPATH='s3://udacity-dend/log_json_path.json' 14 | SONG_DATA='s3://udacity-dend/song_data' -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data-Engineer-Nano-Degree 2 | Data Engineer Nano Degree - Udacity 3 | 4 | 5 | | ![Badge](projectBeginner.png) | ![Badge](projectPlanner.png) | ![Badge](projectInnovator.png) | ![Badge](projectNinja.png) | 6 | |:---:|:---:|:---:|:---:| 7 | | Data Modeling with Postgres | Data Modeling with Apache Cassandra | Cloud Data Warehouses | Data Lakes with Spark | 8 | -------------------------------------------------------------------------------- /project3/etl.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import psycopg2 3 | from sql_queries import copy_table_queries, insert_table_queries 4 | 5 | 6 | def load_staging_tables(cur, conn): 7 | for query in copy_table_queries: 8 | cur.execute(query) 9 | conn.commit() 10 | 11 | 12 | def insert_tables(cur, conn): 13 | for query in insert_table_queries: 14 | cur.execute(query) 15 | conn.commit() 16 | 17 | 18 | def main(): 19 | config = configparser.ConfigParser() 20 | config.read('dwh.cfg') 21 | 22 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 23 | cur = conn.cursor() 24 | 25 | load_staging_tables(cur, conn) 26 | insert_tables(cur, conn) 27 | 28 | conn.close() 29 | 30 | 31 | if __name__ == "__main__": 32 | main() 
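A note on the connection above: `etl.py` builds its DSN by unpacking `config['CLUSTER'].values()` positionally, so it only works because the keys in `dwh.cfg` happen to be listed in host, dbname, user, password, port order. Below is a small order-independent sketch; it is not part of the project files and only assumes the `[CLUSTER]` keys shown in `dwh.cfg` above.

```python
import configparser
import psycopg2

config = configparser.ConfigParser()
config.read('dwh.cfg')
cluster = config['CLUSTER']

# Look each value up by name so the key order in dwh.cfg no longer matters.
conn = psycopg2.connect(
    host=cluster['HOST'].strip("'"),   # HOST is stored quoted in dwh.cfg, so strip the quotes
    dbname=cluster['DB_NAME'],
    user=cluster['DB_USER'],
    password=cluster['DB_PASSWORD'],
    port=cluster['DB_PORT'],
)
cur = conn.cursor()
```

Looking values up by name keeps the script working even if someone reorders or adds keys in the config file.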
-------------------------------------------------------------------------------- /project3/create_tables.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import psycopg2 3 | from sql_queries import create_table_queries, drop_table_queries 4 | 5 | def drop_tables(cur, conn): 6 | for query in drop_table_queries: 7 | cur.execute(query) 8 | conn.commit() 9 | 10 | 11 | def create_tables(cur, conn): 12 | for query in create_table_queries: 13 | try: 14 | cur.execute(query) 15 | conn.commit() 16 | except Exception as e: 17 | print(e) 18 | 19 | 20 | def main(): 21 | config = configparser.ConfigParser() 22 | config.read('dwh.cfg') 23 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 24 | cur = conn.cursor() 25 | 26 | drop_tables(cur, conn) 27 | create_tables(cur, conn) 28 | 29 | conn.close() 30 | 31 | 32 | if __name__ == "__main__": 33 | main() -------------------------------------------------------------------------------- /project1/create_tables.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from sql_queries import create_table_queries, drop_table_queries 3 | 4 | 5 | def create_database(): 6 | # connect to default database 7 | conn = psycopg2.connect("host=127.0.0.1 dbname=studentdb user=student password=student") 8 | conn.set_session(autocommit=True) 9 | cur = conn.cursor() 10 | 11 | # create sparkify database with UTF8 encoding 12 | cur.execute("DROP DATABASE IF EXISTS sparkifydb") 13 | cur.execute("CREATE DATABASE sparkifydb WITH ENCODING 'utf8' TEMPLATE template0") 14 | 15 | # close connection to default database 16 | conn.close() 17 | 18 | # connect to sparkify database 19 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") 20 | cur = conn.cursor() 21 | 22 | return cur, conn 23 | 24 | 25 | def drop_tables(cur, conn): 26 | for query in drop_table_queries: 27 | cur.execute(query) 28 | conn.commit() 29 | 30 | 31 | def create_tables(cur, conn): 32 | for query in create_table_queries: 33 | cur.execute(query) 34 | conn.commit() 35 | 36 | 37 | def main(): 38 | cur, conn = create_database() 39 | 40 | drop_tables(cur, conn) 41 | create_tables(cur, conn) 42 | 43 | conn.close() 44 | 45 | 46 | if __name__ == "__main__": 47 | main() -------------------------------------------------------------------------------- /RDBMS data modeling/Lesson1 Demo 1 Creating a Table with PostGres-Muthyala.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Creating a Music Library" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import psycopg2" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 3, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "ename": "OperationalError", 26 | "evalue": "could not connect to server: Connection refused (0x0000274D/10061)\n\tIs the server running on host \"127.0.0.1\" and accepting\n\tTCP/IP connections on port 5432?\n", 27 | "output_type": "error", 28 | "traceback": [ 29 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 30 | "\u001b[1;31mOperationalError\u001b[0m Traceback (most recent call last)", 31 | "\u001b[1;32m\u001b[0m in 
\u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mconn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpsycopg2\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"host=127.0.0.1 dbname=studentdb user=student password=student\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 32 | "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\psycopg2\\__init__.py\u001b[0m in \u001b[0;36mconnect\u001b[1;34m(dsn, connection_factory, cursor_factory, **kwargs)\u001b[0m\n\u001b[0;32m 124\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 125\u001b[0m \u001b[0mdsn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_ext\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmake_dsn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdsn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 126\u001b[1;33m \u001b[0mconn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_connect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdsn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconnection_factory\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mconnection_factory\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwasync\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 127\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcursor_factory\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 128\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcursor_factory\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcursor_factory\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 33 | "\u001b[1;31mOperationalError\u001b[0m: could not connect to server: Connection refused (0x0000274D/10061)\n\tIs the server running on host \"127.0.0.1\" and accepting\n\tTCP/IP connections on port 5432?\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "conn = psycopg2.connect(\"host=127.0.0.1 dbname=studentdb user=student password=student\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [] 47 | } 48 | ], 49 | "metadata": { 50 | "kernelspec": { 51 | "display_name": "Python 3", 52 | "language": "python", 53 | "name": "python3" 54 | }, 55 | "language_info": { 56 | "codemirror_mode": { 57 | "name": "ipython", 58 | "version": 3 59 | }, 60 | "file_extension": ".py", 61 | "mimetype": "text/x-python", 62 | "name": "python", 63 | "nbconvert_exporter": "python", 64 | "pygments_lexer": "ipython3", 65 | "version": "3.6.4" 66 | } 67 | }, 68 | "nbformat": 4, 69 | "nbformat_minor": 2 70 | } 71 | -------------------------------------------------------------------------------- /project3/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | A startup company called Sparkify provides music streaming to the users through the application. The songs details and the user activity data from the application are currently available and stored in the format of JSON. 4 | 5 | If Sparkify wants to analyze the user's daily activities and provide future song recommendations, that would be very tough to query for the analysis and future recommendations from the JSON files. As the data goes increasing every day from Bytes to GBs and even more difficult for processing and analyzing the data with the JSON files. 
6 | 7 | Instead of storing the data generated by users in JSON files, I would recommend a cloud data warehouse, which supports standard modeling techniques, fast retrieval of data, and large data volumes. This can be made even more efficient by creating a STAR schema in a Redshift cluster. 8 | 9 | The STAR schema consists of one fact table referencing any number of dimension tables, which helps Sparkify answer common business questions in a simplified way, for example: 10 | 11 | * What is the next song a Sparkify user would like to listen to, based on past behavior? 12 | * Which song would a user be most interested in listening to at a particular point in time? 13 | Much more complex business logic can also be handled easily with the STAR schema approach. 14 | 15 | Created a STAR schema optimized for song play analysis. 16 | 17 | **Fact Table**: songplays, with attributes referencing the dimension tables. 18 | 19 | **Dimension Tables**: users, songs, artists, and time. 20 | 21 | This database will help Sparkify's internal departments run different kinds of analysis and make recommendations to users, for example: 22 | 23 | * Favorite songs of a user by weekday: by joining the songplays, songs, and users tables, filtered by subscription level. 24 | * Recently listened songs: by joining the songplays and users tables, the app can show recommendations based on subscription level. 25 | * Recommending the most popular songs of the day/week. 26 | 27 | ## DWH Configuration and Setup 28 | 29 | ### Step 0 30 | * Create a new IAM user in your AWS account. 31 | * Give it AdministratorAccess and attach the policies. 32 | * Use the access key and secret key to create clients for EC2, S3, IAM, and Redshift. 33 | 34 | ### Step 1 35 | * See doc [IAM Role](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/iam.html#client) 36 | * Create an IAM role that allows Redshift to read from the S3 bucket (ReadOnly). 37 | 38 | ### Step 2 39 | * See doc [Create Cluster](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster) 40 | * Create a Redshift cluster, get the DWH_ENDPOINT (host address) and DWH_ROLE_ARN, and fill in the config file. 41 | 42 | ## ETL Pipeline 43 | 44 | 1. Created tables to store the data from the S3 buckets. 45 | 2. Loaded the data from the S3 buckets into staging tables in the Redshift cluster. 46 | 3. Inserted data into the fact and dimension tables from the staging tables. 47 | 48 | ## Usage 49 | **sql_queries.py**: Contains all SQL queries of the project; it is imported by the other scripts. 50 | 51 | **create_tables.py**: Run this file only after writing the queries in **sql_queries.py**. 52 | drop_tables: This function is used to drop the tables. 53 | create_tables: This function is used to create the tables. 54 | 55 | **etl.py**: Check the table schemas in your Redshift database; once the schema has been created successfully, run this file. 56 | load_staging_tables: This function is used to load the data from S3 into the Redshift staging tables. 57 | insert_tables: This function is used to insert data into the fact and dimension tables from the staging tables. 58 | 59 | ## Execute the files in the order below each time you run the pipeline 60 | 61 | 1. create_tables.py 62 | ```python 63 | $ python3 create_tables.py 64 | ``` 65 | 66 | 2. 
etl.py 67 | ```python 68 | $ python3 etl.py 69 | ``` 70 | -------------------------------------------------------------------------------- /project1/sql_queries.py: -------------------------------------------------------------------------------- 1 | # DROP TABLES 2 | 3 | songplay_table_drop = "DROP TABLE IF EXISTS songplays" 4 | user_table_drop = "DROP TABLE IF EXISTS users" 5 | song_table_drop = "DROP TABLE IF EXISTS songs" 6 | artist_table_drop = "DROP TABLE IF EXISTS artists" 7 | time_table_drop = "DROP TABLE IF EXISTS time" 8 | 9 | # CREATE TABLES 10 | # CREATE FACT TABLE 11 | songplay_table_create = ("""CREATE TABLE IF NOT EXISTS songplays( 12 | songplay_id SERIAL PRIMARY KEY, 13 | start_time timestamp, 14 | user_id int NOT NULL, 15 | level varchar, 16 | artist_id varchar, 17 | song_id varchar, 18 | session_id int, 19 | location text, 20 | user_agent text)""") 21 | 22 | # CREATE DIMENION TABLES 23 | user_table_create = ("""CREATE TABLE IF NOT EXISTS users( 24 | user_id int NOT NULL, 25 | first_name varchar NOT NULL, 26 | last_name varchar NOT NULL, 27 | gender char, 28 | level varchar, 29 | PRIMARY KEY (user_id))""") 30 | 31 | song_table_create = ("""CREATE TABLE IF NOT EXISTS songs( 32 | song_id varchar NOT NULL, 33 | title varchar, 34 | artist_id varchar, 35 | year int, 36 | duration float, 37 | PRIMARY KEY (song_id))""") 38 | 39 | artist_table_create = ("""CREATE TABLE IF NOT EXISTS artists( 40 | artist_id varchar NOT NULL, 41 | name varchar, 42 | location varchar, 43 | lattitude numeric, 44 | longitude numeric, 45 | PRIMARY KEY (artist_id))""") 46 | 47 | time_table_create = ("""CREATE TABLE IF NOT EXISTS time( 48 | start_time timestamp NOT NULL, 49 | hour int, 50 | day int, 51 | week int, 52 | month int, 53 | year int, 54 | weekday varchar, 55 | PRIMARY KEY (start_time))""") 56 | 57 | # INSERT RECORDS 58 | 59 | songplay_table_insert = ("""INSERT INTO songplays( start_time, user_id,level,artist_id,song_id, session_id, location, user_agent) 60 | VALUES(%s, %s, %s, %s, %s, %s, %s, %s)""") 61 | 62 | user_table_insert = ("""INSERT INTO users(user_id, first_name, last_name, gender,level) 63 | VALUES(%s, %s, %s, %s, %s) 64 | ON CONFLICT (user_id) 65 | DO UPDATE SET level = excluded.level""") 66 | 67 | song_table_insert = ("""INSERT INTO songs(song_id, title, artist_id,year,duration) 68 | VALUES(%s, %s, %s, %s, %s) 69 | ON CONFLICT (song_id) 70 | DO NOTHING""") 71 | 72 | artist_table_insert = ("""INSERT INTO artists(artist_id, name,location,lattitude, longitude) 73 | VALUES(%s, %s, %s, %s, %s) 74 | ON CONFLICT (artist_id) 75 | DO NOTHING""") 76 | 77 | 78 | time_table_insert = ("""INSERT INTO time(start_time,hour,day,week,month, year,weekday) 79 | VALUES(%s, %s, %s, %s, %s, %s, %s) 80 | ON CONFLICT (start_time) 81 | DO NOTHING""") 82 | 83 | # FIND SONGS 84 | 85 | song_select = ("""SELECT songs.song_id, artists.artist_id FROM songs 86 | JOIN artists ON songs.artist_id=artists.artist_id 87 | WHERE songs.title=%s AND artists.name=%s AND songs.duration=%s; 88 | """) 89 | 90 | # QUERY LISTS 91 | 92 | create_table_queries = [songplay_table_create, user_table_create, song_table_create, artist_table_create, time_table_create] 93 | drop_table_queries = [songplay_table_drop, user_table_drop, song_table_drop, artist_table_drop, time_table_drop] -------------------------------------------------------------------------------- /RDBMS data modeling/l1-exercise-1-solution-creating-a-table-with-postgres.ipynb: -------------------------------------------------------------------------------- 1 | #%% 
 [markdown] 2 | # # Lesson 1 Exercise 1: Creating a Table with PostgreSQL 3 | # 4 | # (image: PostgreSQL logo) 5 | #%% [markdown] 6 | # ### Walk through the basics of PostgreSQL. You will need to complete the following tasks: 7 | # 
  • Create a table in PostgreSQL,
  • Insert rows of data
  • Run a simple SQL query to validate the information. 8 | #%% [markdown] 9 | # #### Import the library 10 | # *Note:* An error might popup after this command has executed. If it does, read it carefully before ignoring. 11 | 12 | #%% 13 | import psycopg2 14 | 15 | #%% [markdown] 16 | # ### Create a connection to the database 17 | 18 | #%% 19 | try: 20 | conn = psycopg2.connect("host=127.0.0.1 dbname=studentdb user=student password=student") 21 | except psycopg2.Error as e: 22 | print("Error: Could not make connection to the Postgres database") 23 | print(e) 24 | 25 | #%% [markdown] 26 | # ### Use the connection to get a cursor that can be used to execute queries. 27 | 28 | #%% 29 | try: 30 | cur = conn.cursor() 31 | except psycopg2.Error as e: 32 | print("Error: Could not get curser to the Database") 33 | print(e) 34 | 35 | #%% [markdown] 36 | # ### Set automatic commit to be true so that each action is committed without having to call conn.commit() after each command. 37 | 38 | #%% 39 | # TO-DO: set automatic commit to be true 40 | 41 | #%% [markdown] 42 | # ### Create a database to do the work in. 43 | 44 | #%% 45 | ## TO-DO: Add the database name within the CREATE DATABASE statement. You can choose your own db name. 46 | try: 47 | cur.execute("create database udacity") 48 | except psycopg2.Error as e: 49 | print(e) 50 | 51 | #%% [markdown] 52 | # #### Add the database name in the connect statement. Let's close our connection to the default database, reconnect to the Udacity database, and get a new cursor. 53 | 54 | #%% 55 | try: 56 | conn.close() 57 | except psycopg2.Error as e: 58 | print(e) 59 | 60 | try: 61 | conn = psycopg2.connect("dbname=udacity") 62 | except psycopg2.Error as e: 63 | print("Error: Could not make connection to the Postgres database") 64 | print(e) 65 | 66 | try: 67 | cur = conn.cursor() 68 | except psycopg2.Error as e: 69 | print("Error: Could not get curser to the Database") 70 | print(e) 71 | 72 | conn.set_session(autocommit=True) 73 | 74 | #%% [markdown] 75 | # ### Create a Song Library that contains a list of songs, including the song name, artist name, year, album it was from, and if it was a single. 76 | # 77 | # `song title 78 | # artist 79 | # year 80 | # album 81 | # single` 82 | # 83 | 84 | #%% 85 | try: 86 | cur.execute("CREATE TABLE IF NOT EXISTS songs (song_title varchar, artist_name varchar, year int, album_name varchar, single Boolean);") 87 | except psycopg2.Error as e: 88 | print("Error: Issue creating table") 89 | print (e) 90 | 91 | #%% [markdown] 92 | # ### Insert the following two rows in the table 93 | # `First Row: "Across The Universe", "The Beatles", "1970", "False", "Let It Be"` 94 | # 95 | # `Second Row: "The Beatles", "Think For Yourself", "False", "1965", "Rubber Soul"` 96 | 97 | #%% 98 | try: 99 | cur.execute("INSERT INTO songs (song_title, artist_name, year, album_name, single) VALUES (%s, %s, %s, %s, %s)", ("Across The Universe", "The Beatles", 1970, "Let It Be", False)) 100 | except psycopg2.Error as e: 101 | print("Error: Inserting Rows") 102 | print (e) 103 | 104 | try: 105 | cur.execute("INSERT INTO songs (song_title, artist_name, year, album_name, single) VALUES (%s, %s, %s, %s, %s)", 106 | ("Think For Yourself", "The Beatles", 1965, "Rubber Soul", False)) 107 | except psycopg2.Error as e: 108 | print("Error: Inserting Rows") 109 | print (e) 110 | 111 | #%% [markdown] 112 | # ### Validate your data was inserted into the table. 
113 | # 114 | 115 | #%% 116 | try: 117 | cur.execute("SELECT * FROM songs;") 118 | except psycopg2.Error as e: 119 | print("Error: select *") 120 | print (e) 121 | 122 | row = cur.fetchone() 123 | while row: 124 | print(row) 125 | row = cur.fetchone() 126 | 127 | #%% [markdown] 128 | # ### And finally close your cursor and connection. 129 | 130 | #%% 131 | cur.close() 132 | conn.close() 133 | 134 | 135 | -------------------------------------------------------------------------------- /project1/etl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import psycopg2 4 | import pandas as pd 5 | from sql_queries import * 6 | import datetime 7 | 8 | def process_song_file(cur, filepath): 9 | ''' 10 | cur: connection cursor 11 | filepath: Single song file path, extract details and insert into song and artist table. 12 | ''' 13 | 14 | # open song file 15 | df = pd.read_json(filepath, lines=True) 16 | 17 | # insert song record 18 | song_data = df[['song_id', 'title','artist_id', 'year', 'duration']].values[0].tolist() 19 | song_data = (song_data[0], song_data[1], song_data[2], song_data[3], song_data[4]) 20 | try: 21 | cur.execute(song_table_insert, song_data) 22 | except: 23 | pass 24 | 25 | # insert artist record 26 | artist_data = df[['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude']].values[0].tolist() 27 | artist_data = (artist_data[0], artist_data[1], artist_data[2], artist_data[3], artist_data[4]) 28 | try: 29 | cur.execute(artist_table_insert, artist_data) 30 | except: 31 | pass 32 | 33 | 34 | def process_log_file(cur, filepath): 35 | ''' 36 | cur: connection cursor 37 | filepath: Single log file path, extract details and insert into user, time and songplays table. 
38 | ''' 39 | 40 | # open log file 41 | df = pd.read_json(filepath, lines=True) 42 | 43 | # filter by NextSong action 44 | df = df[df['page']=="NextSong"].reset_index() 45 | 46 | # convert timestamp column to datetime 47 | t = pd.to_datetime(df.ts, unit='ms') 48 | 49 | df['week'] = t.apply(lambda x: datetime.date(x.year, x.month, x.day).isocalendar()[1]) 50 | df['week_day'] = t.apply(lambda x: datetime.date(x.year, x.month, x.day).strftime("%A")) 51 | # insert time data records 52 | time_data = (t, t.dt.hour, t.dt.day, df.week, t.dt.month, t.dt.year, df.week_day) 53 | column_labels = ['start_time','hour','day','week','month', 'year','weekday'] 54 | time_df = pd.DataFrame(dict(zip(column_labels, time_data))) 55 | df['start_time'] = t 56 | df.head() 57 | for i, row in time_df.iterrows(): 58 | cur.execute(time_table_insert, list(row)) 59 | 60 | # load user table 61 | user_df = df[['userId','firstName', 'lastName', 'gender', 'level']] 62 | 63 | # insert user records 64 | for i, row in user_df.iterrows(): 65 | cur.execute(user_table_insert, row) 66 | 67 | # insert songplay records 68 | for index, row in df.iterrows(): 69 | # get songid and artistid from song and artist tables 70 | cur.execute(song_select, (row.song, row.artist, row.length)) 71 | results = cur.fetchone() 72 | 73 | if results: 74 | songid, artistid = results 75 | else: 76 | songid, artistid = None, None 77 | 78 | # insert songplay record 79 | songplay_data = (row.start_time,row.userId,row.level,songid,artistid, row.sessionId,row.location,row.userAgent) 80 | cur.execute(songplay_table_insert, songplay_data) 81 | 82 | 83 | def process_data(cur, conn, filepath, func): 84 | ''' 85 | cur: cursor connection 86 | conn: database connection 87 | filepath: dataset path to extract all the sub paths 88 | func: function to call and process the data 89 | ''' 90 | 91 | # get all files matching extension from directory 92 | all_files = [] 93 | for root, dirs, files in os.walk(filepath): 94 | files = glob.glob(os.path.join(root,'*.json')) 95 | for f in files : 96 | all_files.append(os.path.abspath(f)) 97 | 98 | # get total number of files found 99 | num_files = len(all_files) 100 | print('{} files found in {}'.format(num_files, filepath)) 101 | 102 | # iterate over files and process 103 | for i, datafile in enumerate(all_files, 1): 104 | func(cur, datafile) 105 | conn.commit() 106 | print('{}/{} files processed.'.format(i, num_files)) 107 | 108 | 109 | def main(): 110 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") #connect to the sparkify database 111 | cur = conn.cursor() 112 | 113 | process_data(cur, conn, filepath='data/song_data', func=process_song_file) 114 | process_data(cur, conn, filepath='data/log_data', func=process_log_file) 115 | 116 | #connection closed 117 | conn.close() 118 | 119 | 120 | if __name__ == "__main__": 121 | main() -------------------------------------------------------------------------------- /project1/readme.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | A startup company called Sparkify provides music streaming to the users through the application. The songs details and the user activity data from the application are currently available and stored in the format of JSON. 4 | 5 | If Sparkify wants to analyze the user's daily activities and provide future song recommendations, that would be very tough to query for the analysis and future recommendations from the JSON files. 
As the data grows every day from bytes to GBs, processing and analyzing it directly from JSON files becomes even more difficult. 6 | 7 | Instead of storing the data generated by users in JSON files, I would recommend a database, which supports standard modeling techniques and fast retrieval of data. This can be made even more efficient with a STAR schema. 8 | 9 | The STAR schema consists of one fact table referencing any number of dimension tables, which helps Sparkify answer common business questions in a simplified way, for example: 10 | * What is the next song a Sparkify user would like to listen to, based on past behavior? 11 | * Which song would a user be most interested in listening to at a particular point in time? 12 | 13 | Much more complex business logic can also be handled easily with the STAR schema approach. 14 | 15 | 16 | Created a STAR schema optimized for song play analysis. 17 | * **Fact Table**: songplays, with attributes referencing the dimension tables. 18 | * **Dimension Tables**: users, songs, artists, and time. 19 | 20 | This database will help Sparkify's internal departments run different kinds of analysis and make recommendations to users, for example: 21 | 22 | * Favorite songs of a user by weekday: by joining the songplays, songs, and users tables, filtered by subscription level. 23 | * Recently listened songs: by joining the songplays and users tables, the app can show recommendations based on subscription level. 24 | * Recommending the most popular songs of the day/week. 25 | 26 | ## ETL 27 | 1. Created the **songs** and **artists** dimension tables by extracting selected columns from song_data. 28 | 2. Created the **users** and **time** dimension tables by extracting selected columns from log_data. 29 | 3. Created the most important table, the **songplays** fact table, from the dimension tables and log_data. 30 | 31 | ## Installation 32 | 33 | Install the PostgreSQL database driver with the command below: 34 | ```bash 35 | pip install psycopg2 36 | ``` 37 | ## Usage 38 | 1. **sql_queries.py**: contains all SQL queries of the project; it is imported by the other scripts. 39 | 2. **create_tables.py**: run this file after writing the queries, to create the tables for the project. 40 | 41 | #### Libraries used: 42 | ```python 43 | import psycopg2 44 | from sql_queries import create_table_queries, drop_table_queries 45 | ``` 46 | #### Functions and their purpose: 47 | **create_database**: drops any existing sparkifydb database, creates a new one, and returns the cursor and connection. 48 | 49 | **drop_tables**: drops the existing tables. 50 | 51 | **create_tables**: creates the fact table and dimension tables mentioned above. 52 | 3. **etl.ipynb**: reads and processes a single file from song_data and log_data and loads the data into your tables. This notebook contains detailed instructions on the ETL process for each of the tables. 53 | 4. **etl.py**: reads and processes all files from song_data and log_data and loads them into the tables. 54 | #### Libraries used: 55 | ```python 56 | import os 57 | import glob 58 | import psycopg2 59 | import pandas as pd 60 | from sql_queries import * 61 | import datetime 62 | ``` 63 | #### Functions and their purpose: 64 | 65 | **process_song_file**: reads a song file and inserts selected columns into the songs and artists dimension tables. 66 | 67 | **process_log_file**: reads log files one by one and inserts selected columns into the users, time, and songplays tables.
68 | 69 | **process_data**: collect all the file paths and call the above two function and show status of files processed. 70 | 71 | **main**: used to call the process_data function. 72 | 73 | 5. **test.py**: displays the first few rows of each table to let you check your database. 74 | 75 | ## execute files in the below order each time before pipeline. 76 | 77 | 1. create_tables.py 78 | ```python 79 | $ python3 create_tables.py 80 | 2. etl.ipynb/et.py 81 | ```python 82 | $ python3 etl.py 83 | 3. test.ipynb 84 | -------------------------------------------------------------------------------- /Data wrangling with spark/1_procedural_vs_functional_in_python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Procedural Programming\n", 8 | "\n", 9 | "This notebook contains the code from the previous screencast. The code counts the number of times a song appears in the log_of_songs variable. \n", 10 | "\n", 11 | "You'll notice that the first time you run `count_plays(\"Despacito\")`, you get the correct count. However, when you run the same code again `count_plays(\"Despacito\")`, the results are no longer correct.This is because the global variable `play_count` stores the results outside of the count_plays function. \n", 12 | "\n", 13 | "\n", 14 | "# Instructions\n", 15 | "\n", 16 | "Run the code cells in this notebook to see the problem with " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "log_of_songs = [\n", 26 | " \"Despacito\",\n", 27 | " \"Nice for what\",\n", 28 | " \"No tears left to cry\",\n", 29 | " \"Despacito\",\n", 30 | " \"Havana\",\n", 31 | " \"In my feelings\",\n", 32 | " \"Nice for what\",\n", 33 | " \"Despacito\",\n", 34 | " \"All the stars\"\n", 35 | "]" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "play_count = 0" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "def count_plays(song_title):\n", 54 | " global play_count\n", 55 | " for song in log_of_songs:\n", 56 | " if song == song_title:\n", 57 | " play_count = play_count + 1\n", 58 | " return play_count" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "3" 70 | ] 71 | }, 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "count_plays(\"Despacito\")" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 5, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "6" 90 | ] 91 | }, 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "count_plays(\"Despacito\")" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "# How to Solve the Issue\n", 106 | "\n", 107 | "How might you solve this issue? 
You could get rid of the global variable and instead use play_count as an input to the function:\n", 108 | "\n", 109 | "```python\n", 110 | "def count_plays(song_title, play_count):\n", 111 | " for song in log_of_songs:\n", 112 | " if song == song_title:\n", 113 | " play_count = play_count + 1\n", 114 | " return play_count\n", 115 | "\n", 116 | "```\n", 117 | "\n", 118 | "How would this work with parallel programming? Spark splits up data onto multiple machines. If your songs list were split onto two machines, Machine A would first need to finish counting, and then return its own result to Machine B. And then Machine B could use the output from Machine A and add to the count.\n", 119 | "\n", 120 | "However, that isn't parallel computing. Machine B would have to wait until Machine A finishes. You'll see in the next parts of the lesson how Spark solves this issue with a functional programming paradigm.\n", 121 | "\n", 122 | "In Spark, if your data is split onto two different machines, machine A will run a function to count how many times 'Despacito' appears on machine A. Machine B will simultaneously run a function to count how many times 'Despacito' appears on machine B. After they finish counting individually, they'll combine their results together. You'll see how this works in the next parts of the lesson." 123 | ] 124 | } 125 | ], 126 | "metadata": { 127 | "kernelspec": { 128 | "display_name": "Python 3", 129 | "language": "python", 130 | "name": "python3" 131 | }, 132 | "language_info": { 133 | "codemirror_mode": { 134 | "name": "ipython", 135 | "version": 3 136 | }, 137 | "file_extension": ".py", 138 | "mimetype": "text/x-python", 139 | "name": "python", 140 | "nbconvert_exporter": "python", 141 | "pygments_lexer": "ipython3", 142 | "version": "3.6.3" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 2 147 | } 148 | -------------------------------------------------------------------------------- /project4/etl.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | from datetime import datetime 3 | import calendar 4 | import os 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.functions import udf, col 7 | from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format 8 | from pyspark.sql.functions import monotonically_increasing_id 9 | 10 | 11 | config = configparser.ConfigParser() 12 | config.read('dl.cfg') 13 | 14 | os.environ['AWS_ACCESS_KEY_ID']=config['KEYS']['AWS_ACCESS_KEY_ID'] 15 | os.environ['AWS_SECRET_ACCESS_KEY']=config['KEYS']['AWS_SECRET_ACCESS_KEY'] 16 | 17 | 18 | def create_spark_session(): 19 | spark = SparkSession \ 20 | .builder \ 21 | .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \ 22 | .getOrCreate() 23 | return spark 24 | 25 | 26 | def process_song_data(spark, input_data, output_data): 27 | # get filepath to song data file 28 | song_data = os.path.join(input_data,"song_data/*/*/*/*.json") 29 | # read song data file 30 | df = spark.read.json(song_data) 31 | # extract columns to create songs table 32 | songs_table = df['song_id', 'title', 'artist_id', 'year', 'duration'] 33 | 34 | # write songs table to parquet files partitioned by year and artist 35 | songs_table.write.partitionBy('year', 'artist_id').parquet(os.path.join(output_data, 'songs.parquet'), 'overwrite') 36 | print("--- songs.parquet completed ---") 37 | # extract columns to create artists table 38 | artists_table = df['artist_id', 'artist_name', 'artist_location', 
'artist_latitude', 'artist_longitude'] 39 | 40 |     # write artists table to parquet files 41 |     artists_table.write.parquet(os.path.join(output_data, 'artists.parquet'), 'overwrite') 42 |     print("--- artists.parquet completed ---") 43 |     print("*** process_song_data completed ***\n\n") 44 | 45 | 46 | def process_log_data(spark, input_data, output_data): 47 |     # get filepath to log data file 48 |     log_data = os.path.join(input_data, "log_data/*/*/*.json") 49 | 50 |     # read log data file 51 |     df = spark.read.json(log_data) 52 | 53 |     # filter by actions for song plays 54 |     df = df.filter(df.page == 'NextSong') 55 | 56 |     # extract columns for users table 57 |     # user_id, first_name, last_name, gender, level 58 |     users_table = df['userId', 'firstName', 'lastName', 'gender', 'level'] 59 | 60 |     # write users table to parquet files 61 |     users_table.write.parquet(os.path.join(output_data, 'users.parquet'), 'overwrite') 62 |     print("--- users.parquet completed ---") 63 |     # create timestamp column from original timestamp column 64 |     get_timestamp = udf(lambda x: str(int(int(x)/1000))) 65 |     df = df.withColumn('timestamp', get_timestamp(df.ts)) 66 |     # create datetime column from original timestamp column (ts is in milliseconds) 67 |     df = df.withColumn('start_time', (df.ts / 1000).cast('timestamp')) 68 |     # derive the time fields with Spark's built-in date functions 69 |     df = df.withColumn('hour', hour(df.start_time)) 70 |     df = df.withColumn('day', dayofmonth(df.start_time)) 71 |     df = df.withColumn('week', weekofyear(df.start_time)) 72 |     df = df.withColumn('month', month(df.start_time)) 73 |     df = df.withColumn('year', year(df.start_time)) 74 |     df = df.withColumn('weekday', date_format(df.start_time, 'E')) 75 |     # extract columns to create time table 76 |     time_table = df['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday'] 77 | 78 |     # write time table to parquet files partitioned by year and month 79 |     time_table.write.partitionBy('year', 'month').parquet(os.path.join(output_data, 'time.parquet'), 'overwrite') 80 |     print("--- time.parquet completed ---") 81 |     # read back the songs table to use for the songplays table 82 |     song_df = spark.read.parquet(os.path.join(output_data, 'songs.parquet')) 83 | 84 |     # extract columns from joined song and log datasets to create songplays table 85 |     df = df.join(song_df, song_df.title == df.song) 86 |     songplays_table = df['start_time', 'userId', 'level', 'song_id', 'artist_id', 'sessionId', 'location', 'userAgent'] 87 |     songplays_table = songplays_table.withColumn('songplay_id', monotonically_increasing_id()) 88 |     # write songplays table to parquet files 89 |     songplays_table.write.parquet(os.path.join(output_data, 'songplays.parquet'), 'overwrite') 90 |     print("--- songplays.parquet completed ---") 91 |     print("*** process_log_data completed ***\n\nEND") 92 | 93 | 94 | def main(): 95 |     spark = create_spark_session() 96 |     input_data = "s3a://udacity-dend/" 97 |     output_data = "" 98 | 99 |     process_song_data(spark, input_data, output_data) 100 |     process_log_data(spark, input_data, output_data) 101 | 102 | 103 | if __name__ == "__main__": 104 |     main() 105 | 
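Note that project4/etl.py reads its AWS credentials from a `[KEYS]` section of `dl.cfg` (see the `config['KEYS'][...]` lookups at the top of the file), while the `dl.cfg` shown earlier in this repository has no section header at all; `configparser` cannot parse a file without one. Below is a minimal sketch of a matching config and a local smoke test. The `./data/` and `./output/` paths are hypothetical examples, not paths used by the project.

```python
# Expected shape of dl.cfg (values intentionally left blank; do not quote them,
# since etl.py exports them verbatim into environment variables):
#
#   [KEYS]
#   AWS_ACCESS_KEY_ID=
#   AWS_SECRET_ACCESS_KEY=
#
# Run this from the project4/ directory so that `import etl` finds the script
# and dl.cfg sits in the current working directory.
from etl import create_spark_session, process_song_data, process_log_data

spark = create_spark_session()
input_data = "./data/"     # hypothetical local copy containing song_data/ and log_data/
output_data = "./output/"  # hypothetical local directory for the parquet tables

# songs.parquet must exist before process_log_data joins against it,
# so keep the same order as etl.main().
process_song_data(spark, input_data, output_data)
process_log_data(spark, input_data, output_data)
```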
-------------------------------------------------------------------------------- /RDBMS data modeling/l1-demo-0-creating-a-table-with-postgres.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lesson 1 Demo 0: PostgreSQL and AutoCommits\n", 8 | "\n", 9 | "" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Walk through the basics of PostgreSQL autocommits " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "## import postgreSQL adapter for the Python\n", 26 | "import psycopg2" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "### Create a connection to the database\n", 34 | "1. Connect to the local instance of PostgreSQL (*127.0.0.1*)\n", 35 | "2. Use the database/schema from the instance. \n", 36 | "3. The connection reaches out to the database (*studentdb*) and use the correct privilages to connect to the database (*user and password = student*)." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "conn = psycopg2.connect(\"host=127.0.0.1 dbname=studentdb user=student password=student\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Use the connection to get a cursor that will be used to execute queries." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "cur = conn.cursor()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Create a database to work in" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "cur.execute(\"select * from test\")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "### Error occurs, but it was to be expected because table has not been created as yet. To fix the error, create the table. " 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "cur.execute(\"CREATE TABLE test (col1 int, col2 int, col3 int);\")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "### Error indicates we cannot execute this query. Since we have not committed the transaction and had an error in the transaction block, we are blocked until we restart the connection." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "conn = psycopg2.connect(\"host=127.0.0.1 dbname=studentdb user=student password=student\")\n", 110 | "cur = conn.cursor()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "In our exercises instead of worrying about commiting each transaction or getting a strange error when we hit something unexpected, let's set autocommit to true. **This says after each call during the session commit that one action and do not hold open the transaction for any other actions. 
One action = one transaction.**" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "In this demo we will use automatic commit so each action is commited without having to call `conn.commit()` after each command. **The ability to rollback and commit transactions are a feature of Relational Databases.**" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "conn.set_session(autocommit=True)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "cur.execute(\"select * from test\")" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "cur.execute(\"CREATE TABLE test (col1 int, col2 int, col3 int);\")" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### Once autocommit is set to true, we execute this code successfully. There were no issues with transaction blocks and we did not need to restart our connection. " 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "cur.execute(\"select * from test\")" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "cur.execute(\"select count(*) from test\")\n", 177 | "print(cur.fetchall())" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 3", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.7.2" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 2 209 | } -------------------------------------------------------------------------------- /.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "enter the number 30\n", 13 | "b {'truck': 0, 'bike': 0, 'car': 0, 'wheel': 30, 'seat': 30}\n", 14 | "2 1\n", 15 | "b {'truck': 0, 'bike': 1, 'car': 0, 'wheel': 28, 'seat': 29}\n", 16 | "2 1\n", 17 | "b {'truck': 0, 'bike': 2, 'car': 0, 'wheel': 26, 'seat': 28}\n", 18 | "2 1\n", 19 | "b {'truck': 0, 'bike': 3, 'car': 0, 'wheel': 24, 'seat': 27}\n", 20 | "2 1\n", 21 | "b {'truck': 0, 'bike': 4, 'car': 0, 'wheel': 22, 'seat': 26}\n", 22 | "2 1\n", 23 | "b {'truck': 0, 'bike': 5, 'car': 0, 'wheel': 20, 'seat': 25}\n", 24 | "2 1\n", 25 | "b {'truck': 0, 'bike': 6, 'car': 0, 'wheel': 18, 'seat': 24}\n", 26 | "2 1\n", 27 | "b {'truck': 0, 'bike': 7, 'car': 0, 'wheel': 16, 'seat': 23}\n", 28 | "2 1\n", 29 | "b {'truck': 0, 'bike': 8, 'car': 0, 'wheel': 14, 'seat': 22}\n", 30 | "2 1\n", 31 | "b {'truck': 0, 'bike': 9, 'car': 0, 'wheel': 12, 'seat': 
21}\n", 32 | "2 1\n", 33 | "b {'truck': 0, 'bike': 10, 'car': 0, 'wheel': 10, 'seat': 20}\n", 34 | "2 1\n", 35 | "b {'truck': 0, 'bike': 11, 'car': 0, 'wheel': 8, 'seat': 19}\n", 36 | "2 1\n", 37 | "b {'truck': 0, 'bike': 12, 'car': 0, 'wheel': 6, 'seat': 18}\n", 38 | "2 1\n", 39 | "b {'truck': 0, 'bike': 13, 'car': 0, 'wheel': 4, 'seat': 17}\n", 40 | "2 1\n", 41 | "b {'truck': 0, 'bike': 14, 'car': 0, 'wheel': 2, 'seat': 16}\n", 42 | "2 1\n", 43 | "dict_values([0, 15, 0, 0, 15])\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "n=int(input(\"enter the number \"))\n", 49 | "class vehicle:\n", 50 | " def __init__(self, wheel, seat):\n", 51 | " self.wheel =wheel\n", 52 | " self.seat =seat\n", 53 | "class truck(vehicle): \n", 54 | " def __init__(self,wheel=6,seat=2): #default values are given as 6 and 2\n", 55 | " self.wheel=wheel\n", 56 | " self.seat=seat\n", 57 | "class car(vehicle): \n", 58 | " def __init__(self,wheel=4,seat=5): #default values are given as 4 and 5\n", 59 | " self.wheel=wheel\n", 60 | " self.seat=seat \n", 61 | "class bike(vehicle): \n", 62 | " def __init__(self,wheel=2,seat=1): #default vaues are given as 2 and 1\n", 63 | " self.wheel=wheel\n", 64 | " self.seat=wheel \n", 65 | " \n", 66 | "#----------------------------------------------------------------------------------------------- \n", 67 | " \n", 68 | "def createtruck(d):\n", 69 | " tr=truck()\n", 70 | " d['truck']=d['truck']+1\n", 71 | " d['wheel']=d['wheel']-tr.wheel\n", 72 | " d['seat']=d['seat']-tr.seat\n", 73 | " if d['wheel']<0 or d['seat']<0:\n", 74 | " return 1\n", 75 | " else:\n", 76 | " return d\n", 77 | "def createcar(d):\n", 78 | " cr=car()\n", 79 | " d['car']=d['car']+1\n", 80 | " d['wheel']=d['wheel']-cr.wheel\n", 81 | " d['seat']=d['seat']-cr.seat\n", 82 | " if d['wheel']<0 or d['seat']<0:\n", 83 | " return 1\n", 84 | " else:\n", 85 | " return d\n", 86 | "def createbike(d):\n", 87 | " bk=bike()\n", 88 | " print(bk.wheel, bk.seat)\n", 89 | " d['bike']=d['bike']+1\n", 90 | " d['wheel']=d['wheel']-bk.wheel\n", 91 | " d['seat']=d['seat']-bk.seat\n", 92 | " if d['wheel']<0 or d['seat']<0:\n", 93 | " return 1\n", 94 | " else:\n", 95 | " return d\n", 96 | "#------------------------------------------------------------------------------------------------ \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | "\n", 102 | "d={'truck':0,'bike':0,'car':0,'wheel':n,'seat':n}\n", 103 | "while d['wheel']>0 and d['seat']>0:\n", 104 | " trwh=d['wheel']//6 #gives floor value of total wheels that are required\n", 105 | " trs=d['seat']//2 #gives floor value of total seats that are required\n", 106 | " tottruck=min(trwh,trs) #total number of trucks that can be created \n", 107 | " \n", 108 | "\n", 109 | " crwh=d['wheel']//4\n", 110 | " crs=d['seat']//5\n", 111 | " totcar=min(crwh,crs) #total number of cars that can be created\n", 112 | " \n", 113 | "\n", 114 | " bkwh=d['wheel']//2\n", 115 | " bks=d['seat']//1\n", 116 | " totbikes=min(bkwh,bks) #total number of bikes that can be created\n", 117 | " \n", 118 | " \n", 119 | "\n", 120 | " \n", 121 | " m=0\n", 122 | " d1={'maketruck':tottruck,'makebike':totbikes,',makecar':totcar}\n", 123 | " for key,val in d1.items():\n", 124 | " if val>m:\n", 125 | " m=val\n", 126 | " maxim=key\n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " if maxim=='maketruck':\n", 131 | " print(\"t\",d)\n", 132 | " createtruck(d)\n", 133 | " \n", 134 | " elif maxim=='makebike':\n", 135 | " print(\"b\",d)\n", 136 | " createbike(d)\n", 137 | " \n", 138 | " elif maxim=='makecar':\n", 139 | " 
print(\"c\")\n", 140 | " createcar(d)\n", 141 | " \n", 142 | " \n", 143 | "\n", 144 | "\n", 145 | "print(d.values()) \n", 146 | "\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 6, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "data": { 156 | "text/plain": [ 157 | "15" 158 | ] 159 | }, 160 | "execution_count": 6, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "30//2" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [] 175 | } 176 | ], 177 | "metadata": { 178 | "kernelspec": { 179 | "display_name": "Python 3", 180 | "language": "python", 181 | "name": "python3" 182 | }, 183 | "language_info": { 184 | "codemirror_mode": { 185 | "name": "ipython", 186 | "version": 3 187 | }, 188 | "file_extension": ".py", 189 | "mimetype": "text/x-python", 190 | "name": "python", 191 | "nbconvert_exporter": "python", 192 | "pygments_lexer": "ipython3", 193 | "version": "3.6.4" 194 | } 195 | }, 196 | "nbformat": 4, 197 | "nbformat_minor": 2 198 | } 199 | -------------------------------------------------------------------------------- /Data wrangling with spark/8_spark_sql_quiz.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Wrangling with Spark SQL Quiz\n", 8 | "\n", 9 | "This quiz uses the same dataset and most of the same questions from the earlier \"Quiz - Data Wrangling with Data Frames Jupyter Notebook.\" For this quiz, however, use Spark SQL instead of Spark Data Frames." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 3, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from pyspark.sql import SparkSession\n", 19 | "from pyspark.sql.functions import udf\n", 20 | "from pyspark.sql.types import StringType\n", 21 | "from pyspark.sql.types import IntegerType\n", 22 | "from pyspark.sql.functions import desc\n", 23 | "from pyspark.sql.functions import asc\n", 24 | "from pyspark.sql.functions import sum as Fsum\n", 25 | "\n", 26 | "import datetime\n", 27 | "\n", 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "%matplotlib inline\n", 31 | "import matplotlib.pyplot as plt\n", 32 | "# TODOS: \n", 33 | "# 1) import any other libraries you might need\n", 34 | "# 2) instantiate a Spark session \n", 35 | "# 3) read in the data set located at the path \"data/sparkify_log_small.json\"\n", 36 | "# 4) create a view to use with your SQL queries\n", 37 | "# 5) write code to answer the quiz questions \n", 38 | "spark = SparkSession \\\n", 39 | " .builder \\\n", 40 | " .appName(\"Data wrangling with Spark SQL\") \\\n", 41 | " .getOrCreate()\n", 42 | "\n", 43 | "path = \"data/sparkify_log_small.json\"\n", 44 | "user_log = spark.read.json(path)\n", 45 | "user_log.createOrReplaceTempView(\"user_log_table\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "# Question 1\n", 53 | "\n", 54 | "Which page did user id \"\"(empty string) NOT visit?" 
55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 7, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "+----------------+\n", 67 | "| page|\n", 68 | "+----------------+\n", 69 | "|Submit Downgrade|\n", 70 | "| Downgrade|\n", 71 | "| Logout|\n", 72 | "| Save Settings|\n", 73 | "| Settings|\n", 74 | "| NextSong|\n", 75 | "| Upgrade|\n", 76 | "| Error|\n", 77 | "| Submit Upgrade|\n", 78 | "+----------------+\n", 79 | "\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "# TODO: write your code to answer question 1\n", 85 | "spark.sql('''\n", 86 | " SELECT DISTINCT page\n", 87 | " FROM user_log_table\n", 88 | " WHERE page NOT IN (SELECT DISTINCT page\n", 89 | " FROM user_log_table\n", 90 | " WHERE userId = ''\n", 91 | " )\n", 92 | " ''').show()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "# Question 2 - Reflect\n", 100 | "\n", 101 | "Why might you prefer to use SQL over data frames? Why might you prefer data frames over SQL?" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "# Question 3\n", 109 | "\n", 110 | "How many female users do we have in the data set?" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 9, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "+-----+\n", 123 | "|count|\n", 124 | "+-----+\n", 125 | "| 462|\n", 126 | "+-----+\n", 127 | "\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "# TODO: write your code to answer question 3\n", 133 | "spark.sql('''\n", 134 | " SELECT count(DISTINCT userId ) as count\n", 135 | " FROM user_log_table\n", 136 | " WHERE gender = 'F'\n", 137 | " ''').show()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 15, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "+--------+-----+\n", 150 | "| artist|count|\n", 151 | "+--------+-----+\n", 152 | "|Coldplay| 83|\n", 153 | "+--------+-----+\n", 154 | "\n" 155 | ] 156 | } 157 | ], 158 | "source": [] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "# Question 4\n", 172 | "\n", 173 | "How many songs were played from the most played artist?" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 16, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "+--------+-----+\n", 186 | "| artist|count|\n", 187 | "+--------+-----+\n", 188 | "|Coldplay| 83|\n", 189 | "+--------+-----+\n", 190 | "\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "# TODO: write your code to answer question 4\n", 196 | "spark.sql('''\n", 197 | " SELECT artist, COUNT(*) AS count\n", 198 | " FROM user_log_table\n", 199 | " WHERE page = 'NextSong'\n", 200 | " GROUP BY artist\n", 201 | " ORDER BY count desc\n", 202 | " limit 1\n", 203 | " ''').show()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "# Question 5 (challenge)\n", 211 | "\n", 212 | "How many songs do users listen to on average between visiting our home page? Please round your answer to the closest integer." 
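Question 5 is left as an empty TODO cell below. One possible Spark SQL approach, sketched here and not part of the original notebook, mirrors the window-function idea used in the DataFrame answer key elsewhere in this repo: mark Home visits, number the stretches between them per user, count NextSong rows per stretch, then average. It assumes the same user_log_table view.

```python
# Sketch, not the official solution: average NextSong plays between Home visits
spark.sql("""
    WITH events AS (
        SELECT userId, page, ts,
               CASE WHEN page = 'Home' THEN 1 ELSE 0 END AS is_home
        FROM user_log_table
        WHERE page IN ('NextSong', 'Home')
    ),
    periods AS (
        SELECT userId, page,
               SUM(is_home) OVER (PARTITION BY userId ORDER BY ts DESC
                                  ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS period
        FROM events
    ),
    songs_per_period AS (
        SELECT userId, period, COUNT(*) AS songs
        FROM periods
        WHERE page = 'NextSong'
        GROUP BY userId, period
    )
    SELECT ROUND(AVG(songs)) AS avg_songs_between_home_visits
    FROM songs_per_period
""").show()
```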
213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# TODO: write your code to answer question 5\n", 222 | "\n" 223 | ] 224 | } 225 | ], 226 | "metadata": { 227 | "kernelspec": { 228 | "display_name": "Python 3", 229 | "language": "python", 230 | "name": "python3" 231 | }, 232 | "language_info": { 233 | "codemirror_mode": { 234 | "name": "ipython", 235 | "version": 3 236 | }, 237 | "file_extension": ".py", 238 | "mimetype": "text/x-python", 239 | "name": "python", 240 | "nbconvert_exporter": "python", 241 | "pygments_lexer": "ipython3", 242 | "version": "3.6.3" 243 | } 244 | }, 245 | "nbformat": 4, 246 | "nbformat_minor": 2 247 | } 248 | -------------------------------------------------------------------------------- /Intro to Data Warehouses/L3 Exercise 3 - Parallel ETL - Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 3: Parallel ETL" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%load_ext sql" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from time import time\n", 26 | "import configparser\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import pandas as pd" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# STEP 1: Get the params of the created redshift cluster \n", 36 | "- We need:\n", 37 | " - The redshift cluster endpoint\n", 38 | " - The IAM role ARN that give access to Redshift to read from S3" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "config = configparser.ConfigParser()\n", 48 | "config.read_file(open('dwh.cfg'))\n", 49 | "KEY=config.get('AWS','key')\n", 50 | "SECRET= config.get('AWS','secret')\n", 51 | "\n", 52 | "DWH_DB= config.get(\"DWH\",\"DWH_DB\")\n", 53 | "DWH_DB_USER= config.get(\"DWH\",\"DWH_DB_USER\")\n", 54 | "DWH_DB_PASSWORD= config.get(\"DWH\",\"DWH_DB_PASSWORD\")\n", 55 | "DWH_PORT = config.get(\"DWH\",\"DWH_PORT\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# FILL IN THE REDSHIFT ENPOINT HERE\n", 65 | "# e.g. 
DWH_ENDPOINT=\"redshift-cluster-1.csmamz5zxmle.us-west-2.redshift.amazonaws.com\" \n", 66 | "DWH_ENDPOINT=\"\" \n", 67 | " \n", 68 | "#FILL IN THE IAM ROLE ARN you got in step 2.2 of the previous exercise\n", 69 | "#e.g DWH_ROLE_ARN=\"arn:aws:iam::988332130976:role/dwhRole\"\n", 70 | "DWH_ROLE_ARN=\"\"" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "# STEP 2: Connect to the Redshift Cluster" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "conn_string=\"postgresql://{}:{}@{}:{}/{}\".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)\n", 87 | "print(conn_string)\n", 88 | "%sql $conn_string" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "import boto3\n", 98 | "\n", 99 | "s3 = boto3.resource('s3',\n", 100 | " region_name=\"us-west-2\",\n", 101 | " aws_access_key_id=KEY,\n", 102 | " aws_secret_access_key=SECRET\n", 103 | " )\n", 104 | "\n", 105 | "sampleDbBucket = s3.Bucket(\"udacity-labs\")\n", 106 | "\n", 107 | "for obj in sampleDbBucket.objects.filter(Prefix=\"tickets\"):\n", 108 | " print(obj)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "# STEP 3: Create Tables" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "%%sql \n", 125 | "DROP TABLE IF EXISTS \"sporting_event_ticket\";\n", 126 | "CREATE TABLE \"sporting_event_ticket\" (\n", 127 | " \"id\" double precision DEFAULT nextval('sporting_event_ticket_seq') NOT NULL,\n", 128 | " \"sporting_event_id\" double precision NOT NULL,\n", 129 | " \"sport_location_id\" double precision NOT NULL,\n", 130 | " \"seat_level\" numeric(1,0) NOT NULL,\n", 131 | " \"seat_section\" character varying(15) NOT NULL,\n", 132 | " \"seat_row\" character varying(10) NOT NULL,\n", 133 | " \"seat\" character varying(10) NOT NULL,\n", 134 | " \"ticketholder_id\" double precision,\n", 135 | " \"ticket_price\" numeric(8,2) NOT NULL\n", 136 | ");" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "# STEP 4: Load Partitioned data into the cluster" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "%%time\n", 153 | "qry = \"\"\"\n", 154 | " copy sporting_event_ticket from 's3://udacity-labs/tickets/split/part'\n", 155 | " credentials 'aws_iam_role={}'\n", 156 | " gzip delimiter ';' compupdate off region 'us-west-2';\n", 157 | "\"\"\".format(DWH_ROLE_ARN)\n", 158 | "\n", 159 | "%sql $qry" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "# STEP 4: Create Tables for the non-partitioned data" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "%%sql\n", 176 | "DROP TABLE IF EXISTS \"sporting_event_ticket_full\";\n", 177 | "CREATE TABLE \"sporting_event_ticket_full\" (\n", 178 | " \"id\" double precision DEFAULT nextval('sporting_event_ticket_seq') NOT NULL,\n", 179 | " \"sporting_event_id\" double precision NOT NULL,\n", 180 | " \"sport_location_id\" double precision NOT NULL,\n", 181 | " \"seat_level\" numeric(1,0) NOT NULL,\n", 182 | " \"seat_section\" character 
varying(15) NOT NULL,\n", 183 | " \"seat_row\" character varying(10) NOT NULL,\n", 184 | " \"seat\" character varying(10) NOT NULL,\n", 185 | " \"ticketholder_id\" double precision,\n", 186 | " \"ticket_price\" numeric(8,2) NOT NULL\n", 187 | ");" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "# STEP 5: Load non-partitioned data into the cluster\n", 195 | "- Note how it's slower than loading partitioned data" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "%%time\n", 205 | "\n", 206 | "qry = \"\"\"\n", 207 | " copy sporting_event_ticket_full from 's3://udacity-labs/tickets/full/full.csv.gz' \n", 208 | " credentials 'aws_iam_role={}' \n", 209 | " gzip delimiter ';' compupdate off region 'us-west-2';\n", 210 | "\"\"\".format(DWH_ROLE_ARN)\n", 211 | "\n", 212 | "%sql $qry" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [] 221 | } 222 | ], 223 | "metadata": { 224 | "kernelspec": { 225 | "display_name": "Python 3", 226 | "language": "python", 227 | "name": "python3" 228 | }, 229 | "language_info": { 230 | "codemirror_mode": { 231 | "name": "ipython", 232 | "version": 3 233 | }, 234 | "file_extension": ".py", 235 | "mimetype": "text/x-python", 236 | "name": "python", 237 | "nbconvert_exporter": "python", 238 | "pygments_lexer": "ipython3", 239 | "version": "3.6.3" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 2 244 | } 245 | -------------------------------------------------------------------------------- /project3/sql_queries.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | 3 | 4 | # CONFIG 5 | config = configparser.ConfigParser() 6 | config.read('dwh.cfg') 7 | IAM_ROLE = config['IAM_ROLE']['ARN'] 8 | LOG_DATA = config['S3']['LOG_DATA'] 9 | SONG_DATA = config['S3']['SONG_DATA'] 10 | LOG_JSONPATH = config['S3']['LOG_JSONPATH'] 11 | # DROP TABLES 12 | 13 | staging_events_table_drop = "DROP TABLE IF EXISTS staging_events" 14 | staging_songs_table_drop = "DROP TABLE IF EXISTS staging_songs" 15 | songplay_table_drop = "DROP TABLE IF EXISTS songplay" 16 | user_table_drop = "DROP TABLE IF EXISTS users" 17 | song_table_drop = "DROP TABLE IF EXISTS song" 18 | artist_table_drop = "DROP TABLE IF EXISTS artist" 19 | time_table_drop = "DROP TABLE IF EXISTS time" 20 | 21 | # CREATE TABLES 22 | 23 | staging_events_table_create= ("""CREATE TABLE IF NOT EXISTS staging_events( 24 | artist TEXT, 25 | auth TEXT, 26 | first_name TEXT, 27 | gender CHAR(1), 28 | item_session INTEGER, 29 | last_name TEXT, 30 | length NUMERIC, 31 | level TEXT, 32 | location TEXT, 33 | method TEXT, 34 | page TEXT, 35 | registration NUMERIC, 36 | session_id INTEGER, 37 | song TEXT, 38 | status INTEGER, 39 | ts BIGINT, 40 | user_agent TEXT, 41 | user_id INTEGER 42 | ) 43 | """) 44 | 45 | staging_songs_table_create = ("""CREATE TABLE IF NOT EXISTS staging_songs( 46 | num_songs INTEGER, 47 | artist_id TEXT, 48 | artist_latitude NUMERIC, 49 | artist_longitude NUMERIC, 50 | artist_location TEXT, 51 | artist_name TEXT, 52 | song_id TEXT, 53 | title TEXT, 54 | duration NUMERIC, 55 | year INTEGER 56 | ) 57 | """) 58 | 59 | songplay_table_create = ("""CREATE TABLE IF NOT EXISTS songplay( 60 | songplay_id INT IDENTITY(1,1) PRIMARY KEY, 61 | start_time TIMESTAMP, 62 | user_id 
INTEGER NOT NULL, 63 | level TEXT, 64 | song_id TEXT, 65 | artist_id TEXT, 66 | session_id INTEGER, 67 | location TEXT, 68 | user_agent TEXT 69 | ) 70 | """) 71 | 72 | user_table_create = ("""CREATE TABLE IF NOT EXISTS users( 73 | user_id INTEGER PRIMARY KEY, 74 | first_name TEXT NOT NULL, 75 | last_name TEXT NOT NULL, 76 | gender CHAR(1), 77 | level TEXT 78 | ) 79 | """) 80 | 81 | song_table_create = ("""CREATE TABLE IF NOT EXISTS song( 82 | song_id TEXT PRIMARY KEY, 83 | title TEXT, 84 | artist_id TEXT, 85 | year INTEGER, 86 | duration NUMERIC 87 | ) 88 | """) 89 | 90 | artist_table_create = ("""CREATE TABLE IF NOT EXISTS artist( 91 | artist_id TEXT PRIMARY KEY, 92 | name TEXT, 93 | location TEXT, 94 | latitude NUMERIC, 95 | longitude NUMERIC 96 | ) 97 | """) 98 | 99 | time_table_create = ("""CREATE TABLE IF NOT EXISTS time( 100 | start_time TIMESTAMP PRIMARY KEY, 101 | hour INTEGER, 102 | day INTEGER, 103 | week INTEGER, 104 | month INTEGER, 105 | year INTEGER, 106 | weekDay INTEGER 107 | ) 108 | """) 109 | 110 | # STAGING TABLES 111 | staging_events_copy = ("""copy staging_events 112 | from {} 113 | iam_role {} 114 | json {}; 115 | """).format(LOG_DATA, IAM_ROLE, LOG_JSONPATH) 116 | 117 | staging_songs_copy = ("""copy staging_songs 118 | from {} 119 | iam_role {} 120 | json 'auto'; 121 | """).format(SONG_DATA, IAM_ROLE) 122 | 123 | # FINAL TABLES 124 | 125 | songplay_table_insert = ("""INSERT INTO songplay(start_time, user_id, level, song_id, artist_id, session_id, location, user_agent) 126 | SELECT timestamp 'epoch' + se.ts/1000 * interval '1 second' as start_time, se.user_id, se.level, 127 | ss.song_id, ss.artist_id, se.session_id, se.location, se.user_agent 128 | FROM staging_events se, staging_songs ss 129 | WHERE se.page = 'NextSong' AND 130 | se.song =ss.title AND 131 | se.artist = ss.artist_name AND 132 | se.length = ss.duration 133 | """) 134 | 135 | user_table_insert = ("""INSERT INTO users(user_id, first_name, last_name, gender, level) 136 | SELECT distinct user_id, first_name, last_name, gender, level 137 | FROM staging_events 138 | WHERE page = 'NextSong' 139 | """) 140 | 141 | song_table_insert = ("""INSERT INTO song(song_id, title, artist_id, year, duration) 142 | SELECT song_id, title, artist_id, year, duration 143 | FROM staging_songs 144 | WHERE song_id IS NOT NULL 145 | """) 146 | 147 | artist_table_insert = ("""INSERT INTO artist(artist_id, name, location, latitude, longitude) 148 | SELECT distinct artist_id, artist_name, artist_location , artist_latitude, artist_longitude 149 | FROM staging_songs 150 | WHERE artist_id IS NOT NULL 151 | """) 152 | 153 | time_table_insert = ("""INSERT INTO time(start_time, hour, day, week, month, year, weekDay) 154 | SELECT start_time, extract(hour from start_time), extract(day from start_time), 155 | extract(week from start_time), extract(month from start_time), 156 | extract(year from start_time), extract(dayofweek from start_time) 157 | FROM songplay 158 | """) 159 | 160 | # QUERY LISTS 161 | 162 | create_table_queries = [staging_events_table_create, staging_songs_table_create, songplay_table_create, user_table_create, song_table_create, artist_table_create, time_table_create] 163 | drop_table_queries = [staging_events_table_drop, staging_songs_table_drop, songplay_table_drop, user_table_drop, song_table_drop, artist_table_drop, time_table_drop] 164 | copy_table_queries = [staging_events_copy, staging_songs_copy] 165 | insert_table_queries = [songplay_table_insert, user_table_insert, song_table_insert, artist_table_insert, 
time_table_insert] 166 | -------------------------------------------------------------------------------- /Data wrangling with spark/2_spark_maps_and_lazy_evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Maps\n", 8 | "\n", 9 | "In Spark, maps take data as input and then transform that data with whatever function you put in the map. They are like directions for the data telling how each input should get to the output.\n", 10 | "\n", 11 | "The first code cell creates a SparkContext object. With the SparkContext, you can input a dataset and parallelize the data across a cluster (since you are currently using Spark in local mode on a single machine, technically the dataset isn't distributed yet).\n", 12 | "\n", 13 | "Run the code cell below to instantiate a SparkContext object and then read in the log_of_songs list into Spark. " 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "### \n", 23 | "# You might have noticed this code in the screencast.\n", 24 | "#\n", 25 | "# import findspark\n", 26 | "# findspark.init('spark-2.3.2-bin-hadoop2.7')\n", 27 | "#\n", 28 | "# The findspark Python module makes it easier to install\n", 29 | "# Spark in local mode on your computer. This is convenient\n", 30 | "# for practicing Spark syntax locally. \n", 31 | "# However, the workspaces already have Spark installed and you do not\n", 32 | "# need to use the findspark module\n", 33 | "#\n", 34 | "###\n", 35 | "\n", 36 | "import pyspark\n", 37 | "sc = pyspark.SparkContext(appName=\"maps_and_lazy_evaluation_example\")\n", 38 | "\n", 39 | "log_of_songs = [\n", 40 | " \"Despacito\",\n", 41 | " \"Nice for what\",\n", 42 | " \"No tears left to cry\",\n", 43 | " \"Despacito\",\n", 44 | " \"Havana\",\n", 45 | " \"In my feelings\",\n", 46 | " \"Nice for what\",\n", 47 | " \"despacito\",\n", 48 | " \"All the stars\"\n", 49 | "]\n", 50 | "\n", 51 | "# parallelize the log_of_songs to use with Spark\n", 52 | "distributed_song_log = sc.parallelize(log_of_songs)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/plain": [ 63 | "ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195" 64 | ] 65 | }, 66 | "execution_count": 2, 67 | "metadata": {}, 68 | "output_type": "execute_result" 69 | } 70 | ], 71 | "source": [ 72 | "distributed_song_log" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "This next code cell defines a function that converts a song title to lowercase. Then there is an example converting the word \"Havana\" to \"havana\"." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "'havana'" 91 | ] 92 | }, 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "def convert_song_to_lowercase(song):\n", 100 | " return song.lower()\n", 101 | "\n", 102 | "convert_song_to_lowercase(\"Havana\")" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "The following code cells demonstrate how to apply this function using a map step. 
The map step will go through each song in the list and apply the convert_song_to_lowercase() function. " 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 5, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "PythonRDD[1] at RDD at PythonRDD.scala:53" 121 | ] 122 | }, 123 | "execution_count": 5, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "distributed_song_log.map(convert_song_to_lowercase)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "You'll notice that this code cell ran quite quickly. This is because of lazy evaluation. Spark does not actually execute the map step unless it needs to.\n", 137 | "\n", 138 | "\"RDD\" in the output refers to resilient distributed dataset. RDDs are exactly what they say they are: fault-tolerant datasets distributed across a cluster. This is how Spark stores data. \n", 139 | "\n", 140 | "To get Spark to actually run the map step, you need to use an \"action\". One available action is the collect method. The collect() method takes the results from all of the clusters and \"collects\" them into a single list on the master node." 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "['despacito',\n", 152 | " 'nice for what',\n", 153 | " 'no tears left to cry',\n", 154 | " 'despacito',\n", 155 | " 'havana',\n", 156 | " 'in my feelings',\n", 157 | " 'nice for what',\n", 158 | " 'despacito',\n", 159 | " 'all the stars']" 160 | ] 161 | }, 162 | "execution_count": 6, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "distributed_song_log.map(convert_song_to_lowercase).collect()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "Note as well that Spark is not changing the original data set: Spark is merely making a copy. You can see this by running collect() on the original dataset." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 7, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "['Despacito',\n", 187 | " 'Nice for what',\n", 188 | " 'No tears left to cry',\n", 189 | " 'Despacito',\n", 190 | " 'Havana',\n", 191 | " 'In my feelings',\n", 192 | " 'Nice for what',\n", 193 | " 'despacito',\n", 194 | " 'All the stars']" 195 | ] 196 | }, 197 | "execution_count": 7, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "distributed_song_log.collect()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "You do not always have to write a custom function for the map step. You can also use anonymous (lambda) functions as well as built-in Python functions like string.lower(). \n", 211 | "\n", 212 | "Anonymous functions are actually a Python feature for writing functional style programs." 
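To make the laziness concrete, a small follow-on sketch (assuming the sc context and the distributed_song_log RDD created at the top of this notebook, and not part of the original lesson): the lowercase map on its own does nothing until an action such as countByValue() forces the job to run.

```python
# The map is only recorded; countByValue() is the action that triggers execution
play_counts = (
    distributed_song_log
    .map(lambda song: song.lower())   # lazy transformation
    .countByValue()                   # action: runs the job, returns counts per title
)
print(dict(play_counts))              # e.g. {'despacito': 3, 'nice for what': 2, ...}
```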
213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 8, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/plain": [ 223 | "['despacito',\n", 224 | " 'nice for what',\n", 225 | " 'no tears left to cry',\n", 226 | " 'despacito',\n", 227 | " 'havana',\n", 228 | " 'in my feelings',\n", 229 | " 'nice for what',\n", 230 | " 'despacito',\n", 231 | " 'all the stars']" 232 | ] 233 | }, 234 | "execution_count": 8, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "distributed_song_log.map(lambda song: song.lower()).collect()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "distributed_song_log.map(lambda x: x.lower()).collect()" 250 | ] 251 | } 252 | ], 253 | "metadata": { 254 | "kernelspec": { 255 | "display_name": "Python 3", 256 | "language": "python", 257 | "name": "python3" 258 | }, 259 | "language_info": { 260 | "codemirror_mode": { 261 | "name": "ipython", 262 | "version": 3 263 | }, 264 | "file_extension": ".py", 265 | "mimetype": "text/x-python", 266 | "name": "python", 267 | "nbconvert_exporter": "python", 268 | "pygments_lexer": "ipython3", 269 | "version": "3.6.3" 270 | } 271 | }, 272 | "nbformat": 4, 273 | "nbformat_minor": 2 274 | } 275 | -------------------------------------------------------------------------------- /Data wrangling with spark/6_dataframe_quiz_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Answer Key to the Data Wrangling with DataFrames Coding Quiz\n", 8 | "\n", 9 | "Helpful resources:\n", 10 | "http://spark.apache.org/docs/latest/api/python/pyspark.sql.html" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from pyspark.sql import SparkSession\n", 20 | "from pyspark.sql.functions import isnan, count, when, col, desc, udf, col, sort_array, asc, avg\n", 21 | "from pyspark.sql.functions import sum as Fsum\n", 22 | "from pyspark.sql.window import Window\n", 23 | "from pyspark.sql.types import IntegerType" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# 1) import any other libraries you might need\n", 33 | "# 2) instantiate a Spark session \n", 34 | "# 3) read in the data set located at the path \"data/sparkify_log_small.json\"\n", 35 | "# 4) write code to answer the quiz questions \n", 36 | "\n", 37 | "spark = SparkSession \\\n", 38 | " .builder \\\n", 39 | " .appName(\"Data Frames practice\") \\\n", 40 | " .getOrCreate()\n", 41 | "\n", 42 | "df = spark.read.json(\"data/sparkify_log_small.json\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# Question 1\n", 50 | "\n", 51 | "Which page did user id \"\" (empty string) NOT visit?" 
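The answer below collects both page lists to the driver and takes a Python set difference; the same question can also be answered without leaving Spark by using an anti-join. A sketch assuming the df DataFrame read in the setup cell (not part of the original answer key):

```python
# Pages the empty-string user did visit
visited_by_blank = df.filter(df.userId == '').select('page').dropDuplicates()

# All pages minus those, computed inside Spark rather than on the driver
df.select('page').dropDuplicates() \
  .join(visited_by_blank, on='page', how='left_anti') \
  .show()
```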
52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "root\n", 64 | " |-- artist: string (nullable = true)\n", 65 | " |-- auth: string (nullable = true)\n", 66 | " |-- firstName: string (nullable = true)\n", 67 | " |-- gender: string (nullable = true)\n", 68 | " |-- itemInSession: long (nullable = true)\n", 69 | " |-- lastName: string (nullable = true)\n", 70 | " |-- length: double (nullable = true)\n", 71 | " |-- level: string (nullable = true)\n", 72 | " |-- location: string (nullable = true)\n", 73 | " |-- method: string (nullable = true)\n", 74 | " |-- page: string (nullable = true)\n", 75 | " |-- registration: long (nullable = true)\n", 76 | " |-- sessionId: long (nullable = true)\n", 77 | " |-- song: string (nullable = true)\n", 78 | " |-- status: long (nullable = true)\n", 79 | " |-- ts: long (nullable = true)\n", 80 | " |-- userAgent: string (nullable = true)\n", 81 | " |-- userId: string (nullable = true)\n", 82 | "\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "df.printSchema()" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "Logout\n", 100 | "Error\n", 101 | "Submit Upgrade\n", 102 | "Submit Downgrade\n", 103 | "Save Settings\n", 104 | "Downgrade\n", 105 | "Settings\n", 106 | "Upgrade\n", 107 | "NextSong\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "# filter for users with blank user id\n", 113 | "blank_pages = df.filter(df.userId == '') \\\n", 114 | " .select(col('page') \\\n", 115 | " .alias('blank_pages')) \\\n", 116 | " .dropDuplicates()\n", 117 | "\n", 118 | "# get a list of possible pages that could be visited\n", 119 | "all_pages = df.select('page').dropDuplicates()\n", 120 | "\n", 121 | "# find values in all_pages that are not in blank_pages\n", 122 | "# these are the pages that the blank user did not go to\n", 123 | "for row in set(all_pages.collect()) - set(blank_pages.collect()):\n", 124 | " print(row.page)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "# Question 2 - Reflect\n", 132 | "\n", 133 | "What type of user does the empty string user id most likely refer to?\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Perhaps it represents users who have not signed up yet or who are signed out and are about to log in." 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "# Question 3\n", 148 | "\n", 149 | "How many female users do we have in the data set?" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 9, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "462" 161 | ] 162 | }, 163 | "execution_count": 9, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "df.filter(df.gender == 'F') \\\n", 170 | " .select('userId', 'gender') \\\n", 171 | " .dropDuplicates() \\\n", 172 | " .count()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "# Question 4\n", 180 | "\n", 181 | "How many songs were played from the most played artist?" 
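The next cell answers this with a dictionary-style agg; a slightly more direct variant uses groupBy().count() together with the already-imported desc. A sketch over the same df, offered only as an alternative formulation:

```python
# Most played artist among NextSong events
df.filter(df.page == 'NextSong') \
  .groupBy('artist') \
  .count() \
  .orderBy(desc('count')) \
  .show(1)
```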
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 10, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "+--------+-----------+\n", 194 | "| Artist|Artistcount|\n", 195 | "+--------+-----------+\n", 196 | "|Coldplay| 83|\n", 197 | "+--------+-----------+\n", 198 | "only showing top 1 row\n", 199 | "\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "df.filter(df.page == 'NextSong') \\\n", 205 | " .select('Artist') \\\n", 206 | " .groupBy('Artist') \\\n", 207 | " .agg({'Artist':'count'}) \\\n", 208 | " .withColumnRenamed('count(Artist)', 'Artistcount') \\\n", 209 | " .sort(desc('Artistcount')) \\\n", 210 | " .show(1)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "# Question 5 (challenge)\n", 218 | "\n", 219 | "How many songs do users listen to on average between visiting our home page? Please round your answer to the closest integer.\n", 220 | "\n" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 11, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "+------------------+\n", 233 | "|avg(count(period))|\n", 234 | "+------------------+\n", 235 | "| 6.898347107438017|\n", 236 | "+------------------+\n", 237 | "\n" 238 | ] 239 | } 240 | ], 241 | "source": [ 242 | "# TODO: filter out 0 sum and max sum to get more exact answer\n", 243 | "\n", 244 | "function = udf(lambda ishome : int(ishome == 'Home'), IntegerType())\n", 245 | "\n", 246 | "user_window = Window \\\n", 247 | " .partitionBy('userID') \\\n", 248 | " .orderBy(desc('ts')) \\\n", 249 | " .rangeBetween(Window.unboundedPreceding, 0)\n", 250 | "\n", 251 | "cusum = df.filter((df.page == 'NextSong') | (df.page == 'Home')) \\\n", 252 | " .select('userID', 'page', 'ts') \\\n", 253 | " .withColumn('homevisit', function(col('page'))) \\\n", 254 | " .withColumn('period', Fsum('homevisit').over(user_window))\n", 255 | "\n", 256 | "cusum.filter((cusum.page == 'NextSong')) \\\n", 257 | " .groupBy('userID', 'period') \\\n", 258 | " .agg({'period':'count'}) \\\n", 259 | " .agg({'count(period)':'avg'}).show()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [] 268 | } 269 | ], 270 | "metadata": { 271 | "kernelspec": { 272 | "display_name": "Python 3", 273 | "language": "python", 274 | "name": "python3" 275 | }, 276 | "language_info": { 277 | "codemirror_mode": { 278 | "name": "ipython", 279 | "version": 3 280 | }, 281 | "file_extension": ".py", 282 | "mimetype": "text/x-python", 283 | "name": "python", 284 | "nbconvert_exporter": "python", 285 | "pygments_lexer": "ipython3", 286 | "version": "3.6.3" 287 | } 288 | }, 289 | "nbformat": 4, 290 | "nbformat_minor": 2 291 | } 292 | -------------------------------------------------------------------------------- /project4/etl.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import configparser\n", 10 | "from datetime import datetime\n", 11 | "import calendar\n", 12 | "import os\n", 13 | "from pyspark.sql import SparkSession\n", 14 | "from pyspark.sql.functions import udf, col\n", 15 | "from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format\n", 16 | 
"from pyspark.sql.functions import monotonically_increasing_id\n", 17 | "\n", 18 | "config = configparser.ConfigParser()\n", 19 | "config.read('dl.cfg')\n", 20 | "\n", 21 | "os.environ['AWS_ACCESS_KEY_ID']=config['KEYS']['AWS_ACCESS_KEY_ID']\n", 22 | "os.environ['AWS_SECRET_ACCESS_KEY']=config['KEYS']['AWS_SECRET_ACCESS_KEY']\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 4, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "def create_spark_session():\n", 32 | " spark = SparkSession \\\n", 33 | " .builder \\\n", 34 | " .config(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:2.7.0\") \\\n", 35 | " .getOrCreate()\n", 36 | " return spark\n" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "def process_song_data(spark, input_data, output_data):\n", 46 | " # get filepath to song data file\n", 47 | " song_data = os.path.join(input_data,\"song_data/*/*/*/*.json\")\n", 48 | " # read song data file\n", 49 | " df = spark.read.json(song_data)\n", 50 | " # extract columns to create songs table\n", 51 | " songs_table = df['song_id', 'title', 'artist_id', 'year', 'duration']\n", 52 | " \n", 53 | " # write songs table to parquet files partitioned by year and artist\n", 54 | " songs_table.write.partitionBy('year', 'artist_id').parquet(os.path.join(output_data, 'songs.parquet'), 'overwrite')\n", 55 | " print(\"--- songs.parquet completed ---\")\n", 56 | " # extract columns to create artists table\n", 57 | " artists_table = df['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude']\n", 58 | " \n", 59 | " # write artists table to parquet files\n", 60 | " artists_table.write.parquet(os.path.join(output_data, 'artists.parquet'), 'overwrite')\n", 61 | " print(\"--- artists.parquet completed ---\")\n", 62 | " print(\"*** process_song_data completed ***\\n\\n\")\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "def process_log_data(spark, input_data, output_data):\n", 72 | " # get filepath to log data file\n", 73 | " log_data =os.path.join(input_data,\"log_data/*/*/*.json\")\n", 74 | "\n", 75 | " # read log data file\n", 76 | " df = spark.read.json(log_data)\n", 77 | " \n", 78 | " # filter by actions for song plays\n", 79 | "# songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent\n", 80 | " songplays_table = df['ts', 'userId', 'level','sessionId', 'location', 'userAgent']\n", 81 | "\n", 82 | " # extract columns for users table \n", 83 | "# user_id, first_name, last_name, gender, level\n", 84 | " users_table = df['userId', 'firstName', 'lastName', 'gender', 'level']\n", 85 | " \n", 86 | " # write users table to parquet files\n", 87 | " users_table.write.parquet(os.path.join(output_data, 'users.parquet'), 'overwrite')\n", 88 | " print(\"--- users.parquet completed ---\")\n", 89 | " # create timestamp column from original timestamp column\n", 90 | " get_timestamp = udf(lambda x: str(int(int(x)/1000)))\n", 91 | " df = df.withColumn('timestamp', get_timestamp(df.ts))\n", 92 | " # create datetime column from original timestamp column\n", 93 | " get_datetime = udf(lambda x: datetime.fromtimestamp(int(int(x)/1000)))\n", 94 | " get_week = udf(lambda x: calendar.day_name[x.weekday()])\n", 95 | " get_weekday = udf(lambda x: x.isocalendar()[1])\n", 96 | " get_hour = udf(lambda x: x.hour)\n", 97 | " get_day = udf(lambda x : 
x.day)\n", 98 | " get_year = udf(lambda x: x.year)\n", 99 | " get_month = udf(lambda x: x.month)\n", 100 | " \n", 101 | " \n", 102 | " df = df.withColumn('start_time', get_datetime(df.ts))\n", 103 | " df = df.withColumn('hour', get_hour(df.start_time))\n", 104 | " df = df.withColumn('day', get_day(df.start_time))\n", 105 | " df = df.withColumn('week', get_week(df.start_time))\n", 106 | " df = df.withColumn('month', get_month(df.start_time))\n", 107 | " df = df.withColumn('year', get_year(df.start_time))\n", 108 | " df = df.withColumn('weekday', get_weekday(df.start_time))\n", 109 | " time_table = df['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday']\n", 110 | " \n", 111 | " # write time table to parquet files partitioned by year and month\n", 112 | " time_table.write.partitionBy('year', 'month').parquet(os.path.join(output_data, 'time.parquet'), 'overwrite')\n", 113 | " print(\"--- time.parquet completed ---\")\n", 114 | " # read in song data to use for songplays table\n", 115 | " song_df = spark.read.parquet(\"songs.parquet\")\n", 116 | "\n", 117 | " # extract columns from joined song and log datasets to create songplays table \n", 118 | " df = df.join(song_df, song_df.title == df.song)\n", 119 | " songplays_table = df['start_time', 'userId', 'level', 'song_id', 'artist_id', 'sessionId', 'location', 'userAgent']\n", 120 | " songplays_table.select(monotonically_increasing_id().alias('songplay_id')).collect()\n", 121 | " # write songplays table to parquet files partitioned by year and month\n", 122 | " songplays_table.write.parquet(os.path.join(output_data, 'songplays.parquet'), 'overwrite')\n", 123 | " print(\"--- songplays.parquet completed ---\")\n", 124 | " print(\"*** process_log_data completed ***\\n\\nEND\")" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 6, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "\n", 134 | "spark = create_spark_session()\n", 135 | "input_data = \"s3a://udacity-dend/\"\n", 136 | "output_data = \"\"\n", 137 | "\n", 138 | "# process_song_data(spark, input_data, output_data) \n", 139 | "# process_log_data(spark, input_data, output_data)\n", 140 | "\n" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "song_df = spark.read.parquet(\"songs.parquet\")" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "song_df.show(5)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "# log_data =os.path.join(input_data,\"log_data/2018/11/2018-11-12-events.json\")\n", 168 | "\n", 169 | "# # read log data file\n", 170 | "# df = spark.read.json(log_data)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "# df.show(1)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "# df = df.join(song_df, song_df.title == df.song)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# df.show()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# 
os.system(\"rm -rf users\")" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Python 3", 220 | "language": "python", 221 | "name": "python3" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.6.3" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 2 238 | } 239 | -------------------------------------------------------------------------------- /NoSQL data modeling/Lesson 3 Exercise 3 Clustering Column.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lesson 3 Exercise 3: Focus on Clustering Columns\n", 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Walk through the basics of creating a table with a good Primary Key and Clustering Columns in Apache Cassandra, inserting rows of data, and doing a simple CQL query to validate the information. \n", 16 | "\n", 17 | "### Remember, replace ##### with your own code.\n", 18 | "\n", 19 | "Note: __Do not__ click the blue Preview button in the lower task bar" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "#### We will use a python wrapper/ python driver called cassandra to run the Apache Cassandra queries. This library should be preinstalled but in the future to install this library you can run this command in a notebook to install locally: \n", 27 | "! pip install cassandra-driver\n", 28 | "#### More documentation can be found here: https://datastax.github.io/python-driver/" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "#### Import Apache Cassandra python package" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 22, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import cassandra" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "### Create a connection to the database" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 23, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from cassandra.cluster import Cluster\n", 61 | "try: \n", 62 | " cluster = Cluster(['127.0.0.1']) #If you have a locally installed Apache Cassandra instance\n", 63 | " session = cluster.connect()\n", 64 | "except Exception as e:\n", 65 | " print(e)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### Create a keyspace to work in " 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 24, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "try:\n", 82 | " session.execute(\"\"\"\n", 83 | " CREATE KEYSPACE IF NOT EXISTS udacity \n", 84 | " WITH REPLICATION = \n", 85 | " { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }\"\"\"\n", 86 | ")\n", 87 | "\n", 88 | "except Exception as e:\n", 89 | " print(e)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "#### Connect to the Keyspace. 
Compare this to how we had to create a new session in PostgreSQL. " 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 25, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "try:\n", 106 | " session.set_keyspace('udacity')\n", 107 | "except Exception as e:\n", 108 | " print(e)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Imagine we would like to start creating a new Music Library of albums. \n", 116 | "\n", 117 | "### We want to ask 1 question of our data:\n", 118 | "### 1. Give me all the information from the music library about a given album\n", 119 | "`select * from album_library WHERE album_name=\"Close To You\"`" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### Here is the data:\n", 127 | "" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "### How should we model this data? What should be our Primary Key and Partition Key? " 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 26, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "query = \"CREATE TABLE IF NOT EXISTS album_library \"\n", 144 | "query = query + \"(year int, artist_name text, album_name text, city text, PRIMARY KEY (album_name, artist_name, city))\"\n", 145 | "try:\n", 146 | " session.execute(query)\n", 147 | "except Exception as e:\n", 148 | " print(e)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### Insert data into the table" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 27, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "## You can opt to change the sequence of columns to match your composite key. \\ \n", 165 | "## If you do, make sure to match the values in the INSERT statement\n", 166 | "\n", 167 | "query = \"INSERT INTO album_library (year, artist_name, album_name, city)\"\n", 168 | "query = query + \" VALUES (%s, %s, %s, %s)\"\n", 169 | "\n", 170 | "try:\n", 171 | " session.execute(query, (1970, \"The Beatles\", \"Let it Be\", \"Liverpool\"))\n", 172 | "except Exception as e:\n", 173 | " print(e)\n", 174 | " \n", 175 | "try:\n", 176 | " session.execute(query, (1965, \"The Beatles\", \"Rubber Soul\", \"Oxford\"))\n", 177 | "except Exception as e:\n", 178 | " print(e)\n", 179 | " \n", 180 | "try:\n", 181 | " session.execute(query, (1964, \"The Beatles\", \"Beatles For Sale\", \"London\"))\n", 182 | "except Exception as e:\n", 183 | " print(e)\n", 184 | "\n", 185 | "try:\n", 186 | " session.execute(query, (1966, \"The Monkees\", \"The Monkees\", \"Los Angeles\"))\n", 187 | "except Exception as e:\n", 188 | " print(e)\n", 189 | "\n", 190 | "try:\n", 191 | " session.execute(query, (1970, \"The Carpenters\", \"Close To You\", \"San Diego\"))\n", 192 | "except Exception as e:\n", 193 | " print(e)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### Validate the Data Model -- Did it work? 
\n", 201 | "`select * from album_library WHERE album_name=\"Close To You\"`" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 30, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "The Carpenters Close To You San Diego 1970\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "query = \"select * from album_library WHERE album_name='Close To You'\"\n", 219 | "try:\n", 220 | " rows = session.execute(query)\n", 221 | "except Exception as e:\n", 222 | " print(e)\n", 223 | " \n", 224 | "for row in rows:\n", 225 | " print (row.artist_name, row.album_name, row.city, row.year)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "### Your output should be:\n", 233 | "('The Carpenters', 'Close to You', 'San Diego', 1970)\n", 234 | "\n", 235 | "### OR\n", 236 | "('The Carpenters', 'Close to You', 1970, 'San Diego') " 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "### Drop the table" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 31, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "query = \"drop table album_library\"\n", 253 | "try:\n", 254 | " rows = session.execute(query)\n", 255 | "except Exception as e:\n", 256 | " print(e)\n" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "### Close the session and cluster connection" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "session.shutdown()\n", 273 | "cluster.shutdown()" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.6.3" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 2 305 | } 306 | -------------------------------------------------------------------------------- /RDBMS data modeling/l1-demo-1-creating-a-table-with-postgres.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lesson 1 Demo 1: Creating a Table with PostgreSQL\n", 8 | "\n", 9 | "Image title: Postgres Icon" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Walk through the basics of PostgreSQL:
  • Creating a table
  • Inserting rows of data,
  • Running a simple SQL query to validate the information. " 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Typically, we would use a python wrapper called *psycopg2* to run the PostgreSQL queries. This library should be preinstalled but in the future to install this library, run the following command in the notebook to install locally: \n", 24 | "!pip3 install --user psycopg2\n", 25 | "#### More documentation can be found here: http://initd.org/psycopg/ " 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "#### Import the library \n", 33 | "Note: An error might popup after this command has executed. Read it carefully before proceeding." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import psycopg2" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "### Create a connection to the database\n", 50 | "1. Connect to the local instance of PostgreSQL (*127.0.0.1*)\n", 51 | "2. Use the database/schema from the instance. \n", 52 | "3. The connection reaches out to the database (*studentdb*) and uses the correct privileges to connect to the database (*user and password = student*).\n", 53 | "\n", 54 | "### Note 1: This block of code will be standard in all notebooks. \n", 55 | "### Note 2: Adding the try except will make sure errors are caught and understood" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "try: \n", 65 | " conn = psycopg2.connect(\"host=127.0.0.1 dbname=studentdb user=student password=student\")\n", 66 | "except psycopg2.Error as e: \n", 67 | " print(\"Error: Could not make connection to the Postgres database\")\n", 68 | " print(e)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Use the connection to get a cursor that can be used to execute queries." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "try: \n", 85 | " cur = conn.cursor()\n", 86 | "except psycopg2.Error as e: \n", 87 | " print(\"Error: Could not get curser to the Database\")\n", 88 | " print(e)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "### Use automactic commit so that each action is commited without having to call conn.commit() after each command. The ability to rollback and commit transactions is a feature of Relational Databases. " 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "conn.set_session(autocommit=True)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### Test the Connection and Error Handling Code\n", 112 | "The try-except block should handle the error: We are trying to do a select * on a table but the table has not been created yet." 
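For reference, the connect/cursor/execute pattern above can also be written with context managers; note that leaving a psycopg2 connection block commits (or rolls back) the transaction but does not close the connection. A sketch using the same studentdb credentials, not part of the original demo:

```python
import psycopg2

try:
    with psycopg2.connect("host=127.0.0.1 dbname=studentdb user=student password=student") as conn:
        with conn.cursor() as cur:        # cursor is closed when the block exits
            cur.execute("SELECT 1")
            print(cur.fetchone())         # (1,)
        # exiting the connection block commits the transaction (or rolls back on error)
    conn.close()                          # the connection itself still has to be closed
except psycopg2.Error as e:
    print("Error talking to Postgres:", e)
```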
113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "try: \n", 122 | " cur.execute(\"select * from udacity.music_library\")\n", 123 | "except psycopg2.Error as e:\n", 124 | " print(e)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "### Create a database to work in " 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "try: \n", 141 | " cur.execute(\"create database udacity\")\n", 142 | "except psycopg2.Error as e:\n", 143 | " print(e)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "### Close our connection to the default database, reconnect to the Udacity database, and get a new cursor." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "try: \n", 160 | " conn.close()\n", 161 | "except psycopg2.Error as e:\n", 162 | " print(e)\n", 163 | " \n", 164 | "try: \n", 165 | " conn = psycopg2.connect(\"host=127.0.0.1 dbname=studentdb user=student password=student\")\n", 166 | "except psycopg2.Error as e: \n", 167 | " print(\"Error: Could not make connection to the Postgres database\")\n", 168 | " print(e)\n", 169 | " \n", 170 | "try: \n", 171 | " cur = conn.cursor()\n", 172 | "except psycopg2.Error as e: \n", 173 | " print(\"Error: Could not get curser to the Database\")\n", 174 | " print(e)\n", 175 | "\n", 176 | "conn.set_session(autocommit=True)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "### We will create a Music Library of albums. Each album has a lot of information we could add to the music library table. We will start with album name, artist name, year. \n", 184 | "`Table Name: music_library\n", 185 | "column 1: Album Name\n", 186 | "column 2: Artist Name\n", 187 | "column 3: Year `\n", 188 | "### Translate this information into a Create Table Statement. \n", 189 | "\n", 190 | "Review this document on PostgreSQL datatypes: https://www.postgresql.org/docs/9.5/datatype.html\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "try: \n", 200 | " cur.execute(\"CREATE TABLE IF NOT EXISTS music_library (album_name varchar, artist_name varchar, year int);\")\n", 201 | "except psycopg2.Error as e: \n", 202 | " print(\"Error: Issue creating table\")\n", 203 | " print (e)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### No error was found, but lets check to ensure our table was created. `select count(*)` which should return 0 as no rows have been inserted in the table." 
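Besides counting its rows, you can confirm the table exists by asking PostgreSQL's information_schema catalog directly. A small sketch assuming the cur cursor from the cells above (not part of the original demo):

```python
# Look the table up in the catalog instead of selecting from it
try:
    cur.execute("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'public'
          AND table_name = 'music_library'
    """)
    print(cur.fetchall())   # [('music_library',)] once the CREATE TABLE has run
except psycopg2.Error as e:
    print(e)
```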
211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "try: \n", 220 | " cur.execute(\"select count(*) from music_library\")\n", 221 | "except psycopg2.Error as e: \n", 222 | " print(\"Error: Issue creating table\")\n", 223 | " print (e)\n", 224 | " \n", 225 | "print(cur.fetchall())" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "### Insert two rows " 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "try: \n", 242 | " cur.execute(\"INSERT INTO music_library (album_name, artist_name, year) \\\n", 243 | " VALUES (%s, %s, %s)\", \\\n", 244 | " (\"Let It Be\", \"The Beatles\", 1970))\n", 245 | "except psycopg2.Error as e: \n", 246 | " print(\"Error: Inserting Rows\")\n", 247 | " print (e)\n", 248 | " \n", 249 | "try: \n", 250 | " cur.execute(\"INSERT INTO music_library (album_name, artist_name, year) \\\n", 251 | " VALUES (%s, %s, %s)\", \\\n", 252 | " (\"Rubber Soul\", \"The Beatles\", 1965))\n", 253 | "except psycopg2.Error as e: \n", 254 | " print(\"Error: Inserting Rows\")\n", 255 | " print (e)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "### Validate your data was inserted into the table. \n", 263 | "The while loop is used for printing the results. If executing queries in the Postgres shell, this would not be required." 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "### Note: If you run the insert statement code more than once, you will see duplicates of your data. PostgreSQL allows for duplicates." 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "try: \n", 280 | " cur.execute(\"SELECT * FROM music_library;\")\n", 281 | "except psycopg2.Error as e: \n", 282 | " print(\"Error: select *\")\n", 283 | " print (e)\n", 284 | "\n", 285 | "row = cur.fetchone()\n", 286 | "while row:\n", 287 | " print(row)\n", 288 | " row = cur.fetchone()" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "### Drop the table to avoid duplicates and clean up" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "try: \n", 305 | " cur.execute(\"DROP table music_library\")\n", 306 | "except psycopg2.Error as e: \n", 307 | " print(\"Error: Dropping table\")\n", 308 | " print (e)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "### Close the cursor and connection. 
" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "cur.close()\n", 325 | "conn.close()" 326 | ] 327 | } 328 | ], 329 | "metadata": { 330 | "kernelspec": { 331 | "display_name": "Python 3", 332 | "language": "python", 333 | "name": "python3" 334 | }, 335 | "language_info": { 336 | "codemirror_mode": { 337 | "name": "ipython", 338 | "version": 3 339 | }, 340 | "file_extension": ".py", 341 | "mimetype": "text/x-python", 342 | "name": "python", 343 | "nbconvert_exporter": "python", 344 | "pygments_lexer": "ipython3", 345 | "version": "3.7.2" 346 | } 347 | }, 348 | "nbformat": 4, 349 | "nbformat_minor": 2 350 | } -------------------------------------------------------------------------------- /Data wrangling with spark/9_spark_sql_quiz_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Answer Key to the Data Wrangling with Spark SQL Quiz\n", 8 | "\n", 9 | "This quiz uses the same dataset and most of the same questions from the earlier \"Quiz - Data Wrangling with Data Frames Jupyter Notebook.\" For this quiz, however, use Spark SQL instead of Spark Data Frames.\n", 10 | "\n", 11 | "Helpful resources:\n", 12 | "http://spark.apache.org/docs/latest/api/python/pyspark.sql.html" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 4, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "from pyspark.sql import SparkSession\n", 22 | "# from pyspark.sql.functions import isnan, count, when, col, desc, udf, col, sort_array, asc, avg\n", 23 | "# from pyspark.sql.functions import sum as Fsum\n", 24 | "# from pyspark.sql.window import Window\n", 25 | "# from pyspark.sql.types import IntegerType" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 5, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# 1) import any other libraries you might need\n", 35 | "# 2) instantiate a Spark session \n", 36 | "# 3) read in the data set located at the path \"data/sparkify_log_small.json\"\n", 37 | "# 4) create a view to use with your SQL queries\n", 38 | "# 5) write code to answer the quiz questions \n", 39 | "\n", 40 | "spark = SparkSession \\\n", 41 | " .builder \\\n", 42 | " .appName(\"Spark SQL Quiz\") \\\n", 43 | " .getOrCreate()\n", 44 | "\n", 45 | "user_log = spark.read.json(\"data/sparkify_log_small.json\")\n", 46 | "\n", 47 | "user_log.createOrReplaceTempView(\"log_table\")\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "# Question 1\n", 55 | "\n", 56 | "Which page did user id \"\" (empty string) NOT visit?" 
57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 6, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "root\n", 69 | " |-- artist: string (nullable = true)\n", 70 | " |-- auth: string (nullable = true)\n", 71 | " |-- firstName: string (nullable = true)\n", 72 | " |-- gender: string (nullable = true)\n", 73 | " |-- itemInSession: long (nullable = true)\n", 74 | " |-- lastName: string (nullable = true)\n", 75 | " |-- length: double (nullable = true)\n", 76 | " |-- level: string (nullable = true)\n", 77 | " |-- location: string (nullable = true)\n", 78 | " |-- method: string (nullable = true)\n", 79 | " |-- page: string (nullable = true)\n", 80 | " |-- registration: long (nullable = true)\n", 81 | " |-- sessionId: long (nullable = true)\n", 82 | " |-- song: string (nullable = true)\n", 83 | " |-- status: long (nullable = true)\n", 84 | " |-- ts: long (nullable = true)\n", 85 | " |-- userAgent: string (nullable = true)\n", 86 | " |-- userId: string (nullable = true)\n", 87 | "\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "user_log.printSchema()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 7, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "+----+----------------+\n", 105 | "|page| page|\n", 106 | "+----+----------------+\n", 107 | "|null|Submit Downgrade|\n", 108 | "|null| Downgrade|\n", 109 | "|null| Logout|\n", 110 | "|null| Save Settings|\n", 111 | "|null| Settings|\n", 112 | "|null| NextSong|\n", 113 | "|null| Upgrade|\n", 114 | "|null| Error|\n", 115 | "|null| Submit Upgrade|\n", 116 | "+----+----------------+\n", 117 | "\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "# SELECT distinct pages for the blank user and distinc pages for all users\n", 123 | "# Right join the results to find pages that blank visitor did not visit\n", 124 | "spark.sql(\"SELECT * \\\n", 125 | " FROM ( \\\n", 126 | " SELECT DISTINCT page \\\n", 127 | " FROM log_table \\\n", 128 | " WHERE userID='') AS user_pages \\\n", 129 | " RIGHT JOIN ( \\\n", 130 | " SELECT DISTINCT page \\\n", 131 | " FROM log_table) AS all_pages \\\n", 132 | " ON user_pages.page = all_pages.page \\\n", 133 | " WHERE user_pages.page IS NULL\").show()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "# Question 2 - Reflect\n", 141 | "\n", 142 | "Why might you prefer to use SQL over data frames? Why might you prefer data frames over SQL?\n", 143 | "\n", 144 | "Both Spark SQL and Spark Data Frames are part of the Spark SQL library. Hence, they both use the Spark SQL Catalyst Optimizer to optimize queries. \n", 145 | "\n", 146 | "You might prefer SQL over data frames because the syntax is clearer especially for teams already experienced in SQL.\n", 147 | "\n", 148 | "Spark data frames give you more control. You can break down your queries into smaller steps, which can make debugging easier. You can also [cache](https://unraveldata.com/to-cache-or-not-to-cache/) intermediate results or [repartition](https://hackernoon.com/managing-spark-partitions-with-coalesce-and-repartition-4050c57ad5c4) intermediate results." 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "# Question 3\n", 156 | "\n", 157 | "How many female users do we have in the data set?" 
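To make the reflection above concrete, Question 1 can also be answered with the DataFrame API instead of Spark SQL. The sketch below is not part of the original answer key; it assumes the same user_log DataFrame loaded earlier and uses a left anti join to keep only the pages the blank ("") user never visited.

```python
# Sketch: DataFrame-API version of Question 1 (alternative to the SQL solution above).
blank_pages = user_log.filter(user_log.userId == "").select("page").dropDuplicates()
all_pages = user_log.select("page").dropDuplicates()

# A left anti join keeps the rows of all_pages that have no match in blank_pages,
# i.e. the pages the "" user did NOT visit.
all_pages.join(blank_pages, on="page", how="left_anti").show()
```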
158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 8, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "+----------------------+\n", 170 | "|count(DISTINCT userID)|\n", 171 | "+----------------------+\n", 172 | "| 462|\n", 173 | "+----------------------+\n", 174 | "\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "spark.sql(\"SELECT COUNT(DISTINCT userID) \\\n", 180 | " FROM log_table \\\n", 181 | " WHERE gender = 'F'\").show()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "# Question 4\n", 189 | "\n", 190 | "How many songs were played from the most played artist?" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 9, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "+--------+-----+\n", 203 | "| Artist|plays|\n", 204 | "+--------+-----+\n", 205 | "|Coldplay| 83|\n", 206 | "+--------+-----+\n", 207 | "\n", 208 | "+--------+-----+\n", 209 | "| Artist|plays|\n", 210 | "+--------+-----+\n", 211 | "|Coldplay| 83|\n", 212 | "+--------+-----+\n", 213 | "\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "# Here is one solution\n", 219 | "spark.sql(\"SELECT Artist, COUNT(Artist) AS plays \\\n", 220 | " FROM log_table \\\n", 221 | " GROUP BY Artist \\\n", 222 | " ORDER BY plays DESC \\\n", 223 | " LIMIT 1\").show()\n", 224 | "\n", 225 | "# Here is an alternative solution\n", 226 | "# Get the artist play counts\n", 227 | "play_counts = spark.sql(\"SELECT Artist, COUNT(Artist) AS plays \\\n", 228 | " FROM log_table \\\n", 229 | " GROUP BY Artist\")\n", 230 | "\n", 231 | "# save the results in a new view\n", 232 | "play_counts.createOrReplaceTempView(\"artist_counts\")\n", 233 | "\n", 234 | "# use a self join to find where the max play equals the count value\n", 235 | "spark.sql(\"SELECT a2.Artist, a2.plays FROM \\\n", 236 | " (SELECT max(plays) AS max_plays FROM artist_counts) AS a1 \\\n", 237 | " JOIN artist_counts AS a2 \\\n", 238 | " ON a1.max_plays = a2.plays \\\n", 239 | " \").show()" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "# Question 5 (challenge)\n", 247 | "\n", 248 | "How many songs do users listen to on average between visiting our home page? 
Please round your answer to the closest integer.\n", 249 | "\n" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 31, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "+------------------+\n", 262 | "|avg(count(period))|\n", 263 | "+------------------+\n", 264 | "| 6.898347107438017|\n", 265 | "+------------------+\n", 266 | "\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "# SELECT CASE WHEN 1 > 0 THEN 1 WHEN 2 > 0 THEN 2.0 ELSE 1.2 END;\n", 272 | "is_home = spark.sql(\"SELECT userID, page, ts, CASE WHEN page = 'Home' THEN 1 ELSE 0 END AS is_home FROM log_table \\\n", 273 | " WHERE (page = 'NextSong') or (page = 'Home') \\\n", 274 | " \")\n", 275 | "\n", 276 | "# keep the results in a new view\n", 277 | "is_home.createOrReplaceTempView(\"is_home_table\")\n", 278 | "\n", 279 | "# find the cumulative sum over the is_home column\n", 280 | "cumulative_sum = spark.sql(\"SELECT *, SUM(is_home) OVER \\\n", 281 | " (PARTITION BY userID ORDER BY ts DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS period \\\n", 282 | " FROM is_home_table\")\n", 283 | "\n", 284 | "# keep the results in a view\n", 285 | "cumulative_sum.createOrReplaceTempView(\"period_table\")\n", 286 | "\n", 287 | "# find the average count for NextSong\n", 288 | "spark.sql(\"SELECT AVG(count_results) FROM \\\n", 289 | " (SELECT COUNT(*) AS count_results FROM period_table \\\n", 290 | "GROUP BY userID, period, page HAVING page = 'NextSong') AS counts\").show()" 291 | ] 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "Python 3", 297 | "language": "python", 298 | "name": "python3" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 3 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython3", 310 | "version": "3.6.3" 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 2 315 | } 316 | -------------------------------------------------------------------------------- /NoSQL data modeling/Lesson 3 Exercise 4 Using the WHERE Clause.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lesson 3 Demo 4: Using the WHERE Clause\n", 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### In this exercise we are going to walk through the basics of using the WHERE clause in Apache Cassandra.\n", 16 | "\n", 17 | "##### denotes where the code needs to be completed.\n", 18 | "\n", 19 | "Note: __Do not__ click the blue Preview button in the lower task bar" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "#### We will use a python wrapper/ python driver called cassandra to run the Apache Cassandra queries. This library should be preinstalled but in the future to install this library you can run this command in a notebook to install locally: \n", 27 | "! 
pip install cassandra-driver\n", 28 | "#### More documentation can be found here: https://datastax.github.io/python-driver/" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "#### Import Apache Cassandra python package" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import cassandra" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "### First let's create a connection to the database" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from cassandra.cluster import Cluster\n", 61 | "try: \n", 62 | " cluster = Cluster(['127.0.0.1']) #If you have a locally installed Apache Cassandra instance\n", 63 | " session = cluster.connect()\n", 64 | "except Exception as e:\n", 65 | " print(e)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### Let's create a keyspace to do our work in " 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "try:\n", 82 | " session.execute(\"\"\"\n", 83 | " CREATE KEYSPACE IF NOT EXISTS udacity \n", 84 | " WITH REPLICATION = \n", 85 | " { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }\"\"\"\n", 86 | ")\n", 87 | "\n", 88 | "except Exception as e:\n", 89 | " print(e)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "#### Connect to our Keyspace. Compare this to how we had to create a new session in PostgreSQL. " 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "try:\n", 106 | " session.set_keyspace('udacity')\n", 107 | "except Exception as e:\n", 108 | " print(e)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Let's imagine we would like to start creating a new Music Library of albums. \n", 116 | "### We want to ask 4 question of our data\n", 117 | "#### 1. Give me every album in my music library that was released in a 1965 year\n", 118 | "#### 2. Give me the album that is in my music library that was released in 1965 by \"The Beatles\"\n", 119 | "#### 3. Give me all the albums released in a given year that was made in London \n", 120 | "#### 4. Give me the city that the album \"Rubber Soul\" was recorded" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "### Here is our Collection of Data\n", 128 | "" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### How should we model this data? What should be our Primary Key and Partition Key? Since our data is looking for the YEAR let's start with that. From there we will add clustering columns on Artist Name and Album Name." 
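Put differently, the rule this table design follows is: the WHERE clause must supply the partition key (year), may then add the clustering columns in key order (artist_name, then album_name), and cannot filter on a non-key column such as city without ALLOW FILTERING. Below is a small sketch of that rule, assuming the session object from above and the music_library table created in the next cell.

```python
# Sketch only: shows which WHERE clauses PRIMARY KEY (year, artist_name, album_name) supports.
# Assumes `session` is connected to the udacity keyspace and the music_library table exists.
queries = [
    # partition key only -- supported
    "SELECT * FROM music_library WHERE year = 1965",
    # partition key plus clustering columns in key order -- supported
    "SELECT * FROM music_library WHERE year = 1965 AND artist_name = 'The Beatles'",
    # city is not part of the key -- rejected unless ALLOW FILTERING is added
    "SELECT * FROM music_library WHERE year = 1965 AND artist_name = 'The Beatles' AND city = 'London'",
]
for q in queries:
    try:
        print(q, "->", list(session.execute(q)))
    except Exception as e:
        print(q, "->", e)
```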
136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 6, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "query = \"CREATE TABLE IF NOT EXISTS music_library \"\n", 145 | "query = query + \"(year int, artist_name text, album_name text, city text, PRIMARY KEY (year, artist_name, album_name))\"\n", 146 | "try:\n", 147 | " session.execute(query)\n", 148 | "except Exception as e:\n", 149 | " print(e)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "### Let's insert our data into of table" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 7, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "query = \"INSERT INTO music_library (year, artist_name, album_name, city)\"\n", 166 | "query = query + \" VALUES (%s, %s, %s, %s)\"\n", 167 | "\n", 168 | "try:\n", 169 | " session.execute(query, (1970, \"The Beatles\", \"Let it Be\", \"Liverpool\"))\n", 170 | "except Exception as e:\n", 171 | " print(e)\n", 172 | " \n", 173 | "try:\n", 174 | " session.execute(query, (1965, \"The Beatles\", \"Rubber Soul\", \"Oxford\"))\n", 175 | "except Exception as e:\n", 176 | " print(e)\n", 177 | " \n", 178 | "try:\n", 179 | " session.execute(query, (1965, \"The Who\", \"My Generation\", \"London\"))\n", 180 | "except Exception as e:\n", 181 | " print(e)\n", 182 | "\n", 183 | "try:\n", 184 | " session.execute(query, (1966, \"The Monkees\", \"The Monkees\", \"Los Angeles\"))\n", 185 | "except Exception as e:\n", 186 | " print(e)\n", 187 | "\n", 188 | "try:\n", 189 | " session.execute(query, (1970, \"The Carpenters\", \"Close To You\", \"San Diego\"))\n", 190 | "except Exception as e:\n", 191 | " print(e)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "### Let's Validate our Data Model with our 4 queries.\n", 199 | "\n", 200 | "Query 1: " 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 9, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "1965 The Beatles Rubber Soul Oxford\n", 213 | "1965 The Who My Generation London\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "query = \"SELECT * FROM music_library WHERE YEAR=1965\"\n", 219 | "try:\n", 220 | " rows = session.execute(query)\n", 221 | "except Exception as e:\n", 222 | " print(e)\n", 223 | " \n", 224 | "for row in rows:\n", 225 | " print (row.year, row.artist_name, row.album_name, row.city)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | " Let's try the 2nd query.\n", 233 | " Query 2: " 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 10, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "1965 The Beatles Rubber Soul Oxford\n" 246 | ] 247 | } 248 | ], 249 | "source": [ 250 | "query = \"SELECT * FROM music_library WHERE YEAR = 1965 AND ARTIST_NAME = 'The Beatles'\"\n", 251 | "try:\n", 252 | " rows = session.execute(query)\n", 253 | "except Exception as e:\n", 254 | " print(e)\n", 255 | " \n", 256 | "for row in rows:\n", 257 | " print (row.year, row.artist_name, row.album_name, row.city)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "### Let's try the 3rd query.\n", 265 | "Query 3: " 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 
11, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "name": "stdout", 275 | "output_type": "stream", 276 | "text": [ 277 | "Error from server: code=2200 [Invalid query] message=\"Cannot execute this query as it might involve data filtering and thus may have unpredictable performance. If you want to execute this query despite the performance unpredictability, use ALLOW FILTERING\"\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "query = \"SELECT * FROM music_library WHERE YEAR = 1965 AND ARTIST_NAME = 'The Beatles' AND CITY = 'London'\"\n", 283 | "try:\n", 284 | " rows = session.execute(query)\n", 285 | "except Exception as e:\n", 286 | " print(e)\n", 287 | " \n", 288 | "for row in rows:\n", 289 | " print (row.year, row.artist_name, row.album_name, row.city)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "### Did you get an error? You can not try to access a column or a clustering column if you have not used the other defined clustering column. Let's see if we can try it a different way. \n", 297 | "Try Query 4: \n", 298 | "\n" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 12, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "Oxford\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "query = \"SELECT city FROM music_library WHERE YEAR = 1965 AND ARTIST_NAME = 'The Beatles' AND ALBUM_NAME = 'Rubber Soul'\"\n", 316 | "try:\n", 317 | " rows = session.execute(query)\n", 318 | "except Exception as e:\n", 319 | " print(e)\n", 320 | " \n", 321 | "for row in rows:\n", 322 | " print (row.city)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "### And Finally close the session and cluster connection" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "session.shutdown()\n", 339 | "cluster.shutdown()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [] 348 | } 349 | ], 350 | "metadata": { 351 | "kernelspec": { 352 | "display_name": "Python 3", 353 | "language": "python", 354 | "name": "python3" 355 | }, 356 | "language_info": { 357 | "codemirror_mode": { 358 | "name": "ipython", 359 | "version": 3 360 | }, 361 | "file_extension": ".py", 362 | "mimetype": "text/x-python", 363 | "name": "python", 364 | "nbconvert_exporter": "python", 365 | "pygments_lexer": "ipython3", 366 | "version": "3.6.3" 367 | } 368 | }, 369 | "nbformat": 4, 370 | "nbformat_minor": 2 371 | } 372 | -------------------------------------------------------------------------------- /Power of spark/mapreduce_practice.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MapReduce\n", 8 | "\n", 9 | "The MapReduce programming technique was designed to analyze massive data sets across a cluster. In this Jupyter notebook, you'll get a sense for how Hadoop MapReduce works; however, this notebook will run locally rather than on a cluster.\n", 10 | "\n", 11 | "The biggest difference between Hadoop and Spark is that Spark tries to do as many calculations as possible in memory, which avoids moving data back and forth across a cluster. 
Hadoop writes intermediate calculations out to disk, which can be less efficient. Hadoop is an older technology than Spark and one of the cornerstone big data technologies.\n", 12 | "\n", 13 | "If you click on the Jupyter notebook logo at the top of the workspace, you'll be taken to the workspace directory. There you will see a file called \"songplays.txt\". This is a text file where each line represents a song that was played in the Sparkify app. The MapReduce code will count how many times each song was played. In other words, the code counts how many times the song title appears in the list.\n", 14 | "\n", 15 | "\n", 16 | "# MapReduce versus Hadoop MapReduce\n", 17 | "\n", 18 | "Don't get confused by the terminology! MapReduce is a programming technique. Hadoop MapReduce is a specific implementation of the programming technique.\n", 19 | "\n", 20 | "Some of the syntax will look a bit funny, so be sure to read the explanation and comments for each section. You'll learn more about the syntax in later lessons. \n", 21 | "\n", 22 | "Run each of the code cells below to see the output." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "Requirement already satisfied: mrjob in /opt/conda/lib/python3.6/site-packages (0.6.9)\n", 37 | "Requirement already satisfied: botocore>=1.6.0 in /opt/conda/lib/python3.6/site-packages (from mrjob) (1.12.7)\n", 38 | "Requirement already satisfied: google-cloud-storage>=1.13.1 in /opt/conda/lib/python3.6/site-packages (from mrjob) (1.16.1)\n", 39 | "Requirement already satisfied: PyYAML>=3.10 in /opt/conda/lib/python3.6/site-packages (from mrjob) (3.12)\n", 40 | "Requirement already satisfied: google-cloud-logging>=1.9.0 in /opt/conda/lib/python3.6/site-packages (from mrjob) (1.11.0)\n", 41 | "Requirement already satisfied: google-cloud-dataproc>=0.3.0 in /opt/conda/lib/python3.6/site-packages (from mrjob) (0.4.0)\n", 42 | "Requirement already satisfied: boto3>=1.4.6 in /opt/conda/lib/python3.6/site-packages (from mrjob) (1.9.7)\n", 43 | "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /opt/conda/lib/python3.6/site-packages (from botocore>=1.6.0->mrjob) (0.9.3)\n", 44 | "Requirement already satisfied: docutils>=0.10 in /opt/conda/lib/python3.6/site-packages (from botocore>=1.6.0->mrjob) (0.14)\n", 45 | "Requirement already satisfied: urllib3<1.24,>=1.20 in /opt/conda/lib/python3.6/site-packages (from botocore>=1.6.0->mrjob) (1.22)\n", 46 | "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /opt/conda/lib/python3.6/site-packages (from botocore>=1.6.0->mrjob) (2.6.1)\n", 47 | "Requirement already satisfied: google-cloud-core<2.0dev,>=1.0.0 in /opt/conda/lib/python3.6/site-packages (from google-cloud-storage>=1.13.1->mrjob) (1.0.2)\n", 48 | "Requirement already satisfied: google-resumable-media>=0.3.1 in /opt/conda/lib/python3.6/site-packages (from google-cloud-storage>=1.13.1->mrjob) (0.3.2)\n", 49 | "Requirement already satisfied: google-auth>=1.2.0 in /opt/conda/lib/python3.6/site-packages (from google-cloud-storage>=1.13.1->mrjob) (1.6.3)\n", 50 | "Requirement already satisfied: google-api-core[grpc]<2.0.0dev,>=1.6.0 in /opt/conda/lib/python3.6/site-packages (from google-cloud-logging>=1.9.0->mrjob) (1.11.1)\n", 51 | "Requirement already satisfied: s3transfer<0.2.0,>=0.1.10 in /opt/conda/lib/python3.6/site-packages (from boto3>=1.4.6->mrjob) (0.1.13)\n", 52 | 
"Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.6/site-packages (from python-dateutil<3.0.0,>=2.1->botocore>=1.6.0->mrjob) (1.11.0)\n", 53 | "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.6/site-packages (from google-auth>=1.2.0->google-cloud-storage>=1.13.1->mrjob) (0.2.5)\n", 54 | "Requirement already satisfied: cachetools>=2.0.0 in /opt/conda/lib/python3.6/site-packages (from google-auth>=1.2.0->google-cloud-storage>=1.13.1->mrjob) (3.1.1)\n", 55 | "Requirement already satisfied: rsa>=3.1.4 in /opt/conda/lib/python3.6/site-packages (from google-auth>=1.2.0->google-cloud-storage>=1.13.1->mrjob) (3.4.2)\n", 56 | "Requirement already satisfied: protobuf>=3.4.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.6.0->google-cloud-logging>=1.9.0->mrjob) (3.5.1)\n", 57 | "Requirement already satisfied: setuptools>=34.0.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.6.0->google-cloud-logging>=1.9.0->mrjob) (38.4.0)\n", 58 | "Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.6.0->google-cloud-logging>=1.9.0->mrjob) (2.18.4)\n", 59 | "Requirement already satisfied: googleapis-common-protos!=1.5.4,<2.0dev,>=1.5.3 in /opt/conda/lib/python3.6/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.6.0->google-cloud-logging>=1.9.0->mrjob) (1.6.0)\n", 60 | "Requirement already satisfied: pytz in /opt/conda/lib/python3.6/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.6.0->google-cloud-logging>=1.9.0->mrjob) (2017.3)\n", 61 | "Requirement already satisfied: grpcio<2.0dev,>=1.8.2; extra == \"grpc\" in /opt/conda/lib/python3.6/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.6.0->google-cloud-logging>=1.9.0->mrjob) (1.21.1)\n", 62 | "Requirement already satisfied: pyasn1<0.5.0,>=0.4.1 in /opt/conda/lib/python3.6/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.2.0->google-cloud-storage>=1.13.1->mrjob) (0.4.4)\n", 63 | "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<2.0.0dev,>=1.6.0->google-cloud-logging>=1.9.0->mrjob) (3.0.4)\n", 64 | "Requirement already satisfied: idna<2.7,>=2.5 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<2.0.0dev,>=1.6.0->google-cloud-logging>=1.9.0->mrjob) (2.6)\n", 65 | "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<2.0.0dev,>=1.6.0->google-cloud-logging>=1.9.0->mrjob) (2017.11.5)\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "# Install mrjob library. This package is for running MapReduce jobs with Python\n", 71 | "# In Jupyter notebooks, \"!\" runs terminal commands from inside notebooks \n", 72 | "\n", 73 | "! 
pip install mrjob" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "Overwriting wordcount.py\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "%%file wordcount.py\n", 91 | "# %%file is an Ipython magic function that saves the code cell as a file\n", 92 | "\n", 93 | "from mrjob.job import MRJob # import the mrjob library\n", 94 | "\n", 95 | "class MRSongCount(MRJob):\n", 96 | " \n", 97 | " # the map step: each line in the txt file is read as a key, value pair\n", 98 | " # in this case, each line in the txt file only contains a value but no key\n", 99 | " # _ means that in this case, there is no key for each line\n", 100 | " def mapper(self, _, song):\n", 101 | " # output each line as a tuple of (song_names, 1) \n", 102 | " yield (song, 1)\n", 103 | "\n", 104 | " # the reduce step: combine all tuples with the same key\n", 105 | " # in this case, the key is the song name\n", 106 | " # then sum all the values of the tuple, which will give the total song plays\n", 107 | " def reducer(self, key, values):\n", 108 | " yield (key, sum(values))\n", 109 | " \n", 110 | "if __name__ == \"__main__\":\n", 111 | " MRSongCount.run()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 4, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "No configs found; falling back on auto-configuration\n", 124 | "No configs specified for inline runner\n", 125 | "Creating temp directory /tmp/wordcount.root.20190614.193733.820499\n", 126 | "Running step 1 of 1...\n", 127 | "job output is in /tmp/wordcount.root.20190614.193733.820499/output\n", 128 | "Streaming final output from /tmp/wordcount.root.20190614.193733.820499/output...\n", 129 | "\"Deep Dreams\"\t1131\n", 130 | "\"Broken Networks\"\t510\n", 131 | "\"Data House Rock\"\t828\n", 132 | "Removing temp directory /tmp/wordcount.root.20190614.193733.820499...\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "# run the code as a terminal command\n", 138 | "! python wordcount.py songplays.txt" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# Summary of what happens in the code.\n", 146 | "\n", 147 | "There is a list of songs in songplays.txt that looks like the following:\n", 148 | "\n", 149 | "Deep Dreams\n", 150 | "Data House Rock\n", 151 | "Deep Dreams\n", 152 | "Data House Rock\n", 153 | "Broken Networks\n", 154 | "Data House Rock\n", 155 | "etc.....\n", 156 | "\n", 157 | "During the map step, the code reads in the txt file one line at a time. The map steps outputs a set of tuples that look like this:\n", 158 | "\n", 159 | "(Deep Dreams, 1) \n", 160 | "(Data House Rock, 1) \n", 161 | "(Deep Dreams, 1) \n", 162 | "(Data House Rock, 1) \n", 163 | "(Broken Networks, 1) \n", 164 | "(Data House Rock, 1) \n", 165 | "etc.....\n", 166 | "\n", 167 | "Finally, the reduce step combines all of the values by keys and sums the values: \n", 168 | "\n", 169 | "(Deep Dreams, \\[1, 1, 1, 1, 1, 1, ... 
\\]) \n", 170 | "(Data House Rock, \\[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\\]) \n", 171 | "(Broken Networks, \\[1, 1, 1, ...\\] \n", 172 | "\n", 173 | "With the output \n", 174 | "\n", 175 | "(Deep Dreams, 1131) \n", 176 | "(Data House Rock, 510) \n", 177 | "(Broken Networks, 828) " 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 3", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.6.3" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 2 209 | } 210 | -------------------------------------------------------------------------------- /NoSQL data modeling/Lesson 3 Exercise 2 Primary Key.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lesson 3 Exercise 2: Focus on Primary Key\n", 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Walk through the basics of creating a table with a good Primary Key in Apache Cassandra, inserting rows of data, and doing a simple CQL query to validate the information. \n", 16 | "\n", 17 | "### Replace ##### with your own answers. \n", 18 | "\n", 19 | "Note: __Do not__ click the blue Preview button in the lower task bar" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "#### We will use a python wrapper/ python driver called cassandra to run the Apache Cassandra queries. This library should be preinstalled but in the future to install this library you can run this command in a notebook to install locally: \n", 27 | "! 
pip install cassandra-driver\n", 28 | "#### More documentation can be found here: https://datastax.github.io/python-driver/" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "#### Import Apache Cassandra python package" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 51, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import cassandra" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "### Create a connection to the database" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 52, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from cassandra.cluster import Cluster\n", 61 | "try: \n", 62 | " cluster = Cluster(['127.0.0.1']) #If you have a locally installed Apache Cassandra instance\n", 63 | " session = cluster.connect()\n", 64 | "except Exception as e:\n", 65 | " print(e)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### Create a keyspace to work in " 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 53, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "try:\n", 82 | " session.execute(\"\"\"\n", 83 | " CREATE KEYSPACE IF NOT EXISTS udacity \n", 84 | " WITH REPLICATION = \n", 85 | " { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }\"\"\"\n", 86 | ")\n", 87 | "\n", 88 | "except Exception as e:\n", 89 | " print(e)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "#### Connect to the Keyspace. Compare this to how we had to create a new session in PostgreSQL. " 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 54, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "try:\n", 106 | " session.set_keyspace('udacity')\n", 107 | "except Exception as e:\n", 108 | " print(e)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Imagine you need to create a new Music Library of albums \n", 116 | "\n", 117 | "### Here is the information asked of the data:\n", 118 | "#### 1. 
Give every album in the music library that was created by a given artist\n", 119 | "`select * from music_library WHERE artist_name=\"The Beatles\"`\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### Here is the collection of data\n", 127 | "" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 55, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "query = \"drop table music_library\"\n", 137 | "try:\n", 138 | " rows = session.execute(query)\n", 139 | "except Exception as e:\n", 140 | " print(e)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "#### Practice by making the PRIMARY KEY only 1 Column (not 2 or more)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 56, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "query = \"CREATE TABLE IF NOT EXISTS music_library \"\n", 157 | "query = query + \"(year int, city text, artist_name text, album_name text, PRIMARY KEY (year))\"\n", 158 | "try:\n", 159 | " session.execute(query)\n", 160 | "except Exception as e:\n", 161 | " print(e)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "### Let's insert the data into the table" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 57, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "query = \"INSERT INTO music_library (year, artist_name, album_name, city)\"\n", 178 | "query = query + \" VALUES (%s, %s, %s, %s)\"\n", 179 | "\n", 180 | "try:\n", 181 | " session.execute(query, (1970, \"The Beatles\", \"Let it Be\", \"Liverpool\"))\n", 182 | "except Exception as e:\n", 183 | " print(e)\n", 184 | " \n", 185 | "try:\n", 186 | " session.execute(query, (1965, \"The Beatles\", \"Rubber Soul\", \"Oxford\"))\n", 187 | "except Exception as e:\n", 188 | " print(e)\n", 189 | " \n", 190 | "try:\n", 191 | " session.execute(query, (1965, \"The Who\", \"My Generation\", \"London\"))\n", 192 | "except Exception as e:\n", 193 | " print(e)\n", 194 | "\n", 195 | "try:\n", 196 | " session.execute(query, (1966, \"The Monkees\", \"The Monkees\", \"Los Angeles\"))\n", 197 | "except Exception as e:\n", 198 | " print(e)\n", 199 | "\n", 200 | "try:\n", 201 | " session.execute(query, (1970, \"The Carpenters\", \"Close To You\", \"San Diego\"))\n", 202 | "except Exception as e:\n", 203 | " print(e)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### Validate the Data Model -- Does it give you two rows?" 
211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 58, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "1965 The Who My Generation London\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "query = \"select * from music_library WHERE YEAR=1965\"\n", 228 | "try:\n", 229 | " rows = session.execute(query)\n", 230 | "except Exception as e:\n", 231 | " print(e)\n", 232 | " \n", 233 | "for row in rows:\n", 234 | " print (row.year, row.artist_name, row.album_name, row.city)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 71, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "Error from server: code=2200 [Invalid query] message=\"unconfigured table music_library\"\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "query = \"drop table music_library\"\n", 252 | "try:\n", 253 | " rows = session.execute(query)\n", 254 | "except Exception as e:\n", 255 | " print(e)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "### If you used just one column as your PRIMARY KEY, your output should be:\n", 263 | "1965 The Beatles Rubber Soul Oxford\n", 264 | "\n", 265 | "\n", 266 | "### That didn't work out as planned! Why is that? Did you create a unique primary key?" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "### Try again - Create a new table with a composite key this time" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 72, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "query = \"CREATE TABLE IF NOT EXISTS music_library \"\n", 283 | "query = query + \"(year int, artist_name text, album_name text, city text, PRIMARY KEY(artist_name, year))\"\n", 284 | "try:\n", 285 | " session.execute(query)\n", 286 | "except Exception as e:\n", 287 | " print(e)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 73, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "## You can opt to change the sequence of columns to match your composite key. \\ \n", 297 | "## Make sure to match the values in the INSERT statement\n", 298 | "\n", 299 | "query = \"INSERT INTO music_library (year, artist_name, album_name, city)\"\n", 300 | "query = query + \" VALUES (%s, %s, %s, %s)\"\n", 301 | "\n", 302 | "try:\n", 303 | " session.execute(query, (1970, \"The Beatles\", \"Let it Be\", \"Liverpool\"))\n", 304 | "except Exception as e:\n", 305 | " print(e)\n", 306 | " \n", 307 | "try:\n", 308 | " session.execute(query, (1965, \"The Beatles\", \"Rubber Soul\", \"Oxford\"))\n", 309 | "except Exception as e:\n", 310 | " print(e)\n", 311 | " \n", 312 | "try:\n", 313 | " session.execute(query, (1965, \"The Who\", \"My Generation\", \"London\"))\n", 314 | "except Exception as e:\n", 315 | " print(e)\n", 316 | "\n", 317 | "try:\n", 318 | " session.execute(query, (1966, \"The Monkees\", \"The Monkees\", \"Los Angeles\"))\n", 319 | "except Exception as e:\n", 320 | " print(e)\n", 321 | "\n", 322 | "try:\n", 323 | " session.execute(query, (1970, \"The Carpenters\", \"Close To You\", \"San Diego\"))\n", 324 | "except Exception as e:\n", 325 | " print(e)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "### Validate the Data Model -- Did it work?" 
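Before running the check below, a short note on why the first attempt returned only one row per year: in Cassandra an INSERT with the same primary key is an upsert, so with PRIMARY KEY (year) the later 1965 and 1970 inserts silently overwrote the earlier ones. With the composite key (artist_name, year) each album now has a distinct key, so both Beatles rows should survive. A minimal sketch of the upsert behaviour, assuming the open session in the udacity keyspace; upsert_demo is a throwaway table name, not part of the exercise.

```python
# Sketch only: Cassandra INSERTs are upserts, so a non-unique key loses rows.
# `upsert_demo` is a hypothetical scratch table, not part of the exercise.
session.execute("CREATE TABLE IF NOT EXISTS upsert_demo (year int, album_name text, PRIMARY KEY (year))")
session.execute("INSERT INTO upsert_demo (year, album_name) VALUES (1965, 'Rubber Soul')")
session.execute("INSERT INTO upsert_demo (year, album_name) VALUES (1965, 'My Generation')")  # overwrites the row above

print(list(session.execute("SELECT * FROM upsert_demo WHERE year = 1965")))  # one row, album_name = 'My Generation'

session.execute("DROP TABLE upsert_demo")
```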
333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 74, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "name": "stdout", 342 | "output_type": "stream", 343 | "text": [ 344 | "1965 The Beatles Rubber Soul Oxford\n", 345 | "1970 The Beatles Let it Be Liverpool\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "query = \"SELECT * FROM music_library WHERE artist_name='The Beatles'\"\n", 351 | "try:\n", 352 | " rows = session.execute(query)\n", 353 | "except Exception as e:\n", 354 | " print(e)\n", 355 | " \n", 356 | "for row in rows:\n", 357 | " print (row.year, row.artist_name, row.album_name, row.city)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "### Your output should be:\n", 365 | "1970 The Beatles Let it Be Liverpool
    \n", 366 | "1965 The Beatles Rubber Soul Oxford" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "### Drop the tables" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 75, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "query = \"drop table music_library\"\n", 383 | "try:\n", 384 | " rows = session.execute(query)\n", 385 | "except Exception as e:\n", 386 | " print(e)\n", 387 | "\n", 388 | "# query = \"#####\"\n", 389 | "# try:\n", 390 | "# rows = session.execute(query)\n", 391 | "# except Exception as e:\n", 392 | "# print(e)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "### Close the session and cluster connection" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "session.shutdown()\n", 409 | "cluster.shutdown()" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [] 418 | } 419 | ], 420 | "metadata": { 421 | "kernelspec": { 422 | "display_name": "Python 3", 423 | "language": "python", 424 | "name": "python3" 425 | }, 426 | "language_info": { 427 | "codemirror_mode": { 428 | "name": "ipython", 429 | "version": 3 430 | }, 431 | "file_extension": ".py", 432 | "mimetype": "text/x-python", 433 | "name": "python", 434 | "nbconvert_exporter": "python", 435 | "pygments_lexer": "ipython3", 436 | "version": "3.6.3" 437 | } 438 | }, 439 | "nbformat": 4, 440 | "nbformat_minor": 2 441 | } 442 | -------------------------------------------------------------------------------- /Intro to Data Lake/Exercise 3 - Data Lake on S3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 3 - Data Lake on S3" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from pyspark.sql import SparkSession\n", 17 | "import os\n", 18 | "import configparser" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "# Make sure that your AWS credentials are loaded as env vars" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "config = configparser.ConfigParser()\n", 35 | "\n", 36 | "#Normally this file should be in ~/.aws/credentials\n", 37 | "config.read_file(open('aws/credentials.cfg'))\n", 38 | "\n", 39 | "os.environ[\"AWS_ACCESS_KEY_ID\"]= config['AWS']['AWS_ACCESS_KEY_ID']\n", 40 | "os.environ[\"AWS_SECRET_ACCESS_KEY\"]= config['AWS']['AWS_SECRET_ACCESS_KEY']" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "# Create spark session with hadoop-aws package" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "spark = SparkSession.builder\\\n", 57 | " .config(\"spark.jars.packages\",\"org.apache.hadoop:hadoop-aws:2.7.0\")\\\n", 58 | " .getOrCreate()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "# Load data from S3" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 
| "df = spark.read.csv(\"s3a://udacity-dend/pagila/payment/payment.csv\")" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 7, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "root\n", 87 | " |-- _c0: string (nullable = true)\n", 88 | "\n", 89 | "+--------------------+\n", 90 | "| _c0|\n", 91 | "+--------------------+\n", 92 | "|payment_id;custom...|\n", 93 | "|16050;269;2;7;1.9...|\n", 94 | "|16051;269;1;98;0....|\n", 95 | "|16052;269;2;678;6...|\n", 96 | "|16053;269;2;703;0...|\n", 97 | "+--------------------+\n", 98 | "only showing top 5 rows\n", 99 | "\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "df.printSchema()\n", 105 | "df.show(5)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "# Infer schema, fix header and separator" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "df = spark.read.csv(\"s3a://udacity-dend/pagila/payment/payment.csv\",sep=\";\", inferSchema=True, header=True)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 11, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "root\n", 134 | " |-- payment_id: integer (nullable = true)\n", 135 | " |-- customer_id: integer (nullable = true)\n", 136 | " |-- staff_id: integer (nullable = true)\n", 137 | " |-- rental_id: integer (nullable = true)\n", 138 | " |-- amount: double (nullable = true)\n", 139 | " |-- payment_date: string (nullable = true)\n", 140 | "\n", 141 | "+----------+-----------+--------+---------+------+--------------------+\n", 142 | "|payment_id|customer_id|staff_id|rental_id|amount| payment_date|\n", 143 | "+----------+-----------+--------+---------+------+--------------------+\n", 144 | "| 16050| 269| 2| 7| 1.99|2017-01-24 21:40:...|\n", 145 | "| 16051| 269| 1| 98| 0.99|2017-01-25 15:16:...|\n", 146 | "| 16052| 269| 2| 678| 6.99|2017-01-28 21:44:...|\n", 147 | "| 16053| 269| 2| 703| 0.99|2017-01-29 00:58:...|\n", 148 | "| 16054| 269| 1| 750| 4.99|2017-01-29 08:10:...|\n", 149 | "+----------+-----------+--------+---------+------+--------------------+\n", 150 | "only showing top 5 rows\n", 151 | "\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "df.printSchema()\n", 157 | "df.show(5)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "# Fix the data yourself " 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 23, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "root\n", 177 | " |-- payment_id: integer (nullable = true)\n", 178 | " |-- customer_id: integer (nullable = true)\n", 179 | " |-- staff_id: integer (nullable = true)\n", 180 | " |-- rental_id: integer (nullable = true)\n", 181 | " |-- amount: double (nullable = true)\n", 182 | " |-- payment_date: timestamp (nullable = true)\n", 183 | "\n", 184 | "+----------+-----------+--------+---------+------+--------------------+\n", 185 | "|payment_id|customer_id|staff_id|rental_id|amount| payment_date|\n", 186 | "+----------+-----------+--------+---------+------+--------------------+\n", 187 | "| 16050| 269| 2| 7| 1.99|2017-01-24 23:40:...|\n", 188 | "| 16051| 269| 1| 98| 0.99|2017-01-25 17:16:...|\n", 189 | "| 16052| 269| 2| 678| 6.99|2017-01-28 
23:44:...|\n", 190 | "| 16053| 269| 2| 703| 0.99|2017-01-29 02:58:...|\n", 191 | "| 16054| 269| 1| 750| 4.99|2017-01-29 10:10:...|\n", 192 | "+----------+-----------+--------+---------+------+--------------------+\n", 193 | "only showing top 5 rows\n", 194 | "\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "import pyspark.sql.functions as F\n", 200 | "dfPayment = df.withColumn(\"payment_date\", F.to_timestamp(\"payment_date\"))\n", 201 | "dfPayment.printSchema()\n", 202 | "dfPayment.show(5)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "# Extract the month" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 24, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "+----------+-----------+--------+---------+------+--------------------+-----+\n", 222 | "|payment_id|customer_id|staff_id|rental_id|amount| payment_date|month|\n", 223 | "+----------+-----------+--------+---------+------+--------------------+-----+\n", 224 | "| 16050| 269| 2| 7| 1.99|2017-01-24 23:40:...| 1|\n", 225 | "| 16051| 269| 1| 98| 0.99|2017-01-25 17:16:...| 1|\n", 226 | "| 16052| 269| 2| 678| 6.99|2017-01-28 23:44:...| 1|\n", 227 | "| 16053| 269| 2| 703| 0.99|2017-01-29 02:58:...| 1|\n", 228 | "| 16054| 269| 1| 750| 4.99|2017-01-29 10:10:...| 1|\n", 229 | "+----------+-----------+--------+---------+------+--------------------+-----+\n", 230 | "only showing top 5 rows\n", 231 | "\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "dfPayment = dfPayment.withColumn(\"month\", F.month(\"payment_date\"))\n", 237 | "dfPayment.show(5)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "# Computer aggregate revenue per month" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 27, 250 | "metadata": { 251 | "scrolled": true 252 | }, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "+-----+------------------+\n", 259 | "|month| revenue|\n", 260 | "+-----+------------------+\n", 261 | "| 4|28094.520000003773|\n", 262 | "| 3|23886.560000002115|\n", 263 | "| 2| 9631.879999999608|\n", 264 | "| 1| 4824.429999999856|\n", 265 | "| 5| 979.1200000000023|\n", 266 | "+-----+------------------+\n", 267 | "\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "dfPayment.createOrReplaceTempView(\"payment\")\n", 273 | "spark.sql(\"\"\"\n", 274 | " SELECT month, sum(amount) as revenue\n", 275 | " FROM payment\n", 276 | " GROUP by month\n", 277 | " order by revenue desc\n", 278 | "\"\"\").show()" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "# Fix the schema" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 34, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "from pyspark.sql.types import StructType as R, StructField as Fld, DoubleType as Dbl, StringType as Str, IntegerType as Int, DateType as Date\n", 295 | "paymentSchema = R([\n", 296 | " Fld(\"payment_id\",Int()),\n", 297 | " Fld(\"customer_id\",Int()),\n", 298 | " Fld(\"staff_id\",Int()),\n", 299 | " Fld(\"rental_id\",Int()),\n", 300 | " Fld(\"amount\",Dbl()),\n", 301 | " Fld(\"payment_date\",Date()),\n", 302 | "])" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 35, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "dfPaymentWithSchema = 
spark.read.csv(\"s3a://udacity-dend/pagila/payment/payment.csv\",sep=\";\", schema=paymentSchema, header=True)\n" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 37, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "name": "stdout", 321 | "output_type": "stream", 322 | "text": [ 323 | "root\n", 324 | " |-- payment_id: integer (nullable = true)\n", 325 | " |-- customer_id: integer (nullable = true)\n", 326 | " |-- staff_id: integer (nullable = true)\n", 327 | " |-- rental_id: integer (nullable = true)\n", 328 | " |-- amount: double (nullable = true)\n", 329 | " |-- payment_date: date (nullable = true)\n", 330 | "\n", 331 | "+----------+-----------+--------+---------+------+--------------------+\n", 332 | "|payment_id|customer_id|staff_id|rental_id|amount| payment_date|\n", 333 | "+----------+-----------+--------+---------+------+--------------------+\n", 334 | "| 16050| 269| 2| 7| 1.99|2017-01-24 21:40:...|\n", 335 | "| 16051| 269| 1| 98| 0.99|2017-01-25 15:16:...|\n", 336 | "| 16052| 269| 2| 678| 6.99|2017-01-28 21:44:...|\n", 337 | "| 16053| 269| 2| 703| 0.99|2017-01-29 00:58:...|\n", 338 | "| 16054| 269| 1| 750| 4.99|2017-01-29 08:10:...|\n", 339 | "+----------+-----------+--------+---------+------+--------------------+\n", 340 | "only showing top 5 rows\n", 341 | "\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "dfPaymentWithSchema.printSchema()\n", 347 | "df.show(5)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 39, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "+---+------------------+\n", 360 | "| m| revenue|\n", 361 | "+---+------------------+\n", 362 | "| 4|28559.460000003943|\n", 363 | "| 3|23886.560000002115|\n", 364 | "| 2| 9631.879999999608|\n", 365 | "| 1| 4824.429999999856|\n", 366 | "| 5| 514.180000000001|\n", 367 | "+---+------------------+\n", 368 | "\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "dfPaymentWithSchema.createOrReplaceTempView(\"payment\")\n", 374 | "spark.sql(\"\"\"\n", 375 | " SELECT month(payment_date) as m, sum(amount) as revenue\n", 376 | " FROM payment\n", 377 | " GROUP by m\n", 378 | " order by revenue desc\n", 379 | "\"\"\").show()" 380 | ] 381 | } 382 | ], 383 | "metadata": { 384 | "kernelspec": { 385 | "display_name": "Python 3", 386 | "language": "python", 387 | "name": "python3" 388 | }, 389 | "language_info": { 390 | "codemirror_mode": { 391 | "name": "ipython", 392 | "version": 3 393 | }, 394 | "file_extension": ".py", 395 | "mimetype": "text/x-python", 396 | "name": "python", 397 | "nbconvert_exporter": "python", 398 | "pygments_lexer": "ipython3", 399 | "version": "3.6.3" 400 | }, 401 | "toc": { 402 | "base_numbering": 1, 403 | "nav_menu": {}, 404 | "number_sections": false, 405 | "sideBar": true, 406 | "skip_h1_title": false, 407 | "title_cell": "Table of Contents", 408 | "title_sidebar": "Contents", 409 | "toc_cell": false, 410 | "toc_position": {}, 411 | "toc_section_display": true, 412 | "toc_window_display": false 413 | } 414 | }, 415 | "nbformat": 4, 416 | "nbformat_minor": 2 417 | } 418 | -------------------------------------------------------------------------------- /project1/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext sql" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | 
"execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "'Connected: student@sparkifydb'" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "%sql postgresql://student:student@127.0.0.1/sparkifydb" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 42 | "5 rows affected.\n" 43 | ] 44 | }, 45 | { 46 | "data": { 47 | "text/html": [ 48 | "\n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | "
    | songplay_id | start_time | user_id | level | artist_id | song_id | session_id | location | user_agent |
    |---|---|---|---|---|---|---|---|---|
    | 1 | 2018-11-09 00:06:17.796000 | 42 | paid | None | None | 275 | New York-Newark-Jersey City, NY-NJ-PA | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36" |
    | 2 | 2018-11-09 00:09:46.796000 | 42 | paid | None | None | 275 | New York-Newark-Jersey City, NY-NJ-PA | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36" |
    | 3 | 2018-11-09 00:12:27.796000 | 42 | paid | None | None | 275 | New York-Newark-Jersey City, NY-NJ-PA | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36" |
    | 4 | 2018-11-09 00:14:52.796000 | 42 | paid | None | None | 275 | New York-Newark-Jersey City, NY-NJ-PA | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36" |
    | 5 | 2018-11-09 00:17:44.796000 | 42 | paid | None | None | 275 | New York-Newark-Jersey City, NY-NJ-PA | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36" |
    " 116 | ], 117 | "text/plain": [ 118 | "[(1, datetime.datetime(2018, 11, 9, 0, 6, 17, 796000), 42, 'paid', None, None, 275, 'New York-Newark-Jersey City, NY-NJ-PA', '\"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36\"'),\n", 119 | " (2, datetime.datetime(2018, 11, 9, 0, 9, 46, 796000), 42, 'paid', None, None, 275, 'New York-Newark-Jersey City, NY-NJ-PA', '\"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36\"'),\n", 120 | " (3, datetime.datetime(2018, 11, 9, 0, 12, 27, 796000), 42, 'paid', None, None, 275, 'New York-Newark-Jersey City, NY-NJ-PA', '\"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36\"'),\n", 121 | " (4, datetime.datetime(2018, 11, 9, 0, 14, 52, 796000), 42, 'paid', None, None, 275, 'New York-Newark-Jersey City, NY-NJ-PA', '\"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36\"'),\n", 122 | " (5, datetime.datetime(2018, 11, 9, 0, 17, 44, 796000), 42, 'paid', None, None, 275, 'New York-Newark-Jersey City, NY-NJ-PA', '\"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36\"')]" 123 | ] 124 | }, 125 | "execution_count": 3, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "%sql SELECT * FROM songplays LIMIT 5;" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 4, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 144 | "5 rows affected.\n" 145 | ] 146 | }, 147 | { 148 | "data": { 149 | "text/html": [ 150 | "\n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | "
    | user_id | first_name | last_name | gender | level |
    |---|---|---|---|---|
    | 88 | Mohammad | Rodriguez | M | free |
    | 100 | Adler | Barrera | M | free |
    | 25 | Jayden | Graves | M | paid |
    | 50 | Ava | Robinson | F | free |
    | 42 | Harper | Barrett | M | paid |
    " 194 | ], 195 | "text/plain": [ 196 | "[(88, 'Mohammad', 'Rodriguez', 'M', 'free'),\n", 197 | " (100, 'Adler', 'Barrera', 'M', 'free'),\n", 198 | " (25, 'Jayden', 'Graves', 'M', 'paid'),\n", 199 | " (50, 'Ava', 'Robinson', 'F', 'free'),\n", 200 | " (42, 'Harper', 'Barrett', 'M', 'paid')]" 201 | ] 202 | }, 203 | "execution_count": 4, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "%sql SELECT * FROM users LIMIT 5;" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 5, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 222 | "1 rows affected.\n" 223 | ] 224 | }, 225 | { 226 | "data": { 227 | "text/html": [ 228 | "\n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | "
    | song_id | title | artist_id | year | duration |
    |---|---|---|---|---|
    | SONWXQJ12A8C134D94 | The Ballad Of Sleeping Beauty | ARNF6401187FB57032 | 1994 | 305.162 |
    " 244 | ], 245 | "text/plain": [ 246 | "[('SONWXQJ12A8C134D94', 'The Ballad Of Sleeping Beauty', 'ARNF6401187FB57032', 1994, 305.162)]" 247 | ] 248 | }, 249 | "execution_count": 5, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "%sql SELECT * FROM songs LIMIT 5;" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 6, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 268 | "1 rows affected.\n" 269 | ] 270 | }, 271 | { 272 | "data": { 273 | "text/html": [ 274 | "\n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | "
    | artist_id | name | location | lattitude | longitude |
    |---|---|---|---|---|
    | ARNF6401187FB57032 | Sophie B. Hawkins | New York, NY [Manhattan] | 40.79086 | -73.96644 |
    " 290 | ], 291 | "text/plain": [ 292 | "[('ARNF6401187FB57032', 'Sophie B. Hawkins', 'New York, NY [Manhattan]', Decimal('40.79086'), Decimal('-73.96644'))]" 293 | ] 294 | }, 295 | "execution_count": 6, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "%sql SELECT * FROM artists LIMIT 5;" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 7, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "name": "stdout", 311 | "output_type": "stream", 312 | "text": [ 313 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 314 | "5 rows affected.\n" 315 | ] 316 | }, 317 | { 318 | "data": { 319 | "text/html": [ 320 | "\n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | "
    | start_time | hour | day | week | month | year | weekday |
    |---|---|---|---|---|---|---|
    | 2018-11-09 00:06:17.796000 | 0 | 9 | 45 | 11 | 2018 | Friday |
    | 2018-11-09 00:09:46.796000 | 0 | 9 | 45 | 11 | 2018 | Friday |
    | 2018-11-09 00:12:27.796000 | 0 | 9 | 45 | 11 | 2018 | Friday |
    | 2018-11-09 00:14:52.796000 | 0 | 9 | 45 | 11 | 2018 | Friday |
    | 2018-11-09 00:17:44.796000 | 0 | 9 | 45 | 11 | 2018 | Friday |
    " 376 | ], 377 | "text/plain": [ 378 | "[(datetime.datetime(2018, 11, 9, 0, 6, 17, 796000), 0, 9, 45, 11, 2018, 'Friday'),\n", 379 | " (datetime.datetime(2018, 11, 9, 0, 9, 46, 796000), 0, 9, 45, 11, 2018, 'Friday'),\n", 380 | " (datetime.datetime(2018, 11, 9, 0, 12, 27, 796000), 0, 9, 45, 11, 2018, 'Friday'),\n", 381 | " (datetime.datetime(2018, 11, 9, 0, 14, 52, 796000), 0, 9, 45, 11, 2018, 'Friday'),\n", 382 | " (datetime.datetime(2018, 11, 9, 0, 17, 44, 796000), 0, 9, 45, 11, 2018, 'Friday')]" 383 | ] 384 | }, 385 | "execution_count": 7, 386 | "metadata": {}, 387 | "output_type": "execute_result" 388 | } 389 | ], 390 | "source": [ 391 | "%sql SELECT * FROM time LIMIT 5;" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "## REMEMBER: Restart this notebook to close connection to `sparkifydb`\n", 399 | "Each time you run the cells above, remember to restart this notebook to close the connection to your database. Otherwise, you won't be able to run your code in `create_tables.py`, `etl.py`, or `etl.ipynb` files since you can't make multiple connections to the same database (in this case, sparkifydb)." 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 8, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "# %sql SELECT * FROM songs LEFT JOIN artists ON songs.artist_id=artists.artist_id" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [] 424 | } 425 | ], 426 | "metadata": { 427 | "kernelspec": { 428 | "display_name": "Python 3", 429 | "language": "python", 430 | "name": "python3" 431 | }, 432 | "language_info": { 433 | "codemirror_mode": { 434 | "name": "ipython", 435 | "version": 3 436 | }, 437 | "file_extension": ".py", 438 | "mimetype": "text/x-python", 439 | "name": "python", 440 | "nbconvert_exporter": "python", 441 | "pygments_lexer": "ipython3", 442 | "version": "3.6.3" 443 | } 444 | }, 445 | "nbformat": 4, 446 | "nbformat_minor": 2 447 | } 448 | -------------------------------------------------------------------------------- /NoSQL data modeling/Lesson 3 Exercise 1 Three Queries Three Tables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lesson 3 Exercise 1: Three Queries Three Tables\n", 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Walk through the basics of creating a table in Apache Cassandra, inserting rows of data, and doing a simple CQL query to validate the information. You will practice Denormalization, and the concept of 1 table per query, which is an encouraged practice with Apache Cassandra. \n", 16 | "\n", 17 | "### Remember, replace ##### with your answer.\n", 18 | "\n", 19 | "\n", 20 | "Note: __Do not__ click the blue Preview button at the bottom" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "#### We will use a python wrapper/ python driver called cassandra to run the Apache Cassandra queries. This library should be preinstalled but in the future to install this library you can run this command in a notebook to install locally: \n", 28 | "! 
pip install cassandra-driver\n", 29 | "#### More documentation can be found here: https://datastax.github.io/python-driver/" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "#### Import Apache Cassandra python package" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 1, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "import cassandra" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Create a connection to the database" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "from cassandra.cluster import Cluster\n", 62 | "try: \n", 63 | " cluster = Cluster(['127.0.0.1']) #If you have a locally installed Apache Cassandra instance\n", 64 | " session = cluster.connect()\n", 65 | "except Exception as e:\n", 66 | " print(e)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### Create a keyspace to work in" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "try:\n", 83 | " session.execute(\"\"\"\n", 84 | " CREATE KEYSPACE IF NOT EXISTS udacity \n", 85 | " WITH REPLICATION = \n", 86 | " { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }\"\"\"\n", 87 | ")\n", 88 | "\n", 89 | "except Exception as e:\n", 90 | " print(e)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "#### Connect to our Keyspace. Compare this to how we had to create a new session in PostgreSQL. " 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "try:\n", 107 | " session.set_keyspace('udacity')\n", 108 | "except Exception as e:\n", 109 | " print(e)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "### Let's imagine we would like to start creating a Music Library of albums. \n", 117 | "\n", 118 | "### We want to ask 3 questions of the data\n", 119 | "#### 1. Give every album in the music library that was released in a given year\n", 120 | "`select * from music_library WHERE YEAR=1970`\n", 121 | "#### 2. Give every album in the music library that was created by a given artist \n", 122 | "`select * from artist_library WHERE artist_name=\"The Beatles\"`\n", 123 | "#### 3. Give all the information from the music library about a given album\n", 124 | "`select * from album_library WHERE album_name=\"Close To You\"`\n" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "### Because we want to do three different queries, we will need different tables that partition the data differently. \n", 132 | "\n", 133 | "\n", 134 | "" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "### TO-DO: Create the tables. 
" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 5, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "query = \"CREATE TABLE IF NOT EXISTS music_library\"\n", 151 | "query = query + \"(year int, artist_name text, album_name text, PRIMARY KEY(year, album_name))\"\n", 152 | "try:\n", 153 | " session.execute(query)\n", 154 | "except Exception as e:\n", 155 | " print(e)\n", 156 | " " 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 6, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "query1 = \"CREATE TABLE IF NOT EXISTS artist_library\"\n", 166 | "query1 = query1 + \"(artist_name text, year int, album_name text, PRIMARY KEY(artist_name,year))\"\n", 167 | "try:\n", 168 | " session.execute(query1)\n", 169 | "except Exception as e:\n", 170 | " print(e)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 7, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "query2 = \"CREATE TABLE IF NOT EXISTS album_library\"\n", 180 | "query2 = query2 + \"(album_name text, artist_name text, year int, PRIMARY KEY(album_name, year))\"\n", 181 | "try:\n", 182 | " session.execute(query2)\n", 183 | "except Exception as e:\n", 184 | " print(e)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "### TO-DO: Insert data into the tables" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 8, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "query = \"INSERT INTO music_library (year, artist_name, album_name)\"\n", 201 | "query = query + \" VALUES (%s, %s, %s)\"\n", 202 | "\n", 203 | "query1 = \"INSERT INTO artist_library (artist_name, year, album_name)\"\n", 204 | "query1 = query1 + \" VALUES (%s, %s, %s)\"\n", 205 | "\n", 206 | "query2 = \"INSERT INTO album_library (album_name, artist_name, year)\"\n", 207 | "query2 = query2 + \" VALUES (%s, %s, %s)\"" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 9, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "try:\n", 217 | " session.execute(query, (1970, \"The Beatles\", \"Let it Be\"))\n", 218 | "except Exception as e:\n", 219 | " print(e)\n", 220 | " \n", 221 | "try:\n", 222 | " session.execute(query, (1965, \"The Beatles\", \"Rubber Soul\"))\n", 223 | "except Exception as e:\n", 224 | " print(e)\n", 225 | " \n", 226 | "try:\n", 227 | " session.execute(query, (1965, \"The Who\", \"My Generation\"))\n", 228 | "except Exception as e:\n", 229 | " print(e)\n", 230 | "\n", 231 | "try:\n", 232 | " session.execute(query, (1966, \"The Monkees\", \"The Monkees\"))\n", 233 | "except Exception as e:\n", 234 | " print(e)\n", 235 | "\n", 236 | "try:\n", 237 | " session.execute(query, (1970, \"The Carpenters\", \"Close To You\"))\n", 238 | "except Exception as e:\n", 239 | " print(e)\n", 240 | " " 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 10, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | " \n", 250 | "try:\n", 251 | " session.execute(query1, (\"The Beatles\", 1970, \"Let it Be\"))\n", 252 | "except Exception as e:\n", 253 | " print(e)\n", 254 | " \n", 255 | "try:\n", 256 | " session.execute(query1, (\"The Beatles\", 1965, \"Rubber Soul\"))\n", 257 | "except Exception as e:\n", 258 | " print(e)\n", 259 | " \n", 260 | "try:\n", 261 | " session.execute(query1, (\"The Who\", 1965, \"My Generation\"))\n", 262 | "except Exception as e:\n", 263 | " print(e)\n", 264 | 
"\n", 265 | "try:\n", 266 | " session.execute(query1, (\"The Monkees\", 1966, \"The Monkees\"))\n", 267 | "except Exception as e:\n", 268 | " print(e)\n", 269 | "\n", 270 | "try:\n", 271 | " session.execute(query1, (\"The Carpenters\", 1970, \"Close To You\"))\n", 272 | "except Exception as e:\n", 273 | " print(e)\n", 274 | " " 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 11, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | " \n", 284 | "try:\n", 285 | " session.execute(query1, (\"The Beatles\", 1970, \"Let it Be\"))\n", 286 | "except Exception as e:\n", 287 | " print(e)\n", 288 | " \n", 289 | "try:\n", 290 | " session.execute(query1, (\"The Beatles\", 1965, \"Rubber Soul\"))\n", 291 | "except Exception as e:\n", 292 | " print(e)\n", 293 | " \n", 294 | "try:\n", 295 | " session.execute(query1, (\"The Who\", 1965, \"My Generation\"))\n", 296 | "except Exception as e:\n", 297 | " print(e)\n", 298 | "\n", 299 | "try:\n", 300 | " session.execute(query1, (\"The Monkees\", 1966, \"The Monkees\"))\n", 301 | "except Exception as e:\n", 302 | " print(e)\n", 303 | "\n", 304 | "try:\n", 305 | " session.execute(query1, (\"The Carpenters\", 1970, \"Close To You\"))\n", 306 | "except Exception as e:\n", 307 | " print(e)\n", 308 | " " 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 12, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "try:\n", 318 | " session.execute(query2, (\"Let it Be\", \"The Beatles\", 1970))\n", 319 | "except Exception as e:\n", 320 | " print(e)\n", 321 | " \n", 322 | "try:\n", 323 | " session.execute(query2, (\"Rubber Soul\", \"The Beatles\", 1965))\n", 324 | "except Exception as e:\n", 325 | " print(e)\n", 326 | " \n", 327 | "try:\n", 328 | " session.execute(query2, (\"My Generation\", \"The Who\", 1965))\n", 329 | "except Exception as e:\n", 330 | " print(e)\n", 331 | "\n", 332 | "try:\n", 333 | " session.execute(query2, (\"The Monkees\", \"The Monkees\", 1966))\n", 334 | "except Exception as e:\n", 335 | " print(e)\n", 336 | "\n", 337 | "try:\n", 338 | " session.execute(query2, (\"Close To You\", \"The Carpenters\", 1970))\n", 339 | "except Exception as e:\n", 340 | " print(e)" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "This might have felt unnatural to insert duplicate data into the tables. If I just normalized these tables, I wouldn't have to have extra copies! While this is true, remember there are no `JOINS` in Apache Cassandra. For the benefit of high availibity and scalabity, denormalization must be how this is done. \n" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "### TO-DO: Validate the Data Model" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 13, 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "name": "stdout", 364 | "output_type": "stream", 365 | "text": [ 366 | "1970 The Carpenters Close To You\n", 367 | "1970 The Beatles Let it Be\n" 368 | ] 369 | } 370 | ], 371 | "source": [ 372 | "query = \"select * from music_library WHERE year=1970\"\n", 373 | "try:\n", 374 | " rows = session.execute(query)\n", 375 | "except Exception as e:\n", 376 | " print(e)\n", 377 | " \n", 378 | "for row in rows:\n", 379 | " print (row.year, row.artist_name, row.album_name,)" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 | "### Your output should be:\n", 387 | "1970 The Beatles Let it Be
    \n", 388 | "1970 The Carpenters Close To You" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "### TO-DO: Validate the Data Model" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 14, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "The Beatles 1965 Rubber Soul\n", 408 | "The Beatles 1970 Let it Be\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "query =\"select * from artist_library WHERE artist_name='The Beatles'\"\n", 414 | "try:\n", 415 | " rows = session.execute(query)\n", 416 | "except Exception as e:\n", 417 | " print(e)\n", 418 | " \n", 419 | "for row in rows:\n", 420 | " print (row.artist_name, row.year, row.album_name)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "### Your output should be:\n", 428 | "The Beatles 1965 Rubber Soul
    \n", 429 | "The Beatles 1970 Let it Be" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": [ 436 | "### TO-DO: Validate the Data Model" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 15, 442 | "metadata": {}, 443 | "outputs": [ 444 | { 445 | "name": "stdout", 446 | "output_type": "stream", 447 | "text": [ 448 | "The Carpenters 1970 Close To You\n" 449 | ] 450 | } 451 | ], 452 | "source": [ 453 | "query = \"select * from album_library WHERE album_name='Close To You'\"\n", 454 | "try:\n", 455 | " rows = session.execute(query)\n", 456 | "except Exception as e:\n", 457 | " print(e)\n", 458 | " \n", 459 | "for row in rows:\n", 460 | " print (row.artist_name, row.year, row.album_name)" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "### Your output should be:\n", 468 | "The Carpenters 1970 Close To You" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "### And finally close the session and cluster connection" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 16, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "query = \"DROP TABLE music_library\"\n", 485 | "try:\n", 486 | " rows = session.execute(query)\n", 487 | "except Exception as e:\n", 488 | " print(e)\n", 489 | "query1 = \"DROP TABLE artist_library\"\n", 490 | "try:\n", 491 | " rows = session.execute(query1)\n", 492 | "except Exception as e:\n", 493 | " print(e)\n", 494 | "query2 = \"DROP TABLE album_library\"\n", 495 | "try:\n", 496 | " rows = session.execute(query2)\n", 497 | "except Exception as e:\n", 498 | " print(e)" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 17, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [ 507 | "session.shutdown()\n", 508 | "cluster.shutdown()" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [] 517 | } 518 | ], 519 | "metadata": { 520 | "kernelspec": { 521 | "display_name": "Python 3", 522 | "language": "python", 523 | "name": "python3" 524 | }, 525 | "language_info": { 526 | "codemirror_mode": { 527 | "name": "ipython", 528 | "version": 3 529 | }, 530 | "file_extension": ".py", 531 | "mimetype": "text/x-python", 532 | "name": "python", 533 | "nbconvert_exporter": "python", 534 | "pygments_lexer": "ipython3", 535 | "version": "3.6.3" 536 | } 537 | }, 538 | "nbformat": 4, 539 | "nbformat_minor": 2 540 | } 541 | --------------------------------------------------------------------------------