├── Capstone Project Drug Data Warehouse ├── Data Dictionary.xlsx ├── README.md ├── Socrata_credentials.cfg ├── __pycache__ │ └── sql_queries.cpython-36.pyc ├── capstone.cfg ├── create_cluster.py ├── etl.py ├── images │ ├── database_schema_diagram.png │ ├── table_rowcounts.png │ └── test_query_aspirin.png └── sql_queries.py ├── Project 1 Data Modeling with Postgres ├── .ipynb_checkpoints │ ├── README-checkpoint.md │ ├── create_tables-checkpoint.py │ ├── database_schema_diagram-checkpoint.png │ ├── etl-checkpoint.ipynb │ ├── etl-checkpoint.py │ ├── sql_queries-checkpoint.py │ └── test-checkpoint.ipynb ├── README.md ├── __pycache__ │ └── sql_queries.cpython-36.pyc ├── create_tables.py ├── data │ ├── log_data │ │ └── 2018 │ │ │ └── 11 │ │ │ ├── .ipynb_checkpoints │ │ │ └── 2018-11-02-events-checkpoint.json │ │ │ ├── 2018-11-01-events.json │ │ │ ├── 2018-11-02-events.json │ │ │ ├── 2018-11-03-events.json │ │ │ ├── 2018-11-04-events.json │ │ │ ├── 2018-11-05-events.json │ │ │ ├── 2018-11-06-events.json │ │ │ ├── 2018-11-07-events.json │ │ │ ├── 2018-11-08-events.json │ │ │ ├── 2018-11-09-events.json │ │ │ ├── 2018-11-10-events.json │ │ │ ├── 2018-11-11-events.json │ │ │ ├── 2018-11-12-events.json │ │ │ ├── 2018-11-13-events.json │ │ │ ├── 2018-11-14-events.json │ │ │ ├── 2018-11-15-events.json │ │ │ ├── 2018-11-16-events.json │ │ │ ├── 2018-11-17-events.json │ │ │ ├── 2018-11-18-events.json │ │ │ ├── 2018-11-19-events.json │ │ │ ├── 2018-11-20-events.json │ │ │ ├── 2018-11-21-events.json │ │ │ ├── 2018-11-22-events.json │ │ │ ├── 2018-11-23-events.json │ │ │ ├── 2018-11-24-events.json │ │ │ ├── 2018-11-25-events.json │ │ │ ├── 2018-11-26-events.json │ │ │ ├── 2018-11-27-events.json │ │ │ ├── 2018-11-28-events.json │ │ │ ├── 2018-11-29-events.json │ │ │ └── 2018-11-30-events.json │ └── song_data │ │ └── A │ │ ├── A │ │ ├── A │ │ │ ├── .ipynb_checkpoints │ │ │ │ └── TRAAAAW128F429D538-checkpoint.json │ │ │ ├── TRAAAAW128F429D538.json │ │ │ ├── TRAAABD128F429CF47.json │ │ │ ├── TRAAADZ128F9348C2E.json │ │ │ ├── TRAAAEF128F4273421.json │ │ │ ├── TRAAAFD128F92F423A.json │ │ │ ├── TRAAAMO128F1481E7F.json │ │ │ ├── TRAAAMQ128F1460CD3.json │ │ │ ├── TRAAAPK128E0786D96.json │ │ │ ├── TRAAARJ128F9320760.json │ │ │ ├── TRAAAVG12903CFA543.json │ │ │ └── TRAAAVO128F93133D4.json │ │ ├── B │ │ │ ├── TRAABCL128F4286650.json │ │ │ ├── TRAABDL12903CAABBA.json │ │ │ ├── TRAABJL12903CDCF1A.json │ │ │ ├── TRAABJV128F1460C49.json │ │ │ ├── TRAABLR128F423B7E3.json │ │ │ ├── TRAABNV128F425CEE1.json │ │ │ ├── TRAABRB128F9306DD5.json │ │ │ ├── TRAABVM128F92CA9DC.json │ │ │ ├── TRAABXG128F9318EBD.json │ │ │ ├── TRAABYN12903CFD305.json │ │ │ └── TRAABYW128F4244559.json │ │ └── C │ │ │ ├── TRAACCG128F92E8A55.json │ │ │ ├── TRAACER128F4290F96.json │ │ │ ├── TRAACFV128F935E50B.json │ │ │ ├── TRAACHN128F1489601.json │ │ │ ├── TRAACIW12903CC0F6D.json │ │ │ ├── TRAACLV128F427E123.json │ │ │ ├── TRAACNS128F14A2DF5.json │ │ │ ├── TRAACOW128F933E35F.json │ │ │ ├── TRAACPE128F421C1B9.json │ │ │ ├── TRAACQT128F9331780.json │ │ │ ├── TRAACSL128F93462F4.json │ │ │ ├── TRAACTB12903CAAF15.json │ │ │ ├── TRAACVS128E078BE39.json │ │ │ └── TRAACZK128F4243829.json │ │ └── B │ │ ├── A │ │ ├── TRABACN128F425B784.json │ │ ├── TRABAFJ128F42AF24E.json │ │ ├── TRABAFP128F931E9A1.json │ │ ├── TRABAIO128F42938F9.json │ │ ├── TRABATO128F42627E9.json │ │ ├── TRABAVQ12903CBF7E0.json │ │ ├── TRABAWW128F4250A31.json │ │ ├── TRABAXL128F424FC50.json │ │ ├── TRABAXR128F426515F.json │ │ ├── TRABAXV128F92F6AE3.json │ │ └── TRABAZH128F930419A.json │ │ ├── B │ 
│ ├── TRABBAM128F429D223.json │ │ ├── TRABBBV128F42967D7.json │ │ ├── TRABBJE12903CDB442.json │ │ ├── TRABBKX128F4285205.json │ │ ├── TRABBLU128F93349CF.json │ │ ├── TRABBNP128F932546F.json │ │ ├── TRABBOP128F931B50D.json │ │ ├── TRABBOR128F4286200.json │ │ ├── TRABBTA128F933D304.json │ │ ├── TRABBVJ128F92F7EAA.json │ │ ├── TRABBXU128F92FEF48.json │ │ └── TRABBZN12903CD9297.json │ │ └── C │ │ ├── TRABCAJ12903CDFCC2.json │ │ ├── TRABCEC128F426456E.json │ │ ├── TRABCEI128F424C983.json │ │ ├── TRABCFL128F149BB0D.json │ │ ├── TRABCIX128F4265903.json │ │ ├── TRABCKL128F423A778.json │ │ ├── TRABCPZ128F4275C32.json │ │ ├── TRABCRU128F423F449.json │ │ ├── TRABCTK128F934B224.json │ │ ├── TRABCUQ128E0783E2B.json │ │ ├── TRABCXB128F4286BD3.json │ │ └── TRABCYE128F934CE1D.json ├── database_schema_diagram.png ├── etl.ipynb ├── etl.py ├── sql_queries.py └── test.ipynb ├── Project 1B Data Modeling with Apache Cassandra ├── .ipynb_checkpoints │ ├── Project_1B_ Project_Template-checkpoint.ipynb │ └── event_datafile_new-checkpoint.csv ├── Project_1B_ Project_Template.ipynb ├── ReadMe.md ├── event_data │ ├── 2018-11-01-events.csv │ ├── 2018-11-02-events.csv │ ├── 2018-11-03-events.csv │ ├── 2018-11-04-events.csv │ ├── 2018-11-05-events.csv │ ├── 2018-11-06-events.csv │ ├── 2018-11-07-events.csv │ ├── 2018-11-08-events.csv │ ├── 2018-11-09-events.csv │ ├── 2018-11-10-events.csv │ ├── 2018-11-11-events.csv │ ├── 2018-11-12-events.csv │ ├── 2018-11-13-events.csv │ ├── 2018-11-14-events.csv │ ├── 2018-11-15-events.csv │ ├── 2018-11-16-events.csv │ ├── 2018-11-17-events.csv │ ├── 2018-11-18-events.csv │ ├── 2018-11-19-events.csv │ ├── 2018-11-20-events.csv │ ├── 2018-11-21-events.csv │ ├── 2018-11-22-events.csv │ ├── 2018-11-23-events.csv │ ├── 2018-11-24-events.csv │ ├── 2018-11-25-events.csv │ ├── 2018-11-26-events.csv │ ├── 2018-11-27-events.csv │ ├── 2018-11-28-events.csv │ ├── 2018-11-29-events.csv │ └── 2018-11-30-events.csv ├── event_datafile_new.csv └── images │ ├── .ipynb_checkpoints │ └── image_event_datafile_new-checkpoint.jpg │ └── image_event_datafile_new.jpg ├── Project 3 Create AWS Redshift Data Warehouse ├── README.md ├── create_tables.py ├── database_schema_diagram.png ├── dwh.cfg ├── etl.py ├── query_execution.png ├── sql_queries.py └── table_sizes.png ├── Project 4 Data Lake ├── README.md ├── dl.cfg └── etl.py ├── Project 5 Data Pipelines with Airflow ├── README.md └── airflow │ ├── create_tables.sql │ ├── dags │ └── udac_example_dag.py │ └── plugins │ ├── __init__.py │ ├── helpers │ ├── __init__.py │ ├── data_quality_checks.py │ └── sql_queries.py │ └── operators │ ├── __init__.py │ ├── data_quality.py │ ├── load_dimension.py │ ├── load_fact.py │ └── stage_redshift.py └── __pycache__ └── sql_queries.cpython-36.pyc /Capstone Project Drug Data Warehouse/Data Dictionary.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/Capstone Project Drug Data Warehouse/Data Dictionary.xlsx -------------------------------------------------------------------------------- /Capstone Project Drug Data Warehouse/README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering NanoDegree Capstone Project: 2 | # Drug Information Data Warehouse in the Cloud 3 | 4 | ## Overview 5 | 6 | This project created a drug information database with over **one billion rows** by combining drug label information, pricing and 
adverse events data for drugs registered in the National Drug Code (NDC) directory of the United States. The database was designed for use as the back end of a medication research tool for consumers and professionals. The database's data model was optimized for deployment on a multi-node cluster from Amazon's Redshift database service. An ETL process scripted in python automatically downloads the data from each source into Amazon S3 buckets for staging; transforms and normalizes the data; and loads the data into the database. 7 | 8 | ![](images/table_rowcounts.png) 9 | 10 | ## Data Sources 11 | 12 | * [National average drug acquisition cost](https://dev.socrata.com/foundry/data.medicaid.gov/tau9-gfwr) (NADAC) pharmacy pricing was sourced from the Centers for Medicare & Medicaid Services (CMS) through the Socrata Open Data API. **The data is downloaded in the form of a list of dicts** using the sodapy python client. 13 | * Drug labels and adverse event data were downloaded from the [openFDA S3 bucket](https://open.fda.gov/apis/downloads/) of the U.S. Food and Drug Administration (FDA) at open.fda.gov. The data is available as **zipped JSON files** partitioned by date. 14 | * The Federal Adverse Event Reporting System (FAERS) is the original source of the adverse events data. FAERS provides quarterly data going back to Q1 2004. For purposes of practicality and cost, only reports from Q1 2018 through Q1 2019 were loaded into the database. Even with this limited timeframe, the reports generated more than one billion drug/report combinations. 15 | 16 | ## Data model 17 | 18 | The data model contains six tables from three disparate sources linked by two primary join fields. 19 | 20 | `drug_events`, `pricing` and `labels` are joined together by the NDC codes of the drugs. 21 | 22 | `safety_reports`, `drug_events` and `reactions` are linked together by _safetyreportid_. 23 | 24 | 25 | ![](images/database_schema_diagram.png) 26 | 27 | ### Distribution and sortkeys 28 | 29 | Unlike a traditional RDBMS, Redshift distributes the tables among multiple nodes and uses columnar storage. This means that traditional indexing is not available. Instead, performance is tuned using parameters that define how the tables are distributed among the nodes and how the data is sorted and summarized over the stored columns. 30 | 31 | `ndc`, the consolidated list of NDC codes, is the smallest table in the database with 321k rows. It is expected to participate in joins in almost all queries. In order to improve query response times, `ndc` is distributed to all the nodes in the cluster using DISTSTYLE 'ALL'. 32 | 33 | The largest table in the database, `drug_events`, has more than one billion rows. `drug_events` is distributed by _safetyreportid_ to facilitate joins to `safety_reports` and `reactions`, and sorted on *formatted_ndc* to facilitate joins to `pricing` and `labels` via the `ndc` table. 34 | 35 | ## Results 36 | 37 | Test queries confirmed the ability to retrieve drug information, pricing and adverse events data. 38 | 39 | ![](images/test_query_aspirin.png) 40 | 41 | ## Data Quality Checks 42 | 43 | Quality checks were incorporated into the ETL process. 44 | 45 | * The script tests for missing NDC codes in the adverse events data and excludes those records from the database, since they cannot be linked to prices or labels. 46 | * Similarly, the code tests for and excludes missing drug reactions.
47 | * Duplicate records are eliminated from the safety_reports data prior to loading into the database, as Redshift does not enforce uniqueness and other constraints. 48 | 49 | ## Steps Taken in Project 50 | 51 | 1. Selected drug data as project topic and confirmed availability of public data. 52 | 2. Developed use case. 53 | 3. Researched data formats, download options and API instructions. 54 | 4. Downloaded and explored sample data from each dataset. 55 | 5. Identified potential join fields among the three datasets. 56 | * Located instructions to convert 10-digit NDC codes from labels and drug events to 11-digit NDC codes used in pricing data. 57 | 6. Developed and tested extraction processes. 58 | * Located a community AWS Lambda function to automatically unzip JSON archives (serverlessrepo-s3-uncompressor-UncompressFunction-1UT7EMTAJOBXH). 59 | * Created S3 buckets as staging locations for unzipped and transformed data. 60 | 7. Developed and tested transform processes. 61 | * Utilized pandas to normalize nested JSON structure of adverse events data. 62 | * Configured an EC2 instance to run ETL since local machine was overloaded. 63 | 8. Downloaded data from openFDA S3 bucket over more than 3 days. 64 | 9. Finalized data model and created Redshift database. 65 | 10. Ran ETL script from EC2 instance to populate database. 66 | * Repeated steps 4-10 many times, and in multiple orders, until the data actually loaded. 67 | * Redesigned distribution and sortkey config until test queries completed in reasonable timespans. 68 | 11. Ran test queries to verify database performance. 69 | 70 | ## Other Scenarios 71 | 72 | **If the database size were increased by 100X**, it would be difficult to maintain query performance when joining the pricing, labels and drug_events tables via NDC code. In this case it would be worth considering moving to a NoSQL database structure such as Cassandra, with each table consolidating the pricing, adverse events and label information for a single drug. This would facilitate the database's main use case of researching individual medications. However, it would reduce the ability of the database to support aggregate trend analysis of adverse reactions over, say, brand-name vs. generic drugs or drug types over time. 73 | 74 | **To update the database every morning at 7am**, it would make sense to utilize a pipeline scheduling application such as Airflow. The downloading tasks could be implemented using Airflow hooks to AWS S3 buckets, and the uploading could utilize existing hooks to Redshift. Transform tasks could be implemented using python callables with fairly limited modifications to the existing ETL script. Or, custom operators could be developed to support code reuse. 75 | 76 | **If the database needed to be accessed by 100+ people,** one could enable concurrency scaling in Redshift. In this case Redshift adds clusters as needed to support increases in demand for concurrent querying of the database. There are a number of [technical requirements for concurrency scaling](https://docs.aws.amazon.com/redshift/latest/dg/concurrency-scaling.html) such as node type, sort key type (cannot use interleaved sorting) and query type (e.g. read-only) that must be met. The existing data model and cluster configuration would need to be reviewed to meet these requirements.
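As a rough illustration of the 7am scheduling scenario above, the daily refresh could be wrapped in a small Airflow DAG. The sketch below is a minimal example, assuming the functions in `etl.py` can be imported by the Airflow scheduler; the DAG id, task id and owner are hypothetical, and credential handling, per-stage retries and splitting the pipeline into separate download/transform/COPY tasks are omitted.

```python
# Minimal sketch (not part of the project): schedule the existing capstone ETL
# with Airflow. The ids and names below are illustrative assumptions.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

import etl  # the capstone etl.py module, assumed importable by the scheduler

default_args = {
    'owner': 'capstone',                 # hypothetical owner
    'start_date': datetime(2019, 1, 1),
    'retries': 1,
}

dag = DAG(
    dag_id='drug_warehouse_daily_refresh',   # hypothetical DAG id
    default_args=default_args,
    schedule_interval='0 7 * * *',           # every morning at 7am
    catchup=False,
)

# Run the full extract/transform/load exactly as scripted in etl.py. In practice
# each stage (S3 copy, transform, Redshift COPY) would become its own task so
# failures could be retried independently, e.g. via S3 and Postgres hooks.
run_etl = PythonOperator(
    task_id='run_capstone_etl',
    python_callable=etl.main,
    dag=dag,
)
```

For incremental pricing pulls, the `first=0` branch of `get_pricing_data` already limits the download to the latest `as_of_date`, so a daily schedule would not need to re-download the full NADAC history.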
77 | -------------------------------------------------------------------------------- /Capstone Project Drug Data Warehouse/Socrata_credentials.cfg: -------------------------------------------------------------------------------- 1 | [Socrata] 2 | KeyID=[add key] 3 | KeySecret=[add secret] 4 | AppToken=[add app token] 5 | -------------------------------------------------------------------------------- /Capstone Project Drug Data Warehouse/__pycache__/sql_queries.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/Capstone Project Drug Data Warehouse/__pycache__/sql_queries.cpython-36.pyc -------------------------------------------------------------------------------- /Capstone Project Drug Data Warehouse/capstone.cfg: -------------------------------------------------------------------------------- 1 | [AWS] 2 | SECRET= 3 | KEY= 4 | 5 | [HW] 6 | CLUSTER_TYPE=multi-node 7 | NUM_NODES=2 8 | NODE_TYPE=dc2.large 9 | 10 | [ACCESS] 11 | IAM_ROLE_NAME= 12 | CLUSTER_IDENTIFIER= 13 | DB_NAME= 14 | DB_USER= 15 | DB_PASSWORD= 16 | DB_PORT= 17 | 18 | [CLUSTER] 19 | HOST= 20 | DB_NAME= 21 | DB_USER= 22 | DB_PASSWORD= 23 | DB_PORT= 24 | 25 | [IAM_ROLE] 26 | ARN= 27 | 28 | [GEO] 29 | REGION= 30 | -------------------------------------------------------------------------------- /Capstone Project Drug Data Warehouse/create_cluster.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import configparser 3 | 4 | def get_cluster_params(param_file): 5 | ''' Extract and return cluster parameters from configuration file. 6 | 7 | Args: 8 | config: the configuration parser 9 | param_file: filename of configuration file (.cfg) 10 | 11 | Returns: KEY, SECRET, CLUSTER_TYPE, NUM_NODES, NODE_TYPE, CLUSTER_IDENTIFIER, DB_NAME, DB_USER, DB_PASSWORD, DWH_PORT, IAM_ROLE_NAME 12 | ''' 13 | config = configparser.ConfigParser() 14 | config.read_file(open(param_file)) 15 | 16 | KEY = config.get('AWS','KEY') 17 | SECRET = config.get('AWS','SECRET') 18 | CLUSTER_TYPE = config.get("HW","CLUSTER_TYPE") 19 | NUM_NODES = config.get("HW","NUM_NODES") 20 | NODE_TYPE = config.get("HW","NODE_TYPE") 21 | CLUSTER_IDENTIFIER = config.get("ACCESS","CLUSTER_IDENTIFIER") 22 | DB_NAME = config.get("ACCESS","DB_NAME") 23 | DB_USER = config.get("ACCESS","DB_USER") 24 | DB_PASSWORD = config.get("ACCESS","DB_PASSWORD") 25 | DB_PORT = config.get("ACCESS","DB_PORT") 26 | IAM_ROLE_NAME = config.get("ACCESS", "IAM_ROLE_NAME") 27 | 28 | return(KEY, SECRET, CLUSTER_TYPE, NUM_NODES, NODE_TYPE, CLUSTER_IDENTIFIER, DB_NAME, DB_USER, DB_PASSWORD, DB_PORT, IAM_ROLE_NAME) 29 | 30 | 31 | def get_ARN(iam_role_name, key, secret): 32 | '''Retrieve ARN of existing IAM role to enable S3 read-only access from Redshift 33 | 34 | Args: 35 | iam_role_name: Name of existing IAM role with S3 read-only access policy 36 | key: Access Key ID for programmatic access to AWS API 37 | secret: Secret Access Key for programmatic access to AWS API 38 | 39 | Returns: ARN 40 | ''' 41 | iam = boto3.client('iam', 42 | region_name='us-west-2', 43 | aws_access_key_id=key, 44 | aws_secret_access_key=secret) 45 | try: 46 | return iam.get_role(RoleName=iam_role_name)['Role']['Arn'] 47 | 48 | except Exception: 49 | return None 50 | 51 | 52 | def create_redshift_cluster(key, secret, cluster_type, node_type, num_nodes, db_name, cluster_identifier, db_user, db_password, roleArn): 53 
| '''Create AWS redshift cluster via API 54 | 55 | Args: 56 | key: AWS API access ID key 57 | secret: AWS API secret key 58 | cluster_type: 'single-node' or 'multi-node' 59 | node_type: node type, e.g. dc2.large 60 | num_nodes: number of nodes to create 61 | db_name: database name 62 | cluster_identifier: cluster name 63 | db_user: database user name 64 | db_password: database user password 65 | roleArn: Amazon Resource Name (ARN) for IAM role 66 | 67 | Returns: Status message indicating success or failure to initiate cluster 68 | ''' 69 | redshift = boto3.client('redshift', 70 | region_name='us-west-2', 71 | aws_access_key_id=key, 72 | aws_secret_access_key=secret) 73 | 74 | try: 75 | response = redshift.create_cluster( 76 | # add parameters for hardware 77 | ClusterType=cluster_type, 78 | NodeType=node_type, 79 | NumberOfNodes=int(num_nodes), 80 | 81 | # add parameters for identifiers & credentials 82 | DBName=db_name, 83 | ClusterIdentifier=cluster_identifier, 84 | MasterUsername=db_user, 85 | MasterUserPassword=db_password, 86 | 87 | # add parameter for role (to allow s3 access) 88 | IamRoles=[roleArn] 89 | ) 90 | 91 | return 'Creating cluster. Check management console for status.' 92 | 93 | except Exception as e: 94 | return ('Cluster creation failed: ' + str(e)) 95 | 96 | 97 | def main(): 98 | key, secret, cluster_type, num_nodes, node_type, cluster_identifier, \ 99 | db_name, db_user, db_password, db_port, iam_role_name = get_cluster_params('capstone.cfg') 100 | 101 | ARN = get_ARN(iam_role_name, key, secret) 102 | 103 | response = create_redshift_cluster(key, secret, cluster_type, node_type, num_nodes, db_name, cluster_identifier, db_user, db_password, ARN) 104 | print(response) 105 | 106 | 107 | if __name__ == "__main__": 108 | main() 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /Capstone Project Drug Data Warehouse/etl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import configparser 3 | import boto3 4 | import psycopg2 5 | import json 6 | import re 7 | import datetime 8 | import pandas as pd 9 | from pandas.io.json import json_normalize 10 | from sodapy import Socrata 11 | from sql_queries import load_ndc_queries 12 | 13 | 14 | def format_ndc(ndc): 15 | '''Convert NDC code to 11 digits per www.drugs.com/ndc.html''' 16 | # 4-4-2 17 | if re.match('^[0-9]{4}-[0-9]{4}-[0-9]{2}$', ndc): 18 | return '0' + ndc.replace('-','') 19 | # 5-3-2 20 | elif re.match('^[0-9]{5}-[0-9]{3}-[0-9]{2}$', ndc): 21 | return ndc[0:5] + '0' + ndc[6:9] + ndc[10:] 22 | # 5-4-1 23 | elif re.match('^[0-9]{5}-[0-9]{4}-[0-9]{1}$', ndc): 24 | return ndc.replace('-','')[0:9] + '0' + ndc[-1] 25 | else: 26 | return None 27 | 28 | 29 | def copy_from_S3_to_S3(s3, source, destination, key): 30 | '''Copy files between S3 buckets. 31 | 32 | s3: s3 resource 33 | source: source bucket 34 | destination: destination bucket 35 | key: prefix for source and destination 36 | 37 | ''' 38 | for obj in s3.Bucket(source).objects.filter(Prefix=key): 39 | print('Copying ' + obj.key) 40 | copy_source = { 41 | 'Bucket': source, 42 | 'Key': obj.key 43 | } 44 | s3.meta.client.copy(copy_source, destination, obj.key) 45 | 46 | 47 | def process_labels(s3, source, destination, key): 48 | '''Download labels JSONs, process data and save to S3 in csv format. 
49 | 50 | s3: s3 resource 51 | source: source bucket 52 | destination: destination bucket 53 | key: prefix for source and destination 54 | 55 | ''' 56 | for obj in s3.Bucket(source).objects.filter(Prefix=key): 57 | # download file 58 | fn=obj.key.split('/')[-1] 59 | s3.Object(source, obj.key).download_file(Filename=fn) 60 | print('processing ' + obj.key) 61 | 62 | # load to dict 63 | with open(fn) as f: 64 | labels_dict = json.load(f) 65 | 66 | # normalize 67 | labels = json_normalize(labels_dict['results']) 68 | 69 | # convert invalid column names 70 | labels = labels.rename({ 71 | 'openfda.package_ndc':'package_ndc', 72 | 'openfda.generic_name':'generic_name', 73 | 'openfda.brand_name':'brand_name', 74 | }, axis=1) 75 | 76 | # select columns 77 | desired_columns = ['active_ingredient', 78 | 'adverse_reactions', 79 | 'brand_name', 80 | 'drug_interactions', 81 | 'generic_name', 82 | 'indications_and_usage', 83 | 'package_ndc', 84 | 'warnings'] 85 | 86 | labels = labels[desired_columns] 87 | 88 | # explode ndc's 89 | labels = labels.explode('package_ndc') 90 | 91 | # format ndc's 92 | labels['formatted_ndc'] = [format_ndc(str(x)) for x in 93 | labels['package_ndc']] 94 | 95 | # sort columns to match database table 96 | labels = labels[sorted(labels.columns)] 97 | 98 | # save to destination 99 | outfile = os.path.join('s3a://', destination, obj.key[0:-5], '.csv') 100 | 101 | labels.to_csv(outfile, sep='|', header=0, index=0) 102 | print('processed and saved as csv') 103 | 104 | # delete file 105 | os.remove(fn) 106 | 107 | 108 | def process_drug_events(s3, source, destination, key): 109 | '''Download drug events JSONs, process data and save to S3 as csv. 110 | 111 | s3: s3 resource 112 | source: source bucket 113 | destination: destination bucket 114 | key: prefix for source and destination 115 | 116 | ''' 117 | for obj in s3.Bucket(source).objects.filter(Prefix=key): 118 | 119 | # download file 120 | fn=obj.key.split('/')[-1] 121 | s3.Object(source, obj.key).download_file(Filename=fn) 122 | 123 | print('processing ' + obj.key) 124 | 125 | # load to dict 126 | with open(fn) as f: 127 | events_dict = json.load(f) 128 | 129 | # normalize nested patient-drug data 130 | drugs = json_normalize(events_dict['results'], 131 | ['patient', 'drug'], 132 | 'safetyreportid') 133 | 134 | # skip file if package_ndc element is missing 135 | if not 'openfda.package_ndc' in drugs.columns.values: 136 | print('package_ndc missing from ' + fn) 137 | os.remove(fn) 138 | continue 139 | 140 | # correct invalid column names 141 | drugs = drugs.rename({'openfda.package_ndc':'package_ndc'}, axis=1) 142 | 143 | # create drug_events fact table 144 | drug_events = (drugs[['package_ndc', 'safetyreportid']] 145 | .explode('package_ndc') 146 | .drop_duplicates()) 147 | 148 | # convert 10-digit ndc to 11-digit format 149 | drug_events['formatted_ndc'] = [format_ndc(str(x)) for x in 150 | drug_events['package_ndc']] 151 | 152 | # sort columns to match database table 153 | drug_events = drug_events[sorted(drug_events.columns)] 154 | 155 | # drop row if data missing 156 | drug_events = drug_events.dropna() 157 | 158 | # normalize nested patient reactions data 159 | reactions = json_normalize(events_dict['results'], 160 | ['patient', 'reaction'], 161 | 'safetyreportid') 162 | 163 | reactions_columns = ['reactionmeddrapt', 'safetyreportid'] 164 | 165 | reactions = reactions[reactions_columns] 166 | 167 | # normalize safetyreports data 168 | sr = json_normalize(events_dict['results'], max_level=1) 169 | 170 | desired_columns = 
['safetyreportid', 'receivedate', 'receiptdate', 171 | 'seriousnesshospitalization','seriousnessdeath', 172 | 'seriousnesslifethreatening','seriousnessdisabling', 173 | 'seriousnesscongenitalanomali', 174 | 'patient.patientdeath'] 175 | 176 | available_columns = [colname for colname in sr.columns.values 177 | if colname in desired_columns] 178 | 179 | safetyreports = pd.DataFrame(columns=['safetyreportid', 180 | 'receivedate', 181 | 'receiptdate', 182 | 'seriousnesshospitalization', 183 | 'seriousnessdeath', 184 | 'seriousnesslifethreatening', 185 | 'seriousnessdisabling', 186 | 'seriousnesscongenitalanomali', 187 | 'patient.patientdeath']) 188 | 189 | safetyreports = safetyreports.append(sr[available_columns], sort=True) 190 | 191 | safetyreports.rename({'patient.patientdeath':'patientdeath'}, 192 | axis=1, inplace=True) 193 | 194 | safetyreports['receiptdate'] = safetyreports['receiptdate'].apply( 195 | lambda x: pd.to_datetime(x)) 196 | 197 | safetyreports['receivedate'] = safetyreports['receivedate'].apply( 198 | lambda x: pd.to_datetime(x)) 199 | 200 | safetyreports = (safetyreports[sorted(safetyreports.columns)] 201 | .drop_duplicates()) 202 | 203 | # save to destination 204 | safetyreports_key = 'safetyreports/' + obj.key[0:-5] 205 | 206 | safetyreports_outfile = os.path.join('s3a://', destination, 207 | safetyreports_key) 208 | 209 | safetyreports.to_csv(safetyreports_outfile, sep='|', header=0, index=0) 210 | 211 | drug_events_key = 'drug_events/' + obj.key[0:-5] 212 | 213 | drug_events_outfile = os.path.join('s3a://', destination, 214 | drug_events_key) 215 | 216 | drug_events.to_csv(drug_events_outfile, sep='|', header=0, index=0) 217 | 218 | reactions_key = 'reactions/reactions_' + obj.key[0:-5] 219 | 220 | reactions_outfile = os.path.join('s3a://', destination, reactions_key) 221 | 222 | # skip reactions if 'reactionmeddrapt' column missing 223 | if 'reactionmeddrapt' in reactions.columns.values: 224 | reactions.to_csv(reactions_outfile, sep='|', header=0, index=0) 225 | 226 | print('processed and saved as csv') 227 | 228 | # delete file 229 | os.remove(fn) 230 | 231 | def get_pricing_data(app_token, destination, key, first=1): 232 | '''Download NADAC pricing, process data and save to S3 as csv. 
233 | 234 | app_token: Socrata app token 235 | destination: destination bucket 236 | key: prefix for destination 237 | 238 | ''' 239 | # download pricing data 240 | client = Socrata('data.medicaid.gov', app_token, timeout=100) 241 | 242 | # download all data on first data load 243 | if first==1: 244 | 245 | # get record count 246 | q = '''select count(as_of_date) as row_count ''' 247 | 248 | row_count = int(client.get("tau9-gfwr", query=q)[0]['row_count']) 249 | 250 | # page through data to download 251 | offset = -100000 252 | limit = 100000 253 | while offset < row_count: 254 | offset += limit 255 | #returned as JSON by API > converted to list of dicts by sodapy 256 | results = client.get("tau9-gfwr", limit=limit, offset=offset) 257 | if offset == 0: 258 | pricing = pd.DataFrame.from_records(results) 259 | else: 260 | pricing = pricing.append(pd.DataFrame.from_records(results), 261 | sort=True) 262 | else: 263 | # weekly updates: download latest as_of_date (about 25k rows) 264 | q2 = '''select max(as_of_date) as max_as_of_date ''' 265 | 266 | max_as_of_date = client.get("tau9-gfwr", query=q2)[0]['max_as_of_date'] 267 | 268 | q3 = '''select * where as_of_date = {}'''.format(max_as_of_date) 269 | 270 | results = client.get("tau9-gfwr", query=q3) 271 | 272 | pricing = pd.DataFrame.from_records(results) 273 | 274 | client.close() 275 | 276 | # wrangle dates 277 | pricing['as_of_date'] = (pricing['as_of_date'] 278 | .apply(lambda x: pd.Timestamp(x))) 279 | 280 | pricing['effective_date'] = (pricing['effective_date'] 281 | .apply(lambda x: pd.Timestamp(x))) 282 | 283 | pricing['corresponding_generic_drug_effective_date'] = ( 284 | pricing['corresponding_generic_drug_effective_date'] 285 | .apply(lambda x: pd.Timestamp(x))) 286 | 287 | # save to destination 288 | outfile = os.path.join('s3a://', destination, 'pricing/pricing') 289 | 290 | pricing.to_csv(outfile, sep='|', header=0, index=0) 291 | 292 | print(str(len(pricing)) + ' pricing records processed and saved as \ 293 | csv') 294 | 295 | 296 | def load_data_into_redshift(key, table, s3, source, cur, ARN): 297 | '''Copy data from csv files on S3 to table in redshift database. 
298 | 299 | s3: S3 resource 300 | source: S3 source bucket 301 | key: S3 prefix 302 | table: destination table 303 | cur: psycopg2 cursor 304 | 305 | ''' 306 | for obj in s3.Bucket(source).objects.filter(Prefix=key): 307 | 308 | s3_location = str(os.path.join('s3://', source, obj.key)) 309 | 310 | copy_query = ("""COPY {} FROM '{}' iam_role '{}' CSV DELIMITER '|' TRUNCATECOLUMNS;""").format(table, s3_location, ARN) 311 | 312 | cur.execute(copy_query) 313 | 314 | print(str(obj.key) + ' loaded to table ' + table) 315 | 316 | 317 | def load_ndc_table(cur, conn): 318 | """Run load ndc table queries.""" 319 | for query in load_ndc_queries: 320 | cur.execute(query) 321 | conn.commit() 322 | 323 | 324 | def main(): 325 | # load Socrata credentials 326 | config = configparser.ConfigParser() 327 | config.read('Socrata_credentials.cfg') 328 | app_token = config.get('Socrata', 'AppToken') 329 | 330 | # load AWS credentials 331 | config.read('capstone.cfg') 332 | AWS_ACCESS_KEY_ID = config.get('AWS', 'KEY') 333 | AWS_SECRET_ACCESS_KEY = config.get('AWS', 'SECRET') 334 | ARN = config.get('IAM_ROLE', 'ARN') 335 | 336 | # initialize s3 resource 337 | s3 = boto3.resource('s3', 338 | aws_access_key_id=AWS_ACCESS_KEY_ID, 339 | aws_secret_access_key=AWS_SECRET_ACCESS_KEY) 340 | 341 | # set source and destination buckets 342 | openFDA_bucket = "download.open.fda.gov" 343 | zipped_bucket = "dend-rbmayer-zipped" 344 | unzipped_bucket = "dend-rbmayer-unzipped" 345 | labels_key = "drug/label" 346 | drug_events_key = "drug/event" 347 | destination_bucket = "dend-rbmayer" 348 | 349 | # copy labels archives from openFDA to zipped bucket 350 | # AWS Lambda function will automatically unzip files 351 | copy_from_S3_to_S3(s3, openFDA_bucket, zipped_bucket, labels_key) 352 | 353 | # process labels 354 | process_labels(s3, unzipped_bucket, destination_bucket, labels_key) 355 | 356 | # copy drug events archives from openFDA to zipped bucket 357 | copy_from_S3_to_S3(s3, openFDA_bucket, zipped_bucket, drug_events_key) 358 | 359 | # process drug events 360 | process_drug_events(s3, unzipped_bucket, destination_bucket, 361 | drug_events_key) 362 | 363 | # process pricing data 364 | get_pricing_data(app_token, destination_bucket, 'pricing') 365 | 366 | # load data into redshift 367 | keys_and_tables = [['pricing', 'pricing'], 368 | [labels_key, 'labels'], 369 | ['safetyreports', 'safetyreports'], 370 | ['reactions', 'reactions'], 371 | ['drug_events', 'drug_events']] 372 | 373 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}" 374 | .format(*config['CLUSTER'].values())) 375 | conn.autocommit = True 376 | cur = conn.cursor() 377 | 378 | for pair in keys_and_tables: 379 | load_data_into_redshift(*pair, s3, destination_bucket, cur, ARN) 380 | 381 | # populate ndc table 382 | load_ndc_table(cur, conn) 383 | 384 | conn.close() 385 | 386 | 387 | if __name__ == "__main__": 388 | main() 389 | -------------------------------------------------------------------------------- /Capstone Project Drug Data Warehouse/images/database_schema_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/Capstone Project Drug Data Warehouse/images/database_schema_diagram.png -------------------------------------------------------------------------------- /Capstone Project Drug Data Warehouse/images/table_rowcounts.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/Capstone Project Drug Data Warehouse/images/table_rowcounts.png -------------------------------------------------------------------------------- /Capstone Project Drug Data Warehouse/images/test_query_aspirin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/Capstone Project Drug Data Warehouse/images/test_query_aspirin.png -------------------------------------------------------------------------------- /Capstone Project Drug Data Warehouse/sql_queries.py: -------------------------------------------------------------------------------- 1 | # DROP TABLES 2 | 3 | drug_events_table_drop = "DROP TABLE IF EXISTS drug_events" 4 | reactions_table_drop = "DROP TABLE IF EXISTS reactions" 5 | safetyreports_table_drop = "DROP TABLE IF EXISTS safety_reports" 6 | labels_table_drop = "DROP TABLE IF EXISTS labels" 7 | pricing_table_drop = "DROP TABLE IF EXISTS pricing" 8 | ndc_table_drop = "DROP TABLE IF EXISTS ndc" 9 | 10 | 11 | # CREATE TABLES 12 | 13 | drug_events_table_create = ("""CREATE TABLE IF NOT EXISTS drug_events ( 14 | drug_event_id int IDENTITY primary key, 15 | formatted_ndc nvarchar(11) NOT NULL sortkey, 16 | package_ndc nvarchar(20) NOT NULL, 17 | safetyreportid nvarchar(9) NOT NULL distkey 18 | ) 19 | DISTSTYLE KEY; 20 | """) 21 | 22 | reactions_table_create = ("""CREATE TABLE IF NOT EXISTS reactions ( 23 | reactionmeddrapt nvarchar(max) NOT NULL, 24 | reactions_id int IDENTITY primary key, 25 | safetyreportid varchar(9) NOT NULL sortkey 26 | ) 27 | DISTSTYLE AUTO; 28 | """) 29 | 30 | safetyreports_table_create = ("""CREATE TABLE IF NOT EXISTS safety_reports ( 31 | id int IDENTITY, 32 | patientdeath BOOL, 33 | receiptdate date, 34 | receivedate date, 35 | safetyreportid varchar(9) NOT NULL primary key sortkey distkey, 36 | seriousnesscongenitalanomali BOOL, 37 | seriousnessdeath BOOL, 38 | seriousnessdisabling BOOL, 39 | seriousnesshospitalization BOOL, 40 | seriousnesslifethreatening BOOL 41 | ) 42 | DISTSTYLE KEY; 43 | """) 44 | 45 | 46 | labels_table_create = ("""CREATE TABLE IF NOT EXISTS labels ( 47 | active_ingredient nvarchar(max), 48 | adverse_reactions nvarchar(max), 49 | brand_name nvarchar(max), 50 | drug_interactions nvarchar(max), 51 | formatted_ndc nvarchar(11) NOT NULL sortkey, 52 | generic_name nvarchar(max), 53 | indications_and_usage nvarchar(max), 54 | labels_id int IDENTITY, 55 | package_ndc nvarchar(20), 56 | warnings nvarchar(max) 57 | ) 58 | DISTSTYLE AUTO; 59 | """) 60 | 61 | 62 | pricing_table_create = ("""CREATE TABLE IF NOT EXISTS pricing ( 63 | as_of_date timestamp distkey, 64 | classification_for_rate_setting nvarchar, 65 | corresponding_generic_drug_effective_date timestamp, 66 | corresponding_generic_drug_nadac_per_unit float, 67 | effective_date timestamp, 68 | explanation_code nvarchar, 69 | nadac_per_unit float, 70 | ndc nvarchar(11) NOT NULL sortkey, 71 | ndc_description nvarchar(max), 72 | otc nvarchar, 73 | pharmacy_type_indicator nvarchar, 74 | pricing_id int IDENTITY primary key, 75 | pricing_unit nvarchar 76 | ) 77 | DISTSTYLE KEY; 78 | """) 79 | 80 | ndc_table_create = ("""CREATE TABLE IF NOT EXISTS ndc ( 81 | ndc varchar(11) NOT NULL primary key sortkey 82 | ) 83 | DISTSTYLE ALL; 84 | """) 85 | 86 
| 87 | # ALTER TABLES TO ADD FOREIGN CONSTRAINTS 88 | drug_events_table_alter = (""" 89 | ALTER TABLE drug_events ADD foreign key(formatted_ndc) references ndc(ndc); 90 | ALTER TABLE drug_events ADD foreign key(safetyreportid) references safety_reports(safetyreportid); 91 | """) 92 | 93 | reactions_table_alter = (""" 94 | ALTER TABLE reactions ADD foreign key(safetyreportid) references safety_reports(safetyreportid); 95 | """) 96 | 97 | labels_table_alter = (""" 98 | ALTER TABLE labels ADD foreign key(formatted_ndc) references ndc(ndc); 99 | """) 100 | 101 | pricing_table_alter = (""" 102 | ALTER TABLE pricing ADD foreign key(ndc) references ndc(ndc); 103 | """) 104 | 105 | # POPULATE ndc TABLE AFTER DATABASE LOAD 106 | create_labels_ndc_table = ("""CREATE TABLE IF NOT EXISTS public.labels_ndc (ndc varchar(11) NOT NULL sortkey);""") 107 | 108 | insert_labels_ndc_query = ("""INSERT INTO public.labels_ndc (select distinct formatted_ndc as ndc from public.labels);""") 109 | 110 | create_pricing_ndc_table = ("""CREATE TABLE IF NOT EXISTS public.pricing_ndc (ndc varchar(11) NOT NULL sortkey);""") 111 | 112 | insert_pricing_ndc_query = ("""INSERT INTO public.pricing_ndc (select distinct ndc from public.pricing);""") 113 | 114 | insert_ndc_query = ("""INSERT INTO public.ndc (select distinct ndc from (select * from public.labels_ndc union select * from public.pricing_ndc));""") 115 | 116 | drop_labels_ndc = ("""DROP TABLE public.labels_ndc;""") 117 | 118 | drop_pricing_ndc = ("""DROP TABLE public.pricing_ndc;""") 119 | 120 | # QUERY LISTS 121 | create_table_queries = [drug_events_table_create, reactions_table_create, 122 | safetyreports_table_create, labels_table_create, 123 | pricing_table_create, ndc_table_create] 124 | 125 | drop_table_queries = [drug_events_table_drop, reactions_table_drop, 126 | safetyreports_table_drop, labels_table_drop, 127 | pricing_table_drop, ndc_table_drop] 128 | 129 | alter_table_queries = [drug_events_table_alter, reactions_table_alter, 130 | labels_table_alter, pricing_table_alter] 131 | 132 | load_ndc_queries = [create_labels_ndc_table, insert_labels_ndc_query, 133 | create_pricing_ndc_table, insert_pricing_ndc_query, 134 | insert_ndc_query, drop_labels_ndc, drop_pricing_ndc] 135 | 136 | -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/.ipynb_checkpoints/README-checkpoint.md: -------------------------------------------------------------------------------- 1 | # DEND Project 1: Data Modeling with Postgres 2 | 3 | ## Project Summary 4 | 5 | The objective of this project is to create a SQL analytics database for a fictional music streaming service called Sparkify. Sparkify's analytics team seeks to understand what, when and how users are playing songs on the company's music app. The analysts need an easy way to query and analyze the songplay data, which is currently stored in raw JSON logs and metadata files on a local directory. 6 | 7 | As the data engineer assigned to the project, I have implemented an ETL pipeline in python to process and upload the data into a PostgreSQL database. The ETL process extracts each songplay from the list of page actions recorded by the app. Data for analysis, such as song name, user information, subscription tier, and location of user, is structured into the main songplay table and related dimensional tables. 8 | 9 | Data Modeling with Postgres was submitted for Udacity's Data Engineering Nanodegree (DEND) in Spring 2019. 10 | 11 | ## How to Use 12 | 13 | 1. 
Run create_tables.py from terminal or python console to set up database and tables. 14 | 2. Run etl.py from terminal or console to process and load data into database. 15 | 3. Optional: Launch etl.ipynb using Jupyter Notebook to explore how process was developed. Launch test.ipynb to run validation and example queries. 16 | 17 | ## Database Schema 18 | 19 | The sparkify database design uses the simple star schema shown below. The schema contains one fact table, *songplays*, and four dimension tables: *songs*, *artists*, *users* and *time*. The fact table references the primary keys of each dimention table, enabling joins to songplays on song_id, artist_id, user_id and start_time, respectively. This structure will enable the analysts to aggregate the data efficiently and explore it using standard SQL queries. 20 | 21 | ![Database schema diagram](database_schema_diagram.png) 22 | 23 | ###### Instructions for generating the schema diagram using [sqlalchemy_schemadisplay](https://github.com/fschulze/sqlalchemy_schemadisplay) were provided by Syed Mateen in the project-1-dend-v1 slack channel. Thanks Syed! 24 | 25 | Each songplay in the fact table is identified by a unique uuid generated from the song, user id and timestamp of the log entry. This field is set as a primary key, so that it is unique and non-null. A constraint on the UPSERT operation ensures that there are no duplicate songplays in the database. If the log contains multiple entries with the same song, user id and timestamp, only the first entry is imported. The process of generating unique uuid's could be applied to all of the primary identifiers of the dimension tables. This would improve join efficiency if the database were very large. 26 | 27 | To keep subscription data as up-to-date as log data allows, the users table updates the subscription status of the user ("level") when processing the data to reflect membership status as of the most recent songplay timestamp. 28 | 29 | ## Data Processing and Quality Checks 30 | 31 | Data is extracted from two types of JSON source files: song data from the [Million Song Dataset](https://labrosa.ee.columbia.edu/millionsong/) and songplay data from user logs. The JSON files are read into pandas dataframes, processed and uploaded into the database using psycopg2. 32 | 33 | A number of steps clean the data and reduce the size of the database by removing data not needed for the analysis: 34 | * Songplays are identified by filtering for actions initiated from the 'NextSong' page. 35 | * Timestamps are converted from UNIX time to datetime format without time zone prior to upload. 36 | * Rows from the users table are excluded where user_id is missing. 37 | * Rows from the artists table are excluded where artist_id is missing. 38 | 39 | ## Example Queries and Results 40 | 41 | The dataset contains 6,820 songplays from November 2018. 42 | 43 | > SELECT tm.month, tm.year, COUNT(sp.songplay_id) as songplay_count 44 | FROM songplays sp 45 | LEFT JOIN time tm 46 | ON sp.start_time = tm.start_time 47 | GROUP BY tm.month, tm.year; 48 | 49 | 81% of songplays -- 5591 streams -- are generated by paid members. 
50 | 51 | > SELECT 52 | sp.level, 53 | COUNT(sp.songplay_id) as songplay_count, 54 | 100*COUNT(sp.songplay_id)/(select count(s.songplay_id) from songplays s) as percent 55 | FROM songplays sp GROUP BY sp.level; 56 | 57 | -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/.ipynb_checkpoints/create_tables-checkpoint.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from sql_queries import create_table_queries, drop_table_queries 3 | 4 | 5 | def create_database(): 6 | """Return cursor and connection to sparkify database. 7 | 8 | Drop (if exists) and create sparkify database on default student connection 9 | to PostgreSQL engine. 10 | 11 | Returns: 12 | cur: database cursor 13 | conn: database connection 14 | 15 | """ 16 | # connect to default database 17 | conn = psycopg2.connect("host=127.0.0.1 dbname=studentdb user=student password=student") 18 | conn.set_session(autocommit=True) 19 | cur = conn.cursor() 20 | 21 | # create sparkify database with UTF8 encoding 22 | cur.execute("DROP DATABASE IF EXISTS sparkifydb") 23 | cur.execute("CREATE DATABASE sparkifydb WITH ENCODING 'utf8' TEMPLATE template0") 24 | 25 | # close connection to default database 26 | conn.close() 27 | 28 | # connect to sparkify database 29 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") 30 | cur = conn.cursor() 31 | 32 | return cur, conn 33 | 34 | 35 | def drop_tables(cur, conn): 36 | """Run drop table queries.""" 37 | for query in drop_table_queries: 38 | cur.execute(query) 39 | conn.commit() 40 | 41 | 42 | def create_tables(cur, conn): 43 | """Run create table queries.""" 44 | for query in create_table_queries: 45 | cur.execute(query) 46 | conn.commit() 47 | 48 | 49 | def main(): 50 | """Create database and tables.""" 51 | cur, conn = create_database() 52 | 53 | drop_tables(cur, conn) 54 | create_tables(cur, conn) 55 | 56 | conn.close() 57 | 58 | 59 | if __name__ == "__main__": 60 | main() -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/.ipynb_checkpoints/database_schema_diagram-checkpoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/Project 1 Data Modeling with Postgres/.ipynb_checkpoints/database_schema_diagram-checkpoint.png -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/.ipynb_checkpoints/etl-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import psycopg2 4 | import pandas as pd 5 | from sql_queries import * 6 | 7 | 8 | def process_song_file(cur, filepath): 9 | """Insert record from JSON song file into postgresql tables. 10 | 11 | Read JSON file to pandas dataframe, clean and process data, 12 | then load to song and artist tables. 
13 | 14 | Parameters: 15 | cur (cursor object): connection cursor 16 | filepath (string): filepath 17 | 18 | Returns: None 19 | 20 | """ 21 | # open song file 22 | df = pd.read_json(filepath, lines=True) 23 | 24 | # insert song record 25 | song_data = df.loc[(df['song_id'].notnull() & df['title'].notnull() 26 | & df['artist_id'].notnull() & df['year'].notnull()), 27 | ['song_id', 'title', 'artist_id', 'year', 'duration']].values[0].tolist() 28 | cur.execute(song_table_insert, song_data) 29 | 30 | # insert artist record 31 | artist_data = df.loc[(df['artist_id'].notnull() & df['artist_name'].notnull()), 32 | ['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 33 | 'artist_longitude']].values[0].tolist() 34 | cur.execute(artist_table_insert, artist_data) 35 | 36 | 37 | def process_log_file(cur, filepath): 38 | """Insert records from JSON log files into PostgreSQL tables. 39 | 40 | Read JSON file to pandas dataframe, clean and process data, 41 | then load to user and songplay tables. 42 | 43 | Parameters: 44 | cur (cursor object): connection cursor 45 | filepath (string): filepath 46 | 47 | Returns: None 48 | 49 | """ 50 | # open log file 51 | df = pd.read_json(filepath, lines=True) 52 | 53 | # filter by NextSong action 54 | df = df.loc[df.page=='NextSong'] 55 | 56 | # convert timestamp column to datetime 57 | df['ts'] = pd.to_datetime(df['ts'], unit='ms') 58 | t = pd.to_datetime(df['ts'], unit='ms') 59 | 60 | # insert non-null time data records 61 | time_data = list((t, t.dt.hour, t.dt.day, t.dt.week, t.dt.month, t.dt.year, t.dt.weekday)) 62 | column_labels = ('timestamp', 'hour', 'day', 'week', 'month', 'year', 'weekday') 63 | time_df = pd.DataFrame.from_dict(dict(zip(column_labels,time_data))) 64 | time_df = time_df.loc[time_df['timestamp'].notnull()] 65 | 66 | for i, row in time_df.iterrows(): 67 | cur.execute(time_table_insert, list(row)) 68 | 69 | # load user table 70 | # filter out rows with no user id, gender, level or timestamp 71 | user_df = df.loc[(df['userId'].notnull() & df['gender'].notnull() 72 | & df['level'].notnull() & df['ts'].notnull()), 73 | ['userId', 'firstName', 'lastName', 'gender', 'level', 'ts']] 74 | 75 | # insert user records 76 | for i, row in user_df.iterrows(): 77 | cur.execute(user_table_insert, row) 78 | 79 | # insert songplay records 80 | for index, row in df.iterrows(): 81 | 82 | # get songid and artistid from song and artist tables 83 | cur.execute(song_select, (row.song, row.artist, row.length)) 84 | results = cur.fetchone() 85 | 86 | if results: 87 | songid, artistid = results 88 | else: 89 | songid, artistid = None, None 90 | 91 | # create songplay uuid 92 | name = (str(row.song) + str(row.ts) + str(row.userId),) 93 | generate_uuid = ("""SELECT uuid_generate_v5(uuid_nil(), %s)""") 94 | cur.execute(generate_uuid, name) 95 | songplayid = cur.fetchone() 96 | 97 | # insert songplay record 98 | songplay_data = (songplayid, row.ts, row.userId, row.level, songid, artistid, 99 | row.sessionId, row.location, row.userAgent) 100 | if row.ts is not None: 101 | cur.execute(songplay_table_insert, songplay_data) 102 | 103 | 104 | def process_data(cur, conn, filepath, func): 105 | """Process data files from directory using function.""" 106 | # get all files matching extension from directory 107 | all_files = [] 108 | for root, dirs, files in os.walk(filepath): 109 | files = glob.glob(os.path.join(root,'*.json')) 110 | for f in files : 111 | all_files.append(os.path.abspath(f)) 112 | 113 | # get total number of files found 114 | num_files = len(all_files) 
115 | print('{} files found in {}'.format(num_files, filepath)) 116 | 117 | # iterate over files and process 118 | for i, datafile in enumerate(all_files, 1): 119 | func(cur, datafile) 120 | conn.commit() 121 | print('{}/{} files processed.'.format(i, num_files)) 122 | 123 | 124 | def main(): 125 | """Load song and log data into postgresql star schema.""" 126 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") 127 | cur = conn.cursor() 128 | 129 | # enable uuid extension 130 | cur.execute("""CREATE EXTENSION "uuid-ossp";""") 131 | 132 | process_data(cur, conn, filepath='data/song_data', func=process_song_file) 133 | process_data(cur, conn, filepath='data/log_data', func=process_log_file) 134 | 135 | conn.close() 136 | 137 | 138 | if __name__ == "__main__": 139 | main() -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/.ipynb_checkpoints/sql_queries-checkpoint.py: -------------------------------------------------------------------------------- 1 | # DROP TABLES 2 | 3 | songplay_table_drop = "DROP TABLE IF EXISTS songplays" 4 | user_table_drop = "DROP TABLE IF EXISTS users" 5 | song_table_drop = "DROP TABLE IF EXISTS songs" 6 | artist_table_drop = "DROP TABLE IF EXISTS artists" 7 | time_table_drop = "DROP TABLE IF EXISTS time" 8 | 9 | # CREATE TABLES 10 | 11 | songplay_table_create = ("""CREATE TABLE IF NOT EXISTS songplays (songplay_id uuid PRIMARY KEY, start_time timestamp NOT NULL REFERENCES time(start_time), user_id int NOT NULL REFERENCES users(user_id), level text, song_id text REFERENCES songs(song_id), artist_id text REFERENCES artists(artist_id), session_id int, location text, user_agent text)""") 12 | 13 | user_table_create = ("""CREATE TABLE IF NOT EXISTS users (user_id int PRIMARY KEY, first_name text, last_name text, gender text NOT NULL, level text NOT NULL, last_start_time timestamp NOT NULL)""") 14 | 15 | song_table_create = ("""CREATE TABLE IF NOT EXISTS songs (song_id text PRIMARY KEY, title text NOT NULL, artist_id text NOT NULL, year int NOT NULL, duration numeric) 16 | """) 17 | 18 | artist_table_create = ("""CREATE TABLE IF NOT EXISTS artists (artist_id text PRIMARY KEY, name text NOT NULL, location text, lattitude float8, longitude float8)""") 19 | 20 | time_table_create = ("""CREATE TABLE IF NOT EXISTS time (start_time timestamp PRIMARY KEY, hour int NOT NULL, day int NOT NULL, week int NOT NULL, month int NOT NULL, year int NOT NULL, weekday int NOT NULL)""") 21 | 22 | # INSERT RECORDS 23 | 24 | songplay_table_insert = ("""INSERT INTO songplays (songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (songplay_id) DO NOTHING""") 25 | 26 | # update free/paid level with latest value 27 | user_table_insert = ("""INSERT INTO users (user_id, first_name, last_name, gender, level, last_start_time) VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT (user_id) DO UPDATE SET level = (CASE WHEN EXCLUDED.last_start_time > users.last_start_time THEN EXCLUDED.level ELSE users.level END)""") 28 | 29 | song_table_insert = ("""INSERT INTO songs (song_id, title, artist_id, year, duration) VALUES (%s, %s, %s, %s, %s) ON CONFLICT (song_id) DO NOTHING""") 30 | 31 | artist_table_insert = ("""INSERT INTO artists (artist_id, name, location, lattitude, longitude) VALUES (%s, %s, %s, %s, %s) ON CONFLICT (artist_id) DO NOTHING""") 32 | 33 | 34 | time_table_insert = ("""INSERT INTO time (start_time, 
hour, day, week, month, year, weekday) VALUES (%s, %s, %s, %s, %s, %s, %s) ON CONFLICT (start_time) DO NOTHING""") 35 | 36 | # FIND SONGS 37 | 38 | song_select = ("""SELECT s.song_id, a.artist_id FROM songs s LEFT JOIN artists a ON s.artist_id = a.artist_id WHERE s.title = %s AND a.name = %s AND s.duration = %s""") 39 | 40 | # QUERY LISTS 41 | 42 | create_table_queries = [user_table_create, song_table_create, artist_table_create, time_table_create, songplay_table_create] 43 | drop_table_queries = [songplay_table_drop, user_table_drop, song_table_drop, artist_table_drop, time_table_drop] -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/README.md: -------------------------------------------------------------------------------- 1 | # DEND Project 1: Data Modeling with Postgres 2 | 3 | ## Project Summary 4 | 5 | The objective of this project is to create a SQL analytics database for a fictional music streaming service called Sparkify. Sparkify's analytics team seeks to understand what, when and how users are playing songs on the company's music app. The analysts need an easy way to query and analyze the songplay data, which is currently stored in raw JSON logs and metadata files on a local directory. 6 | 7 | As the data engineer assigned to the project, I have implemented an ETL pipeline in python to process and upload the data into a PostgreSQL database. The ETL process extracts each songplay from the list of page actions recorded by the app. Data for analysis, such as song name, user information, subscription tier, and location of user, is structured into the main songplay table and related dimensional tables. 8 | 9 | Data Modeling with Postgres was submitted for Udacity's Data Engineering Nanodegree (DEND) in Spring 2019. 10 | 11 | ## How to Use 12 | 13 | 1. Run create_tables.py from terminal or python console to set up database and tables. 14 | 2. Run etl.py from terminal or console to process and load data into database. 15 | 3. Optional: Launch etl.ipynb using Jupyter Notebook to explore how process was developed. Launch test.ipynb to run validation and example queries. 16 | 17 | ## Database Schema 18 | 19 | The sparkify database design uses the simple star schema shown below. The schema contains one fact table, *songplays*, and four dimension tables: *songs*, *artists*, *users* and *time*. The fact table references the primary keys of each dimention table, enabling joins to songplays on song_id, artist_id, user_id and start_time, respectively. This structure will enable the analysts to aggregate the data efficiently and explore it using standard SQL queries. 20 | 21 | ![Database schema diagram](database_schema_diagram.png) 22 | 23 | ###### Instructions for generating the schema diagram using [sqlalchemy_schemadisplay](https://github.com/fschulze/sqlalchemy_schemadisplay) were provided by Syed Mateen in the project-1-dend-v1 slack channel. Thanks Syed! 24 | 25 | Each songplay in the fact table is identified by a unique uuid generated from the song, user id and timestamp of the log entry. This field is set as a primary key, so that it is unique and non-null. A constraint on the UPSERT operation ensures that there are no duplicate songplays in the database. If the log contains multiple entries with the same song, user id and timestamp, only the first entry is imported. The process of generating unique uuid's could be applied to all of the primary identifiers of the dimension tables. 
This would improve join efficiency if the database were very large. 26 | 27 | To keep subscription data as up-to-date as log data allows, the users table updates the subscription status of the user ("level") when processing the data to reflect membership status as of the most recent songplay timestamp. 28 | 29 | ## Data Processing and Quality Checks 30 | 31 | Data is extracted from two types of JSON source files: song data from the [Million Song Dataset](https://labrosa.ee.columbia.edu/millionsong/) and songplay data from user logs. The JSON files are read into pandas dataframes, processed and uploaded into the database using psycopg2. 32 | 33 | A number of steps clean the data and reduce the size of the database by removing data not needed for the analysis: 34 | * Songplays are identified by filtering for actions initiated from the 'NextSong' page. 35 | * Timestamps are converted from UNIX time to datetime format without time zone prior to upload. 36 | * Rows from the users table are excluded where user_id is missing. 37 | * Rows from the artists table are excluded where artist_id is missing. 38 | 39 | ## Example Queries and Results 40 | 41 | The dataset contains 6,820 songplays from November 2018. 42 | 43 | > SELECT tm.month, tm.year, COUNT(sp.songplay_id) as songplay_count 44 | FROM songplays sp 45 | LEFT JOIN time tm 46 | ON sp.start_time = tm.start_time 47 | GROUP BY tm.month, tm.year; 48 | 49 | 81% of songplays -- 5591 streams -- are generated by paid members. 50 | 51 | > SELECT 52 | sp.level, 53 | COUNT(sp.songplay_id) as songplay_count, 54 | 100*COUNT(sp.songplay_id)/(select count(s.songplay_id) from songplays s) as percent 55 | FROM songplays sp GROUP BY sp.level; 56 | 57 | -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/__pycache__/sql_queries.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/Project 1 Data Modeling with Postgres/__pycache__/sql_queries.cpython-36.pyc -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/create_tables.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from sql_queries import create_table_queries, drop_table_queries 3 | 4 | 5 | def create_database(): 6 | """Return cursor and connection to sparkify database. 7 | 8 | Drop (if exists) and create sparkify database on default student connection 9 | to PostgreSQL engine. 
10 | 11 | Returns: 12 | cur: database cursor 13 | conn: database connection 14 | 15 | """ 16 | # connect to default database 17 | conn = psycopg2.connect("host=127.0.0.1 dbname=studentdb user=student password=student") 18 | conn.set_session(autocommit=True) 19 | cur = conn.cursor() 20 | 21 | # create sparkify database with UTF8 encoding 22 | cur.execute("DROP DATABASE IF EXISTS sparkifydb") 23 | cur.execute("CREATE DATABASE sparkifydb WITH ENCODING 'utf8' TEMPLATE template0") 24 | 25 | # close connection to default database 26 | conn.close() 27 | 28 | # connect to sparkify database 29 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") 30 | cur = conn.cursor() 31 | 32 | return cur, conn 33 | 34 | 35 | def drop_tables(cur, conn): 36 | """Run drop table queries.""" 37 | for query in drop_table_queries: 38 | cur.execute(query) 39 | conn.commit() 40 | 41 | 42 | def create_tables(cur, conn): 43 | """Run create table queries.""" 44 | for query in create_table_queries: 45 | cur.execute(query) 46 | conn.commit() 47 | 48 | 49 | def main(): 50 | """Create database and tables.""" 51 | cur, conn = create_database() 52 | 53 | drop_tables(cur, conn) 54 | create_tables(cur, conn) 55 | 56 | conn.close() 57 | 58 | 59 | if __name__ == "__main__": 60 | main() -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/log_data/2018/11/2018-11-01-events.json: -------------------------------------------------------------------------------- 1 | {"artist":null,"auth":"Logged In","firstName":"Walter","gender":"M","itemInSession":0,"lastName":"Frye","length":null,"level":"free","location":"San Francisco-Oakland-Hayward, CA","method":"GET","page":"Home","registration":1540919166796.0,"sessionId":38,"song":null,"status":200,"ts":1541105830796,"userAgent":"\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/36.0.1985.143 Safari\/537.36\"","userId":"39"} 2 | {"artist":null,"auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":0,"lastName":"Summers","length":null,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"GET","page":"Home","registration":1540344794796.0,"sessionId":139,"song":null,"status":200,"ts":1541106106796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 3 | {"artist":"Des'ree","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":1,"lastName":"Summers","length":246.30812,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"You Gotta Be","status":200,"ts":1541106106796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 4 | {"artist":null,"auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":2,"lastName":"Summers","length":null,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"GET","page":"Upgrade","registration":1540344794796.0,"sessionId":139,"song":null,"status":200,"ts":1541106132796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 5 | {"artist":"Mr Oizo","auth":"Logged 
In","firstName":"Kaylee","gender":"F","itemInSession":3,"lastName":"Summers","length":144.03873,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Flat 55","status":200,"ts":1541106352796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 6 | {"artist":"Tamba Trio","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":4,"lastName":"Summers","length":177.18812,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Quem Quiser Encontrar O Amor","status":200,"ts":1541106496796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 7 | {"artist":"The Mars Volta","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":5,"lastName":"Summers","length":380.42077,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Eriatarka","status":200,"ts":1541106673796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 8 | {"artist":"Infected Mushroom","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":6,"lastName":"Summers","length":440.2673,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Becoming Insane","status":200,"ts":1541107053796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 9 | {"artist":"Blue October \/ Imogen Heap","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":7,"lastName":"Summers","length":241.3971,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Congratulations","status":200,"ts":1541107493796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 10 | {"artist":"Girl Talk","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":8,"lastName":"Summers","length":160.15628,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Once again","status":200,"ts":1541107734796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 11 | {"artist":"Black Eyed Peas","auth":"Logged In","firstName":"Sylvie","gender":"F","itemInSession":0,"lastName":"Cruz","length":214.93506,"level":"free","location":"Washington-Arlington-Alexandria, DC-VA-MD-WV","method":"PUT","page":"NextSong","registration":1540266185796.0,"sessionId":9,"song":"Pump It","status":200,"ts":1541108520796,"userAgent":"\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit\/537.77.4 (KHTML, like Gecko) Version\/7.0.5 Safari\/537.77.4\"","userId":"10"} 12 | {"artist":null,"auth":"Logged In","firstName":"Ryan","gender":"M","itemInSession":0,"lastName":"Smith","length":null,"level":"free","location":"San Jose-Sunnyvale-Santa Clara, 
CA","method":"GET","page":"Home","registration":1541016707796.0,"sessionId":169,"song":null,"status":200,"ts":1541109015796,"userAgent":"\"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Ubuntu Chromium\/36.0.1985.125 Chrome\/36.0.1985.125 Safari\/537.36\"","userId":"26"} 13 | {"artist":"Fall Out Boy","auth":"Logged In","firstName":"Ryan","gender":"M","itemInSession":1,"lastName":"Smith","length":200.72444,"level":"free","location":"San Jose-Sunnyvale-Santa Clara, CA","method":"PUT","page":"NextSong","registration":1541016707796.0,"sessionId":169,"song":"Nobody Puts Baby In The Corner","status":200,"ts":1541109125796,"userAgent":"\"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Ubuntu Chromium\/36.0.1985.125 Chrome\/36.0.1985.125 Safari\/537.36\"","userId":"26"} 14 | {"artist":"M.I.A.","auth":"Logged In","firstName":"Ryan","gender":"M","itemInSession":2,"lastName":"Smith","length":233.7171,"level":"free","location":"San Jose-Sunnyvale-Santa Clara, CA","method":"PUT","page":"NextSong","registration":1541016707796.0,"sessionId":169,"song":"Mango Pickle Down River (With The Wilcannia Mob)","status":200,"ts":1541109325796,"userAgent":"\"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Ubuntu Chromium\/36.0.1985.125 Chrome\/36.0.1985.125 Safari\/537.36\"","userId":"26"} 15 | {"artist":"Survivor","auth":"Logged In","firstName":"Jayden","gender":"M","itemInSession":0,"lastName":"Fox","length":245.36771,"level":"free","location":"New Orleans-Metairie, LA","method":"PUT","page":"NextSong","registration":1541033612796.0,"sessionId":100,"song":"Eye Of The Tiger","status":200,"ts":1541110994796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.3; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/36.0.1985.143 Safari\/537.36\"","userId":"101"} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/A/.ipynb_checkpoints/TRAAAAW128F429D538-checkpoint.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD7TVE1187B99BFB1", "artist_latitude": null, "artist_longitude": null, "artist_location": "California - LA", "artist_name": "Casual", "song_id": "SOMZWCG12A8C13C480", "title": "I Didn't Mean To", "duration": 218.93179, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/A/TRAAAAW128F429D538.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD7TVE1187B99BFB1", "artist_latitude": null, "artist_longitude": null, "artist_location": "California - LA", "artist_name": "Casual", "song_id": "SOMZWCG12A8C13C480", "title": "I Didn't Mean To", "duration": 218.93179, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/A/TRAAABD128F429CF47.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMJAGH1187FB546F3", "artist_latitude": 35.14968, "artist_longitude": -90.04892, "artist_location": "Memphis, TN", "artist_name": "The Box Tops", "song_id": "SOCIWDW12A8C13D406", "title": "Soul Deep", "duration": 148.03546, "year": 1969} -------------------------------------------------------------------------------- /Project 1 Data Modeling with 
Postgres/data/song_data/A/A/A/TRAAADZ128F9348C2E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKRRTF1187B9984DA", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Sonora Santanera", "song_id": "SOXVLOJ12AB0189215", "title": "Amor De Cabaret", "duration": 177.47546, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/A/TRAAAEF128F4273421.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7G5I41187FB4CE6C", "artist_latitude": null, "artist_longitude": null, "artist_location": "London, England", "artist_name": "Adam Ant", "song_id": "SONHOTT12A8C13493C", "title": "Something Girls", "duration": 233.40363, "year": 1982} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/A/TRAAAFD128F92F423A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARXR32B1187FB57099", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Gob", "song_id": "SOFSOCN12A8C143F5D", "title": "Face the Ashes", "duration": 209.60608, "year": 2007} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/A/TRAAAMO128F1481E7F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKFYS91187B98E58F", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Jeff And Sheri Easter", "song_id": "SOYMRWW12A6D4FAB14", "title": "The Moon And I (Ordinary Day Album Version)", "duration": 267.7024, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/A/TRAAAMQ128F1460CD3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD0S291187B9B7BF5", "artist_latitude": null, "artist_longitude": null, "artist_location": "Ohio", "artist_name": "Rated R", "song_id": "SOMJBYD12A6D4F8557", "title": "Keepin It Real (Skit)", "duration": 114.78159, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/A/TRAAAPK128E0786D96.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR10USD1187B99F3F1", "artist_latitude": null, "artist_longitude": null, "artist_location": "Burlington, Ontario, Canada", "artist_name": "Tweeterfriendly Music", "song_id": "SOHKNRJ12A6701D1F8", "title": "Drop of Rain", "duration": 189.57016, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/A/TRAAARJ128F9320760.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR8ZCNI1187B9A069B", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Planet P Project", "song_id": "SOIAZJW12AB01853F1", "title": "Pink World", "duration": 269.81832, "year": 1984} 
-------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/A/TRAAAVG12903CFA543.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNTLGG11E2835DDB9", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Clp", "song_id": "SOUDSGM12AC9618304", "title": "Insatiable (Instrumental Version)", "duration": 266.39628, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/A/TRAAAVO128F93133D4.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGSJW91187B9B1D6B", "artist_latitude": 35.21962, "artist_longitude": -80.01955, "artist_location": "North Carolina", "artist_name": "JennyAnyKind", "song_id": "SOQHXMF12AB0182363", "title": "Young Boy Blues", "duration": 218.77506, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/B/TRAABCL128F4286650.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARC43071187B990240", "artist_latitude": null, "artist_longitude": null, "artist_location": "Wisner, LA", "artist_name": "Wayne Watson", "song_id": "SOKEJEJ12A8C13E0D0", "title": "The Urgency (LP Version)", "duration": 245.21098, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/B/TRAABDL12903CAABBA.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARL7K851187B99ACD2", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Andy Andy", "song_id": "SOMUYGI12AB0188633", "title": "La Culpa", "duration": 226.35057, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/B/TRAABJL12903CDCF1A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARHHO3O1187B989413", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Bob Azzam", "song_id": "SORAMLE12AB017C8B0", "title": "Auguri Cha Cha", "duration": 191.84281, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/B/TRAABJV128F1460C49.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARIK43K1187B9AE54C", "artist_latitude": null, "artist_longitude": null, "artist_location": "Beverly Hills, CA", "artist_name": "Lionel Richie", "song_id": "SOBONFF12A6D4F84D8", "title": "Tonight Will Be Alright", "duration": 307.3824, "year": 1986} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/B/TRAABLR128F423B7E3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD842G1187B997376", "artist_latitude": 43.64856, "artist_longitude": -79.38533, "artist_location": "Toronto, Ontario, Canada", "artist_name": "Blue Rodeo", "song_id": "SOHUOAP12A8AE488E9", 
"title": "Floating", "duration": 491.12771, "year": 1987} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/B/TRAABNV128F425CEE1.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARIG6O41187B988BDD", "artist_latitude": 37.16793, "artist_longitude": -95.84502, "artist_location": "United States", "artist_name": "Richard Souther", "song_id": "SOUQQEA12A8C134B1B", "title": "High Tide", "duration": 228.5971, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/B/TRAABRB128F9306DD5.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR1ZHYZ1187FB3C717", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Faiz Ali Faiz", "song_id": "SOILPQQ12AB017E82A", "title": "Sohna Nee Sohna Data", "duration": 599.24853, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/B/TRAABVM128F92CA9DC.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARYKCQI1187FB3B18F", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Tesla", "song_id": "SOXLBJT12A8C140925", "title": "Caught In A Dream", "duration": 290.29832, "year": 2004} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/B/TRAABXG128F9318EBD.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNPAGP1241B9C7FD4", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "lextrical", "song_id": "SOZVMJI12AB01808AF", "title": "Synthetic Dream", "duration": 165.69424, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/B/TRAABYN12903CFD305.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARQGYP71187FB44566", "artist_latitude": 34.31109, "artist_longitude": -94.02978, "artist_location": "Mineola, AR", "artist_name": "Jimmy Wakely", "song_id": "SOWTBJW12AC468AC6E", "title": "Broken-Down Merry-Go-Round", "duration": 151.84934, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/B/TRAABYW128F4244559.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARI3BMM1187FB4255E", "artist_latitude": 38.8991, "artist_longitude": -77.029, "artist_location": "Washington", "artist_name": "Alice Stuart", "song_id": "SOBEBDG12A58A76D60", "title": "Kassie Jones", "duration": 220.78649, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACCG128F92E8A55.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR5KOSW1187FB35FF4", "artist_latitude": 49.80388, "artist_longitude": 15.47491, "artist_location": "Dubai UAE", "artist_name": 
"Elena", "song_id": "SOZCTXZ12AB0182364", "title": "Setanta matins", "duration": 269.58322, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACER128F4290F96.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMAC4T1187FB3FA4C", "artist_latitude": 40.82624, "artist_longitude": -74.47995, "artist_location": "Morris Plains, NJ", "artist_name": "The Dillinger Escape Plan", "song_id": "SOBBUGU12A8C13E95D", "title": "Setting Fire to Sleeping Giants", "duration": 207.77751, "year": 2004} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACFV128F935E50B.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR47JEX1187B995D81", "artist_latitude": 37.83721, "artist_longitude": -94.35868, "artist_location": "Nevada, MO", "artist_name": "SUE THOMPSON", "song_id": "SOBLGCN12AB0183212", "title": "James (Hold The Ladder Steady)", "duration": 124.86485, "year": 1985} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACHN128F1489601.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGIWFO1187B9B55B7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Five Bolt Main", "song_id": "SOPSWQW12A6D4F8781", "title": "Made Like This (Live)", "duration": 225.09669, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACIW12903CC0F6D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNTLGG11E2835DDB9", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Clp", "song_id": "SOZQDIU12A58A7BCF6", "title": "Superconfidential", "duration": 338.31138, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACLV128F427E123.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARDNS031187B9924F0", "artist_latitude": 32.67828, "artist_longitude": -83.22295, "artist_location": "Georgia", "artist_name": "Tim Wilson", "song_id": "SONYPOM12A8C13B2D7", "title": "I Think My Wife Is Running Around On Me (Taco Hell)", "duration": 186.48771, "year": 2005} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACNS128F14A2DF5.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AROUOZZ1187B9ABE51", "artist_latitude": 40.79195, "artist_longitude": -73.94512, "artist_location": "New York, NY [Spanish Harlem]", "artist_name": "Willie Bobo", "song_id": "SOBZBAZ12A6D4F8742", "title": "Spanish Grease", "duration": 168.25424, "year": 1997} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACOW128F933E35F.json: 
-------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARH4Z031187B9A71F2", "artist_latitude": 40.73197, "artist_longitude": -74.17418, "artist_location": "Newark, NJ", "artist_name": "Faye Adams", "song_id": "SOVYKGO12AB0187199", "title": "Crazy Mixed Up World", "duration": 156.39465, "year": 1961} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACPE128F421C1B9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARB29H41187B98F0EF", "artist_latitude": 41.88415, "artist_longitude": -87.63241, "artist_location": "Chicago", "artist_name": "Terry Callier", "song_id": "SOGNCJP12A58A80271", "title": "Do You Finally Need A Friend", "duration": 342.56934, "year": 1972} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACQT128F9331780.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR1Y2PT1187FB5B9CE", "artist_latitude": 27.94017, "artist_longitude": -82.32547, "artist_location": "Brandon", "artist_name": "John Wesley", "song_id": "SOLLHMX12AB01846DC", "title": "The Emperor Falls", "duration": 484.62322, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACSL128F93462F4.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARAJPHH1187FB5566A", "artist_latitude": 40.7038, "artist_longitude": -73.83168, "artist_location": "Queens, NY", "artist_name": "The Shangri-Las", "song_id": "SOYTPEP12AB0180E7B", "title": "Twist and Shout", "duration": 164.80608, "year": 1964} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACTB12903CAAF15.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR0RCMP1187FB3F427", "artist_latitude": 30.08615, "artist_longitude": -94.10158, "artist_location": "Beaumont, TX", "artist_name": "Billie Jo Spears", "song_id": "SOGXHEG12AB018653E", "title": "It Makes No Difference Now", "duration": 133.32853, "year": 1992} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACVS128E078BE39.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREBBGV1187FB523D2", "artist_latitude": null, "artist_longitude": null, "artist_location": "Houston, TX", "artist_name": "Mike Jones (Featuring CJ_ Mello & Lil' Bran)", "song_id": "SOOLYAZ12A6701F4A6", "title": "Laws Patrolling (Album Version)", "duration": 173.66159, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/A/C/TRAACZK128F4243829.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGUVEV1187B98BA17", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Sierra Maestra", "song_id": "SOGOSOV12AF72A285E", "title": "\u00bfD\u00f3nde va Chichi?", "duration": 313.12934, 
"year": 1997} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/A/TRABACN128F425B784.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD7TVE1187B99BFB1", "artist_latitude": null, "artist_longitude": null, "artist_location": "California - LA", "artist_name": "Casual", "song_id": "SOQLGFP12A58A7800E", "title": "OAKtown", "duration": 259.44771, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/A/TRABAFJ128F42AF24E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR3JMC51187B9AE49D", "artist_latitude": 28.53823, "artist_longitude": -81.37739, "artist_location": "Orlando, FL", "artist_name": "Backstreet Boys", "song_id": "SOPVXLX12A8C1402D5", "title": "Larger Than Life", "duration": 236.25098, "year": 1999} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/A/TRABAFP128F931E9A1.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARPBNLO1187FB3D52F", "artist_latitude": 40.71455, "artist_longitude": -74.00712, "artist_location": "New York, NY", "artist_name": "Tiny Tim", "song_id": "SOAOIBZ12AB01815BE", "title": "I Hold Your Hand In Mine [Live At Royal Albert Hall]", "duration": 43.36281, "year": 2000} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/A/TRABAIO128F42938F9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR9AWNF1187B9AB0B4", "artist_latitude": null, "artist_longitude": null, "artist_location": "Seattle, Washington USA", "artist_name": "Kenny G featuring Daryl Hall", "song_id": "SOZHPGD12A8C1394FE", "title": "Baby Come To Me", "duration": 236.93016, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/A/TRABATO128F42627E9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AROGWRA122988FEE45", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Christos Dantis", "song_id": "SOSLAVG12A8C13397F", "title": "Den Pai Alo", "duration": 243.82649, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/A/TRABAVQ12903CBF7E0.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMBR4Y1187B9990EB", "artist_latitude": 37.77916, "artist_longitude": -122.42005, "artist_location": "California - SF", "artist_name": "David Martin", "song_id": "SOTTDKS12AB018D69B", "title": "It Wont Be Christmas", "duration": 241.47546, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/A/TRABAWW128F4250A31.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARQ9BO41187FB5CF1F", "artist_latitude": 40.99471, "artist_longitude": -77.60454, 
"artist_location": "Pennsylvania", "artist_name": "John Davis", "song_id": "SOMVWWT12A58A7AE05", "title": "Knocked Out Of The Park", "duration": 183.17016, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/A/TRABAXL128F424FC50.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKULSX1187FB45F84", "artist_latitude": 39.49974, "artist_longitude": -111.54732, "artist_location": "Utah", "artist_name": "Trafik", "song_id": "SOQVMXR12A81C21483", "title": "Salt In NYC", "duration": 424.12363, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/A/TRABAXR128F426515F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARI2JSK1187FB496EF", "artist_latitude": 51.50632, "artist_longitude": -0.12714, "artist_location": "London, England", "artist_name": "Nick Ingman;Gavyn Wright", "song_id": "SODUJBS12A8C132150", "title": "Wessex Loses a Bride", "duration": 111.62077, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/A/TRABAXV128F92F6AE3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREDBBQ1187B98AFF5", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Eddie Calvert", "song_id": "SOBBXLX12A58A79DDA", "title": "Erica (2005 Digital Remaster)", "duration": 138.63138, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/A/TRABAZH128F930419A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7ZKHQ1187B98DD73", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Glad", "song_id": "SOTUKVB12AB0181477", "title": "Blessed Assurance", "duration": 270.602, "year": 1993} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/B/TRABBAM128F429D223.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARBGXIG122988F409D", "artist_latitude": 37.77916, "artist_longitude": -122.42005, "artist_location": "California - SF", "artist_name": "Steel Rain", "song_id": "SOOJPRH12A8C141995", "title": "Loaded Like A Gun", "duration": 173.19138, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/B/TRABBBV128F42967D7.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7SMBG1187B9B9066", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Los Manolos", "song_id": "SOBCOSW12A8C13D398", "title": "Rumba De Barcelona", "duration": 218.38322, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/B/TRABBJE12903CDB442.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": 
"ARGCY1Y1187B9A4FA5", "artist_latitude": 36.16778, "artist_longitude": -86.77836, "artist_location": "Nashville, TN.", "artist_name": "Gloriana", "song_id": "SOQOTLQ12AB01868D0", "title": "Clementina Santaf\u00e8", "duration": 153.33832, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/B/TRABBKX128F4285205.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR36F9J1187FB406F1", "artist_latitude": 56.27609, "artist_longitude": 9.51695, "artist_location": "Denmark", "artist_name": "Bombay Rockers", "song_id": "SOBKWDJ12A8C13B2F3", "title": "Wild Rose (Back 2 Basics Mix)", "duration": 230.71302, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/B/TRABBLU128F93349CF.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNNKDK1187B98BBD5", "artist_latitude": 45.80726, "artist_longitude": 15.9676, "artist_location": "Zagreb Croatia", "artist_name": "Jinx", "song_id": "SOFNOQK12AB01840FC", "title": "Kutt Free (DJ Volume Remix)", "duration": 407.37914, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/B/TRABBNP128F932546F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR62SOJ1187FB47BB5", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Chase & Status", "song_id": "SOGVQGJ12AB017F169", "title": "Ten Tonne", "duration": 337.68444, "year": 2005} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/B/TRABBOP128F931B50D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARBEBBY1187B9B43DB", "artist_latitude": null, "artist_longitude": null, "artist_location": "Gainesville, FL", "artist_name": "Tom Petty", "song_id": "SOFFKZS12AB017F194", "title": "A Higher Place (Album Version)", "duration": 236.17261, "year": 1994} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/B/TRABBOR128F4286200.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARDR4AC1187FB371A1", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Montserrat Caball\u00e9;Placido Domingo;Vicente Sardinero;Judith Blegen;Sherrill Milnes;Georg Solti", "song_id": "SOBAYLL12A8C138AF9", "title": "Sono andati? 
Fingevo di dormire", "duration": 511.16363, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/B/TRABBTA128F933D304.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARAGB2O1187FB3A161", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Pucho & His Latin Soul Brothers", "song_id": "SOLEYHO12AB0188A85", "title": "Got My Mojo Workin", "duration": 338.23302, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/B/TRABBVJ128F92F7EAA.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREDL271187FB40F44", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Soul Mekanik", "song_id": "SOPEGZN12AB0181B3D", "title": "Get Your Head Stuck On Your Neck", "duration": 45.66159, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/B/TRABBXU128F92FEF48.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARP6N5A1187B99D1A3", "artist_latitude": null, "artist_longitude": null, "artist_location": "Hamtramck, MI", "artist_name": "Mitch Ryder", "song_id": "SOXILUQ12A58A7C72A", "title": "Jenny Take a Ride", "duration": 207.43791, "year": 2004} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/B/TRABBZN12903CD9297.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGSAFR1269FB35070", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Blingtones", "song_id": "SOTCKKY12AB018A141", "title": "Sonnerie lalaleul\u00e9 hi houuu", "duration": 29.54404, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/C/TRABCAJ12903CDFCC2.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARULZCI1241B9C8611", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Luna Orbit Project", "song_id": "SOSWKAV12AB018FC91", "title": "Midnight Star", "duration": 335.51628, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/C/TRABCEC128F426456E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR0IAWL1187B9A96D0", "artist_latitude": 8.4177, "artist_longitude": -80.11278, "artist_location": "Panama", "artist_name": "Danilo Perez", "song_id": "SONSKXP12A8C13A2C9", "title": "Native Soul", "duration": 197.19791, "year": 2003} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/C/TRABCEI128F424C983.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": 
"Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/C/TRABCFL128F149BB0D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARLTWXK1187FB5A3F8", "artist_latitude": 32.74863, "artist_longitude": -97.32925, "artist_location": "Fort Worth, TX", "artist_name": "King Curtis", "song_id": "SODREIN12A58A7F2E5", "title": "A Whiter Shade Of Pale (Live @ Fillmore West)", "duration": 326.00771, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/C/TRABCIX128F4265903.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNF6401187FB57032", "artist_latitude": 40.79086, "artist_longitude": -73.96644, "artist_location": "New York, NY [Manhattan]", "artist_name": "Sophie B. Hawkins", "song_id": "SONWXQJ12A8C134D94", "title": "The Ballad Of Sleeping Beauty", "duration": 305.162, "year": 1994} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/C/TRABCKL128F423A778.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARPFHN61187FB575F6", "artist_latitude": 41.88415, "artist_longitude": -87.63241, "artist_location": "Chicago, IL", "artist_name": "Lupe Fiasco", "song_id": "SOWQTQZ12A58A7B63E", "title": "Streets On Fire (Explicit Album Version)", "duration": 279.97995, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/C/TRABCPZ128F4275C32.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR051KA1187B98B2FF", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Wilks", "song_id": "SOLYIBD12A8C135045", "title": "Music is what we love", "duration": 261.51138, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/C/TRABCRU128F423F449.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR8IEZO1187B99055E", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Marc Shaiman", "song_id": "SOINLJW12A8C13314C", "title": "City Slickers", "duration": 149.86404, "year": 2008} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/C/TRABCTK128F934B224.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR558FS1187FB45658", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "40 Grit", "song_id": "SOGDBUF12A8C140FAA", "title": "Intro", "duration": 75.67628, "year": 2003} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/C/TRABCUQ128E0783E2B.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": 
"ARVBRGZ1187FB4675A", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Gwen Stefani", "song_id": "SORRZGD12A6310DBC3", "title": "Harajuku Girls", "duration": 290.55955, "year": 2004} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/C/TRABCXB128F4286BD3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARWB3G61187FB49404", "artist_latitude": null, "artist_longitude": null, "artist_location": "Hamilton, Ohio", "artist_name": "Steve Morse", "song_id": "SODAUVL12A8C13D184", "title": "Prognosis", "duration": 363.85914, "year": 2000} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/data/song_data/A/B/C/TRABCYE128F934CE1D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREVWGE1187B9B890A", "artist_latitude": -13.442, "artist_longitude": -41.9952, "artist_location": "Noci (BA)", "artist_name": "Bitter End", "song_id": "SOFCHDR12AB01866EF", "title": "Living Hell", "duration": 282.43546, "year": 0} -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/database_schema_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/Project 1 Data Modeling with Postgres/database_schema_diagram.png -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/etl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import psycopg2 4 | import pandas as pd 5 | from sql_queries import * 6 | 7 | 8 | def process_song_file(cur, filepath): 9 | """Insert record from JSON song file into postgresql tables. 10 | 11 | Read JSON file to pandas dataframe, clean and process data, 12 | then load to song and artist tables. 13 | 14 | Parameters: 15 | cur (cursor object): connection cursor 16 | filepath (string): filepath 17 | 18 | Returns: None 19 | 20 | """ 21 | # open song file 22 | df = pd.read_json(filepath, lines=True) 23 | 24 | # insert song record 25 | song_data = df.loc[(df['song_id'].notnull() & df['title'].notnull() 26 | & df['artist_id'].notnull() & df['year'].notnull()), 27 | ['song_id', 'title', 'artist_id', 'year', 'duration']].values[0].tolist() 28 | cur.execute(song_table_insert, song_data) 29 | 30 | # insert artist record 31 | artist_data = df.loc[(df['artist_id'].notnull() & df['artist_name'].notnull()), 32 | ['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 33 | 'artist_longitude']].values[0].tolist() 34 | cur.execute(artist_table_insert, artist_data) 35 | 36 | 37 | def process_log_file(cur, filepath): 38 | """Insert records from JSON log files into PostgreSQL tables. 39 | 40 | Read JSON file to pandas dataframe, clean and process data, 41 | then load to user and songplay tables. 
42 | 43 | Parameters: 44 | cur (cursor object): connection cursor 45 | filepath (string): filepath 46 | 47 | Returns: None 48 | 49 | """ 50 | # open log file 51 | df = pd.read_json(filepath, lines=True) 52 | 53 | # filter by NextSong action 54 | df = df.loc[df.page=='NextSong'] 55 | 56 | # convert timestamp column to datetime 57 | df['ts'] = pd.to_datetime(df['ts'], unit='ms') 58 | t = pd.to_datetime(df['ts'], unit='ms') 59 | 60 | # insert non-null time data records 61 | time_data = list((t, t.dt.hour, t.dt.day, t.dt.week, t.dt.month, t.dt.year, t.dt.weekday)) 62 | column_labels = ('timestamp', 'hour', 'day', 'week', 'month', 'year', 'weekday') 63 | time_df = pd.DataFrame.from_dict(dict(zip(column_labels,time_data))) 64 | time_df = time_df.loc[time_df['timestamp'].notnull()] 65 | 66 | for i, row in time_df.iterrows(): 67 | cur.execute(time_table_insert, list(row)) 68 | 69 | # load user table 70 | # filter out rows with no user id, gender, level or timestamp 71 | user_df = df.loc[(df['userId'].notnull() & df['gender'].notnull() 72 | & df['level'].notnull() & df['ts'].notnull()), 73 | ['userId', 'firstName', 'lastName', 'gender', 'level', 'ts']] 74 | 75 | # insert user records 76 | for i, row in user_df.iterrows(): 77 | cur.execute(user_table_insert, row) 78 | 79 | # insert songplay records 80 | for index, row in df.iterrows(): 81 | 82 | # get songid and artistid from song and artist tables 83 | cur.execute(song_select, (row.song, row.artist, row.length)) 84 | results = cur.fetchone() 85 | 86 | if results: 87 | songid, artistid = results 88 | else: 89 | songid, artistid = None, None 90 | 91 | # create songplay uuid 92 | name = (str(row.song) + str(row.ts) + str(row.userId),) 93 | generate_uuid = ("""SELECT uuid_generate_v5(uuid_nil(), %s)""") 94 | cur.execute(generate_uuid, name) 95 | songplayid = cur.fetchone() 96 | 97 | # insert songplay record 98 | songplay_data = (songplayid, row.ts, row.userId, row.level, songid, artistid, 99 | row.sessionId, row.location, row.userAgent) 100 | if row.ts is not None: 101 | cur.execute(songplay_table_insert, songplay_data) 102 | 103 | 104 | def process_data(cur, conn, filepath, func): 105 | """Process data files from directory using function.""" 106 | # get all files matching extension from directory 107 | all_files = [] 108 | for root, dirs, files in os.walk(filepath): 109 | files = glob.glob(os.path.join(root,'*.json')) 110 | for f in files : 111 | all_files.append(os.path.abspath(f)) 112 | 113 | # get total number of files found 114 | num_files = len(all_files) 115 | print('{} files found in {}'.format(num_files, filepath)) 116 | 117 | # iterate over files and process 118 | for i, datafile in enumerate(all_files, 1): 119 | func(cur, datafile) 120 | conn.commit() 121 | print('{}/{} files processed.'.format(i, num_files)) 122 | 123 | 124 | def main(): 125 | """Load song and log data into postgresql star schema.""" 126 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") 127 | cur = conn.cursor() 128 | 129 | # enable uuid extension 130 | cur.execute("""CREATE EXTENSION "uuid-ossp";""") 131 | 132 | process_data(cur, conn, filepath='data/song_data', func=process_song_file) 133 | process_data(cur, conn, filepath='data/log_data', func=process_log_file) 134 | 135 | conn.close() 136 | 137 | 138 | if __name__ == "__main__": 139 | main() -------------------------------------------------------------------------------- /Project 1 Data Modeling with Postgres/sql_queries.py: 
-------------------------------------------------------------------------------- 1 | # DROP TABLES 2 | 3 | songplay_table_drop = "DROP TABLE IF EXISTS songplays" 4 | user_table_drop = "DROP TABLE IF EXISTS users" 5 | song_table_drop = "DROP TABLE IF EXISTS songs" 6 | artist_table_drop = "DROP TABLE IF EXISTS artists" 7 | time_table_drop = "DROP TABLE IF EXISTS time" 8 | 9 | # CREATE TABLES 10 | 11 | songplay_table_create = ("""CREATE TABLE IF NOT EXISTS songplays (songplay_id uuid PRIMARY KEY, start_time timestamp NOT NULL REFERENCES time(start_time), user_id int NOT NULL REFERENCES users(user_id), level text, song_id text REFERENCES songs(song_id), artist_id text REFERENCES artists(artist_id), session_id int, location text, user_agent text)""") 12 | 13 | user_table_create = ("""CREATE TABLE IF NOT EXISTS users (user_id int PRIMARY KEY, first_name text, last_name text, gender text NOT NULL, level text NOT NULL, last_start_time timestamp NOT NULL)""") 14 | 15 | song_table_create = ("""CREATE TABLE IF NOT EXISTS songs (song_id text PRIMARY KEY, title text NOT NULL, artist_id text NOT NULL, year int NOT NULL, duration numeric) 16 | """) 17 | 18 | artist_table_create = ("""CREATE TABLE IF NOT EXISTS artists (artist_id text PRIMARY KEY, name text NOT NULL, location text, lattitude float8, longitude float8)""") 19 | 20 | time_table_create = ("""CREATE TABLE IF NOT EXISTS time (start_time timestamp PRIMARY KEY, hour int NOT NULL, day int NOT NULL, week int NOT NULL, month int NOT NULL, year int NOT NULL, weekday int NOT NULL)""") 21 | 22 | # INSERT RECORDS 23 | 24 | songplay_table_insert = ("""INSERT INTO songplays (songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (songplay_id) DO NOTHING""") 25 | 26 | # update free/paid level with latest value 27 | user_table_insert = ("""INSERT INTO users (user_id, first_name, last_name, gender, level, last_start_time) VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT (user_id) DO UPDATE SET level = (CASE WHEN EXCLUDED.last_start_time > users.last_start_time THEN EXCLUDED.level ELSE users.level END)""") 28 | 29 | song_table_insert = ("""INSERT INTO songs (song_id, title, artist_id, year, duration) VALUES (%s, %s, %s, %s, %s) ON CONFLICT (song_id) DO NOTHING""") 30 | 31 | artist_table_insert = ("""INSERT INTO artists (artist_id, name, location, lattitude, longitude) VALUES (%s, %s, %s, %s, %s) ON CONFLICT (artist_id) DO NOTHING""") 32 | 33 | 34 | time_table_insert = ("""INSERT INTO time (start_time, hour, day, week, month, year, weekday) VALUES (%s, %s, %s, %s, %s, %s, %s) ON CONFLICT (start_time) DO NOTHING""") 35 | 36 | # FIND SONGS 37 | 38 | song_select = ("""SELECT s.song_id, a.artist_id FROM songs s LEFT JOIN artists a ON s.artist_id = a.artist_id WHERE s.title = %s AND a.name = %s AND s.duration = %s""") 39 | 40 | # QUERY LISTS 41 | 42 | create_table_queries = [user_table_create, song_table_create, artist_table_create, time_table_create, songplay_table_create] 43 | drop_table_queries = [songplay_table_drop, user_table_drop, song_table_drop, artist_table_drop, time_table_drop] -------------------------------------------------------------------------------- /Project 1B Data Modeling with Apache Cassandra/ReadMe.md: -------------------------------------------------------------------------------- 1 | # DEND Project 2: Data Modeling with Apache Cassandra 2 | 3 | ## Project Summary 4 | 5 | The objective of this project is to create a NoSQL analytics database in Apache 
Cassandra for a fictional music streaming service called Sparkify. Sparkify's analytics team seeks to understand what, when and how users are playing songs on the company's music app. The analysts need an easy way to query and analyze the songplay data, which is currently stored in raw CSV files on a local directory. 6 | 7 | As the data engineer assigned to the project, I have implemented an ETL pipeline in Python to pre-process the data using pandas. The database tables are modeled on the queries according to the principle of one table per query. I selected the primary and clustering keys for each table in order to ensure a unique identifier for each row. 8 | 9 | Data Modeling with Apache Cassandra was submitted for Udacity's Data Engineering Nanodegree (DEND) in Spring 2019. 10 | 11 | # Part I. ETL Pipeline for Pre-Processing the Files 12 | 13 | ## PLEASE RUN THE FOLLOWING CODE FOR PRE-PROCESSING THE FILES 14 | 15 | #### Import Python packages 16 | 17 | 18 | ```python 19 | # Import Python packages 20 | import pandas as pd 21 | import cassandra 22 | import re 23 | import os 24 | import glob 25 | import numpy as np 26 | import json 27 | import csv 28 | ``` 29 | 30 | #### Creating list of filepaths to process original event csv data files 31 | 32 | 33 | ```python 34 | # checking your current working directory 35 | print(os.getcwd()) 36 | 37 | # Get your current folder and subfolder event data 38 | filepath = os.getcwd() + '/event_data' 39 | 40 | # Create a for loop to create a list of files and collect each filepath 41 | for root, dirs, files in os.walk(filepath): 42 | 43 | # join the file path and roots with the subdirectories using glob 44 | file_path_list = glob.glob(os.path.join(root,'*')) 45 | #print(file_path_list) 46 | ``` 47 | 48 | /home/workspace 49 | 50 | 51 | #### Processing the files to create the data file csv that will be used for Apache Cassandra tables 52 | 53 | 54 | ```python 55 | # initiating an empty list of rows that will be generated from each file 56 | full_data_rows_list = [] 57 | 58 | # for every filepath in the file path list 59 | for f in file_path_list: 60 | 61 | # reading csv file 62 | with open(f, 'r', encoding = 'utf8', newline='') as csvfile: 63 | # creating a csv reader object 64 | csvreader = csv.reader(csvfile) 65 | next(csvreader) 66 | 67 | # extracting each data row one by one and append it 68 | for line in csvreader: 69 | #print(line) 70 | full_data_rows_list.append(line) 71 | 72 | # creating a smaller event data csv file called event_datafile_full csv that will be used to insert data into the \ 73 | # Apache Cassandra tables 74 | csv.register_dialect('myDialect', quoting=csv.QUOTE_MINIMAL, skipinitialspace=True) 75 | 76 | with open('event_datafile_new.csv', 'w', encoding = 'utf8', newline='') as f: 77 | writer = csv.writer(f, dialect='myDialect') 78 | writer.writerow(['artist','firstName','gender','itemInSession','lastName','length',\ 79 | 'level','location','sessionId','song','userId']) 80 | for row in full_data_rows_list: 81 | if (row[0] == ''): 82 | continue 83 | writer.writerow((row[0], row[2], row[3], row[4], row[5], row[6], row[7], row[8], row[12], row[13], row[16])) 84 | 85 | ``` 86 | 87 | 88 | ```python 89 | # check the number of rows in your csv file 90 | with open('event_datafile_new.csv', 'r', encoding = 'utf8') as f: 91 | print(sum(1 for line in f)) 92 | ``` 93 | 94 | 6821 95 | 96 | 97 | #### Part I code was provided by course instructors with limited tweaks by the student. 98 | 99 | # Part II.
99 | # Part II. Modeling Data for Query Retrieval in Apache Cassandra 100 | 101 | ## This section works with the CSV file titled event_datafile_new.csv, located within the Workspace directory. The event_datafile_new.csv contains the following columns: 102 | - artist 103 | - firstName of user 104 | - gender of user 105 | - item number in session 106 | - last name of user 107 | - length of the song 108 | - level (paid or free) 109 | - location of the user 110 | - sessionId 111 | - song title 112 | - userId 113 | 114 | The image below is a screenshot of the denormalized data in **event_datafile_new.csv** after the code above is run:
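Alongside the screenshot, the first few rows of the denormalized file can also be inspected directly with pandas. This is an illustrative sketch, not part of the original notebook, and assumes `event_datafile_new.csv` was just written by the Part I code:

```python
import pandas as pd

# Peek at the denormalized event data produced in Part I.
preview = pd.read_csv('event_datafile_new.csv')
print(preview.shape)   # (number of data rows, 11 columns)
print(preview.head())  # artist, firstName, gender, itemInSession, lastName, ...
```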
115 | 116 | 117 | 118 | #### Create Cluster 119 | 120 | 121 | ```python 122 | from cassandra.cluster import Cluster 123 | cluster = Cluster() 124 | 125 | session = cluster.connect() 126 | ``` 127 | 128 | #### Create Keyspace 129 | 130 | 131 | ```python 132 | try: 133 | session.execute(""" 134 | CREATE KEYSPACE IF NOT EXISTS sparkify 135 | WITH REPLICATION = 136 | { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }""" 137 | ) 138 | 139 | except Exception as e: 140 | print(e) 141 | ``` 142 | 143 | #### Set Keyspace 144 | 145 | 146 | ```python 147 | try: 148 | session.set_keyspace('sparkify') 149 | except Exception as e: 150 | print(e) 151 | ``` 152 | 153 | ## Create tables to run the following queries. Remember, with Apache Cassandra you model the database tables on the queries you want to run. 154 | 155 | ### Query 1: Give me the artist, song title and song's length in the music app history that was heard during sessionId = 338, and itemInSession = 4 156 | 157 | #### Create table 158 | The Primary Key for the **songs** table is session_id as the partition key and item_in_session as the clustering key. This will enable fast reads of the table to retrieve song data from a particular session. Partitioning by session_id ensures that a given session's playlist history is stored together, and clustering by item_in_session ensures that the data is sorted in order of play. 159 | 160 | 161 | ```python 162 | query1 = """CREATE TABLE IF NOT EXISTS songs ( 163 | session_id int, item_in_session int, artist text, song text, length double, 164 | PRIMARY KEY (session_id, item_in_session) 165 | )""" 166 | 167 | try: 168 | session.execute(query1) 169 | except Exception as e: 170 | print(e) 171 | ``` 172 | 173 | #### Insert data 174 | 175 | 176 | ```python 177 | # We have provided part of the code to set up the CSV file. Please complete the Apache Cassandra code below# 178 | file = 'event_datafile_new.csv' 179 | 180 | with open(file, encoding = 'utf8') as f: 181 | csvreader = csv.reader(f) 182 | next(csvreader) # skip header 183 | for line in csvreader: 184 | ## Assign the INSERT statements into the `query` variable 185 | query = "INSERT INTO songs (session_id, item_in_session, artist, song, length)" 186 | query = query + " VALUES (%s, %s, %s, %s, %s)" 187 | ## Assign which column element should be assigned for each column in the INSERT statement.
188 | ## For e.g., to INSERT artist_name and user first_name, you would change the code below to `line[0], line[1]` 189 | session.execute(query, (int(line[8]), int(line[3]), line[0], line[9], float(line[5]))) 190 | ``` 191 | 192 | #### Run SELECT query to verify table model 193 | 194 | 195 | ```python 196 | query1 = """SELECT artist, song, length FROM songs WHERE session_id = 338 AND item_in_session = 4""" 197 | 198 | try: 199 | rows = session.execute(query1) 200 | except Exception as e: 201 | print(e) 202 | 203 | for row in rows: 204 | print( row.artist, row.song, row.length) 205 | ``` 206 | 207 | Faithless Music Matters (Mark Knight Dub) 495.3073 208 | 209 | 210 | ### Query 2: Give me only the following: name of artist, song (sorted by itemInSession) and user (first and last name) for userid = 10, sessionid = 182 211 | 212 | #### Create table 213 | 214 | 215 | ```python 216 | query2 = """CREATE TABLE IF NOT EXISTS user_sessions ( 217 | user_id int, session_id int, item_in_session int, artist text, song text, first_name text, last_name text, 218 | PRIMARY KEY ((user_id, session_id), item_in_session) 219 | )""" 220 | 221 | try: 222 | session.execute(query2) 223 | except Exception as e: 224 | print(e) 225 | ``` 226 | 227 | #### Insert data 228 | 229 | 230 | ```python 231 | file = 'event_datafile_new.csv' 232 | 233 | with open(file, encoding = 'utf8') as f: 234 | csvreader = csv.reader(f) 235 | next(csvreader) # skip header 236 | for line in csvreader: 237 | query = "INSERT INTO user_sessions (user_id, session_id, item_in_session, artist, song, first_name, last_name)" 238 | query = query + " VALUES (%s, %s, %s, %s, %s, %s, %s)" 239 | session.execute(query, (int(line[10]), int(line[8]), int(line[3]), line[0], line[9], line[1], line[4])) 240 | ``` 241 | 242 | #### Run SELECT query to verify table model 243 | 244 | 245 | ```python 246 | query2 = """SELECT artist, song, first_name, last_name FROM user_sessions WHERE user_id = 10 AND session_id = 182""" 247 | 248 | try: 249 | rows = session.execute(query2) 250 | except Exception as e: 251 | print(e) 252 | 253 | for row in rows: 254 | print( row.artist, row.song, row.first_name, row.last_name ) 255 | ``` 256 | 257 | Down To The Bone Keep On Keepin' On Sylvie Cruz 258 | Three Drives Greece 2000 Sylvie Cruz 259 | Sebastien Tellier Kilometer Sylvie Cruz 260 | Lonnie Gordon Catch You Baby (Steve Pitron & Max Sanna Radio Edit) Sylvie Cruz 261 | 262 | 263 | ### Query 3: Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own' 264 | 265 | #### Create table 266 | 267 | Since the query specifies the data to retrieve by song, I have used song as the partition key. Song name, alone, is not sufficient to define a unique record. A possible choice for clustering would be first name and/or last name. I chose user_id as the clustering column because it is unique per user, whereas many people may share the same name. 268 | 269 | 270 | ```python 271 | ## Query 3: Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own' 272 | query3 = """CREATE TABLE IF NOT EXISTS song_users ( 273 | song text, user_id int, first_name text, last_name text, 274 | PRIMARY KEY (song, user_id) 275 | )""" 276 | 277 | try: 278 | session.execute(query3) 279 | except Exception as e: 280 | print(e) 281 | ``` 282 | 283 | #### Insert data 284 | 285 | The objective of query 3 is to extract a list of users who listen to a given song. 
Since people tend to play the same song many times, the event data is likely to contain multiple rows with the same user and song name. For this reason I preprocess the data in pandas to remove duplicates before inserting it in the Apache Cassandra table. 286 | 287 | 288 | ```python 289 | file = 'event_datafile_new.csv' 290 | 291 | df = pd.read_csv(file, usecols=[1, 4, 9, 10]) 292 | df.drop_duplicates(inplace=True) 293 | 294 | for ix, row in df.iterrows(): 295 | query = "INSERT INTO song_users (song, user_id, first_name, last_name)" 296 | query = query + " VALUES (%s, %s, %s, %s)" 297 | session.execute(query, (row['song'], row['userId'], row['firstName'], row['lastName'])) 298 | ``` 299 | 300 | #### Run SELECT query to verify table model 301 | 302 | 303 | ```python 304 | query3 = """SELECT first_name, last_name FROM song_users WHERE song = 'All Hands Against His Own'""" 305 | 306 | try: 307 | rows = session.execute(query3) 308 | except Exception as e: 309 | print(e) 310 | 311 | for row in rows: 312 | print( row.first_name, row.last_name ) 313 | ``` 314 | 315 | Jacqueline Lynch 316 | Tegan Levine 317 | Sara Johnson 318 | 319 | 320 | ### Drop the tables before closing out the sessions 321 | 322 | 323 | ```python 324 | drop_songs = "DROP TABLE IF EXISTS songs" 325 | drop_user_sessions = "DROP TABLE IF EXISTS user_sessions" 326 | drop_song_users = "DROP TABLE IF EXISTS song_users" 327 | try: 328 | session.execute(drop_songs) 329 | session.execute(drop_user_sessions) 330 | session.execute(drop_song_users) 331 | except Exception as e: 332 | print(e) 333 | ``` 334 | 335 | ### Close the session and cluster connection¶ 336 | 337 | 338 | ```python 339 | session.shutdown() 340 | cluster.shutdown() 341 | ``` 342 | 343 | #### Part II code was substantially completed by the student on base provided by instructors 344 | -------------------------------------------------------------------------------- /Project 1B Data Modeling with Apache Cassandra/event_data/2018-11-01-events.csv: -------------------------------------------------------------------------------- 1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | ,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54092E+12,38,,200,1.54111E+12,39 3 | ,Logged In,Kaylee,F,0,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1.54034E+12,139,,200,1.54111E+12,8 4 | Des'ree,Logged In,Kaylee,F,1,Summers,246.30812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,You Gotta Be,200,1.54111E+12,8 5 | ,Logged In,Kaylee,F,2,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Upgrade,1.54034E+12,139,,200,1.54111E+12,8 6 | Mr Oizo,Logged In,Kaylee,F,3,Summers,144.03873,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Flat 55,200,1.54111E+12,8 7 | Tamba Trio,Logged In,Kaylee,F,4,Summers,177.18812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Quem Quiser Encontrar O Amor,200,1.54111E+12,8 8 | The Mars Volta,Logged In,Kaylee,F,5,Summers,380.42077,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Eriatarka,200,1.54111E+12,8 9 | Infected Mushroom,Logged In,Kaylee,F,6,Summers,440.2673,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Becoming Insane,200,1.54111E+12,8 10 | Blue October / Imogen Heap,Logged In,Kaylee,F,7,Summers,241.3971,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Congratulations,200,1.54111E+12,8 11 | Girl Talk,Logged 
In,Kaylee,F,8,Summers,160.15628,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Once again,200,1.54111E+12,8 12 | Black Eyed Peas,Logged In,Sylvie,F,0,Cruz,214.93506,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",PUT,NextSong,1.54027E+12,9,Pump It,200,1.54111E+12,10 13 | ,Logged In,Ryan,M,0,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,169,,200,1.54111E+12,26 14 | Fall Out Boy,Logged In,Ryan,M,1,Smith,200.72444,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,169,Nobody Puts Baby In The Corner,200,1.54111E+12,26 15 | M.I.A.,Logged In,Ryan,M,2,Smith,233.7171,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,169,Mango Pickle Down River (With The Wilcannia Mob),200,1.54111E+12,26 16 | Survivor,Logged In,Jayden,M,0,Fox,245.36771,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,100,Eye Of The Tiger,200,1.54111E+12,101 17 | -------------------------------------------------------------------------------- /Project 1B Data Modeling with Apache Cassandra/event_data/2018-11-10-events.csv: -------------------------------------------------------------------------------- 1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | Hoobastank,Logged In,Cierra,F,0,Finley,241.3971,free,"Richmond, VA",PUT,NextSong,1.54101E+12,132,Say The Same,200,1.54181E+12,96 3 | Mark Knopfler,Logged In,Cierra,F,1,Finley,249.3122,free,"Richmond, VA",PUT,NextSong,1.54101E+12,132,Why Aye Man,200,1.54181E+12,96 4 | Mogwai,Logged In,Cierra,F,2,Finley,341.28934,free,"Richmond, VA",PUT,NextSong,1.54101E+12,132,We're No Here,200,1.54181E+12,96 5 | The Casualties,Logged In,Cierra,F,3,Finley,181.49832,free,"Richmond, VA",PUT,NextSong,1.54101E+12,132,Punx Unite,200,1.54181E+12,96 6 | ,Logged In,Cecilia,F,0,Owens,,free,"Atlanta-Sandy Springs-Roswell, GA",GET,Home,1.54103E+12,424,,200,1.54181E+12,6 7 | The Living End,Logged In,Ryan,M,0,Smith,188.62975,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,433,Roll On (Album Version),200,1.54182E+12,26 8 | Aloe Blacc,Logged In,Rylan,M,0,George,244.1922,free,"Birmingham-Hoover, AL",PUT,NextSong,1.54102E+12,402,I Need A Dollar,200,1.54183E+12,16 9 | Faith No More,Logged In,Rylan,M,1,George,326.50404,free,"Birmingham-Hoover, AL",PUT,NextSong,1.54102E+12,402,Helpless,200,1.54183E+12,16 10 | Chris Cornell,Logged In,Aleena,F,0,Kirby,353.69751,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Sunshower (Great Expectations Soundtrack),200,1.54184E+12,44 11 | Weezer,Logged In,Aleena,F,1,Kirby,203.93751,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,My Name Is Jonas,200,1.54184E+12,44 12 | Stream of Passion feat. Ayreon,Logged In,Aleena,F,2,Kirby,257.56689,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Valley Of The Queens,200,1.54184E+12,44 13 | Lupe Fiasco,Logged In,Aleena,F,3,Kirby,273.94567,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Shining Down [feat. 
Matthew Santos] (Amended Album Version),200,1.54184E+12,44 14 | Tom Petty,Logged In,Aleena,F,4,Kirby,263.23546,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Runnin' Down A Dream,200,1.54184E+12,44 15 | The Killers,Logged In,Aleena,F,5,Kirby,220.89098,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,When You Were Young,200,1.54184E+12,44 16 | Afghan Whigs,Logged In,Aleena,F,6,Kirby,179.40853,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,I'm Her Slave (Album),200,1.54184E+12,44 17 | CSS,Logged In,Aleena,F,7,Kirby,213.75955,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Let's Make Love And Listen To Death From Above [Dan Carey Mix] (remastered album version),200,1.54184E+12,44 18 | Mos Def / Talib Kweli,Logged In,Aleena,F,8,Kirby,141.37424,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,History,200,1.54184E+12,44 19 | Ryan Leslie,Logged In,Aleena,F,9,Kirby,203.96363,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,How It Was Supposed To Be,200,1.54184E+12,44 20 | Mark Lowry,Logged In,Aleena,F,10,Kirby,168.28036,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Get Together With The Lord (The Best Of Mark Lowry - Volume 2 Version),200,1.54184E+12,44 21 | Beirut,Logged In,Aleena,F,11,Kirby,230.19057,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Nantes,200,1.54184E+12,44 22 | MODESELEKTOR FEAT. PUPPETMASTAZ,Logged In,Aleena,F,12,Kirby,52.79302,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,THE DARK SIDE OF THE FROG,200,1.54184E+12,44 23 | Kid Cudi / Kanye West / Common,Logged In,Aleena,F,13,Kirby,237.76608,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Make Her Say,200,1.54184E+12,44 24 | Julie Ruin,Logged In,Aleena,F,14,Kirby,142.47138,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Breakout A-Town,200,1.54184E+12,44 25 | Sons And Daughters,Logged In,Aleena,F,15,Kirby,165.90322,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,The Bell,200,1.54184E+12,44 26 | Children 18:3,Logged In,Aleena,F,16,Kirby,178.52036,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Mock The Music,200,1.54184E+12,44 27 | Chris Cagle,Logged In,Aleena,F,17,Kirby,232.85506,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Miss Me Baby,200,1.54184E+12,44 28 | John Waite,Logged In,Aleena,F,18,Kirby,269.76608,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Missing You,200,1.54184E+12,44 29 | Basshunter,Logged In,Aleena,F,19,Kirby,223.32036,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Walk On Water,200,1.54184E+12,44 30 | Jay-Z / Lil Wayne,Logged In,Aleena,F,20,Kirby,236.01587,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Hello Brooklyn 2.0,200,1.54184E+12,44 31 | Snow Patrol,Logged In,Aleena,F,21,Kirby,273.6322,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,It's Beginning To Get To Me,200,1.54184E+12,44 32 | Coldcut,Logged In,Aleena,F,22,Kirby,203.07546,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Autumn Leaves,200,1.54184E+12,44 33 | Magic Dirt,Logged In,Aleena,F,23,Kirby,251.79383,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Plastic Loveless Letter,200,1.54184E+12,44 34 | J. 
Karjalainen & Mustat Lasit,Logged In,Aleena,F,24,Kirby,336.74404,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Sinisten tähtien alla,200,1.54184E+12,44 35 | OneRepublic,Logged In,Aleena,F,25,Kirby,224.67873,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Secrets,200,1.54184E+12,44 36 | Nirvana,Logged In,Aleena,F,26,Kirby,219.08853,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Come As You Are,200,1.54184E+12,44 37 | ,Logged In,Theodore,M,0,Smith,,free,"Houston-The Woodlands-Sugar Land, TX",GET,Home,1.54031E+12,359,,200,1.54184E+12,52 38 | Joyce Cooling,Logged In,Aleena,F,27,Kirby,248.11057,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,It's Time I Go (Jazz),200,1.54184E+12,44 39 | Beastie Boys,Logged In,Aleena,F,28,Kirby,211.722,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Unite (2009 Digital Remaster),200,1.54184E+12,44 40 | Usher Featuring Lil' Jon & Ludacris,Logged In,Aleena,F,29,Kirby,250.38322,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Yeah!,200,1.54184E+12,44 41 | Nelly / Paul Wall / Ali & Gipp,Logged In,Aleena,F,30,Kirby,272.50893,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Grillz,200,1.54184E+12,44 42 | The Audition,Logged In,Aleena,F,31,Kirby,207.20281,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,The Running Man,200,1.54184E+12,44 43 | Savage Garden,Logged In,Aleena,F,32,Kirby,277.26322,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Truly Madly Deeply,200,1.54184E+12,44 44 | Adam Green,Logged In,Aleena,F,33,Kirby,141.00853,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Festival Song,200,1.54184E+12,44 45 | Tom Petty,Logged In,Aleena,F,34,Kirby,204.82567,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Square One (Album Version),200,1.54184E+12,44 46 | Muse,Logged In,Aleena,F,35,Kirby,209.34485,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Supermassive Black Hole (Album Version),200,1.54184E+12,44 47 | The Gerbils,Logged In,Jordan,F,0,Hicks,27.01016,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,(iii),200,1.54184E+12,37 48 | Robert Plant,Logged In,Jordan,F,1,Hicks,265.66485,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Dancing In Heaven (2006 Remastered LP Version),200,1.54184E+12,37 49 | Metallica,Logged In,Jordan,F,2,Hicks,387.02975,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Welcome Home (Sanitarium),200,1.54184E+12,37 50 | Infected Mushroom,Logged In,Jordan,F,3,Hicks,506.51383,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Deeply Disturbed,200,1.54184E+12,37 51 | Eliza Doolittle,Logged In,Jordan,F,4,Hicks,184.60689,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Rollerblades,200,1.54185E+12,37 52 | Alvin And The Chipmunks,Logged In,Jordan,F,5,Hicks,162.63791,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Ain't No Party,200,1.54185E+12,37 53 | Chromeo,Logged In,Jordan,F,6,Hicks,348.65587,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,You're So Gangsta,200,1.54185E+12,37 54 | Keisha White,Logged In,Kevin,M,0,Arellano,251.42812,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1.54001E+12,387,Brother,200,1.54185E+12,66 55 | Juanes,Logged In,Kevin,M,1,Arellano,247.37914,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1.54001E+12,387,Damelo,200,1.54185E+12,66 56 | ,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54092E+12,180,,200,1.54185E+12,39 57 | Karnivool,Logged In,Ryan,M,0,Smith,470.80444,free,"San Jose-Sunnyvale-Santa Clara, 
CA",PUT,NextSong,1.54102E+12,445,Umbra,200,1.54186E+12,26 58 | WES,Logged In,Cecilia,F,0,Owens,221.57016,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,444,Alane,200,1.54186E+12,6 59 | Asia 2001,Logged In,Cecilia,F,1,Owens,150.30812,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,444,Epilogue,200,1.54186E+12,6 60 | Spike Milligan,Logged In,Samuel,M,0,Gonzalez,220.39465,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54049E+12,384,Nothing At All,200,1.54186E+12,61 61 | Laura Izibor,Logged In,Anabelle,F,0,Simpson,211.56526,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,378,Carousel (PSILY Album Version),200,1.54187E+12,69 62 | ,Logged In,Lily,F,0,Burns,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1.54062E+12,426,,200,1.54187E+12,32 63 | Ryan Adams,Logged In,Braden,M,0,Parker,248.5024,free,"Youngstown-Warren-Boardman, OH-PA",PUT,NextSong,1.541E+12,246,Wonderwall,200,1.54187E+12,74 64 | ,Logged In,Adelyn,F,0,Jordan,,free,"Chicago-Naperville-Elgin, IL-IN-WI",GET,Home,1.54013E+12,391,,200,1.54187E+12,7 65 | Method Man,Logged In,Adelyn,F,1,Jordan,204.64281,free,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54013E+12,391,The Motto,200,1.54187E+12,7 66 | The Stanley Brothers,Logged In,Adelyn,F,2,Jordan,179.69587,free,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54013E+12,391,I'm A Man Of Constant Sorrow,200,1.54187E+12,7 67 | Dexter Freebish,Logged In,Adelyn,F,3,Jordan,210.54649,free,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54013E+12,391,Deeper,200,1.54187E+12,7 68 | Jamiroquai,Logged In,Jacob,M,0,Rogers,362.05669,free,"San Diego-Carlsbad, CA",PUT,NextSong,1.54098E+12,432,Talullah,200,1.54187E+12,18 69 | Michael Cera & Ellen Page,Logged In,Matthew,M,0,Jones,116.71465,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Anyone Else But You,200,1.54188E+12,36 70 | The Cat Empire,Logged In,Matthew,M,1,Jones,218.22649,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,How To Explain,200,1.54188E+12,36 71 | Bryn Terfel / Berliner Philharmoniker / Claudio Abbado,Logged In,Matthew,M,2,Jones,967.36608,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Wotan's Farewell & Magic Fire Music,200,1.54188E+12,36 72 | The Fugees,Logged In,Matthew,M,3,Jones,281.20771,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Ready Or Not,200,1.54188E+12,36 73 | Hardline,Logged In,Matthew,M,4,Jones,234.73587,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Everything,200,1.54188E+12,36 74 | The Funky Lowlives,Logged In,Matthew,M,5,Jones,280.34567,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Sail Into the Sun,200,1.54188E+12,36 75 | DL Incognito,Logged In,Matthew,M,6,Jones,221.07383,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Proof,200,1.54188E+12,36 76 | ,Logged In,Theodore,M,0,Smith,,free,"Houston-The Woodlands-Sugar Land, TX",GET,Home,1.54031E+12,447,,200,1.54188E+12,52 77 | Justice,Logged In,Matthew,M,7,Jones,243.40853,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,TTHHEE PPAARRTTYY,200,1.54188E+12,36 78 | Earth_ Wind & Fire,Logged In,Theodore,M,1,Smith,178.20689,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54031E+12,447,Night Dreamin',200,1.54188E+12,52 79 | Strawbs,Logged In,Matthew,M,8,Jones,255.81669,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Sheep,200,1.54188E+12,36 80 | Angus & Julia Stone,Logged In,Matthew,M,9,Jones,172.85179,paid,"Janesville-Beloit, 
WI",PUT,NextSong,1.54106E+12,439,Wasted,200,1.54188E+12,36 81 | Sara Bareilles,Logged In,Matthew,M,10,Jones,260.8322,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Love Song,200,1.54188E+12,36 82 | Bruna Caram,Logged In,Matthew,M,11,Jones,198.63465,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Meus Sonhos,200,1.54188E+12,36 83 | Nando Reis,Logged In,Matthew,M,12,Jones,239.82975,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,O Segundo Sol,200,1.54188E+12,36 84 | The Black Keys,Logged In,Matthew,M,13,Jones,189.28281,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Same Old Thing,200,1.54188E+12,36 85 | Kreator,Logged In,Matthew,M,14,Jones,294.53016,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Riot Of Violence,200,1.54188E+12,36 86 | Audioslave,Logged In,Matthew,M,15,Jones,277.83791,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Show Me How To Live,200,1.54188E+12,36 87 | Red Hot Chili Peppers,Logged In,Matthew,M,16,Jones,269.34812,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Parallel Universe (Album Version),200,1.54188E+12,36 88 | Manu Chao,Logged In,Matthew,M,17,Jones,288.15628,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Me Quedo Contigo [Si Me Das A Elegir],200,1.54188E+12,36 89 | Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner,Logged In,Matthew,M,18,Jones,277.15873,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile),200,1.54188E+12,36 90 | Ron Carter,Logged In,Matthew,M,19,Jones,497.13587,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,I CAN'T GET STARTED,200,1.54188E+12,36 91 | ,Logged In,Theodore,M,0,Harris,,free,"Red Bluff, CA",GET,Home,1.5411E+12,440,,200,1.54188E+12,14 92 | ,Logged In,Theodore,M,1,Harris,,free,"Red Bluff, CA",GET,Home,1.5411E+12,440,,200,1.54188E+12,14 93 | Lifehouse,Logged In,Theodore,M,2,Harris,195.47383,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,440,You And Me (Wedding Version),200,1.54188E+12,14 94 | Yann Tiersen,Logged In,Kaylee,F,0,Summers,158.71955,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,333,La Valse D'Amélie (Version Piano),200,1.54189E+12,8 95 | ISRAEL & NEW BREED,Logged In,Kaylee,F,1,Summers,176.48281,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,333,Awesome Medley,200,1.54189E+12,8 96 | ,Logged In,Molly,F,0,Taylor,,free,"St. Louis, MO-IL",GET,Home,1.54099E+12,396,,200,1.54189E+12,35 97 | Stellar Kart,Logged In,Molly,F,1,Taylor,186.17424,free,"St. 
Louis, MO-IL",PUT,NextSong,1.54099E+12,396,Jesus Loves You (Album Version),200,1.54189E+12,35 98 | -------------------------------------------------------------------------------- /Project 1B Data Modeling with Apache Cassandra/event_data/2018-11-11-events.csv: -------------------------------------------------------------------------------- 1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | Frumpies,Logged In,Anabelle,F,0,Simpson,134.47791,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,455,Hello Kitty,200,1.5419E+12,69 3 | Kenny G with Peabo Bryson,Logged In,Anabelle,F,1,Simpson,264.75057,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,455,By The Time This Night Is Over,200,1.5419E+12,69 4 | Biffy Clyro,Logged In,Anabelle,F,2,Simpson,189.83138,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,455,God & Satan,200,1.5419E+12,69 5 | ,Logged In,Lily,F,0,Burns,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1.54062E+12,456,,200,1.54191E+12,32 6 | HIM,Logged In,Lily,F,1,Burns,212.06159,free,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1.54062E+12,456,Beautiful,200,1.54191E+12,32 7 | Matmos,Logged In,Joseph,M,0,Gutierrez,1449.11628,free,"Columbia, SC",PUT,NextSong,1.54081E+12,284,Supreme Balloon,200,1.54191E+12,75 8 | Gary Allan,Logged In,Ryann,F,0,Smith,259.83955,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,The One,200,1.54193E+12,92 9 | Miracle Fortress,Logged In,Ryann,F,1,Smith,200.9073,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,Five Roses,200,1.54193E+12,92 10 | Don Omar,Logged In,Ryann,F,2,Smith,261.35465,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,Cuentale,200,1.54193E+12,92 11 | Jay-Z,Logged In,Ryann,F,3,Smith,212.27057,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,D'Evils,200,1.54193E+12,92 12 | Red Hot Chili Peppers,Logged In,Ryann,F,4,Smith,231.33995,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,Easily (Album Version),200,1.54193E+12,92 13 | ,Logged In,Chloe,F,0,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,437,,200,1.54193E+12,49 14 | Flogging Molly,Logged In,Chloe,F,1,Cuevas,361.9522,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Rebels of the Sacred Heart,200,1.54193E+12,49 15 | Reverend Horton Heat,Logged In,Chloe,F,2,Cuevas,158.64118,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Now_ Right Now,200,1.54193E+12,49 16 | Sea Wolf,Logged In,Chloe,F,3,Cuevas,232.61995,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,I Made A Resolution,200,1.54193E+12,49 17 | Jason Mraz & Colbie Caillat,Logged In,Chloe,F,4,Cuevas,189.6224,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Lucky (Album Version),200,1.54193E+12,49 18 | Jamie Lidell,Logged In,Chloe,F,5,Cuevas,175.25506,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Enoughs Enough,200,1.54193E+12,49 19 | Feist,Logged In,Chloe,F,6,Cuevas,212.79302,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Mushaboom (Postal Service Mix),200,1.54193E+12,49 20 | ,Logged In,Chloe,F,7,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",PUT,Logout,1.54094E+12,437,,307,1.54193E+12,49 21 | ,Logged Out,,,8,,,free,,GET,Home,,437,,200,1.54193E+12, 22 | ,Logged Out,,,9,,,free,,GET,Home,,437,,200,1.54193E+12, 23 | ,Logged Out,,,10,,,free,,PUT,Login,,437,,307,1.54193E+12, 24 | ,Logged 
In,Chloe,F,11,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,437,,200,1.54193E+12,49 25 | Sex Slaves,Logged In,Chloe,F,12,Cuevas,175.51628,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,We're Going Out Tonight,200,1.54193E+12,49 26 | ,Logged In,Chloe,F,0,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,469,,200,1.54194E+12,49 27 | Rise Against,Logged In,Chloe,F,1,Cuevas,169.482,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,469,To Them These Streets Belong,200,1.54194E+12,49 28 | ,Logged In,Mohammad,M,0,Rodriguez,,free,"Sacramento--Roseville--Arden-Arcade, CA",GET,Home,1.54051E+12,441,,200,1.54194E+12,88 29 | Beyoncé,Logged In,Mohammad,M,1,Rodriguez,359.54893,free,"Sacramento--Roseville--Arden-Arcade, CA",PUT,NextSong,1.54051E+12,441,Get Me Bodied,200,1.54194E+12,88 30 | Nate Dogg,Logged In,Mohammad,M,2,Rodriguez,356.38812,free,"Sacramento--Roseville--Arden-Arcade, CA",PUT,NextSong,1.54051E+12,441,Never Leave Me Alone,200,1.54194E+12,88 31 | ,Logged In,Cierra,F,0,Finley,,free,"Richmond, VA",GET,Home,1.54101E+12,443,,200,1.54195E+12,96 32 | Taylor Swift,Logged In,Cierra,F,1,Finley,233.89995,free,"Richmond, VA",PUT,NextSong,1.54101E+12,443,Love Story,200,1.54195E+12,96 33 | Lynyrd Skynyrd,Logged In,Ryan,M,0,Smith,216.60689,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,452,Sweet home Alabama,200,1.54195E+12,26 34 | Kelis,Logged In,Cierra,F,2,Finley,293.58975,free,"Richmond, VA",PUT,NextSong,1.54101E+12,443,Caught Out There (Explicit),200,1.54195E+12,96 35 | The Kills,Logged In,Cierra,F,3,Finley,203.38893,free,"Richmond, VA",PUT,NextSong,1.54101E+12,443,Last Day Of Magic,200,1.54195E+12,96 36 | ,Logged In,Aleena,F,0,Kirby,,paid,"Waterloo-Cedar Falls, IA",GET,Home,1.54102E+12,448,,200,1.54195E+12,44 37 | Collie Buddz featuring Paul Wall,Logged In,Aleena,F,1,Kirby,271.62077,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,What A Feeling,200,1.54195E+12,44 38 | Charttraxx Karaoke,Logged In,Cierra,F,4,Finley,225.17506,free,"Richmond, VA",PUT,NextSong,1.54101E+12,443,Fireflies,200,1.54195E+12,96 39 | Band Of Horses,Logged In,Aleena,F,2,Kirby,321.14893,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,The Funeral (Album Version),200,1.54195E+12,44 40 | Coldplay,Logged In,Aleena,F,3,Kirby,307.51302,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Clocks,200,1.54195E+12,44 41 | Bon Jovi,Logged In,Aleena,F,4,Kirby,228.75383,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Have A Nice Day,200,1.54195E+12,44 42 | P.O.D.,Logged In,Aleena,F,5,Kirby,203.7024,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Alive (2006 Remastered Album Version),200,1.54195E+12,44 43 | Bloc Party,Logged In,Aleena,F,6,Kirby,222.04036,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Plans (Replanned by Mogwai),200,1.54195E+12,44 44 | Los Prisioneros,Logged In,Aleena,F,7,Kirby,211.12118,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Pa Pa Pa,200,1.54195E+12,44 45 | Octopus Project,Logged In,Aleena,F,8,Kirby,175.25506,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Lots More Stairs,200,1.54195E+12,44 46 | Roudoudou,Logged In,Aleena,F,9,Kirby,18.41587,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Ecoute Ce Scratch,200,1.54195E+12,44 47 | Africando,Logged In,Aleena,F,10,Kirby,253.54404,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Tierra Tradicional,200,1.54195E+12,44 48 | RUN-DMC Featuring Method 
Man_ Kenny Cash_ Mike Ransom_ and Jamel Simmons,Logged In,Aleena,F,11,Kirby,266.52689,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Simmons Incorporated,200,1.54195E+12,44 49 | ,Logged In,Colm,M,0,Santana,,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",GET,Home,1.54086E+12,414,,200,1.54195E+12,67 50 | Graham Coxon,Logged In,Colm,M,1,Santana,197.14567,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,I'm Goin' Away,200,1.54195E+12,67 51 | Queens Of The Stone Age,Logged In,Aleena,F,12,Kirby,231.02649,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,In The Fade,200,1.54195E+12,44 52 | Dance Gavin Dance,Logged In,Colm,M,2,Santana,193.30567,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Strawberry André (Album Version),200,1.54195E+12,67 53 | ,Logged In,Colm,M,3,Santana,,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",GET,Home,1.54086E+12,414,,200,1.54195E+12,67 54 | Passion Pit,Logged In,Aleena,F,13,Kirby,243.69587,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Eyes As Candles,200,1.54195E+12,44 55 | ,Logged In,Aleena,F,14,Kirby,,paid,"Waterloo-Cedar Falls, IA",GET,Home,1.54102E+12,448,,200,1.54195E+12,44 56 | Black Eyed Peas,Logged In,Colm,M,4,Santana,229.61587,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Let's Get It Started,200,1.54195E+12,67 57 | Plastic Bertrand,Logged In,Colm,M,5,Santana,180.00934,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Ca plane pour moi,200,1.54195E+12,67 58 | Cream,Logged In,Colm,M,6,Santana,166.5824,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Strange Brew,200,1.54195E+12,67 59 | Coldplay,Logged In,Colm,M,7,Santana,284.39465,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,A Message,200,1.54195E+12,67 60 | Cute Is What We Aim For,Logged In,Colm,M,8,Santana,172.22485,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Sweat the Battle Before the Battle Sweats You (Album Version),200,1.54195E+12,67 61 | Metallica,Logged In,Connar,M,0,Moreno,256.9922,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54082E+12,218,Of Wolf And Man,200,1.54195E+12,62 62 | The Kills,Logged In,Connar,M,1,Moreno,217.70404,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54082E+12,218,Tape Song,200,1.54195E+12,62 63 | Foo Fighters,Logged In,Connar,M,2,Moreno,271.38567,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54082E+12,218,The Pretender,200,1.54195E+12,62 64 | Plaid,Logged In,Connar,M,3,Moreno,260.96281,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54082E+12,218,Eyen [Chosen by fans on Warp20.net],200,1.54195E+12,62 65 | ,Logged In,Brayden,M,0,Clark,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1.54103E+12,120,,200,1.54195E+12,41 66 | ,Logged In,Theodore,M,0,Harris,,free,"Red Bluff, CA",GET,Home,1.5411E+12,462,,200,1.54196E+12,14 67 | The Van Pelt,Logged In,Theodore,M,1,Harris,208.71791,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,462,It's New To Me,200,1.54196E+12,14 68 | ,Logged In,Ryan,M,0,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,472,,200,1.54196E+12,26 69 | 44,Logged In,Ryan,M,1,Smith,224.57424,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,472,Make You Smile,200,1.54196E+12,26 70 | ,Logged In,Rylan,M,0,George,,free,"Birmingham-Hoover, AL",GET,Home,1.54102E+12,446,,200,1.54196E+12,16 
71 | Chris Brown,Logged In,Rylan,M,1,George,275.1473,free,"Birmingham-Hoover, AL",PUT,NextSong,1.54102E+12,446,I May Never Find,200,1.54196E+12,16 72 | KT Tunstall,Logged In,Ryan,M,2,Smith,170.47465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,472,Black Horse And The Cherry Tree (Radio Version),200,1.54196E+12,26 73 | Cascada,Logged In,Rylan,M,2,George,184.39791,free,"Birmingham-Hoover, AL",PUT,NextSong,1.54102E+12,446,Kids In America,200,1.54196E+12,16 74 | Incubus,Logged In,Ryan,M,3,Smith,293.38077,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,472,Black Heart Inertia,200,1.54196E+12,26 75 | ,Logged In,Ryan,M,4,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Help,1.54102E+12,472,,200,1.54196E+12,26 76 | ,Logged In,Ryan,M,5,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,472,,200,1.54196E+12,26 77 | ,Logged In,Tegan,F,0,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.54079E+12,435,,200,1.54197E+12,80 78 | Miike Snow,Logged In,Tegan,F,1,Levine,220.83873,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Black & Blue,200,1.54197E+12,80 79 | Cartola,Logged In,Tegan,F,2,Levine,208.92689,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Sala De Recepção,200,1.54197E+12,80 80 | Kill The Client,Logged In,Tegan,F,3,Levine,70.68689,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Commander In Thief,200,1.54197E+12,80 81 | ,Logged In,Tegan,F,4,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.54079E+12,435,,200,1.54197E+12,80 82 | Wolfmother,Logged In,Tegan,F,5,Levine,175.82975,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Woman,200,1.54197E+12,80 83 | Old Crow Medicine Show,Logged In,Tegan,F,6,Levine,231.73179,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Wagon Wheel,200,1.54197E+12,80 84 | Architecture In Helsinki,Logged In,Tegan,F,7,Levine,173.73995,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Debbie,200,1.54197E+12,80 85 | Charlie Louvin,Logged In,Tegan,F,8,Levine,170.86649,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,I Think I'll Live,200,1.54197E+12,80 86 | Miguel Morales,Logged In,Tegan,F,9,Levine,270.78485,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,La Derrota de Un Don Juan,200,1.54197E+12,80 87 | Dominique A,Logged In,Tegan,F,10,Levine,153.20771,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Le Courage Des Oiseaux,200,1.54197E+12,80 88 | Cock Sparrer,Logged In,Tegan,F,11,Levine,203.25832,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Run With The Blind,200,1.54197E+12,80 89 | Jimmy Wakely,Logged In,Tegan,F,12,Levine,165.74649,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,I Love You So Much It Hurts,200,1.54197E+12,80 90 | Peter Doherty,Logged In,Tegan,F,13,Levine,217.02485,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,A Little Death Around the Eyes,200,1.54197E+12,80 91 | Katy Perry,Logged In,Tegan,F,14,Levine,246.41261,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Thinking Of You,200,1.54197E+12,80 92 | Sidewalk Prophets,Logged In,Molly,F,0,Taylor,260.62322,free,"St. Louis, MO-IL",PUT,NextSong,1.54099E+12,464,You Love Me Anyway (Album),200,1.54197E+12,35 93 | Rise Against,Logged In,Molly,F,1,Taylor,221.17832,free,"St. Louis, MO-IL",PUT,NextSong,1.54099E+12,464,Torches,200,1.54197E+12,35 94 | K'Naan,Logged In,Molly,F,2,Taylor,220.49914,free,"St. 
Louis, MO-IL",PUT,NextSong,1.54099E+12,464,Wavin' Flag,200,1.54197E+12,35 95 | Patrick Jumpen,Logged In,Ryan,M,0,Smith,208.87465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,480,Holiday,200,1.54198E+12,26 96 | Alicia Keys,Logged In,Ryan,M,1,Smith,216.47628,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,480,Empire State Of Mind (Part II) Broken Down,200,1.54198E+12,26 97 | -------------------------------------------------------------------------------- /Project 1B Data Modeling with Apache Cassandra/event_data/2018-11-22-events.csv: -------------------------------------------------------------------------------- 1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | Dee Dee Bridgewater,Logged In,Lily,F,38,Koch,318.64118,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,La Vie En Rose,200,1.54285E+12,15 3 | Tim O'brien,Logged In,Lily,F,39,Koch,176.14322,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Think About Last Night,200,1.54285E+12,15 4 | Nirvana,Logged In,Lily,F,40,Koch,215.11791,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Love Buzz,200,1.54285E+12,15 5 | Weezer,Logged In,Lily,F,41,Koch,479.32036,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Only In Dreams,200,1.54285E+12,15 6 | Nightwish,Logged In,Lily,F,42,Koch,286.1971,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,She Is My Sin,200,1.54285E+12,15 7 | California Swag District,Logged In,Lily,F,43,Koch,239.17669,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Teach Me How To Dougie,200,1.54285E+12,15 8 | Miike Snow,Logged In,Lily,F,44,Koch,385.35791,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Silvia,200,1.54285E+12,15 9 | Katy Perry,Logged In,Lily,F,45,Koch,179.40853,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,I Kissed A Girl,200,1.54285E+12,15 10 | Sikth,Logged In,Lily,F,46,Koch,250.53995,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Peep Show,200,1.54285E+12,15 11 | Lily Allen,Logged In,Lily,F,47,Koch,199.88853,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Not Fair,200,1.54285E+12,15 12 | The Presidents of the United States of America,Logged In,Lily,F,48,Koch,495.77751,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Lump,200,1.54285E+12,15 13 | Wordsworth,Logged In,Lily,F,49,Koch,253.1522,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Right Now (Produced by Ayatollah),200,1.54285E+12,15 14 | Rihanna,Logged In,Lily,F,50,Koch,229.04118,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Take A Bow,200,1.54285E+12,15 15 | Tomas Bodin,Logged In,Lily,F,51,Koch,396.53832,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Back To The African Garden,200,1.54285E+12,15 16 | Black Eyed Peas,Logged In,Lily,F,52,Koch,326.86975,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,My Humps,200,1.54285E+12,15 17 | Carolina Liar,Logged In,Lily,F,53,Koch,240.45669,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Show Me What I'm Looking For (Album Version),200,1.54285E+12,15 18 | Kansas,Logged In,Lily,F,54,Koch,202.29179,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Dust in The Wind,200,1.54285E+12,15 19 | Onar,Logged 
In,Lily,F,55,Koch,306.6771,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Xehasmeni Melodia,200,1.54285E+12,15 20 | Live,Logged In,Lily,F,56,Koch,286.98077,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Lakini's Juice,200,1.54285E+12,15 21 | Abstract Rude,Logged In,Lily,F,57,Koch,196.85832,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Nuff Fire,200,1.54285E+12,15 22 | Johnny Horton,Logged In,Lily,F,58,Koch,131.81342,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Mean Mean Son Of A Gun,200,1.54285E+12,15 23 | The Men They Couldn't Hang,Logged In,Lily,F,59,Koch,251.14077,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Ironmasters,200,1.54285E+12,15 24 | Rilo Kiley,Logged In,Lily,F,60,Koch,234.03057,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,The Absence Of God (Album Version),200,1.54285E+12,15 25 | Shwayze,Logged In,Lily,F,61,Koch,201.63873,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Lost My Mind,200,1.54285E+12,15 26 | Bram Vermeulen,Logged In,Lily,F,62,Koch,251.42812,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Mamma,200,1.54285E+12,15 27 | Death Cab for Cutie,Logged In,Lily,F,63,Koch,189.3873,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,I Will Follow You into the Dark (Album Version),200,1.54285E+12,15 28 | Dwight Yoakam,Logged In,Lily,F,64,Koch,239.3073,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,You're The One,200,1.54285E+12,15 29 | Jadakiss / Ghostface Killah / Raekwon,Logged In,Lily,F,65,Koch,173.76608,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Cartel Gathering,200,1.54285E+12,15 30 | Rosana,Logged In,Lily,F,66,Koch,256.31302,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Si tu no estas,200,1.54285E+12,15 31 | ,Logged In,Kaylee,F,0,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1.54034E+12,775,,200,1.54285E+12,8 32 | ,Logged In,Kaylee,F,1,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Upgrade,1.54034E+12,775,,200,1.54285E+12,8 33 | ,Logged In,Kaylee,F,2,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1.54034E+12,775,,200,1.54285E+12,8 34 | The Killers,Logged In,Lily,F,67,Koch,230.39955,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,The Ballad of Michael Valentine,200,1.54285E+12,15 35 | Alliance Ethnik,Logged In,Lily,F,68,Koch,195.94404,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Sincerité Et Jalousie,200,1.54285E+12,15 36 | Enya,Logged In,Lily,F,69,Koch,289.802,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,China Roses,200,1.54285E+12,15 37 | Aya RL,Logged In,Lily,F,70,Koch,225.43628,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Jazz,200,1.54285E+12,15 38 | ,Logged In,Lily,F,71,Koch,,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,Logout,1.54105E+12,818,,307,1.54285E+12,15 39 | ,Logged Out,,,72,,,paid,,GET,Home,,818,,200,1.54285E+12, 40 | ,Logged Out,,,73,,,paid,,GET,About,,818,,200,1.54285E+12, 41 | Clor,Logged In,Ryan,M,0,Smith,227.68281,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,820,Love + Pain,200,1.54286E+12,26 42 | Alejandro Fernandez,Logged In,Ryan,M,1,Smith,262.84363,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,820,Solitario Y Solo,200,1.54286E+12,26 43 | Yonder Mountain String Band,Logged 
In,Ryan,M,2,Smith,152.18893,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,820,Midwest Gospel Radio,200,1.54286E+12,26 44 | K'Naan,Logged In,Ava,F,0,Robinson,220.49914,free,"New Haven-Milford, CT",PUT,NextSong,1.54093E+12,824,Wavin' Flag,200,1.54287E+12,50 45 | Cradle Of Filth,Logged In,Kate,F,0,Harrell,453.09342,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Her Ghost In The Fog,200,1.54288E+12,97 46 | Amanda Marshall,Logged In,Kate,F,1,Harrell,274.28526,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Let It Rain,200,1.54288E+12,97 47 | Rammstein,Logged In,Kate,F,2,Harrell,272.40444,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Sonne,200,1.54288E+12,97 48 | Cat Stevens,Logged In,Kate,F,3,Harrell,167.6273,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,If You Want To Sing Out_ Sing Out,200,1.54288E+12,97 49 | Emma Shapplin,Logged In,Kate,F,4,Harrell,267.62404,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Spente Le Stelle,200,1.54289E+12,97 50 | Modest Mouse,Logged In,Kate,F,5,Harrell,209.52771,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Float On,200,1.54289E+12,97 51 | Flaco Jimenez,Logged In,Kate,F,6,Harrell,155.81995,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,En El Cielo No Hay Cerveza (In Heaven There Is No Beer),200,1.54289E+12,97 52 | Modest Mouse,Logged In,Kate,F,7,Harrell,209.52771,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Float On,200,1.54289E+12,97 53 | Cedric Gervais feat. Second Sun,Logged In,Kate,F,8,Harrell,230.32118,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Pills (Radio Edit) (Radio Edit),200,1.54289E+12,97 54 | Sheena Easton,Logged In,Kate,F,9,Harrell,239.62077,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Strut (1993 Digital Remaster),200,1.54289E+12,97 55 | Everything But The Girl,Logged In,Kate,F,10,Harrell,218.74893,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,My Baby Don't Love Me,200,1.54289E+12,97 56 | Florence + The Machine,Logged In,Kate,F,11,Harrell,219.66322,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Dog Days Are Over (Radio Edit),200,1.54289E+12,97 57 | BoDeans,Logged In,Kate,F,12,Harrell,354.01098,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Naked (Live),200,1.54289E+12,97 58 | OneRepublic,Logged In,Kate,F,13,Harrell,208.14322,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Apologize,200,1.54289E+12,97 59 | Miley Cyrus,Logged In,Kate,F,14,Harrell,194.45506,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Full Circle,200,1.54289E+12,97 60 | Coldplay,Logged In,Kate,F,15,Harrell,139.12771,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Don't Panic,200,1.54289E+12,97 61 | Atreyu,Logged In,Kate,F,16,Harrell,308.37506,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,You Were The King_ Now You're Unconscious (Album Version),200,1.54289E+12,97 62 | Bruce Springsteen,Logged In,Kate,F,17,Harrell,270.54975,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Born To Run,200,1.54289E+12,97 63 | Björk,Logged In,Kate,F,18,Harrell,348.57751,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Undo,200,1.54289E+12,97 64 | Big Shug,Logged In,Kate,F,19,Harrell,140.56444,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,It Just Don't Stop,200,1.54289E+12,97 65 | The Wallflowers,Logged In,Kate,F,20,Harrell,315.24526,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Be 
Your Own Girl,200,1.54289E+12,97 66 | Chris Brown,Logged In,Kate,F,21,Harrell,203.80689,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Ain't No Way (You Won't Love Me),200,1.54289E+12,97 67 | Charly García,Logged In,Kate,F,22,Harrell,231.73179,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Filosofia Barata Y Zapatos De Goma,200,1.54289E+12,97 68 | N.W.A ft. Eazy-E,Logged In,Kate,F,23,Harrell,338.18077,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Boyz-N-The-Hood,200,1.54289E+12,97 69 | The Mighty Mighty Bosstones,Logged In,Kate,F,24,Harrell,158.87628,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,1/2/2008,200,1.54289E+12,97 70 | Beastie Boys,Logged In,Kate,F,25,Harrell,211.722,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Unite (2009 Digital Remaster),200,1.54289E+12,97 71 | Yuksek,Logged In,Kate,F,26,Harrell,218.95791,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Take A Ride,200,1.54289E+12,97 72 | Fernando Ubiergo,Logged In,Kate,F,27,Harrell,218.74893,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Cuando Agosto Era 21,200,1.54289E+12,97 73 | Phoenix,Logged In,Kate,F,28,Harrell,192.86159,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Napoleon Says,200,1.54289E+12,97 74 | Radney Foster,Logged In,Jayden,M,0,Fox,288.96608,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,790,Sweet And Wild,200,1.54289E+12,101 75 | Neneh Cherry,Logged In,Jayden,M,1,Fox,232.202,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,790,Manchild,200,1.54289E+12,101 76 | Hooligans,Logged In,Ayla,F,0,Johnson,189.98812,free,"Santa Rosa, CA",PUT,NextSong,1.54088E+12,785,Szex & KV,200,1.54289E+12,63 77 | Kid Cudi / MGMT / Ratatat,Logged In,Lily,F,0,Burns,295.67955,free,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1.54062E+12,786,Pursuit Of Happiness (nightmare),200,1.5429E+12,32 78 | Foals,Logged In,Morris,M,0,Gilmore,316.89098,free,"Raleigh, NC",PUT,NextSong,1.54097E+12,351,Blue Blood,200,1.5429E+12,23 79 | 'N Sync/Phil Collins,Logged In,Morris,M,1,Gilmore,143.64689,free,"Raleigh, NC",PUT,NextSong,1.54097E+12,351,Trashin' The Camp (Phil And 'N Sync Version),200,1.5429E+12,23 80 | Kristian Stanfill,Logged In,Jayden,M,0,Fox,287.50322,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,838,I Need You,200,1.5429E+12,101 81 | Enrique Iglesias,Logged In,Jayden,M,1,Fox,241.42322,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,838,Tired Of Being Sorry,200,1.5429E+12,101 82 | Michael Cretu,Logged In,Jayden,M,2,Fox,301.06077,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,838,The Invisible Man,200,1.5429E+12,101 83 | Tommy Emmanuel,Logged In,Jayden,M,3,Fox,168.14975,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,838,Windy & Warm,200,1.5429E+12,101 84 | ,Logged In,Jayden,M,4,Fox,,free,"New Orleans-Metairie, LA",PUT,Logout,1.54103E+12,838,,307,1.5429E+12,101 85 | ,Logged Out,,,5,,,free,,GET,Home,,838,,200,1.5429E+12, 86 | ,Logged Out,,,6,,,free,,PUT,Login,,838,,307,1.5429E+12, 87 | ,Logged In,Jayden,M,7,Fox,,free,"New Orleans-Metairie, LA",GET,Home,1.54103E+12,838,,200,1.5429E+12,101 88 | ,Logged In,Jordan,F,0,Rodriguez,,free,"Los Angeles-Long Beach-Anaheim, CA",GET,Home,1.54099E+12,523,,200,1.5429E+12,68 89 | Cherise,Logged In,Stefany,F,0,White,229.69424,free,"Lubbock, TX",PUT,NextSong,1.54071E+12,772,No Good 4 You,200,1.54291E+12,83 90 | ,Logged In,Ryan,M,0,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,835,,200,1.54291E+12,26 91 | Anna 
Waronker,Logged In,Jayden,F,0,Duffy,189.6224,free,"Seattle-Tacoma-Bellevue, WA",PUT,NextSong,1.54015E+12,662,Nothing Personal,200,1.54291E+12,76 92 | King Changó,Logged In,Cecilia,F,0,Owens,340.74077,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,763,Confesión,200,1.54292E+12,6 93 | Gang Of Four,Logged In,Cecilia,F,1,Owens,193.14893,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,763,I Found That Essence Rare,200,1.54292E+12,6 94 | Line Renaud,Logged In,Cecilia,F,2,Owens,176.16934,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,763,Le Soir,200,1.54292E+12,6 95 | ,Logged Out,,,0,,,paid,,PUT,Login,,823,,307,1.54292E+12, 96 | ,Logged In,Tegan,F,1,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.54079E+12,823,,200,1.54292E+12,80 97 | the bird and the bee,Logged In,Tegan,F,2,Levine,198.1122,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,823,Last Day Of Our Love,200,1.54292E+12,80 98 | ,Logged Out,,,0,,,paid,,GET,Home,,831,,200,1.54293E+12, 99 | ,Logged Out,,,1,,,paid,,GET,Home,,831,,200,1.54293E+12, 100 | -------------------------------------------------------------------------------- /Project 1B Data Modeling with Apache Cassandra/event_data/2018-11-25-events.csv: -------------------------------------------------------------------------------- 1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | matchbox twenty,Logged In,Jayden,F,0,Duffy,177.65832,free,"Seattle-Tacoma-Bellevue, WA",PUT,NextSong,1.54015E+12,846,Argue (LP Version),200,1.54311E+12,76 3 | The Lonely Island / T-Pain,Logged In,Jayden,F,1,Duffy,156.23791,free,"Seattle-Tacoma-Bellevue, WA",PUT,NextSong,1.54015E+12,846,I'm On A Boat,200,1.54311E+12,76 4 | ,Logged In,Jayden,F,2,Duffy,,free,"Seattle-Tacoma-Bellevue, WA",GET,Home,1.54015E+12,846,,200,1.54311E+12,76 5 | ,Logged In,Jayden,F,3,Duffy,,free,"Seattle-Tacoma-Bellevue, WA",GET,Settings,1.54015E+12,846,,200,1.54311E+12,76 6 | ,Logged In,Jayden,F,4,Duffy,,free,"Seattle-Tacoma-Bellevue, WA",PUT,Save Settings,1.54015E+12,846,,307,1.54311E+12,76 7 | John Mayer,Logged In,Wyatt,M,0,Scott,275.27791,free,"Eureka-Arcata-Fortuna, CA",PUT,NextSong,1.54087E+12,856,All We Ever Do Is Say Goodbye,200,1.54311E+12,9 8 | ,Logged In,Wyatt,M,1,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1.54087E+12,856,,200,1.54311E+12,9 9 | 10_000 Maniacs,Logged In,Wyatt,M,2,Scott,251.8722,free,"Eureka-Arcata-Fortuna, CA",PUT,NextSong,1.54087E+12,856,Gun Shy (LP Version),200,1.54311E+12,9 10 | Leona Lewis,Logged In,Chloe,F,0,Cuevas,203.88526,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Forgive Me,200,1.54312E+12,49 11 | Nine Inch Nails,Logged In,Chloe,F,1,Cuevas,277.83791,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,La Mer,200,1.54312E+12,49 12 | Audioslave,Logged In,Chloe,F,2,Cuevas,334.91546,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I Am The Highway,200,1.54312E+12,49 13 | Kid Rock,Logged In,Chloe,F,3,Cuevas,296.95955,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,All Summer Long (Album Version),200,1.54312E+12,49 14 | The Jets,Logged In,Chloe,F,4,Cuevas,220.89098,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I Do You,200,1.54312E+12,49 15 | The Gerbils,Logged In,Chloe,F,5,Cuevas,27.01016,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,(iii),200,1.54312E+12,49 16 | Damian Marley / Stephen Marley / Yami 
Bolo,Logged In,Chloe,F,6,Cuevas,304.69179,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Still Searching,200,1.54312E+12,49 17 | ,Logged In,Chloe,F,7,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,916,,200,1.54312E+12,49 18 | The Bloody Beetroots,Logged In,Chloe,F,8,Cuevas,201.97832,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Warp 1.9 (feat. Steve Aoki),200,1.54312E+12,49 19 | ,Logged In,Chloe,F,9,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,916,,200,1.54313E+12,49 20 | The Specials,Logged In,Chloe,F,10,Cuevas,188.81261,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Rat Race,200,1.54313E+12,49 21 | The Lively Ones,Logged In,Chloe,F,11,Cuevas,142.52363,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Walkin' The Board (LP Version),200,1.54313E+12,49 22 | Katie Melua,Logged In,Chloe,F,12,Cuevas,252.78649,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Blues In The Night,200,1.54313E+12,49 23 | Jason Mraz,Logged In,Chloe,F,13,Cuevas,243.48689,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I'm Yours (Album Version),200,1.54313E+12,49 24 | Fisher,Logged In,Chloe,F,14,Cuevas,133.98159,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Rianna,200,1.54313E+12,49 25 | Zee Avi,Logged In,Chloe,F,15,Cuevas,160.62649,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,No Christmas For Me,200,1.54313E+12,49 26 | Black Eyed Peas,Logged In,Chloe,F,16,Cuevas,289.12281,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I Gotta Feeling,200,1.54313E+12,49 27 | Emiliana Torrini,Logged In,Chloe,F,17,Cuevas,184.29342,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Sunny Road,200,1.54313E+12,49 28 | ,Logged In,Chloe,F,18,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,916,,200,1.54313E+12,49 29 | Days Of The New,Logged In,Chloe,F,19,Cuevas,258.5073,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,The Down Town,200,1.54313E+12,49 30 | Julio Iglesias duet with Willie Nelson,Logged In,Chloe,F,20,Cuevas,212.16608,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,To All The Girls I've Loved Before (With Julio Iglesias),200,1.54313E+12,49 31 | ,Logged In,Jacqueline,F,0,Lynch,,paid,"Atlanta-Sandy Springs-Roswell, GA",GET,Home,1.54022E+12,914,,200,1.54313E+12,29 32 | Jason Mraz & Colbie Caillat,Logged In,Chloe,F,0,Roth,189.6224,free,"Indianapolis-Carmel-Anderson, IN",PUT,NextSong,1.5407E+12,704,Lucky (Album Version),200,1.54314E+12,78 33 | ,Logged In,Anabelle,F,0,Simpson,,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",GET,Home,1.54104E+12,901,,200,1.54315E+12,69 34 | R. 
Kelly,Logged In,Anabelle,F,1,Simpson,234.39628,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,901,The World's Greatest,200,1.54315E+12,69 35 | ,Logged In,Kynnedi,F,0,Sanchez,,free,"Cedar Rapids, IA",GET,Home,1.54108E+12,804,,200,1.54315E+12,89 36 | Jacky Terrasson,Logged In,Marina,F,0,Sutton,342.7522,free,"Salinas, CA",PUT,NextSong,1.54106E+12,373,Le Jardin d'Hiver,200,1.54315E+12,48 37 | Papa Roach,Logged In,Theodore,M,0,Harris,202.1873,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,813,Alive,200,1.54316E+12,14 38 | Burt Bacharach,Logged In,Theodore,M,1,Harris,156.96934,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,813,Casino Royale Theme (Main Title),200,1.54316E+12,14 39 | ,Logged In,Chloe,F,0,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,923,,200,1.54316E+12,49 40 | Floetry,Logged In,Chloe,F,1,Cuevas,254.48444,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,Sunshine,200,1.54316E+12,49 41 | The Rakes,Logged In,Chloe,F,2,Cuevas,225.2273,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,Leave The City And Come Home,200,1.54316E+12,49 42 | Dwight Yoakam,Logged In,Chloe,F,3,Cuevas,239.3073,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,You're The One,200,1.54316E+12,49 43 | Ween,Logged In,Chloe,F,4,Cuevas,228.10077,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,Voodoo Lady,200,1.54316E+12,49 44 | Café Quijano,Logged In,Chloe,F,5,Cuevas,197.32853,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,La Lola,200,1.54316E+12,49 45 | ,Logged In,Chloe,F,0,Roth,,free,"Indianapolis-Carmel-Anderson, IN",GET,Home,1.5407E+12,925,,200,1.54317E+12,78 46 | Parov Stelar,Logged In,Chloe,F,1,Roth,203.65016,free,"Indianapolis-Carmel-Anderson, IN",PUT,NextSong,1.5407E+12,925,Good Bye Emily (feat. 
Gabriella Hanninen),200,1.54317E+12,78 47 | ,Logged In,Chloe,F,2,Roth,,free,"Indianapolis-Carmel-Anderson, IN",GET,Home,1.5407E+12,925,,200,1.54317E+12,78 48 | ,Logged In,Tegan,F,0,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.54079E+12,915,,200,1.54317E+12,80 49 | Bryan Adams,Logged In,Tegan,F,1,Levine,166.29506,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,I Will Always Return,200,1.54317E+12,80 50 | KT Tunstall,Logged In,Tegan,F,2,Levine,192.31302,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,White Bird,200,1.54317E+12,80 51 | Technicolour,Logged In,Tegan,F,3,Levine,235.12771,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,Turn Away,200,1.54317E+12,80 52 | The Dears,Logged In,Tegan,F,4,Levine,289.95873,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,Lost In The Plot,200,1.54317E+12,80 53 | Go West,Logged In,Tegan,F,5,Levine,259.49995,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,Never Let Them See You Sweat,200,1.54317E+12,80 54 | ,Logged In,Tegan,F,6,Levine,,paid,"Portland-South Portland, ME",PUT,Logout,1.54079E+12,915,,307,1.54317E+12,80 55 | ,Logged In,Sylvie,F,0,Cruz,,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",GET,Home,1.54027E+12,912,,200,1.54317E+12,10 56 | ,Logged Out,,,7,,,paid,,GET,Home,,915,,200,1.54317E+12, 57 | Gondwana,Logged In,Jordan,F,0,Hicks,262.5824,free,"Salinas, CA",PUT,NextSong,1.54001E+12,814,Mi Princesa,200,1.54319E+12,37 58 | ,Logged In,Kevin,M,0,Arellano,,free,"Harrisburg-Carlisle, PA",GET,Home,1.54001E+12,855,,200,1.54319E+12,66 59 | Ella Fitzgerald,Logged In,Jordan,F,1,Hicks,427.15383,free,"Salinas, CA",PUT,NextSong,1.54001E+12,814,On Green Dolphin Street (Medley) (1999 Digital Remaster),200,1.54319E+12,37 60 | Creedence Clearwater Revival,Logged In,Jordan,F,2,Hicks,184.73751,free,"Salinas, CA",PUT,NextSong,1.54001E+12,814,Run Through The Jungle,200,1.54319E+12,37 61 | -------------------------------------------------------------------------------- /Project 1B Data Modeling with Apache Cassandra/images/.ipynb_checkpoints/image_event_datafile_new-checkpoint.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/Project 1B Data Modeling with Apache Cassandra/images/.ipynb_checkpoints/image_event_datafile_new-checkpoint.jpg -------------------------------------------------------------------------------- /Project 1B Data Modeling with Apache Cassandra/images/image_event_datafile_new.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/Project 1B Data Modeling with Apache Cassandra/images/image_event_datafile_new.jpg -------------------------------------------------------------------------------- /Project 3 Create AWS Redshift Data Warehouse/README.md: -------------------------------------------------------------------------------- 1 | # DEND Project 3: Create AWS Redshift Data Warehouse 2 | 3 | ## Project Summary 4 | 5 | The objective of this project is to create a data warehouse in the cloud for a fictional music streaming service called Sparkify. Sparkify has grown their user base and song database and want to move their processes and data onto the cloud. 
Their data resides in Amazon S3 storage, in directories containing JSON logs on user activity and metadata on songs. 6 | 7 | As the data engineer assigned to the project, I have created a cluster on Redshift and built an ETL pipeline to populate it using the AWS SDK for Python. The ETL process extracts the data from S3, stages it in Redshift, then transforms the data into a set of dimensional tables in a star schema. To improve processing speeds for query analysis, I specified a distribution strategy for partitioning the tables across Redshift nodes using distribution keys and ordering the data using sort keys. 8 | 9 | ## How to Run 10 | 11 | Prerequisites: Configuration file with login details for an active AWS Redshift cluster and ARN for an IAM role with S3 read access. 12 | 13 | 1. Run sql_queries.py from terminal or Python console to load the table create and insert queries. 14 | 2. Run create_tables.py from terminal or Python console to create the staging and analytical tables. 15 | 3. Run etl.py from terminal or Python console to process and load data into the data warehouse. 16 | 17 | ## Description of Data 18 | 19 | The Sparkify database consists of five tables in the star schema shown below. The fact table is called `songplays`, and contains a record of each songplay event generated by users of the music streaming app. There are four dimension tables. They store largely normalized data on users, artists, songs and timestamps. 20 | 21 | ![Database schema diagram](database_schema_diagram.png) 22 | 23 | Prior to table optimization, the largest table in the database is `artists`, which uses 128 MB of storage. `songplays` uses 96 MB, while `time`, `songs` and `users` take up the least storage. 24 | 25 | ![Table sizes](table_sizes.png) 26 | 27 | ## Optimization of Table Design 28 | 29 | Redshift automatically partitions and stores database tables on multiple slices within the cluster. 30 | * Advantages: rapid and flexible scaling 31 | * Disadvantages: potentially decreased query performance when joins span slices 32 | 33 | Executing queries across different slices can increase copying and processing costs compared to an environment where all the data is located on a single machine. 34 | 35 | **I utilized Redshift's 'KEY' and 'ALL' distribution strategies in my table design.** 36 | 37 | Declaring a column as the distkey ensures that rows from two tables with the same distkey column value are stored on the same slice. 38 | 39 | Distkeys and sortkeys were selected as follows: 40 | * The distkey is `artist_id` because it is the join field between the fact table and the largest dimension table. 41 | * `users` and `songs` are distributed 'ALL' because they are the two smallest tables in the database. Their sortkeys are the fields that join to the fact table. 42 | * Both `time` and `songplays` are sorted on `start_time`, since I expect many of the analysts' queries to focus on the most recent data or on song trends over specific time periods. 43 | 44 | ### Optimization results 45 | 46 | The execution plan for my test query shows only one broadcast hash join (DS_BCAST_INNER), although all five tables are joined.
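One way to check this yourself is to ask Redshift for the plan with EXPLAIN. The sketch below is illustrative only: it reuses the dwh.cfg connection pattern from create_tables.py and etl.py, and the five-table join is a hypothetical analyst query, not necessarily the exact test query referred to above.

```python
# Minimal sketch: print the Redshift execution plan for a five-table star-schema join.
# Assumes the same dwh.cfg [CLUSTER] settings used elsewhere in this project.
import configparser
import psycopg2

config = configparser.ConfigParser()
config.read('dwh.cfg')

conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
cur = conn.cursor()

explain_query = """
EXPLAIN
SELECT t.year, t.month, u.level, a.name, s.title, COUNT(*) AS plays
FROM songplays sp
JOIN time t    ON sp.start_time = t.start_time
JOIN users u   ON sp.user_id = u.user_id
JOIN songs s   ON sp.song_id = s.song_id
JOIN artists a ON sp.artist_id = a.artist_id
GROUP BY t.year, t.month, u.level, a.name, s.title;
"""

cur.execute(explain_query)
# Each returned row is one plan step; DS_BCAST_INNER marks a broadcast join,
# while DS_DIST_NONE means no redistribution was needed for that join.
for (step,) in cur.fetchall():
    print(step)

conn.close()
```

Join steps labelled DS_DIST_NONE need no data movement because the joined rows are already collocated by distkey or replicated with diststyle ALL.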
47 | 48 | ![query execution](query_execution.png) 49 | -------------------------------------------------------------------------------- /Project 3 Create AWS Redshift Data Warehouse/create_tables.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import psycopg2 3 | from sql_queries import create_table_queries, drop_table_queries 4 | 5 | 6 | def drop_tables(cur, conn): 7 | """Run drop table queries.""" 8 | for query in drop_table_queries: 9 | cur.execute(query) 10 | conn.commit() 11 | 12 | 13 | def create_tables(cur, conn): 14 | """Run create table queries.""" 15 | for query in create_table_queries: 16 | cur.execute(query) 17 | conn.commit() 18 | 19 | 20 | def main(): 21 | """Create database and tables.""" 22 | config = configparser.ConfigParser() 23 | config.read('dwh.cfg') 24 | 25 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 26 | cur = conn.cursor() 27 | 28 | drop_tables(cur, conn) 29 | create_tables(cur, conn) 30 | 31 | conn.close() 32 | 33 | 34 | if __name__ == "__main__": 35 | main() -------------------------------------------------------------------------------- /Project 3 Create AWS Redshift Data Warehouse/database_schema_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/Project 3 Create AWS Redshift Data Warehouse/database_schema_diagram.png -------------------------------------------------------------------------------- /Project 3 Create AWS Redshift Data Warehouse/dwh.cfg: -------------------------------------------------------------------------------- 1 | [CLUSTER] 2 | HOST= 3 | DB_NAME= 4 | DB_USER= 5 | DB_PASSWORD= 6 | DB_PORT=5439 7 | 8 | [IAM_ROLE] 9 | ARN= 10 | 11 | [S3] 12 | LOG_DATA='s3://udacity-dend/log_data' 13 | LOG_JSONPATH='s3://udacity-dend/log_json_path.json' 14 | SONG_DATA='s3://udacity-dend/song_data' 15 | 16 | [GEO] 17 | REGION='us-west-2' -------------------------------------------------------------------------------- /Project 3 Create AWS Redshift Data Warehouse/etl.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import psycopg2 3 | from sql_queries import copy_table_queries, insert_table_queries 4 | 5 | 6 | def load_staging_tables(cur, conn): 7 | """Stage data from S3 into tables in Redshift.""" 8 | for query in copy_table_queries: 9 | cur.execute(query) 10 | conn.commit() 11 | 12 | 13 | def insert_tables(cur, conn): 14 | """Transform data from staging tables into analytical tables in star schema.""" 15 | for query in insert_table_queries: 16 | cur.execute(query) 17 | conn.commit() 18 | 19 | 20 | def main(): 21 | """Load staging tables from S3 then transform into analytical tables.""" 22 | config = configparser.ConfigParser() 23 | config.read('dwh.cfg') 24 | 25 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 26 | cur = conn.cursor() 27 | 28 | load_staging_tables(cur, conn) 29 | insert_tables(cur, conn) 30 | 31 | conn.close() 32 | 33 | 34 | if __name__ == "__main__": 35 | main() -------------------------------------------------------------------------------- /Project 3 Create AWS Redshift Data Warehouse/query_execution.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/Project 3 Create AWS Redshift Data Warehouse/query_execution.png -------------------------------------------------------------------------------- /Project 3 Create AWS Redshift Data Warehouse/sql_queries.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | 3 | 4 | # CONFIG 5 | config = configparser.ConfigParser() 6 | config.read('dwh.cfg') 7 | LOG_DATA = config.get('S3', 'LOG_DATA') 8 | SONG_DATA = config.get('S3', 'SONG_DATA') 9 | ARN = config.get('IAM_ROLE', 'ARN') 10 | LOG_JSONPATH = config.get('S3', 'LOG_JSONPATH') 11 | REGION = config.get('GEO', 'REGION') 12 | 13 | # DROP TABLES 14 | 15 | staging_events_table_drop = "DROP TABLE IF EXISTS staging_events" 16 | staging_songs_table_drop = "DROP TABLE IF EXISTS staging_songs" 17 | songplay_table_drop = "DROP TABLE IF EXISTS songplays" 18 | user_table_drop = "DROP TABLE IF EXISTS users" 19 | song_table_drop = "DROP TABLE IF EXISTS songs" 20 | artist_table_drop = "DROP TABLE IF EXISTS artists" 21 | time_table_drop = "DROP TABLE IF EXISTS time" 22 | 23 | # CREATE TABLES 24 | 25 | staging_events_table_create= ("""CREATE TABLE IF NOT EXISTS staging_events ( 26 | artist text, 27 | auth text, 28 | firstName text, 29 | gender text, 30 | ItemInSession int, 31 | lastName text, 32 | length float8, 33 | level text, 34 | location text, 35 | method text, 36 | page text, 37 | registration text, 38 | sessionId int, 39 | song text, 40 | status int, 41 | ts timestamp, 42 | userAgent text, 43 | userId int) 44 | """) 45 | 46 | staging_songs_table_create = ("""CREATE TABLE IF NOT EXISTS staging_songs ( 47 | song_id text PRIMARY KEY, 48 | artist_id text, 49 | artist_latitude float8, 50 | artist_location text, 51 | artist_longitude float8, 52 | artist_name text, 53 | duration float8, 54 | num_songs int, 55 | title text, 56 | year int) 57 | """) 58 | 59 | songplay_table_create = ("""CREATE TABLE IF NOT EXISTS songplays ( 60 | songplay_id int IDENTITY PRIMARY KEY, 61 | start_time timestamp NOT NULL REFERENCES time(start_time) sortkey, 62 | user_id int NOT NULL REFERENCES users(user_id), 63 | level text NOT NULL, 64 | song_id text NOT NULL REFERENCES songs(song_id), 65 | artist_id text NOT NULL REFERENCES artists(artist_id) distkey, 66 | session_id int NOT NULL, 67 | location text NOT NULL, 68 | user_agent text NOT NULL) 69 | """) 70 | 71 | user_table_create = ("""CREATE TABLE IF NOT EXISTS users ( 72 | user_id int PRIMARY KEY sortkey, 73 | first_name text NOT NULL, 74 | last_name text NOT NULL, 75 | gender text NOT NULL, 76 | level text NOT NULL) 77 | diststyle ALL; 78 | """) 79 | 80 | song_table_create = ("""CREATE TABLE IF NOT EXISTS songs ( 81 | song_id text PRIMARY KEY sortkey, 82 | title text NOT NULL, 83 | artist_id text NOT NULL, 84 | year int NOT NULL, 85 | duration numeric NOT NULL) 86 | diststyle ALL; 87 | """) 88 | 89 | artist_table_create = ("""CREATE TABLE IF NOT EXISTS artists ( 90 | artist_id text PRIMARY KEY distkey, 91 | name text NOT NULL, 92 | location text NOT NULL, 93 | lattitude float8 NOT NULL, 94 | longitude float8 NOT NULL) 95 | """) 96 | 97 | time_table_create = ("""CREATE TABLE IF NOT EXISTS time ( 98 | start_time timestamp PRIMARY KEY sortkey, 99 | hour int NOT NULL, 100 | day int NOT NULL, 101 | week int NOT NULL, 102 | month int NOT NULL, 103 | year int NOT NULL, 104 | weekday int NOT NULL) 105 | """) 106 | 107 | # STAGING TABLES 108 | 109 | 
staging_events_copy = ("""COPY staging_events FROM {} iam_role {} region {} FORMAT AS JSON {} timeformat 'epochmillisecs'; 110 | """).format(LOG_DATA, ARN, REGION, LOG_JSONPATH) 111 | 112 | staging_songs_copy = ("""COPY staging_songs FROM {} iam_role {} region {} FORMAT AS JSON 'auto'; 113 | """).format(SONG_DATA, ARN, REGION) 114 | 115 | # FINAL TABLES 116 | 117 | songplay_table_insert = ("""INSERT INTO songplays (start_time, user_id, level, song_id, artist_id, session_id, location, user_agent) 118 | SELECT DISTINCT se.ts, se.userId, se.level, ss.song_id, ss.artist_id, se.sessionId, se.location, se.userAgent 119 | FROM staging_events se 120 | INNER JOIN staging_songs ss 121 | ON se.song = ss.title 122 | WHERE se.page = 'NextSong' 123 | """) 124 | 125 | user_table_insert = ("""INSERT INTO users (user_id, first_name, last_name, gender, level) 126 | SELECT DISTINCT se.userId, se.firstName, se.lastName, se.gender, se.level 127 | FROM staging_events se 128 | WHERE se.userId IS NOT NULL 129 | """) 130 | 131 | song_table_insert = ("""INSERT INTO songs (song_id, title, artist_id, year, duration) 132 | SELECT DISTINCT ss.song_id, ss.title, ss.artist_id, ss.year, ss.duration 133 | FROM staging_songs ss 134 | """) 135 | 136 | artist_table_insert = ("""INSERT INTO artists (artist_id, name, location, lattitude, longitude) 137 | SELECT DISTINCT ss.artist_id, 138 | ss.artist_name, 139 | CASE WHEN ss.artist_location IS NULL THEN 'N/A' ELSE ss.artist_location END, 140 | CASE WHEN ss.artist_latitude IS NULL THEN 0.0 ELSE ss.artist_latitude END, 141 | CASE WHEN ss.artist_longitude IS NULL THEN 0.0 ELSE ss.artist_longitude END 142 | FROM staging_songs ss 143 | WHERE ss.artist_id IS NOT NULL 144 | """) 145 | 146 | time_table_insert = ("""INSERT INTO time (start_time, hour, day, week, month, year, weekday) 147 | SELECT DISTINCT 148 | se.ts, 149 | CAST(DATE_PART('hour', se.ts) as Integer), 150 | CAST(DATE_PART('day', se.ts) as Integer), 151 | CAST(DATE_PART('week', se.ts) as Integer), 152 | CAST(DATE_PART('month', se.ts) as Integer), 153 | CAST(DATE_PART('year', se.ts) as Integer), 154 | CAST(DATE_PART('dow', se.ts) as Integer) 155 | FROM staging_events se 156 | WHERE se.page = 'NextSong' 157 | """) 158 | 159 | # QUERY LISTS 160 | 161 | create_table_queries = [staging_events_table_create, staging_songs_table_create, user_table_create, song_table_create, artist_table_create, time_table_create, songplay_table_create] 162 | drop_table_queries = [staging_events_table_drop, staging_songs_table_drop, songplay_table_drop, user_table_drop, song_table_drop, artist_table_drop, time_table_drop] 163 | copy_table_queries = [staging_events_copy, staging_songs_copy] 164 | insert_table_queries = [songplay_table_insert, user_table_insert, song_table_insert, artist_table_insert, time_table_insert] 165 | -------------------------------------------------------------------------------- /Project 3 Create AWS Redshift Data Warehouse/table_sizes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/Project 3 Create AWS Redshift Data Warehouse/table_sizes.png -------------------------------------------------------------------------------- /Project 4 Data Lake/README.md: -------------------------------------------------------------------------------- 1 | # DEND Project 4: Data Lake 2 | 3 | ## Project Summary 4 | 5 | A fictional music streaming startup, Sparkify, has grown their user base 
and wants to move their data warehouse to a data lake. Their data resides in S3, in a directory of JSON logs on user activity on the app, as well as a directory with JSON metadata on the songs in their app. 6 | 7 | As their data engineer, I have been tasked with building an ETL pipeline that extracts the data from S3, processes it using Spark, and loads the data back into S3 as a set of dimensional tables. I have written a script in Python that uses Spark SQL and Spark Data Frames to execute the following: 8 | 9 | * create a Spark session using the Apache Hadoop Amazon Web Services Support module 10 | * load AWS access keys into environment variables 11 | * ingest log files and song data files from S3 12 | * clean and process the data: 13 | * add unique row identifiers to all fact and dimension tables 14 | * remove duplicate rows 15 | * impute nulls to desired values 16 | * parse timestamps into time and date components 17 | * create the fact and dimension tables listed below 18 | * write the final set of tables to S3 19 | 20 | This project was submitted for Udacity's Data Engineering Nanodegree (DEND) in Spring 2019. 21 | 22 | ## How to Use 23 | 24 | Run etl.py from terminal or Python console. 25 | 26 | ## Files in Repository 27 | 28 | The tables produced by the ETL process are called songplays, users, artists, songs and time. Each of the five tables is written to parquet files in a separate analytics directory on S3. Each table has its own folder within the directory. Songs table files are partitioned by year and then artist. Time table files are partitioned by year and month. Songplays table files are partitioned by year and month. 29 | 30 | #### Fact Table 31 | 32 | **songplays** - records in log data associated with song plays i.e. records with page NextSong 33 | * *songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent* 34 | 35 | #### Dimension Tables 36 | 37 | **users** - users in the app 38 | * *user_id, first_name, last_name, gender, level* 39 | 40 | **songs** - songs in music database 41 | * *song_id, title, artist_id, year, duration* 42 | 43 | **artists** - artists in music database 44 | * *artist_id, name, location, lattitude, longitude* 45 | 46 | **time** - timestamps of records in songplays broken down into specific units 47 | * *start_time, hour, day, week, month, year, weekday* 48 | 49 | ## Discussion 50 | 51 | One of the primary challenges in this assignment was learning to deal with extremely long data transfer times during development and testing. My first attempt to use S3 failed to complete a table write of less than 20 MB over a period of eight hours. 52 | 53 | Some strategies that I used to deal with slow loading times included: 54 | * Developing the ETL initially on a small set of sample files in a standalone workspace offered by the course provider. 55 | * Printing frequent updates to the console to indicate where in the ETL process progress was getting stuck. 56 | * Testing the script on a small subset of the log and song data in the S3 repository (see the sketch at the end of this section). 57 | 58 | Other students/mentors suggested running the script from Amazon EMR, a managed service for data processing that supports Spark as well as other big data frameworks. This would have been a good approach for a production project. 59 | 60 | For future reference, Amazon provides [guidelines](https://docs.aws.amazon.com/AmazonS3/latest/dev/optimizing-perforance-guidelines.html) for optimizing data transfer performance when using S3.
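As a concrete illustration of the subset-testing strategy above, the sketch below drives the same process_song_data and process_log_data functions against a small local sample and times the run. The paths are assumptions rather than part of the original project: any local directory laid out the way etl.py expects (song_data/*/*/*/*.json and log_data/*.json) will do.

```python
# Minimal sketch: smoke-test the ETL on a small local sample before pointing it at S3.
# 'data/' and 'output/' are placeholder paths, not part of the original project.
from datetime import datetime
from etl import create_spark_session, process_song_data, process_log_data

spark = create_spark_session()
input_data = "data/"      # local sample with song_data/ and log_data/ subfolders
output_data = "output/"   # local directory to receive the parquet tables

start = datetime.now()
process_song_data(spark, input_data, output_data)
process_log_data(spark, input_data, output_data)   # reads the songs/artists parquet written above
print('sample ETL run completed in {}'.format(datetime.now() - start))
```

Running against a local sample like this surfaces schema and join problems in minutes rather than hours, before any S3 transfer time is spent.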
61 | -------------------------------------------------------------------------------- /Project 4 Data Lake/dl.cfg: -------------------------------------------------------------------------------- 1 | [AWS] 2 | AWS_ACCESS_KEY_ID= 3 | AWS_SECRET_ACCESS_KEY= 4 | -------------------------------------------------------------------------------- /Project 4 Data Lake/etl.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | from datetime import datetime 3 | import os 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql.functions import udf, col 6 | import pyspark.sql.functions as F 7 | 8 | 9 | config = configparser.ConfigParser() 10 | config.read('dl.cfg') 11 | 12 | os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID'] 13 | os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY'] 14 | 15 | 16 | def create_spark_session(): 17 | """Create the Spark session""" 18 | spark = SparkSession \ 19 | .builder \ 20 | .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \ 21 | .getOrCreate() 22 | return spark 23 | 24 | 25 | def process_song_data(spark, input_data, output_data): 26 | """Create songs and artists tables from song data. 27 | 28 | Load song files from S3, process data into songs and artists tables, and write tables to partitioned parquet files on S3. 29 | 30 | Parameters: 31 | spark - an active Spark session 32 | input_data - path to S3 bucket with input data 33 | output_data - path to S3 bucket to store output tables 34 | 35 | Returns: None 36 | 37 | """ 38 | # get filepath to song data file 39 | song_data = os.path.join(input_data, 'song_data/*/*/*/*.json') 40 | 41 | # read song data file 42 | print('starting read of song_data json files: ' + str(datetime.now())) 43 | df = spark.read.json(song_data) 44 | print('loading of song_data files complete: ' + str(datetime.now())) 45 | 46 | # extract columns to create songs 47 | songs_table = df.select(["song_id", "title", "artist_id", "year", "duration"]) \ 48 | .dropDuplicates() 49 | 50 | # write songs table to parquet files partitioned by year and artist 51 | songs_path = os.path.join(output_data, 'songs') 52 | print('writing songs table to S3: ' + str(datetime.now())) 53 | songs_table.write.parquet(songs_path, mode='overwrite', partitionBy=["year", "artist_id"]) 54 | print('write of songs table to S3 complete: ' + str(datetime.now())) 55 | 56 | # extract columns to create artists table artist_id, name, location, lattitude, longitude 57 | artists_table = df.selectExpr(["artist_id", 58 | "artist_name", 59 | "coalesce(nullif(artist_location, ''), 'N/A') as location", 60 | "coalesce(artist_latitude, 0.0) as latitude", 61 | "coalesce(artist_longitude, 0.0) as longitude"]) \ 62 | .dropDuplicates() 63 | 64 | # write artists table to parquet files 65 | artists_path = os.path.join(output_data, 'artists') 66 | print('writing artists table to S3: ' + str(datetime.now())) 67 | artists_table.write.parquet(artists_path, mode='overwrite') 68 | print('write of artists table to S3 complete: ' + str(datetime.now())) 69 | 70 | 71 | def process_log_data(spark, input_data, output_data): 72 | """Create users, time and songplays tables. 73 | 74 | Load log files and input tables from S3, process data into output table formats, and write tables to partitioned parquet files on S3. 
75 | 76 | Parameters: 77 | spark - an active Spark session 78 | input_data - path to S3 bucket with input data 79 | output_data - path to S3 bucket to store output tables 80 | 81 | Returns: None 82 | 83 | """ 84 | # get filepath to log data file 85 | log_data = os.path.join(input_data, 'log_data/*.json') 86 | 87 | # read log data file 88 | print('reading log data from S3: ' + str(datetime.now())) 89 | df = spark.read.json(log_data) 90 | print('loading of log data from S3 complete: ' + str(datetime.now())) 91 | 92 | # filter by actions for song plays 93 | df = df.filter(df.page == 'NextSong') 94 | 95 | # extract columns for users table 96 | users_table = df.selectExpr(["userId as user_id", 97 | "firstName as first_name", 98 | "lastName as last_name", 99 | "gender", 100 | "level"]) \ 101 | .dropDuplicates() 102 | 103 | # write users table to parquet files 104 | users_path = os.path.join(output_data, 'users') 105 | print('writing users table to S3: ' + str(datetime.now())) 106 | users_table.write.parquet(users_path, mode='overwrite') 107 | print('write of users table to S3 complete: ' + str(datetime.now())) 108 | 109 | # create timestamp column from original timestamp column 110 | df = df.withColumn("log_timestamp", F.to_timestamp(df.ts/1000)) 111 | 112 | # create date column from the timestamp column 113 | df = df.withColumn("log_datetime", F.to_date(df.log_timestamp)) 114 | 115 | # extract columns to create time table start_time, hour, day, week, month, year, weekday 116 | time_table = df.selectExpr(["log_timestamp as start_time", 117 | "hour(log_timestamp) as hour", 118 | "dayofmonth(log_datetime) as day", 119 | "weekofyear(log_datetime) as week", 120 | "month(log_datetime) as month", 121 | "year(log_datetime) as year", 122 | "dayofweek(log_datetime) as weekday"]) \ 123 | .dropDuplicates() 124 | 125 | # write time table to parquet files partitioned by year and month 126 | time_path = os.path.join(output_data, 'time') 127 | print('writing time table to S3 partitioned by year and month: ' + str(datetime.now())) 128 | time_table.write.parquet(time_path, mode='append', partitionBy=["year", "month"]) 129 | print('write of time table to S3 complete: ' + str(datetime.now())) 130 | 131 | # read in song data to use for songplays table 132 | print('reading songs table from S3: ' + str(datetime.now())) 133 | songs_path = os.path.join(output_data, 'songs') 134 | song_df = spark.read.parquet(songs_path) 135 | print('loading of songs table from S3 complete: ' + str(datetime.now())) 136 | song_df = song_df.withColumnRenamed("artist_id", "songs_artist_id") 137 | 138 | # read in artists data to use for songplays table 139 | artists_path = os.path.join(output_data, 'artists') 140 | print('reading artists table from S3: ' + str(datetime.now())) 141 | artists_df = spark.read.parquet(artists_path) 142 | print('loading of artists table from S3 complete: ' + str(datetime.now())) 143 | artists_df = artists_df.withColumnRenamed("artist_id", "artists_artist_id") \ 144 | .withColumnRenamed("location", "artist_location") 145 | 146 | # extract columns from joined song and log datasets to create songplays table 147 | print('creating songplays table: ' + str(datetime.now())) 148 | songplays_table = df.select(df.log_timestamp.alias("start_time"), 149 | df.userId.alias("user_id"), 150 | "level", 151 | "song", 152 | "artist", 153 | df.sessionId.alias("session_id"), 154 | "location", 155 | df.userAgent.alias("user_agent")) \ 156 | .join(song_df, df.song==song_df.title, 'left_outer') \ 157 | .join(artists_df,
df.artist==artists_df.artist_name, 'left_outer') \ 158 | .selectExpr("start_time", 159 | "user_id", 160 | "level", 161 | "song_id", 162 | "coalesce(artists_artist_id, songs_artist_id) as artist_id", 163 | "session_id", 164 | "location", 165 | "user_agent", 166 | "year(start_time) as year", 167 | "month(start_time) as month") \ 168 | .dropDuplicates() \ 169 | .withColumn('songplay_id', F.monotonically_increasing_id()) 170 | 171 | # write songplays table to parquet files partitioned by year and month 172 | songplays_path = os.path.join(output_data, 'songplays') 173 | 174 | print('writing songplays table to S3: ' + str(datetime.now())) 175 | songplays_table.write.parquet(songplays_path, mode='overwrite', partitionBy=["year", "month"]) 176 | print('write of songplays table to S3 complete: ' + str(datetime.now())) 177 | 178 | 179 | def main(): 180 | print('creating spark session: ' + str(datetime.now())) 181 | spark = create_spark_session() 182 | input_data = "s3a://udacity-dend/" 183 | output_data = "s3a://dend-rbmayer/" 184 | 185 | print('starting song data processing: ' + str(datetime.now())) 186 | process_song_data(spark, input_data, output_data) 187 | print('song data processing complete: ' + str(datetime.now())) 188 | print('starting log data processing: ' + str(datetime.now())) 189 | process_log_data(spark, input_data, output_data) 190 | print('ETL complete: ' + str(datetime.now())) 191 | 192 | 193 | if __name__ == "__main__": 194 | main() 195 | -------------------------------------------------------------------------------- /Project 5 Data Pipelines with Airflow/README.md: -------------------------------------------------------------------------------- 1 | # DEND Project 5: Data Pipelines with Airflow 2 | 3 | ## Project Summary 4 | 5 | A fictional music streaming company, Sparkify, has decided that it is time to introduce more automation and monitoring to their data warehouse ETL pipelines. They have come to the conclusion that the best tool to achieve this is Apache Airflow. 6 | 7 | The source data resides in Amazon S3 buckets and needs to be processed in Sparkify's data warehouse in Amazon Redshift. The source datasets may consist of CSV or JSON logs that record user activity in the application and store metadata about the songs that have been played. 8 | 9 | For this project, I have created a high grade data pipeline using the Airflow python API. The pipeline is dynamic, built from reusable tasks, can be monitored, allows easy backfills, and conducts automated data quality checks. 10 | 11 | This project was submitted for Udacity's Data Engineering Nanodegree (DEND) in July 2019. 12 | 13 | ## How to Run 14 | 15 | Prerequisites: Access to AWS credentials and an Amazon Redshift cluster. 16 | 17 | 1. Put project files in their respective folders in an Airflow installation. 18 | 2. Adjust parameters in the DAG script, udac_example_dag.py, as desired. 19 | 3. Create aws_credentials and redshift connections in Airflow. 20 | 3. Launch udac_example_dag from the Airflow UI. 21 | 22 | ## Files in Repository 23 | 24 | ```/airflow/dags/udac_example_dag.py``` DAG definition file. Calls Operators to stage data to redshift, populate the data warehouse and run data quality checks. 25 | 26 | ```/airflow/create_tables.sql``` Optional script to create staging and data warehouse tables in Redshift. 27 | 28 | ```/airflow/plugins/operators/stage_redshift.py``` Defines the custom operator **StageToRedshiftOperator**. This operator loads data from S3 to staging tables in redshift. 
User may specify csv or JSON file format. Csv file options include delimiter and whether to ignore headers. JSON options include automatic parsing or use of JSONpaths file in the COPY command. 29 | 30 | ```/airflow/plugins/operators/load_fact.py``` Defines the custom operator **LoadFactOperator**. This operator appends data from staging tables into the main fact table. 31 | 32 | ```/airflow/plugins/operators/load_dimension.py``` Defines the custom operator **LoadDimensionOperator**. This operator loads data into dimension tables from staging tables. Update mode can set to 'insert' or 'overwrite'. 33 | 34 | ```/airflow/plugins/operators/data_quality.py``` Defines the custom operator **DataQualityOperator**. This operator performs any number of data quality checks at the end of the pipeline run. The project provides two pre-defined checks in the helper files: 35 | * empty_table_check: raises task error on finding 0 rows in user-specified table 36 | * songplay_id_check: raises task error when a duplicate primary key (songplay_id) is detected in the fact table 37 | 38 | Users can specify their own data quality checks by entering the SQL query, table name and expected results as parameters. 39 | -------------------------------------------------------------------------------- /Project 5 Data Pipelines with Airflow/airflow/create_tables.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE public.artists ( 2 | artistid varchar(256) NOT NULL, 3 | name varchar(256), 4 | location varchar(256), 5 | lattitude numeric(18,0), 6 | longitude numeric(18,0) 7 | ); 8 | 9 | CREATE TABLE public.songplays ( 10 | songplay_id varchar(32) NOT NULL, 11 | start_time timestamp NOT NULL, 12 | userid int4 NOT NULL, 13 | "level" varchar(256), 14 | song_id varchar(256), 15 | artist_id varchar(256), 16 | sessionid int4, 17 | location varchar(256), 18 | user_agent varchar(256), 19 | CONSTRAINT songplays_pkey PRIMARY KEY (songplay_id) 20 | ); 21 | 22 | CREATE TABLE public.songs ( 23 | songid varchar(256) NOT NULL, 24 | title varchar(256), 25 | artistid varchar(256), 26 | "year" int4, 27 | duration numeric(18,0), 28 | CONSTRAINT songs_pkey PRIMARY KEY (songid) 29 | ); 30 | 31 | CREATE TABLE public.staging_events ( 32 | artist varchar(256), 33 | auth varchar(256), 34 | firstname varchar(256), 35 | gender varchar(256), 36 | iteminsession int4, 37 | lastname varchar(256), 38 | length numeric(18,0), 39 | "level" varchar(256), 40 | location varchar(256), 41 | "method" varchar(256), 42 | page varchar(256), 43 | registration numeric(18,0), 44 | sessionid int4, 45 | song varchar(256), 46 | status int4, 47 | ts int8, 48 | useragent varchar(256), 49 | userid int4 50 | ); 51 | 52 | CREATE TABLE public.staging_songs ( 53 | num_songs int4, 54 | artist_id varchar(256), 55 | artist_name varchar(256), 56 | artist_latitude numeric(18,0), 57 | artist_longitude numeric(18,0), 58 | artist_location varchar(256), 59 | song_id varchar(256), 60 | title varchar(256), 61 | duration numeric(18,0), 62 | "year" int4 63 | ); 64 | 65 | CREATE TABLE public.users ( 66 | userid int4 NOT NULL, 67 | first_name varchar(256), 68 | last_name varchar(256), 69 | gender varchar(256), 70 | "level" varchar(256), 71 | CONSTRAINT users_pkey PRIMARY KEY (userid) 72 | ); 73 | 74 | CREATE TABLE IF NOT EXISTS public.time ( 75 | start_time timestamp, 76 | hour int NOT NULL, 77 | day int NOT NULL, 78 | week int NOT NULL, 79 | month int NOT NULL, 80 | year int NOT NULL, 81 | weekday int NOT NULL, 82 | CONSTRAINT time_pkey 
PRIMARY KEY (start_time) 83 | ); 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /Project 5 Data Pipelines with Airflow/airflow/dags/udac_example_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import os 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators import (StageToRedshiftOperator, LoadFactOperator, LoadDimensionOperator, DataQualityOperator) 6 | from helpers import SqlQueries, DataChecks 7 | 8 | default_args = { 9 | 'owner': 'udacity', 10 | 'start_date': datetime(2018, 11, 1), 11 | 'end_date': datetime(2018, 11, 2), 12 | 'depends_on_past': False, 13 | 'retries': 3, 14 | 'retry_delay': timedelta(minutes=5), 15 | 'catchup': False, 16 | 'email_on_retry': False, 17 | } 18 | 19 | dag = DAG('udac_example_dag', 20 | default_args=default_args, 21 | description='Load and transform data in Redshift with Airflow', 22 | schedule_interval='0 * * * *' # hourly schedule 23 | ) 24 | 25 | start_operator = DummyOperator(task_id='Begin_execution', dag=dag) 26 | 27 | stage_events_to_redshift = StageToRedshiftOperator( 28 | task_id='Stage_events', 29 | redshift_conn_id="redshift", 30 | aws_credentials_id="aws_credentials", 31 | table="staging_events", 32 | s3_bucket="udacity-dend", 33 | s3_key="log_data/{execution_date.year}/{execution_date.month}/", 34 | data_format="json", 35 | dag=dag, 36 | provide_context=True, 37 | ) 38 | 39 | stage_songs_to_redshift = StageToRedshiftOperator( 40 | task_id='Stage_songs', 41 | redshift_conn_id="redshift", 42 | aws_credentials_id="aws_credentials", 43 | table="staging_songs", 44 | s3_bucket="udacity-dend", 45 | s3_key="song_data", 46 | ignore_headers="0", 47 | data_format="json", 48 | dag=dag, 49 | provide_context=True 50 | ) 51 | 52 | load_songplays_table = LoadFactOperator( 53 | task_id='Load_songplays_fact_table', 54 | redshift_conn_id="redshift", 55 | destination_table="songplays", 56 | sql_statement=SqlQueries.songplay_table_insert, 57 | dag=dag 58 | ) 59 | 60 | load_user_dimension_table = LoadDimensionOperator( 61 | task_id='Load_user_dim_table', 62 | redshift_conn_id="redshift", 63 | destination_table="users", 64 | sql_statement=SqlQueries.user_table_insert, 65 | update_mode="insert", 66 | dag=dag 67 | ) 68 | 69 | load_song_dimension_table = LoadDimensionOperator( 70 | task_id='Load_song_dim_table', 71 | redshift_conn_id="redshift", 72 | destination_table="songs", 73 | sql_statement=SqlQueries.song_table_insert, 74 | update_mode="insert", 75 | dag=dag 76 | ) 77 | 78 | load_artist_dimension_table = LoadDimensionOperator( 79 | task_id='Load_artist_dim_table', 80 | redshift_conn_id="redshift", 81 | destination_table="artists", 82 | sql_statement=SqlQueries.artist_table_insert, 83 | update_mode="insert", 84 | dag=dag 85 | ) 86 | 87 | load_time_dimension_table = LoadDimensionOperator( 88 | task_id='Load_time_dim_table', 89 | redshift_conn_id="redshift", 90 | destination_table="time", 91 | sql_statement=SqlQueries.time_table_insert, 92 | update_mode="insert", 93 | dag=dag 94 | ) 95 | 96 | run_quality_checks = DataQualityOperator( 97 | task_id='Run_data_quality_checks', 98 | dag=dag, 99 | redshift_conn_id="redshift", 100 | data_check_query=[DataChecks.empty_table_check, 101 | DataChecks.empty_table_check, 102 | DataChecks.empty_table_check, 103 | DataChecks.songplay_id_check], 104 | table=['staging_events', 'staging_songs', 'songplays', ""], 105 | 
expected_result=[1, 1, 1, 1] 106 | ) 107 | 108 | end_operator = DummyOperator(task_id='Stop_execution', dag=dag) 109 | 110 | 111 | start_operator >> stage_events_to_redshift 112 | start_operator >> stage_songs_to_redshift 113 | stage_events_to_redshift >> load_songplays_table 114 | stage_songs_to_redshift >> load_songplays_table 115 | load_songplays_table >> load_user_dimension_table 116 | load_songplays_table >> load_song_dimension_table 117 | load_songplays_table >> load_artist_dimension_table 118 | load_songplays_table >> load_time_dimension_table 119 | load_user_dimension_table >> run_quality_checks 120 | load_song_dimension_table >> run_quality_checks 121 | load_artist_dimension_table >> run_quality_checks 122 | load_time_dimension_table >> run_quality_checks 123 | run_quality_checks >> end_operator -------------------------------------------------------------------------------- /Project 5 Data Pipelines with Airflow/airflow/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | from airflow.plugins_manager import AirflowPlugin 4 | 5 | import operators 6 | import helpers 7 | 8 | # Defining the plugin class 9 | class UdacityPlugin(AirflowPlugin): 10 | name = "udacity_plugin" 11 | operators = [ 12 | operators.StageToRedshiftOperator, 13 | operators.LoadFactOperator, 14 | operators.LoadDimensionOperator, 15 | operators.DataQualityOperator 16 | ] 17 | helpers = [ 18 | helpers.SqlQueries 19 | ] 20 | -------------------------------------------------------------------------------- /Project 5 Data Pipelines with Airflow/airflow/plugins/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from helpers.sql_queries import SqlQueries 2 | from helpers.data_quality_checks import DataChecks 3 | 4 | __all__ = [ 5 | 'SqlQueries', 6 | 'DataChecks' 7 | ] -------------------------------------------------------------------------------- /Project 5 Data Pipelines with Airflow/airflow/plugins/helpers/data_quality_checks.py: -------------------------------------------------------------------------------- 1 | class DataChecks: 2 | """Store data quality check statements and expected results""" 3 | empty_table_check = """SELECT (CASE WHEN COUNT(*) > 0 THEN COUNT(*)/COUNT(*) ELSE 0 END) as row_flag from {}""" 4 | empty_table_check_expected_result = 1 5 | 6 | songplay_id_check = (""" 7 | SELECT MAX(total.songplay_id_count) as max_count 8 | FROM (SELECT COUNT(songplay_id) as songplay_id_count 9 | FROM songplays 10 | GROUP BY songplay_id) total 11 | """) 12 | songplay_id_check_expected_result = 1 13 | -------------------------------------------------------------------------------- /Project 5 Data Pipelines with Airflow/airflow/plugins/helpers/sql_queries.py: -------------------------------------------------------------------------------- 1 | class SqlQueries: 2 | """Store SQL statements for DAG""" 3 | songplay_table_insert = (""" 4 | SELECT 5 | md5(song_id || CAST(events.start_time as VARCHAR) || nvl(CAST(events.userid as VARCHAR), '-9999')) as songplay_id, 6 | events.start_time, 7 | nvl(events.userid, -9999), 8 | events.level, 9 | songs.song_id, 10 | songs.artist_id, 11 | events.sessionid, 12 | events.location, 13 | events.useragent 14 | FROM (SELECT TIMESTAMP 'epoch' + ts/1000 * interval '1 second' AS start_time, * 15 | FROM staging_events 16 | WHERE page='NextSong' 17 | ) events 18 | LEFT JOIN staging_songs songs 19 | ON events.song = 
songs.title 20 | AND events.artist = songs.artist_name 21 | AND events.length = songs.duration 22 | WHERE events.start_time IS NOT NULL 23 | AND song_id IS NOT NULL 24 | """) 25 | 26 | user_table_insert = (""" 27 | SELECT distinct userid, firstname, lastname, gender, level 28 | FROM staging_events 29 | WHERE page='NextSong' 30 | AND userid IS NOT NULL 31 | """) 32 | 33 | song_table_insert = (""" 34 | SELECT distinct song_id, title, artist_id, year, duration 35 | FROM staging_songs 36 | WHERE song_id IS NOT NULL 37 | """) 38 | 39 | artist_table_insert = (""" 40 | SELECT distinct artist_id, artist_name, artist_location, artist_latitude, artist_longitude 41 | FROM staging_songs 42 | WHERE artist_id IS NOT NULL 43 | """) 44 | 45 | time_table_insert = (""" 46 | SELECT start_time, extract(hour from start_time), extract(day from start_time), extract(week from start_time), 47 | extract(month from start_time), extract(year from start_time), extract(dayofweek from start_time) 48 | FROM songplays 49 | WHERE start_time IS NOT NULL 50 | """) 51 | -------------------------------------------------------------------------------- /Project 5 Data Pipelines with Airflow/airflow/plugins/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from operators.stage_redshift import StageToRedshiftOperator 2 | from operators.load_fact import LoadFactOperator 3 | from operators.load_dimension import LoadDimensionOperator 4 | from operators.data_quality import DataQualityOperator 5 | 6 | __all__ = [ 7 | 'StageToRedshiftOperator', 8 | 'LoadFactOperator', 9 | 'LoadDimensionOperator', 10 | 'DataQualityOperator' 11 | ] 12 | -------------------------------------------------------------------------------- /Project 5 Data Pipelines with Airflow/airflow/plugins/operators/data_quality.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class DataQualityOperator(BaseOperator): 6 | """Run data quality checks on one or more tables. 7 | 8 | Parameters: 9 | data_check_query: A list of one or more queries to check data. 10 | table: A list of one or more tables for the data check queries. 11 | expected_results: A list of expected results for each data check query. 12 | 13 | Returns: Exception raised on data check failure. 
14 | """ 15 | 16 | ui_color = '#89DA59' 17 | 18 | @apply_defaults 19 | def __init__(self, 20 | redshift_conn_id="", 21 | data_check_query=[], 22 | table=[], 23 | expected_result=[], 24 | *args, **kwargs): 25 | 26 | super(DataQualityOperator, self).__init__(*args, **kwargs) 27 | self.redshift_conn_id=redshift_conn_id 28 | self.data_check_query=data_check_query 29 | self.table=table 30 | self.expected_result=expected_result 31 | 32 | def execute(self, context): 33 | self.log.info('Running data quality checks') 34 | 35 | self.log.info('Fetching redshift hook') 36 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 37 | 38 | checks = zip(self.data_check_query, self.table, self.expected_result) 39 | for check in checks: 40 | try: 41 | redshift.run(check[0].format(check[1])) == check[2] 42 | self.log.info('Data quality check passed.') 43 | except: 44 | self.log.info('Data quality check failed.') 45 | raise AssertionError('Data quality check failed.') -------------------------------------------------------------------------------- /Project 5 Data Pipelines with Airflow/airflow/plugins/operators/load_dimension.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class LoadDimensionOperator(BaseOperator): 6 | """Load data into dimension table from staging tables. 7 | 8 | Parameters: 9 | redshift_conn_id: Conn Id of the Airflow connection to redshift database 10 | destination_table: name of the dimension table to update 11 | sql_statement: 'select' query to retrieve rows for insertion in destination table 12 | update_mode: 'insert' or 'overwrite'. 'overwrite' truncates the destination table before inserting rows 13 | 14 | Returns: None 15 | """ 16 | 17 | ui_color = '#80BD9E' 18 | 19 | @apply_defaults 20 | def __init__(self, 21 | redshift_conn_id = "", 22 | destination_table = "", 23 | sql_statement = "", 24 | update_mode = "overwrite", # insert, overwrite 25 | *args, **kwargs): 26 | 27 | super(LoadDimensionOperator, self).__init__(*args, **kwargs) 28 | self.redshift_conn_id=redshift_conn_id 29 | self.destination_table=destination_table 30 | self.sql_statement=sql_statement 31 | self.update_mode=update_mode 32 | 33 | def execute(self, context): 34 | self.log.info('Fetching redshift hook') 35 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 36 | 37 | self.log.info('Loading dimension table {}'.format(self.destination_table)) 38 | if self.update_mode == 'overwrite': 39 | update_query = 'TRUNCATE {}; INSERT INTO {} ({})'.format(self.destination_table, self.destination_table, self.sql_statement) 40 | elif self.update_mode == 'insert': 41 | update_query = 'INSERT INTO {} ({})'.format(self.destination_table, self.sql_statement) 42 | redshift.run(update_query) 43 | -------------------------------------------------------------------------------- /Project 5 Data Pipelines with Airflow/airflow/plugins/operators/load_fact.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class LoadFactOperator(BaseOperator): 6 | """Load data into fact table from staging tables. 
7 | 8 | Parameters: 9 | redshift_conn_id: Conn Id of the Airflow connection to redshift database 10 | destination_table: name of the fact table to update 11 | sql_statement: 'select' query to retrieve rows for insertion in fact table 12 | 13 | Returns: None 14 | """ 15 | ui_color = '#F98866' 16 | 17 | @apply_defaults 18 | def __init__(self, 19 | redshift_conn_id = "", 20 | destination_table = "", 21 | sql_statement = "", 22 | *args, **kwargs): 23 | 24 | super(LoadFactOperator, self).__init__(*args, **kwargs) 25 | self.redshift_conn_id=redshift_conn_id 26 | self.destination_table=destination_table 27 | self.sql_statement=sql_statement 28 | 29 | def execute(self, context): 30 | self.log.info('Fetching redshift hook') 31 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 32 | 33 | self.log.info('Loading fact table') 34 | insert_query = 'INSERT INTO {} ({})'.format(self.destination_table, self.sql_statement) 35 | redshift.run(insert_query) 36 | -------------------------------------------------------------------------------- /Project 5 Data Pipelines with Airflow/airflow/plugins/operators/stage_redshift.py: -------------------------------------------------------------------------------- 1 | from airflow.contrib.hooks.aws_hook import AwsHook 2 | from airflow.hooks.postgres_hook import PostgresHook 3 | from airflow.models import BaseOperator 4 | from airflow.utils.decorators import apply_defaults 5 | 6 | class StageToRedshiftOperator(BaseOperator): 7 | """Transfer data from S3 to staging tables in redshift database. 8 | 9 | Parameters: 10 | aws_credentials_id: Conn Id of the Airflow connection to Amazon Web Services 11 | redshift_conn_id: Conn Id of the Airflow connection to redshift database 12 | table: name of the staging table to populate 13 | s3_bucket: name of S3 bucket, e.g. "udacity-dend" 14 | s3_key: name of S3 key. This field is templatable when context is enabled, e.g. 
"log_data/{execution_date.year}/{execution_date.month}/" 15 | delimiter: csv field delimiter 16 | ignore_headers: '0' or '1' 17 | data_format: 'csv' or 'json' 18 | jsonpaths: path to JSONpaths file 19 | 20 | Returns: None 21 | """ 22 | template_fields = ("s3_key",) 23 | ui_color = '#358140' 24 | copy_sql = """ 25 | COPY {} 26 | FROM '{}' 27 | ACCESS_KEY_ID '{}' 28 | SECRET_ACCESS_KEY '{}' 29 | IGNOREHEADER {} 30 | {} 31 | """ 32 | 33 | @apply_defaults 34 | def __init__(self, 35 | aws_credentials_id="", 36 | redshift_conn_id="", 37 | table="", 38 | s3_bucket="", 39 | s3_key="", 40 | delimiter=",", 41 | ignore_headers=1, 42 | data_format="csv", 43 | jsonpaths="", 44 | *args, **kwargs): 45 | 46 | super(StageToRedshiftOperator, self).__init__(*args, **kwargs) 47 | self.aws_credentials_id=aws_credentials_id 48 | self.redshift_conn_id=redshift_conn_id 49 | self.table=table 50 | self.s3_bucket=s3_bucket # udacity-dend 51 | self.s3_key=s3_key # log_data, song_data 52 | self.delimiter=delimiter 53 | self.ignore_headers=ignore_headers 54 | self.data_format=data_format.lower() # 'csv', 'json' 55 | self.jsonpaths=jsonpaths 56 | 57 | 58 | def execute(self, context): 59 | aws_hook = AwsHook(self.aws_credentials_id) 60 | credentials = aws_hook.get_credentials() 61 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 62 | 63 | self.log.info("Clearing data from destination Redshift table") 64 | redshift.run("DELETE FROM {}".format(self.table)) 65 | 66 | self.log.info("Copying data from S3 to Redshift") 67 | 68 | # select csv or json format 69 | if self.data_format == 'csv': 70 | autoformat = "DELIMITER '{}'".format(self.delimiter) 71 | elif self.data_format == 'json': 72 | json_option = self.jsonpaths or 'auto' 73 | autoformat = "FORMAT AS JSON '{}'".format(json_option) 74 | 75 | # set S3 path based on execution dates 76 | rendered_key = self.s3_key.format(**context) 77 | self.log.info('Rendered key is ' + rendered_key) 78 | s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key) 79 | formatted_sql = StageToRedshiftOperator.copy_sql.format( 80 | self.table, 81 | s3_path, 82 | credentials.access_key, 83 | credentials.secret_key, 84 | self.ignore_headers, 85 | autoformat 86 | ) 87 | redshift.run(formatted_sql) 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /__pycache__/sql_queries.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbmayer/Udacity-Data-Engineering-Nanodegree/84c28b9d8538a5aa3b23867abad93e8de30c3476/__pycache__/sql_queries.cpython-36.pyc --------------------------------------------------------------------------------