├── AWS_Services ├── README.md └── aws-s3-cheat-sheet.png ├── Airflow_CloudFormation.yaml ├── Airflow_Data_Pipelines ├── Setup_Redshift_Connection_Airflow.md ├── dags │ ├── sparkify_dimension_subdag.py │ └── udac_example_dag.py └── plugins │ ├── __init__.py │ ├── helpers │ ├── __init__.py │ └── sql_queries.py │ └── operators │ ├── __init__.py │ ├── create_table.py │ ├── data_quality.py │ ├── load_dimension.py │ ├── load_fact.py │ └── stage_redshift.py ├── Airflow_Livy_Setup_CloudFormation.md ├── Data_Api_to_Postgres ├── README.md ├── Results.PNG ├── __init__.py ├── auth.py ├── businesssearch.py ├── databasedriver.py ├── driver.py ├── queries.py └── request.py ├── Data_Lake ├── README.md ├── data │ ├── log-data │ │ ├── 2018-11-01-events.json │ │ ├── 2018-11-02-events.json │ │ ├── 2018-11-03-events.json │ │ ├── 2018-11-04-events.json │ │ ├── 2018-11-05-events.json │ │ ├── 2018-11-06-events.json │ │ ├── 2018-11-07-events.json │ │ ├── 2018-11-08-events.json │ │ ├── 2018-11-09-events.json │ │ ├── 2018-11-10-events.json │ │ ├── 2018-11-11-events.json │ │ ├── 2018-11-12-events.json │ │ ├── 2018-11-13-events.json │ │ ├── 2018-11-14-events.json │ │ ├── 2018-11-15-events.json │ │ ├── 2018-11-16-events.json │ │ ├── 2018-11-17-events.json │ │ ├── 2018-11-18-events.json │ │ ├── 2018-11-19-events.json │ │ ├── 2018-11-20-events.json │ │ ├── 2018-11-21-events.json │ │ ├── 2018-11-22-events.json │ │ ├── 2018-11-23-events.json │ │ ├── 2018-11-24-events.json │ │ ├── 2018-11-25-events.json │ │ ├── 2018-11-26-events.json │ │ ├── 2018-11-27-events.json │ │ ├── 2018-11-28-events.json │ │ ├── 2018-11-29-events.json │ │ └── 2018-11-30-events.json │ └── song_data │ │ └── A │ │ ├── .DS_Store │ │ ├── A │ │ ├── .DS_Store │ │ ├── A │ │ │ ├── TRAAAAW128F429D538.json │ │ │ ├── TRAAABD128F429CF47.json │ │ │ ├── TRAAADZ128F9348C2E.json │ │ │ ├── TRAAAEF128F4273421.json │ │ │ ├── TRAAAFD128F92F423A.json │ │ │ ├── TRAAAMO128F1481E7F.json │ │ │ ├── TRAAAMQ128F1460CD3.json │ │ │ ├── 
TRAAAPK128E0786D96.json │ │ │ ├── TRAAARJ128F9320760.json │ │ │ ├── TRAAAVG12903CFA543.json │ │ │ └── TRAAAVO128F93133D4.json │ │ ├── B │ │ │ ├── TRAABCL128F4286650.json │ │ │ ├── TRAABDL12903CAABBA.json │ │ │ ├── TRAABJL12903CDCF1A.json │ │ │ ├── TRAABJV128F1460C49.json │ │ │ ├── TRAABLR128F423B7E3.json │ │ │ ├── TRAABNV128F425CEE1.json │ │ │ ├── TRAABRB128F9306DD5.json │ │ │ ├── TRAABVM128F92CA9DC.json │ │ │ ├── TRAABXG128F9318EBD.json │ │ │ ├── TRAABYN12903CFD305.json │ │ │ └── TRAABYW128F4244559.json │ │ └── C │ │ │ ├── TRAACCG128F92E8A55.json │ │ │ ├── TRAACER128F4290F96.json │ │ │ ├── TRAACFV128F935E50B.json │ │ │ ├── TRAACHN128F1489601.json │ │ │ ├── TRAACIW12903CC0F6D.json │ │ │ ├── TRAACLV128F427E123.json │ │ │ ├── TRAACNS128F14A2DF5.json │ │ │ ├── TRAACOW128F933E35F.json │ │ │ ├── TRAACPE128F421C1B9.json │ │ │ ├── TRAACQT128F9331780.json │ │ │ ├── TRAACSL128F93462F4.json │ │ │ ├── TRAACTB12903CAAF15.json │ │ │ ├── TRAACVS128E078BE39.json │ │ │ └── TRAACZK128F4243829.json │ │ └── B │ │ ├── .DS_Store │ │ ├── A │ │ ├── TRABACN128F425B784.json │ │ ├── TRABAFJ128F42AF24E.json │ │ ├── TRABAFP128F931E9A1.json │ │ ├── TRABAIO128F42938F9.json │ │ ├── TRABATO128F42627E9.json │ │ ├── TRABAVQ12903CBF7E0.json │ │ ├── TRABAWW128F4250A31.json │ │ ├── TRABAXL128F424FC50.json │ │ ├── TRABAXR128F426515F.json │ │ ├── TRABAXV128F92F6AE3.json │ │ └── TRABAZH128F930419A.json │ │ ├── B │ │ ├── TRABBAM128F429D223.json │ │ ├── TRABBBV128F42967D7.json │ │ ├── TRABBJE12903CDB442.json │ │ ├── TRABBKX128F4285205.json │ │ ├── TRABBLU128F93349CF.json │ │ ├── TRABBNP128F932546F.json │ │ ├── TRABBOP128F931B50D.json │ │ ├── TRABBOR128F4286200.json │ │ ├── TRABBTA128F933D304.json │ │ ├── TRABBVJ128F92F7EAA.json │ │ ├── TRABBXU128F92FEF48.json │ │ └── TRABBZN12903CD9297.json │ │ └── C │ │ ├── TRABCAJ12903CDFCC2.json │ │ ├── TRABCEC128F426456E.json │ │ ├── TRABCEI128F424C983.json │ │ ├── TRABCFL128F149BB0D.json │ │ ├── TRABCIX128F4265903.json │ │ ├── TRABCKL128F423A778.json │ │ ├── 
TRABCPZ128F4275C32.json │ │ ├── TRABCRU128F423F449.json │ │ ├── TRABCTK128F934B224.json │ │ ├── TRABCUQ128E0783E2B.json │ │ ├── TRABCXB128F4286BD3.json │ │ └── TRABCYE128F934CE1D.json └── etl.py ├── Data_Modeling_with_Apache_Cassandra ├── Project_1B_ Project_Template.ipynb ├── event_data │ ├── 2018-11-01-events.csv │ ├── 2018-11-02-events.csv │ ├── 2018-11-03-events.csv │ ├── 2018-11-04-events.csv │ ├── 2018-11-05-events.csv │ ├── 2018-11-06-events.csv │ ├── 2018-11-07-events.csv │ ├── 2018-11-08-events.csv │ ├── 2018-11-09-events.csv │ ├── 2018-11-10-events.csv │ ├── 2018-11-11-events.csv │ ├── 2018-11-12-events.csv │ ├── 2018-11-13-events.csv │ ├── 2018-11-14-events.csv │ ├── 2018-11-15-events.csv │ ├── 2018-11-16-events.csv │ ├── 2018-11-17-events.csv │ ├── 2018-11-18-events.csv │ ├── 2018-11-19-events.csv │ ├── 2018-11-20-events.csv │ ├── 2018-11-21-events.csv │ ├── 2018-11-22-events.csv │ ├── 2018-11-23-events.csv │ ├── 2018-11-24-events.csv │ ├── 2018-11-25-events.csv │ ├── 2018-11-26-events.csv │ ├── 2018-11-27-events.csv │ ├── 2018-11-28-events.csv │ ├── 2018-11-29-events.csv │ └── 2018-11-30-events.csv ├── event_datafile_new.csv └── images │ └── image_event_datafile_new.jpg ├── Data_Modeling_with_Postgres ├── README.md ├── __pycache__ │ ├── create_tables.cpython-37.pyc │ ├── etl.cpython-37.pyc │ └── sql_queries.cpython-37.pyc ├── create_tables.py ├── data │ ├── log_data │ │ └── 2018 │ │ │ └── 11 │ │ │ ├── 2018-11-01-events.json │ │ │ ├── 2018-11-02-events.json │ │ │ ├── 2018-11-03-events.json │ │ │ ├── 2018-11-04-events.json │ │ │ ├── 2018-11-05-events.json │ │ │ ├── 2018-11-06-events.json │ │ │ ├── 2018-11-07-events.json │ │ │ ├── 2018-11-08-events.json │ │ │ ├── 2018-11-09-events.json │ │ │ ├── 2018-11-10-events.json │ │ │ ├── 2018-11-11-events.json │ │ │ ├── 2018-11-12-events.json │ │ │ ├── 2018-11-13-events.json │ │ │ ├── 2018-11-14-events.json │ │ │ ├── 2018-11-15-events.json │ │ │ ├── 2018-11-16-events.json │ │ │ ├── 2018-11-17-events.json │ │ │ ├── 
2018-11-18-events.json │ │ │ ├── 2018-11-19-events.json │ │ │ ├── 2018-11-20-events.json │ │ │ ├── 2018-11-21-events.json │ │ │ ├── 2018-11-22-events.json │ │ │ ├── 2018-11-23-events.json │ │ │ ├── 2018-11-24-events.json │ │ │ ├── 2018-11-25-events.json │ │ │ ├── 2018-11-26-events.json │ │ │ ├── 2018-11-27-events.json │ │ │ ├── 2018-11-28-events.json │ │ │ ├── 2018-11-29-events.json │ │ │ └── 2018-11-30-events.json │ └── song_data │ │ └── A │ │ ├── A │ │ ├── A │ │ │ ├── TRAAAAW128F429D538.json │ │ │ ├── TRAAABD128F429CF47.json │ │ │ ├── TRAAADZ128F9348C2E.json │ │ │ ├── TRAAAEF128F4273421.json │ │ │ ├── TRAAAFD128F92F423A.json │ │ │ ├── TRAAAMO128F1481E7F.json │ │ │ ├── TRAAAMQ128F1460CD3.json │ │ │ ├── TRAAAPK128E0786D96.json │ │ │ ├── TRAAARJ128F9320760.json │ │ │ └── TRAAAVG12903CFA543.json │ │ ├── B │ │ │ ├── TRAABCL128F4286650.json │ │ │ ├── TRAABDL12903CAABBA.json │ │ │ ├── TRAABJL12903CDCF1A.json │ │ │ ├── TRAABJV128F1460C49.json │ │ │ ├── TRAABLR128F423B7E3.json │ │ │ ├── TRAABNV128F425CEE1.json │ │ │ ├── TRAABRB128F9306DD5.json │ │ │ ├── TRAABVM128F92CA9DC.json │ │ │ ├── TRAABXG128F9318EBD.json │ │ │ └── TRAABYN12903CFD305.json │ │ └── C │ │ │ ├── TRAACCG128F92E8A55.json │ │ │ ├── TRAACER128F4290F96.json │ │ │ ├── TRAACFV128F935E50B.json │ │ │ ├── TRAACHN128F1489601.json │ │ │ ├── TRAACIW12903CC0F6D.json │ │ │ ├── TRAACLV128F427E123.json │ │ │ ├── TRAACNS128F14A2DF5.json │ │ │ ├── TRAACOW128F933E35F.json │ │ │ ├── TRAACPE128F421C1B9.json │ │ │ └── TRAACQT128F9331780.json │ │ └── B │ │ ├── A │ │ ├── TRABACN128F425B784.json │ │ ├── TRABAFJ128F42AF24E.json │ │ ├── TRABAFP128F931E9A1.json │ │ ├── TRABAIO128F42938F9.json │ │ ├── TRABATO128F42627E9.json │ │ ├── TRABAVQ12903CBF7E0.json │ │ ├── TRABAWW128F4250A31.json │ │ ├── TRABAXL128F424FC50.json │ │ ├── TRABAXR128F426515F.json │ │ └── TRABAXV128F92F6AE3.json │ │ ├── B │ │ ├── TRABBAM128F429D223.json │ │ ├── TRABBBV128F42967D7.json │ │ ├── TRABBJE12903CDB442.json │ │ ├── TRABBKX128F4285205.json │ │ ├── 
TRABBLU128F93349CF.json │ │ ├── TRABBNP128F932546F.json │ │ ├── TRABBOP128F931B50D.json │ │ ├── TRABBOR128F4286200.json │ │ ├── TRABBTA128F933D304.json │ │ └── TRABBVJ128F92F7EAA.json │ │ └── C │ │ ├── TRABCAJ12903CDFCC2.json │ │ ├── TRABCEC128F426456E.json │ │ ├── TRABCEI128F424C983.json │ │ ├── TRABCFL128F149BB0D.json │ │ ├── TRABCIX128F4265903.json │ │ ├── TRABCKL128F423A778.json │ │ ├── TRABCPZ128F4275C32.json │ │ ├── TRABCRU128F423F449.json │ │ ├── TRABCTK128F934B224.json │ │ └── TRABCUQ128E0783E2B.json ├── etl.ipynb ├── etl.py ├── main.py ├── sql_queries.py └── test.ipynb ├── Data_Warehouse ├── README.md ├── create_tables.py ├── etl.py ├── log_json_path.json └── sql_queries.py ├── LICENSE ├── README.md ├── Redshift_Cluster_IaC.py ├── Redshift_IaC_README.md ├── Redshift_test.py ├── airflow_livy.png ├── architecture.png ├── image.jpeg └── logging.ini /AWS_Services/README.md: -------------------------------------------------------------------------------- 1 | # Launching EMR cluster from command line 2 | ### Below example creates a 3 Node EMR cluster with 1 master and 2 slave Nodes. 
3 | 4 | aws emr create-cluster \ 5 | --applications Name=Ganglia Name=Spark Name=Zeppelin \ 6 | --ebs-root-volume-size 10 \ 7 | --ec2-attributes \ 8 | '{"KeyName":,"InstanceProfile":,"SubnetId":,"EmrManagedSlaveSecurityGroup":,"EmrManagedMasterSecurityGroup":}' \ 9 | --service-role IAMROLE \ 10 | --enable-debugging \ 11 | --release-label \ 12 | --log-uri \ 13 | --name \ 14 | --instance-groups \ 15 | '[ \ 16 | {"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"MASTER","InstanceType":"m5.xlarge","Name":"Master Instance Group"}, \ 17 | {"InstanceCount":2,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"CORE","InstanceType":"m5.xlarge","Name":"Core Instance Group"}\ 18 | ]' \ 19 | --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \ 20 | --region us-east-1 21 | 22 | 23 | # AWS s3 CLI Cheat Sheet 24 | ![s3 cli cheat sheet](https://github.com/san089/Data_Engineering_Projects/blob/master/AWS_Services/aws-s3-cheat-sheet.png) 25 | -------------------------------------------------------------------------------- /AWS_Services/aws-s3-cheat-sheet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/san089/Udacity-Data-Engineering-Projects/c26bd834294cda2f4f2dfbf68c4c326635c55a93/AWS_Services/aws-s3-cheat-sheet.png -------------------------------------------------------------------------------- /Airflow_CloudFormation.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | 3 | Description: Airflow server backed by Postgres RDS 4 | 5 | Parameters: 6 | KeyName: 7 | Description: Name of an existing EC2 KeyPair to enable SSH access into the Airflow web server 8 | Type: AWS::EC2::KeyPair::KeyName 9 | ConstraintDescription: 
Must be the name of an existing EC2 KeyPair 10 | S3BucketName: 11 | Description: REQUIRED - A new S3 Bucket name. This bucket will be used to read and write the Movielens dataset. 12 | Type: String 13 | AllowedPattern: '.+' 14 | DBPassword: 15 | Default: airflowpassword 16 | NoEcho: 'true' 17 | Description: Airflow database admin account password 18 | Type: String 19 | MinLength: '8' 20 | MaxLength: '41' 21 | AllowedPattern: '[a-zA-Z0-9]*' 22 | ConstraintDescription: Must contain only alphanumeric characters 23 | 24 | # Mapping to find the Amazon Linux AMI in each region. 25 | Mappings: 26 | RegionMap: 27 | us-east-1: 28 | AMI: ami-97785bed 29 | us-east-2: 30 | AMI: ami-f63b1193 31 | us-west-1: 32 | AMI: ami-824c4ee2 33 | us-west-2: 34 | AMI: ami-f2d3638a 35 | ca-central-1: 36 | AMI: ami-a954d1cd 37 | eu-west-1: 38 | AMI: ami-d834aba1 39 | eu-west-2: 40 | AMI: ami-403e2524 41 | eu-west-3: 42 | AMI: ami-8ee056f3 43 | eu-central-1: 44 | AMI: ami-5652ce39 45 | sa-east-1: 46 | AMI: ami-84175ae8 47 | ap-south-1: 48 | AMI: ami-531a4c3c 49 | ap-southeast-1: 50 | AMI: ami-68097514 51 | ap-southeast-2: 52 | AMI: ami-942dd1f6 53 | ap-northeast-1: 54 | AMI: ami-ceafcba8 55 | ap-northeast-2: 56 | AMI: ami-863090e8 57 | Resources: 58 | EC2Instance: 59 | Type: AWS::EC2::Instance 60 | Properties: 61 | KeyName: !Ref 'KeyName' 62 | SecurityGroups: [!Ref 'AirflowEC2SecurityGroup'] 63 | InstanceType: 'm4.xlarge' 64 | IamInstanceProfile: 65 | Ref: EC2InstanceProfile 66 | Tags: 67 | - 68 | Key: Name 69 | Value: Airflow 70 | ImageId: !FindInMap 71 | - RegionMap 72 | - !Ref 'AWS::Region' 73 | - AMI 74 | UserData: 75 | Fn::Base64: !Sub | 76 | #!/bin/bash 77 | set -x 78 | exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 79 | # Get the latest CloudFormation package 80 | echo "Installing aws-cfn" 81 | yum install -y aws-cfn-bootstrap 82 | # Start cfn-init 83 | /opt/aws/bin/cfn-init -v -c install --stack ${AWS::StackId} --resource EC2Instance --region ${AWS::Region} 84 | # 
Download and unzip the Movielens dataset 85 | wget http://files.grouplens.org/datasets/movielens/ml-latest.zip && unzip ml-latest.zip 86 | # Upload the movielens dataset files to the S3 bucket 87 | aws s3 cp ml-latest s3://${S3BucketName} --recursive 88 | # Install git 89 | sudo yum install -y git 90 | # Clone the git repository 91 | git clone https://github.com/aws-samples/aws-concurrent-data-orchestration-pipeline-emr-livy.git 92 | sudo pip install boto3 93 | # Install airflow using pip 94 | echo "Install Apache Airflow" 95 | sudo SLUGIFY_USES_TEXT_UNIDECODE=yes pip install -U apache-airflow 96 | # Encrypt connection passwords in metadata db 97 | sudo pip install apache-airflow[crypto] 98 | # Postgres operators and hook, support as an Airflow backend 99 | sudo pip install apache-airflow[postgres] 100 | sudo -H pip install six==1.10.0 101 | sudo pip install --upgrade six 102 | sudo pip install markupsafe 103 | sudo pip install --upgrade MarkupSafe 104 | echo 'export PATH=/usr/local/bin:$PATH' >> /root/.bash_profile 105 | source /root/.bash_profile 106 | # Initialize Airflow 107 | airflow initdb 108 | # Update the RDS connection in the Airflow Config file 109 | sed -i '/sql_alchemy_conn/s/^/#/g' ~/airflow/airflow.cfg 110 | sed -i '/sql_alchemy_conn/ a sql_alchemy_conn = postgresql://airflow:${DBPassword}@${DBInstance.Endpoint.Address}:${DBInstance.Endpoint.Port}/airflowdb' ~/airflow/airflow.cfg 111 | # Update the type of executor in the Airflow Config file 112 | sed -i '/executor = SequentialExecutor/s/^/#/g' ~/airflow/airflow.cfg 113 | sed -i '/executor = SequentialExecutor/ a executor = LocalExecutor' ~/airflow/airflow.cfg 114 | airflow initdb 115 | # Move all the files to the ~/airflow directory. The Airflow config file is setup to hold all the DAG related files in the ~/airflow/ folder. 
116 | mv aws-concurrent-data-orchestration-pipeline-emr-livy/* ~/airflow/ 117 | # Delete the higher-level git repository directory 118 | rm -rf aws-concurrent-data-orchestration-pipeline-emr-livy 119 | # Replace the name of the S3 bucket in each of the .scala files. CHANGE THE HIGHLIGHTED PORTION BELOW TO THE NAME OF THE S3 BUCKET YOU CREATED IN STEP 1. The below command replaces the instance of the string ‘’ in each of the scripts to the name of the actual bucket. 120 | sed -i 's//${S3BucketName}/g' /root/airflow/dags/transform/* 121 | # Run Airflow webserver 122 | airflow webserver 123 | Metadata: 124 | AWS::CloudFormation::Init: 125 | configSets: 126 | install: 127 | - gcc 128 | gcc: 129 | packages: 130 | yum: 131 | gcc: [] 132 | DependsOn: 133 | - DBInstance 134 | - AirflowEC2SecurityGroup 135 | DBInstance: 136 | Type: AWS::RDS::DBInstance 137 | DeletionPolicy: Delete 138 | Properties: 139 | DBName: airflowdb 140 | Engine: postgres 141 | MasterUsername: airflow 142 | MasterUserPassword: !Ref 'DBPassword' 143 | DBInstanceClass: db.t2.small 144 | AllocatedStorage: 5 145 | DBSecurityGroups: 146 | - Ref: DBSecurityGroup 147 | AirflowEC2SecurityGroup: 148 | Type: AWS::EC2::SecurityGroup 149 | Properties: 150 | GroupName: AirflowEC2SG 151 | GroupDescription: Enable HTTP access via port 80 + SSH access 152 | SecurityGroupIngress: 153 | - IpProtocol: tcp 154 | FromPort: 80 155 | ToPort: 80 156 | CidrIp: 0.0.0.0/0 157 | - IpProtocol: tcp 158 | FromPort: 8080 159 | ToPort: 8080 160 | CidrIp: 0.0.0.0/0 161 | - IpProtocol: tcp 162 | FromPort: 22 163 | ToPort: 22 164 | CidrIp: 0.0.0.0/0 165 | AirflowEMRMasterEC2SecurityGroup: 166 | Type: AWS::EC2::SecurityGroup 167 | Properties: 168 | GroupName: AirflowEMRMasterSG 169 | GroupDescription: Airflow EMR Master SG 170 | DependsOn: 171 | - AirflowEC2SecurityGroup 172 | AirflowEMRMasterInboundRule: 173 | Type: AWS::EC2::SecurityGroupIngress 174 | Properties: 175 | IpProtocol: tcp 176 | FromPort: '8998' 177 | ToPort: '8998' 178 | 
SourceSecurityGroupName: !Ref 'AirflowEC2SecurityGroup' 179 | GroupName: !Ref 'AirflowEMRMasterEC2SecurityGroup' 180 | AirflowEMRSlaveEC2SecurityGroup: 181 | Type: AWS::EC2::SecurityGroup 182 | Properties: 183 | GroupName: AirflowEMRSlaveSG 184 | GroupDescription: Airflow EMR Slave SG 185 | DBSecurityGroup: 186 | Type: AWS::RDS::DBSecurityGroup 187 | Properties: 188 | GroupDescription: Frontend Access 189 | DBSecurityGroupIngress: 190 | EC2SecurityGroupName: 191 | Ref: AirflowEC2SecurityGroup 192 | EC2Role: 193 | Type: AWS::IAM::Role 194 | Properties: 195 | RoleName: AirflowInstanceRole 196 | AssumeRolePolicyDocument: 197 | Version: "2012-10-17" 198 | Statement: 199 | - 200 | Effect: "Allow" 201 | Principal: 202 | Service: 203 | - "ec2.amazonaws.com" 204 | Action: 205 | - "sts:AssumeRole" 206 | ManagedPolicyArns: 207 | - arn:aws:iam::aws:policy/AmazonS3FullAccess 208 | - arn:aws:iam::aws:policy/AmazonElasticMapReduceFullAccess 209 | EC2InstanceProfile: 210 | Type: AWS::IAM::InstanceProfile 211 | Properties: 212 | InstanceProfileName: AirflowInstanceProfile 213 | Roles: 214 | - 215 | Ref: EC2Role 216 | EmrRole: 217 | Type: AWS::IAM::Role 218 | Properties: 219 | RoleName: EmrRole 220 | AssumeRolePolicyDocument: 221 | Version: "2012-10-17" 222 | Statement: 223 | - 224 | Effect: "Allow" 225 | Principal: 226 | Service: 227 | - "elasticmapreduce.amazonaws.com" 228 | - "s3.amazonaws.com" 229 | Action: 230 | - "sts:AssumeRole" 231 | ManagedPolicyArns: 232 | - arn:aws:iam::aws:policy/AmazonS3FullAccess 233 | - arn:aws:iam::aws:policy/AmazonElasticMapReduceFullAccess 234 | EmrEc2Role: 235 | Type: AWS::IAM::Role 236 | Properties: 237 | RoleName: EmrEc2Role 238 | AssumeRolePolicyDocument: 239 | Version: "2012-10-17" 240 | Statement: 241 | - 242 | Effect: "Allow" 243 | Principal: 244 | Service: 245 | - "ec2.amazonaws.com" 246 | Action: 247 | - "sts:AssumeRole" 248 | ManagedPolicyArns: 249 | - arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role 250 | - 
def load_dimension_subdag(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        sql_statement,
        delete_load,
        table_name,
        *args, **kwargs):
    """Build a sub-DAG holding a single dimension-load task.

    The sub-DAG id is ``<parent_dag_name>.<task_id>`` and the one task inside
    it reuses ``task_id`` as its own id.

    Args:
        parent_dag_name: id of the DAG that will host the sub-DAG.
        task_id: id used for both the sub-DAG suffix and the load task.
        redshift_conn_id: connection id handed to ``LoadDimensionOperator``.
        sql_statement: INSERT statement, passed on as ``sql_query``.
        delete_load: passed on unchanged as ``delete_load``.
        table_name: passed on unchanged as ``table_name``.
        *args: accepted for call compatibility; not used.
        **kwargs: forwarded verbatim to the ``DAG`` constructor
            (for example ``start_date``).

    Returns:
        The newly created ``DAG`` containing the load task.
    """
    subdag_id = "{}.{}".format(parent_dag_name, task_id)
    subdag = DAG(subdag_id, **kwargs)

    # Instantiating the operator with ``dag=`` is what registers the task on
    # the sub-DAG, so the instance does not need to be kept around.
    operator_settings = {
        "task_id": task_id,
        "redshift_conn_id": redshift_conn_id,
        "sql_query": sql_statement,
        "delete_load": delete_load,
        "table_name": table_name,
    }
    LoadDimensionOperator(dag=subdag, **operator_settings)

    return subdag
"""Sparkify ETL DAG.

Pipeline order: create the Redshift tables, stage the song and log data from
S3, load the ``songplays`` fact table, reload the four dimension tables (each
wrapped in its own sub-DAG) and finally run the data quality checks.
"""
from datetime import datetime, timedelta
import os
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators import (CreateTableOperator, StageToRedshiftOperator, LoadFactOperator,
                               LoadDimensionOperator, DataQualityOperator)
from helpers import SqlQueries
from sparkify_dimension_subdag import load_dimension_subdag
from airflow.operators.subdag_operator import SubDagOperator


s3_bucket = 'udacity-dend-warehouse'
song_s3_key = "song_data"
log_s3_key = "log-data"
log_json_file = "log_json_path.json"

# ``default_args`` are applied to every operator of the DAG. ``catchup`` is a
# DAG-level setting, so it is passed to the DAG constructor below instead of
# living here, where it had no effect.
default_args = {
    'owner': 'udacity',
    'depends_on_past': True,
    'start_date': datetime(2019, 1, 12),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag_name = 'udac_example_dag'
dag = DAG(dag_name,
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          max_active_runs=1,
          catchup=True
          )


def _dimension_subdag_task(task_id, sql_statement, table_name):
    """Return a SubDagOperator that delete-loads one dimension table."""
    # NOTE(review): the sub-DAG only receives ``start_date``; it does not
    # inherit this DAG's hourly ``schedule_interval`` or ``default_args``.
    # Confirm that this is intended.
    return SubDagOperator(
        subdag=load_dimension_subdag(
            parent_dag_name=dag_name,
            task_id=task_id,
            redshift_conn_id="redshift",
            start_date=default_args['start_date'],
            sql_statement=sql_statement,
            delete_load=True,
            table_name=table_name,
        ),
        task_id=task_id,
        dag=dag,
    )


start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables_in_redshift = CreateTableOperator(
    task_id='create_tables_in_redshift',
    redshift_conn_id='redshift',
    dag=dag
)

# The event logs are staged with an explicit JSONPaths file.
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    table_name="staging_events",
    s3_bucket=s3_bucket,
    s3_key=log_s3_key,
    file_format="JSON",
    log_json_file=log_json_file,
    redshift_conn_id="redshift",
    aws_credential_id="aws_credentials",
    dag=dag,
    provide_context=True
)

# No JSONPaths file is given for the song data.
stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    table_name="staging_songs",
    s3_bucket=s3_bucket,
    s3_key=song_s3_key,
    file_format="JSON",
    redshift_conn_id="redshift",
    aws_credential_id="aws_credentials",
    dag=dag,
    provide_context=True
)

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    redshift_conn_id='redshift',
    sql_query=SqlQueries.songplay_table_insert,
    dag=dag
)

load_user_dimension_table = _dimension_subdag_task(
    "Load_user_dim_table", SqlQueries.user_table_insert, "users")

load_song_dimension_table = _dimension_subdag_task(
    "Load_song_dim_table", SqlQueries.song_table_insert, "songs")

load_artist_dimension_table = _dimension_subdag_task(
    "Load_artist_dim_table", SqlQueries.artist_table_insert, "artists")

load_time_dimension_table = _dimension_subdag_task(
    "Load_time_dim_table", SqlQueries.time_table_insert, "time")

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    tables=["artists", "songplays", "songs", "time", "users"]
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# Task ordering: both staging tasks must finish before the fact load, and all
# four dimension loads must finish before the quality checks.
start_operator >> create_tables_in_redshift
create_tables_in_redshift >> [stage_songs_to_redshift, stage_events_to_redshift] >> load_songplays_table

load_songplays_table >> [load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, load_time_dimension_table] >> run_quality_checks >> end_operator
class SqlQueries:
    """INSERT ... SELECT statements that populate the fact and dimension tables.

    Every attribute is a plain SQL string. The statements read from the
    ``staging_events`` and ``staging_songs`` tables (and, for the time table,
    from ``songplays``).
    """

    # Fact table load. Only 'NextSong' events are used. ``start_time`` is
    # derived from the ``ts`` column (presumably epoch milliseconds -- TODO
    # confirm) and the row id is the md5 of sessionid concatenated with
    # start_time. Events are LEFT JOINed to the staged songs on title, artist
    # name and duration, so events without a matching song are still inserted.
    songplay_table_insert = ("""
        INSERT INTO songplays (playid, start_time, userid, level, songid, artistid, sessionid, location, user_agent)
        SELECT
                md5(events.sessionid || events.start_time) songplay_id,
                events.start_time,
                events.userid,
                events.level,
                songs.song_id,
                songs.artist_id,
                events.sessionid,
                events.location,
                events.useragent
                FROM (SELECT TIMESTAMP 'epoch' + ts/1000 * interval '1 second' AS start_time, *
            FROM staging_events
            WHERE page='NextSong') events
            LEFT JOIN staging_songs songs
            ON events.song = songs.title
                AND events.artist = songs.artist_name
                AND events.length = songs.duration
    """)

    # Users dimension: distinct user attributes taken from 'NextSong' events.
    user_table_insert = ("""
        INSERT INTO users (userid, first_name, last_name, gender, level)
        SELECT distinct userid, firstname, lastname, gender, level
        FROM staging_events
        WHERE page='NextSong'
    """)

    # Songs dimension: distinct song rows from the staged song data.
    song_table_insert = ("""
        INSERT INTO songs (songid, title, artistid, year, duration)
        SELECT distinct song_id, title, artist_id, year, duration
        FROM staging_songs
    """)

    # Artists dimension: distinct artist rows from the staged song data.
    # NOTE(review): the target column is spelled ``lattitude``; it has to match
    # the table DDL, which is not visible here -- confirm before renaming.
    artist_table_insert = ("""
        INSERT INTO artists (artistid, name, location, lattitude, longitude)
        SELECT distinct artist_id, artist_name, artist_location, artist_latitude, artist_longitude
        FROM staging_songs
    """)

    # Time dimension: date parts extracted from ``songplays.start_time``. It
    # reads from ``songplays``, so the fact table must be loaded first. The
    # SELECT has no DISTINCT, so one row is produced per songplays row.
    time_table_insert = ("""
        INSERT INTO time (start_time, hour, day, week, month, year, dayofweek)
        SELECT start_time, extract(hour from start_time), extract(day from start_time), extract(week from start_time),
               extract(month from start_time), extract(year from start_time), extract(dayofweek from start_time)
        FROM songplays
    """)
class CreateTableOperator(BaseOperator):
    """Run the table-creation SQL script against Redshift.

    Args:
        redshift_conn_id: Airflow connection id passed to ``PostgresHook``.
        sql_file_path: path of the SQL file whose whole content is executed.
            Defaults to the location that was previously hard-coded.
    """

    ui_color = '#358140'

    # Kept as the default so existing DAGs behave exactly as before.
    DEFAULT_SQL_FILE_PATH = '/home/workspace/airflow/create_tables.sql'

    @apply_defaults
    def __init__(self,
                 redshift_conn_id="",
                 sql_file_path=DEFAULT_SQL_FILE_PATH,
                 *args, **kwargs):

        super(CreateTableOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.sql_file_path = sql_file_path

    def execute(self, context):
        """Read the SQL file and run its content through the Postgres hook.

        Raises:
            OSError: if the SQL file cannot be opened or read.
        """
        self.log.info('Creating Postgres SQL Hook')
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info('Executing creating tables in Redshift.')
        # Context manager so the file handle is closed even if reading fails.
        with open(self.sql_file_path, 'r') as sql_file:
            queries = sql_file.read()
        redshift.run(queries)

        self.log.info("Tables created ")
class DataQualityOperator(BaseOperator):
    """Fail the task when any of the given tables has no rows.

    Args:
        redshift_conn_id: Airflow connection id passed to ``PostgresHook``.
        tables: iterable of table names to check; ``None`` means no tables.
    """

    ui_color = '#89DA59'

    @apply_defaults
    def __init__(self,
                 redshift_conn_id="",
                 tables=None,
                 *args, **kwargs):

        super(DataQualityOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        # Copy into a fresh list: a ``[]`` default would be shared between
        # every operator instance created without an explicit ``tables``.
        self.tables = list(tables) if tables is not None else []

    def execute(self, context):
        """Run ``select count(*)`` on every table and require at least one row.

        Raises:
            ValueError: if the query returns no result or a count below 1.
        """
        redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        for table in self.tables:
            self.log.info(f"Starting data quality validation on table : {table}")
            records = redshift_hook.get_records(f"select count(*) from {table};")

            # ``not records`` also covers a ``None`` result, which would
            # otherwise surface as a TypeError from ``len()``.
            if not records or not records[0] or records[0][0] < 1:
                self.log.error(f"Data Quality validation failed for table : {table}.")
                raise ValueError(f"Data Quality validation failed for table : {table}")
            self.log.info(f"Data Quality Validation Passed on table : {table}!!!")

        # Replaces the left-over template message claiming the operator was
        # not implemented, which was logged even after successful checks.
        self.log.info(f"Data quality validation finished for {len(self.tables)} table(s).")
= delete_load 22 | 23 | def execute(self, context): 24 | redshift_hook = PostgresHook(postgres_conn_id = self.redshift_conn_id) 25 | if self.delete_load: 26 | self.log.info(f"Delete load operation set to TRUE. Running delete statement on table {self.table_name}") 27 | redshift_hook.run(f"DELETE FROM {self.table_name}") 28 | 29 | self.log.info(f"Running query to load data into Dimension Table {self.table_name}") 30 | redshift_hook.run(self.sql_query) 31 | self.log.info(f"Dimension Table {self.table_name} loaded.") 32 | -------------------------------------------------------------------------------- /Airflow_Data_Pipelines/plugins/operators/load_fact.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class LoadFactOperator(BaseOperator): 6 | 7 | ui_color = '#F98866' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | redshift_conn_id="", 12 | sql_query = "", 13 | *args, **kwargs): 14 | 15 | super(LoadFactOperator, self).__init__(*args, **kwargs) 16 | self.redshift_conn_id = redshift_conn_id 17 | self.sql_query = sql_query 18 | 19 | 20 | def execute(self, context): 21 | redshift_hook = PostgresHook(postgres_conn_id = self.redshift_conn_id) 22 | redshift_hook.run(self.sql_query) 23 | 24 | -------------------------------------------------------------------------------- /Airflow_Data_Pipelines/plugins/operators/stage_redshift.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | from airflow.contrib.hooks.aws_hook import AwsHook 5 | 6 | class StageToRedshiftOperator(BaseOperator): 7 | ui_color = '#358140' 8 | 9 | copy_query = " COPY {} \ 10 | FROM '{}' \ 11 | ACCESS_KEY_ID '{}' \ 12 | 
SECRET_ACCESS_KEY '{}' \ 13 | FORMAT AS json '{}'; \ 14 | " 15 | 16 | @apply_defaults 17 | def __init__(self, 18 | redshift_conn_id="", 19 | aws_credential_id="", 20 | table_name = "", 21 | s3_bucket="", 22 | s3_key = "", 23 | file_format = "", 24 | log_json_file = "", 25 | *args, **kwargs): 26 | 27 | super(StageToRedshiftOperator, self).__init__(*args, **kwargs) 28 | self.redshift_conn_id = redshift_conn_id 29 | self.aws_credential_id = aws_credential_id 30 | self.table_name = table_name 31 | self.s3_bucket = s3_bucket 32 | self.s3_key = s3_key 33 | self.file_format = file_format 34 | self.log_json_file = log_json_file 35 | self.execution_date = kwargs.get('execution_date') 36 | 37 | def execute(self, context): 38 | aws_hook = AwsHook(self.aws_credential_id) 39 | credentials = aws_hook.get_credentials() 40 | 41 | 42 | s3_path = "s3://{}/{}".format(self.s3_bucket, self.s3_key) 43 | self.log.info(f"Picking staging file for table {self.table_name} from location : {s3_path}") 44 | 45 | if self.log_json_file != "": 46 | self.log_json_file = "s3://{}/{}".format(self.s3_bucket, self.log_json_file) 47 | copy_query = self.copy_query.format(self.table_name, s3_path, credentials.access_key, credentials.secret_key, self.log_json_file) 48 | else: 49 | copy_query = self.copy_query.format(self.table_name, s3_path, credentials.access_key, credentials.secret_key, 'auto') 50 | 51 | 52 | self.log.info(f"Running copy query : {copy_query}") 53 | redshift_hook = PostgresHook(postgres_conn_id = self.redshift_conn_id) 54 | 55 | redshift_hook.run(copy_query) 56 | self.log.info(f"Table {self.table_name} staged successfully!!") 57 | -------------------------------------------------------------------------------- /Airflow_Livy_Setup_CloudFormation.md: -------------------------------------------------------------------------------- 1 | ## Data Orchestration Pipeline Using Amazon EMR and Apache Livy 2 | ## Setting up Airflow using AWS CloudFormation script 3 | 4 | 
![Airflow_Livy_Architecture](https://github.com/san089/Data_Engineering_Projects/blob/master/airflow_livy.png) 5 | 6 | The script is publicly available and can be imported from - https://s3.amazonaws.com/aws-bigdata-blog/artifacts/airflow.livy.emr/airflow.yaml 7 | 8 | **This requires access to an Amazon EC2 key pair in the AWS Region where you’re launching your CloudFormation stack. Please make sure to create a key pair in that AWS Region first. Follow : [create-your-key-pair](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html#having-ec2-create-your-key-pair)** 9 | 10 | Steps to import: 11 | 1. Go to AWS Console -> Search for CloudFormation Service and open it. 12 | 2. Click on create stack -> Select **Template is Ready** 13 | 3. In the Amazon S3 URL paste the URL mentioned above. 14 | 4. This will load a template from `airflow.yaml` 15 | 5. Click Next -> Specify DBPassword, KeyName (the already existing key pair) and S3BucketName (the bucket must not already exist; a new bucket is created automatically). 16 | 6. Click Next -> Next to run the stack. 17 | 18 | After the stack run has completed successfully, go to EC2 and you will see a new instance launched. Connect to the instance over SSH, using either PuTTY or ssh from the command line. 19 | 20 | [Connect to EC2 using putty](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/putty.html) 21 | 22 | **Connect using ssh from command line** 23 | 24 | chmod 400 airflow_key_pair.pem 25 | ssh -i "airflow_key_pair.pem" ec2-user@ec2-your-public-ip.your-region.compute.amazonaws.com 26 | 27 | After you are logged in: 28 | Run Command: 29 | 30 | # sudo as the root user 31 | sudo su 32 | 33 | export AIRFLOW_HOME=~/airflow 34 | # Navigate to the airflow directory which was created by the cloudformation template – Look at the user-data section.
35 | cd ~/airflow 36 | source ~/.bash_profile 37 | 38 | #### Airflow initialization and running webserver 39 | 40 | # initialise the SQLite database, 41 | # below command will pick changes from airflow.cfg 42 | airflow initdb 43 | 44 | Open two new terminals: one to start the web server (you can set the port as well) and the other to run the scheduler. 45 | 46 | 47 | # Run the webserver on the custom port you can specify 48 | # MAKE SURE THIS PORT IS SPECIFIED IN YOUR SECURITY GROUP FOR INBOUND TRAFFIC. READ BELOW ARTICLE FOR MORE DETAILS. 49 | 50 | airflow webserver --port=<port> 51 | 52 | # RUN THE SCHEDULER 53 | airflow scheduler 54 | 55 | 56 | [Authorizing Access To An Instance](https://docs.aws.amazon.com/AWSEC2/latest/WindowsGuide/authorizing-access-to-an-instance.html) 57 | 58 | #### Once the scheduler is running you can access the Airflow UI using your browser. 59 | To see the Airflow webserver, open any browser and type in the address 60 | 61 | <ec2-public-dns-name>:<port> 62 | 63 | 64 | REFERENCES: 65 | 66 | [Build-a-concurrent-data-orchestration-pipeline-using-amazon-emr-and-apache-livy](https://aws.amazon.com/blogs/big-data/build-a-concurrent-data-orchestration-pipeline-using-amazon-emr-and-apache-livy/) 67 | 68 | [Airflow Installation Steps](https://limitlessdatascience.wordpress.com/2019/10/01/apache-airflow-installation-steps/) 69 | -------------------------------------------------------------------------------- /Data_Api_to_Postgres/README.md: -------------------------------------------------------------------------------- 1 | # API data to Database 2 | ## Overview 3 | This project builds a simple ETL pipeline that fetches real-time data from a public API and stores it in a database. Here the API is the Yelp Fusion API and the database is Postgres.
4 | 5 | ## Config File 6 | ``` 7 | [KEYS] 8 | CLIENT_KEY= 9 | API_KEY= 10 | 11 | 12 | [DATABASE] 13 | host= 14 | database= 15 | username= 16 | password= 17 | port= 18 | 19 | ``` 20 | 21 | 22 | ## Files 23 | ``` 24 | auth.py - Contains the configuration variables for making HTTP requests 25 | 26 | businesssearch.py - Contains the class that handles results returned from the search request 27 | 28 | databasedriver.py - Contains the connection details for the Postgres database and query execution 29 | 30 | queries.py - Contains the queries that create the schema and tables in Postgres, and the insert statement format 31 | 32 | request.py - Contains the class that handles making requests to the API 33 | 34 | driver.py - Entry point for the application; parses command line arguments and controls the program flow. 35 | ``` 36 | 37 | ## How to Run 38 | `python driver.py --term food --location Montreal --price 4` 39 | 40 | 41 | ## Results 42 | ![RESULTS](https://github.com/san089/Udacity-Data-Engineering-Projects/blob/master/Data_Api_to_Postgres/Results.PNG) 43 | -------------------------------------------------------------------------------- /Data_Api_to_Postgres/Results.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/san089/Udacity-Data-Engineering-Projects/c26bd834294cda2f4f2dfbf68c4c326635c55a93/Data_Api_to_Postgres/Results.PNG -------------------------------------------------------------------------------- /Data_Api_to_Postgres/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/san089/Udacity-Data-Engineering-Projects/c26bd834294cda2f4f2dfbf68c4c326635c55a93/Data_Api_to_Postgres/__init__.py -------------------------------------------------------------------------------- /Data_Api_to_Postgres/auth.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | from pathlib import Path 3 | 4 | config =
configparser.ConfigParser() 5 | config.read_file(open(f"{Path(__file__).parents[0]}/config.cfg")) 6 | 7 | api_key = config['KEYS']['API_KEY'] 8 | headers = {'Authorization': 'Bearer %s' % api_key} -------------------------------------------------------------------------------- /Data_Api_to_Postgres/businesssearch.py: -------------------------------------------------------------------------------- 1 | # This is request module of this project 2 | from request import Request 3 | from auth import headers 4 | import json 5 | 6 | class BusinessSearch: 7 | def __init__(self, term, location, price=None): 8 | self._param = {'term' : term, 'location' : location} 9 | if price: 10 | self._param['price'] = price 11 | self._base_url = 'https://api.yelp.com/v3/businesses/search' 12 | self._business_list = self._search_business() 13 | 14 | def _search_business(self): 15 | business_search_request = Request.get_content(url=self._base_url, param=self._param) 16 | return business_search_request['businesses'] if business_search_request is not None else [] 17 | 18 | def _parse_results(self, data): 19 | # Categories data : 'categories': [{'alias': 'bakeries', 'title': 'Bakeries'}] 20 | categories = ' '.join([category['title'] for category in data['categories']]) 21 | 22 | # Longitude and latitude data : 'coordinates': {'latitude': 45.5232, 'longitude': -73.583459} 23 | longitude = data['coordinates']['longitude'] 24 | latitude = data['coordinates']['latitude'] 25 | 26 | # Location example : 'location': { 'display_address': ['316 Avenue du Mont-Royal E', 'Montreal, QC H2T 1P7', 'Canada']} 27 | location = ','.join(data['location']['display_address']) 28 | 29 | return {"id" : data['id'], "name" : self._add_escape_character(data['name']), "image_url" : data['image_url'], "url" : data['url'], 30 | "review_count" : data['review_count'], "categories" : categories, "rating" : data['rating'], 31 | "latitude" : latitude, "longitude" : longitude, "price" : data['price'], "location" : location, 32 | 
"display_phone" : data['display_phone'] 33 | } 34 | 35 | def _add_escape_character(self, data): 36 | return data.replace("'", "''") 37 | 38 | def get_results(self): 39 | return [self._parse_results(business) for business in self._business_list] -------------------------------------------------------------------------------- /Data_Api_to_Postgres/databasedriver.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | import configparser 3 | from pathlib import Path 4 | from queries import create_business_schema, create_business_table 5 | 6 | config = configparser.ConfigParser() 7 | config.read_file(open(f"{Path(__file__).parents[0]}/config.cfg")) 8 | 9 | class DatabaseDriver: 10 | 11 | def __init__(self): 12 | self._conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['DATABASE'].values())) 13 | self._cur = self._conn.cursor() 14 | 15 | def execute_query(self, query): 16 | self._cur.execute(query) 17 | 18 | def setup(self): 19 | self.execute_query(create_business_schema) 20 | self.execute_query(create_business_table) -------------------------------------------------------------------------------- /Data_Api_to_Postgres/driver.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | from pathlib import Path 3 | from businesssearch import BusinessSearch 4 | from queries import create_business_schema, create_business_table, insert_business_table 5 | from databasedriver import DatabaseDriver 6 | import argparse 7 | 8 | config = configparser.ConfigParser() 9 | config.read_file(open(f"{Path(__file__).parents[0]}/config.cfg")) 10 | 11 | parser = argparse.ArgumentParser( 12 | description="A Example yelp business finder based on parameters such as term, location, price, ") 13 | 14 | api_key = config['KEYS']['API_KEY'] 15 | headers = {'Authorization': 'Bearer %s' % api_key} 16 | 17 | def to_string(data): 18 | return [str(value) for value 
in data.values()] 19 | 20 | def main(): 21 | args = parser.parse_args() 22 | # Pricing levels to filter the search result with: 1 = $, 2 = $$, 3 = $$$, 4 = $$$$. 23 | b = BusinessSearch(term=args.term, location=args.location, price=args.price) 24 | db = DatabaseDriver() 25 | db.setup() 26 | 27 | queries = [insert_business_table.format(*to_string(result)) for result in b.get_results()] 28 | query_to_execute = "BEGIN; \n" + '\n'.join(queries) + "\nCOMMIT;" 29 | db.execute_query(query_to_execute) 30 | 31 | if __name__ == "__main__": 32 | parser._action_groups.pop() 33 | required = parser.add_argument_group('required arguments') 34 | optional = parser.add_argument_group('optional arguments') 35 | required.add_argument("-t", "--term", metavar='', required=True, 36 | help="Search term, for example \"food\" or \"restaurants\". The term may also be business names, such as \"Starbucks.\".") 37 | required.add_argument("-l", "--location", metavar='', required=True, 38 | help="This string indicates the geographic area to be used when searching for businesses. 
") 39 | optional.add_argument("-p", "--price", type=int, metavar='', required=False, default=1, 40 | help="Pricing levels to filter the search result with: 1 = $, 2 = $$, 3 = $$$, 4 = $$$$.") 41 | 42 | main() -------------------------------------------------------------------------------- /Data_Api_to_Postgres/queries.py: -------------------------------------------------------------------------------- 1 | create_business_schema = """CREATE SCHEMA IF NOT EXISTS yelp;""" 2 | 3 | create_business_table = """ 4 | CREATE TABLE IF NOT EXISTS yelp.business ( 5 | business_id varchar PRIMARY KEY, 6 | business_name varchar, 7 | image_url varchar, 8 | url varchar, 9 | review_count int, 10 | categories varchar, 11 | rating float, 12 | latitude float, 13 | longitude float, 14 | price varchar, 15 | location varchar, 16 | phone varchar 17 | ); 18 | """ 19 | 20 | insert_business_table = """INSERT INTO yelp.business VALUES ('{}', '{}', '{}', '{}', {}, '{}', {}, {}, {}, '{}', '{}', '{}') 21 | ON CONFLICT (business_id) 22 | DO UPDATE SET 23 | business_id = EXCLUDED.business_id, 24 | business_name = EXCLUDED.business_name, 25 | image_url = EXCLUDED.image_url, 26 | url = EXCLUDED.url, 27 | review_count = EXCLUDED.review_count, 28 | categories = EXCLUDED.categories, 29 | rating = EXCLUDED.rating, 30 | latitude = EXCLUDED.latitude, 31 | longitude = EXCLUDED.longitude, 32 | price = EXCLUDED.price, 33 | location = EXCLUDED.location, 34 | phone = EXCLUDED.phone; 35 | """ -------------------------------------------------------------------------------- /Data_Api_to_Postgres/request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from auth import headers 3 | import json 4 | 5 | class Request: 6 | def __init__(self): 7 | self._header = headers 8 | 9 | @staticmethod 10 | def get_content(url, param): 11 | response = requests.get(url, headers=headers, params=param) 12 | if response.status_code == 200: 13 | return 
json.loads(response.content) 14 | else: 15 | print(f"Request completed with Error. Response Code : {response.status_code}") 16 | return None -------------------------------------------------------------------------------- /Data_Lake/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Project - Data Lake 3 | A music streaming startup, Sparkify, has grown their user base and song database even more and wants to move their data warehouse to a data lake. Their data resides in S3, in a directory of JSON logs on user activity on the app, as well as a directory with JSON metadata on the songs in their app. 4 | 5 | In this project, we will build an ETL pipeline for a data lake hosted on S3. We will load data from S3, process the data into analytics tables using Spark, and load them back into S3. We will deploy this Spark process on a cluster using AWS. 6 | 7 | ## Deployment 8 | 9 | The file `dl.cfg` is not included in this repository. It must contain: 10 | 11 | 12 | ``` 13 | KEY=YOUR_AWS_ACCESS_KEY 14 | SECRET=YOUR_AWS_SECRET_KEY 15 | ``` 16 | 17 | If your development environment is your local machine, copy the project directory from local to EMR: 18 | 19 | 20 | 21 | 22 | scp -r -i <.pem-file> <local-project-dir> <user>@<emr-master-node-dns>:~ 23 | 24 | Running the Spark job (before running the job, make sure the EMR role has access to S3) 25 | 26 | spark-submit --master yarn --deploy-mode client --driver-memory 4g --num-executors 2 --executor-memory 2g --executor-cores 2 etl.py 27 | 28 | ## ETL Pipeline 29 | 30 | 1. Read data from S3 31 | 32 | - Song data: `s3://udacity-dend/song_data` 33 | - Log data: `s3://udacity-dend/log_data` 34 | 35 | The script reads song_data and log_data from S3. 36 | 37 | 3. Process data using Spark 38 | 39 | The data is transformed into the five tables listed below: 40 | #### Fact Table 41 | **songplays** - records in log data associated with song plays i.e.
records with page `NextSong` 42 | - _songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent_ 43 | 44 | #### Dimension Tables 45 | **users** - users in the app 46 | Fields - _user_id, first_name, last_name, gender, level_ 47 | 48 | **songs** - songs in music database 49 | Fields - _song_id, title, artist_id, year, duration_ 50 | 51 | **artists** - artists in music database 52 | Fields - _artist_id, name, location, lattitude, longitude_ 53 | 54 | **time** - timestamps of records in **songplays** broken down into specific units 55 | Fields - _start_time, hour, day, week, month, year, weekday_ 56 | 57 | 4. Load it back to S3 58 | 59 | Writes them to partitioned parquet files in table directories on S3. 60 | -------------------------------------------------------------------------------- /Data_Lake/data/log-data/2018-11-01-events.json: -------------------------------------------------------------------------------- 1 | {"artist":null,"auth":"Logged In","firstName":"Walter","gender":"M","itemInSession":0,"lastName":"Frye","length":null,"level":"free","location":"San Francisco-Oakland-Hayward, CA","method":"GET","page":"Home","registration":1540919166796.0,"sessionId":38,"song":null,"status":200,"ts":1541105830796,"userAgent":"\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/36.0.1985.143 Safari\/537.36\"","userId":"39"} 2 | {"artist":null,"auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":0,"lastName":"Summers","length":null,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"GET","page":"Home","registration":1540344794796.0,"sessionId":139,"song":null,"status":200,"ts":1541106106796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 3 | {"artist":"Des'ree","auth":"Logged 
In","firstName":"Kaylee","gender":"F","itemInSession":1,"lastName":"Summers","length":246.30812,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"You Gotta Be","status":200,"ts":1541106106796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 4 | {"artist":null,"auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":2,"lastName":"Summers","length":null,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"GET","page":"Upgrade","registration":1540344794796.0,"sessionId":139,"song":null,"status":200,"ts":1541106132796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 5 | {"artist":"Mr Oizo","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":3,"lastName":"Summers","length":144.03873,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Flat 55","status":200,"ts":1541106352796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 6 | {"artist":"Tamba Trio","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":4,"lastName":"Summers","length":177.18812,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Quem Quiser Encontrar O Amor","status":200,"ts":1541106496796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 7 | {"artist":"The Mars Volta","auth":"Logged 
In","firstName":"Kaylee","gender":"F","itemInSession":5,"lastName":"Summers","length":380.42077,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Eriatarka","status":200,"ts":1541106673796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 8 | {"artist":"Infected Mushroom","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":6,"lastName":"Summers","length":440.2673,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Becoming Insane","status":200,"ts":1541107053796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 9 | {"artist":"Blue October \/ Imogen Heap","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":7,"lastName":"Summers","length":241.3971,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Congratulations","status":200,"ts":1541107493796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 10 | {"artist":"Girl Talk","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":8,"lastName":"Summers","length":160.15628,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Once again","status":200,"ts":1541107734796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 11 | {"artist":"Black Eyed Peas","auth":"Logged 
In","firstName":"Sylvie","gender":"F","itemInSession":0,"lastName":"Cruz","length":214.93506,"level":"free","location":"Washington-Arlington-Alexandria, DC-VA-MD-WV","method":"PUT","page":"NextSong","registration":1540266185796.0,"sessionId":9,"song":"Pump It","status":200,"ts":1541108520796,"userAgent":"\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit\/537.77.4 (KHTML, like Gecko) Version\/7.0.5 Safari\/537.77.4\"","userId":"10"} 12 | {"artist":null,"auth":"Logged In","firstName":"Ryan","gender":"M","itemInSession":0,"lastName":"Smith","length":null,"level":"free","location":"San Jose-Sunnyvale-Santa Clara, CA","method":"GET","page":"Home","registration":1541016707796.0,"sessionId":169,"song":null,"status":200,"ts":1541109015796,"userAgent":"\"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Ubuntu Chromium\/36.0.1985.125 Chrome\/36.0.1985.125 Safari\/537.36\"","userId":"26"} 13 | {"artist":"Fall Out Boy","auth":"Logged In","firstName":"Ryan","gender":"M","itemInSession":1,"lastName":"Smith","length":200.72444,"level":"free","location":"San Jose-Sunnyvale-Santa Clara, CA","method":"PUT","page":"NextSong","registration":1541016707796.0,"sessionId":169,"song":"Nobody Puts Baby In The Corner","status":200,"ts":1541109125796,"userAgent":"\"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Ubuntu Chromium\/36.0.1985.125 Chrome\/36.0.1985.125 Safari\/537.36\"","userId":"26"} 14 | {"artist":"M.I.A.","auth":"Logged In","firstName":"Ryan","gender":"M","itemInSession":2,"lastName":"Smith","length":233.7171,"level":"free","location":"San Jose-Sunnyvale-Santa Clara, CA","method":"PUT","page":"NextSong","registration":1541016707796.0,"sessionId":169,"song":"Mango Pickle Down River (With The Wilcannia Mob)","status":200,"ts":1541109325796,"userAgent":"\"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Ubuntu Chromium\/36.0.1985.125 Chrome\/36.0.1985.125 Safari\/537.36\"","userId":"26"} 15 | 
{"artist":"Survivor","auth":"Logged In","firstName":"Jayden","gender":"M","itemInSession":0,"lastName":"Fox","length":245.36771,"level":"free","location":"New Orleans-Metairie, LA","method":"PUT","page":"NextSong","registration":1541033612796.0,"sessionId":100,"song":"Eye Of The Tiger","status":200,"ts":1541110994796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.3; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/36.0.1985.143 Safari\/537.36\"","userId":"101"} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/san089/Udacity-Data-Engineering-Projects/c26bd834294cda2f4f2dfbf68c4c326635c55a93/Data_Lake/data/song_data/A/.DS_Store -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/san089/Udacity-Data-Engineering-Projects/c26bd834294cda2f4f2dfbf68c4c326635c55a93/Data_Lake/data/song_data/A/A/.DS_Store -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/A/TRAAAAW128F429D538.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD7TVE1187B99BFB1", "artist_latitude": null, "artist_longitude": null, "artist_location": "California - LA", "artist_name": "Casual", "song_id": "SOMZWCG12A8C13C480", "title": "I Didn't Mean To", "duration": 218.93179, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/A/TRAAABD128F429CF47.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMJAGH1187FB546F3", "artist_latitude": 35.14968, "artist_longitude": -90.04892, 
"artist_location": "Memphis, TN", "artist_name": "The Box Tops", "song_id": "SOCIWDW12A8C13D406", "title": "Soul Deep", "duration": 148.03546, "year": 1969} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/A/TRAAADZ128F9348C2E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKRRTF1187B9984DA", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Sonora Santanera", "song_id": "SOXVLOJ12AB0189215", "title": "Amor De Cabaret", "duration": 177.47546, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/A/TRAAAEF128F4273421.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7G5I41187FB4CE6C", "artist_latitude": null, "artist_longitude": null, "artist_location": "London, England", "artist_name": "Adam Ant", "song_id": "SONHOTT12A8C13493C", "title": "Something Girls", "duration": 233.40363, "year": 1982} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/A/TRAAAFD128F92F423A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARXR32B1187FB57099", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Gob", "song_id": "SOFSOCN12A8C143F5D", "title": "Face the Ashes", "duration": 209.60608, "year": 2007} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/A/TRAAAMO128F1481E7F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKFYS91187B98E58F", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Jeff And Sheri Easter", "song_id": 
"SOYMRWW12A6D4FAB14", "title": "The Moon And I (Ordinary Day Album Version)", "duration": 267.7024, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/A/TRAAAMQ128F1460CD3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD0S291187B9B7BF5", "artist_latitude": null, "artist_longitude": null, "artist_location": "Ohio", "artist_name": "Rated R", "song_id": "SOMJBYD12A6D4F8557", "title": "Keepin It Real (Skit)", "duration": 114.78159, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/A/TRAAAPK128E0786D96.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR10USD1187B99F3F1", "artist_latitude": null, "artist_longitude": null, "artist_location": "Burlington, Ontario, Canada", "artist_name": "Tweeterfriendly Music", "song_id": "SOHKNRJ12A6701D1F8", "title": "Drop of Rain", "duration": 189.57016, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/A/TRAAARJ128F9320760.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR8ZCNI1187B9A069B", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Planet P Project", "song_id": "SOIAZJW12AB01853F1", "title": "Pink World", "duration": 269.81832, "year": 1984} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/A/TRAAAVG12903CFA543.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNTLGG11E2835DDB9", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Clp", "song_id": "SOUDSGM12AC9618304", "title": 
"Insatiable (Instrumental Version)", "duration": 266.39628, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/A/TRAAAVO128F93133D4.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGSJW91187B9B1D6B", "artist_latitude": 35.21962, "artist_longitude": -80.01955, "artist_location": "North Carolina", "artist_name": "JennyAnyKind", "song_id": "SOQHXMF12AB0182363", "title": "Young Boy Blues", "duration": 218.77506, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/B/TRAABCL128F4286650.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARC43071187B990240", "artist_latitude": null, "artist_longitude": null, "artist_location": "Wisner, LA", "artist_name": "Wayne Watson", "song_id": "SOKEJEJ12A8C13E0D0", "title": "The Urgency (LP Version)", "duration": 245.21098, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/B/TRAABDL12903CAABBA.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARL7K851187B99ACD2", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Andy Andy", "song_id": "SOMUYGI12AB0188633", "title": "La Culpa", "duration": 226.35057, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/B/TRAABJL12903CDCF1A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARHHO3O1187B989413", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Bob Azzam", "song_id": "SORAMLE12AB017C8B0", "title": "Auguri Cha Cha", "duration": 191.84281, "year": 0} 
-------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/B/TRAABJV128F1460C49.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARIK43K1187B9AE54C", "artist_latitude": null, "artist_longitude": null, "artist_location": "Beverly Hills, CA", "artist_name": "Lionel Richie", "song_id": "SOBONFF12A6D4F84D8", "title": "Tonight Will Be Alright", "duration": 307.3824, "year": 1986} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/B/TRAABLR128F423B7E3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD842G1187B997376", "artist_latitude": 43.64856, "artist_longitude": -79.38533, "artist_location": "Toronto, Ontario, Canada", "artist_name": "Blue Rodeo", "song_id": "SOHUOAP12A8AE488E9", "title": "Floating", "duration": 491.12771, "year": 1987} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/B/TRAABNV128F425CEE1.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARIG6O41187B988BDD", "artist_latitude": 37.16793, "artist_longitude": -95.84502, "artist_location": "United States", "artist_name": "Richard Souther", "song_id": "SOUQQEA12A8C134B1B", "title": "High Tide", "duration": 228.5971, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/B/TRAABRB128F9306DD5.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR1ZHYZ1187FB3C717", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Faiz Ali Faiz", "song_id": "SOILPQQ12AB017E82A", "title": "Sohna Nee Sohna Data", "duration": 599.24853, "year": 0} 
-------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/B/TRAABVM128F92CA9DC.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARYKCQI1187FB3B18F", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Tesla", "song_id": "SOXLBJT12A8C140925", "title": "Caught In A Dream", "duration": 290.29832, "year": 2004} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/B/TRAABXG128F9318EBD.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNPAGP1241B9C7FD4", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "lextrical", "song_id": "SOZVMJI12AB01808AF", "title": "Synthetic Dream", "duration": 165.69424, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/B/TRAABYN12903CFD305.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARQGYP71187FB44566", "artist_latitude": 34.31109, "artist_longitude": -94.02978, "artist_location": "Mineola, AR", "artist_name": "Jimmy Wakely", "song_id": "SOWTBJW12AC468AC6E", "title": "Broken-Down Merry-Go-Round", "duration": 151.84934, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/B/TRAABYW128F4244559.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARI3BMM1187FB4255E", "artist_latitude": 38.8991, "artist_longitude": -77.029, "artist_location": "Washington", "artist_name": "Alice Stuart", "song_id": "SOBEBDG12A58A76D60", "title": "Kassie Jones", "duration": 220.78649, "year": 0} 
-------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACCG128F92E8A55.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR5KOSW1187FB35FF4", "artist_latitude": 49.80388, "artist_longitude": 15.47491, "artist_location": "Dubai UAE", "artist_name": "Elena", "song_id": "SOZCTXZ12AB0182364", "title": "Setanta matins", "duration": 269.58322, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACER128F4290F96.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMAC4T1187FB3FA4C", "artist_latitude": 40.82624, "artist_longitude": -74.47995, "artist_location": "Morris Plains, NJ", "artist_name": "The Dillinger Escape Plan", "song_id": "SOBBUGU12A8C13E95D", "title": "Setting Fire to Sleeping Giants", "duration": 207.77751, "year": 2004} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACFV128F935E50B.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR47JEX1187B995D81", "artist_latitude": 37.83721, "artist_longitude": -94.35868, "artist_location": "Nevada, MO", "artist_name": "SUE THOMPSON", "song_id": "SOBLGCN12AB0183212", "title": "James (Hold The Ladder Steady)", "duration": 124.86485, "year": 1985} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACHN128F1489601.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGIWFO1187B9B55B7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Five Bolt Main", "song_id": "SOPSWQW12A6D4F8781", "title": "Made Like This (Live)", "duration": 
225.09669, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACIW12903CC0F6D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNTLGG11E2835DDB9", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Clp", "song_id": "SOZQDIU12A58A7BCF6", "title": "Superconfidential", "duration": 338.31138, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACLV128F427E123.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARDNS031187B9924F0", "artist_latitude": 32.67828, "artist_longitude": -83.22295, "artist_location": "Georgia", "artist_name": "Tim Wilson", "song_id": "SONYPOM12A8C13B2D7", "title": "I Think My Wife Is Running Around On Me (Taco Hell)", "duration": 186.48771, "year": 2005} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACNS128F14A2DF5.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AROUOZZ1187B9ABE51", "artist_latitude": 40.79195, "artist_longitude": -73.94512, "artist_location": "New York, NY [Spanish Harlem]", "artist_name": "Willie Bobo", "song_id": "SOBZBAZ12A6D4F8742", "title": "Spanish Grease", "duration": 168.25424, "year": 1997} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACOW128F933E35F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARH4Z031187B9A71F2", "artist_latitude": 40.73197, "artist_longitude": -74.17418, "artist_location": "Newark, NJ", "artist_name": "Faye Adams", "song_id": "SOVYKGO12AB0187199", "title": "Crazy Mixed Up World", 
"duration": 156.39465, "year": 1961} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACPE128F421C1B9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARB29H41187B98F0EF", "artist_latitude": 41.88415, "artist_longitude": -87.63241, "artist_location": "Chicago", "artist_name": "Terry Callier", "song_id": "SOGNCJP12A58A80271", "title": "Do You Finally Need A Friend", "duration": 342.56934, "year": 1972} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACQT128F9331780.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR1Y2PT1187FB5B9CE", "artist_latitude": 27.94017, "artist_longitude": -82.32547, "artist_location": "Brandon", "artist_name": "John Wesley", "song_id": "SOLLHMX12AB01846DC", "title": "The Emperor Falls", "duration": 484.62322, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACSL128F93462F4.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARAJPHH1187FB5566A", "artist_latitude": 40.7038, "artist_longitude": -73.83168, "artist_location": "Queens, NY", "artist_name": "The Shangri-Las", "song_id": "SOYTPEP12AB0180E7B", "title": "Twist and Shout", "duration": 164.80608, "year": 1964} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACTB12903CAAF15.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR0RCMP1187FB3F427", "artist_latitude": 30.08615, "artist_longitude": -94.10158, "artist_location": "Beaumont, TX", "artist_name": "Billie Jo Spears", "song_id": "SOGXHEG12AB018653E", "title": "It Makes No 
Difference Now", "duration": 133.32853, "year": 1992} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACVS128E078BE39.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREBBGV1187FB523D2", "artist_latitude": null, "artist_longitude": null, "artist_location": "Houston, TX", "artist_name": "Mike Jones (Featuring CJ_ Mello & Lil' Bran)", "song_id": "SOOLYAZ12A6701F4A6", "title": "Laws Patrolling (Album Version)", "duration": 173.66159, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/A/C/TRAACZK128F4243829.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGUVEV1187B98BA17", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Sierra Maestra", "song_id": "SOGOSOV12AF72A285E", "title": "\u00bfD\u00f3nde va Chichi?", "duration": 313.12934, "year": 1997} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/san089/Udacity-Data-Engineering-Projects/c26bd834294cda2f4f2dfbf68c4c326635c55a93/Data_Lake/data/song_data/A/B/.DS_Store -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/A/TRABACN128F425B784.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD7TVE1187B99BFB1", "artist_latitude": null, "artist_longitude": null, "artist_location": "California - LA", "artist_name": "Casual", "song_id": "SOQLGFP12A58A7800E", "title": "OAKtown", "duration": 259.44771, "year": 0} -------------------------------------------------------------------------------- 
/Data_Lake/data/song_data/A/B/A/TRABAFJ128F42AF24E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR3JMC51187B9AE49D", "artist_latitude": 28.53823, "artist_longitude": -81.37739, "artist_location": "Orlando, FL", "artist_name": "Backstreet Boys", "song_id": "SOPVXLX12A8C1402D5", "title": "Larger Than Life", "duration": 236.25098, "year": 1999} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/A/TRABAFP128F931E9A1.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARPBNLO1187FB3D52F", "artist_latitude": 40.71455, "artist_longitude": -74.00712, "artist_location": "New York, NY", "artist_name": "Tiny Tim", "song_id": "SOAOIBZ12AB01815BE", "title": "I Hold Your Hand In Mine [Live At Royal Albert Hall]", "duration": 43.36281, "year": 2000} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/A/TRABAIO128F42938F9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR9AWNF1187B9AB0B4", "artist_latitude": null, "artist_longitude": null, "artist_location": "Seattle, Washington USA", "artist_name": "Kenny G featuring Daryl Hall", "song_id": "SOZHPGD12A8C1394FE", "title": "Baby Come To Me", "duration": 236.93016, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/A/TRABATO128F42627E9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AROGWRA122988FEE45", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Christos Dantis", "song_id": "SOSLAVG12A8C13397F", "title": "Den Pai Alo", "duration": 243.82649, "year": 0} 
-------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/A/TRABAVQ12903CBF7E0.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMBR4Y1187B9990EB", "artist_latitude": 37.77916, "artist_longitude": -122.42005, "artist_location": "California - SF", "artist_name": "David Martin", "song_id": "SOTTDKS12AB018D69B", "title": "It Wont Be Christmas", "duration": 241.47546, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/A/TRABAWW128F4250A31.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARQ9BO41187FB5CF1F", "artist_latitude": 40.99471, "artist_longitude": -77.60454, "artist_location": "Pennsylvania", "artist_name": "John Davis", "song_id": "SOMVWWT12A58A7AE05", "title": "Knocked Out Of The Park", "duration": 183.17016, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/A/TRABAXL128F424FC50.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKULSX1187FB45F84", "artist_latitude": 39.49974, "artist_longitude": -111.54732, "artist_location": "Utah", "artist_name": "Trafik", "song_id": "SOQVMXR12A81C21483", "title": "Salt In NYC", "duration": 424.12363, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/A/TRABAXR128F426515F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARI2JSK1187FB496EF", "artist_latitude": 51.50632, "artist_longitude": -0.12714, "artist_location": "London, England", "artist_name": "Nick Ingman;Gavyn Wright", "song_id": "SODUJBS12A8C132150", "title": "Wessex Loses a Bride", "duration": 111.62077, "year": 
0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/A/TRABAXV128F92F6AE3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREDBBQ1187B98AFF5", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Eddie Calvert", "song_id": "SOBBXLX12A58A79DDA", "title": "Erica (2005 Digital Remaster)", "duration": 138.63138, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/A/TRABAZH128F930419A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7ZKHQ1187B98DD73", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Glad", "song_id": "SOTUKVB12AB0181477", "title": "Blessed Assurance", "duration": 270.602, "year": 1993} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/B/TRABBAM128F429D223.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARBGXIG122988F409D", "artist_latitude": 37.77916, "artist_longitude": -122.42005, "artist_location": "California - SF", "artist_name": "Steel Rain", "song_id": "SOOJPRH12A8C141995", "title": "Loaded Like A Gun", "duration": 173.19138, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/B/TRABBBV128F42967D7.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7SMBG1187B9B9066", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Los Manolos", "song_id": "SOBCOSW12A8C13D398", "title": "Rumba De Barcelona", "duration": 218.38322, "year": 0} 
-------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/B/TRABBJE12903CDB442.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGCY1Y1187B9A4FA5", "artist_latitude": 36.16778, "artist_longitude": -86.77836, "artist_location": "Nashville, TN.", "artist_name": "Gloriana", "song_id": "SOQOTLQ12AB01868D0", "title": "Clementina Santaf\u00e8", "duration": 153.33832, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/B/TRABBKX128F4285205.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR36F9J1187FB406F1", "artist_latitude": 56.27609, "artist_longitude": 9.51695, "artist_location": "Denmark", "artist_name": "Bombay Rockers", "song_id": "SOBKWDJ12A8C13B2F3", "title": "Wild Rose (Back 2 Basics Mix)", "duration": 230.71302, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/B/TRABBLU128F93349CF.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNNKDK1187B98BBD5", "artist_latitude": 45.80726, "artist_longitude": 15.9676, "artist_location": "Zagreb Croatia", "artist_name": "Jinx", "song_id": "SOFNOQK12AB01840FC", "title": "Kutt Free (DJ Volume Remix)", "duration": 407.37914, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/B/TRABBNP128F932546F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR62SOJ1187FB47BB5", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Chase & Status", "song_id": "SOGVQGJ12AB017F169", "title": "Ten Tonne", "duration": 337.68444, "year": 2005} 
-------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/B/TRABBOP128F931B50D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARBEBBY1187B9B43DB", "artist_latitude": null, "artist_longitude": null, "artist_location": "Gainesville, FL", "artist_name": "Tom Petty", "song_id": "SOFFKZS12AB017F194", "title": "A Higher Place (Album Version)", "duration": 236.17261, "year": 1994} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/B/TRABBOR128F4286200.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARDR4AC1187FB371A1", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Montserrat Caball\u00e9;Placido Domingo;Vicente Sardinero;Judith Blegen;Sherrill Milnes;Georg Solti", "song_id": "SOBAYLL12A8C138AF9", "title": "Sono andati? 
Fingevo di dormire", "duration": 511.16363, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/B/TRABBTA128F933D304.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARAGB2O1187FB3A161", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Pucho & His Latin Soul Brothers", "song_id": "SOLEYHO12AB0188A85", "title": "Got My Mojo Workin", "duration": 338.23302, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/B/TRABBVJ128F92F7EAA.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREDL271187FB40F44", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Soul Mekanik", "song_id": "SOPEGZN12AB0181B3D", "title": "Get Your Head Stuck On Your Neck", "duration": 45.66159, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/B/TRABBXU128F92FEF48.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARP6N5A1187B99D1A3", "artist_latitude": null, "artist_longitude": null, "artist_location": "Hamtramck, MI", "artist_name": "Mitch Ryder", "song_id": "SOXILUQ12A58A7C72A", "title": "Jenny Take a Ride", "duration": 207.43791, "year": 2004} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/B/TRABBZN12903CD9297.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGSAFR1269FB35070", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Blingtones", "song_id": "SOTCKKY12AB018A141", "title": "Sonnerie lalaleul\u00e9 hi houuu", 
"duration": 29.54404, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/C/TRABCAJ12903CDFCC2.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARULZCI1241B9C8611", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Luna Orbit Project", "song_id": "SOSWKAV12AB018FC91", "title": "Midnight Star", "duration": 335.51628, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/C/TRABCEC128F426456E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR0IAWL1187B9A96D0", "artist_latitude": 8.4177, "artist_longitude": -80.11278, "artist_location": "Panama", "artist_name": "Danilo Perez", "song_id": "SONSKXP12A8C13A2C9", "title": "Native Soul", "duration": 197.19791, "year": 2003} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/C/TRABCEI128F424C983.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/C/TRABCFL128F149BB0D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARLTWXK1187FB5A3F8", "artist_latitude": 32.74863, "artist_longitude": -97.32925, "artist_location": "Fort Worth, TX", "artist_name": "King Curtis", "song_id": "SODREIN12A58A7F2E5", "title": "A Whiter Shade Of Pale (Live @ Fillmore West)", "duration": 326.00771, 
"year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/C/TRABCIX128F4265903.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNF6401187FB57032", "artist_latitude": 40.79086, "artist_longitude": -73.96644, "artist_location": "New York, NY [Manhattan]", "artist_name": "Sophie B. Hawkins", "song_id": "SONWXQJ12A8C134D94", "title": "The Ballad Of Sleeping Beauty", "duration": 305.162, "year": 1994} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/C/TRABCKL128F423A778.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARPFHN61187FB575F6", "artist_latitude": 41.88415, "artist_longitude": -87.63241, "artist_location": "Chicago, IL", "artist_name": "Lupe Fiasco", "song_id": "SOWQTQZ12A58A7B63E", "title": "Streets On Fire (Explicit Album Version)", "duration": 279.97995, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/C/TRABCPZ128F4275C32.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR051KA1187B98B2FF", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Wilks", "song_id": "SOLYIBD12A8C135045", "title": "Music is what we love", "duration": 261.51138, "year": 0} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/C/TRABCRU128F423F449.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR8IEZO1187B99055E", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Marc Shaiman", "song_id": "SOINLJW12A8C13314C", "title": "City Slickers", "duration": 149.86404, 
"year": 2008} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/C/TRABCTK128F934B224.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR558FS1187FB45658", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "40 Grit", "song_id": "SOGDBUF12A8C140FAA", "title": "Intro", "duration": 75.67628, "year": 2003} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/C/TRABCUQ128E0783E2B.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARVBRGZ1187FB4675A", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Gwen Stefani", "song_id": "SORRZGD12A6310DBC3", "title": "Harajuku Girls", "duration": 290.55955, "year": 2004} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/C/TRABCXB128F4286BD3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARWB3G61187FB49404", "artist_latitude": null, "artist_longitude": null, "artist_location": "Hamilton, Ohio", "artist_name": "Steve Morse", "song_id": "SODAUVL12A8C13D184", "title": "Prognosis", "duration": 363.85914, "year": 2000} -------------------------------------------------------------------------------- /Data_Lake/data/song_data/A/B/C/TRABCYE128F934CE1D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREVWGE1187B9B890A", "artist_latitude": -13.442, "artist_longitude": -41.9952, "artist_location": "Noci (BA)", "artist_name": "Bitter End", "song_id": "SOFCHDR12AB01866EF", "title": "Living Hell", "duration": 282.43546, "year": 0} 
"""Spark ETL job for the Data Lake project.

Reads the raw song and event-log JSON files, derives the songs, artists,
users, time and songplays tables from them and writes each table as Parquet.
"""

import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, monotonically_increasing_id
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, dayofweek
from pyspark.sql.types import *

config = configparser.ConfigParser()
config.read('dl.cfg')

# Environment variables that must be populated from dl.cfg.
_AWS_CREDENTIAL_KEYS = ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')


def _export_aws_credentials(parser):
    """
    Description:
        Copy the AWS credentials stored in the configuration into os.environ.

        Indexing a ConfigParser with ``parser[name]`` returns a *section*, not
        an option value, so ``os.environ[key] = parser[key]`` can never work:
        it raises KeyError when no such section exists and TypeError when it
        does. The options are therefore looked up inside whichever section
        defines them, so no particular section name is assumed.

    :param parser: a ConfigParser instance that has already read the config file
    """
    for key in _AWS_CREDENTIAL_KEYS:
        for section in parser.values():
            if key in section:
                # raw=True: credentials are literal strings, never templates.
                value = section.get(key, raw=True)
                if value:
                    os.environ[key] = value
                break


_export_aws_credentials(config)


def create_spark_session():
    """
    Description:
        Build (or fetch the already running) Spark session, configured to pull
        in the hadoop-aws package.

    :return: a spark session instance
    """
    spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()
    return spark


def process_song_data(spark, input_data, output_data):
    """
    Description:
        Process the song data files and extract the songs table and the
        artists table from them.

    :param spark: a spark session instance
    :param input_data: input file path
    :param output_data: output file path
    """

    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*")

    # read song data file
    df = spark.read.json(
        song_data,
        mode='PERMISSIVE',
        columnNameOfCorruptRecord='corrupt_record',
    ).drop_duplicates()

    # extract columns to create songs table
    songs_table = df.select(
        "song_id", "title", "artist_id", "year", "duration"
    ).drop_duplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(
        os.path.join(output_data, "songs/"),
        mode="overwrite",
        partitionBy=["year", "artist_id"],
    )

    # extract columns to create artists table
    artists_table = df.select(
        "artist_id", "artist_name", "artist_location",
        "artist_latitude", "artist_longitude",
    ).drop_duplicates()

    # write artists table to parquet files
    artists_table.write.parquet(
        os.path.join(output_data, "artists/"), mode="overwrite"
    )


def process_log_data(spark, input_data, output_data):
    """
    Description:
        Process the event log files and extract the users, time and songplays
        tables from them. The songplays table is built by joining the log
        events with the songs table previously written under output_data, so
        process_song_data must have run first.

    :param spark: a spark session instance
    :param input_data: input file path
    :param output_data: output file path
    """

    # get filepath to log data file
    log_data = os.path.join(input_data, "log-data/")

    # read log data file
    df = spark.read.json(
        log_data,
        mode='PERMISSIVE',
        columnNameOfCorruptRecord='corrupt_record',
    ).drop_duplicates()

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_table = df.select(
        "userId", "firstName", "lastName", "gender", "level"
    ).drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, "users/"), mode="overwrite")

    # create timestamp column from original timestamp column (ts is in milliseconds)
    # NOTE(review): utcfromtimestamp yields a naive datetime; confirm the
    # cluster/session time zone is UTC so the stored instant is not shifted.
    get_timestamp = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000), TimestampType())
    df = df.withColumn("start_time", get_timestamp("ts"))

    # extract columns to create time table
    time_table = df.withColumn("hour", hour("start_time"))\
        .withColumn("day", dayofmonth("start_time"))\
        .withColumn("week", weekofyear("start_time"))\
        .withColumn("month", month("start_time"))\
        .withColumn("year", year("start_time"))\
        .withColumn("weekday", dayofweek("start_time"))\
        .select("ts", "start_time", "hour", "day", "week", "month", "year", "weekday")\
        .drop_duplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(
        os.path.join(output_data, "time_table/"),
        mode='overwrite',
        partitionBy=["year", "month"],
    )

    # read in song data to use for songplays table; basePath keeps the
    # partition columns (year, artist_id) in the loaded schema
    song_df = spark.read\
        .format("parquet")\
        .option("basePath", os.path.join(output_data, "songs/"))\
        .load(os.path.join(output_data, "songs/*/*/"))

    # extract columns from joined song and log datasets to create songplays table
    # NOTE(review): the join matches on song title only, so different songs
    # sharing a title will all match -- confirm whether duration/artist should
    # be part of the condition.
    # year/month are derived straight from start_time (the same expressions
    # the time table uses) instead of joining back to time_table for them.
    songplays_table = df.join(song_df, df.song == song_df.title, how='inner')\
        .select(
            monotonically_increasing_id().alias("songplay_id"),
            col("start_time"),
            col("userId").alias("user_id"),
            "level",
            "song_id",
            "artist_id",
            col("sessionId").alias("session_id"),
            "location",
            col("userAgent").alias("user_agent"),
            year("start_time").alias("year"),
            month("start_time").alias("month"),
        )

    # write songplays table to parquet files partitioned by year and month
    songplays_table.drop_duplicates().write.parquet(
        os.path.join(output_data, "songplays/"),
        mode="overwrite",
        partitionBy=["year", "month"],
    )


def main():
    """
    Description:
        Run the whole pipeline: create the Spark session, then process the
        song data followed by the log data.
    """
    spark = create_spark_session()
    input_data = "s3://udacity-spark-project/"
    output_data = "s3://udacity-spark-project/output/"

    process_song_data(spark, input_data, output_data)
    process_log_data(spark, input_data, output_data)


if __name__ == "__main__":
    main()
In,Kaylee,F,3,Summers,144.03873,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Flat 55,200,1.54111E+12,8 7 | Tamba Trio,Logged In,Kaylee,F,4,Summers,177.18812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Quem Quiser Encontrar O Amor,200,1.54111E+12,8 8 | The Mars Volta,Logged In,Kaylee,F,5,Summers,380.42077,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Eriatarka,200,1.54111E+12,8 9 | Infected Mushroom,Logged In,Kaylee,F,6,Summers,440.2673,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Becoming Insane,200,1.54111E+12,8 10 | Blue October / Imogen Heap,Logged In,Kaylee,F,7,Summers,241.3971,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Congratulations,200,1.54111E+12,8 11 | Girl Talk,Logged In,Kaylee,F,8,Summers,160.15628,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Once again,200,1.54111E+12,8 12 | Black Eyed Peas,Logged In,Sylvie,F,0,Cruz,214.93506,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",PUT,NextSong,1.54027E+12,9,Pump It,200,1.54111E+12,10 13 | ,Logged In,Ryan,M,0,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,169,,200,1.54111E+12,26 14 | Fall Out Boy,Logged In,Ryan,M,1,Smith,200.72444,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,169,Nobody Puts Baby In The Corner,200,1.54111E+12,26 15 | M.I.A.,Logged In,Ryan,M,2,Smith,233.7171,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,169,Mango Pickle Down River (With The Wilcannia Mob),200,1.54111E+12,26 16 | Survivor,Logged In,Jayden,M,0,Fox,245.36771,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,100,Eye Of The Tiger,200,1.54111E+12,101 17 | -------------------------------------------------------------------------------- /Data_Modeling_with_Apache_Cassandra/event_data/2018-11-10-events.csv: -------------------------------------------------------------------------------- 1 | 
artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | Hoobastank,Logged In,Cierra,F,0,Finley,241.3971,free,"Richmond, VA",PUT,NextSong,1.54101E+12,132,Say The Same,200,1.54181E+12,96 3 | Mark Knopfler,Logged In,Cierra,F,1,Finley,249.3122,free,"Richmond, VA",PUT,NextSong,1.54101E+12,132,Why Aye Man,200,1.54181E+12,96 4 | Mogwai,Logged In,Cierra,F,2,Finley,341.28934,free,"Richmond, VA",PUT,NextSong,1.54101E+12,132,We're No Here,200,1.54181E+12,96 5 | The Casualties,Logged In,Cierra,F,3,Finley,181.49832,free,"Richmond, VA",PUT,NextSong,1.54101E+12,132,Punx Unite,200,1.54181E+12,96 6 | ,Logged In,Cecilia,F,0,Owens,,free,"Atlanta-Sandy Springs-Roswell, GA",GET,Home,1.54103E+12,424,,200,1.54181E+12,6 7 | The Living End,Logged In,Ryan,M,0,Smith,188.62975,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,433,Roll On (Album Version),200,1.54182E+12,26 8 | Aloe Blacc,Logged In,Rylan,M,0,George,244.1922,free,"Birmingham-Hoover, AL",PUT,NextSong,1.54102E+12,402,I Need A Dollar,200,1.54183E+12,16 9 | Faith No More,Logged In,Rylan,M,1,George,326.50404,free,"Birmingham-Hoover, AL",PUT,NextSong,1.54102E+12,402,Helpless,200,1.54183E+12,16 10 | Chris Cornell,Logged In,Aleena,F,0,Kirby,353.69751,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Sunshower (Great Expectations Soundtrack),200,1.54184E+12,44 11 | Weezer,Logged In,Aleena,F,1,Kirby,203.93751,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,My Name Is Jonas,200,1.54184E+12,44 12 | Stream of Passion feat. Ayreon,Logged In,Aleena,F,2,Kirby,257.56689,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Valley Of The Queens,200,1.54184E+12,44 13 | Lupe Fiasco,Logged In,Aleena,F,3,Kirby,273.94567,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Shining Down [feat. 
Matthew Santos] (Amended Album Version),200,1.54184E+12,44 14 | Tom Petty,Logged In,Aleena,F,4,Kirby,263.23546,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Runnin' Down A Dream,200,1.54184E+12,44 15 | The Killers,Logged In,Aleena,F,5,Kirby,220.89098,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,When You Were Young,200,1.54184E+12,44 16 | Afghan Whigs,Logged In,Aleena,F,6,Kirby,179.40853,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,I'm Her Slave (Album),200,1.54184E+12,44 17 | CSS,Logged In,Aleena,F,7,Kirby,213.75955,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Let's Make Love And Listen To Death From Above [Dan Carey Mix] (remastered album version),200,1.54184E+12,44 18 | Mos Def / Talib Kweli,Logged In,Aleena,F,8,Kirby,141.37424,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,History,200,1.54184E+12,44 19 | Ryan Leslie,Logged In,Aleena,F,9,Kirby,203.96363,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,How It Was Supposed To Be,200,1.54184E+12,44 20 | Mark Lowry,Logged In,Aleena,F,10,Kirby,168.28036,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Get Together With The Lord (The Best Of Mark Lowry - Volume 2 Version),200,1.54184E+12,44 21 | Beirut,Logged In,Aleena,F,11,Kirby,230.19057,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Nantes,200,1.54184E+12,44 22 | MODESELEKTOR FEAT. 
PUPPETMASTAZ,Logged In,Aleena,F,12,Kirby,52.79302,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,THE DARK SIDE OF THE FROG,200,1.54184E+12,44 23 | Kid Cudi / Kanye West / Common,Logged In,Aleena,F,13,Kirby,237.76608,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Make Her Say,200,1.54184E+12,44 24 | Julie Ruin,Logged In,Aleena,F,14,Kirby,142.47138,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Breakout A-Town,200,1.54184E+12,44 25 | Sons And Daughters,Logged In,Aleena,F,15,Kirby,165.90322,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,The Bell,200,1.54184E+12,44 26 | Children 18:3,Logged In,Aleena,F,16,Kirby,178.52036,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Mock The Music,200,1.54184E+12,44 27 | Chris Cagle,Logged In,Aleena,F,17,Kirby,232.85506,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Miss Me Baby,200,1.54184E+12,44 28 | John Waite,Logged In,Aleena,F,18,Kirby,269.76608,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Missing You,200,1.54184E+12,44 29 | Basshunter,Logged In,Aleena,F,19,Kirby,223.32036,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Walk On Water,200,1.54184E+12,44 30 | Jay-Z / Lil Wayne,Logged In,Aleena,F,20,Kirby,236.01587,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Hello Brooklyn 2.0,200,1.54184E+12,44 31 | Snow Patrol,Logged In,Aleena,F,21,Kirby,273.6322,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,It's Beginning To Get To Me,200,1.54184E+12,44 32 | Coldcut,Logged In,Aleena,F,22,Kirby,203.07546,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Autumn Leaves,200,1.54184E+12,44 33 | Magic Dirt,Logged In,Aleena,F,23,Kirby,251.79383,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Plastic Loveless Letter,200,1.54184E+12,44 34 | J. 
Karjalainen & Mustat Lasit,Logged In,Aleena,F,24,Kirby,336.74404,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Sinisten tähtien alla,200,1.54184E+12,44 35 | OneRepublic,Logged In,Aleena,F,25,Kirby,224.67873,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Secrets,200,1.54184E+12,44 36 | Nirvana,Logged In,Aleena,F,26,Kirby,219.08853,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Come As You Are,200,1.54184E+12,44 37 | ,Logged In,Theodore,M,0,Smith,,free,"Houston-The Woodlands-Sugar Land, TX",GET,Home,1.54031E+12,359,,200,1.54184E+12,52 38 | Joyce Cooling,Logged In,Aleena,F,27,Kirby,248.11057,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,It's Time I Go (Jazz),200,1.54184E+12,44 39 | Beastie Boys,Logged In,Aleena,F,28,Kirby,211.722,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Unite (2009 Digital Remaster),200,1.54184E+12,44 40 | Usher Featuring Lil' Jon & Ludacris,Logged In,Aleena,F,29,Kirby,250.38322,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Yeah!,200,1.54184E+12,44 41 | Nelly / Paul Wall / Ali & Gipp,Logged In,Aleena,F,30,Kirby,272.50893,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Grillz,200,1.54184E+12,44 42 | The Audition,Logged In,Aleena,F,31,Kirby,207.20281,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,The Running Man,200,1.54184E+12,44 43 | Savage Garden,Logged In,Aleena,F,32,Kirby,277.26322,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Truly Madly Deeply,200,1.54184E+12,44 44 | Adam Green,Logged In,Aleena,F,33,Kirby,141.00853,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Festival Song,200,1.54184E+12,44 45 | Tom Petty,Logged In,Aleena,F,34,Kirby,204.82567,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Square One (Album Version),200,1.54184E+12,44 46 | Muse,Logged In,Aleena,F,35,Kirby,209.34485,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Supermassive Black Hole (Album 
Version),200,1.54184E+12,44 47 | The Gerbils,Logged In,Jordan,F,0,Hicks,27.01016,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,(iii),200,1.54184E+12,37 48 | Robert Plant,Logged In,Jordan,F,1,Hicks,265.66485,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Dancing In Heaven (2006 Remastered LP Version),200,1.54184E+12,37 49 | Metallica,Logged In,Jordan,F,2,Hicks,387.02975,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Welcome Home (Sanitarium),200,1.54184E+12,37 50 | Infected Mushroom,Logged In,Jordan,F,3,Hicks,506.51383,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Deeply Disturbed,200,1.54184E+12,37 51 | Eliza Doolittle,Logged In,Jordan,F,4,Hicks,184.60689,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Rollerblades,200,1.54185E+12,37 52 | Alvin And The Chipmunks,Logged In,Jordan,F,5,Hicks,162.63791,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Ain't No Party,200,1.54185E+12,37 53 | Chromeo,Logged In,Jordan,F,6,Hicks,348.65587,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,You're So Gangsta,200,1.54185E+12,37 54 | Keisha White,Logged In,Kevin,M,0,Arellano,251.42812,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1.54001E+12,387,Brother,200,1.54185E+12,66 55 | Juanes,Logged In,Kevin,M,1,Arellano,247.37914,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1.54001E+12,387,Damelo,200,1.54185E+12,66 56 | ,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54092E+12,180,,200,1.54185E+12,39 57 | Karnivool,Logged In,Ryan,M,0,Smith,470.80444,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,445,Umbra,200,1.54186E+12,26 58 | WES,Logged In,Cecilia,F,0,Owens,221.57016,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,444,Alane,200,1.54186E+12,6 59 | Asia 2001,Logged In,Cecilia,F,1,Owens,150.30812,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,444,Epilogue,200,1.54186E+12,6 60 | Spike Milligan,Logged In,Samuel,M,0,Gonzalez,220.39465,free,"Houston-The Woodlands-Sugar Land, 
TX",PUT,NextSong,1.54049E+12,384,Nothing At All,200,1.54186E+12,61 61 | Laura Izibor,Logged In,Anabelle,F,0,Simpson,211.56526,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,378,Carousel (PSILY Album Version),200,1.54187E+12,69 62 | ,Logged In,Lily,F,0,Burns,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1.54062E+12,426,,200,1.54187E+12,32 63 | Ryan Adams,Logged In,Braden,M,0,Parker,248.5024,free,"Youngstown-Warren-Boardman, OH-PA",PUT,NextSong,1.541E+12,246,Wonderwall,200,1.54187E+12,74 64 | ,Logged In,Adelyn,F,0,Jordan,,free,"Chicago-Naperville-Elgin, IL-IN-WI",GET,Home,1.54013E+12,391,,200,1.54187E+12,7 65 | Method Man,Logged In,Adelyn,F,1,Jordan,204.64281,free,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54013E+12,391,The Motto,200,1.54187E+12,7 66 | The Stanley Brothers,Logged In,Adelyn,F,2,Jordan,179.69587,free,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54013E+12,391,I'm A Man Of Constant Sorrow,200,1.54187E+12,7 67 | Dexter Freebish,Logged In,Adelyn,F,3,Jordan,210.54649,free,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54013E+12,391,Deeper,200,1.54187E+12,7 68 | Jamiroquai,Logged In,Jacob,M,0,Rogers,362.05669,free,"San Diego-Carlsbad, CA",PUT,NextSong,1.54098E+12,432,Talullah,200,1.54187E+12,18 69 | Michael Cera & Ellen Page,Logged In,Matthew,M,0,Jones,116.71465,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Anyone Else But You,200,1.54188E+12,36 70 | The Cat Empire,Logged In,Matthew,M,1,Jones,218.22649,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,How To Explain,200,1.54188E+12,36 71 | Bryn Terfel / Berliner Philharmoniker / Claudio Abbado,Logged In,Matthew,M,2,Jones,967.36608,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Wotan's Farewell & Magic Fire Music,200,1.54188E+12,36 72 | The Fugees,Logged In,Matthew,M,3,Jones,281.20771,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Ready Or Not,200,1.54188E+12,36 73 | Hardline,Logged 
In,Matthew,M,4,Jones,234.73587,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Everything,200,1.54188E+12,36 74 | The Funky Lowlives,Logged In,Matthew,M,5,Jones,280.34567,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Sail Into the Sun,200,1.54188E+12,36 75 | DL Incognito,Logged In,Matthew,M,6,Jones,221.07383,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Proof,200,1.54188E+12,36 76 | ,Logged In,Theodore,M,0,Smith,,free,"Houston-The Woodlands-Sugar Land, TX",GET,Home,1.54031E+12,447,,200,1.54188E+12,52 77 | Justice,Logged In,Matthew,M,7,Jones,243.40853,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,TTHHEE PPAARRTTYY,200,1.54188E+12,36 78 | Earth_ Wind & Fire,Logged In,Theodore,M,1,Smith,178.20689,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54031E+12,447,Night Dreamin',200,1.54188E+12,52 79 | Strawbs,Logged In,Matthew,M,8,Jones,255.81669,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Sheep,200,1.54188E+12,36 80 | Angus & Julia Stone,Logged In,Matthew,M,9,Jones,172.85179,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Wasted,200,1.54188E+12,36 81 | Sara Bareilles,Logged In,Matthew,M,10,Jones,260.8322,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Love Song,200,1.54188E+12,36 82 | Bruna Caram,Logged In,Matthew,M,11,Jones,198.63465,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Meus Sonhos,200,1.54188E+12,36 83 | Nando Reis,Logged In,Matthew,M,12,Jones,239.82975,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,O Segundo Sol,200,1.54188E+12,36 84 | The Black Keys,Logged In,Matthew,M,13,Jones,189.28281,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Same Old Thing,200,1.54188E+12,36 85 | Kreator,Logged In,Matthew,M,14,Jones,294.53016,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Riot Of Violence,200,1.54188E+12,36 86 | Audioslave,Logged In,Matthew,M,15,Jones,277.83791,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Show Me How To 
Live,200,1.54188E+12,36 87 | Red Hot Chili Peppers,Logged In,Matthew,M,16,Jones,269.34812,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Parallel Universe (Album Version),200,1.54188E+12,36 88 | Manu Chao,Logged In,Matthew,M,17,Jones,288.15628,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Me Quedo Contigo [Si Me Das A Elegir],200,1.54188E+12,36 89 | Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner,Logged In,Matthew,M,18,Jones,277.15873,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile),200,1.54188E+12,36 90 | Ron Carter,Logged In,Matthew,M,19,Jones,497.13587,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,I CAN'T GET STARTED,200,1.54188E+12,36 91 | ,Logged In,Theodore,M,0,Harris,,free,"Red Bluff, CA",GET,Home,1.5411E+12,440,,200,1.54188E+12,14 92 | ,Logged In,Theodore,M,1,Harris,,free,"Red Bluff, CA",GET,Home,1.5411E+12,440,,200,1.54188E+12,14 93 | Lifehouse,Logged In,Theodore,M,2,Harris,195.47383,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,440,You And Me (Wedding Version),200,1.54188E+12,14 94 | Yann Tiersen,Logged In,Kaylee,F,0,Summers,158.71955,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,333,La Valse D'Amélie (Version Piano),200,1.54189E+12,8 95 | ISRAEL & NEW BREED,Logged In,Kaylee,F,1,Summers,176.48281,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,333,Awesome Medley,200,1.54189E+12,8 96 | ,Logged In,Molly,F,0,Taylor,,free,"St. Louis, MO-IL",GET,Home,1.54099E+12,396,,200,1.54189E+12,35 97 | Stellar Kart,Logged In,Molly,F,1,Taylor,186.17424,free,"St. 
Louis, MO-IL",PUT,NextSong,1.54099E+12,396,Jesus Loves You (Album Version),200,1.54189E+12,35 98 | -------------------------------------------------------------------------------- /Data_Modeling_with_Apache_Cassandra/event_data/2018-11-11-events.csv: -------------------------------------------------------------------------------- 1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | Frumpies,Logged In,Anabelle,F,0,Simpson,134.47791,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,455,Hello Kitty,200,1.5419E+12,69 3 | Kenny G with Peabo Bryson,Logged In,Anabelle,F,1,Simpson,264.75057,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,455,By The Time This Night Is Over,200,1.5419E+12,69 4 | Biffy Clyro,Logged In,Anabelle,F,2,Simpson,189.83138,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,455,God & Satan,200,1.5419E+12,69 5 | ,Logged In,Lily,F,0,Burns,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1.54062E+12,456,,200,1.54191E+12,32 6 | HIM,Logged In,Lily,F,1,Burns,212.06159,free,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1.54062E+12,456,Beautiful,200,1.54191E+12,32 7 | Matmos,Logged In,Joseph,M,0,Gutierrez,1449.11628,free,"Columbia, SC",PUT,NextSong,1.54081E+12,284,Supreme Balloon,200,1.54191E+12,75 8 | Gary Allan,Logged In,Ryann,F,0,Smith,259.83955,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,The One,200,1.54193E+12,92 9 | Miracle Fortress,Logged In,Ryann,F,1,Smith,200.9073,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,Five Roses,200,1.54193E+12,92 10 | Don Omar,Logged In,Ryann,F,2,Smith,261.35465,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,Cuentale,200,1.54193E+12,92 11 | Jay-Z,Logged In,Ryann,F,3,Smith,212.27057,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,D'Evils,200,1.54193E+12,92 12 | Red Hot Chili Peppers,Logged In,Ryann,F,4,Smith,231.33995,free,"Palestine, 
TX",PUT,NextSong,1.54069E+12,328,Easily (Album Version),200,1.54193E+12,92 13 | ,Logged In,Chloe,F,0,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,437,,200,1.54193E+12,49 14 | Flogging Molly,Logged In,Chloe,F,1,Cuevas,361.9522,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Rebels of the Sacred Heart,200,1.54193E+12,49 15 | Reverend Horton Heat,Logged In,Chloe,F,2,Cuevas,158.64118,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Now_ Right Now,200,1.54193E+12,49 16 | Sea Wolf,Logged In,Chloe,F,3,Cuevas,232.61995,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,I Made A Resolution,200,1.54193E+12,49 17 | Jason Mraz & Colbie Caillat,Logged In,Chloe,F,4,Cuevas,189.6224,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Lucky (Album Version),200,1.54193E+12,49 18 | Jamie Lidell,Logged In,Chloe,F,5,Cuevas,175.25506,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Enoughs Enough,200,1.54193E+12,49 19 | Feist,Logged In,Chloe,F,6,Cuevas,212.79302,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Mushaboom (Postal Service Mix),200,1.54193E+12,49 20 | ,Logged In,Chloe,F,7,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",PUT,Logout,1.54094E+12,437,,307,1.54193E+12,49 21 | ,Logged Out,,,8,,,free,,GET,Home,,437,,200,1.54193E+12, 22 | ,Logged Out,,,9,,,free,,GET,Home,,437,,200,1.54193E+12, 23 | ,Logged Out,,,10,,,free,,PUT,Login,,437,,307,1.54193E+12, 24 | ,Logged In,Chloe,F,11,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,437,,200,1.54193E+12,49 25 | Sex Slaves,Logged In,Chloe,F,12,Cuevas,175.51628,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,We're Going Out Tonight,200,1.54193E+12,49 26 | ,Logged In,Chloe,F,0,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,469,,200,1.54194E+12,49 27 | Rise Against,Logged In,Chloe,F,1,Cuevas,169.482,free,"San 
Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,469,To Them These Streets Belong,200,1.54194E+12,49 28 | ,Logged In,Mohammad,M,0,Rodriguez,,free,"Sacramento--Roseville--Arden-Arcade, CA",GET,Home,1.54051E+12,441,,200,1.54194E+12,88 29 | Beyoncé,Logged In,Mohammad,M,1,Rodriguez,359.54893,free,"Sacramento--Roseville--Arden-Arcade, CA",PUT,NextSong,1.54051E+12,441,Get Me Bodied,200,1.54194E+12,88 30 | Nate Dogg,Logged In,Mohammad,M,2,Rodriguez,356.38812,free,"Sacramento--Roseville--Arden-Arcade, CA",PUT,NextSong,1.54051E+12,441,Never Leave Me Alone,200,1.54194E+12,88 31 | ,Logged In,Cierra,F,0,Finley,,free,"Richmond, VA",GET,Home,1.54101E+12,443,,200,1.54195E+12,96 32 | Taylor Swift,Logged In,Cierra,F,1,Finley,233.89995,free,"Richmond, VA",PUT,NextSong,1.54101E+12,443,Love Story,200,1.54195E+12,96 33 | Lynyrd Skynyrd,Logged In,Ryan,M,0,Smith,216.60689,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,452,Sweet home Alabama,200,1.54195E+12,26 34 | Kelis,Logged In,Cierra,F,2,Finley,293.58975,free,"Richmond, VA",PUT,NextSong,1.54101E+12,443,Caught Out There (Explicit),200,1.54195E+12,96 35 | The Kills,Logged In,Cierra,F,3,Finley,203.38893,free,"Richmond, VA",PUT,NextSong,1.54101E+12,443,Last Day Of Magic,200,1.54195E+12,96 36 | ,Logged In,Aleena,F,0,Kirby,,paid,"Waterloo-Cedar Falls, IA",GET,Home,1.54102E+12,448,,200,1.54195E+12,44 37 | Collie Buddz featuring Paul Wall,Logged In,Aleena,F,1,Kirby,271.62077,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,What A Feeling,200,1.54195E+12,44 38 | Charttraxx Karaoke,Logged In,Cierra,F,4,Finley,225.17506,free,"Richmond, VA",PUT,NextSong,1.54101E+12,443,Fireflies,200,1.54195E+12,96 39 | Band Of Horses,Logged In,Aleena,F,2,Kirby,321.14893,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,The Funeral (Album Version),200,1.54195E+12,44 40 | Coldplay,Logged In,Aleena,F,3,Kirby,307.51302,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Clocks,200,1.54195E+12,44 41 | Bon 
Jovi,Logged In,Aleena,F,4,Kirby,228.75383,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Have A Nice Day,200,1.54195E+12,44 42 | P.O.D.,Logged In,Aleena,F,5,Kirby,203.7024,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Alive (2006 Remastered Album Version),200,1.54195E+12,44 43 | Bloc Party,Logged In,Aleena,F,6,Kirby,222.04036,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Plans (Replanned by Mogwai),200,1.54195E+12,44 44 | Los Prisioneros,Logged In,Aleena,F,7,Kirby,211.12118,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Pa Pa Pa,200,1.54195E+12,44 45 | Octopus Project,Logged In,Aleena,F,8,Kirby,175.25506,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Lots More Stairs,200,1.54195E+12,44 46 | Roudoudou,Logged In,Aleena,F,9,Kirby,18.41587,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Ecoute Ce Scratch,200,1.54195E+12,44 47 | Africando,Logged In,Aleena,F,10,Kirby,253.54404,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Tierra Tradicional,200,1.54195E+12,44 48 | RUN-DMC Featuring Method Man_ Kenny Cash_ Mike Ransom_ and Jamel Simmons,Logged In,Aleena,F,11,Kirby,266.52689,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Simmons Incorporated,200,1.54195E+12,44 49 | ,Logged In,Colm,M,0,Santana,,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",GET,Home,1.54086E+12,414,,200,1.54195E+12,67 50 | Graham Coxon,Logged In,Colm,M,1,Santana,197.14567,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,I'm Goin' Away,200,1.54195E+12,67 51 | Queens Of The Stone Age,Logged In,Aleena,F,12,Kirby,231.02649,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,In The Fade,200,1.54195E+12,44 52 | Dance Gavin Dance,Logged In,Colm,M,2,Santana,193.30567,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Strawberry André (Album Version),200,1.54195E+12,67 53 | ,Logged 
In,Colm,M,3,Santana,,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",GET,Home,1.54086E+12,414,,200,1.54195E+12,67 54 | Passion Pit,Logged In,Aleena,F,13,Kirby,243.69587,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Eyes As Candles,200,1.54195E+12,44 55 | ,Logged In,Aleena,F,14,Kirby,,paid,"Waterloo-Cedar Falls, IA",GET,Home,1.54102E+12,448,,200,1.54195E+12,44 56 | Black Eyed Peas,Logged In,Colm,M,4,Santana,229.61587,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Let's Get It Started,200,1.54195E+12,67 57 | Plastic Bertrand,Logged In,Colm,M,5,Santana,180.00934,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Ca plane pour moi,200,1.54195E+12,67 58 | Cream,Logged In,Colm,M,6,Santana,166.5824,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Strange Brew,200,1.54195E+12,67 59 | Coldplay,Logged In,Colm,M,7,Santana,284.39465,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,A Message,200,1.54195E+12,67 60 | Cute Is What We Aim For,Logged In,Colm,M,8,Santana,172.22485,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Sweat the Battle Before the Battle Sweats You (Album Version),200,1.54195E+12,67 61 | Metallica,Logged In,Connar,M,0,Moreno,256.9922,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54082E+12,218,Of Wolf And Man,200,1.54195E+12,62 62 | The Kills,Logged In,Connar,M,1,Moreno,217.70404,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54082E+12,218,Tape Song,200,1.54195E+12,62 63 | Foo Fighters,Logged In,Connar,M,2,Moreno,271.38567,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54082E+12,218,The Pretender,200,1.54195E+12,62 64 | Plaid,Logged In,Connar,M,3,Moreno,260.96281,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54082E+12,218,Eyen [Chosen by fans on Warp20.net],200,1.54195E+12,62 65 | ,Logged In,Brayden,M,0,Clark,,free,"New 
York-Newark-Jersey City, NY-NJ-PA",GET,Home,1.54103E+12,120,,200,1.54195E+12,41 66 | ,Logged In,Theodore,M,0,Harris,,free,"Red Bluff, CA",GET,Home,1.5411E+12,462,,200,1.54196E+12,14 67 | The Van Pelt,Logged In,Theodore,M,1,Harris,208.71791,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,462,It's New To Me,200,1.54196E+12,14 68 | ,Logged In,Ryan,M,0,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,472,,200,1.54196E+12,26 69 | 44,Logged In,Ryan,M,1,Smith,224.57424,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,472,Make You Smile,200,1.54196E+12,26 70 | ,Logged In,Rylan,M,0,George,,free,"Birmingham-Hoover, AL",GET,Home,1.54102E+12,446,,200,1.54196E+12,16 71 | Chris Brown,Logged In,Rylan,M,1,George,275.1473,free,"Birmingham-Hoover, AL",PUT,NextSong,1.54102E+12,446,I May Never Find,200,1.54196E+12,16 72 | KT Tunstall,Logged In,Ryan,M,2,Smith,170.47465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,472,Black Horse And The Cherry Tree (Radio Version),200,1.54196E+12,26 73 | Cascada,Logged In,Rylan,M,2,George,184.39791,free,"Birmingham-Hoover, AL",PUT,NextSong,1.54102E+12,446,Kids In America,200,1.54196E+12,16 74 | Incubus,Logged In,Ryan,M,3,Smith,293.38077,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,472,Black Heart Inertia,200,1.54196E+12,26 75 | ,Logged In,Ryan,M,4,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Help,1.54102E+12,472,,200,1.54196E+12,26 76 | ,Logged In,Ryan,M,5,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,472,,200,1.54196E+12,26 77 | ,Logged In,Tegan,F,0,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.54079E+12,435,,200,1.54197E+12,80 78 | Miike Snow,Logged In,Tegan,F,1,Levine,220.83873,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Black & Blue,200,1.54197E+12,80 79 | Cartola,Logged In,Tegan,F,2,Levine,208.92689,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Sala De Recepção,200,1.54197E+12,80 80 | Kill 
The Client,Logged In,Tegan,F,3,Levine,70.68689,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Commander In Thief,200,1.54197E+12,80 81 | ,Logged In,Tegan,F,4,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.54079E+12,435,,200,1.54197E+12,80 82 | Wolfmother,Logged In,Tegan,F,5,Levine,175.82975,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Woman,200,1.54197E+12,80 83 | Old Crow Medicine Show,Logged In,Tegan,F,6,Levine,231.73179,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Wagon Wheel,200,1.54197E+12,80 84 | Architecture In Helsinki,Logged In,Tegan,F,7,Levine,173.73995,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Debbie,200,1.54197E+12,80 85 | Charlie Louvin,Logged In,Tegan,F,8,Levine,170.86649,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,I Think I'll Live,200,1.54197E+12,80 86 | Miguel Morales,Logged In,Tegan,F,9,Levine,270.78485,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,La Derrota de Un Don Juan,200,1.54197E+12,80 87 | Dominique A,Logged In,Tegan,F,10,Levine,153.20771,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Le Courage Des Oiseaux,200,1.54197E+12,80 88 | Cock Sparrer,Logged In,Tegan,F,11,Levine,203.25832,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Run With The Blind,200,1.54197E+12,80 89 | Jimmy Wakely,Logged In,Tegan,F,12,Levine,165.74649,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,I Love You So Much It Hurts,200,1.54197E+12,80 90 | Peter Doherty,Logged In,Tegan,F,13,Levine,217.02485,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,A Little Death Around the Eyes,200,1.54197E+12,80 91 | Katy Perry,Logged In,Tegan,F,14,Levine,246.41261,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Thinking Of You,200,1.54197E+12,80 92 | Sidewalk Prophets,Logged In,Molly,F,0,Taylor,260.62322,free,"St. 
Louis, MO-IL",PUT,NextSong,1.54099E+12,464,You Love Me Anyway (Album),200,1.54197E+12,35 93 | Rise Against,Logged In,Molly,F,1,Taylor,221.17832,free,"St. Louis, MO-IL",PUT,NextSong,1.54099E+12,464,Torches,200,1.54197E+12,35 94 | K'Naan,Logged In,Molly,F,2,Taylor,220.49914,free,"St. Louis, MO-IL",PUT,NextSong,1.54099E+12,464,Wavin' Flag,200,1.54197E+12,35 95 | Patrick Jumpen,Logged In,Ryan,M,0,Smith,208.87465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,480,Holiday,200,1.54198E+12,26 96 | Alicia Keys,Logged In,Ryan,M,1,Smith,216.47628,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,480,Empire State Of Mind (Part II) Broken Down,200,1.54198E+12,26 97 | -------------------------------------------------------------------------------- /Data_Modeling_with_Apache_Cassandra/event_data/2018-11-22-events.csv: -------------------------------------------------------------------------------- 1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | Dee Dee Bridgewater,Logged In,Lily,F,38,Koch,318.64118,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,La Vie En Rose,200,1.54285E+12,15 3 | Tim O'brien,Logged In,Lily,F,39,Koch,176.14322,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Think About Last Night,200,1.54285E+12,15 4 | Nirvana,Logged In,Lily,F,40,Koch,215.11791,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Love Buzz,200,1.54285E+12,15 5 | Weezer,Logged In,Lily,F,41,Koch,479.32036,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Only In Dreams,200,1.54285E+12,15 6 | Nightwish,Logged In,Lily,F,42,Koch,286.1971,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,She Is My Sin,200,1.54285E+12,15 7 | California Swag District,Logged In,Lily,F,43,Koch,239.17669,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Teach Me How To 
Dougie,200,1.54285E+12,15 8 | Miike Snow,Logged In,Lily,F,44,Koch,385.35791,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Silvia,200,1.54285E+12,15 9 | Katy Perry,Logged In,Lily,F,45,Koch,179.40853,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,I Kissed A Girl,200,1.54285E+12,15 10 | Sikth,Logged In,Lily,F,46,Koch,250.53995,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Peep Show,200,1.54285E+12,15 11 | Lily Allen,Logged In,Lily,F,47,Koch,199.88853,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Not Fair,200,1.54285E+12,15 12 | The Presidents of the United States of America,Logged In,Lily,F,48,Koch,495.77751,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Lump,200,1.54285E+12,15 13 | Wordsworth,Logged In,Lily,F,49,Koch,253.1522,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Right Now (Produced by Ayatollah),200,1.54285E+12,15 14 | Rihanna,Logged In,Lily,F,50,Koch,229.04118,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Take A Bow,200,1.54285E+12,15 15 | Tomas Bodin,Logged In,Lily,F,51,Koch,396.53832,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Back To The African Garden,200,1.54285E+12,15 16 | Black Eyed Peas,Logged In,Lily,F,52,Koch,326.86975,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,My Humps,200,1.54285E+12,15 17 | Carolina Liar,Logged In,Lily,F,53,Koch,240.45669,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Show Me What I'm Looking For (Album Version),200,1.54285E+12,15 18 | Kansas,Logged In,Lily,F,54,Koch,202.29179,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Dust in The Wind,200,1.54285E+12,15 19 | Onar,Logged In,Lily,F,55,Koch,306.6771,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Xehasmeni Melodia,200,1.54285E+12,15 20 | Live,Logged 
In,Lily,F,56,Koch,286.98077,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Lakini's Juice,200,1.54285E+12,15 21 | Abstract Rude,Logged In,Lily,F,57,Koch,196.85832,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Nuff Fire,200,1.54285E+12,15 22 | Johnny Horton,Logged In,Lily,F,58,Koch,131.81342,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Mean Mean Son Of A Gun,200,1.54285E+12,15 23 | The Men They Couldn't Hang,Logged In,Lily,F,59,Koch,251.14077,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Ironmasters,200,1.54285E+12,15 24 | Rilo Kiley,Logged In,Lily,F,60,Koch,234.03057,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,The Absence Of God (Album Version),200,1.54285E+12,15 25 | Shwayze,Logged In,Lily,F,61,Koch,201.63873,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Lost My Mind,200,1.54285E+12,15 26 | Bram Vermeulen,Logged In,Lily,F,62,Koch,251.42812,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Mamma,200,1.54285E+12,15 27 | Death Cab for Cutie,Logged In,Lily,F,63,Koch,189.3873,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,I Will Follow You into the Dark (Album Version),200,1.54285E+12,15 28 | Dwight Yoakam,Logged In,Lily,F,64,Koch,239.3073,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,You're The One,200,1.54285E+12,15 29 | Jadakiss / Ghostface Killah / Raekwon,Logged In,Lily,F,65,Koch,173.76608,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Cartel Gathering,200,1.54285E+12,15 30 | Rosana,Logged In,Lily,F,66,Koch,256.31302,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Si tu no estas,200,1.54285E+12,15 31 | ,Logged In,Kaylee,F,0,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1.54034E+12,775,,200,1.54285E+12,8 32 | ,Logged In,Kaylee,F,1,Summers,,free,"Phoenix-Mesa-Scottsdale, 
AZ",GET,Upgrade,1.54034E+12,775,,200,1.54285E+12,8 33 | ,Logged In,Kaylee,F,2,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1.54034E+12,775,,200,1.54285E+12,8 34 | The Killers,Logged In,Lily,F,67,Koch,230.39955,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,The Ballad of Michael Valentine,200,1.54285E+12,15 35 | Alliance Ethnik,Logged In,Lily,F,68,Koch,195.94404,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Sincerité Et Jalousie,200,1.54285E+12,15 36 | Enya,Logged In,Lily,F,69,Koch,289.802,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,China Roses,200,1.54285E+12,15 37 | Aya RL,Logged In,Lily,F,70,Koch,225.43628,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Jazz,200,1.54285E+12,15 38 | ,Logged In,Lily,F,71,Koch,,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,Logout,1.54105E+12,818,,307,1.54285E+12,15 39 | ,Logged Out,,,72,,,paid,,GET,Home,,818,,200,1.54285E+12, 40 | ,Logged Out,,,73,,,paid,,GET,About,,818,,200,1.54285E+12, 41 | Clor,Logged In,Ryan,M,0,Smith,227.68281,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,820,Love + Pain,200,1.54286E+12,26 42 | Alejandro Fernandez,Logged In,Ryan,M,1,Smith,262.84363,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,820,Solitario Y Solo,200,1.54286E+12,26 43 | Yonder Mountain String Band,Logged In,Ryan,M,2,Smith,152.18893,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,820,Midwest Gospel Radio,200,1.54286E+12,26 44 | K'Naan,Logged In,Ava,F,0,Robinson,220.49914,free,"New Haven-Milford, CT",PUT,NextSong,1.54093E+12,824,Wavin' Flag,200,1.54287E+12,50 45 | Cradle Of Filth,Logged In,Kate,F,0,Harrell,453.09342,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Her Ghost In The Fog,200,1.54288E+12,97 46 | Amanda Marshall,Logged In,Kate,F,1,Harrell,274.28526,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Let It Rain,200,1.54288E+12,97 47 | 
Rammstein,Logged In,Kate,F,2,Harrell,272.40444,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Sonne,200,1.54288E+12,97 48 | Cat Stevens,Logged In,Kate,F,3,Harrell,167.6273,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,If You Want To Sing Out_ Sing Out,200,1.54288E+12,97 49 | Emma Shapplin,Logged In,Kate,F,4,Harrell,267.62404,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Spente Le Stelle,200,1.54289E+12,97 50 | Modest Mouse,Logged In,Kate,F,5,Harrell,209.52771,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Float On,200,1.54289E+12,97 51 | Flaco Jimenez,Logged In,Kate,F,6,Harrell,155.81995,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,En El Cielo No Hay Cerveza (In Heaven There Is No Beer),200,1.54289E+12,97 52 | Modest Mouse,Logged In,Kate,F,7,Harrell,209.52771,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Float On,200,1.54289E+12,97 53 | Cedric Gervais feat. Second Sun,Logged In,Kate,F,8,Harrell,230.32118,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Pills (Radio Edit) (Radio Edit),200,1.54289E+12,97 54 | Sheena Easton,Logged In,Kate,F,9,Harrell,239.62077,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Strut (1993 Digital Remaster),200,1.54289E+12,97 55 | Everything But The Girl,Logged In,Kate,F,10,Harrell,218.74893,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,My Baby Don't Love Me,200,1.54289E+12,97 56 | Florence + The Machine,Logged In,Kate,F,11,Harrell,219.66322,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Dog Days Are Over (Radio Edit),200,1.54289E+12,97 57 | BoDeans,Logged In,Kate,F,12,Harrell,354.01098,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Naked (Live),200,1.54289E+12,97 58 | OneRepublic,Logged In,Kate,F,13,Harrell,208.14322,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Apologize,200,1.54289E+12,97 59 | Miley Cyrus,Logged In,Kate,F,14,Harrell,194.45506,paid,"Lansing-East 
Lansing, MI",PUT,NextSong,1.54047E+12,828,Full Circle,200,1.54289E+12,97 60 | Coldplay,Logged In,Kate,F,15,Harrell,139.12771,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Don't Panic,200,1.54289E+12,97 61 | Atreyu,Logged In,Kate,F,16,Harrell,308.37506,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,You Were The King_ Now You're Unconscious (Album Version),200,1.54289E+12,97 62 | Bruce Springsteen,Logged In,Kate,F,17,Harrell,270.54975,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Born To Run,200,1.54289E+12,97 63 | Björk,Logged In,Kate,F,18,Harrell,348.57751,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Undo,200,1.54289E+12,97 64 | Big Shug,Logged In,Kate,F,19,Harrell,140.56444,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,It Just Don't Stop,200,1.54289E+12,97 65 | The Wallflowers,Logged In,Kate,F,20,Harrell,315.24526,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Be Your Own Girl,200,1.54289E+12,97 66 | Chris Brown,Logged In,Kate,F,21,Harrell,203.80689,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Ain't No Way (You Won't Love Me),200,1.54289E+12,97 67 | Charly García,Logged In,Kate,F,22,Harrell,231.73179,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Filosofia Barata Y Zapatos De Goma,200,1.54289E+12,97 68 | N.W.A ft. 
Eazy-E,Logged In,Kate,F,23,Harrell,338.18077,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Boyz-N-The-Hood,200,1.54289E+12,97 69 | The Mighty Mighty Bosstones,Logged In,Kate,F,24,Harrell,158.87628,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,1/2/2008,200,1.54289E+12,97 70 | Beastie Boys,Logged In,Kate,F,25,Harrell,211.722,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Unite (2009 Digital Remaster),200,1.54289E+12,97 71 | Yuksek,Logged In,Kate,F,26,Harrell,218.95791,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Take A Ride,200,1.54289E+12,97 72 | Fernando Ubiergo,Logged In,Kate,F,27,Harrell,218.74893,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Cuando Agosto Era 21,200,1.54289E+12,97 73 | Phoenix,Logged In,Kate,F,28,Harrell,192.86159,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Napoleon Says,200,1.54289E+12,97 74 | Radney Foster,Logged In,Jayden,M,0,Fox,288.96608,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,790,Sweet And Wild,200,1.54289E+12,101 75 | Neneh Cherry,Logged In,Jayden,M,1,Fox,232.202,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,790,Manchild,200,1.54289E+12,101 76 | Hooligans,Logged In,Ayla,F,0,Johnson,189.98812,free,"Santa Rosa, CA",PUT,NextSong,1.54088E+12,785,Szex & KV,200,1.54289E+12,63 77 | Kid Cudi / MGMT / Ratatat,Logged In,Lily,F,0,Burns,295.67955,free,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1.54062E+12,786,Pursuit Of Happiness (nightmare),200,1.5429E+12,32 78 | Foals,Logged In,Morris,M,0,Gilmore,316.89098,free,"Raleigh, NC",PUT,NextSong,1.54097E+12,351,Blue Blood,200,1.5429E+12,23 79 | 'N Sync/Phil Collins,Logged In,Morris,M,1,Gilmore,143.64689,free,"Raleigh, NC",PUT,NextSong,1.54097E+12,351,Trashin' The Camp (Phil And 'N Sync Version),200,1.5429E+12,23 80 | Kristian Stanfill,Logged In,Jayden,M,0,Fox,287.50322,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,838,I Need You,200,1.5429E+12,101 81 | Enrique 
Iglesias,Logged In,Jayden,M,1,Fox,241.42322,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,838,Tired Of Being Sorry,200,1.5429E+12,101 82 | Michael Cretu,Logged In,Jayden,M,2,Fox,301.06077,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,838,The Invisible Man,200,1.5429E+12,101 83 | Tommy Emmanuel,Logged In,Jayden,M,3,Fox,168.14975,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,838,Windy & Warm,200,1.5429E+12,101 84 | ,Logged In,Jayden,M,4,Fox,,free,"New Orleans-Metairie, LA",PUT,Logout,1.54103E+12,838,,307,1.5429E+12,101 85 | ,Logged Out,,,5,,,free,,GET,Home,,838,,200,1.5429E+12, 86 | ,Logged Out,,,6,,,free,,PUT,Login,,838,,307,1.5429E+12, 87 | ,Logged In,Jayden,M,7,Fox,,free,"New Orleans-Metairie, LA",GET,Home,1.54103E+12,838,,200,1.5429E+12,101 88 | ,Logged In,Jordan,F,0,Rodriguez,,free,"Los Angeles-Long Beach-Anaheim, CA",GET,Home,1.54099E+12,523,,200,1.5429E+12,68 89 | Cherise,Logged In,Stefany,F,0,White,229.69424,free,"Lubbock, TX",PUT,NextSong,1.54071E+12,772,No Good 4 You,200,1.54291E+12,83 90 | ,Logged In,Ryan,M,0,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,835,,200,1.54291E+12,26 91 | Anna Waronker,Logged In,Jayden,F,0,Duffy,189.6224,free,"Seattle-Tacoma-Bellevue, WA",PUT,NextSong,1.54015E+12,662,Nothing Personal,200,1.54291E+12,76 92 | King Changó,Logged In,Cecilia,F,0,Owens,340.74077,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,763,Confesión,200,1.54292E+12,6 93 | Gang Of Four,Logged In,Cecilia,F,1,Owens,193.14893,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,763,I Found That Essence Rare,200,1.54292E+12,6 94 | Line Renaud,Logged In,Cecilia,F,2,Owens,176.16934,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,763,Le Soir,200,1.54292E+12,6 95 | ,Logged Out,,,0,,,paid,,PUT,Login,,823,,307,1.54292E+12, 96 | ,Logged In,Tegan,F,1,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.54079E+12,823,,200,1.54292E+12,80 97 | the bird and the bee,Logged 
In,Tegan,F,2,Levine,198.1122,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,823,Last Day Of Our Love,200,1.54292E+12,80 98 | ,Logged Out,,,0,,,paid,,GET,Home,,831,,200,1.54293E+12, 99 | ,Logged Out,,,1,,,paid,,GET,Home,,831,,200,1.54293E+12, 100 | -------------------------------------------------------------------------------- /Data_Modeling_with_Apache_Cassandra/event_data/2018-11-25-events.csv: -------------------------------------------------------------------------------- 1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | matchbox twenty,Logged In,Jayden,F,0,Duffy,177.65832,free,"Seattle-Tacoma-Bellevue, WA",PUT,NextSong,1.54015E+12,846,Argue (LP Version),200,1.54311E+12,76 3 | The Lonely Island / T-Pain,Logged In,Jayden,F,1,Duffy,156.23791,free,"Seattle-Tacoma-Bellevue, WA",PUT,NextSong,1.54015E+12,846,I'm On A Boat,200,1.54311E+12,76 4 | ,Logged In,Jayden,F,2,Duffy,,free,"Seattle-Tacoma-Bellevue, WA",GET,Home,1.54015E+12,846,,200,1.54311E+12,76 5 | ,Logged In,Jayden,F,3,Duffy,,free,"Seattle-Tacoma-Bellevue, WA",GET,Settings,1.54015E+12,846,,200,1.54311E+12,76 6 | ,Logged In,Jayden,F,4,Duffy,,free,"Seattle-Tacoma-Bellevue, WA",PUT,Save Settings,1.54015E+12,846,,307,1.54311E+12,76 7 | John Mayer,Logged In,Wyatt,M,0,Scott,275.27791,free,"Eureka-Arcata-Fortuna, CA",PUT,NextSong,1.54087E+12,856,All We Ever Do Is Say Goodbye,200,1.54311E+12,9 8 | ,Logged In,Wyatt,M,1,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1.54087E+12,856,,200,1.54311E+12,9 9 | 10_000 Maniacs,Logged In,Wyatt,M,2,Scott,251.8722,free,"Eureka-Arcata-Fortuna, CA",PUT,NextSong,1.54087E+12,856,Gun Shy (LP Version),200,1.54311E+12,9 10 | Leona Lewis,Logged In,Chloe,F,0,Cuevas,203.88526,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Forgive Me,200,1.54312E+12,49 11 | Nine Inch Nails,Logged In,Chloe,F,1,Cuevas,277.83791,paid,"San Francisco-Oakland-Hayward, 
CA",PUT,NextSong,1.54094E+12,916,La Mer,200,1.54312E+12,49 12 | Audioslave,Logged In,Chloe,F,2,Cuevas,334.91546,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I Am The Highway,200,1.54312E+12,49 13 | Kid Rock,Logged In,Chloe,F,3,Cuevas,296.95955,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,All Summer Long (Album Version),200,1.54312E+12,49 14 | The Jets,Logged In,Chloe,F,4,Cuevas,220.89098,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I Do You,200,1.54312E+12,49 15 | The Gerbils,Logged In,Chloe,F,5,Cuevas,27.01016,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,(iii),200,1.54312E+12,49 16 | Damian Marley / Stephen Marley / Yami Bolo,Logged In,Chloe,F,6,Cuevas,304.69179,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Still Searching,200,1.54312E+12,49 17 | ,Logged In,Chloe,F,7,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,916,,200,1.54312E+12,49 18 | The Bloody Beetroots,Logged In,Chloe,F,8,Cuevas,201.97832,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Warp 1.9 (feat. 
Steve Aoki),200,1.54312E+12,49 19 | ,Logged In,Chloe,F,9,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,916,,200,1.54313E+12,49 20 | The Specials,Logged In,Chloe,F,10,Cuevas,188.81261,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Rat Race,200,1.54313E+12,49 21 | The Lively Ones,Logged In,Chloe,F,11,Cuevas,142.52363,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Walkin' The Board (LP Version),200,1.54313E+12,49 22 | Katie Melua,Logged In,Chloe,F,12,Cuevas,252.78649,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Blues In The Night,200,1.54313E+12,49 23 | Jason Mraz,Logged In,Chloe,F,13,Cuevas,243.48689,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I'm Yours (Album Version),200,1.54313E+12,49 24 | Fisher,Logged In,Chloe,F,14,Cuevas,133.98159,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Rianna,200,1.54313E+12,49 25 | Zee Avi,Logged In,Chloe,F,15,Cuevas,160.62649,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,No Christmas For Me,200,1.54313E+12,49 26 | Black Eyed Peas,Logged In,Chloe,F,16,Cuevas,289.12281,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I Gotta Feeling,200,1.54313E+12,49 27 | Emiliana Torrini,Logged In,Chloe,F,17,Cuevas,184.29342,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Sunny Road,200,1.54313E+12,49 28 | ,Logged In,Chloe,F,18,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,916,,200,1.54313E+12,49 29 | Days Of The New,Logged In,Chloe,F,19,Cuevas,258.5073,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,The Down Town,200,1.54313E+12,49 30 | Julio Iglesias duet with Willie Nelson,Logged In,Chloe,F,20,Cuevas,212.16608,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,To All The Girls I've Loved Before (With Julio Iglesias),200,1.54313E+12,49 31 | ,Logged 
In,Jacqueline,F,0,Lynch,,paid,"Atlanta-Sandy Springs-Roswell, GA",GET,Home,1.54022E+12,914,,200,1.54313E+12,29 32 | Jason Mraz & Colbie Caillat,Logged In,Chloe,F,0,Roth,189.6224,free,"Indianapolis-Carmel-Anderson, IN",PUT,NextSong,1.5407E+12,704,Lucky (Album Version),200,1.54314E+12,78 33 | ,Logged In,Anabelle,F,0,Simpson,,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",GET,Home,1.54104E+12,901,,200,1.54315E+12,69 34 | R. Kelly,Logged In,Anabelle,F,1,Simpson,234.39628,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,901,The World's Greatest,200,1.54315E+12,69 35 | ,Logged In,Kynnedi,F,0,Sanchez,,free,"Cedar Rapids, IA",GET,Home,1.54108E+12,804,,200,1.54315E+12,89 36 | Jacky Terrasson,Logged In,Marina,F,0,Sutton,342.7522,free,"Salinas, CA",PUT,NextSong,1.54106E+12,373,Le Jardin d'Hiver,200,1.54315E+12,48 37 | Papa Roach,Logged In,Theodore,M,0,Harris,202.1873,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,813,Alive,200,1.54316E+12,14 38 | Burt Bacharach,Logged In,Theodore,M,1,Harris,156.96934,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,813,Casino Royale Theme (Main Title),200,1.54316E+12,14 39 | ,Logged In,Chloe,F,0,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,923,,200,1.54316E+12,49 40 | Floetry,Logged In,Chloe,F,1,Cuevas,254.48444,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,Sunshine,200,1.54316E+12,49 41 | The Rakes,Logged In,Chloe,F,2,Cuevas,225.2273,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,Leave The City And Come Home,200,1.54316E+12,49 42 | Dwight Yoakam,Logged In,Chloe,F,3,Cuevas,239.3073,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,You're The One,200,1.54316E+12,49 43 | Ween,Logged In,Chloe,F,4,Cuevas,228.10077,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,Voodoo Lady,200,1.54316E+12,49 44 | Café Quijano,Logged In,Chloe,F,5,Cuevas,197.32853,paid,"San Francisco-Oakland-Hayward, 
CA",PUT,NextSong,1.54094E+12,923,La Lola,200,1.54316E+12,49 45 | ,Logged In,Chloe,F,0,Roth,,free,"Indianapolis-Carmel-Anderson, IN",GET,Home,1.5407E+12,925,,200,1.54317E+12,78 46 | Parov Stelar,Logged In,Chloe,F,1,Roth,203.65016,free,"Indianapolis-Carmel-Anderson, IN",PUT,NextSong,1.5407E+12,925,Good Bye Emily (feat. Gabriella Hanninen),200,1.54317E+12,78 47 | ,Logged In,Chloe,F,2,Roth,,free,"Indianapolis-Carmel-Anderson, IN",GET,Home,1.5407E+12,925,,200,1.54317E+12,78 48 | ,Logged In,Tegan,F,0,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.54079E+12,915,,200,1.54317E+12,80 49 | Bryan Adams,Logged In,Tegan,F,1,Levine,166.29506,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,I Will Always Return,200,1.54317E+12,80 50 | KT Tunstall,Logged In,Tegan,F,2,Levine,192.31302,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,White Bird,200,1.54317E+12,80 51 | Technicolour,Logged In,Tegan,F,3,Levine,235.12771,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,Turn Away,200,1.54317E+12,80 52 | The Dears,Logged In,Tegan,F,4,Levine,289.95873,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,Lost In The Plot,200,1.54317E+12,80 53 | Go West,Logged In,Tegan,F,5,Levine,259.49995,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,Never Let Them See You Sweat,200,1.54317E+12,80 54 | ,Logged In,Tegan,F,6,Levine,,paid,"Portland-South Portland, ME",PUT,Logout,1.54079E+12,915,,307,1.54317E+12,80 55 | ,Logged In,Sylvie,F,0,Cruz,,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",GET,Home,1.54027E+12,912,,200,1.54317E+12,10 56 | ,Logged Out,,,7,,,paid,,GET,Home,,915,,200,1.54317E+12, 57 | Gondwana,Logged In,Jordan,F,0,Hicks,262.5824,free,"Salinas, CA",PUT,NextSong,1.54001E+12,814,Mi Princesa,200,1.54319E+12,37 58 | ,Logged In,Kevin,M,0,Arellano,,free,"Harrisburg-Carlisle, PA",GET,Home,1.54001E+12,855,,200,1.54319E+12,66 59 | Ella Fitzgerald,Logged In,Jordan,F,1,Hicks,427.15383,free,"Salinas, 
CA",PUT,NextSong,1.54001E+12,814,On Green Dolphin Street (Medley) (1999 Digital Remaster),200,1.54319E+12,37 60 | Creedence Clearwater Revival,Logged In,Jordan,F,2,Hicks,184.73751,free,"Salinas, CA",PUT,NextSong,1.54001E+12,814,Run Through The Jungle,200,1.54319E+12,37 61 | -------------------------------------------------------------------------------- /Data_Modeling_with_Apache_Cassandra/images/image_event_datafile_new.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/san089/Udacity-Data-Engineering-Projects/c26bd834294cda2f4f2dfbf68c4c326635c55a93/Data_Modeling_with_Apache_Cassandra/images/image_event_datafile_new.jpg -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Data Modeling with Postgres 3 | 4 | ## **Overview** 5 | In this project, we apply Data Modeling with Postgres and build an ETL pipeline using Python. A startup wants to analyze the data they've been collecting on songs and user activity on their new music streaming app. Currently, they are collecting data in json format and the analytics team is particularly interested in understanding what songs users are listening to. 6 | 7 | 8 | ## **Song Dataset** 9 | Songs dataset is a subset of [Million Song Dataset](http://millionsongdataset.com/). 10 | 11 | Sample Record : 12 | ``` 13 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0} 14 | ``` 15 | 16 | ## **Log Dataset** 17 | Logs dataset is generated by [Event Simulator](https://github.com/Interana/eventsim). 
18 | 19 | Sample Record : 20 | ``` 21 | {"artist": null, "auth": "Logged In", "firstName": "Walter", "gender": "M", "itemInSession": 0, "lastName": "Frye", "length": null, "level": "free", "location": "San Francisco-Oakland-Hayward, CA", "method": "GET","page": "Home", "registration": 1540919166796.0, "sessionId": 38, "song": null, "status": 200, "ts": 1541105830796, "userAgent": "\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/36.0.1985.143 Safari\/537.36\"", "userId": "39"} 22 | ``` 23 | 24 | 25 | ## Schema 26 | 27 | #### Fact Table 28 | **songplays** - records in log data associated with song plays i.e. records with page `NextSong` 29 | 30 | ``` 31 | songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent 32 | ``` 33 | 34 | #### Dimension Tables 35 | **users** - users in the app 36 | ``` 37 | user_id, first_name, last_name, gender, level 38 | ``` 39 | **songs** - songs in music database 40 | ``` 41 | song_id, title, artist_id, year, duration 42 | ``` 43 | **artists** - artists in music database 44 | ``` 45 | artist_id, name, location, latitude, longitude 46 | ``` 47 | **time** - timestamps of records in **songplays** broken down into specific units 48 | ``` 49 | start_time, hour, day, week, month, year, weekday 50 | ``` 51 | 52 | ## Project Files 53 | 54 | ```sql_queries.py``` -> contains sql queries for dropping and creating fact and dimension tables. Also, contains insertion query template. 55 | 56 | ```create_tables.py``` -> contains code for setting up database. Running this file creates **sparkifydb** and also creates the fact and dimension tables. 57 | 58 | ```etl.ipynb``` -> a jupyter notebook to analyse dataset before loading. 59 | 60 | ```etl.py``` -> read and process **song_data** and **log_data** 61 | 62 | ```test.ipynb``` -> a notebook to connect to postgres db and validate the data loaded. 
63 | 64 | ## Environment 65 | Python 3.6 or above 66 | 67 | PostgreSQL 9.5 or above 68 | 69 | psycopg2 - PostgreSQL database adapter for Python 70 | 71 | 72 | ## How to run 73 | 74 | Run the driver program ```main.py``` as below. 75 | ``` 76 | python main.py 77 | ``` 78 | 79 | The ```create_tables.py``` and ```etl.py``` files can also be run independently as below: 80 | ``` 81 | python create_tables.py 82 | python etl.py 83 | ``` 84 | 85 | 86 | #### Reference: 87 | [Psycopg](http://initd.org/psycopg/docs/) 88 | 89 | [PostgreSQL Documentation](https://www.postgresql.org/docs/) 90 | 91 | [Pandas Documentation](https://pandas.pydata.org/pandas-docs/stable/) 92 | 93 | -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/__pycache__/create_tables.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/san089/Udacity-Data-Engineering-Projects/c26bd834294cda2f4f2dfbf68c4c326635c55a93/Data_Modeling_with_Postgres/__pycache__/create_tables.cpython-37.pyc -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/__pycache__/etl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/san089/Udacity-Data-Engineering-Projects/c26bd834294cda2f4f2dfbf68c4c326635c55a93/Data_Modeling_with_Postgres/__pycache__/etl.cpython-37.pyc -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/__pycache__/sql_queries.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/san089/Udacity-Data-Engineering-Projects/c26bd834294cda2f4f2dfbf68c4c326635c55a93/Data_Modeling_with_Postgres/__pycache__/sql_queries.cpython-37.pyc --------------------------------------------------------------------------------
import psycopg2
from sql_queries import create_table_queries, drop_table_queries

# Connection strings are defined once so the credentials are not repeated at
# every call site.
# NOTE(review): credentials are hard-coded, as in the original script;
# consider reading them from the environment or a config file.
DEFAULT_DSN = "host=127.0.0.1 dbname=studentdb user=postgres password=admin"
SPARKIFY_DSN = "host=127.0.0.1 dbname=sparkifydb user=postgres password=admin"


def create_database():
    """
    Drops and recreates the sparkifydb database, then connects to it.

    A connection to the default database is opened first, sparkifydb is
    dropped (if it exists) and created again with UTF8 encoding, and that
    first connection is closed. A new connection to sparkifydb is then
    opened and returned together with a cursor on it.

    :return: (cur, conn) a cursor and connection reference to sparkifydb
    """
    # connect to default database
    conn = psycopg2.connect(DEFAULT_DSN)
    try:
        # CREATE/DROP DATABASE cannot run inside a transaction block, so the
        # session has to be in autocommit mode.
        conn.set_session(autocommit=True)
        cur = conn.cursor()

        # create sparkify database with UTF8 encoding
        cur.execute("DROP DATABASE IF EXISTS sparkifydb")
        cur.execute("CREATE DATABASE sparkifydb WITH ENCODING 'utf8' TEMPLATE template0")
    finally:
        # close connection to default database, even when a statement failed
        conn.close()

    # connect to sparkify database
    conn = psycopg2.connect(SPARKIFY_DSN)
    cur = conn.cursor()

    return cur, conn


def _run_queries(cur, conn, queries):
    """
    Executes every query in order, committing after each one.

    :param cur: cursor to the database
    :param conn: database connection reference
    :param queries: iterable of SQL statements to execute
    """
    for query in queries:
        cur.execute(query)
        conn.commit()


def drop_tables(cur, conn):
    """
    Runs all the drop table queries defined in sql_queries.py

    :param cur: cursor to the database
    :param conn: database connection reference
    """
    _run_queries(cur, conn, drop_table_queries)


def create_tables(cur, conn):
    """
    Runs all the create table queries defined in sql_queries.py

    :param cur: cursor to the database
    :param conn: database connection reference
    """
    _run_queries(cur, conn, create_table_queries)


def main():
    """
    Driver main function.

    Recreates the sparkifydb database, drops the existing tables and creates
    them again. The connection is closed even when one of the steps fails.
    """
    cur, conn = create_database()
    try:
        drop_tables(cur, conn)
        print("Table dropped successfully!!")

        create_tables(cur, conn)
        print("Table created successfully!!")
    finally:
        conn.close()


if __name__ == "__main__":
    main()
In","firstName":"Kaylee","gender":"F","itemInSession":2,"lastName":"Summers","length":null,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"GET","page":"Upgrade","registration":1540344794796.0,"sessionId":139,"song":null,"status":200,"ts":1541106132796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 5 | {"artist":"Mr Oizo","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":3,"lastName":"Summers","length":144.03873,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Flat 55","status":200,"ts":1541106352796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 6 | {"artist":"Tamba Trio","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":4,"lastName":"Summers","length":177.18812,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Quem Quiser Encontrar O Amor","status":200,"ts":1541106496796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 7 | {"artist":"The Mars Volta","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":5,"lastName":"Summers","length":380.42077,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Eriatarka","status":200,"ts":1541106673796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 8 | {"artist":"Infected Mushroom","auth":"Logged 
In","firstName":"Kaylee","gender":"F","itemInSession":6,"lastName":"Summers","length":440.2673,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Becoming Insane","status":200,"ts":1541107053796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 9 | {"artist":"Blue October \/ Imogen Heap","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":7,"lastName":"Summers","length":241.3971,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Congratulations","status":200,"ts":1541107493796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 10 | {"artist":"Girl Talk","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":8,"lastName":"Summers","length":160.15628,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Once again","status":200,"ts":1541107734796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 11 | {"artist":"Black Eyed Peas","auth":"Logged In","firstName":"Sylvie","gender":"F","itemInSession":0,"lastName":"Cruz","length":214.93506,"level":"free","location":"Washington-Arlington-Alexandria, DC-VA-MD-WV","method":"PUT","page":"NextSong","registration":1540266185796.0,"sessionId":9,"song":"Pump It","status":200,"ts":1541108520796,"userAgent":"\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit\/537.77.4 (KHTML, like Gecko) Version\/7.0.5 Safari\/537.77.4\"","userId":"10"} 12 | {"artist":null,"auth":"Logged 
In","firstName":"Ryan","gender":"M","itemInSession":0,"lastName":"Smith","length":null,"level":"free","location":"San Jose-Sunnyvale-Santa Clara, CA","method":"GET","page":"Home","registration":1541016707796.0,"sessionId":169,"song":null,"status":200,"ts":1541109015796,"userAgent":"\"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Ubuntu Chromium\/36.0.1985.125 Chrome\/36.0.1985.125 Safari\/537.36\"","userId":"26"} 13 | {"artist":"Fall Out Boy","auth":"Logged In","firstName":"Ryan","gender":"M","itemInSession":1,"lastName":"Smith","length":200.72444,"level":"free","location":"San Jose-Sunnyvale-Santa Clara, CA","method":"PUT","page":"NextSong","registration":1541016707796.0,"sessionId":169,"song":"Nobody Puts Baby In The Corner","status":200,"ts":1541109125796,"userAgent":"\"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Ubuntu Chromium\/36.0.1985.125 Chrome\/36.0.1985.125 Safari\/537.36\"","userId":"26"} 14 | {"artist":"M.I.A.","auth":"Logged In","firstName":"Ryan","gender":"M","itemInSession":2,"lastName":"Smith","length":233.7171,"level":"free","location":"San Jose-Sunnyvale-Santa Clara, CA","method":"PUT","page":"NextSong","registration":1541016707796.0,"sessionId":169,"song":"Mango Pickle Down River (With The Wilcannia Mob)","status":200,"ts":1541109325796,"userAgent":"\"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Ubuntu Chromium\/36.0.1985.125 Chrome\/36.0.1985.125 Safari\/537.36\"","userId":"26"} 15 | {"artist":"Survivor","auth":"Logged In","firstName":"Jayden","gender":"M","itemInSession":0,"lastName":"Fox","length":245.36771,"level":"free","location":"New Orleans-Metairie, LA","method":"PUT","page":"NextSong","registration":1541033612796.0,"sessionId":100,"song":"Eye Of The Tiger","status":200,"ts":1541110994796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.3; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/36.0.1985.143 Safari\/537.36\"","userId":"101"} 
-------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/A/TRAAAAW128F429D538.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD7TVE1187B99BFB1", "artist_latitude": null, "artist_longitude": null, "artist_location": "California - LA", "artist_name": "Casual", "song_id": "SOMZWCG12A8C13C480", "title": "I Didn't Mean To", "duration": 218.93179, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/A/TRAAABD128F429CF47.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMJAGH1187FB546F3", "artist_latitude": 35.14968, "artist_longitude": -90.04892, "artist_location": "Memphis, TN", "artist_name": "The Box Tops", "song_id": "SOCIWDW12A8C13D406", "title": "Soul Deep", "duration": 148.03546, "year": 1969} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/A/TRAAADZ128F9348C2E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKRRTF1187B9984DA", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Sonora Santanera", "song_id": "SOXVLOJ12AB0189215", "title": "Amor De Cabaret", "duration": 177.47546, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/A/TRAAAEF128F4273421.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7G5I41187FB4CE6C", "artist_latitude": null, "artist_longitude": null, "artist_location": "London, England", "artist_name": "Adam Ant", "song_id": "SONHOTT12A8C13493C", "title": "Something Girls", "duration": 
233.40363, "year": 1982} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/A/TRAAAFD128F92F423A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARXR32B1187FB57099", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Gob", "song_id": "SOFSOCN12A8C143F5D", "title": "Face the Ashes", "duration": 209.60608, "year": 2007} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/A/TRAAAMO128F1481E7F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKFYS91187B98E58F", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Jeff And Sheri Easter", "song_id": "SOYMRWW12A6D4FAB14", "title": "The Moon And I (Ordinary Day Album Version)", "duration": 267.7024, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/A/TRAAAMQ128F1460CD3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD0S291187B9B7BF5", "artist_latitude": null, "artist_longitude": null, "artist_location": "Ohio", "artist_name": "Rated R", "song_id": "SOMJBYD12A6D4F8557", "title": "Keepin It Real (Skit)", "duration": 114.78159, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/A/TRAAAPK128E0786D96.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR10USD1187B99F3F1", "artist_latitude": null, "artist_longitude": null, "artist_location": "Burlington, Ontario, Canada", "artist_name": "Tweeterfriendly Music", "song_id": 
"SOHKNRJ12A6701D1F8", "title": "Drop of Rain", "duration": 189.57016, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/A/TRAAARJ128F9320760.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR8ZCNI1187B9A069B", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Planet P Project", "song_id": "SOIAZJW12AB01853F1", "title": "Pink World", "duration": 269.81832, "year": 1984} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/A/TRAAAVG12903CFA543.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNTLGG11E2835DDB9", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Clp", "song_id": "SOUDSGM12AC9618304", "title": "Insatiable (Instrumental Version)", "duration": 266.39628, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/B/TRAABCL128F4286650.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARC43071187B990240", "artist_latitude": null, "artist_longitude": null, "artist_location": "Wisner, LA", "artist_name": "Wayne Watson", "song_id": "SOKEJEJ12A8C13E0D0", "title": "The Urgency (LP Version)", "duration": 245.21098, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/B/TRAABDL12903CAABBA.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARL7K851187B99ACD2", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Andy Andy", "song_id": 
"SOMUYGI12AB0188633", "title": "La Culpa", "duration": 226.35057, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/B/TRAABJL12903CDCF1A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARHHO3O1187B989413", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Bob Azzam", "song_id": "SORAMLE12AB017C8B0", "title": "Auguri Cha Cha", "duration": 191.84281, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/B/TRAABJV128F1460C49.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARIK43K1187B9AE54C", "artist_latitude": null, "artist_longitude": null, "artist_location": "Beverly Hills, CA", "artist_name": "Lionel Richie", "song_id": "SOBONFF12A6D4F84D8", "title": "Tonight Will Be Alright", "duration": 307.3824, "year": 1986} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/B/TRAABLR128F423B7E3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD842G1187B997376", "artist_latitude": 43.64856, "artist_longitude": -79.38533, "artist_location": "Toronto, Ontario, Canada", "artist_name": "Blue Rodeo", "song_id": "SOHUOAP12A8AE488E9", "title": "Floating", "duration": 491.12771, "year": 1987} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/B/TRAABNV128F425CEE1.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARIG6O41187B988BDD", "artist_latitude": 37.16793, "artist_longitude": -95.84502, "artist_location": "United States", 
"artist_name": "Richard Souther", "song_id": "SOUQQEA12A8C134B1B", "title": "High Tide", "duration": 228.5971, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/B/TRAABRB128F9306DD5.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR1ZHYZ1187FB3C717", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Faiz Ali Faiz", "song_id": "SOILPQQ12AB017E82A", "title": "Sohna Nee Sohna Data", "duration": 599.24853, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/B/TRAABVM128F92CA9DC.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARYKCQI1187FB3B18F", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Tesla", "song_id": "SOXLBJT12A8C140925", "title": "Caught In A Dream", "duration": 290.29832, "year": 2004} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/B/TRAABXG128F9318EBD.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNPAGP1241B9C7FD4", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "lextrical", "song_id": "SOZVMJI12AB01808AF", "title": "Synthetic Dream", "duration": 165.69424, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/B/TRAABYN12903CFD305.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARQGYP71187FB44566", "artist_latitude": 34.31109, "artist_longitude": -94.02978, "artist_location": "Mineola, AR", 
"artist_name": "Jimmy Wakely", "song_id": "SOWTBJW12AC468AC6E", "title": "Broken-Down Merry-Go-Round", "duration": 151.84934, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/C/TRAACCG128F92E8A55.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR5KOSW1187FB35FF4", "artist_latitude": 49.80388, "artist_longitude": 15.47491, "artist_location": "Dubai UAE", "artist_name": "Elena", "song_id": "SOZCTXZ12AB0182364", "title": "Setanta matins", "duration": 269.58322, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/C/TRAACER128F4290F96.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMAC4T1187FB3FA4C", "artist_latitude": 40.82624, "artist_longitude": -74.47995, "artist_location": "Morris Plains, NJ", "artist_name": "The Dillinger Escape Plan", "song_id": "SOBBUGU12A8C13E95D", "title": "Setting Fire to Sleeping Giants", "duration": 207.77751, "year": 2004} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/C/TRAACFV128F935E50B.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR47JEX1187B995D81", "artist_latitude": 37.83721, "artist_longitude": -94.35868, "artist_location": "Nevada, MO", "artist_name": "SUE THOMPSON", "song_id": "SOBLGCN12AB0183212", "title": "James (Hold The Ladder Steady)", "duration": 124.86485, "year": 1985} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/C/TRAACHN128F1489601.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": 
"ARGIWFO1187B9B55B7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Five Bolt Main", "song_id": "SOPSWQW12A6D4F8781", "title": "Made Like This (Live)", "duration": 225.09669, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/C/TRAACIW12903CC0F6D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNTLGG11E2835DDB9", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Clp", "song_id": "SOZQDIU12A58A7BCF6", "title": "Superconfidential", "duration": 338.31138, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/C/TRAACLV128F427E123.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARDNS031187B9924F0", "artist_latitude": 32.67828, "artist_longitude": -83.22295, "artist_location": "Georgia", "artist_name": "Tim Wilson", "song_id": "SONYPOM12A8C13B2D7", "title": "I Think My Wife Is Running Around On Me (Taco Hell)", "duration": 186.48771, "year": 2005} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/C/TRAACNS128F14A2DF5.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AROUOZZ1187B9ABE51", "artist_latitude": 40.79195, "artist_longitude": -73.94512, "artist_location": "New York, NY [Spanish Harlem]", "artist_name": "Willie Bobo", "song_id": "SOBZBAZ12A6D4F8742", "title": "Spanish Grease", "duration": 168.25424, "year": 1997} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/C/TRAACOW128F933E35F.json: 
-------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARH4Z031187B9A71F2", "artist_latitude": 40.73197, "artist_longitude": -74.17418, "artist_location": "Newark, NJ", "artist_name": "Faye Adams", "song_id": "SOVYKGO12AB0187199", "title": "Crazy Mixed Up World", "duration": 156.39465, "year": 1961} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/C/TRAACPE128F421C1B9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARB29H41187B98F0EF", "artist_latitude": 41.88415, "artist_longitude": -87.63241, "artist_location": "Chicago", "artist_name": "Terry Callier", "song_id": "SOGNCJP12A58A80271", "title": "Do You Finally Need A Friend", "duration": 342.56934, "year": 1972} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/A/C/TRAACQT128F9331780.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR1Y2PT1187FB5B9CE", "artist_latitude": 27.94017, "artist_longitude": -82.32547, "artist_location": "Brandon", "artist_name": "John Wesley", "song_id": "SOLLHMX12AB01846DC", "title": "The Emperor Falls", "duration": 484.62322, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/A/TRABACN128F425B784.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD7TVE1187B99BFB1", "artist_latitude": null, "artist_longitude": null, "artist_location": "California - LA", "artist_name": "Casual", "song_id": "SOQLGFP12A58A7800E", "title": "OAKtown", "duration": 259.44771, "year": 0} -------------------------------------------------------------------------------- 
/Data_Modeling_with_Postgres/data/song_data/A/B/A/TRABAFJ128F42AF24E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR3JMC51187B9AE49D", "artist_latitude": 28.53823, "artist_longitude": -81.37739, "artist_location": "Orlando, FL", "artist_name": "Backstreet Boys", "song_id": "SOPVXLX12A8C1402D5", "title": "Larger Than Life", "duration": 236.25098, "year": 1999} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/A/TRABAFP128F931E9A1.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARPBNLO1187FB3D52F", "artist_latitude": 40.71455, "artist_longitude": -74.00712, "artist_location": "New York, NY", "artist_name": "Tiny Tim", "song_id": "SOAOIBZ12AB01815BE", "title": "I Hold Your Hand In Mine [Live At Royal Albert Hall]", "duration": 43.36281, "year": 2000} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/A/TRABAIO128F42938F9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR9AWNF1187B9AB0B4", "artist_latitude": null, "artist_longitude": null, "artist_location": "Seattle, Washington USA", "artist_name": "Kenny G featuring Daryl Hall", "song_id": "SOZHPGD12A8C1394FE", "title": "Baby Come To Me", "duration": 236.93016, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/A/TRABATO128F42627E9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AROGWRA122988FEE45", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Christos Dantis", "song_id": "SOSLAVG12A8C13397F", "title": "Den Pai Alo", "duration": 
243.82649, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/A/TRABAVQ12903CBF7E0.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMBR4Y1187B9990EB", "artist_latitude": 37.77916, "artist_longitude": -122.42005, "artist_location": "California - SF", "artist_name": "David Martin", "song_id": "SOTTDKS12AB018D69B", "title": "It Wont Be Christmas", "duration": 241.47546, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/A/TRABAWW128F4250A31.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARQ9BO41187FB5CF1F", "artist_latitude": 40.99471, "artist_longitude": -77.60454, "artist_location": "Pennsylvania", "artist_name": "John Davis", "song_id": "SOMVWWT12A58A7AE05", "title": "Knocked Out Of The Park", "duration": 183.17016, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/A/TRABAXL128F424FC50.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKULSX1187FB45F84", "artist_latitude": 39.49974, "artist_longitude": -111.54732, "artist_location": "Utah", "artist_name": "Trafik", "song_id": "SOQVMXR12A81C21483", "title": "Salt In NYC", "duration": 424.12363, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/A/TRABAXR128F426515F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARI2JSK1187FB496EF", "artist_latitude": 51.50632, "artist_longitude": -0.12714, "artist_location": "London, England", "artist_name": "Nick Ingman;Gavyn Wright", 
"song_id": "SODUJBS12A8C132150", "title": "Wessex Loses a Bride", "duration": 111.62077, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/A/TRABAXV128F92F6AE3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREDBBQ1187B98AFF5", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Eddie Calvert", "song_id": "SOBBXLX12A58A79DDA", "title": "Erica (2005 Digital Remaster)", "duration": 138.63138, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/B/TRABBAM128F429D223.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARBGXIG122988F409D", "artist_latitude": 37.77916, "artist_longitude": -122.42005, "artist_location": "California - SF", "artist_name": "Steel Rain", "song_id": "SOOJPRH12A8C141995", "title": "Loaded Like A Gun", "duration": 173.19138, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/B/TRABBBV128F42967D7.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7SMBG1187B9B9066", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Los Manolos", "song_id": "SOBCOSW12A8C13D398", "title": "Rumba De Barcelona", "duration": 218.38322, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/B/TRABBJE12903CDB442.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGCY1Y1187B9A4FA5", "artist_latitude": 36.16778, "artist_longitude": -86.77836, "artist_location": 
"Nashville, TN.", "artist_name": "Gloriana", "song_id": "SOQOTLQ12AB01868D0", "title": "Clementina Santaf\u00e8", "duration": 153.33832, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/B/TRABBKX128F4285205.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR36F9J1187FB406F1", "artist_latitude": 56.27609, "artist_longitude": 9.51695, "artist_location": "Denmark", "artist_name": "Bombay Rockers", "song_id": "SOBKWDJ12A8C13B2F3", "title": "Wild Rose (Back 2 Basics Mix)", "duration": 230.71302, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/B/TRABBLU128F93349CF.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNNKDK1187B98BBD5", "artist_latitude": 45.80726, "artist_longitude": 15.9676, "artist_location": "Zagreb Croatia", "artist_name": "Jinx", "song_id": "SOFNOQK12AB01840FC", "title": "Kutt Free (DJ Volume Remix)", "duration": 407.37914, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/B/TRABBNP128F932546F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR62SOJ1187FB47BB5", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Chase & Status", "song_id": "SOGVQGJ12AB017F169", "title": "Ten Tonne", "duration": 337.68444, "year": 2005} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/B/TRABBOP128F931B50D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARBEBBY1187B9B43DB", "artist_latitude": null, 
"artist_longitude": null, "artist_location": "Gainesville, FL", "artist_name": "Tom Petty", "song_id": "SOFFKZS12AB017F194", "title": "A Higher Place (Album Version)", "duration": 236.17261, "year": 1994} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/B/TRABBOR128F4286200.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARDR4AC1187FB371A1", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Montserrat Caball\u00e9;Placido Domingo;Vicente Sardinero;Judith Blegen;Sherrill Milnes;Georg Solti", "song_id": "SOBAYLL12A8C138AF9", "title": "Sono andati? Fingevo di dormire", "duration": 511.16363, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/B/TRABBTA128F933D304.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARAGB2O1187FB3A161", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Pucho & His Latin Soul Brothers", "song_id": "SOLEYHO12AB0188A85", "title": "Got My Mojo Workin", "duration": 338.23302, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/B/TRABBVJ128F92F7EAA.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREDL271187FB40F44", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Soul Mekanik", "song_id": "SOPEGZN12AB0181B3D", "title": "Get Your Head Stuck On Your Neck", "duration": 45.66159, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/C/TRABCAJ12903CDFCC2.json: 
-------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARULZCI1241B9C8611", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Luna Orbit Project", "song_id": "SOSWKAV12AB018FC91", "title": "Midnight Star", "duration": 335.51628, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/C/TRABCEC128F426456E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR0IAWL1187B9A96D0", "artist_latitude": 8.4177, "artist_longitude": -80.11278, "artist_location": "Panama", "artist_name": "Danilo Perez", "song_id": "SONSKXP12A8C13A2C9", "title": "Native Soul", "duration": 197.19791, "year": 2003} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/C/TRABCEI128F424C983.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/C/TRABCFL128F149BB0D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARLTWXK1187FB5A3F8", "artist_latitude": 32.74863, "artist_longitude": -97.32925, "artist_location": "Fort Worth, TX", "artist_name": "King Curtis", "song_id": "SODREIN12A58A7F2E5", "title": "A Whiter Shade Of Pale (Live @ Fillmore West)", "duration": 326.00771, "year": 0} -------------------------------------------------------------------------------- 
/Data_Modeling_with_Postgres/data/song_data/A/B/C/TRABCIX128F4265903.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNF6401187FB57032", "artist_latitude": 40.79086, "artist_longitude": -73.96644, "artist_location": "New York, NY [Manhattan]", "artist_name": "Sophie B. Hawkins", "song_id": "SONWXQJ12A8C134D94", "title": "The Ballad Of Sleeping Beauty", "duration": 305.162, "year": 1994} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/C/TRABCKL128F423A778.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARPFHN61187FB575F6", "artist_latitude": 41.88415, "artist_longitude": -87.63241, "artist_location": "Chicago, IL", "artist_name": "Lupe Fiasco", "song_id": "SOWQTQZ12A58A7B63E", "title": "Streets On Fire (Explicit Album Version)", "duration": 279.97995, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/C/TRABCPZ128F4275C32.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR051KA1187B98B2FF", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Wilks", "song_id": "SOLYIBD12A8C135045", "title": "Music is what we love", "duration": 261.51138, "year": 0} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/C/TRABCRU128F423F449.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR8IEZO1187B99055E", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Marc Shaiman", "song_id": "SOINLJW12A8C13314C", "title": "City Slickers", "duration": 149.86404, "year": 2008} 
-------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/C/TRABCTK128F934B224.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR558FS1187FB45658", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "40 Grit", "song_id": "SOGDBUF12A8C140FAA", "title": "Intro", "duration": 75.67628, "year": 2003} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/data/song_data/A/B/C/TRABCUQ128E0783E2B.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARVBRGZ1187FB4675A", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Gwen Stefani", "song_id": "SORRZGD12A6310DBC3", "title": "Harajuku Girls", "duration": 290.55955, "year": 2004} -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/etl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import psycopg2 4 | import pandas as pd 5 | from sql_queries import * 6 | 7 | 8 | def process_song_file(cur, filepath): 9 | """ 10 | Process songs files and insert records into the Postgres database. 
11 | :param cur: cursor reference 12 | :param filepath: complete file path for the file to load 13 | """ 14 | 15 | # open song file 16 | df = pd.DataFrame([pd.read_json(filepath, typ='series', convert_dates=False)]) 17 | 18 | for value in df.values: 19 | num_songs, artist_id, artist_latitude, artist_longitude, artist_location, artist_name, song_id, title, duration, year = value 20 | 21 | # insert artist record 22 | artist_data = (artist_id, artist_name, artist_location, artist_latitude, artist_longitude) 23 | cur.execute(artist_table_insert, artist_data) 24 | 25 | # insert song record 26 | song_data = (song_id, title, artist_id, year, duration) 27 | cur.execute(song_table_insert, song_data) 28 | 29 | print(f"Records inserted for file {filepath}") 30 | 31 | 32 | def process_log_file(cur, filepath): 33 | """ 34 | Process Event log files and insert records into the Postgres database. 35 | :param cur: cursor reference 36 | :param filepath: complete file path for the file to load 37 | """ 38 | # open log file 39 | df = df = pd.read_json(filepath, lines=True) 40 | 41 | # filter by NextSong action 42 | df = df[df['page'] == "NextSong"].astype({'ts': 'datetime64[ms]'}) 43 | 44 | # convert timestamp column to datetime 45 | t = pd.Series(df['ts'], index=df.index) 46 | 47 | # insert time data records 48 | column_labels = ["timestamp", "hour", "day", "weelofyear", "month", "year", "weekday"] 49 | time_data = [] 50 | for data in t: 51 | time_data.append([data ,data.hour, data.day, data.weekofyear, data.month, data.year, data.day_name()]) 52 | 53 | time_df = pd.DataFrame.from_records(data = time_data, columns = column_labels) 54 | 55 | for i, row in time_df.iterrows(): 56 | cur.execute(time_table_insert, list(row)) 57 | 58 | # load user table 59 | user_df = df[['userId','firstName','lastName','gender','level']] 60 | 61 | # insert user records 62 | for i, row in user_df.iterrows(): 63 | cur.execute(user_table_insert, row) 64 | 65 | # insert songplay records 66 | for index, row in 
df.iterrows(): 67 | 68 | # get songid and artistid from song and artist tables 69 | cur.execute(song_select, (row.song, row.artist, row.length)) 70 | results = cur.fetchone() 71 | 72 | if results: 73 | songid, artistid = results 74 | else: 75 | songid, artistid = None, None 76 | 77 | # insert songplay record 78 | songplay_data = ( row.ts, row.userId, row.level, songid, artistid, row.sessionId, row.location, row.userAgent) 79 | cur.execute(songplay_table_insert, songplay_data) 80 | 81 | 82 | def process_data(cur, conn, filepath, func): 83 | """ 84 | Driver function to load data from songs and event log files into Postgres database. 85 | :param cur: a database cursor reference 86 | :param conn: database connection reference 87 | :param filepath: parent directory where the files exists 88 | :param func: function to call 89 | """ 90 | # get all files matching extension from directory 91 | all_files = [] 92 | for root, dirs, files in os.walk(filepath): 93 | files = glob.glob(os.path.join(root,'*.json')) 94 | for f in files : 95 | all_files.append(os.path.abspath(f)) 96 | 97 | # get total number of files found 98 | num_files = len(all_files) 99 | print('{} files found in {}'.format(num_files, filepath)) 100 | 101 | # iterate over files and process 102 | for i, datafile in enumerate(all_files, 1): 103 | func(cur, datafile) 104 | conn.commit() 105 | print('{}/{} files processed.'.format(i, num_files)) 106 | 107 | 108 | def main(): 109 | """ 110 | Driver function for loading songs and log data into Postgres database 111 | """ 112 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=postgres password=admin") 113 | cur = conn.cursor() 114 | 115 | process_data(cur, conn, filepath='data/song_data', func=process_song_file) 116 | process_data(cur, conn, filepath='data/log_data', func=process_log_file) 117 | 118 | conn.close() 119 | 120 | 121 | if __name__ == "__main__": 122 | main() 123 | print("\n\nFinished processing!!!\n\n") 
-------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/main.py: -------------------------------------------------------------------------------- 1 | from create_tables import main as create_table_main 2 | from etl import main as etl_main 3 | 4 | if __name__ == "__main__": 5 | create_table_main() 6 | etl_main() 7 | -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/sql_queries.py: -------------------------------------------------------------------------------- 1 | # DROP TABLES 2 | 3 | songplay_table_drop = "DROP TABLE IF EXISTS songplays" 4 | user_table_drop = "DROP TABLE IF EXISTS users" 5 | song_table_drop = "DROP TABLE IF EXISTS songs" 6 | artist_table_drop = "DROP TABLE IF EXISTS artists" 7 | time_table_drop = "DROP TABLE IF EXISTS time" 8 | 9 | # CREATE TABLES 10 | 11 | songplay_table_create = ("""CREATE TABLE IF NOT EXISTS songplays( 12 | songplay_id SERIAL CONSTRAINT songplay_pk PRIMARY KEY, 13 | start_time TIMESTAMP REFERENCES time (start_time), 14 | user_id INT REFERENCES users (user_id), 15 | level VARCHAR NOT NULL, 16 | song_id VARCHAR REFERENCES songs (song_id), 17 | artist_id VARCHAR REFERENCES artists (artist_id), 18 | session_id INT NOT NULL, 19 | location VARCHAR, 20 | user_agent TEXT 21 | )""") 22 | 23 | user_table_create = ("""CREATE TABLE IF NOT EXISTS users( 24 | user_id INT CONSTRAINT users_pk PRIMARY KEY, 25 | first_name VARCHAR, 26 | last_name VARCHAR, 27 | gender CHAR(1), 28 | level VARCHAR NOT NULL 29 | )""") 30 | 31 | song_table_create = ("""CREATE TABLE IF NOT EXISTS songs( 32 | song_id VARCHAR CONSTRAINT songs_pk PRIMARY KEY, 33 | title VARCHAR, 34 | artist_id VARCHAR REFERENCES artists (artist_id), 35 | year INT CHECK (year >= 0), 36 | duration FLOAT 37 | )""") 38 | 39 | artist_table_create = ("""CREATE TABLE IF NOT EXISTS artists( 40 | artist_id VARCHAR CONSTRAINT artist_pk PRIMARY KEY, 41 | name VARCHAR, 42 | location 
VARCHAR, 43 | latitude DECIMAL(9,6), 44 | longitude DECIMAL(9,6) 45 | )""") 46 | 47 | time_table_create = ("""CREATE TABLE IF NOT EXISTS time( 48 | start_time TIMESTAMP CONSTRAINT time_pk PRIMARY KEY, 49 | hour INT NOT NULL CHECK (hour >= 0), 50 | day INT NOT NULL CHECK (day >= 0), 51 | week INT NOT NULL CHECK (week >= 0), 52 | month INT NOT NULL CHECK (month >= 0), 53 | year INT NOT NULL CHECK (year >= 0), 54 | weekday VARCHAR NOT NULL 55 | )""") 56 | 57 | # INSERT RECORDS 58 | 59 | songplay_table_insert = ("""INSERT INTO songplays VALUES (DEFAULT, %s, %s, %s, %s, %s, %s, %s, %s ) 60 | """) 61 | 62 | 63 | # Updating the user level on conflict 64 | user_table_insert = ("""INSERT INTO users (user_id, first_name, last_name, gender, level) VALUES (%s, %s, %s, %s, %s) 65 | ON CONFLICT (user_id) DO UPDATE SET 66 | level = EXCLUDED.level 67 | """) 68 | 69 | song_table_insert = ("""INSERT INTO songs (song_id, title, artist_id, year, duration) VALUES (%s, %s, %s, %s, %s) 70 | ON CONFLICT (song_id) DO NOTHING 71 | """) 72 | 73 | 74 | # Artist location, latitude and longitude might change and need to be updated. 
75 | artist_table_insert = ("""INSERT INTO artists (artist_id, name, location, latitude, longitude) VALUES (%s, %s, %s, %s, %s) 76 | ON CONFLICT (artist_id) DO UPDATE SET 77 | location = EXCLUDED.location, 78 | latitude = EXCLUDED.latitude, 79 | longitude = EXCLUDED.longitude 80 | """) 81 | 82 | time_table_insert = ("""INSERT INTO time VALUES (%s, %s, %s, %s, %s, %s, %s) ON CONFLICT (start_time) DO NOTHING 83 | """) 84 | 85 | # FIND SONGS 86 | 87 | song_select = (""" 88 | SELECT song_id, artists.artist_id 89 | FROM songs JOIN artists ON songs.artist_id = artists.artist_id 90 | WHERE songs.title = %s 91 | AND artists.name = %s 92 | AND songs.duration = %s 93 | """) 94 | 95 | # QUERY LISTS 96 | 97 | create_table_queries = [user_table_create, artist_table_create, song_table_create, time_table_create, songplay_table_create] 98 | drop_table_queries = [songplay_table_drop, user_table_drop, song_table_drop, artist_table_drop, time_table_drop] 99 | 100 | -------------------------------------------------------------------------------- /Data_Modeling_with_Postgres/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext sql" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "%sql postgresql://student:student@127.0.0.1/sparkifydb" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "%sql SELECT * FROM songplays LIMIT 5;" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "%sql SELECT * FROM users LIMIT 5;" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | 
"source": [ 45 | "%sql SELECT * FROM songs LIMIT 5;" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "%sql SELECT * FROM artists LIMIT 5;" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "%sql SELECT * FROM time LIMIT 5;" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## REMEMBER: Restart this notebook to close connection to `sparkifydb`\n", 71 | "Each time you run the cells above, remember to restart this notebook to close the connection to your database. Otherwise, you won't be able to run your code in `create_tables.py`, `etl.py`, or `etl.ipynb` files since you can't make multiple connections to the same database (in this case, sparkifydb)." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [] 80 | } 81 | ], 82 | "metadata": { 83 | "kernelspec": { 84 | "display_name": "Python 3", 85 | "language": "python", 86 | "name": "python3" 87 | }, 88 | "language_info": { 89 | "codemirror_mode": { 90 | "name": "ipython", 91 | "version": 3 92 | }, 93 | "file_extension": ".py", 94 | "mimetype": "text/x-python", 95 | "name": "python", 96 | "nbconvert_exporter": "python", 97 | "pygments_lexer": "ipython3", 98 | "version": "3.6.3" 99 | } 100 | }, 101 | "nbformat": 4, 102 | "nbformat_minor": 2 103 | } 104 | -------------------------------------------------------------------------------- /Data_Warehouse/README.md: -------------------------------------------------------------------------------- 1 | # Project Data Warehouse 2 | ## Project Overview 3 | 4 | A music streaming startup, Sparkify, has grown their user base and song database and want to move their processes and data onto the cloud. 
Their data resides in S3, in a directory of JSON logs on user activity on the app, as well as a directory with JSON metadata on the songs in their app. 5 | 6 | In this project, we will create an ETL pipeline to build a data warehouse hosted on Redshift. 7 | 8 | ## Song Dataset 9 | We will be working with two datasets that reside in S3. 10 | 11 | #### Song Dataset: 12 | It's a subset of real data from [Million Song Dataset](https://labrosa.ee.columbia.edu/millionsong/). Each file is in JSON format and contains metadata about a song and the artist of that song. The files are partitioned by the first three letters of each song's track ID. 13 | 14 | Sample Data: 15 | ``` 16 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0} 17 | ``` 18 | 19 | ## Log Dataset 20 | The second dataset consists of log files in JSON format generated by this [event simulator](https://github.com/Interana/eventsim) based on the songs in the dataset above. These simulate app activity logs from an imaginary music streaming app based on configuration settings. 21 | 22 | The log files in the dataset are partitioned by year and month. 23 | 24 | Sample Data: 25 | 26 | {"artist":null,"auth":"Logged In","firstName":"Walter","gender":"M","itemInSession":0,"lastName":"Frye","length":null,"level":"free","location":"San Francisco-Oakland-Hayward, CA","method":"GET","page":"Home","registration":1540919166796.0,"sessionId":38,"song":null,"status":200,"ts":1541105830796,"userAgent":"\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/36.0.1985.143 Safari\/537.36\"","userId":"39"} 27 | 28 | 29 | ## Schema for Song Play Analysis 30 | 31 | #### Fact Table 32 | songplays - records in event data associated with song plays.
Columns for the table: 33 | 34 | songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent 35 | 36 | #### Dimension Tables 37 | ##### users 38 | 39 | user_id, first_name, last_name, gender, level 40 | ##### songs 41 | 42 | song_id, title, artist_id, year, duration 43 | 44 | ##### artists 45 | 46 | artist_id, name, location, latitude, longitude 47 | 48 | ##### time 49 | 50 | start_time, hour, day, week, month, year, weekday 51 | 52 | 53 | ## How to Run 54 | #### Use [Redshift_Cluster_IaC.py](https://github.com/san089/Data_Engineering_Projects/blob/master/Redshift_Cluster_IaC.py "Redshift_Cluster_IaC.py") from [Data_Engineering_Projects](https://github.com/san089/Data_Engineering_Projects) to launch Redshift Cluster. 55 | 56 | #### Setup Configurations 57 | Setup the dwh.cfg file (File not added in this repository). File format for **dwh.cfg** 58 | 59 | ``` 60 | [CLUSTER] 61 | HOST='' 62 | DB_NAME='' 63 | DB_USER='' 64 | DB_PASSWORD='' 65 | DB_PORT=5439 66 | 67 | [IAM_ROLE] 68 | ARN= 69 | 70 | [S3] 71 | LOG_DATA='s3://udacity-dend/log_data' 72 | LOG_JSONPATH='s3://udacity-dend/log_json_path.json' 73 | SONG_DATA='s3://udacity-dend/song_data' 74 | 75 | ``` 76 | 77 | #### Create tables 78 | 79 | $ python create_tables.py 80 | 81 | #### Load Data 82 | 83 | $ python etl.py 84 | 85 | 86 | Reference: [AWS Redshift Doc](https://aws.amazon.com/redshift/getting-started/?p=rs&bttn=hero&exp=b) -------------------------------------------------------------------------------- /Data_Warehouse/create_tables.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import psycopg2 3 | from sql_queries import create_table_queries, drop_table_queries 4 | 5 | 6 | def drop_tables(cur, conn): 7 | for query in drop_table_queries: 8 | cur.execute(query) 9 | conn.commit() 10 | 11 | 12 | def create_tables(cur, conn): 13 | for query in create_table_queries: 14 | cur.execute(query) 15 |
conn.commit() 16 | 17 | 18 | def main(): 19 | config = configparser.ConfigParser() 20 | config.read('dwh.cfg') 21 | 22 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 23 | cur = conn.cursor() 24 | 25 | drop_tables(cur, conn) 26 | create_tables(cur, conn) 27 | 28 | conn.close() 29 | 30 | 31 | if __name__ == "__main__": 32 | main() -------------------------------------------------------------------------------- /Data_Warehouse/etl.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import psycopg2 3 | from sql_queries import copy_table_queries, insert_table_queries 4 | 5 | 6 | def load_staging_tables(cur, conn): 7 | for query in copy_table_queries: 8 | cur.execute(query) 9 | conn.commit() 10 | 11 | 12 | def insert_tables(cur, conn): 13 | for query in insert_table_queries: 14 | cur.execute(query) 15 | conn.commit() 16 | 17 | 18 | def main(): 19 | config = configparser.ConfigParser() 20 | config.read('dwh.cfg') 21 | 22 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 23 | cur = conn.cursor() 24 | 25 | load_staging_tables(cur, conn) 26 | insert_tables(cur, conn) 27 | 28 | conn.close() 29 | 30 | 31 | if __name__ == "__main__": 32 | main() -------------------------------------------------------------------------------- /Data_Warehouse/log_json_path.json: -------------------------------------------------------------------------------- 1 | { 2 | "jsonpaths": [ 3 | "$.artist", 4 | "$.auth", 5 | "$.firstName", 6 | "$.gender", 7 | "$.itemInSession", 8 | "$.lastName", 9 | "$.length", 10 | "$.level", 11 | "$.location", 12 | "$.method", 13 | "$.page", 14 | "$.registration", 15 | "$.sessionId", 16 | "$.song", 17 | "$.status", 18 | "$.ts", 19 | "$.userAgent", 20 | "$.userId" 21 | ] 22 | } -------------------------------------------------------------------------------- /Data_Warehouse/sql_queries.py: 
-------------------------------------------------------------------------------- 1 | import configparser 2 | 3 | 4 | # CONFIG 5 | config = configparser.ConfigParser() 6 | config.read('dwh.cfg') 7 | 8 | # DROP TABLES 9 | 10 | staging_events_table_drop = "DROP TABle IF EXISTS staging_events;" 11 | staging_songs_table_drop = "DROP TABLE IF EXISTS staging_songs;" 12 | songplay_table_drop = "DROP TABLE IF EXISTS songplays;" 13 | user_table_drop = "DROP TABLE IF EXISTS users;" 14 | song_table_drop = "DROP TABLE IF EXISTS songs;" 15 | artist_table_drop = "DROP TABLE IF EXISTS artists;" 16 | time_table_drop = "DROP TABLE IF EXISTS time;" 17 | 18 | # CREATE TABLES 19 | 20 | staging_events_table_create= (""" 21 | CREATE TABLE IF NOT EXISTS staging_events 22 | ( 23 | artist VARCHAR, 24 | auth VARCHAR, 25 | firstName VARCHAR(50), 26 | gender CHAR, 27 | itemInSession INTEGER, 28 | lastName VARCHAR(50), 29 | length FLOAT, 30 | level VARCHAR, 31 | location VARCHAR, 32 | method VARCHAR, 33 | page VARCHAR, 34 | registration FLOAT, 35 | sessionId INTEGER, 36 | song VARCHAR, 37 | status INTEGER, 38 | ts BIGINT, 39 | userAgent VARCHAR, 40 | userId INTEGER 41 | ); 42 | """) 43 | 44 | staging_songs_table_create = (""" 45 | CREATE TABLE IF NOT EXISTS staging_songs 46 | ( 47 | num_songs INTEGER, 48 | artist_id VARCHAR, 49 | artist_latitude FLOAT, 50 | artist_longitude FLOAT, 51 | artist_location VARCHAR, 52 | artist_name VARCHAR, 53 | song_id VARCHAR, 54 | title VARCHAR, 55 | duration FLOAT, 56 | year FLOAT 57 | ); 58 | """) 59 | 60 | songplay_table_create = (""" 61 | CREATE TABLE IF NOT EXISTS songplays 62 | ( 63 | songplay_id INTEGER IDENTITY (1, 1) PRIMARY KEY , 64 | start_time TIMESTAMP, 65 | user_id INTEGER, 66 | level VARCHAR, 67 | song_id VARCHAR, 68 | artist_id VARCHAR, 69 | session_id INTEGER, 70 | location VARCHAR, 71 | user_agent VARCHAR 72 | ) 73 | DISTSTYLE KEY 74 | DISTKEY ( start_time ) 75 | SORTKEY ( start_time ); 76 | """) 77 | 78 | user_table_create = (""" 79 | CREATE 
TABLE IF NOT EXISTS users 80 | ( 81 | userId INTEGER PRIMARY KEY, 82 | firsname VARCHAR(50), 83 | lastname VARCHAR(50), 84 | gender CHAR(1) ENCODE BYTEDICT, 85 | level VARCHAR ENCODE BYTEDICT 86 | ) 87 | SORTKEY (userId); 88 | """) 89 | 90 | song_table_create = (""" 91 | CREATE TABLE IF NOT EXISTS songs 92 | ( 93 | song_id VARCHAR PRIMARY KEY, 94 | title VARCHAR, 95 | artist_id VARCHAR, 96 | year INTEGER ENCODE BYTEDICT, 97 | duration FLOAT 98 | ) 99 | SORTKEY (song_id); 100 | """) 101 | 102 | artist_table_create = (""" 103 | CREATE TABLE IF NOT EXISTS artists 104 | ( 105 | artist_id VARCHAR PRIMARY KEY , 106 | name VARCHAR, 107 | location VARCHAR, 108 | latitude FLOAT, 109 | longitude FLOAT 110 | ) 111 | SORTKEY (artist_id); 112 | """) 113 | 114 | time_table_create = (""" 115 | CREATE TABLE IF NOT EXISTS time 116 | ( 117 | start_time TIMESTAMP PRIMARY KEY , 118 | hour INTEGER, 119 | day INTEGER, 120 | week INTEGER, 121 | month INTEGER, 122 | year INTEGER ENCODE BYTEDICT , 123 | weekday VARCHAR(9) ENCODE BYTEDICT 124 | ) 125 | DISTSTYLE KEY 126 | DISTKEY ( start_time ) 127 | SORTKEY (start_time); 128 | """) 129 | 130 | # STAGING TABLES 131 | 132 | staging_events_copy = (""" 133 | COPY staging_events 134 | FROM {} 135 | iam_role {} 136 | FORMAT AS json {}; 137 | """).format(config['S3']['LOG_DATA'], config['IAM_ROLE']['ARN'], config['S3']['LOG_JSONPATH']) 138 | 139 | staging_songs_copy = (""" 140 | COPY staging_songs 141 | FROM {} 142 | iam_role {} 143 | FORMAT AS json 'auto'; 144 | """).format(config['S3']['SONG_DATA'], config['IAM_ROLE']['ARN']) 145 | 146 | # FINAL TABLES 147 | 148 | songplay_table_insert = (""" 149 | INSERT INTO songplays (START_TIME, USER_ID, LEVEL, SONG_ID, ARTIST_ID, SESSION_ID, LOCATION, USER_AGENT) 150 | SELECT DISTINCT 151 | TIMESTAMP 'epoch' + (se.ts / 1000) * INTERVAL '1 second' as start_time, 152 | se.userId, 153 | se.level, 154 | ss.song_id, 155 | ss.artist_id, 156 | se.sessionId, 157 | se.location, 158 | se.userAgent 159 | FROM 
staging_songs ss 160 | INNER JOIN staging_events se 161 | ON (ss.title = se.song AND se.artist = ss.artist_name) 162 | AND se.page = 'NextSong'; 163 | """) 164 | 165 | user_table_insert = (""" 166 | INSERT INTO users 167 | SELECT DISTINCT userId, firstName, lastName, gender, level 168 | FROM staging_events 169 | WHERE userId IS NOT NULL 170 | AND page = 'NextSong'; 171 | """) 172 | 173 | song_table_insert = (""" 174 | INSERT INTO songs 175 | SELECT 176 | DISTINCT song_id, title, artist_id, year, duration 177 | FROM staging_songs 178 | WHERE song_id IS NOT NULL; 179 | """) 180 | 181 | artist_table_insert = (""" 182 | INSERT INTO artists 183 | SELECT 184 | DISTINCT artist_id, artist_name, artist_location, artist_latitude, artist_longitude 185 | FROM staging_songs; 186 | """) 187 | 188 | time_table_insert = (""" 189 | insert into time 190 | SELECT DISTINCT 191 | TIMESTAMP 'epoch' + (ts/1000) * INTERVAL '1 second' as start_time, 192 | EXTRACT(HOUR FROM start_time) AS hour, 193 | EXTRACT(DAY FROM start_time) AS day, 194 | EXTRACT(WEEKS FROM start_time) AS week, 195 | EXTRACT(MONTH FROM start_time) AS month, 196 | EXTRACT(YEAR FROM start_time) AS year, 197 | to_char(start_time, 'Day') AS weekday 198 | FROM staging_events; 199 | """) 200 | 201 | # QUERY LISTS 202 | 203 | create_table_queries = [staging_events_table_create, staging_songs_table_create, songplay_table_create, user_table_create, song_table_create, artist_table_create, time_table_create] 204 | drop_table_queries = [staging_events_table_drop, staging_songs_table_drop, songplay_table_drop, user_table_drop, song_table_drop, artist_table_drop, time_table_drop] 205 | copy_table_queries = [staging_events_copy, staging_songs_copy] 206 | insert_table_queries = [songplay_table_insert, user_table_insert, song_table_insert, artist_table_insert, time_table_insert] 207 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Project License 2 | 3 | This work was done by me as a part of Udacity Data Engineering Nanodegree. When you enroll for Udacity Courses you accept the Udacity Honor Code and below are 2 of the several terms you agree to: 4 | 5 | 1. You will not cheat on any assignment, projects, or assessments, as applicable, for the Online Courses. For example, giving or receiving unpermitted aid on, or copying or sharing of, another User’s (as defined in the Terms of Use) projects or other class work (or any portion thereof); 6 | 7 | 2. You will not plagiarize (a form of cheating) the work of others. For example, using another person’s original work (e.g., code, language, formulas, etc.) in your assignments, projects, or assessments. 8 | 9 | For more information check here - https://www.udacity.com/legal/en-us/community-guidelines 10 | 11 | If you are using the code from this repository for project submission you are violating the above terms and you take full responsibilities of it. This repo is just to refer the code and not to use it for your submission on Udacity. 12 | 13 | I, Sanchit Kumar the author of this repository, in no way responsible if you get expelled from the course violating the Udacity Honor Code. 14 | 15 | Copyright (c) 2020 Sanchit Kumar 16 | 17 | Besides the above notice, the following license applies and this license notice must be included in all works derived from this project. 
18 | 19 | MIT License 20 | 21 | Copyright (c) 2020 Sanchit Kumar 22 | 23 | Permission is hereby granted, free of charge, to any person obtaining a copy 24 | of this software and associated documentation files (the "Software"), to deal 25 | in the Software without restriction, including without limitation the rights 26 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 27 | copies of the Software, and to permit persons to whom the Software is 28 | furnished to do so, subject to the following conditions: 29 | 30 | The above copyright notice and this permission notice shall be included in all 31 | copies or substantial portions of the Software. 32 | 33 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 34 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 35 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 36 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 37 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 38 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 39 | SOFTWARE. 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering Projects 2 | 3 | ![](https://github.com/san089/Udacity-Data-Engineering-Projects/blob/master/image.jpeg) 4 | 5 | ## Project 1: Data Modeling with Postgres 6 | In this project, we apply Data Modeling with Postgres and build an ETL pipeline using Python. A startup wants to analyze the data they've been collecting on songs and user activity on their new music streaming app. Currently, they are collecting data in json format and the analytics team is particularly interested in understanding what songs users are listening to. 
7 | 8 | Link: [Data_Modeling_with_Postgres](https://github.com/san089/Udacity-Data-Engineering-Projects/tree/master/Data_Modeling_with_Postgres) 9 | 10 | ## Project 2: Data Modeling with Cassandra 11 | In this project, we apply Data Modeling with Cassandra and build an ETL pipeline using Python. We will build a Data Model around the queries that we want to get answers for. 12 | For our use case, we want answers to the following queries: 13 | 14 | - Get details of a song that was heard in the music app history during a particular session. 15 | - Get songs played by a user during a particular session on the music app. 16 | - Get all users from the music app history who listened to a particular song. 17 | 18 | Link: [Data_Modeling_with_Apache_Cassandra](https://github.com/san089/Udacity-Data-Engineering-Projects/tree/master/Data_Modeling_with_Apache_Cassandra) 19 | 20 | ## Project 3: Data Warehouse 21 | In this project, we apply the Data Warehouse architectures we learnt and build a Data Warehouse on AWS cloud. We build an ETL pipeline to extract and transform data stored in JSON format in S3 buckets and move the data to a warehouse hosted on Amazon Redshift. 22 | 23 | Use Redshift IaC script - [Redshift_IaC_README](https://github.com/san089/Udacity-Data-Engineering-Projects/blob/master/Redshift_IaC_README.md) 24 | 25 | Link - [Data_Warehouse](https://github.com/san089/Udacity-Data-Engineering-Projects/tree/master/Data_Warehouse) 26 | 27 | ## Project 4: Data Lake 28 | In this project, we will build a Data Lake on AWS cloud using Spark and an AWS EMR cluster. The data lake will serve as a Single Source of Truth for the Analytics Platform. We will write Spark jobs to perform ELT operations that pick data from the landing zone on S3, transform it, and store it in the S3 processed zone. 
29 | 30 | Link: [Data_Lake](https://github.com/san089/Udacity-Data-Engineering-Projects/tree/master/Data_Lake) 31 | 32 | ## Project 5: Data Pipelines with Airflow 33 | In this project, we will orchestrate our Data Pipeline workflow using an open-source Apache project called Apache Airflow. We will schedule our ETL jobs in Airflow, create project-related custom plugins and operators, and automate the pipeline execution. 34 | 35 | Link: [Airflow_Data_Pipelines](https://github.com/san089/Udacity-Data-Engineering-Projects/tree/master/Airflow_Data_Pipelines) 36 | 37 | ## Project 6: API Data to Postgres 38 | In this project, we build an ETL pipeline to fetch data from the Yelp API and insert it into the Postgres Database. This project is a very basic example of fetching real-time data from an open source API. 39 | 40 | Link: [API to Postgres](https://github.com/san089/Udacity-Data-Engineering-Projects/tree/master/Data_Api_to_Postgres) 41 | 42 | ## CAPSTONE PROJECT 43 | Udacity provides its own crafted Capstone project with a dataset that includes data on immigration to the United States, and supplementary datasets that include data on airport codes, U.S. city demographics, and temperature data. 44 | 45 | I worked on my own open-ended project.
46 | Here is the link - [goodreads_etl_pipeline](https://github.com/san089/goodreads_etl_pipeline) 47 | -------------------------------------------------------------------------------- /Redshift_IaC_README.md: -------------------------------------------------------------------------------- 1 | # Redshift IaC (Infrastructure as Code) Utility 2 | 3 | Link to Code: [Redshift_IaC_Utility](https://github.com/san089/Udacity-Data-Engineering-Projects/blob/master/Redshift_Cluster_IaC.py) 4 | 5 | This utility automates the infrastructure deployment and configuration for a Redshift cluster along with the pre-requisites to spin up the cluster. 6 | 7 | The code performs 3 setup steps: 8 | 9 | - Create or delete an IAM role and apply the appropriate policy to allow access to other AWS services. 10 | - Create or delete a VPC security group, with appropriate inbound rules to allow connection to the cluster. 11 | - Finally, spin up a cluster using the hardware configuration defined in the config file, set up master DB configs, apply cluster permissions with the IAM role and set VPC security groups as created above. 12 | 13 | ### Setup Configurations File - cluster.config 14 | 15 | [AWS] 16 | KEY= 17 | SECRET= 18 | 19 | [DWH] 20 | DWH_CLUSTER_TYPE= 21 | DWH_NUM_NODES= 22 | DWH_NODE_TYPE= 23 | DWH_CLUSTER_IDENTIFIER= 24 | DWH_DB= 25 | DWH_DB_USER= 26 | DWH_DB_PASSWORD= 27 | DWH_PORT= 28 | 29 | 30 | [IAM_ROLE] 31 | NAME= 32 | DESCRIPTION= 33 | POLICY_ARN= 34 | 35 | [SECURITY_GROUP] 36 | NAME= 37 | DESCRIPTION= 38 | 39 | [INBOUND_RULE] 40 | TYPE= 41 | PROTOCOL= 42 | PORT_RANGE= 43 | CIDRIP= 44 | DESCRIPTION= 45 | 46 | ### Usage 47 | 48 | 49 | > python Redshift_Cluster_IaC.py -h 50 | 51 | usage: Redshift_Cluster_IaC.py [-h] -c -d [-v] 52 | 53 | A Redshift cluster IaC (Infrastructure as Code). It creates IAM role for the 54 | Redshift, creates security group and sets up ingress parameters. Finally spin- 55 | up a redshift cluster. 56 | 57 | required arguments: 58 | -c , --create True or False. 
Create IAM roles, security group and 59 | redshift cluster if ie does not exist. 60 | -d , --delete True or False. Delete the roles, securitygroup and 61 | cluster. WARNING: Deletes the Redshift cluster, IAM role 62 | and security group. 63 | 64 | optional arguments: 65 | -v , --verbosity Increase output verbosity. Default set to DEBUG. 66 | 67 | ### How to run 68 | Create a new cluster: 69 | 70 | python Redshift_Cluster_IaC.py --create TRUE --delete FALSE --verbosity TRUE 71 | 72 | Delete cluster: 73 | 74 | python Redshift_Cluster_IaC.py --create FALSE --delete TRUE --verbosity TRUE 75 | 76 | 77 | 78 |
79 |
def test_connection(host):
    """Smoke-test connectivity to the Redshift cluster reachable at *host*.

    Connection settings (database name, port, user and password) are read
    from the ``[DWH]`` section of the module-level ``config`` parser. The
    function creates a scratch table ``test``, inserts a single row, reads
    the table back and prints the fetched rows.

    No commit is issued. Per psycopg2's documented ``close()`` behaviour,
    uncommitted changes are discarded when the connection is closed, so the
    scratch table is not expected to persist.

    Args:
        host: Hostname / endpoint of the cluster, passed straight to
            ``psycopg2.connect``.

    Raises:
        configparser.Error: If the ``[DWH]`` section or one of its options
            is missing from the loaded configuration.
        psycopg2.Error: Presumably raised by psycopg2 when the connection or
            any of the statements fails (e.g. ``test`` already exists).
    """
    dbname = config.get('DWH', 'DWH_DB')
    port = config.get('DWH', 'DWH_PORT')
    user = config.get('DWH', 'DWH_DB_USER')
    password = config.get('DWH', 'DWH_DB_PASSWORD')

    con = psycopg2.connect(
        dbname=dbname,
        host=host,
        port=port,
        user=user,
        password=password,
    )
    try:
        cur = con.cursor()

        cur.execute("CREATE TABLE test (id int);")
        cur.execute("INSERT INTO test VALUES (10);")
        cur.execute('SELECT * FROM test')
        # cursor.execute() returns None, so printing its result would only
        # ever show "None"; the rows have to be fetched explicitly.
        print(cur.fetchall())
    finally:
        # Always release the connection, even when a statement above fails.
        con.close()
| [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=stream_handler 6 | 7 | [formatters] 8 | keys=formatter 9 | 10 | [logger_root] 11 | level=DEBUG 12 | handlers=stream_handler 13 | 14 | [handler_stream_handler] 15 | class=StreamHandler 16 | level=DEBUG 17 | formatter=formatter 18 | args=(sys.stderr,) 19 | 20 | [formatter_formatter] 21 | format=%(asctime)s %(name)-12s %(levelname)-8s %(message)s --------------------------------------------------------------------------------