├── Dags
│   ├── spotify_etl.py
│   └── spotify_final_dag.py
├── Delete_tables.py
├── Extract.py
├── Load.py
├── README.md
├── Transform.py
├── __pycache__
│   ├── Extract.cpython-37.pyc
│   └── Transform.cpython-37.pyc
├── docker-compose.yml
└── my_played_tracks.sqlite

/Dags/spotify_etl.py:
--------------------------------------------------------------------------------
import datetime

import pandas as pd
import requests

USER_ID = "YOUR_USERNAME_HERE"
TOKEN = "YOUR_TOKEN_HERE"


# Creating a function to be reused in other Python files
def return_dataframe():
    input_variables = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": "Bearer {token}".format(token=TOKEN)
    }

    today = datetime.datetime.now()
    yesterday = today - datetime.timedelta(days=1)
    yesterday_unix_timestamp = int(yesterday.timestamp()) * 1000

    # Download all songs you've listened to "after yesterday", which means in the last 24 hours
    r = requests.get("https://api.spotify.com/v1/me/player/recently-played?limit=50&after={time}".format(time=yesterday_unix_timestamp), headers=input_variables)

    data = r.json()
    song_names = []
    artist_names = []
    played_at_list = []
    timestamps = []

    # Extracting only the relevant bits of data from the json object
    for song in data["items"]:
        song_names.append(song["track"]["name"])
        artist_names.append(song["track"]["album"]["artists"][0]["name"])
        played_at_list.append(song["played_at"])
        timestamps.append(song["played_at"][0:10])

    # Prepare a dictionary in order to turn it into a pandas dataframe below
    song_dict = {
        "song_name": song_names,
        "artist_name": artist_names,
        "played_at": played_at_list,
        "timestamp": timestamps
    }
    song_df = pd.DataFrame(song_dict, columns=["song_name", "artist_name", "played_at", "timestamp"])
    return song_df


def Data_Quality(load_df):
    # Checking whether the DataFrame is empty
    if load_df.empty:
        print('No Songs Extracted')
        return False

    # Enforcing the primary key since we don't want duplicates
    if not pd.Series(load_df['played_at']).is_unique:
        # Raising an exception immediately terminates the run and avoids further processing
        raise Exception("Primary Key Exception, data might contain duplicates")

    # Checking for nulls in our data frame
    if load_df.isnull().values.any():
        raise Exception("Null values found")

    return True


# Transformation to get the play count per artist
def Transform_df(load_df):
    # Applying transformation logic
    Transformed_df = load_df.groupby(['timestamp', 'artist_name'], as_index=False).count()
    Transformed_df.rename(columns={'played_at': 'count'}, inplace=True)

    # Creating a primary key based on timestamp and artist name
    Transformed_df["ID"] = Transformed_df['timestamp'].astype(str) + "-" + Transformed_df["artist_name"]

    return Transformed_df[['ID', 'timestamp', 'artist_name', 'count']]


def spotify_etl():
    # Extract the recently played tracks
    load_df = return_dataframe()
    if not Data_Quality(load_df):
        raise Exception("Failed at data validation")
    # Calling the transformation (kept for parity with the standalone pipeline; the DAG loads the raw tracks)
    Transformed_df = Transform_df(load_df)
    print(load_df)
    return load_df


if __name__ == "__main__":
    spotify_etl()

--------------------------------------------------------------------------------
/Dags/spotify_final_dag.py:
--------------------------------------------------------------------------------
import datetime as dt
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.hooks.base_hook import BaseHook
from airflow.providers.postgres.operators.postgres import PostgresOperator
from sqlalchemy import create_engine

from spotify_etl import spotify_etl

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': dt.datetime(2023, 1, 29),
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=1)
}

dag = DAG(
    'spotify_final_dag',
    default_args=default_args,
    description='Spotify ETL process',
    schedule_interval=dt.timedelta(minutes=50),
)


def ETL():
    print("started")
    df = spotify_etl()
    conn = BaseHook.get_connection('postgre_sql')
    engine = create_engine(f'postgresql://{conn.login}:{conn.password}@{conn.host}:{conn.port}/{conn.schema}')
    # Note: if_exists='replace' drops and recreates the table contents on every run
    df.to_sql('my_played_tracks', engine, if_exists='replace')


with dag:
    create_table = PostgresOperator(
        task_id='create_table',
        postgres_conn_id='postgre_sql',
        sql="""
            CREATE TABLE IF NOT EXISTS my_played_tracks(
            song_name VARCHAR(200),
            artist_name VARCHAR(200),
            played_at VARCHAR(200),
            timestamp VARCHAR(200),
            CONSTRAINT primary_key_constraint PRIMARY KEY (played_at)
            )
            """
    )

    run_etl = PythonOperator(
        task_id='spotify_etl_final',
        python_callable=ETL,
        dag=dag,
    )

    create_table >> run_etl

--------------------------------------------------------------------------------
/Delete_tables.py:
--------------------------------------------------------------------------------
import sqlalchemy
import sqlite3

DATABASE_LOCATION = "sqlite:///my_played_tracks.sqlite"

if __name__ == "__main__":

    # Connecting to the database
    engine = sqlalchemy.create_engine(DATABASE_LOCATION)
    conn = sqlite3.connect('my_played_tracks.sqlite')
    cursor = conn.cursor()
    print("Opened database successfully")

    # Deleting the tables
    cursor.execute('DROP TABLE my_played_tracks')
    cursor.execute('DROP TABLE fav_artist')

    conn.close()
    print("Closed database successfully")

--------------------------------------------------------------------------------
/Extract.py:
--------------------------------------------------------------------------------
import datetime

import pandas as pd
import requests

USER_ID = "YOUR_USER_NAME"
TOKEN = "YOUR_TOKEN"


# Creating a function to be reused in other Python files
def return_dataframe():
    input_variables = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": "Bearer {token}".format(token=TOKEN)
    }

    today = datetime.datetime.now()
    yesterday = today - datetime.timedelta(days=2)
    yesterday_unix_timestamp = int(yesterday.timestamp()) * 1000

    # Download all songs you've listened to after the computed timestamp (here the window is the last two days)
    r = requests.get("https://api.spotify.com/v1/me/player/recently-played?after={time}".format(time=yesterday_unix_timestamp), headers=input_variables)

    data = r.json()
    song_names = []
    artist_names = []
    played_at_list = []
    timestamps = []

    # Extracting only the relevant bits of data from the json object
    for song in data["items"]:
        song_names.append(song["track"]["name"])
        artist_names.append(song["track"]["album"]["artists"][0]["name"])
        played_at_list.append(song["played_at"])
        timestamps.append(song["played_at"][0:10])

    # Prepare a dictionary in order to turn it into a pandas dataframe below
    song_dict = {
        "song_name": song_names,
        "artist_name": artist_names,
        "played_at": played_at_list,
        "timestamp": timestamps
    }
    song_df = pd.DataFrame(song_dict, columns=["song_name", "artist_name", "played_at", "timestamp"])
    return song_df

--------------------------------------------------------------------------------
/Load.py:
--------------------------------------------------------------------------------
import Extract
import Transform
import sqlalchemy
import sqlite3

DATABASE_LOCATION = "sqlite:///my_played_tracks.sqlite"

if __name__ == "__main__":

    # Importing the songs DataFrame from Extract.py
    load_df = Extract.return_dataframe()
    if not Transform.Data_Quality(load_df):
        raise Exception("Failed at data validation")
    Transformed_df = Transform.Transform_df(load_df)
    # These are the two DataFrames that need to be loaded into the database

    # Loading into the database
    engine = sqlalchemy.create_engine(DATABASE_LOCATION)
    conn = sqlite3.connect('my_played_tracks.sqlite')
    cursor = conn.cursor()

    # SQL query to create the played-songs table
    sql_query_1 = """
    CREATE TABLE IF NOT EXISTS my_played_tracks(
        song_name VARCHAR(200),
        artist_name VARCHAR(200),
        played_at VARCHAR(200),
        timestamp VARCHAR(200),
        CONSTRAINT primary_key_constraint PRIMARY KEY (played_at)
    )
    """
    # SQL query to create the most-listened-artist table
    sql_query_2 = """
    CREATE TABLE IF NOT EXISTS fav_artist(
        timestamp VARCHAR(200),
        ID VARCHAR(200),
        artist_name VARCHAR(200),
        count VARCHAR(200),
        CONSTRAINT primary_key_constraint PRIMARY KEY (ID)
    )
    """
    cursor.execute(sql_query_1)
    cursor.execute(sql_query_2)
    print("Opened database successfully")

    # We only append new data; the primary key constraint rejects duplicates
    try:
        load_df.to_sql("my_played_tracks", engine, index=False, if_exists='append')
    except Exception:
        print("Data already exists in my_played_tracks")
    try:
        Transformed_df.to_sql("fav_artist", engine, index=False, if_exists='append')
    except Exception:
        print("Data already exists in fav_artist")

    # cursor.execute('DROP TABLE my_played_tracks')
    # cursor.execute('DROP TABLE fav_artist')

    conn.close()
    print("Closed database successfully")

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Data Engineering Project 2 | Building a Spotify ETL using Python and Airflow

Create an Extract, Transform, Load pipeline using Python and automate it with Airflow.

![](https://miro.medium.com/max/749/1*dm8hVrPTPMenyRY4uJiBIA@2x.png)

Image by Author

In this blog post, I will explain how to create a simple ETL (Extract, Transform, Load) pipeline using Python and automate the process through Apache Airflow.

# Problem Statement:

We need to use Spotify's API to read the data, perform some basic transformations and data quality checks, and finally load the retrieved data into a PostgreSQL database, automating the entire process through Airflow. **Est. time:** 4–7 hours

# Tech Stack / Skills used:

1. Python
2. APIs
3. Docker
4. Airflow
5. PostgreSQL

# Prerequisites:

1. Knowledge of APIs
2. An understanding of what Docker and docker-compose are
3. Intermediate Python and SQL
4. A basic understanding of Airflow ([this](https://www.youtube.com/watch?v=AHMm1wfGuHE&t=705s) will help)

# Learning Outcomes:

1. Understand how to interact with an API to retrieve data
2. Handling DataFrames in pandas
3. Setting up Airflow and PostgreSQL through docker-compose
4. Learning to create DAGs in Airflow

# Introduction:

This is a beginner-friendly project to get started with building a simple pipeline and automating it through Airflow. First, we will focus entirely on building the pipeline, and then extend the project by combining it with Airflow.

# Building the ETL Pipeline:

**Dataset:** In this project we are using Spotify's API, so please go ahead and create an account for yourself. After creating the account, head to this [page](https://developer.spotify.com/console/get-recently-played/?limit=&after=&before=). You will see a Get Token button; click it, select the user-read-recently-played scope, and request the token.

![](https://miro.medium.com/max/690/1*4UKYwl00ALuF9PQj-TTJyA.png)

Image by Author

You can see your token like this.

![](https://miro.medium.com/max/749/1*CPZYseTyKH-CruoJpyNl-w.png)

Image by Author

That is the procedure to get the token. You may need to regenerate it often, as it expires after some time.

## Extract.py

We use this token to extract the data from Spotify inside a function called return_dataframe(). The code in Extract.py shows how we call the API and convert the response into a DataFrame.

## Transform.py

Here we import the Extract module to get the data.

**def Data_Quality(load_df):** Checks for an empty data frame, enforces the uniqueness constraint, and checks for null values. Since bad records could ruin our database, it is important to enforce these data quality checks.

**def Transform_df(load_df):** Here we write some logic for our own requirement: we want to know our favourite artists, so we group the listened tracks by artist and count them. Note: this step is not required; you can implement any other logic if you wish, but make sure you enforce the primary key constraint. A small illustration of the grouping logic is shown below.
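
To make the grouping concrete, here is a tiny, self-contained sketch of the same logic applied to a hand-made DataFrame (the toy rows below are invented purely for this illustration):

```python
import pandas as pd

# Toy stand-in for the extracted tracks (invented rows, for illustration only)
load_df = pd.DataFrame({
    "song_name":   ["Song A", "Song B", "Song C"],
    "artist_name": ["Artist X", "Artist X", "Artist Y"],
    "played_at":   ["2023-01-29T10:00:00Z", "2023-01-29T11:00:00Z", "2023-01-29T12:00:00Z"],
    "timestamp":   ["2023-01-29", "2023-01-29", "2023-01-29"],
})

# Same logic as Transform_df: plays per artist per day, plus a surrogate primary key
Transformed_df = load_df.groupby(["timestamp", "artist_name"], as_index=False).count()
Transformed_df.rename(columns={"played_at": "count"}, inplace=True)
Transformed_df["ID"] = Transformed_df["timestamp"].astype(str) + "-" + Transformed_df["artist_name"]

print(Transformed_df[["ID", "timestamp", "artist_name", "count"]])
# For 2023-01-29, Artist X gets a count of 2 and Artist Y a count of 1
```

Each output row is one (day, artist) pair, and the concatenated ID column is what lets us enforce a primary key on the fav_artist table later.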

## Load.py

In the load step, we use SQLAlchemy and SQLite to load our data into a database and save the file in our project directory.

With that, we have completed our ETL pipeline. The structure of the project folder should now look like this:

    E:\DE\PROJECTS\SPOTIFY_ETL\SPOTIFY_ETL
    │   Extract.py
    │   Load.py
    │   my_played_tracks.sqlite
    │   spotify_etl.py
    │   Transform.py
    └───

After running **Load.py** you will see a .sqlite file saved to the project folder. To check the data inside it, head [here](https://inloop.github.io/sqlite-viewer/) and drop your file in.

![](https://miro.medium.com/max/749/1*OpGD1spYMVIulWVCKPttlw.png)

Image by Author

Now we will automate this process using Airflow.

# Automating through Airflow

For those who have made it this far, I appreciate your efforts 👏, but from here it gets a little tricky. Hence, I am mentioning some important points below.

1. We have completed an ETL, and this by itself is a mini project, so save the work.
2. Now we are going to extend this with Airflow using Docker.
3. Why Docker? We are using Docker since it is easier to install and maintain, and it is OS independent.
4. How to set up Airflow using Docker? Follow the guide provided in this [blog](https://medium.com/@garc1a0scar/how-to-start-with-apache-airflow-in-docker-windows-902674ad1bbe).
5. You only need to change the YAML file from the above guideline; please refer [here](https://github.com/sidharth1805/Spotify_etl/blob/main/docker-compose.yml).
6. After setting up Airflow, place your DAGs inside the dags folder.
7. Once Docker is up, you should see 4 services running.

![](https://miro.medium.com/max/749/1*txaw4D2bowisN98SbG6PZQ.png)

Image by Author

Your Airflow folder should look like the structure below.

    C:\USERS\SIDHA\DOCKER\AIRFLOW
    │   docker-compose.yml
    ├───dags
    │   │   YOUR_DAGS.py
    ├───logs
    ├───plugins
    └───scripts

Now that we have set up Airflow, we can view the Airflow UI by visiting the [8080 port](http://localhost:8080/). The username and password are both airflow.

It's time to create the required DAG for our project. But before jumping onto the DAG, let us understand what a DAG is: DAG stands for Directed Acyclic Graph, which is a set of tasks defined in their order of execution.

![](https://miro.medium.com/max/749/1*cImMkJ3NRWWLmw2o4mH9NQ.png)

Image by Author

Inside our DAG, we need to create tasks to get our job done. To keep it simple, I will use two tasks: one to create the Postgres table and another to load the data into that table. Our DAG will look like this.

![](https://miro.medium.com/max/679/1*hYbRd0gKRffQZn1Xipt-BA.png)

## spotify_etl.py

In this Python file we write the logic to extract data from the API → do quality checks → transform the data.

1. **yesterday = today - datetime.timedelta(days=1)** → Defines the number of days you want data for; change it as you wish. Since our job is a daily load, I have set it to 1.
2. **def spotify_etl()** → The core function, which returns the DataFrame to the DAG Python file.
3. This file needs to be placed inside the dags folder.

## spotify_final_dag.py

This is the most important section, so pay attention. First, learn the basics of Airflow DAGs [here](https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html); it might take around 15 minutes, or you can search for a video on YouTube. After the basics, please follow the guideline below.

1. **from airflow.operators.python_operator import PythonOperator** → We use the Python operator to run Python functions, such as inserting the DataFrame into the table.
2. **from airflow.providers.postgres.operators.postgres import PostgresOperator** → We use the Postgres operator to create tables in our Postgres database.
3. **from airflow.hooks.base_hook import BaseHook** → A hook is an abstraction of a specific API that allows Airflow to interact with an external system. Hooks are built into many operators, but they can also be used directly in DAG code. We use a hook here to connect to the Postgres database from our Python function.
4. **from spotify_etl import spotify_etl** → Imports the spotify_etl function from spotify_etl.py.

## Code Explanation:

Setting up the default arguments and the schedule interval. We can change the interval and start date according to our needs.

![](https://miro.medium.com/max/749/1*YoZQLQWbgXUn4WOo_Z1o7Q.png)

Understanding the Postgres connection and tasks.

1. **conn = BaseHook.get_connection('[Your Connection ID]')** → Connects to your Postgres DB.
2. **df.to_sql('[Your Table Name]', engine, if_exists='replace')** → Loads the DataFrame into the table.
3. **create_table >> run_etl** → Defines the order of the tasks.

![](https://miro.medium.com/max/749/1*WjsH1W213_nOkQv8pgrAvA.png)

## Setting up the Postgres Connection in the Airflow UI:

Head to the Airflow UI and open the Connections page (Admin → Connections).

![](https://miro.medium.com/max/749/1*uLaXIfiaWXPbVCJPr8oeMg.png)

Then create a connection.

1. **Connection Id:** postgre_sql, the ID we used in our code.
2. **Connection Type:** Postgres
3. **Host:** postgres (the name of the Postgres service in docker-compose)
4. **Schema:** spotify_db; you can use your own name, but make sure to check the proper database when validating.
5. **Login:** airflow
6. **Password:** airflow
7. **Port:** 5432

![](https://miro.medium.com/max/749/1*WJQ5g1OCONYLy-nYP4DBDQ.png)

Now it's time to deploy :)

## Deployment:

Check for your DAG in the DAGs section.

![](https://miro.medium.com/max/749/1*S0T8xwbjYmgl1hZLy70Z-A.png)

![](https://miro.medium.com/max/749/1*J8wRUrI_5YYVoh8eeDP1vw.png)

After activating it, run the DAG by triggering it.

![](https://miro.medium.com/max/749/1*vK59llhtj85Q-MZg1qL5bA.png)

After completion, check the logs.

![](https://miro.medium.com/max/749/1*BuGKkg7OIav6AOGFTgSwcw.png)

Now we will validate the load by connecting to the Postgres database. Open a terminal and execute **pgcli -h localhost -p 5432 -u airflow -d spotify_db**. It will prompt for a password; enter airflow. Here **spotify_db** is our database name, the "schema" of the connection.

![](https://miro.medium.com/max/749/1*NoiKtC3VXkhOV9BySpH1TA.png)

Type \d to see the tables.
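
If you prefer checking from Python instead of pgcli, an optional snippet along these lines should also work from the host machine. It is only a sketch: it assumes a Postgres driver such as psycopg2-binary is installed locally, and it reuses the same credentials, port, and database as the pgcli command above.

```python
import pandas as pd
from sqlalchemy import create_engine

# Same credentials/port/database that docker-compose and the pgcli command above use
engine = create_engine("postgresql://airflow:airflow@localhost:5432/spotify_db")

# Peek at the most recently played tracks loaded by the DAG
df = pd.read_sql(
    "SELECT song_name, artist_name, played_at "
    "FROM my_played_tracks ORDER BY played_at DESC LIMIT 10",
    engine,
)
print(df)
```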

![](https://miro.medium.com/max/501/1*jIO4zUKsmKPxz7jfVkNTLQ.png)

We can see that our table has been created, so now let's check the data.

![](https://miro.medium.com/max/749/1*BcUcfyKepcxVLisLEUk_Cw.png)

Finally, we have made it :)

Our requirement now is to automate this entire process. For demonstration purposes, let me change the Airflow interval to **3 mins** and listen to some music 🎵🎵🎵.

The current timestamp is shown below; let's check back after some time.

![](https://miro.medium.com/max/553/1*1o7tk-Sy4CfoJK6TB6u1Jw.png)

Hurray 🥳 we have made it.

![](https://miro.medium.com/max/749/1*sqe43Kaofk9I-AmLzXQvIA.png)

The new song that I just listened to has been uploaded to our database automatically.

Let's check the logs and task details. It's a **scheduled task**.

![](https://miro.medium.com/max/749/1*UXLjVHYORJ69U3QZdIuSgg.png)

![](https://miro.medium.com/max/749/1*7OM4LPzn7sR66cjBMGgNnw.png)

# Conclusion:

**Note:** We may need to change the token in our ETL Python file from time to time, since it expires after a while. There are some limitations to this project: they can be overcome by using a refresh token to renew the access token automatically, and by setting up Airflow on a cloud service so it runs 24/7 and picks up data once a day, making it a true daily load. To keep this beginner-friendly I haven't covered those; I will leave them as a TODO 😉 (a rough sketch of the refresh call is included at the end of this post).

Github Repo: [https://github.com/sidharth1805/Spotify_etl](https://github.com/sidharth1805/Spotify_etl). I hope you enjoyed the guided project. I am pretty sure you will face a lot of issues while doing it; Stack Overflow is our best friend, and feel free to connect with me on [LinkedIn](https://www.linkedin.com/in/sidharth-ramalingam/) for any further questions. Follow me on [medium](https://medium.com/@sidharth.ramalingam) to learn more about data engineering.
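
For anyone curious about the refresh-token TODO mentioned above, the renewal call could look roughly like the sketch below. It assumes you have registered an app in the Spotify developer dashboard and already obtained a client_id, client_secret, and refresh_token through the authorization-code flow; treat it as a starting point rather than production code.

```python
import base64
import requests

# Placeholders - take these from your Spotify app settings and the authorization-code flow
CLIENT_ID = "YOUR_CLIENT_ID"
CLIENT_SECRET = "YOUR_CLIENT_SECRET"
REFRESH_TOKEN = "YOUR_REFRESH_TOKEN"


def refresh_access_token():
    """Exchange the long-lived refresh token for a fresh access token."""
    auth_header = base64.b64encode(f"{CLIENT_ID}:{CLIENT_SECRET}".encode()).decode()
    response = requests.post(
        "https://accounts.spotify.com/api/token",
        headers={"Authorization": f"Basic {auth_header}"},
        data={"grant_type": "refresh_token", "refresh_token": REFRESH_TOKEN},
    )
    response.raise_for_status()
    return response.json()["access_token"]
```

The returned token could then be used in place of the hard-coded TOKEN in spotify_etl.py.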

--------------------------------------------------------------------------------
/Transform.py:
--------------------------------------------------------------------------------
import Extract
import pandas as pd


# Set of data quality checks to perform before loading
def Data_Quality(load_df):
    # Checking whether the DataFrame is empty
    if load_df.empty:
        print('No Songs Extracted')
        return False

    # Enforcing the primary key since we don't want duplicates
    if not pd.Series(load_df['played_at']).is_unique:
        # Raising an exception immediately terminates the run and avoids further processing
        raise Exception("Primary Key Exception, data might contain duplicates")

    # Checking for nulls in our data frame
    if load_df.isnull().values.any():
        raise Exception("Null values found")

    return True


# Transformation to get the play count per artist
def Transform_df(load_df):
    # Applying transformation logic
    Transformed_df = load_df.groupby(['timestamp', 'artist_name'], as_index=False).count()
    Transformed_df.rename(columns={'played_at': 'count'}, inplace=True)

    # Creating a primary key based on timestamp and artist name
    Transformed_df["ID"] = Transformed_df['timestamp'].astype(str) + "-" + Transformed_df["artist_name"]

    return Transformed_df[['ID', 'timestamp', 'artist_name', 'count']]


if __name__ == "__main__":

    # Importing the songs DataFrame from Extract.py
    load_df = Extract.return_dataframe()
    Data_Quality(load_df)
    # Calling the transformation
    Transformed_df = Transform_df(load_df)
    print(Transformed_df)

--------------------------------------------------------------------------------
/__pycache__/Extract.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sidharth1805/Spotify_etl/020dffc5a9517ed8e9bf37e69466ca0f3cd97837/__pycache__/Extract.cpython-37.pyc

--------------------------------------------------------------------------------
/__pycache__/Transform.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sidharth1805/Spotify_etl/020dffc5a9517ed8e9bf37e69466ca0f3cd97837/__pycache__/Transform.cpython-37.pyc

--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Basic Airflow cluster configuration for LocalExecutor with PostgreSQL.
#
# WARNING: This configuration is for local development. Do not use it in a production deployment.
#
# This configuration supports basic configuration using environment variables or an .env file
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME           - Docker image name used to run Airflow.
#                                Default: apache/airflow:2.5.1
# AIRFLOW_UID                  - User ID in Airflow containers
#                                Default: 50000
# AIRFLOW_PROJ_DIR             - Base path to which all the files will be volumed.
#                                Default: .
# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
#
# _AIRFLOW_WWW_USER_USERNAME   - Username for the administrator account (if requested).
#                                Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD   - Password for the administrator account (if requested).
#                                Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
#                                Default: ''
#
# Feel free to modify this file to suit your needs.
---
version: '3'
x-airflow-common:
  &airflow-common
  # In order to add custom dependencies or upgrade provider packages you can use your extended image.
  # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
  # and uncomment the "build" line below, Then run `docker-compose build` to build the images.
  image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.5.1}
  # build: .
  environment:
    &airflow-common-env
    AIRFLOW__CORE__EXECUTOR: LocalExecutor
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    # For backward compatibility, with Airflow <2.3
    AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    AIRFLOW__CORE__FERNET_KEY: ''
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
    AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
    AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
  volumes:
    - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
    - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
    - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
  user: "${AIRFLOW_UID:-50000}:0"
  depends_on:
    &airflow-common-depends-on
    postgres:
      condition: service_healthy

services:
  postgres:
    image: postgres:13
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    ports:
      - 5432:5432
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 5s
      retries: 5
    restart: always

  airflow-webserver:
    <<: *airflow-common
    command: webserver
    ports:
      - 8080:8080
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-triggerer:
    <<: *airflow-common
    command: triggerer
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-init:
    <<: *airflow-common
    entrypoint: /bin/bash
    environment:
      <<: *airflow-common-env
      _AIRFLOW_DB_UPGRADE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'true'
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
      _PIP_ADDITIONAL_REQUIREMENTS: ''
    user: "0:0"
    volumes:
      - ${AIRFLOW_PROJ_DIR:-.}:/sources

  airflow-cli:
    <<: *airflow-common
    profiles:
      - debug
    environment:
      <<: *airflow-common-env
      CONNECTION_CHECK_MAX_COUNT: "0"
    # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
    command:
      - bash
      - -c
      - airflow

volumes:
  postgres-db-volume:

--------------------------------------------------------------------------------
/my_played_tracks.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sidharth1805/Spotify_etl/020dffc5a9517ed8e9bf37e69466ca0f3cd97837/my_played_tracks.sqlite