├── Dags
│   ├── spotify_etl.py
│   └── spotify_final_dag.py
├── Delete_tables.py
├── Extract.py
├── Load.py
├── README.md
├── Transform.py
├── __pycache__
│   ├── Extract.cpython-37.pyc
│   └── Transform.cpython-37.pyc
├── docker-compose.yml
└── my_played_tracks.sqlite

/Dags/spotify_etl.py:
--------------------------------------------------------------------------------
import datetime

import pandas as pd
import requests

USER_ID = "YOUR_USERNAME_HERE"
TOKEN = "YOUR_TOKEN_HERE"


# Creating a function to be reused in other Python files
def return_dataframe():
    input_variables = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": "Bearer {token}".format(token=TOKEN)
    }

    today = datetime.datetime.now()
    yesterday = today - datetime.timedelta(days=1)
    yesterday_unix_timestamp = int(yesterday.timestamp()) * 1000

    # Download all songs you've listened to "after yesterday", which means in the last 24 hours
    r = requests.get("https://api.spotify.com/v1/me/player/recently-played?limit=50&after={time}".format(time=yesterday_unix_timestamp), headers=input_variables)

    data = r.json()
    song_names = []
    artist_names = []
    played_at_list = []
    timestamps = []

    # Extracting only the relevant bits of data from the json object
    for song in data["items"]:
        song_names.append(song["track"]["name"])
        artist_names.append(song["track"]["album"]["artists"][0]["name"])
        played_at_list.append(song["played_at"])
        timestamps.append(song["played_at"][0:10])

    # Prepare a dictionary in order to turn it into a pandas dataframe below
    song_dict = {
        "song_name": song_names,
        "artist_name": artist_names,
        "played_at": played_at_list,
        "timestamp": timestamps
    }
    song_df = pd.DataFrame(song_dict, columns=["song_name", "artist_name", "played_at", "timestamp"])
    return song_df


def Data_Quality(load_df):
    # Checking whether the DataFrame is empty
    if load_df.empty:
        print('No Songs Extracted')
        return False

    # Enforcing the primary key since we don't want duplicates
    if not pd.Series(load_df['played_at']).is_unique:
        # Raising an exception immediately terminates the run and avoids further processing
        raise Exception("Primary Key Exception, data might contain duplicates")

    # Checking for nulls in our data frame
    if load_df.isnull().values.any():
        raise Exception("Null values found")

    return True


# Transformation to get the play count per artist
def Transform_df(load_df):
    # Applying transformation logic
    Transformed_df = load_df.groupby(['timestamp', 'artist_name'], as_index=False).count()
    Transformed_df.rename(columns={'played_at': 'count'}, inplace=True)

    # Creating a primary key based on timestamp and artist name
    Transformed_df["ID"] = Transformed_df['timestamp'].astype(str) + "-" + Transformed_df["artist_name"]

    return Transformed_df[['ID', 'timestamp', 'artist_name', 'count']]


def spotify_etl():
    # Extract the recently played tracks
    load_df = return_dataframe()
    if not Data_Quality(load_df):
        raise Exception("Failed at data validation")
    # Calling the transformation (kept for parity with the standalone pipeline; the DAG loads the raw tracks)
    Transformed_df = Transform_df(load_df)
    print(load_df)
    return load_df


if __name__ == "__main__":
    spotify_etl()

--------------------------------------------------------------------------------
/Dags/spotify_final_dag.py:
--------------------------------------------------------------------------------
import datetime as dt
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.hooks.base_hook import BaseHook
from airflow.providers.postgres.operators.postgres import PostgresOperator
from sqlalchemy import create_engine

from spotify_etl import spotify_etl

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': dt.datetime(2023, 1, 29),
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=1)
}

dag = DAG(
    'spotify_final_dag',
    default_args=default_args,
    description='Spotify ETL process',
    schedule_interval=dt.timedelta(minutes=50),
)


def ETL():
    print("started")
    df = spotify_etl()
    conn = BaseHook.get_connection('postgre_sql')
    engine = create_engine(f'postgresql://{conn.login}:{conn.password}@{conn.host}:{conn.port}/{conn.schema}')
    # Note: if_exists='replace' drops and recreates the table contents on every run
    df.to_sql('my_played_tracks', engine, if_exists='replace')


with dag:
    create_table = PostgresOperator(
        task_id='create_table',
        postgres_conn_id='postgre_sql',
        sql="""
            CREATE TABLE IF NOT EXISTS my_played_tracks(
            song_name VARCHAR(200),
            artist_name VARCHAR(200),
            played_at VARCHAR(200),
            timestamp VARCHAR(200),
            CONSTRAINT primary_key_constraint PRIMARY KEY (played_at)
            )
            """
    )

    run_etl = PythonOperator(
        task_id='spotify_etl_final',
        python_callable=ETL,
        dag=dag,
    )

    create_table >> run_etl

--------------------------------------------------------------------------------
/Delete_tables.py:
--------------------------------------------------------------------------------
import sqlalchemy
import sqlite3

DATABASE_LOCATION = "sqlite:///my_played_tracks.sqlite"

if __name__ == "__main__":

    # Connecting to the database
    engine = sqlalchemy.create_engine(DATABASE_LOCATION)
    conn = sqlite3.connect('my_played_tracks.sqlite')
    cursor = conn.cursor()
    print("Opened database successfully")

    # Deleting the tables
    cursor.execute('DROP TABLE my_played_tracks')
    cursor.execute('DROP TABLE fav_artist')

    conn.close()
    print("Closed database successfully")

--------------------------------------------------------------------------------
/Extract.py:
--------------------------------------------------------------------------------
import datetime

import pandas as pd
import requests

USER_ID = "YOUR_USER_NAME"
TOKEN = "YOUR_TOKEN"


# Creating a function to be reused in other Python files
def return_dataframe():
    input_variables = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": "Bearer {token}".format(token=TOKEN)
    }

    today = datetime.datetime.now()
    yesterday = today - datetime.timedelta(days=2)
    yesterday_unix_timestamp = int(yesterday.timestamp()) * 1000

    # Download all songs you've listened to after the computed timestamp (here the window is the last two days)
    r = requests.get("https://api.spotify.com/v1/me/player/recently-played?after={time}".format(time=yesterday_unix_timestamp), headers=input_variables)

    data = r.json()
    song_names = []
    artist_names = []
    played_at_list = []
    timestamps = []

    # Extracting only the relevant bits of data from the json object
    for song in data["items"]:
        song_names.append(song["track"]["name"])
        artist_names.append(song["track"]["album"]["artists"][0]["name"])
        played_at_list.append(song["played_at"])
        timestamps.append(song["played_at"][0:10])

    # Prepare a dictionary in order to turn it into a pandas dataframe below
    song_dict = {
        "song_name": song_names,
        "artist_name": artist_names,
        "played_at": played_at_list,
        "timestamp": timestamps
    }
    song_df = pd.DataFrame(song_dict, columns=["song_name", "artist_name", "played_at", "timestamp"])
    return song_df

--------------------------------------------------------------------------------
/Load.py:
--------------------------------------------------------------------------------
import Extract
import Transform
import sqlalchemy
import sqlite3

DATABASE_LOCATION = "sqlite:///my_played_tracks.sqlite"

if __name__ == "__main__":

    # Importing the songs DataFrame from Extract.py
    load_df = Extract.return_dataframe()
    if not Transform.Data_Quality(load_df):
        raise Exception("Failed at data validation")
    Transformed_df = Transform.Transform_df(load_df)
    # These are the two DataFrames that need to be loaded into the database

    # Loading into the database
    engine = sqlalchemy.create_engine(DATABASE_LOCATION)
    conn = sqlite3.connect('my_played_tracks.sqlite')
    cursor = conn.cursor()

    # SQL query to create the played-songs table
    sql_query_1 = """
    CREATE TABLE IF NOT EXISTS my_played_tracks(
        song_name VARCHAR(200),
        artist_name VARCHAR(200),
        played_at VARCHAR(200),
        timestamp VARCHAR(200),
        CONSTRAINT primary_key_constraint PRIMARY KEY (played_at)
    )
    """
    # SQL query to create the most-listened-artist table
    sql_query_2 = """
    CREATE TABLE IF NOT EXISTS fav_artist(
        timestamp VARCHAR(200),
        ID VARCHAR(200),
        artist_name VARCHAR(200),
        count VARCHAR(200),
        CONSTRAINT primary_key_constraint PRIMARY KEY (ID)
    )
    """
    cursor.execute(sql_query_1)
    cursor.execute(sql_query_2)
    print("Opened database successfully")

    # We only append new data; the primary key constraint rejects duplicates
    try:
        load_df.to_sql("my_played_tracks", engine, index=False, if_exists='append')
    except Exception:
        print("Data already exists in my_played_tracks")
    try:
        Transformed_df.to_sql("fav_artist", engine, index=False, if_exists='append')
    except Exception:
        print("Data already exists in fav_artist")

    # cursor.execute('DROP TABLE my_played_tracks')
    # cursor.execute('DROP TABLE fav_artist')

    conn.close()
    print("Closed database successfully")

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Data Engineering Project 2 | Building a Spotify ETL using Python and Airflow

Create an Extract, Transform, Load pipeline using Python and automate it with Airflow.

![](https://miro.medium.com/max/749/1*dm8hVrPTPMenyRY4uJiBIA@2x.png)

Image by Author

In this blog post, I will explain how to create a simple ETL (Extract, Transform, Load) pipeline using Python and automate the process through Apache Airflow.

# Problem Statement:

We need to use Spotify's API to read the data, perform some basic transformations and data quality checks, and finally load the retrieved data into a PostgreSQL database, automating the entire process through Airflow. **Est. time:** 4–7 hours

# Tech Stack / Skills used:

1. Python
2. APIs
3. Docker
4. Airflow
5. PostgreSQL

# Prerequisites:

1. Knowledge of APIs
2. An understanding of what Docker and docker-compose are
3. Intermediate Python and SQL
4. A basic understanding of Airflow ([this](https://www.youtube.com/watch?v=AHMm1wfGuHE&t=705s) will help)

# Learning Outcomes:

1. Understand how to interact with an API to retrieve data
2. Handling DataFrames in pandas
3. Setting up Airflow and PostgreSQL through docker-compose
4. Learning to create DAGs in Airflow

# Introduction:

This is a beginner-friendly project to get started with building a simple pipeline and automating it through Airflow. First, we will focus entirely on building the pipeline, and then extend the project by combining it with Airflow.

# Building the ETL Pipeline:

**Dataset:** In this project we are using Spotify's API, so please go ahead and create an account for yourself. After creating the account, head to this [page](https://developer.spotify.com/console/get-recently-played/?limit=&after=&before=). You will see a Get Token button; click it, select the user-read-recently-played scope, and request the token.

![](https://miro.medium.com/max/690/1*4UKYwl00ALuF9PQj-TTJyA.png)

Image by Author

You can see your token like this.

![](https://miro.medium.com/max/749/1*CPZYseTyKH-CruoJpyNl-w.png)

Image by Author

That is the procedure to get the token. You may need to regenerate it often, as it expires after some time.

## Extract.py

We use this token to extract the data from Spotify inside a function called return_dataframe(). The code in Extract.py shows how we call the API and convert the response into a DataFrame.

## Transform.py

Here we import the Extract module to get the data.

**def Data_Quality(load_df):** Checks for an empty data frame, enforces the uniqueness constraint, and checks for null values. Since bad records could ruin our database, it is important to enforce these data quality checks.

**def Transform_df(load_df):** Here we write some logic for our own requirement: we want to know our favourite artists, so we group the listened tracks by artist and count them. Note: this step is not required; you can implement any other logic if you wish, but make sure you enforce the primary key constraint. A small illustration of the grouping logic is shown below.
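
To make the grouping concrete, here is a tiny, self-contained sketch of the same logic applied to a hand-made DataFrame (the toy rows below are invented purely for this illustration):

```python
import pandas as pd

# Toy stand-in for the extracted tracks (invented rows, for illustration only)
load_df = pd.DataFrame({
    "song_name":   ["Song A", "Song B", "Song C"],
    "artist_name": ["Artist X", "Artist X", "Artist Y"],
    "played_at":   ["2023-01-29T10:00:00Z", "2023-01-29T11:00:00Z", "2023-01-29T12:00:00Z"],
    "timestamp":   ["2023-01-29", "2023-01-29", "2023-01-29"],
})

# Same logic as Transform_df: plays per artist per day, plus a surrogate primary key
Transformed_df = load_df.groupby(["timestamp", "artist_name"], as_index=False).count()
Transformed_df.rename(columns={"played_at": "count"}, inplace=True)
Transformed_df["ID"] = Transformed_df["timestamp"].astype(str) + "-" + Transformed_df["artist_name"]

print(Transformed_df[["ID", "timestamp", "artist_name", "count"]])
# For 2023-01-29, Artist X gets a count of 2 and Artist Y a count of 1
```

Each output row is one (day, artist) pair, and the concatenated ID column is what lets us enforce a primary key on the fav_artist table later.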

## Load.py

In the load step, we use SQLAlchemy and SQLite to load our data into a database and save the file in our project directory.

With that, we have completed our ETL pipeline. The structure of the project folder should now look like this:

    E:\DE\PROJECTS\SPOTIFY_ETL\SPOTIFY_ETL
    │   Extract.py
    │   Load.py
    │   my_played_tracks.sqlite
    │   spotify_etl.py
    │   Transform.py
    └───

After running **Load.py** you will see a .sqlite file saved to the project folder. To check the data inside it, head [here](https://inloop.github.io/sqlite-viewer/) and drop your file in.

![](https://miro.medium.com/max/749/1*OpGD1spYMVIulWVCKPttlw.png)

Image by Author

Now we will automate this process using Airflow.

# Automating through Airflow

For those who have made it this far, I appreciate your efforts 👏, but from here it gets a little tricky. Hence, I am mentioning some important points below.

1. We have completed an ETL, and this by itself is a mini project, so save the work.
2. Now we are going to extend this with Airflow using Docker.
3. Why Docker? We are using Docker since it is easier to install and maintain, and it is OS independent.
4. How to set up Airflow using Docker? Follow the guide provided in this [blog](https://medium.com/@garc1a0scar/how-to-start-with-apache-airflow-in-docker-windows-902674ad1bbe).
5. You only need to change the YAML file from the above guideline; please refer [here](https://github.com/sidharth1805/Spotify_etl/blob/main/docker-compose.yml).
6. After setting up Airflow, place your DAGs inside the dags folder.
7. Once Docker is up, you should see 4 services running.

![](https://miro.medium.com/max/749/1*txaw4D2bowisN98SbG6PZQ.png)

Image by Author

Your Airflow folder should look like the structure below.

    C:\USERS\SIDHA\DOCKER\AIRFLOW
    │   docker-compose.yml
    ├───dags
    │   │   YOUR_DAGS.py
    ├───logs
    ├───plugins
    └───scripts

Now that we have set up Airflow, we can view the Airflow UI by visiting the [8080 port](http://localhost:8080/). The username and password are both airflow.

It's time to create the required DAG for our project. But before jumping onto the DAG, let us understand what a DAG is: DAG stands for Directed Acyclic Graph, which is a set of tasks defined in their order of execution.

![](https://miro.medium.com/max/749/1*cImMkJ3NRWWLmw2o4mH9NQ.png)

Image by Author

Inside our DAG, we need to create tasks to get our job done. To keep it simple, I will use two tasks: one to create the Postgres table and another to load the data into that table. Our DAG will look like this.

![](https://miro.medium.com/max/679/1*hYbRd0gKRffQZn1Xipt-BA.png)

## spotify_etl.py

In this Python file we write the logic to extract data from the API → do quality checks → transform the data.

1. **yesterday = today - datetime.timedelta(days=1)** → Defines the number of days you want data for; change it as you wish. Since our job is a daily load, I have set it to 1.
2. **def spotify_etl()** → The core function, which returns the DataFrame to the DAG Python file.
3. This file needs to be placed inside the dags folder.

## spotify_final_dag.py

This is the most important section, so pay attention. First, learn the basics of Airflow DAGs [here](https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html); it might take around 15 minutes, or you can search for a video on YouTube. After the basics, please follow the guideline below.

1. **from airflow.operators.python_operator import PythonOperator** → We use the Python operator to run Python functions, such as inserting the DataFrame into the table.
2. **from airflow.providers.postgres.operators.postgres import PostgresOperator** → We use the Postgres operator to create tables in our Postgres database.
3. **from airflow.hooks.base_hook import BaseHook** → A hook is an abstraction of a specific API that allows Airflow to interact with an external system. Hooks are built into many operators, but they can also be used directly in DAG code. We use a hook here to connect to the Postgres database from our Python function.
4. **from spotify_etl import spotify_etl** → Imports the spotify_etl function from spotify_etl.py.

## Code Explanation:

Setting up the default arguments and the schedule interval. We can change the interval and start date according to our needs.

![](https://miro.medium.com/max/749/1*YoZQLQWbgXUn4WOo_Z1o7Q.png)

Understanding the Postgres connection and tasks.

1. **conn = BaseHook.get_connection('[Your Connection ID]')** → Connects to your Postgres DB.
2. **df.to_sql('[Your Table Name]', engine, if_exists='replace')** → Loads the DataFrame into the table.
3. **create_table >> run_etl** → Defines the order of the tasks.

![](https://miro.medium.com/max/749/1*WjsH1W213_nOkQv8pgrAvA.png)

## Setting up the Postgres Connection in the Airflow UI:

Head to the Airflow UI and open the Connections page (Admin → Connections).

![](https://miro.medium.com/max/749/1*uLaXIfiaWXPbVCJPr8oeMg.png)

Then create a connection.

1. **Connection Id:** postgre_sql, the ID we used in our code.
2. **Connection Type:** Postgres
3. **Host:** postgres (the name of the Postgres service in docker-compose)
4. **Schema:** spotify_db; you can use your own name, but make sure to check the proper database when validating.
5. **Login:** airflow
6. **Password:** airflow
7. **Port:** 5432

![](https://miro.medium.com/max/749/1*WJQ5g1OCONYLy-nYP4DBDQ.png)

Now it's time to deploy :)

## Deployment:

Check for your DAG in the DAGs section.

![](https://miro.medium.com/max/749/1*S0T8xwbjYmgl1hZLy70Z-A.png)

![](https://miro.medium.com/max/749/1*J8wRUrI_5YYVoh8eeDP1vw.png)

After activating it, run the DAG by triggering it.

![](https://miro.medium.com/max/749/1*vK59llhtj85Q-MZg1qL5bA.png)

After completion, check the logs.

![](https://miro.medium.com/max/749/1*BuGKkg7OIav6AOGFTgSwcw.png)

Now we will validate the load by connecting to the Postgres database. Open a terminal and execute **pgcli -h localhost -p 5432 -u airflow -d spotify_db**. It will prompt for a password; enter airflow. Here **spotify_db** is our database name, the "schema" of the connection.

![](https://miro.medium.com/max/749/1*NoiKtC3VXkhOV9BySpH1TA.png)

Type \d to see the tables.
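
If you prefer checking from Python instead of pgcli, an optional snippet along these lines should also work from the host machine. It is only a sketch: it assumes a Postgres driver such as psycopg2-binary is installed locally, and it reuses the same credentials, port, and database as the pgcli command above.

```python
import pandas as pd
from sqlalchemy import create_engine

# Same credentials/port/database that docker-compose and the pgcli command above use
engine = create_engine("postgresql://airflow:airflow@localhost:5432/spotify_db")

# Peek at the most recently played tracks loaded by the DAG
df = pd.read_sql(
    "SELECT song_name, artist_name, played_at "
    "FROM my_played_tracks ORDER BY played_at DESC LIMIT 10",
    engine,
)
print(df)
```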

![](https://miro.medium.com/max/501/1*jIO4zUKsmKPxz7jfVkNTLQ.png)

We can see that our table has been created, so now let's check the data.

![](https://miro.medium.com/max/749/1*BcUcfyKepcxVLisLEUk_Cw.png)

Finally, we have made it :)

Our requirement now is to automate this entire process. For demonstration purposes, let me change the Airflow interval to **3 mins** and listen to some music 🎵🎵🎵.

The current timestamp is shown below; let's check back after some time.

![](https://miro.medium.com/max/553/1*1o7tk-Sy4CfoJK6TB6u1Jw.png)

Hurray 🥳 we have made it.

![](https://miro.medium.com/max/749/1*sqe43Kaofk9I-AmLzXQvIA.png)

The new song that I just listened to has been uploaded to our database automatically.

Let's check the logs and task details. It's a **scheduled task**.

![](https://miro.medium.com/max/749/1*UXLjVHYORJ69U3QZdIuSgg.png)

![](https://miro.medium.com/max/749/1*7OM4LPzn7sR66cjBMGgNnw.png)

# Conclusion:

**Note:** We may need to change the token in our ETL Python file from time to time, since it expires after a while. There are some limitations to this project: they can be overcome by using a refresh token to renew the access token automatically, and by setting up Airflow on a cloud service so it runs 24/7 and picks up data once a day, making it a true daily load. To keep this beginner-friendly I haven't covered those; I will leave them as a TODO 😉 (a rough sketch of the refresh call is included at the end of this post).

Github Repo: [https://github.com/sidharth1805/Spotify_etl](https://github.com/sidharth1805/Spotify_etl). I hope you enjoyed the guided project. I am pretty sure you will face a lot of issues while doing it; Stack Overflow is our best friend, and feel free to connect with me on [LinkedIn](https://www.linkedin.com/in/sidharth-ramalingam/) for any further questions. Follow me on [medium](https://medium.com/@sidharth.ramalingam) to learn more about data engineering.
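
For anyone curious about the refresh-token TODO mentioned above, the renewal call could look roughly like the sketch below. It assumes you have registered an app in the Spotify developer dashboard and already obtained a client_id, client_secret, and refresh_token through the authorization-code flow; treat it as a starting point rather than production code.

```python
import base64
import requests

# Placeholders - take these from your Spotify app settings and the authorization-code flow
CLIENT_ID = "YOUR_CLIENT_ID"
CLIENT_SECRET = "YOUR_CLIENT_SECRET"
REFRESH_TOKEN = "YOUR_REFRESH_TOKEN"


def refresh_access_token():
    """Exchange the long-lived refresh token for a fresh access token."""
    auth_header = base64.b64encode(f"{CLIENT_ID}:{CLIENT_SECRET}".encode()).decode()
    response = requests.post(
        "https://accounts.spotify.com/api/token",
        headers={"Authorization": f"Basic {auth_header}"},
        data={"grant_type": "refresh_token", "refresh_token": REFRESH_TOKEN},
    )
    response.raise_for_status()
    return response.json()["access_token"]
```

The returned token could then be used in place of the hard-coded TOKEN in spotify_etl.py.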

--------------------------------------------------------------------------------
/Transform.py:
--------------------------------------------------------------------------------
import Extract
import pandas as pd


# Set of data quality checks to perform before loading
def Data_Quality(load_df):
    # Checking whether the DataFrame is empty
    if load_df.empty:
        print('No Songs Extracted')
        return False

    # Enforcing the primary key since we don't want duplicates
    if not pd.Series(load_df['played_at']).is_unique:
        # Raising an exception immediately terminates the run and avoids further processing
        raise Exception("Primary Key Exception, data might contain duplicates")

    # Checking for nulls in our data frame
    if load_df.isnull().values.any():
        raise Exception("Null values found")

    return True


# Transformation to get the play count per artist
def Transform_df(load_df):
    # Applying transformation logic
    Transformed_df = load_df.groupby(['timestamp', 'artist_name'], as_index=False).count()
    Transformed_df.rename(columns={'played_at': 'count'}, inplace=True)

    # Creating a primary key based on timestamp and artist name
    Transformed_df["ID"] = Transformed_df['timestamp'].astype(str) + "-" + Transformed_df["artist_name"]

    return Transformed_df[['ID', 'timestamp', 'artist_name', 'count']]


if __name__ == "__main__":

    # Importing the songs DataFrame from Extract.py
    load_df = Extract.return_dataframe()
    Data_Quality(load_df)
    # Calling the transformation
    Transformed_df = Transform_df(load_df)
    print(Transformed_df)

--------------------------------------------------------------------------------
/__pycache__/Extract.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sidharth1805/Spotify_etl/020dffc5a9517ed8e9bf37e69466ca0f3cd97837/__pycache__/Extract.cpython-37.pyc

--------------------------------------------------------------------------------
/__pycache__/Transform.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sidharth1805/Spotify_etl/020dffc5a9517ed8e9bf37e69466ca0f3cd97837/__pycache__/Transform.cpython-37.pyc

--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Basic Airflow cluster configuration for LocalExecutor with PostgreSQL.
#
# WARNING: This configuration is for local development. Do not use it in a production deployment.
#
# This configuration supports basic configuration using environment variables or an .env file
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME           - Docker image name used to run Airflow.
#                                Default: apache/airflow:2.5.1
# AIRFLOW_UID                  - User ID in Airflow containers
#                                Default: 50000
# AIRFLOW_PROJ_DIR             - Base path to which all the files will be volumed.
#                                Default: .
# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
#
# _AIRFLOW_WWW_USER_USERNAME   - Username for the administrator account (if requested).
#                                Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD   - Password for the administrator account (if requested).
#                                Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
#                                Default: ''
#
# Feel free to modify this file to suit your needs.
---
version: '3'
x-airflow-common:
  &airflow-common
  # In order to add custom dependencies or upgrade provider packages you can use your extended image.
  # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
  # and uncomment the "build" line below, Then run `docker-compose build` to build the images.
  image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.5.1}
  # build: .
  environment:
    &airflow-common-env
    AIRFLOW__CORE__EXECUTOR: LocalExecutor
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    # For backward compatibility, with Airflow <2.3
    AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    AIRFLOW__CORE__FERNET_KEY: ''
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
    AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
    AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
  volumes:
    - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
    - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
    - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
  user: "${AIRFLOW_UID:-50000}:0"
  depends_on:
    &airflow-common-depends-on
    postgres:
      condition: service_healthy

services:
  postgres:
    image: postgres:13
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    ports:
      - 5432:5432
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 5s
      retries: 5
    restart: always

  airflow-webserver:
    <<: *airflow-common
    command: webserver
    ports:
      - 8080:8080
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-triggerer:
    <<: *airflow-common
    command: triggerer
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-init:
    <<: *airflow-common
    entrypoint: /bin/bash
    environment:
      <<: *airflow-common-env
      _AIRFLOW_DB_UPGRADE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'true'
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
      _PIP_ADDITIONAL_REQUIREMENTS: ''
    user: "0:0"
    volumes:
      - ${AIRFLOW_PROJ_DIR:-.}:/sources

  airflow-cli:
    <<: *airflow-common
    profiles:
      - debug
    environment:
      <<: *airflow-common-env
      CONNECTION_CHECK_MAX_COUNT: "0"
    # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
    command:
      - bash
      - -c
      - airflow

volumes:
  postgres-db-volume:

--------------------------------------------------------------------------------
/my_played_tracks.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sidharth1805/Spotify_etl/020dffc5a9517ed8e9bf37e69466ca0f3cd97837/my_played_tracks.sqlite