├── Project Archicture.png
├── spotify_api_data_extract.py
├── README.md
└── spotify_transformation_load_function.py

/Project Archicture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/On-car/spotify-end-to-end-data-engineering--project/HEAD/Project Archicture.png

--------------------------------------------------------------------------------
/spotify_api_data_extract.py:
--------------------------------------------------------------------------------
import json
import os
from datetime import datetime

import boto3
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials


def lambda_handler(event, context):
    # Read the Spotify API credentials from the Lambda environment variables.
    client_id = os.environ.get('client_id')
    client_secret = os.environ.get('client_secret')

    # Authenticate against the Spotify Web API with the client-credentials flow.
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

    # Extract the playlist ID from the shareable playlist link.
    playlist_link = "https://open.spotify.com/playlist/37i9dQZEVXbNG2KDcFcKOF?si=1333723a6eff4b7f"
    playlist_URI = playlist_link.split("/")[-1].split("?")[0]

    # Pull the raw playlist-tracks payload from the Spotify API.
    spotify_data = sp.playlist_tracks(playlist_URI)

    # Write the raw JSON to S3; the timestamp in the key keeps each extract unique.
    s3_client = boto3.client('s3')
    filename = "spotify_raw_" + str(datetime.now()) + ".json"

    s3_client.put_object(
        Bucket="spotify-etl-project-darshil",
        Key="raw_data/to_processed/" + filename,
        Body=json.dumps(spotify_data)
    )

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Spotify Data Pipeline: Extract, Transform, and Analyze with AWS

#### Spotify data pipeline: extract, transform, and analyze using AWS Lambda, S3, Glue, and Athena.

### OBJECTIVE:
#### This project builds a comprehensive data pipeline for extracting, transforming, and analyzing Spotify data using various AWS services. The pipeline integrates with the Spotify API to fetch playlist data and stores it in an organized manner on AWS S3. The extraction process is automated by deploying the code on AWS Lambda, which runs at scheduled intervals or in response to trigger events (a minimal scheduling sketch appears below).
#### Once the data is extracted, a transformation function cleans and formats it for further analysis. This function handles data processing tasks such as normalization, aggregation, and filtering, depending on the requirements.
#### To keep the pipeline efficient and reliable, an automated trigger is built on top of the transformation function. The trigger monitors for new or updated extracted data and executes the transformation function accordingly (see the trigger sketch below).
#### The transformed data is stored back in AWS S3 with a clear file organization and structure, which allows easy access and retrieval of the processed data for further analysis.
#### Finally, to enable seamless analytics, the project creates analytics tables using AWS Glue and Athena. These services help define the data schema and make it possible to query and analyze the transformed data efficiently (a Glue and Athena sketch follows below).
#### By implementing this Spotify data pipeline on AWS, the project provides a scalable, reliable, and automated solution for extracting, transforming, and analyzing Spotify data, unlocking valuable insights for various analytical purposes.
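#### The scheduling piece is not part of this repository; the sketch below shows one way the extraction Lambda could be run on a daily schedule with EventBridge via boto3. The function name, rule name, statement ID, and cadence are placeholders, not values taken from the project.

```python
import boto3

# Hypothetical names -- adjust to the deployed extraction Lambda and the desired cadence.
FUNCTION_NAME = "spotify_api_data_extract"
RULE_NAME = "spotify-daily-extract"

events = boto3.client("events")
lambda_client = boto3.client("lambda")

# Create (or update) an EventBridge rule that fires once a day.
rule_arn = events.put_rule(Name=RULE_NAME, ScheduleExpression="rate(1 day)")["RuleArn"]

# Allow EventBridge to invoke the extraction Lambda.
lambda_client.add_permission(
    FunctionName=FUNCTION_NAME,
    StatementId="allow-eventbridge-daily-extract",
    Action="lambda:InvokeFunction",
    Principal="events.amazonaws.com",
    SourceArn=rule_arn,
)

# Point the rule at the Lambda function.
function_arn = lambda_client.get_function(FunctionName=FUNCTION_NAME)["Configuration"]["FunctionArn"]
events.put_targets(Rule=RULE_NAME, Targets=[{"Id": "1", "Arn": function_arn}])
```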
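#### The automated trigger described above can be expressed as an S3 event notification that invokes the transformation Lambda whenever a new JSON file lands under `raw_data/to_processed/`. This is a sketch under assumed names: the Lambda ARN below is a placeholder, and the permission grant must exist before the notification is attached.

```python
import boto3

BUCKET = "spotify-etl-project-darshil"
# Placeholder ARN for the deployed transformation Lambda.
TRANSFORM_LAMBDA_ARN = "arn:aws:lambda:us-east-1:123456789012:function:spotify_transformation_load_function"

lambda_client = boto3.client("lambda")
s3 = boto3.client("s3")

# S3 needs permission to invoke the function before the notification is configured.
lambda_client.add_permission(
    FunctionName=TRANSFORM_LAMBDA_ARN,
    StatementId="allow-s3-raw-data-trigger",
    Action="lambda:InvokeFunction",
    Principal="s3.amazonaws.com",
    SourceArn=f"arn:aws:s3:::{BUCKET}",
)

# Invoke the transformation Lambda for every new .json object in raw_data/to_processed/.
s3.put_bucket_notification_configuration(
    Bucket=BUCKET,
    NotificationConfiguration={
        "LambdaFunctionConfigurations": [
            {
                "LambdaFunctionArn": TRANSFORM_LAMBDA_ARN,
                "Events": ["s3:ObjectCreated:*"],
                "Filter": {
                    "Key": {
                        "FilterRules": [
                            {"Name": "prefix", "Value": "raw_data/to_processed/"},
                            {"Name": "suffix", "Value": ".json"},
                        ]
                    }
                },
            }
        ]
    },
)
```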
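#### The Glue and Athena layer is likewise only sketched here: a Glue crawler infers the schema of the transformed CSVs and registers a table that Athena can query. The database name, IAM role, result location, and inferred table name are assumptions, not values defined by this repository.

```python
import boto3

# Hypothetical names -- adjust to the actual IAM role, database, and Athena result bucket.
DATABASE = "spotify_db"
CRAWLER_ROLE_ARN = "arn:aws:iam::123456789012:role/GlueCrawlerRole"
OUTPUT_LOCATION = "s3://spotify-etl-project-darshil/athena_results/"

glue = boto3.client("glue")
athena = boto3.client("athena")

# One crawler per transformed folder; Glue infers the CSV schema and registers
# a table in the Data Catalog that Athena can query directly.
glue.create_crawler(
    Name="spotify_songs_crawler",
    Role=CRAWLER_ROLE_ARN,
    DatabaseName=DATABASE,
    Targets={"S3Targets": [{"Path": "s3://spotify-etl-project-darshil/transformed_data/songs_data/"}]},
)
glue.start_crawler(Name="spotify_songs_crawler")

# Once the crawler has finished, query the table from Athena.
# (The table name is inferred by the crawler from the folder name; adjust if it differs.)
athena.start_query_execution(
    QueryString="SELECT song_name, popularity FROM songs_data ORDER BY popularity DESC LIMIT 10",
    QueryExecutionContext={"Database": DATABASE},
    ResultConfiguration={"OutputLocation": OUTPUT_LOCATION},
)
```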
### Architecture:
![Architecture](https://github.com/On-car/spotify-end-to-end-data-engineering--project/blob/main/Project%20Archicture.png)

### Services Used:
**The project uses the following services:**

1. Spotify API
2. AWS Lambda
3. AWS S3 (Simple Storage Service)
4. AWS Glue
5. Amazon Athena

--------------------------------------------------------------------------------
/spotify_transformation_load_function.py:
--------------------------------------------------------------------------------
import json
from datetime import datetime
from io import StringIO

import boto3
import pandas as pd


def album(data):
    """Flatten the playlist-tracks payload into one record per album."""
    album_list = []
    for row in data['items']:
        album_id = row['track']['album']['id']
        album_name = row['track']['album']['name']
        album_release_date = row['track']['album']['release_date']
        album_total_tracks = row['track']['album']['total_tracks']
        album_url = row['track']['album']['external_urls']['spotify']
        album_element = {'album_id': album_id, 'name': album_name, 'release_date': album_release_date,
                         'total_tracks': album_total_tracks, 'url': album_url}
        album_list.append(album_element)
    return album_list


def artist(data):
    """Flatten the playlist-tracks payload into one record per (track, artist) pair."""
    artist_list = []
    for row in data['items']:
        for key, value in row.items():
            if key == "track":
                for artist_item in value['artists']:
                    artist_dict = {'artist_id': artist_item['id'], 'artist_name': artist_item['name'],
                                   'external_url': artist_item['href']}
                    artist_list.append(artist_dict)
    return artist_list


def songs(data):
    """Flatten the playlist-tracks payload into one record per track."""
    song_list = []
    for row in data['items']:
        song_id = row['track']['id']
        song_name = row['track']['name']
        song_duration = row['track']['duration_ms']
        song_url = row['track']['external_urls']['spotify']
        song_popularity = row['track']['popularity']
        song_added = row['added_at']
        album_id = row['track']['album']['id']
        artist_id = row['track']['album']['artists'][0]['id']
        song_element = {'song_id': song_id, 'song_name': song_name, 'duration_ms': song_duration, 'url': song_url,
                        'popularity': song_popularity, 'song_added': song_added, 'album_id': album_id,
                        'artist_id': artist_id}
        song_list.append(song_element)

    return song_list
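# Illustrative note (not part of the original pipeline): the three helpers above all
# walk the same Spotify playlist-tracks payload, whose items look roughly like the
# hypothetical record sketched below, and flatten it into album / artist / song rows.
#
#   {
#       "added_at": "2023-01-01T00:00:00Z",
#       "track": {
#           "id": "...", "name": "...", "duration_ms": 201000, "popularity": 90,
#           "external_urls": {"spotify": "https://open.spotify.com/track/..."},
#           "album": {"id": "...", "name": "...", "release_date": "2023-01-01",
#                     "total_tracks": 12, "external_urls": {"spotify": "..."},
#                     "artists": [{"id": "...", "name": "...", "href": "..."}]},
#           "artists": [{"id": "...", "name": "...", "href": "..."}]
#       }
#   }
#
# album()  -> rows with keys: album_id, name, release_date, total_tracks, url
# artist() -> rows with keys: artist_id, artist_name, external_url
# songs()  -> rows with keys: song_id, song_name, duration_ms, url, popularity,
#             song_added, album_id, artist_id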
def lambda_handler(event, context):
    s3 = boto3.client('s3')
    Bucket = "spotify-etl-project-darshil"
    Key = "raw_data/to_processed/"

    # Collect every raw JSON extract waiting in the to_processed folder.
    spotify_data = []
    spotify_keys = []
    for file in s3.list_objects(Bucket=Bucket, Prefix=Key).get('Contents', []):
        file_key = file['Key']
        if file_key.split('.')[-1] == "json":
            response = s3.get_object(Bucket=Bucket, Key=file_key)
            content = response['Body']
            jsonObject = json.loads(content.read())
            spotify_data.append(jsonObject)
            spotify_keys.append(file_key)

    # Transform each raw payload into album, artist, and song tables and load them to S3 as CSV.
    for data in spotify_data:
        album_list = album(data)
        artist_list = artist(data)
        song_list = songs(data)

        # Album DataFrame
        album_df = pd.DataFrame.from_dict(album_list)
        album_df = album_df.drop_duplicates(subset=['album_id'])

        # Artist DataFrame
        artist_df = pd.DataFrame.from_dict(artist_list)
        artist_df = artist_df.drop_duplicates(subset=['artist_id'])

        # Song DataFrame
        song_df = pd.DataFrame.from_dict(song_list)

        # Cast the date columns to proper datetime types before writing.
        album_df['release_date'] = pd.to_datetime(album_df['release_date'])
        song_df['song_added'] = pd.to_datetime(song_df['song_added'])

        songs_key = "transformed_data/songs_data/songs_transformed_" + str(datetime.now()) + ".csv"
        song_buffer = StringIO()
        song_df.to_csv(song_buffer, index=False)
        song_content = song_buffer.getvalue()
        s3.put_object(Bucket=Bucket, Key=songs_key, Body=song_content)

        album_key = "transformed_data/album_data/album_transformed_" + str(datetime.now()) + ".csv"
        album_buffer = StringIO()
        album_df.to_csv(album_buffer, index=False)
        album_content = album_buffer.getvalue()
        s3.put_object(Bucket=Bucket, Key=album_key, Body=album_content)

        artist_key = "transformed_data/artist_data/artist_transformed_" + str(datetime.now()) + ".csv"
        artist_buffer = StringIO()
        artist_df.to_csv(artist_buffer, index=False)
        artist_content = artist_buffer.getvalue()
        s3.put_object(Bucket=Bucket, Key=artist_key, Body=artist_content)

    # Move the raw files that were just processed from to_processed/ to processed/.
    s3_resource = boto3.resource('s3')
    for key in spotify_keys:
        copy_source = {
            'Bucket': Bucket,
            'Key': key
        }
        s3_resource.meta.client.copy(copy_source, Bucket, 'raw_data/processed/' + key.split("/")[-1])
        s3_resource.Object(Bucket, key).delete()
--------------------------------------------------------------------------------