71 |
72 |
73 |
74 |
--------------------------------------------------------------------------------
/utils/spotify_track.py:
--------------------------------------------------------------------------------
1 | import eyed3
2 | import requests
3 | from requests import Response
4 | import hashlib
5 | import datetime
6 | import os
7 | import shutil
8 | import json
9 | from utils.spotify_album import SpotifyAlbum
10 | from utils.spotify_artist import SpotifyArtist
11 | from utils.deezer_utils import Deezer
12 | from utils.utils import clean_file_path
13 | from exceptions import SpotifyTrackException
14 |
15 |
class SpotifyTrack:
    """A single Spotify track: metadata parsed from the Web API plus the
    downloaded audio/lyrics/artwork payloads.

    Instances are populated either by passing the raw API dict to the
    constructor or by calling load_from_data() later.
    """

    def __init__(self, track_data=None) -> None:
        """Create an empty track, optionally loading it from API data.

        track_data -- raw track JSON from the Spotify Web API (or a playlist
                      item wrapping it under a 'track' key); None leaves the
                      instance blank.
        """
        # All attributes are initialized per-instance here. The previous
        # version declared them as class attributes, so the mutable
        # `artists` list was shared between every instance.
        self.title = ''
        self.spotify_id = ''
        self.artist = None          # primary artist (first of `artists`)
        self.artists = []
        self.album = None           # SpotifyAlbum once loaded
        self.thumbnail_href = ''
        self.release_date = 0
        self.disc_number = 0
        self.track_number = 0       # previously missing a default entirely
        self.duration_ms = 0
        self.explicit = False
        self.href = ''
        self.popularity = 0
        self.audio = b''            # raw mp3 bytes after download()
        self.lyrics = ''
        self.thumbnail = b''        # fixes the old misspelled `thumnail`
        self.data_dump = ''         # raw API dict kept for the mp3 comment tag
        self.isrc = ''              # empty for Spotify "local files"
        if track_data is not None:
            self.load_from_data(track_data)

    def load_from_data(self, data):
        """Populate this track from a raw Spotify Web API track dict."""
        if 'track' in data:
            # Playlist items nest the track under a 'track' key.
            data = data['track']
        self.data_dump = data
        self.album = SpotifyAlbum(data['album'])
        self.title = data['name']
        self.spotify_id = data['id']
        self.artists = [SpotifyArtist(x) for x in data['artists']]
        self.artist = self.artists[0]
        self.thumbnail_href = self.album.thumbnail_href
        self.release_date = self.album.release_date
        self.track_number = data['track_number']
        self.duration_ms = data['duration_ms']
        self.explicit = data['explicit']
        self.href = data['href']
        self.popularity = data['popularity']
        if 'isrc' in data['external_ids']:
            # isrc is not available for local files
            self.isrc = data['external_ids']['isrc']

    def __str__(self) -> str:
        return f'SpotifyTrack< {self.title} >'

    def __repr__(self) -> str:
        return self.__str__()

    def get_lyrics(self, scraper) -> str:
        """Fetch lyrics for this track via the given SpotifyScraper."""
        if scraper is None:
            raise SpotifyTrackException('SCRAPER NOT AVAILABLE!')
        return scraper.get_lyrics(self.spotify_id)

    def download_thumbnail(self, scraper) -> bytes:
        """Download the album cover image bytes via the scraper's client."""
        return scraper.get(self.thumbnail_href).content

    def get_download_link(self, scraper) -> str:
        """Resolve a Deezer CDN download URL by matching this track's ISRC.

        Returns '' for local files (no ISRC). The scraper argument is kept
        for interface symmetry with the other methods; Deezer is queried
        directly.
        """
        if not self.isrc:
            return ''
        return Deezer.get_track_download_url(Deezer.get_track_data(Deezer.get_track_id_from_isrc(self.isrc)))[0]

    def download(self, scraper) -> bytes:
        """Download and decrypt this track's audio from Deezer.

        Raises SpotifyTrackException for local files, for link-resolution
        failures and for download/decryption failures.
        """
        if not self.isrc:
            raise SpotifyTrackException(f'Cannot download local file {self.title}!')
        try:
            download_link = self.get_download_link(scraper)
        except Exception as ex:
            raise SpotifyTrackException(f'Failed to get download url for {self.title} | Exception: {ex}')
        try:
            return Deezer.decrypt_download_data(requests.get(download_link, headers={'Accept':'*/*'}), self.isrc)
        except Exception as ex:
            raise SpotifyTrackException(f'Failed to download {self.title} | Exception: {ex}')

    def package_download(self, scraper):
        """Fetch audio, cover art and lyrics into instance attributes."""
        self.audio = self.download(scraper)
        self.thumbnail = self.download_thumbnail(scraper)
        self.lyrics = self.get_lyrics(scraper)

    def preview_title(self):
        """Human-readable 'Artist1, Artist2 - Title [Album]' string."""
        return f'{", ".join([x.name for x in self.artists])} - {self.title} [{self.album.title}]'

    def download_to_file(self, scraper, output_path: str):
        """Download everything and write a fully tagged mp3 under output_path.

        The file is staged in temp/ first so a failed download never leaves
        a partial file at the final location.
        """
        os.makedirs('temp', exist_ok=True)  # previously assumed to exist
        temp_file_path = f'temp/{hashlib.sha1(self.title.encode() + self.album.spotify_id.encode()).hexdigest()}.temp.mp3'
        self.package_download(scraper)
        with open(temp_file_path, 'wb') as f:
            f.write(self.audio)

        audio_file = eyed3.load(temp_file_path)
        audio_file.initTag(version=(2, 4, 0))  # version is important
        audio_file.tag.title = self.title
        audio_file.tag.artist = ';'.join([artist.name for artist in self.artists])
        audio_file.tag.album_artist = self.artists[0].name
        audio_file.tag.album = self.album.title
        audio_file.tag.original_release_date = datetime.datetime.fromtimestamp(self.album.release_date).year
        audio_file.tag.track_num = self.track_number
        audio_file.info.time_secs = self.duration_ms / 1000
        audio_file.tag.images.set(3, self.thumbnail, 'image/jpeg', u'cover')  # 3 = front cover
        audio_file.tag.lyrics.set(str(self.lyrics))
        # Keep the raw API dump in a comment frame for later re-processing.
        audio_file.tag.comments.set('', str(self.data_dump))

        audio_file.tag.save()

        full_output_path = output_path + '/' + clean_file_path(self.preview_title()) + '.mp3'
        os.makedirs(os.path.dirname(full_output_path), exist_ok=True)
        shutil.move(temp_file_path, full_output_path)
125 |
--------------------------------------------------------------------------------
/spotify_client.py:
--------------------------------------------------------------------------------
1 | from config import *
2 | from exceptions import SpotifyClientException
3 |
4 |
class SpotifyClient:
    """Authenticated HTTP client for Spotify's web-player APIs.

    Authentication is driven by the browser cookies sp_dc / sp_key, from
    which an access token and a client token are obtained and attached to
    every subsequent request.
    """

    _proxy = PROXY
    _client_token = ''
    _access_token = ''
    _client_id = ''
    __USER_AGENT = USER_AGENT
    _verify_ssl = VERIFY_SSL

    # Raw JSON of /v1/me for the authenticated user (set by get_me()).
    user_data = None

    def __init__(self, sp_dc=None, sp_key=None):
        """Store the cookies and immediately fetch a token pair (network I/O)."""
        self.dc = sp_dc
        self.key = sp_key
        # Base headers imitating the official web player.
        self.__HEADERS = {
            'User-Agent': self.__USER_AGENT,
            'Accept': 'application/json',
            'Origin': 'https://open.spotify.com',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'Referer': 'https://open.spotify.com/',
            'Te': 'trailers',
            'App-Platform': 'WebPlayer'
        }
        self.get_tokens(sp_dc, sp_key)

    def get_tokens(self, sp_dc=None, sp_key=None):
        """Fetch and cache a fresh access-token / client-token pair."""
        self._access_token, self._client_id = self.get_access_token(sp_dc=sp_dc, sp_key=sp_key)
        self._client_token = self.get_client_token(self._client_id)

        # NOTE(review): printing tokens leaks credentials to stdout/logs.
        print('Client token: ', self._client_token)
        print('Access token: ', self._access_token)

    def refresh_tokens(self):
        """Re-authenticate using the cookies supplied at construction time."""
        self.get_tokens(self.dc, self.key)

    def get_client_token(self, client_id: str):
        """Exchange the client id for a client token via clienttoken.spotify.com."""
        with requests.session() as session:
            session.proxies = self._proxy
            # BUG FIX: copy the base headers. Assigning the shared dict meant
            # the .update() calls in get()/post()/get_me() mutated it, leaking
            # stale Authorization/Client-Token headers into later sessions
            # (which is why the pops below were needed in the first place).
            session.headers = dict(self.__HEADERS)

            # Clear old tokens, otherwise we will get 400 Bad Request
            if 'client_token' in session.headers:
                session.headers.pop('client_token')
            if 'Authorization' in session.headers:
                session.headers.pop('Authorization')

            data = {
                "client_data": {
                    "client_version": "1.2.13.477.ga4363038",
                    "client_id": client_id,
                    "js_sdk_data":
                    {
                        "device_brand": "",
                        "device_id": "",
                        "device_model": "",
                        "device_type": "",
                        "os": "",
                        "os_version": ""
                    }
                }
            }

            response = session.post('https://clienttoken.spotify.com/v1/clienttoken', json=data, verify=self._verify_ssl)
            try:
                rj = response.json()
            except Exception as ex:
                print('Failed to parse client token response as json!', ex)
                # NOTE(review): exiting with status 0 signals success to the
                # shell despite the failure; a raised exception would be
                # cleaner, kept as-is to preserve behavior.
                exit(0)
            return rj['granted_token']['token']

    def get_access_token(self, keys=None, sp_dc=None, sp_key=None):
        """Obtain a web-player access token using the session cookies.

        Returns (access_token, client_id); falls back to the cached client
        id when Spotify reports it as 'unknown'.
        """
        with requests.session() as session:
            session.proxies = self._proxy
            session.headers = dict(self.__HEADERS)  # copy, see get_client_token
            cookie = {}
            if keys is not None:
                cookie = keys
            if sp_dc is not None:
                cookie['sp_dc'] = sp_dc
            if sp_key is not None:
                cookie['sp_key'] = sp_key
            response = session.get('https://open.spotify.com/get_access_token', verify=self._verify_ssl, cookies=cookie)
            try:
                rj = response.json()
            except Exception as ex:
                print('An error occured when generating an access token!', ex)
                exit(0)
            print('Access token is anon: ', rj['isAnonymous'])
            self.is_anonymous = rj['isAnonymous']
            # The conditional applies only to the second tuple element.
            return rj['accessToken'], rj['clientId'] if rj['clientId'].lower() != 'unknown' else self._client_id

    def get_me(self):
        """Fetch /v1/me, cache it in user_data, and validate the account.

        Raises SpotifyClientException when the keys are invalid or when the
        account is premium.
        """
        with requests.session() as session:
            session.proxies = self._proxy
            session.headers = dict(self.__HEADERS)  # copy, see get_client_token
            session.headers.update({
                'Client-Token': self._client_token,
                'Authorization': f'Bearer {self._access_token}'
            })

            response_json = session.get('https://api.spotify.com/v1/me', verify=self._verify_ssl).json()
            self.user_data = response_json
            if not 'product' in self.user_data:
                raise SpotifyClientException('Spotify client keys are invalid.\nVerify that you have entered valid SP_KEY & SP_DC values.')
            if self.user_data['product'] == 'premium':
                raise SpotifyClientException('THIS USER IS PREMIUM!')
            return response_json

    def get_premium_keys(self):
        """Scrape sp_dc/sp_key cookies published on a third-party web page.

        NOTE(review): fetching credentials from an untrusted external site is
        inherently fragile and risky — verify the page format before relying
        on this.
        """
        page = requests.get('https://www.rkstore.tn/2022/03/spotify-premium-cookies.html', verify=self._verify_ssl)
        root = html.document_fromstring(page.content)
        cookies_element = root.get_element_by_id('download_link')
        cookies = json.loads(cookies_element.text_content())
        prem_keys = {}
        for cookie in cookies:
            prem_keys[cookie['name']] = cookie['value']
        return prem_keys

    def get(self, url: str) -> Response:
        """Authenticated GET with the cached token pair."""
        with requests.session() as session:
            session.proxies = self._proxy
            session.headers = dict(self.__HEADERS)  # copy, see get_client_token
            session.headers.update({
                'Client-Token': self._client_token,
                'Authorization': f'Bearer {self._access_token}'
            })

            response = session.get(url, verify=self._verify_ssl)
            return response

    def post(self, url: str, payload=None) -> Response:
        """Authenticated POST with the cached token pair."""
        with requests.session() as session:
            session.proxies = self._proxy
            session.headers = dict(self.__HEADERS)  # copy, see get_client_token
            session.headers.update({
                'Client-Token': self._client_token,
                'Authorization': f'Bearer {self._access_token}'
            })

            response = session.post(url, verify=self._verify_ssl, data=payload)
            return response
147 |
148 |
--------------------------------------------------------------------------------
/utils/deezer_utils.py:
--------------------------------------------------------------------------------
1 | from config import *
2 | from exceptions import DeezerException
3 |
4 |
class Deezer:
    """Stateless helpers for locating, resolving and decrypting Deezer
    audio streams (used as the actual download backend for Spotify tracks).
    """

    # NOTE(review): hard-coded session cookies, including an 'arl' auth
    # token — these expire and belong in configuration, not source control.
    _cookies = {
        'dzr_uniq_id': 'dzr_uniq_id_frffc916344f831b489e3f366778a86b7a0f3a2f',
        'sid': 'fre1a5ee55bb5ebd4f8505add526aef95c47adf7',
        '_abck': 'C73904478BC37F15E7303B7140C34A1D~-1~YAAQvphmUrc22TWIAQAAxjnJaQktzRdJM/Z5JSO9mfO0N3a5a2jv1rvxchQJ+/438DyVm/nx+6lmw0PZL+S/zBD6rTRIsHiZzDHYGOL2JHskcx+qgFNFV3haB0NmrsRKzL48t0AfE+xh4uzKa1t6681eLEsxD2+XL4CLpP5dlj+ymhNqMFLY0eJ9fFCCGoXvLCSz8EXqD17PYcDD9DHDpGem7+JFNBfpMOtQuaynJh97LfFSwx/6uzpkjg/oO9cNZ1rfUk5Gy5WLkcz8hn4b6prZk1whzOhom5Zba6Vj1KOTY9DvT67udnGqlrau60nNnopoD1SBQNnFaGhGEV+6oUTCshYzMQ==~-1~-1~-1',
        'bm_sz':'A81B5CF520F243866A08F5D742986440~YAAQvphmUrg22TWIAQAAxjnJaRNH5QoYzzhPG/doMRczrBcZ8c/bzqsA+MMcCmvUHPtqKvixyokOz4OYzTlV6t8WzsLDAm5gsrf+9Ul9+GLxF/8EjLqXWNalyUDfkOI6tByxylzmM5qobXBE6YOrdBjYBrLqNh32vLej8JPLSoXV37F6iT1i3+TZpUZAf0EYPOoQLIHs5sZbmWtECvjMB0VE6qEeLsOam+BrLd7CupnL+aq/s3JcLPnQft/k2p0f3XUSjywe7DGXPfxitcIDRAYYG8cWoY2ohhU9KJqKNyFM8LQ=~4338228~3488051',
        'arl':'d4c0a94496e1193e04faf60bc5905f701d9a03c01f8aab3c19d96e82d622e930c1dc523dd78b0a88bfc416bad8096601d254c04d0e296d0e8e1f1be5df322d31ee5af48f6e782cff5b0c58b2f96c1980c7bb8755057c866c301752bf2f1da5b4',
    }

    @staticmethod
    def get_track_id_from_isrc(isrc: str) -> str:
        """Resolve a Deezer track id from an ISRC via the public API.

        Raises DeezerException when no Deezer track matches the ISRC.
        """
        try:
            return str(requests.get(f'https://api.deezer.com/2.0/track/isrc:{isrc}').json()['id'])
        except KeyError:
            raise DeezerException(f'Could not find deezer track by isrc: {isrc}')

    @staticmethod
    def get_track_data(song_id: str) -> dict:
        """Fetch md5_origin/media_version for a track from the private
        gw-light API; both are needed to derive the download URL.

        NOTE(review): the api_token query parameter is hard-coded and may
        expire — confirm it is still accepted.
        """
        resp = requests.post('https://www.deezer.com/ajax/gw-light.php?api_version=1.0&api_token=YTIQw7E4nLSiyzB7A3s0kcBa1p63TSl6&input=3&method=deezer.pageTrack', data='{"sng_id":"' + song_id +'"}', cookies=Deezer._cookies)
        track_json = resp.json()
        data = {}
        data['md5_origin'] = track_json['results']['DATA']['MD5_ORIGIN']
        data['media_version'] = track_json['results']['DATA']['MEDIA_VERSION']
        data['id'] = song_id
        return data

    @staticmethod
    def get_track_download_url(track, **kwargs):
        """Derive and validate the CDN download URL for the given track.

        Arguments:
            track {dict} -- dict with 'md5_origin', 'id' and 'media_version'
                            (as returned by get_track_data()).
        Keyword Arguments:
            fallback_qualities {list} -- quality codes to try when FLAC is
                            unavailable (default: track_formats.FALLBACK_QUALITIES).
        Raises:
            Exception -- if the track dict has no 'md5_origin'.
            ValueError -- if an invalid track argument was given.
            DeezerException -- if no quality yields a non-empty download.
        Returns:
            (url, quality) tuple.
        """

        # Decryption algo got from: https://git.fuwafuwa.moe/toad/ayeBot/src/branch/master/bot.py;
        # and https://notabug.org/deezpy-dev/Deezpy/src/master/deezpy.py
        # Huge thanks!

        quality = track_formats.FLAC
        fallback = True

        try:
            if not "md5_origin" in track:
                raise Exception(
                    "MD5 is needed to decrypt the download link.")

            md5_origin = track["md5_origin"]
            track_id = track["id"]
            media_version = track["media_version"]
        except ValueError:
            raise ValueError(
                "You have passed an invalid argument.")

        def decrypt_url(quality_code):
            # Build the '¤'-joined payload, MD5 it, then AES-ECB encrypt the
            # digest+payload to obtain the CDN path component.
            magic_char = "¤"
            step1 = magic_char.join((md5_origin,
                                     str(quality_code),
                                     track_id,
                                     media_version))
            m = hashlib.md5()
            m.update(bytes([ord(x) for x in step1]))

            step2 = m.hexdigest() + magic_char + step1 + magic_char
            step2 = step2.ljust(80, " ")

            cipher = Cipher(algorithms.AES(bytes('jo6aey6haid2Teih', 'ascii')),
                            modes.ECB(), default_backend())

            encryptor = cipher.encryptor()
            step3 = encryptor.update(bytes([ord(x) for x in step2])).hex()

            # CDN shard is selected by the first hex digit of md5_origin.
            cdn = track["md5_origin"][0]

            return f'https://e-cdns-proxy-{cdn}.dzcdn.net/mobile/1/{step3}'

        url = decrypt_url(track_formats.TRACK_FORMAT_MAP[quality]["code"])
        res = requests.get(url, stream=True)

        if not fallback or (res.status_code == 200 and int(res.headers["Content-length"]) > 0):
            res.close()
            return (url, quality)
        else:
            res.close()  # free the streamed connection before fallbacks

            if "fallback_qualities" in kwargs:
                fallback_qualities = kwargs["fallback_qualities"]
            else:
                fallback_qualities = track_formats.FALLBACK_QUALITIES

            for key in fallback_qualities:
                url = decrypt_url(
                    track_formats.TRACK_FORMAT_MAP[key]["code"])

                res = requests.get(
                    url, stream=True)

                if res.status_code == 200 and int(res.headers["Content-length"]) > 0:
                    res.close()
                    return (url, key)
                res.close()

            # BUG FIX: previously the function fell off the end and returned
            # None when every fallback failed, producing an opaque TypeError
            # at the caller ('NoneType' is not subscriptable). Fail loudly.
            raise DeezerException(f'No downloadable quality found for track id {track_id}')

    @staticmethod
    def get_blowfish_key(track_id):
        """Derive the per-track Blowfish key: byte-wise XOR of the two
        halves of md5(track_id) with a fixed secret.
        """
        secret = 'g4el58wc0zvf9na1'

        m = hashlib.md5()
        m.update(bytes([ord(x) for x in track_id]))
        id_md5 = m.hexdigest()

        blowfish_key = bytes(([(ord(id_md5[i]) ^ ord(id_md5[i+16]) ^ ord(secret[i]))
                               for i in range(16)]))

        return blowfish_key

    @staticmethod
    def decrypt_download_data(content: 'Response', isrc: str) -> bytes:
        """Decrypt a streamed Deezer download.

        The stream is split into 2048-byte chunks; the first chunk of every
        group of three is Blowfish-CBC encrypted (key derived from the track
        id, IV = bytes 0..7), the other two are plaintext.

        content -- streaming requests Response of the encrypted file
        isrc -- ISRC used to re-derive the track id and hence the key
        """
        chunk_size = 2048
        data_iter = content.iter_content(chunk_size)
        i = 0
        decrypted = b''
        blowfish_key = Deezer.get_blowfish_key(Deezer.get_track_id_from_isrc(isrc))
        for chunk in data_iter:
            if i % 3 > 0:
                # Chunks 1 and 2 of each triplet are not encrypted.
                decrypted += chunk
            elif len(chunk) < chunk_size:
                # A short encrypted-position chunk is the unencrypted tail.
                decrypted += chunk
                break
            else:
                cipher = Cipher(algorithms.Blowfish(blowfish_key),
                                modes.CBC(
                                    bytes([i for i in range(8)])),
                                default_backend())

                decryptor = cipher.decryptor()
                decrypted += decryptor.update(chunk) + decryptor.finalize()

            i += 1
        return decrypted
156 |
--------------------------------------------------------------------------------
/spotify_scraper.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from typing import Generator
3 | from config import *
4 | from utils.spotify_track import SpotifyTrack
5 | from utils.spotify_album import SpotifyAlbum
6 | from utils.spotify_playlist import SpotifyPlaylist
7 | from utils.spotify_category import SpotifyCategory
8 | from utils.spotify_artist import SpotifyArtist
9 | from spotify_client import SpotifyClient
10 | from typing import List
11 |
12 |
class SpotifyScraper:
    """High-level scraping operations (playlists, albums, artists, users,
    categories) built on top of an authenticated SpotifyClient.
    """

    # Shared SpotifyClient instance used for all HTTP traffic.
    _client = None

    class IDTypes(Enum):
        """Kind of Spotify entity a link points to."""
        Playlist = 0
        Album = 1
        Artist = 2
        Track = 3
        User = 4
        Unknown = -1

    def __init__(self, sp_dc=None, sp_key=None, client=None) -> None:
        """Use an existing client, or build one from the sp_dc/sp_key cookies."""
        if client is not None:
            self._client = client
        else:
            self._client = SpotifyClient(sp_dc=sp_dc, sp_key=sp_key)

    def identify_link_type(self, link: str) -> IDTypes:
        """Classify a Spotify URL/URI by substring (case-insensitive)."""
        if 'playlist' in link.lower():
            return self.IDTypes.Playlist
        elif 'album' in link.lower():
            return self.IDTypes.Album
        elif 'artist' in link.lower():
            return self.IDTypes.Artist
        elif 'track' in link.lower():
            return self.IDTypes.Track
        elif 'user' in link.lower():
            return self.IDTypes.User
        return self.IDTypes.Unknown

    def extract_id_from_link(self, link: str) -> str:
        """Return the trailing ID component of a Spotify link.

        BUG FIX: the old rindex('/') version kept query strings
        ('.../track/<id>?si=...' -> '<id>?si=...') and returned '' for a
        trailing slash; both are handled now.
        """
        return link.rstrip('/').rsplit('/', 1)[-1].split('?')[0]

    def scrape_tracks(self, link: str, console=None) -> 'Generator[SpotifyTrack, None, None]':
        """Dispatch to the right scraper based on the link type.

        Returns a generator of SpotifyTrack (a one-element list for a plain
        track link); None for an unrecognized link.
        """
        id_type = self.identify_link_type(link)
        if id_type == self.IDTypes.Playlist:
            return self.scrape_playlist_tracks(self.extract_id_from_link(link))
        elif id_type == self.IDTypes.Album:
            return self.scrape_album_tracks(self.extract_id_from_link(link))
        elif id_type == self.IDTypes.Artist:
            return self.scrape_artist_tracks(self.extract_id_from_link(link), intense=True, console=console)
        elif id_type == self.IDTypes.Track:
            return [SpotifyTrack(self.get(f'https://api.spotify.com/v1/tracks/{self.extract_id_from_link(link)}').json())]
        elif id_type == self.IDTypes.User:
            return self.scrape_user_items(self.extract_id_from_link(link))

    def scrape_pagination(self, url: str) -> 'Generator[dict, None, None]':
        """Yield every item of a paginated endpoint, following 'next' pages.

        Note: the separator logic produces '?&limit=...' for bare URLs —
        harmless to the API, kept for byte-compatible URLs.
        """
        limit = 50
        offset = 0
        ret = self._client.get(f'{url}{"?" if "?" not in url else ""}&limit={limit}').json()
        for item in ret['items']:
            yield item
        while ret['next'] is not None:
            offset += limit
            ret = self._client.get(f'{url}{"?" if "?" not in url else ""}&offset={offset}&limit={limit}').json()
            for item in ret['items']:
                yield item

    def scrape_playlist(self, playlist_id: str) -> dict:
        """Raw playlist JSON from the Web API."""
        return self._client.get(f'https://api.spotify.com/v1/playlists/{playlist_id}').json()

    def scrape_playlist_tracks(self, playlist_id: str) -> 'Generator[SpotifyTrack, None, None]':
        """Yield every track of a playlist; optionally export the playlist
        metadata afterwards (controlled by settings)."""
        tracks = []
        for track in self.scrape_pagination(f'https://api.spotify.com/v1/playlists/{playlist_id}/tracks?market=from_token'):
            spotify_track = SpotifyTrack(self.get(track['track']['href']).json())
            tracks.append(spotify_track)
            yield spotify_track
        if settings.AUTO_DOWNLOAD_PLAYLIST_METADATA:
            playlist = SpotifyPlaylist(playlist_id, tracks, self.get_playlist_data(playlist_id))
            playlist.export_to_file()

    def scrape_album(self, album_id: str) -> dict:
        """Raw album JSON from the Web API."""
        return self._client.get(f'https://api.spotify.com/v1/albums/{album_id}').json()

    def scrape_album_tracks(self, album_id: str) -> 'Generator[SpotifyTrack, None, None]':
        """Yield every track of an album (each re-fetched for full data)."""
        for track in self.scrape_pagination(f'https://api.spotify.com/v1/albums/{album_id}/tracks'):
            yield SpotifyTrack(self.get(track['href']).json())

    def scrape_artist(self, artist_id: str) -> dict:
        """Raw top-tracks JSON for an artist."""
        return self.get(f'https://api.spotify.com/v1/artists/{artist_id}/top-tracks?market=from_token').json()

    def scrape_artist_albums(self, artist_id: str) -> 'Generator[SpotifyAlbum, None, None]':
        """Yield every album of an artist as SpotifyAlbum objects."""
        for album in self.scrape_pagination(f'https://api.spotify.com/v1/artists/{artist_id}/albums?market=from_token'):
            yield SpotifyAlbum(album)

    def scrape_artist_tracks(self, artist_id: str, intense: bool=False, console=None) -> 'Generator[SpotifyTrack, None, None]':
        """Yield an artist's top tracks, their 'This Is' playlist, and —
        when intense — every track of every album."""
        tracks = self.scrape_artist(artist_id)['tracks']
        artist = SpotifyArtist(artist_data=tracks[0]['album']['artists'][0])
        for track_data in tracks:
            yield SpotifyTrack(track_data)
        for track in self.scrape_playlist_tracks(artist.get_this_is_playlist(self)):
            yield track
        if intense:
            for album in self.scrape_artist_albums(artist_id):
                for track in self.scrape_album_tracks(album.spotify_id):
                    yield track

    def get(self, url: str) -> 'Response':
        """Authenticated GET through the underlying client."""
        return self._client.get(url)

    def post(self, url: str, payload=None) -> 'Response':
        """Authenticated POST through the underlying client."""
        return self._client.post(url, payload=payload)

    def get_lyrics(self, track_id: str):
        """Lyrics JSON for a track, or '' on any failure (best-effort)."""
        try:
            return self.get(f'https://spclient.wg.spotify.com/color-lyrics/v2/track/{track_id}').json()
        except Exception:
            return ''

    def get_track_features(self, track_id: str):
        """Audio-features JSON for a track, or '' on any failure (best-effort)."""
        try:
            return self.get(f'https://api.spotify.com/v1/audio-features/{track_id}').json()
        except Exception:
            return ''

    def get_category_playlist_ids(self, category_id: str, limit=50, offset=0) -> list:
        """Collect up to `limit` playlist ids of a browse category, paging
        from `offset` until exhausted."""
        playlist_ids = []
        current_offset = offset
        has_next = True
        while len(playlist_ids) < limit and has_next:
            category_playlists_json = self.get_category_playlists(category_id, limit=50, offset=current_offset)
            has_next = category_playlists_json['playlists']['next'] is not None
            for playlist in category_playlists_json['playlists']['items']:
                if not playlist:
                    # The API sometimes returns null entries; skip them.
                    continue
                playlist_ids.append(playlist['id'])
        return playlist_ids

    def get_category_playlists(self, category_id: str, limit: int=50, offset: int=0) -> dict:
        """One page of playlists for a browse category."""
        data = self.get(f'https://api.spotify.com/v1/browse/categories/{category_id}/playlists/?limit={limit}&offset={offset}').json()
        return data

    def get_categories(self, limit=50) -> dict:
        """Browse categories JSON (country is hard-coded to IL)."""
        return self.get(f'https://api.spotify.com/v1/browse/categories/?limit={limit}&country=IL').json()

    def get_categories_full(self, query: str='') -> 'List[SpotifyCategory]':
        """Browse categories as objects, optionally filtered by name substring."""
        categories = self.get_categories()
        categories_data = []
        os.makedirs(f'{settings.DEFAULT_DOWNLOAD_DIRECTORY}/{settings.CATEGORY_METADATA_SUB_DIR}/', exist_ok=True)
        for category_json in categories['categories']['items']:
            if not query or query.lower() in category_json['name'].lower():
                category = SpotifyCategory(category_json)
                categories_data.append(category)
        return categories_data

    def get_playlist_data(self, playlist_id: str) -> dict:
        """Raw playlist JSON from the Web API."""
        return self.get(f'https://api.spotify.com/v1/playlists/{playlist_id}').json()

    def get_playlist(self, playlist_id: str) -> 'SpotifyPlaylist':
        """SpotifyPlaylist object with a lazy track generator attached."""
        playlist_data = self.get_playlist_data(playlist_id)
        tracks = self.scrape_playlist_tracks(playlist_id)
        return SpotifyPlaylist(spotify_id=playlist_id, tracks=tracks, data=playlist_data)

    def scrape_user_items(self, user_id: str) -> 'Generator[SpotifyTrack, None, None]':
        """Yield every track of every public playlist of a user."""
        for playlist in self.scrape_pagination(f'https://api.spotify.com/v1/users/{user_id}/playlists'):
            for track in self.scrape_playlist_tracks(playlist['id']):
                yield track
170 |
--------------------------------------------------------------------------------
/spotify_mass_download.py:
--------------------------------------------------------------------------------
1 | from threading import Thread, get_ident
2 | import pickle
3 | from typing import Generator
4 | from spotify_client import SpotifyClient
5 | from spotify_scraper import SpotifyScraper
6 | from config import *
7 | import base64
8 | from time import sleep
9 | from datetime import datetime
10 | import random
11 | from utils.utils import clean_file_path
12 | from utils.spotify_track import SpotifyTrack
13 |
# NOTE(review): importing this module has side effects — constructing the
# client immediately performs network requests (token fetch), exits the
# process on token-parse failure, and get_me() raises SpotifyClientException
# for invalid keys or premium accounts.
client = SpotifyClient(sp_key=SP_KEY, sp_dc=SP_DC)
client.get_me()
scraper = SpotifyScraper(client=client)

# Cross-thread bookkeeping: ids already processed in this run, and a counter
# of active full_download() calls that keeps save_globals_save_file() looping.
g_downloaded_artist_covers = []
g_downloaded_songs = []
g_keep_saving = 0
21 |
22 |
class Console:
    """In-memory, timestamped log buffer.

    Entries carry a CSS color string so a web frontend can render them;
    get() returns the accumulated list.
    """

    def __init__(self):
        # BUG FIX: the buffer was a class attribute, so every Console
        # instance shared (and appended to) the same list. Make it
        # per-instance.
        self.console_output = []

    def log(self, value: str):
        """Plain log entry (inherits the UI's default color)."""
        self.cout(value, 'inherit')

    def error(self, value: str):
        """Red error entry."""
        self.cout(value, 'rgba(255,30,30,0.9)')

    def info(self, value: str):
        """Cyan informational entry."""
        self.cout(value, 'rgba(30,255,255,0.9)')

    def happy(self, value: str):
        """Green success entry."""
        self.cout(value, 'rgba(30,255,30,0.9)')

    def cout(self, value: str, color: str):
        """Append a timestamped entry with an explicit CSS color."""
        self.console_output.append(
            {
                'time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'value': value,
                'color': color,
            }
        )

    def get(self):
        """Return the full list of log entries (oldest first)."""
        return self.console_output
49 |
# Module-level singleton log buffer shared by every worker thread below.
console = Console()
51 |
52 |
def download_track_list(download_dir: str, track_list: 'Generator[SpotifyTrack, None, None]', recursive_artist: bool=False, recursive_album: bool=False, recursive: bool=False):
    """Download every track yielded by track_list into download_dir.

    Also saves each artist's cover image once per run, and optionally
    recurses into the track's album (recursive_album/recursive) and its
    primary artist's catalogue (recursive_artist/recursive). Per-track
    failures are logged and skipped; state is tracked in module globals
    shared across threads.
    """
    global g_downloaded_songs, g_downloaded_artist_covers
    my_thread_id = str(get_ident()).zfill(6)
    artist_images_download_dir = f'{download_dir}/{settings.ARTIST_IMAGES_SUB_DIR}'
    downloaded_count = 0
    for track in track_list:
        try:
            # Tokens expire during long runs; refresh every 20 tracks.
            if downloaded_count % 20 == 0:
                client.refresh_tokens()
            if track.spotify_id in g_downloaded_songs:
                console.info(f'Thread<{my_thread_id}> | Skipping already downloaded song: {track.title}')
                downloaded_count += 1
                continue
            # NOTE(review): no '/' between download_dir and the artist dir —
            # callers appear to rely on this layout; confirm before changing.
            track_path = f'{download_dir}{clean_file_path(track.artists[0].name)}/{clean_file_path(track.album.title)}'
            track.download_to_file(scraper, track_path)
            console.happy(f'Thread<{my_thread_id}> | Downloaded: {track.preview_title()}')
            g_downloaded_songs.append(track.spotify_id)
            if (recursive_album or recursive):
                download_track_list(download_dir=download_dir, track_list=scraper.scrape_album_tracks(track.album.spotify_id), recursive=False)

            # Save each artist's cover image once per run (best-effort).
            for artist in track.artists:
                if artist.spotify_id not in g_downloaded_artist_covers:
                    try:
                        artist_image = artist.download_image(scraper)
                        # Base64-encode the name so any character is filesystem-safe.
                        artist_name = base64.b64encode(artist.name.encode()).decode()
                        with open(f'{artist_images_download_dir}/{artist_name}.jpg', 'wb') as f:
                            f.write(artist_image)
                    except Exception as ex:
                        console.error(str(ex))
                    g_downloaded_artist_covers.append(artist.spotify_id)

            if (recursive_artist or recursive):
                download_track_list(download_dir=download_dir, track_list=scraper.scrape_artist_tracks(track.artist.spotify_id), recursive=False)
            if recursive_artist:
                # BUG FIX: scrape_artist_albums yields SpotifyAlbum objects,
                # not dicts — album['id'] raised TypeError. Also use the
                # track's primary artist instead of the `artist` variable
                # left over from the cover-image loop above.
                for album in scraper.scrape_artist_albums(track.artist.spotify_id):
                    download_track_list(download_dir=download_dir, track_list=scraper.scrape_album_tracks(album.spotify_id), recursive=False)
        except Exception as ex:
            console.error(f'Thread<{my_thread_id}> | Exception: {ex}')
        downloaded_count += 1
        if settings.VERBOSE_OUTPUTS:
            console.log(f'Thread<{my_thread_id}> | Processed {downloaded_count} tracks')
94 |
95 |
def save_globals_save_file():
    """Load persisted download-state globals, then periodically save them.

    Despite the name this first RESTORES g_downloaded_songs /
    g_downloaded_artist_covers from settings.GLOBALS_SAVE_FILE, then loops,
    rewriting the file every DOWNLOADS_FILE_SAVE_INTERVAL seconds while at
    least one full_download() is active (g_keep_saving > 0). Intended to run
    on its own thread alongside the downloaders.
    """
    global g_keep_saving, g_downloaded_artist_covers, g_downloaded_songs
    try:
        with open(settings.GLOBALS_SAVE_FILE, 'r') as f:
            # The file holds a JSON object whose 'songs'/'artists' values are
            # themselves JSON-encoded lists (double-encoded on save below).
            data = json.loads(f.read())
            g_downloaded_songs = json.loads(data['songs'])
            g_downloaded_artist_covers = json.loads(data['artists'])
            console.log(f'Loaded {len(g_downloaded_songs)} songs & {len(g_downloaded_artist_covers)} artists')
    except Exception as ex:
        # A missing file is fine (first run); a present-but-unreadable file
        # means saving would clobber real state, so bail out instead.
        console.error(f'Failed to load globals save file! Exception: {ex}')
        if os.path.exists(settings.GLOBALS_SAVE_FILE):
            console.error(f'To avoid data loss, SpotiFile will now exit.')
            exit(1)
    while g_keep_saving > 0:
        with open(settings.GLOBALS_SAVE_FILE, 'w') as f:
            g_downloaded_songs_json = json.dumps(g_downloaded_songs)
            g_downloaded_artist_covers_json = json.dumps(g_downloaded_artist_covers)
            data = {'songs':g_downloaded_songs_json, 'artists': g_downloaded_artist_covers_json }
            f.write( json.dumps(data) )
        if settings.VERBOSE_OUTPUTS:
            console.log('Saved globals file!')
        sleep(settings.DOWNLOADS_FILE_SAVE_INTERVAL)
118 |
119 |
def full_download(download_dir: str, identifier: str, recursive_artist: bool=False, recursive_album: bool=False, recursive: bool=False):
    """Scrape a Spotify link/ID and download everything it resolves to.

    download_dir -- destination root (artist images go to a subdirectory)
    identifier -- any Spotify link understood by scraper.scrape_tracks()
    recursive_* -- forwarded to download_track_list() to widen the crawl
    All exceptions are caught and logged; the function never raises.
    """
    global g_downloaded_songs, g_downloaded_artist_covers, g_keep_saving
    try:
        artist_images_download_dir = f'{download_dir}/{settings.ARTIST_IMAGES_SUB_DIR}'
        os.makedirs(artist_images_download_dir, exist_ok=True)
        os.makedirs(f'temp', exist_ok=True)

        g_keep_saving += 1
        try:
            client.refresh_tokens()
            console.log('Refreshed tokens!')

            console.log(f'Received scrape command on identifier: {identifier}, {recursive=}, {recursive_artist=}, {recursive_album=}')
            download_track_list(download_dir=download_dir, track_list=scraper.scrape_tracks(identifier, console=console), recursive=recursive, recursive_album=recursive_album, recursive_artist=recursive_artist)

            console.log(f'Completely done scraping identifier: {identifier}!')
        finally:
            # BUG FIX: the decrement used to run only on success, so any
            # exception left g_keep_saving elevated forever and the
            # save_globals_save_file() loop never terminated.
            g_keep_saving -= 1
    except Exception as ex:
        console.error(f'Full download exception: {ex}')
141 |
142 |
def download_category_playlists(category_id, category_index, category_ids, download_meta_data_only):
    """Export (and optionally fully download) every playlist of one browse
    category. category_index/category_ids are only used for progress logs.
    Per-playlist failures are logged and skipped.
    """
    playlist_ids = scraper.get_category_playlist_ids(category_id)
    # Randomize order so parallel runs spread their load.
    random.shuffle(playlist_ids)
    for playlist_index, playlist_id in enumerate(playlist_ids):
        console.log(f'Scraping playlist data from playlist {playlist_id} ({playlist_index + 1}/{len(playlist_ids)}) from category {category_id} ({category_index + 1}/{len(category_ids)})')
        try:
            playlist = scraper.get_playlist(playlist_id)
            playlist.export_to_file()
            if download_meta_data_only:
                continue
            full_download(f'{settings.DEFAULT_DOWNLOAD_DIRECTORY}', identifier=playlist.href, recursive=True, recursive_album=True, recursive_artist=True)
        except Exception as ex:
            console.error(f'Scraping categories exception: {ex}')
155 |
156 |
def download_all_categories_playlists(download_meta_data_only=True, query: str=''):
    """Spawn one thread per browse category and scrape its playlists.

    download_meta_data_only -- when True only playlist metadata is exported;
                               otherwise every playlist is fully downloaded.
    query -- optional case-insensitive substring filter on category names.
    Blocks until every category thread has finished.
    """
    client.refresh_tokens()
    os.makedirs(f'{settings.DEFAULT_DOWNLOAD_DIRECTORY}/{settings.PLAYLIST_METADATA_SUB_DIR}/', exist_ok=True)
    console.log(f'Scraping playlists from "{query}" categories')
    categories = scraper.get_categories_full(query=query)
    threads = []
    # Randomize order so parallel runs spread their load.
    random.shuffle(categories)
    for category_index, category in enumerate(categories):
        console.log(f'Scraping playlists from category {category.name} ({category_index + 1}/{len(categories)})')
        category.download_metadata(scraper=scraper)
        try:
            thread = Thread(target=download_category_playlists, args=(category.spotify_id, category_index, categories, download_meta_data_only))
            thread.start()
            threads.append(thread)
        except Exception as ex:
            console.error(f'Scraping categories exception: {ex}')

    # Idiom fix: join with a plain loop instead of building a throwaway
    # list via a side-effect comprehension.
    for thread in threads:
        thread.join()
175 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SpotiFile
2 | ## A simple and open source spotify scraper.
3 | *Python 3.8+*
4 |
5 | ---
6 | ## 2024 Update: Project has been archived!
7 | Due to possible misuse of SpotiFile, I have decided to archive this repo.
8 | The existing code will stay up - though it no longer works and is not suited to interact with Spotify's new API.
9 | If you do wish to revive this project, please first review [Spotify's developers' ToS](https://developer.spotify.com/terms).
10 |
11 | ---
12 |
13 | ## Quick Start
14 | Make sure you have python 3.8 or above.
15 | $ git clone https://github.com/Michael-K-Stein/SpotiFile.git
16 | $ cd SpotiFile
17 | Now open config.py and setup your SP_KEY (Spotify has renamed this to sp_adid) and SP_DC tokens ([see below](https://github.com/Michael-K-Stein/SpotiFile#sp_key--sp_dc-tokens))
18 | $ python main.py
19 |
20 | ---
21 |
22 | *DISCLAIMER: This script is intended for personal and non-commercial use only. The purpose of this script is to create datasets for training machine learning models. Any use of this script that violates Deezer's Terms of Use or infringes on its intellectual property rights is strictly prohibited. The writer of this script is not responsible for any illegal or unauthorized use of the script by third parties. Users of this script assume all responsibility for their actions and agree to use the script at their own risk.*
23 | *AVIS DE NON-RESPONSABILITÉ : Ce script est destiné à un usage personnel et non commercial uniquement. Le but de ce script est de créer des ensembles de données pour entraîner des modèles d'apprentissage automatique. Toute utilisation de ce script qui viole les Conditions d'utilisation de Deezer ou porte atteinte à ses droits de propriété intellectuelle est strictement interdite en vertu de la loi française. L'auteur de ce script n'est pas responsable de toute utilisation illégale ou non autorisée du script par des tiers. Les utilisateurs de ce script assument toutes les responsabilités de leurs actions et conviennent de l'utiliser à leurs propres risques.*
24 |
25 | ---
26 |
27 | ## What?
28 | SpotiFile is a script which allows users to simply and easily, using a web-gui, scrape on Spotify playlists, albums, artists, etc.
29 | More advanced usages can be done by importing the relevant classes (e.g.
30 | ```python
31 | from spotify_scraper import SpotifyScraper
32 | ```
33 | ) and then using IPython to access specific Spotify API features.
34 | ### Advantages
35 | The main advantage of using SpotiFile is that it completely circumvents all of Spotify's API call limits and restrictions. SpotiFile offers an API to communicate with Spotify's API as if it were a real user.
36 | This allows SpotiFile to download information en-masse quickly.
37 |
38 | ---
39 |
40 | ## Why?
41 | Downloading massive amounts of songs and meta data can help if you prefer listening to music offline, or if you are designing a music server which runs on an airgapped network.
42 | *We do not encourage music piracy nor condone any illegal activity. SpotiFile is a useful research tool. Usage of SpotiFile for other purposes is at the user's own risk. Be warned, we will not bear any responsibility for improper use of this educational software!*
43 | ### Proper and legitimate uses of SpotiFile:
44 | + Scraping tracks to create datasets for machine learning models.
45 | + Creating remixes (for personal use only!)
46 | + Downloading music which no longer falls under copyright law ([Generally, content whose original artist passed away over 70 years ago](https://www.copyright.gov/help/faq/faq-duration.html)).
47 | ### Please notice Spotify's User Guidelines, and make sure you understand them. See section 5;
48 | *The following is not permitted for any reason whatsoever in relation to the Services and the material or content made available through the Services, or any part thereof:
49 | 5. "crawling" or "scraping", whether manually or by automated means, or otherwise using any automated means (including bots, scrapers, and spiders), to view, access or collect information;*
50 | Usage of this "scraper" is in violation of Spotify's User Guidelines. By using this code, you assume responsibility - as *you* are the one "scraping" Spotify using automated means.
51 | ### Please notice Deezer's Terms of Use, and make sure you understand them. See article 8 - Intellectual property;
52 | *The Recordings on the Deezer Free Service are protected digital files by national and international copyright and neighboring rights. They may only therefore be listened to within a private or family setting. Any use for a non-private purpose will expose the Deezer Free User to civil and/or criminal proceedings. Any other use of the Recordings is strictly forbidden and more particularly any download or attempt to download, any transfer or attempt to transfer permanently or temporarily on the hard drive of a computer or any other device (notably music players), any burn or attempt to burn a CD or any other support are expressly forbidden. Any resale, exchange or renting of these files is strictly prohibited.*
53 | Storing, or attempting to store files from Deezer is strictly prohibited. Use this software only to create, for personal use, a custom streaming app. Notice that you can only use this streaming app in a private or family setting. By using this code, you assume responsibility to perform only legal actions - such as *streaming* music from Deezer for personal use.
54 | ### Do adhere to your local laws regarding intellectual property!
55 | #### Notice: Local law (where this was written) explicitly permits reverse engineering for non-commercial purposes.
56 |
57 | ---
58 |
59 | ## How?
60 | SpotiFile starts its life by authenticating as a normal Spotify user, and then performs a wide range of conventional and unconventional API calls to Spotify in order to retrieve relevant information.
61 | SpotiFile does not actually download audio from Spotify, since they use proper DRM encryption to protect against piracy. Rather, SpotiFile finds the relevant audio file on Deezer, using the copyright id (ironically). Then SpotiFile downloads the "encrypted" audio file from Deezer, which failed to implement DRM properly. Credit for reversing Deezer's encryption goes to https://git.fuwafuwa.moe/toad/ayeBot/src/branch/master/bot.py & https://notabug.org/deezpy-dev/Deezpy/src/master/deezpy.py & https://www.reddit.com/r/deemix/ (Original reversing algorithm has been taken down).
62 |
63 | ---
64 |
65 | ## Features
66 | + Authenticating as a legitimate Spotify user.
67 | + Scraping tracks from a playlist.
68 | + Scraping tracks from an album.
69 | + Scraping tracks from an artist.
70 | + Scraping playlists from a user.
71 | + Scraping playlists from a category.
72 | + Scraping a track from a track url.
73 | + Scraping artist images.
74 | + Scraping popular playlists' metadata and tracks.
75 | + Premium user token snatching (experimental).
76 | + Scraping song lyrics (time synced when possible).
77 | + Scraping track metadata.
78 | + Scraping category metadata.
79 |
80 | ---
81 |
82 | ## SP_KEY & SP_DC tokens
83 | Obtaining sp_dc and sp_key cookies (sp_key is now called sp_adid)
84 | SpotiFile uses two cookies to authenticate against Spotify in order to have access to the required services.
85 | *Shoutout to @fondberg for the explanation https://github.com/fondberg/spotcast*
86 |
87 | To obtain the cookies, these different methods can be used:
88 |
89 | ### Chrome based browser
90 | Open a new Incognito window at https://open.spotify.com and login to Spotify.
91 | Press Command+Option+I (Mac) or Control+Shift+I or F12. This should open the developer tools menu of your browser.
92 | Go into the application section.
93 | In the menu on the left go into Storage/Cookies/open.spotify.com.
94 | Find the sp_dc and sp_key and copy the values.
95 | Close the window without logging out (Otherwise the cookies are made invalid).
96 |
97 | ### Firefox based browser
98 | Open a new Incognito window at https://open.spotify.com and login to Spotify.
99 | Press Command+Option+I (Mac) or Control+Shift+I or F12. This should open the developer tools menu of your browser.
100 | Go into the Storage section. (You might have to click on the right arrows to reveal the section).
101 | Select the Cookies sub-menu and then https://open.spotify.com.
102 | Find the sp_dc and sp_key and copy the values.
103 | Close the window without logging out (Otherwise the cookies are made invalid).
104 |
105 | ---
106 |
107 | # Example usages:
108 | ## Using SpotiFile to create a song recommendation module based off song lyrics' semantic similarity:
109 | ```python
110 | from spotify_scraper import SpotifyScraper
111 | import nltk
112 | from nltk.corpus import stopwords
113 | from sklearn.feature_extraction.text import TfidfVectorizer
114 | from sklearn.metrics.pairwise import cosine_similarity
115 | import sys
116 |
117 |
118 | def semantic_similarity(paragraph1, paragraph2):
119 | # Preprocess text
120 | stop_words = set(stopwords.words('english'))
121 | paragraph1 = ' '.join([word.lower() for word in nltk.word_tokenize(paragraph1) if word.lower() not in stop_words])
122 | paragraph2 = ' '.join([word.lower() for word in nltk.word_tokenize(paragraph2) if word.lower() not in stop_words])
123 |
124 | # Compute similarity score
125 | tfidf_vectorizer = TfidfVectorizer()
126 | tfidf_matrix = tfidf_vectorizer.fit_transform([paragraph1, paragraph2])
127 | similarity_score = cosine_similarity(tfidf_matrix)[0][1]
128 |
129 | return similarity_score
130 |
131 |
132 | # Usage
133 | scraper = SpotifyScraper()
134 |
135 | lyrics1 = '\n'.join(x['words'] for x in scraper.get_lyrics(sys.argv[1])['lyrics']['lines'])
136 | lyrics2 = '\n'.join(x['words'] for x in scraper.get_lyrics(sys.argv[2])['lyrics']['lines'])
137 |
138 | sim = semantic_similarity(lyrics1, lyrics2)
139 |
140 | print(f'The similarity between the two tracks is: {sim}')
141 |
142 | ```
143 |
144 | ---
145 |
146 | ### Legal
147 | The use of a script to download music and lyrics from Deezer for personal use only, to create machine learning datasets for non-commercial use, is not illegal under French and Israeli law. The use of such a script falls under the doctrine of fair use or fair dealing, which allows individuals to make copies of copyrighted works for their own private and non-commercial use without requiring permission from the copyright owner.
148 |
149 | This interpretation is supported by precedent. In the case of Société Civile des Producteurs Phonographiques v. Delorme, the French Court of Cassation held that copying music for personal and non-commercial use is allowed under the doctrine of fair use. The court held that such copying did not infringe on the rights of the copyright owner as it did not compete with the original work or harm the market for the original work.
150 |
151 | Furthermore, the purpose of using the script is to create machine learning datasets for non-commercial use, which falls under the category of research and study. Many countries, including France and Israel, have exceptions to copyright infringement for the purposes of research and study, which allow individuals to use copyrighted works without the need for permission from the copyright owner.
152 |
153 | It is also worth noting that the script is not being used to distribute the copyrighted works to others or to make a profit, which reduces the likelihood of any significant harm to the copyright owner's rights.
154 |
155 | Finally, the disclaimer notice attached to the script explicitly states that the script is intended for personal and non-commercial use only, and that any use of the script that violates Deezer's Terms of Use or infringes on its intellectual property rights is strictly prohibited. The writer of the script has taken reasonable steps to ensure that users understand the limitations of the script and are aware that any unauthorized use is prohibited.
156 |
157 | In conclusion, the use of a script to download music and lyrics from Deezer for personal use only to create machine learning datasets for non-commercial use is legal under French and Israeli law. The doctrine of fair use and exceptions for research and study, as well as the absence of any significant harm to the copyright owner's rights and the presence of a clear disclaimer notice, support this interpretation.
158 |
--------------------------------------------------------------------------------