├── tracklists
    ├── __init__.py
    ├── scraper.py
    └── tracklists.py
├── .gitignore
├── requirements.txt
├── setup.py
├── README.md
└── LICENSE


/tracklists/__init__.py:
--------------------------------------------------------------------------------
1 | from .tracklists import *


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | .vscode
3 | __pycache__
4 | *.html
5 | */__pycache__
6 | try.py


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | beautifulsoup4==4.9.1
 2 | bs4==0.0.1
 3 | certifi==2020.4.5.2
 4 | chardet==3.0.4
 5 | fake-headers==1.0.2
 6 | html5lib==1.0.1
 7 | idna==2.9
 8 | requests==2.23.0
 9 | six==1.15.0
10 | soupsieve==2.0.1
11 | urllib3==1.26.5
12 | webencodings==0.5.1
13 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(
 4 |     name="tracklists",
 5 |     version="0.0.1",
 6 |     description="Python library for accessing content on 1001tracklists.com.",
 7 |     url="https://github.com/leandertolksdorf/1001-tracklists-api",
 8 |     author="Leander Tolksdorf",
 9 |     author_email="a@b.com",
10 |     license="MIT",
11 |     packages=["tracklists"],
12 |     zip_safe = False
13 |     )


--------------------------------------------------------------------------------
/tracklists/scraper.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from fake_headers import Headers
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | 
 6 | def get_soup(url):
 7 | 	"""" Retrieve html and return a bs4 object """
 8 | 	response = requests.get(url, headers=Headers().generate())
 9 | 	soup = BeautifulSoup(response.text, "html.parser")
10 | 	if "Error 403" in soup.title.text:
11 | 		del soup
12 | 		raise Exception("Error 403: Captcha? https://www.1001tracklists.com/")
13 | 	else:
14 | 		return soup
15 | 
16 | 
def get_json(url, timeout=30):
    """Retrieve ``url`` and return its body decoded as JSON.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that an unresponsive server cannot
        block the caller forever. Defaults to 30 seconds.

    Returns
    -------
    object
        Whatever ``response.json()`` decodes from the response body.
    """
    response = requests.get(url, headers=Headers().generate(), timeout=timeout)
    return response.json()


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ♫ 1001-tracklists-api (unofficial)
 2 | 
 3 | An unofficial API to create Python objects with data from 1001tracklists.com.
 4 | 
 5 | ## Summary
 6 | 
 7 | This project is aimed at enabling easy access to the 1001tracklists database via Python.
 8 | 
 9 | The scraping is done with [BeautifulSoup](https://pypi.org/project/beautifulsoup4/).
10 | 
11 | ## Usage
12 | 
13 | Tracklists can be loaded into `Tracklist` objects using:
14 | ```python
15 | from tracklists import *
16 | tl = Tracklist(url)
17 | tracks = tl.tracks # A list of Track objects
18 | ```
19 | 
20 | To explore the data available for each object, you may wish to use:
21 | ```python
22 | vars(x) # To see all the attributes
23 | help(x) # For more complete documentation
24 | ```
25 | 
26 | ## Contributing
27 | 
28 | Everyone is welcome to contribute!
29 | 
30 | Open tasks are marked with `TODO` tags in the source files. In addition, this is the general to-do list:
31 | 
32 | - [ ] Captcha handling
33 |     - When captcha occurs, pause scraping and redirect to solving page.
34 |     - or: Proxy rotation
35 | - [ ] JSON export
36 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Leander Tolksdorf
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/tracklists/tracklists.py:
--------------------------------------------------------------------------------
  1 | from .scraper import *
  2 | 
  3 | import re
  4 | 
# Maps the value of the "source" field of each entry returned by the
# get_medialink.php request to the service name that is used as key in
# Track.external_ids (see Track.fetch_external_ids).
SOURCES = {
	"1": "beatport",
	"2": "apple",
	"4": "traxsource",
	"10": "soundcloud",
	"13": "video",
	"36": "spotify",
}
 13 | 
 14 | class Tracklist:
 15 | 	""" An object representing a tracklist on 1001tracklists.com.
 16 | 
 17 | 	Attributes
 18 | 	----------
 19 | 	url : str
 20 | 		Link to tracklist page
 21 | 	tracklist_id : str
 22 | 		Internal 1001tracklists.com ID for the tracklist
 23 | 	title : str
 24 | 		Name of tracklist
 25 | 	tracks : list[Track]
 26 | 		List of tracks in tracklist
 27 | 	date_recorded : str
 28 | 		Date tracklist was recorded
 29 | 	interaction_details : dict{str : int}
 30 | 		Dictionary including measure of interactions with the tracklist (e.g. likes)
 31 | 	num_tracks : int
 32 | 		Number of tracks in tracklist
 33 | 	num_tracks_IDed : int
 34 | 		Number of tracks which have been IDed
 35 | 	genres : list[str]
 36 | 		List of genres associated with the tracklist
 37 | 	DJs : list[str]
 38 | 		List of DJs who made the tracklist
 39 | 	sources : dict{str : str}
 40 | 		Dictionary mapping the type of source to the source (e.g. {' Open Air / Festival 0': 'Tomorrowland'})
 41 | 	track_nums : list[str]
 42 | 		List of track numbers ("01" is first song, "02" is second song, "w/" if track played with previous)
 43 | 	cues : list[str]
 44 | 		List of cue times for tracks (e.g. "0:04" or "47:30" or "" if not available)
 45 | 	"""
 46 | 	
 47 | 	""" To-do list included here to avoid showing up in help(Tracklist)
 48 | 	TODO:
 49 | 	make DJ class
 50 | 	tracklist media links
 51 | 	clean up sources?
 52 | 	methods for genre_counts, artist_counts, etc.
 53 | 	additional metadata from left pane
 54 | 	"""
 55 | 	
 56 | 	def __init__(self, url, fetch=True):
 57 | 		""" Initialize Tracklist. """
 58 | 		self.url = url
 59 | 		self.tracklist_id = self.url.split('tracklist/')[1].split('/')[0] # Get id from url
 60 | 		if fetch:
 61 | 			self.fetch()
 62 | 
 63 | 	def fetch(self):
 64 | 		""" Load Tracklist details from url. """
 65 | 		soup = get_soup(self.url)
 66 | 		left_pane = soup.find("div", id = "left")
 67 | 		self.title = soup.title.text
 68 | 		self.load_metadata(left_pane)
 69 | 		
 70 | 		# Load tracks into list
 71 | 		self.tracks = []
 72 | 		track_divs = soup.find_all('div', {'class': 'tlpItem'}) # Find div objects for tracks
 73 | 		for track_div in track_divs:
 74 | 			t = Track(track_div) # TODO Get external links?
 75 | 			if track_div.find('i', {'title': 'mashup linked position'}): # Track part of mashup -> attach to previous entry in tracklist
 76 | 				self.tracks[-1].add_subsong(t)
 77 | 			else:
 78 | 				self.tracks.append(t)
 79 | 
 80 | 		# Load track numbers and cues (adapted from https://github.com/globalnomad/quickCUE/blob/master/quickCUE.py)
 81 | 		self.track_nums = [span.text.strip() for span in soup.find_all('span', id=re.compile('_tracknumber_value'))]
 82 | 		self.cues = [div.text.strip() for div in soup.find_all('div', class_='cueValueField')]
 83 | 
 84 | 	def __repr__(self):
 85 | 		return "Tracklist(" + self.tracklist_id + ")"
 86 | 
 87 | 	def __str__(self):
 88 | 		return self.title
 89 | 
 90 | 	def load_metadata(self, left_pane):
 91 | 		""" Load metadata on tracklist from left pane of page (DJ, source, date recorded, etc.). """ 
 92 | 		try:
 93 | 			self.date_recorded = left_pane.find("span", title = "tracklist recording date").parent.parent.select("td")[1].text
 94 | 		except (AttributeError, IndexError):
 95 | 			self.date_recorded = None
 96 | 		
 97 | 		self.interaction_details = {}
 98 | 		for interaction_detail in left_pane.find_all("meta", itemprop = "interactionCount"):
 99 | 			name, count = interaction_detail["content"].strip().split(":")
100 | 			self.interaction_details[name] = int(count)
101 | 		
102 | 		IDed = left_pane.text.split('IDed')[1].split('short')[0].strip().split(" / ")[0]
103 | 		total = left_pane.text.split('IDed')[1].split('short')[0].strip().split(" / ")[1].split(' ')[0]
104 | 		self.num_tracks = int(total)
105 | 		self.num_tracks_IDed = int(IDed) if IDed != 'all' else int(total)
106 | 		
107 | 		self.genres = left_pane.find("td", id = "tl_music_styles").text.split(", ") if left_pane.find("td", id = "tl_music_styles") else []
108 | 		
109 | 		try:
110 | 			tracklist_DJs_location = left_pane.find_all("table", class_ = "sideTop")
111 | 			tracklist_DJs_location = [person_place.find("a") for person_place in tracklist_DJs_location]
112 | 			
113 | 			tracklist_DJs = []
114 | 			tracklist_source = {}
115 | 			
116 | 			for person_place in tracklist_DJs_location:
117 | 				if re.search("\/dj\/", person_place.get("href")):
118 | 					tracklist_DJs.append(person_place.text)
119 | 				if re.search("\/source\/", person_place.get("href")):
120 | 					tracklist_source[person_place.parent.parent.parent.find("td").contents[0]] = person_place.text
121 | 			  
122 | 			self.DJs = tracklist_DJs
123 | 			
124 | 			# Tracklist sources include event name, location, radio show, etc.
125 | 			# Splits each by type of source and adds number in case tracklist is for multiple sources (e.g. two radio shows do a colab and both appear on 1001Tracklists page)
126 | 			self.sources = {}
127 | 			for source in tracklist_source:
128 | 				source_number = 0
129 | 				source_w_number = source + str(source_number)
130 | 				while source_w_number in self.sources:
131 | 					source_number += 1
132 | 					source_w_number = source + str(source_number)
133 | 				self.sources[source_w_number] = tracklist_source[source]
134 | 			
135 | 		except AttributeError: # Couldn't find tracklist sources (e.g. DJs, festival, radio show, etc.)
136 | 			pass
137 | 
138 | 
class Artist:
    """ An object representing an artist on 1001tracklists.com

    Attributes
    ----------
    name : str
        Artist's name
    url : str
        Link to artist page on 1001tracklists.com, or None if no page exists
    artist_id : str
        Internal 1001tracklists.com ID for artist, or None if not available

    """

    # TODO
    # handle subartists e.g. &, vs.
    # fetch() method

    def __init__(self, span=None, name=None):
        """ Initialize from bs4.span containing artist name and link, or artist name if span not available

        span : bs4 element, optional
            Element whose text is the artist name and which may contain an <a> link
        name : str, optional
            Artist name; takes precedence over the text of span

        Raises AttributeError if neither a non-empty name nor a span is given.
        """
        self.name = name if name else span.text

        link = span.find('a') if span is not None else None
        # .get() yields None for an anchor without href instead of KeyError.
        self.url = link.get('href') if link is not None else None

        # Only links of the form '.../artist/<id>/...' carry an id; other
        # links keep their url but get no artist_id.
        if self.url and 'artist/' in self.url:
            self.artist_id = self.url.split('artist/')[1].split('/')[0] or None
        else:
            self.artist_id = None

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name
170 | 
171 | 
class Label:
    """ An object representing a label on 1001tracklists.com

    Attributes
    ----------
    name : str
        Label name
    url : str
        Link to label page on 1001tracklists.com, or None if no page exists
    label_id : str
        Internal 1001tracklists.com ID for label, or None if not available
    """

    # TODO
    # handle sublabels better
    # fetch() method

    def __init__(self, td):
        """ Initialize from bs4 element containing label name and link

        td : bs4 element
            Element whose text is the label name and which may contain an <a> link
        """
        # If sublabel (e.g. "DHARMA (SPINNIN')") keeps full name ("DHARMA (SPINNIN')") and link to sublabel (DHARMA)
        self.name = td.text

        link = td.find('a')
        # .get() yields None for an anchor without href instead of KeyError.
        self.url = link.get('href') if link is not None else None

        # Only links of the form '.../label/<id>/...' carry an id; anything
        # else must not raise, because one bad label would otherwise abort
        # the construction of the whole Track.
        if self.url and 'label/' in self.url:
            self.label_id = self.url.split('label/')[1].split('/')[0] or None
        else:
            self.label_id = None

    def __str__(self):
        return self.name

    def __repr__(self):
        return "Label(" + (self.label_id if self.label_id else self.name) + ")"
204 | 
205 | 
class Track:
    """An object representing a track on 1001tracklists.com

    Attributes
    ----------
    url : str
        Link to track page or None if not listed
    track_id : str
        Internal 1001tracklists.com ID for the track. Not extracted from the
        tracklist page at the moment, so it is None unless set by the caller.
    full_title : str
        Full name of track (e.g. 'Tiësto & DallasK - Show Me')
    title : str
        Track title (e.g. 'Seven Nation Fun (Holl & Rush Mashup)')
    full_artist : str
        Full name of track artist, including any featured artists (e.g. 'Tiësto & Dzeko ft. Lena Leon'),
        or '' if full_title has no ' - ' separator
    artist : Artist
        Artist who recorded track (not including any featured artists)
    genre : str
        Genre associated with the track or None if not listed
    duration : str
        The duration of the track in ISO 8601 date format or None if not listed
    labels : list[Label]
        Label that released track, multiple labels in case Track is a mashup
    subsongs : list[Track]
        If Track is a mashup, list of tracks it includes
    external_ids : dict{str : str}
        Maps service name (a value of SOURCES) to the external player id; filled by fetch_external_ids()
    """

    # TODO
    # ----
    # fetch()
    # Remix information
    # Parse featured artist(s)
    # Handle reworks

    # Class-level default: fetch() and fetch_external_ids() read track_id, so
    # it must always exist even though __init__ cannot determine it yet.
    track_id = None

    def __init__(self, soup, fetch=False):
        """ Initialize Track from bs4.div with itemprop="tracks".

        soup : bs4 element
            The div describing one track of a tracklist
        fetch : bool
            Currently unused; accepted so existing callers keep working
        """

        # Get basic info
        self.full_title = soup.find('span', {'class': "trackValue"}).text.strip().replace('\xa0', ' ')
        self.full_artist, self.title = self._split_full_title(self.full_title)

        # Get info from metadata
        # keys: 'name', 'byArtist', 'publisher', 'duration', 'genre', 'url'
        # meta tags without itemprop are skipped instead of raising KeyError.
        meta_data = {
            meta.get('itemprop'): meta.get('content')
            for meta in soup.find_all('meta')
            if meta.get('itemprop') is not None
        }
        self.genre = meta_data.get('genre')
        self.duration = meta_data.get('duration')
        self.url = meta_data.get('url')
        self.external_ids = {}

        # NOTE(review): the id attribute of the trackValue span used to carry
        # the track id but "doesn't exist for some reason" (original remark),
        # so track_id stays at its class default None.

        # Get track artist
        try:
            artist_span = soup.find('span', {'title': 'open artist page'}).parent
            self.artist = Artist(span=artist_span)
        except (AttributeError, IndexError, KeyError, TypeError):
            # No artist link available -> use artist name from metadata or
            # self.full_artist; full_title is the last resort so the name is
            # never empty.
            fallback_name = meta_data.get('byArtist') or self.full_artist or self.full_title
            self.artist = Artist(name=fallback_name)

        # Get label details (store in list because can have multiple labels)
        self.labels = [Label(label) for label in soup.find_all('span', {'title': 'label'})]

        # Prepare mashup details
        self.subsongs = []  # Will be populated if track is mashup

    @staticmethod
    def _split_full_title(full_title):
        """ Split 'artist - title' at the first ' - ' into (artist, title).

        Returns ('', full_title) if there is no ' - ' separator.
        """
        artist, separator, title = full_title.partition(' - ')
        if not separator:
            return '', full_title
        return artist, title

    def add_subsong(self, subsong):
        """
        Add original tracks which were used in mashup.

        subsong : Track
            Original track used in mashup
        """
        self.subsongs.append(subsong)

    def __str__(self):
        return self.title + ' by ' + str(self.artist)

    def __repr__(self):
        return 'Track(' + self.full_title + ')'

    def fetch(self):
        """ Fetch track details from track page. Does nothing if track_id is not set. """
        if not self.track_id:
            return

        url = 'https://www.1001tracklists.com/track/' + str(self.track_id)
        soup = get_soup(url)
        # TODO get other information

    def fetch_external_ids(self):
        """ Fetch external link ids for track into self.external_ids. Does nothing if track_id is not set. """
        if not self.track_id:
            return

        # Request all medialinks from 1001tl-API
        url = f"https://www.1001tracklists.com/ajax/get_medialink.php?idObject=5&idItem={self.track_id}"
        response = get_json(url)

        # Add external ids to external_ids
        self.external_ids = {}
        if not response.get("success"):
            return
        for elem in response.get("data") or []:
            # SOURCES is keyed by strings; str() also accepts numeric ids.
            source = str(elem.get("source"))
            service = SOURCES.get(source)
            if service is None:
                print("Source: ", source, "not defined.")
                continue
            player_id = elem.get("playerId")
            if player_id is not None:
                self.external_ids[service] = player_id


--------------------------------------------------------------------------------