import requests
from bs4 import BeautifulSoup
from fake_headers import Headers


def get_soup(url, timeout=30):
    """Retrieve the HTML at ``url`` and return it as a bs4 object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot block
        the caller forever.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the "html.parser" backend.

    Raises
    ------
    Exception
        If the page title contains "Error 403" (the site most likely
        answered with a captcha page).
    """
    response = requests.get(url, headers=Headers().generate(), timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # A page without a <title> tag gives soup.title == None; treat that as
    # "no error marker" instead of failing with AttributeError.
    page_title = soup.title.text if soup.title is not None else ""
    if "Error 403" in page_title:
        raise Exception("Error 403: Captcha? https://www.1001tracklists.com/")
    return soup


def get_json(url, timeout=30):
    """Retrieve ``url`` and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Address of the JSON resource.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot block
        the caller forever.
    """
    response = requests.get(url, headers=Headers().generate(), timeout=timeout)
    return response.json()
import re

# Maps the "source" ids found in medialink responses (see
# Track.fetch_external_ids) to readable service names.
SOURCES = {
    "1": "beatport",
    "2": "apple",
    "4": "traxsource",
    "10": "soundcloud",
    "13": "video",
    "36": "spotify",
}


def _id_from_url(url, marker):
    """Return the path segment following ``marker`` in ``url``, or None."""
    if not url or marker not in url:
        return None
    return url.split(marker)[1].split('/')[0]


class Tracklist:
    """ An object representing a tracklist on 1001tracklists.com.

    Attributes
    ----------
    url : str
        Link to tracklist page
    tracklist_id : str
        Internal 1001tracklists.com ID for the tracklist
    title : str
        Name of tracklist (None until fetched)
    tracks : list[Track]
        List of tracks in tracklist
    date_recorded : str
        Date tracklist was recorded, or None if not found
    interaction_details : dict{str : int}
        Dictionary including measure of interactions with the tracklist (e.g. likes)
    num_tracks : int
        Number of tracks in tracklist, or None if it could not be parsed
    num_tracks_IDed : int
        Number of tracks which have been IDed, or None if it could not be parsed
    genres : list[str]
        List of genres associated with the tracklist
    DJs : list[str]
        List of DJs who made the tracklist
    sources : dict{str : str}
        Dictionary mapping the type of source to the source (e.g. {' Open Air / Festival 0': 'Tomorrowland'})
    track_nums : list[str]
        List of track numbers ("01" is first song, "02" is second song, "w/" if track played with previous)
    cues : list[str]
        List of cue times for tracks (e.g. "0:04" or "47:30" or "" if not available)
    """

    # To-do list kept as comments to avoid showing up in help(Tracklist)
    # TODO:
    #   make DJ class
    #   tracklist media links
    #   clean up sources?
    #   methods for genre_counts, artist_counts, etc.
    #   additional metadata from left pane

    def __init__(self, url, fetch=True):
        """ Initialize Tracklist.

        Parameters
        ----------
        url : str
            Link to the tracklist page; must contain 'tracklist/<id>/'.
        fetch : bool
            If True, download and parse the page immediately.

        Raises
        ------
        IndexError
            If ``url`` does not contain 'tracklist/'.
        """
        self.url = url
        self.tracklist_id = self.url.split('tracklist/')[1].split('/')[0]  # Get id from url

        # Defaults so every documented attribute exists before fetch().
        self.title = None
        self.tracks = []
        self.date_recorded = None
        self.interaction_details = {}
        self.num_tracks = None
        self.num_tracks_IDed = None
        self.genres = []
        self.DJs = []
        self.sources = {}
        self.track_nums = []
        self.cues = []

        if fetch:
            self.fetch()

    def fetch(self):
        """ Load Tracklist details from url. """
        # Imported here so this module can be imported on its own.
        from .scraper import get_soup

        self._load_from_soup(get_soup(self.url))

    def _load_from_soup(self, soup):
        """ Populate title, metadata, tracks, track numbers and cues from a parsed page. """
        left_pane = soup.find("div", id="left")
        self.title = soup.title.text if soup.title is not None else None
        self.load_metadata(left_pane)

        # Load tracks into list
        self.tracks = []
        for track_div in soup.find_all('div', {'class': 'tlpItem'}):  # div objects for tracks
            track = Track(track_div)  # TODO Get external links?
            linked = track_div.find('i', {'title': 'mashup linked position'}) is not None
            if linked and self.tracks:
                # Track part of mashup -> attach to previous entry in tracklist
                self.tracks[-1].add_subsong(track)
            else:
                # Also covers a linked item with no previous entry, which
                # would otherwise fail on self.tracks[-1].
                self.tracks.append(track)

        # Load track numbers and cues (adapted from https://github.com/globalnomad/quickCUE/blob/master/quickCUE.py)
        number_spans = soup.find_all('span', id=re.compile('_tracknumber_value'))
        self.track_nums = [span.text.strip() for span in number_spans]
        self.cues = [div.text.strip() for div in soup.find_all('div', class_='cueValueField')]

    def __repr__(self):
        return f"Tracklist({self.tracklist_id})"

    def __str__(self):
        # title is None until the page has been fetched
        return self.title if self.title is not None else repr(self)

    def load_metadata(self, left_pane):
        """ Load metadata on tracklist from left pane of page (DJ, source, date recorded, etc.).

        Parameters
        ----------
        left_pane : bs4 element
            The element found with ``soup.find("div", id="left")``.

        Every metadata attribute is assigned; parts of the pane that are
        missing or malformed produce None / empty containers.
        """
        try:
            date_marker = left_pane.find("span", title="tracklist recording date")
            self.date_recorded = date_marker.parent.parent.select("td")[1].text
        except (AttributeError, IndexError):
            self.date_recorded = None

        self.interaction_details = {}
        for detail in left_pane.find_all("meta", itemprop="interactionCount"):
            try:
                name, sep, count = detail["content"].strip().rpartition(":")
                if sep:
                    self.interaction_details[name] = int(count)
            except (KeyError, ValueError):
                continue  # entry is not of the form "name:integer"

        self.num_tracks, self.num_tracks_IDed = self._parse_id_counts(left_pane.text)

        styles = left_pane.find("td", id="tl_music_styles")
        self.genres = styles.text.split(", ") if styles else []

        self.DJs = []
        found_sources = []  # (source type, source name) pairs, duplicates kept
        for table in left_pane.find_all("table", class_="sideTop"):
            link = table.find("a")
            href = link.get("href") if link is not None else None
            if not href:
                continue
            if "/dj/" in href:
                self.DJs.append(link.text)
            if "/source/" in href:
                try:
                    source_type = str(link.parent.parent.parent.find("td").contents[0])
                except (AttributeError, IndexError):
                    continue  # no readable source type next to the link
                found_sources.append((source_type, link.text))

        # Tracklist sources include event name, location, radio show, etc.
        # Each is keyed by its type plus a number, in case the tracklist has
        # several sources of one type (e.g. two radio shows do a collab and
        # both appear on the 1001Tracklists page).
        self.sources = {}
        for source_type, source_name in found_sources:
            number = 0
            while f"{source_type}{number}" in self.sources:
                number += 1
            self.sources[f"{source_type}{number}"] = source_name

    @staticmethod
    def _parse_id_counts(pane_text):
        """ Return (total tracks, IDed tracks) from the text between 'IDed' and 'short', or (None, None). """
        try:
            summary = pane_text.split('IDed')[1].split('short')[0].strip()
            parts = summary.split(" / ")
            ided = parts[0]
            total = int(parts[1].split(' ')[0])
            return total, (total if ided == 'all' else int(ided))
        except (IndexError, ValueError):
            return None, None


class Artist:
    """ An object representing an artist on 1001tracklists.com

    Attributes
    ----------
    name : str
        Artist's name
    url : str
        Link to artist page on 1001tracklists.com, or None if no page exists
    artist_id : str
        Internal 1001tracklists.com ID for artist, or None if not available

    """

    # TODO
    #   handle subartists e.g. &, vs.
    #   fetch() method

    def __init__(self, span=None, name=None):
        """ Initialize from bs4.span containing artist name and link, or artist name if span not available

        Raises
        ------
        ValueError
            If neither ``span`` nor ``name`` is given.
        """
        if not name and span is not None:
            name = span.text
        if name is None:
            raise ValueError("Artist needs a span or a name")
        self.name = name

        link = span.find('a') if span is not None else None
        self.url = link['href'] if link is not None else None
        self.artist_id = _id_from_url(self.url, 'artist/')

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name


class Label:
    """ An object representing a label on 1001tracklists.com

    Attributes
    ----------
    name : str
        Label name
    url : str
        Link to label page on 1001tracklists.com, or None if no page exists
    label_id : str
        Internal 1001tracklists.com ID for label, or None if not available
    """

    # TODO
    #   handle sublabels better
    #   fetch() method

    def __init__(self, td):
        """ Initialize from bs4 element containing label name and link """
        self.name = td.text
        link = td.find('a')
        self.url = link['href'] if link is not None else None
        self.label_id = _id_from_url(self.url, 'label/')
        # If sublabel (e.g. "DHARMA (SPINNIN')") keeps full name ("DHARMA (SPINNIN')") and link to sublabel (DHARMA)

    def __str__(self):
        return self.name

    def __repr__(self):
        return "Label(" + (self.label_id if self.label_id else self.name) + ")"


class Track:
    """An object representing a track on 1001tracklists.com

    Attributes
    ----------
    url : str
        Link to track page, or None if not listed
    track_id : str
        Internal 1001tracklists.com ID for the track (currently always None:
        the id is not extracted from the tracklist page)
    full_title : str
        Full name of track (e.g. 'Tiësto & DallasK - Show Me')
    title : str
        Track title (e.g. 'Seven Nation Fun (Holl & Rush Mashup)')
    full_artist : str
        Full name of track artist, including any featured artists (e.g. 'Tiësto & Dzeko ft. Lena Leon')
    artist : Artist
        Artist who recorded track (not including any featured artists)
    genre : str
        Genre associated with the track or None if not listed
    duration : str
        The duration of the track in ISO 8601 date format or None if not listed
    labels : list[Label]
        Label that released track, multiple labels in case Track is a mashup
    subsongs : list[Track]
        If Track is a mashup, list of tracks it includes
    external_ids : dict{str : str}
        Service name -> player id, filled by fetch_external_ids()
    """

    # TODO
    #   fetch()
    #   Remix information
    #   Parse featured artist(s)
    #   Handle reworks

    def __init__(self, soup, fetch=False):
        """ Initialize Track from bs4.div with itemprop="tracks".

        ``fetch`` is accepted for interface compatibility and is not used.
        """

        # Get basic info
        self.full_title = soup.find('span', {'class': "trackValue"}).text.strip().replace('\xa0', ' ')
        artist_part, separator, title_part = self.full_title.partition(' - ')
        if separator:
            self.full_artist, self.title = artist_part, title_part
        else:
            # No "artist - title" separator: treat the whole text as the title.
            self.full_artist, self.title = '', self.full_title

        # Get info from metadata
        # keys: 'name', 'byArtist', 'publisher', 'duration', 'genre', 'url'
        meta_data = {}
        for meta in soup.find_all('meta'):
            key, value = meta.get('itemprop'), meta.get('content')
            if key is not None and value is not None:
                meta_data[key] = value
        self.genre = meta_data.get('genre')
        self.duration = meta_data.get('duration')
        self.url = meta_data.get('url')

        # The id is not available on the tracklist page; keeping the
        # attribute defined lets fetch()/fetch_external_ids() return early.
        self.track_id = None
        self.external_ids = {}

        # Get track artist
        self.artist = None
        artist_marker = soup.find('span', {'title': 'open artist page'})
        if artist_marker is not None and artist_marker.parent is not None:
            try:
                self.artist = Artist(span=artist_marker.parent)
            except (AttributeError, KeyError):
                pass  # malformed artist link -> use the fallback below
        if self.artist is None:
            # No artist link available -> use artist name from metadata or self.full_artist
            self.artist = Artist(name=meta_data.get('byArtist') or self.full_artist)

        # Get label details (store in list because can have multiple labels)
        self.labels = [Label(label) for label in soup.find_all('span', {'title': 'label'})]

        # Prepare mashup details
        self.subsongs = []  # Will be populated if track is mashup

    def add_subsong(self, subsong):
        """
        Add original tracks which were used in mashup.

        subsong : Track
            Original track used in mashup
        """
        self.subsongs.append(subsong)

    def __str__(self):
        return self.title + ' by ' + str(self.artist)

    def __repr__(self):
        return 'Track(' + self.full_title + ')'

    def fetch(self):
        """ Fetch track details from track page. Does nothing without a track_id. """
        if not self.track_id:
            return

        # Imported here so this module can be imported on its own.
        from .scraper import get_soup

        url = 'https://www.1001tracklists.com/track/' + str(self.track_id)
        get_soup(url)
        # TODO get other information

    def fetch_external_ids(self):
        """ Fetch external link ids for track. Does nothing without a track_id. """
        if not self.track_id:
            return

        # Imported here so this module can be imported on its own.
        from .scraper import get_json

        # Request all medialinks from 1001tl-API
        url = f"https://www.1001tracklists.com/ajax/get_medialink.php?idObject=5&idItem={self.track_id}"
        self._store_external_ids(get_json(url))

    def _store_external_ids(self, response):
        """ Fill self.external_ids from a decoded medialink response. """
        self.external_ids = {}
        if not response.get("success"):
            return
        for elem in response.get("data", []):
            source = str(elem.get("source"))
            if source not in SOURCES:
                print("Source: ", elem.get("source"), "not defined.")
                continue
            if "playerId" in elem:
                self.external_ids[SOURCES[source]] = elem["playerId"]