├── .gitignore
├── README.md
├── img
    ├── example.png
    └── mirrorfulLogo.png
├── requirements.txt
└── visualizeSongs.py


/.gitignore:
--------------------------------------------------------------------------------
1 | **p3_env
2 | 
3 | .env
4 | 
5 | .DS_Store
6 | 
7 | 
8 | 
9 | *.xlsx


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 🎵 Setlist Visualizer
 2 | 
 3 | ![graph](./img/example.png)
 4 | 
 5 | ### Introduction
 6 | 
 7 | A Python script that scrapes setlist.fm for a given band and visualizes their setlists for a given tour.
 8 | 
 9 | _Setlist Visualizer is one of the first things I ever "programmed." There's cool functionality here that could be improved upon. I come back to it every few years after my favorite artists go on tour._
10 | 
11 | ## Usage:
12 | 
13 | ### 1. Install dependencies:
14 | 
15 | ```
16 | mkdir p3_env; python3 -m venv p3_env; source p3_env/bin/activate; pip3 install -r requirements.txt
17 | ```
18 | 
19 | ### 2. Change constants:
20 | 
21 | At the top of the script, change the relevant constants.
22 | 
23 | ### 3. Then, get the data in xlsx format
24 | 
25 | `python3 visualizeSongs.py`
26 | 
27 | ### 4. Finally, run the same command again to visualize the data (once the xlsx file exists, the script skips scraping and plots it)
28 | 
29 | `python3 visualizeSongs.py`
30 | 
31 | ### Acknowledgements:
32 | 
33 | Author of original setlist.fm scraping: the talented **Ryan Lee Watts**
34 | 
35 | Github: https://github.com/ryanleewatts
36 | 
37 | Script: https://github.com/ryanleewatts/coding-project/blob/master/scraper/SetlistScript.py
38 | 
39 | ## Sponsored by
40 | 
41 | Setlist Visualizer is proudly sponsored by [Magic Patterns](https://www.magicpatterns.com/), an AI design tool.
42 | 


--------------------------------------------------------------------------------
/img/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexdanilowicz/Setlist-Visualizer/9afc46400c7a67626886cec63803db936178ea09/img/example.png


--------------------------------------------------------------------------------
/img/mirrorfulLogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexdanilowicz/Setlist-Visualizer/9afc46400c7a67626886cec63803db936178ea09/img/mirrorfulLogo.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | beautifulsoup4==4.8.0
 2 | bs4==0.0.1
 3 | certifi==2023.7.22
 4 | chardet==3.0.4
 5 | contourpy==1.0.5
 6 | cycler==0.10.0
 7 | et-xmlfile==1.0.1
 8 | fonttools==4.37.3
 9 | idna==2.8
10 | jdcal==1.4.1
11 | kiwisolver==1.1.0
12 | lxml==4.4.1
13 | matplotlib==3.6.0
14 | numpy==1.23.3
15 | openpyxl==3.0.7
16 | packaging==21.3
17 | pandas==1.5.0
18 | Pillow==10.0.1
19 | pyparsing==2.4.2
20 | python-dateutil==2.8.2
21 | pytz==2022.2.1
22 | requests==2.31.0
23 | six==1.12.0
24 | soupsieve==1.9.3
25 | urllib3==1.26.18
26 | xlrd==1.2.0
27 | 


--------------------------------------------------------------------------------
/visualizeSongs.py:
--------------------------------------------------------------------------------
  1 | # Author of original setlistfm scraping: ryanleewatts
  2 | # Github: https://github.com/ryanleewatts
  3 | # Original script: https://github.com/ryanleewatts/coding-project/blob/master/scraper/SetlistScript.py
  4 | #
  5 | # Example Usage:
  6 | #	python3 visualizeSongs.py
  7 | #
  8 | # Author: Alex Danilowicz
  9 | # 	Wrote for fun as a summer personal project.
 10 | #	Started as just a way to see what Radiohead songs would be played...
 11 | #
 12 | # The first run scrapes and writes the .xlsx file; once that file exists, running the script again plots it via visualize_album().
 13 | 
 14 | from bs4 import BeautifulSoup as bs
 15 | import requests
 16 | import pandas as pd
 17 | from matplotlib.pyplot import cm
 18 | import matplotlib.pyplot as plt
 19 | import matplotlib.ticker as mtick
 20 | import numpy as np
 21 | import random
 22 | from collections import defaultdict
 23 | from pathlib import Path
 24 | 
# THINGS YOU MUST CHANGE
# Artist name; used in the spreadsheet file name (FILE) and the output PNG name.
ARTIST = "The National"
# Artist page slug appended to the setlist.fm URLs built in scrape().
# It must contain ".html", otherwise scrape() raises before doing anything.
UNIQUE = "the-national-53d69b79.html" # MAKE SURE TO INCLUDE .html part. This is unique for the band
# Substring of the setlist URL at which link collection stops (that URL is still considered).
URL_TO_STOP_AT = "/auditorium-theatre-chicago-il-53b9d7a9.html" # Note: get rid of HTTPS part
# Substring of the first setlist URL to collect; links seen before a match are ignored.
# An empty string matches every URL; "html" presumably does too, so either starts at the first link.
URL_TO_START_AT = "html" # this url will be the first one to be scraped. If you want first one, put in nothing or html
# True: album colours come from returnCustomAlbumDict(); False: random hex colours.
CUSTOM = True # if true, be sure to define returnCustomAlbumDict
# Only setlist URLs containing this substring are collected.
TOUR_DEFINITION = "/setlist/the-national/2023"

# OPTIONAL THINGS TO CHANGE
# Used in the spreadsheet file name (FILE) and the output PNG name.
YEAR = "2023"
SORT_ALBUM = False # toggle if you want to sort by album or not. if false, sorts by count
# Spreadsheet written by scrape(); when it already exists, scrape() plots it instead of scraping.
FILE = ARTIST + "-Data" + "-" + YEAR +".xlsx" # filename
# Chart title.
TITLE = "The National's 2023 Tour, so far... "
# Song names (exact match after stripping whitespace) left out of the scraped data.
SONGS_TO_IGNORE = ["I Can't Forget"]
MAX_PAGES = 10000 # max to scrape, not used if URL_TO_STOP set properly
# Font size of the play-count number drawn next to each bar.
FONT_SIZE_TICKS = 8
# Font size of the y-axis tick labels.
FONT_Y = 10 # for labels
# NOTE(review): the functions visible in this file build their own local dicts of
# the same name; this module-level one only appears in commented-out code — confirm before removing.
color_album_dict = {}
 43 | 
 44 | def scrape():
 45 | 	if ('.html' not in UNIQUE):
 46 | 		raise Exception("You must add .html to the unique var otherwise you will stuck in an infinite loop")
 47 | 
 48 | 	UNIQUE_URL = "https://www.setlist.fm/setlists/" + UNIQUE + "?page="
 49 | 	SONG_URL = "https://www.setlist.fm/stats/songs/" + UNIQUE + "?song="  # notice difference: /stats/
 50 | 	visited = {} # key song, album is value
 51 | 	links = []
 52 | 	dm = []
 53 | 	my_file = Path("./" + FILE)
 54 | 	if not my_file.is_file(): # only do scraping if file doesn't exist already
 55 | 		break_bool = False
 56 | 		start = False
 57 | 		for i in range(MAX_PAGES):
 58 | 			if break_bool:
 59 | 				break
 60 | 			url = UNIQUE_URL + str(i + 1)
 61 | 			print('🎉 Page ', url)
 62 | 			r = requests.get(url)
 63 | 			soup = bs(r.content, "lxml")
 64 | 			#print('🍲 Soup', soup)
 65 | 			for link in soup.find_all('a', class_='summary url'):
 66 | 				setlist = (link.get('href'))
 67 | 				completeurl = 'http://www.setlists.fm' + setlist[2:]
 68 | 				if URL_TO_START_AT in completeurl:
 69 | 					start = True
 70 | 				if start:
 71 | 					print("Considering url: ", completeurl)
 72 | 					if completeurl not in links:
 73 | 						if TOUR_DEFINITION in completeurl:
 74 | 							print("ℹ️ Getting url: " + completeurl) # print the output
 75 | 							links.append(completeurl)
 76 | 				# stop at this url
 77 | 				if URL_TO_STOP_AT in completeurl:
 78 | 					break_bool = True
 79 | 					break # stop at this setlist
 80 | 
 81 | 		# Scrape every url in that list
 82 | 		for item in links:
 83 | 			print('🎹 Looking at setlist for: ', item)
 84 | 			# 1. Scrape the date
 85 | 			r = requests.get(item)
 86 | 			soup = bs(r.content, "lxml")
 87 | 			for datehtml in soup.find_all('em', class_='link', text=True):
 88 | 				date = datehtml.text[:-7]
 89 | 				date = date.partition(",")[0]
 90 | 				date = date.replace(" ", "")
 91 | 
 92 | 			# 2. Scrape the setlist
 93 | 			songs = []
 94 | 			for songHTML in soup.find_all('div', class_='songPart'):
 95 | 				songstext = songHTML.text
 96 | 				# hardcoded these, cause too lazy to put into list
 97 | 				# skip the intro/outro songs that are always there
 98 | 				thesong = songstext.encode('utf-8').rstrip().strip().decode("utf-8")
 99 | 				if thesong not in SONGS_TO_IGNORE:
100 | 					songs.append(thesong)
101 | 
102 | 			#3. Scrape the album
103 | 			for song in songs:
104 | 				if str(song) not in visited:
105 | 					print('Getting the song 🎵', song)
106 | 					# hardcoded this one for Radiohead cause url format is wonky, can fix later
107 | 					if "2 + 2 = 5" in song:
108 | 						r = requests.get("https://www.setlist.fm/stats/songs/radiohead-bd6bd12.html?song=" + "2+%2B+2+%3D+5")
109 | 					else:
110 | 						r = requests.get(SONG_URL + song)
111 | 					soup = bs(r.content, "lxml")
112 | 
113 | 					thenext = False
114 | 					for album in soup.find_all('span'):
115 | 						if thenext:
116 | 							thealbum = album.text
117 | 							thealbum = thealbum.replace("(Album)", "")
118 | 							thealbum = thealbum.replace("(Single)", "")
119 | 							thealbum = thealbum.strip("'")
120 | 							thealbum = thealbum.rstrip()
121 | 							# harcoded but you could just see if the key is in th
122 | 							# if thealbum not in returnCustomAlbumDict(color_album_dict):
123 | 							# 	thealbum = "Other"
124 | 							if (str(song) == "Deep End"):
125 | 								visited[str(song)] = "New Album"
126 | 							if (str(song) == "Space Invader (Threaded Gold)"):
127 | 								visited[str(song)] = "New Album"
128 | 							if (str(song) == "Coat on a Hook"):
129 | 								visited[str(song)] = "New Album"	
130 | 							if (str(song) == "Tour Manager"):
131 | 								visited[str(song)] = "New Album"		
132 | 							else:
133 | 								visited[str(song)] = thealbum
134 | 							break
135 | 						if album.text == "From the release": # album name falls under this span
136 | 							thenext = True
137 | 				else:
138 | 					thealbum = visited[str(song)]
139 | 
140 | 				try:
141 | 					dm.append([date, song, thealbum])
142 | 				except:
143 | 					print("skipping over this song")
144 | 
145 | 		df = pd.DataFrame(dm, columns=['Date', 'Track', 'Album'])
146 | 
147 | 		df.to_excel(FILE, index=False)
148 | 	else:
149 | 		visualize_album()
def return_original_df():
	"""Load sheet "Sheet1" of the spreadsheet named by FILE as a DataFrame."""
	scraped = pd.read_excel(FILE, sheet_name="Sheet1")
	return scraped
152 | 
def create_clean_df():
	"""Summarise the scraped sheet into one row per track.

	Prints the number of distinct tracks, then returns ``(total, frame)``:
	``total`` is the number of distinct values in the Date column and ``frame``
	is indexed by Track with the columns Count_Played (distinct dates the track
	appears on) and Album (the album from the track's first row).
	"""
	source = return_original_df()
	snapshot = source.copy()

	print(len(snapshot['Track'].unique()))
	concert_total = len(snapshot['Date'].unique())

	# Distinct-value counts per track; only the Date count is kept.
	per_track = source.groupby(source['Track']).nunique()
	per_track = per_track.drop(columns='Album')
	per_track = per_track.rename(columns={'Date': 'Count_Played'})

	# One album per track: the first occurrence wins.
	track_albums = source[['Track', 'Album']].set_index('Track')
	is_first_row = ~track_albums.index.duplicated(keep='first')
	track_albums = track_albums[is_first_row]

	summary = pd.merge(per_track, track_albums, left_index=True, right_index=True)
	return (concert_total, summary)
173 | 
def visualize_album():
	"""Draw, save and show the horizontal bar chart of how often each track was played.

	Bars are coloured by album using return_color_album_dict(). Albums without
	a colour in that dict, and rows with no album at all, are grouped under an
	"Other" entry so that an incomplete custom colour dict does not raise a
	KeyError. Rows are sorted by album then count when SORT_ALBUM is true,
	otherwise by count then album. The figure is written to
	"./Visual-<ARTIST><order suffix><YEAR>.png" and then displayed.
	"""
	(total, unique_df) = create_clean_df()

	fallback_album = "Other"
	fallback_color = '#999999' # neutral grey
	albums = unique_df['Album'].fillna(fallback_album)
	color_album_dict = return_color_album_dict(albums.unique().tolist())

	# pd.Categorical turns any value outside its categories into NaN, which
	# has no colour; relabel such albums before building the categorical.
	uncovered = ~albums.isin(list(color_album_dict))
	if uncovered.any():
		albums = albums.mask(uncovered, fallback_album)
		color_album_dict.setdefault(fallback_album, fallback_color)

	unique_df['Album'] = pd.Categorical(albums, list(color_album_dict)) # order it by dictionary

	if SORT_ALBUM: # sort by album, then count
		unique_df = unique_df.sort_values(['Album', 'Count_Played'], ascending=True)
		ORDERCOUNT = ""
	else:
		ORDERCOUNT = "-OrderedByCount"
		unique_df = unique_df.sort_values(['Count_Played', 'Album'], ascending=True)

	# convert to percentages
	unique_df['Frequency'] = unique_df['Count_Played'].div(total).multiply(100)

	# One colour per bar, in plotting order.
	# https://github.com/pandas-dev/pandas/issues/16822#issuecomment-1257284602
	c = [color_album_dict[val] for val in unique_df['Album']]
	ax = unique_df.plot.barh(y='Frequency', color=c)

	format(ax, color_album_dict, total, unique_df)

	plt.savefig("./Visual-" + ARTIST + ORDERCOUNT + YEAR + ".png", format='png', dpi=1200)
	plt.show()
203 | 
def format(ax, color_dict, total, df):
	"""Style the bar chart on ``ax``: legend, axis labels, title, footer and bar labels.

	Note that this function shadows the built-in ``format`` within the module.

	ax: axes holding the horizontal bars.
	color_dict: album name -> colour; one legend entry is drawn per item.
	total: number of concerts; shown in the x label and used to turn each
		bar's percentage back into a play count.
	df: the plotted frame; its row count is shown in the y label.
	"""
	font_name = 'Helvetica'
	plt.rcParams["font.family"] = font_name
	font_kwargs = {'fontname': font_name}

	# Invisible zero-length lines with a round marker serve as legend handles.
	legend_handles = []
	for album_color in color_dict.values():
		handle = plt.Line2D([0,0], [0,0], color=album_color, marker='o', linestyle='')
		legend_handles.append(handle)
	plt.legend(legend_handles, color_dict.keys(), numpoints=1, fontsize='7')

	# x axis: percentage ticks such as '40%'
	ax.set_xlabel(f"Frequency ({total} concerts total)", **font_kwargs)
	percent_ticks = mtick.FormatStrFormatter('%.0f%%')
	ax.xaxis.set_major_formatter(percent_ticks)
	plt.xticks(fontname=font_name)

	# y axis, title and footer credit
	track_count = len(df.index)
	ax.set_ylabel(f"Track ({track_count})", **font_kwargs)
	plt.yticks(fontsize=FONT_Y, fontname=font_name)
	plt.title(TITLE, fontsize=16, **font_kwargs)
	plt.figtext(0.5, 0.01, 'Created by github.com/alexdanilowicz/Setlist-Visualizer', wrap=True, horizontalalignment='center', fontsize=7, **font_kwargs)

	# Write the play count just past the end of every bar.
	for bar in ax.patches:
		bar_width = bar.get_width()
		times_played = round((bar_width*total/100), 1)
		label = str(times_played).replace(".0", "")
		ax.text(bar_width+.3, bar.get_y()+.38, label, fontsize=FONT_SIZE_TICKS, color='dimgrey', **font_kwargs)

	ax.invert_yaxis()
	plt.tight_layout()
232 | 
233 | 
234 | # helper function to sort by date
def sorting(date):
	"""Return the digit characters of ``date``, read together as one integer."""
	digits_only = "".join(filter(str.isdigit, date))
	return int(digits_only)
238 | 
def return_color_album_dict(albums_list):
	"""Return a dict mapping album name to a hex colour string.

	With CUSTOM set, the mapping comes from returnCustomAlbumDict() and
	``albums_list`` is not used; make sure that dict covers every album.
	Otherwise each album in ``albums_list`` gets a random six-digit hex colour.
	"""
	# HERE YOU CAN SPECIFY ALBUM COLORS SO THEY FIT YOUR ARTIST's ALBUM ARTWORK
	if CUSTOM:
		return returnCustomAlbumDict({})

	# otherwise, just get random ugly hex colors
	hex_digits = '0123456789ABCDEF'
	random_colors = [
		"#" + ''.join(random.choice(hex_digits) for _ in range(6))
		for _ in range(len(albums_list))
	]
	return dict(zip(albums_list, random_colors))
258 | 
def returnCustomAlbumDict(color_album_dict):
	"""Add the hand-picked album colours to ``color_album_dict`` and return it.

	The dict is modified in place; existing entries for these albums are
	overwritten.
	"""
	album_colors = (
		("Boxer", '#79B791'), # lime green
		("Trouble Will Find Me", '#96C9DC'), # sky blue
		("Alligator", '#F06C9B'), # dark pink
		("Cherry Tree", '#F5D491'), # yellow
		("High Violet", '#666A86'), # purplish
		("Sleep Well Beast", '#333333'), # grey
		("I Am Easy to Find", '#61A0AF'), # dark blue
		("First Two Pages of Frankenstein", '#D7C2D1'), # pink
		("New Album", '#FFA500'), # orange
		("Sad Songs for Dirty Lovers", '#880808'), # red
	)
	for album_name, hex_color in album_colors:
		color_album_dict[album_name] = hex_color

	return color_album_dict
272 | 
# Script entry point. scrape() writes the spreadsheet on a first run; when the
# spreadsheet already exists it plots the data instead.
if __name__ == "__main__":
	scrape()
275 | 


--------------------------------------------------------------------------------