├── .gitignore
├── filter_square_brackets.sh
├── filter_whisps.sh
├── prose
│   └── README.md
├── notebook_helpers.py
├── parse_infgen.py
├── pop_prose_splits.py
├── common.py
├── add_genre.py
├── god_frame.py
├── README.md
├── billboard_scrape.py
├── lyrics_scrape.py
├── Lyrics.py
├── retry_lyrics_scrape.py
├── badromance_infgen.txt
└── normalizer.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | *.pickle
3 | *.pyc
--------------------------------------------------------------------------------
/filter_square_brackets.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | bads=`grep '\[' lyrics/*.txt | cut -d ':' -f 1 | sort | uniq`
4 | nbad=`echo "$bads" | wc -l`
5 | echo "Moving $nbad braced lyrics files"
6 | for bad in $bads
7 | do
8 |     mv $bad bad_lyrics/braces/
9 |     rm $bad.gz
10 | done
--------------------------------------------------------------------------------
/filter_whisps.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Filter out lyrics files that are too small
4 |
5 | for song in lyrics/*.txt
6 | do
7 |     chars=`wc -c < "$song"`
8 |     if (( chars < 3 ))
9 |     then
10 |         mv "$song" bad_lyrics/whisps/
11 |     elif (( chars < 40 ))
12 |     then
13 |         mv "$song" bad_lyrics/questionable/
14 |     fi
15 | done
--------------------------------------------------------------------------------
/prose/README.md:
--------------------------------------------------------------------------------
1 | A few prose samples to compare against. (For answering the question: do pop lyrics compress better than other natural-language documents?)
2 |
3 | - poynton.txt: The Spoils of Poynton by Henry James
4 | - hansard.txt: excerpt from debates in the Canadian House of Commons from 2001 (source: http://www.isi.edu/natural-language/download/hansard/)
5 | - comments.txt: some Reddit comments from 2006-02
--------------------------------------------------------------------------------
/notebook_helpers.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import math
3 |
4 | import common
5 |
6 | BIAS_ADJUSTED_RATIO = 1
7 | #BIAS = 34.775287769
8 | # 10 for the header, 8 for the footer, 1 for the block prefix (really 3 bits, but I'm rounding up)
9 | BIAS = 10 + 8 + 1
10 | BASE = 2
11 |
12 | def get_frame(having_lyrics=False):
13 |     om = common.get_omnisong()
14 |     # Set this in both cases just for flexibility if I wanna turn adjustment on and off
15 |     om['raw_ratio'] = om['raw'] / om['comp']
16 |     if BIAS_ADJUSTED_RATIO:
17 |         om['unbiased_ratio'] = om['raw'] / (om['comp']-BIAS)
18 |     else:
19 |         om['unbiased_ratio'] = om['raw'] / om['comp']
20 |     om['year'] = om['date'].apply(lambda d: d.year)
21 |     om['yearf'] = om['date'].apply(lambda d: d.year + d.month/12 + d.day/365)
22 |     # Ratio = the one calculated using infgen (set in god_frame.py)
23 |     om['rscore'] = om['ratio'].apply(lambda x: math.log(x, BASE))
24 |     if having_lyrics:
25 |         om = om[(om['raw'] > 2) & om['scraped']].copy()
26 |     return om
27 |
28 | def get_lyrics_frame():
29 |     return get_frame(True)
--------------------------------------------------------------------------------
/parse_infgen.py:
--------------------------------------------------------------------------------
1 | """
2 | Parse infgen output to compute a pre-Huffman compression ratio.
3 | """
4 |
5 | from __future__ import division
6 | import re
7 |
8 | fname = 'badromance_infgen.txt'
9 |
10 | def parse_ratio(f, verbose=False):
11 |     matches = 0
12 |     n_literals = 0
13 |     n_symbols = 0
14 |     uncomp = 0  # uncompressed size, from the block-average stats line
15 |     for line in f:
16 |         if line.startswith('match'):
17 |             _, length, dist = line.split()
18 |             matches += 1
19 |
20 |         m = re.match(r'! stats literals \d\.\d bits each \(\d+/(\d+)\)', line)
21 |         if m:
22 |             n_literals = int(m.group(1))
23 |
24 |         m = re.match(r'! stats total inout \d+:\d+ \((\d+)\)', line)
25 |         if m:
26 |             n_symbols = int(m.group(1))
27 |
28 |         m = re.match(r'! stats total block average (\d+)\.\d uncompressed', line)
29 |         if m:
30 |             uncomp = int(m.group(1))
31 |
32 |     if verbose:
33 |         print "{} matches, {} literals, {} symbols".format(matches, n_literals, n_symbols)
34 |         print "Uncompressed size = {} bytes".format(uncomp)
35 |     assert matches + n_literals == n_symbols
36 |
37 |     # 1 byte per literal, 3 bytes per match.
38 |     pseudosize = matches * 3 + n_literals
39 |     ratio = uncomp / pseudosize
40 |     if verbose:
41 |         print "{} / {} = {:.2f}".format(uncomp, pseudosize, ratio)
42 |     return (uncomp, pseudosize)
43 |
44 | if __name__ == '__main__':
45 |     with open(fname) as f:
46 |         parse_ratio(f, verbose=True)
47 |
--------------------------------------------------------------------------------
/pop_prose_splits.py:
--------------------------------------------------------------------------------
1 | import common
2 | import os
3 |
4 | N_SONGS = 100
5 | MIN_CHART_POS = 10
6 | PROSE_SOURCES = ['poynton', 'comments', 'hansard']
7 |
8 | def get_recent_keys(n=N_SONGS, min_pos=MIN_CHART_POS):
9 |     charts = common.get_chartdata()
10 |     found = set()
11 |     for chart in charts:
12 |         for song in chart[:min_pos]:
13 |             k = common.song_key(song)
14 |             if k in found:
15 |                 continue
16 |             if common.have_lyrics(song):
17 |                 found.add(k)
18 |                 if len(found) >= n:
19 |                     break
20 |
21 |         if len(found) >= n:
22 |             break
23 |     return found
24 |
25 | if __name__ == '__main__':
26 |     song_keys = get_recent_keys()
27 |     print "Loaded song keys to match against"
28 |     prosedir = 'prose'
29 |     prosefiles = {src: open(os.path.join(prosedir, src+'.txt'))
30 |                   for src in PROSE_SOURCES}
31 |     try:
32 |         os.mkdir(os.path.join(prosedir, 'fragments'))
33 |     except OSError:
34 |         pass
35 |     for prose_src in prosefiles:
36 |         try:
37 |             os.mkdir(os.path.join(prosedir, 'fragments', prose_src))
38 |         except OSError:
39 |             pass
40 |
41 |     for i, song_key in enumerate(song_keys):
42 |         fname = str(i)
43 |         size = os.path.getsize(os.path.join(common.LYRICS_DIR, song_key+'.txt'))
44 |         for prose_src, prosefile in prosefiles.iteritems():
45 |             acc = ''
46 |             # Accumulate prose lines until the fragment is within 5 bytes of (or exceeds) the lyrics file's size
47 |             while len(acc) < size - 5:
48 |                 acc += prosefile.readline()
49 |             with open(os.path.join(prosedir, 'fragments', prose_src, fname), 'w') as f:
50 |                 f.write(acc)
51 |
52 |
--------------------------------------------------------------------------------
/common.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import os
3 | import parse_infgen
4 |
5 | CHARTDATA_FILENAME = 'hot-100-chartdata.pickle'
6 | DB_FILENAME = 'hot-100.pickle'
7 | LYRICS_DIR = 'lyrics'
8 | OMNI_PICKLE_NAME = 'omnisongs.pickle'
9 |
10 | class NotScrapedException(Exception):
11 |     pass
12 |
13 | def have_lyrics(song):
14 |     k = song_key(song)
15 |     path = os.path.join(LYRICS_DIR, k+'.txt')
16 |     return os.path.exists(path)
17 |
18 | def song_key(song):
19 |     k = song.artist[:15] + '-' + 
song.title[:20] 20 | k = k.replace('/', '') 21 | return k.replace(' ', '_') 22 | 23 | def get_songdb(): 24 | with open(DB_FILENAME) as f: 25 | db = pickle.load(f) 26 | return db 27 | 28 | def get_omnisong(): 29 | with open(OMNI_PICKLE_NAME) as f: 30 | om = pickle.load(f) 31 | return om 32 | 33 | def get_chartdata(): 34 | with open(CHARTDATA_FILENAME) as f: 35 | cd = pickle.load(f) 36 | return cd 37 | 38 | def get_sizes(song_or_key): 39 | if isinstance(song_or_key, basestring): 40 | k = song_or_key 41 | else: 42 | k = song_key(song_or_key) 43 | path = os.path.join(LYRICS_DIR, k+'.txt') 44 | try: 45 | raw = os.path.getsize(path) 46 | except OSError: 47 | raise NotScrapedException 48 | comp = os.path.getsize(path+'.gz') 49 | return (raw, comp) 50 | 51 | def get_inf_sizes(song_or_key): 52 | """Return raw/compressed sizes used when calculating the infgen-based 53 | compression ratio. Raw size will be the same as above (i.e. just the 54 | number you'd get from `wc -c` on the text file). 55 | The compressed size will be an approximation of the size of the LZ-77 56 | compressed data *before* Huffman coding. Assumes 1 byte per literal, 3 57 | bytes per match.""" 58 | if isinstance(song_or_key, basestring): 59 | k = song_or_key 60 | else: 61 | k = song_key(song_or_key) 62 | path = os.path.join(LYRICS_DIR, k+'.txt.gz.infgen') 63 | with open(path) as f: 64 | return parse_infgen.parse_ratio(f) 65 | -------------------------------------------------------------------------------- /add_genre.py: -------------------------------------------------------------------------------- 1 | import common 2 | import os 3 | import pandas as pd 4 | import normalizer 5 | import time 6 | 7 | GENRE_FILE = 'msd_tagtraum_cd2.cls' 8 | SAVE = True 9 | 10 | omni = common.get_omnisong() 11 | omni['genre'] = None 12 | 13 | def build_trackid_to_genre(): 14 | f = open(GENRE_FILE) 15 | ttg = {} 16 | for line in f: 17 | if line.startswith('#'): 18 | continue 19 | fields = line.split('\t') 20 | trackid = fields[0] 21 | # Has a majority genre and an optional "minority genre". always take the majority. 
22 |         genre = fields[1].strip()
23 |         ttg[trackid] = genre
24 |     f.close()
25 |     return ttg
26 |
27 | def song_key(title, artist, bb=False):
28 |     if bb:
29 |         # billboard specific transforms
30 |         for clitic in ['s', 't', 'll']:
31 |             title = title.replace(' {} '.format(clitic), '{} '.format(clitic))
32 |     return tuple(map(normalizer.normalize_no_rotation, [title, artist]))
33 |
34 |
35 | t0 = time.time()
36 | # Build a mapping from existing title/artist pairs to index
37 | normalized_lookup = {}
38 | for i, (title, artist) in enumerate(omni[ ['title', 'artist'] ].values):
39 |     k = song_key(title, artist, bb=True)
40 |     normalized_lookup[k] = i
41 | print "Built normalized lookup in {:.1f} seconds".format(time.time()-t0)
42 | t0 = time.time()
43 |
44 | ttg = build_trackid_to_genre()
45 | print "Built genre lookup in {:.1f} seconds".format(time.time()-t0)
46 |
47 | found = 0
48 | with open('unique_tracks.txt') as f:
49 |     for line in f:
50 |         trackid, _, artist, title = line.split('<SEP>')
51 |         try:
52 |             genre = ttg[trackid]
53 |         except KeyError:
54 |             continue
55 |         k = song_key(title.strip(), artist)
56 |         try:
57 |             i = normalized_lookup[k]
58 |         except KeyError:
59 |             continue
60 |         omni.loc[i, 'genre'] = genre
61 |         found += 1
62 |
63 | print "Found {} genre labels out of {} songs".format(found, len(omni))
64 |
65 | if SAVE:
66 |     omni.to_pickle(common.OMNI_PICKLE_NAME)
67 |
--------------------------------------------------------------------------------
/god_frame.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import pandas as pd
3 | import os
4 | import re
5 |
6 | import common
7 |
8 | # this title should be a saint
9 | def canonize_title(title):
10 |     # Collapse whitespace, then restore the apostrophes Billboard strips from clitics.
11 |     trans = re.sub(r'\s+', ' ', title)
12 |     clitics = ['ll', 's', 't']
13 |     for clitic in clitics:
14 |         trans = trans.replace(' '+clitic, "'"+clitic)
15 |     return trans
16 |
17 | db = common.get_songdb()
18 | rows = []
19 | merged = 0
20 | for artist_discog in db.itervalues():
21 |     title_to_row = {}
22 |     for title, song in artist_discog.iteritems():
23 |         try:
24 |             raw, comp = common.get_sizes(song)
25 |             scraped = True
26 |             inf_raw, inf_comp = common.get_inf_sizes(song)
27 |             ratio = inf_raw / inf_comp
28 |             assert raw == inf_raw, "{} != {}".format(raw, inf_raw)
29 |         except common.NotScrapedException:
30 |             raw = comp = None
31 |             scraped = False
32 |             inf_comp = ratio = None
33 |         canon_title = canonize_title(title)
34 |         if canon_title not in title_to_row:
35 |             row = dict(artist=song.artist, title=canon_title, date=song.earliest,
36 |                        peak=song.peakPos, scraped=scraped,
37 |                        raw=raw, comp=comp, icomp=inf_comp, ratio=ratio,
38 |                        )
39 |             title_to_row[canon_title] = row
40 |         else:
41 |             # Got a dupe. Merge them.
42 |             merged += 1
43 |             extant = title_to_row[canon_title]
44 |             extant['peak'] = min(extant['peak'], song.peakPos)
45 |             extant['scraped'] = extant['scraped'] or scraped
46 |             extant['date'] = min(extant['date'], song.earliest)
47 |
48 |     rows.extend(title_to_row.values())
49 |
50 | print "Merged {} duplicate rows".format(merged)
51 |
52 | df = pd.DataFrame(rows)
53 | df['date'] = pd.to_datetime(df['date'])
54 | # Blargh. Can't do this with nullable col. http://stackoverflow.com/a/21290084/262271
55 | #df['raw'] = df['raw'].astype(int)
56 | #df['comp'] = df['comp'].astype(int)
57 | print "Saving god frame with shape {}".format(df.shape)
58 | df.to_pickle(common.OMNI_PICKLE_NAME)
59 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Investigating repetition in pop music. Interested in questions like:
2 |
3 | - Has pop music been getting more (or less) repetitive over time?
4 | - Which songs/artists/genres are the most/least repetitive?
5 |
6 | I'm measuring the repetitiveness of a song by how well gzip can compress its lyrics. (Which sounds cheeky, but I think it can actually be justified when you look at how Lempel-Ziv compression works.)
7 |
8 | The investigations from this repo were a precursor to a visual essay I did for Pudding.cool: [Are Pop Lyrics Getting More Repetitive?](https://pudding.cool/2017/05/song-repetition). The code for that essay lives at https://github.com/polygraph-cool/song-repetition
9 |
10 | ## Brief overview of calculating lyric compressibility
11 |
12 | 1. Put lyrics in text files (making sure they're ASCII encoded)
13 | 2. Compress those text files using gzip (I used the -9 flag to maximize compression)
14 | 3. Run [infgen](https://github.com/madler/infgen) on each gzip file, redirecting the output to a file. (See `badromance_infgen.txt` for an example of what one of these files looks like.)
15 | 4. Run `parse_ratio` from `parse_infgen.py` on those files. It returns a tuple of the original (uncompressed) and compressed sizes, in bytes/characters. Dividing the first by the second gives the compression ratio.
16 |
17 | Roughly speaking, `parse_ratio` calculates the compressed size using only the Lempel-Ziv part of the DEFLATE compression performed by gzip (and not the Huffman coding part); infgen is what lets us separate those two steps. The compressed size is calculated by treating a 'match' (i.e. a pointer back to an earlier portion of the text which is repeated) as costing 3 bytes. This is close to reality, and it gives intuitively reasonable results for my purposes. You can increase the cost (to put more emphasis on longer repeated sequences, and to avoid spurious matches on short character sequences) or decrease it - it shouldn't have a huge effect. There's a worked example at the end of this README.
18 |
19 | ### Lazy version
20 |
21 | Run steps 1 and 2 above, then just look at the ratio between the file sizes of the original (text) files and the gzip files. The disadvantage is that this also incorporates the Huffman coding step (which is not relevant to the natural sense of 'repetitiveness' of song lyrics) and adds a constant amount of overhead (from the gzip header and the Huffman tables), which can distort the rankings for very short texts. But overall, the rankings you get with this method will be pretty close to the ones from the more principled method above.
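
### Worked example

For the Bad Romance example in `badromance_infgen.txt`, infgen reports 2916 uncompressed bytes, 128 matches, and 331 literals. The pseudo-compressed size is therefore 128 * 3 + 331 = 715 bytes, and the compression ratio is 2916 / 715 ≈ 4.08.

Here's a sketch of the whole pipeline for one song, in Python 2 like the rest of the repo (the file name is hypothetical, but follows `common.song_key`'s naming scheme; it assumes `gzip` and the `infgen` binary are on your PATH):

    import subprocess
    import parse_infgen

    src = 'lyrics/Lady_Gaga-Bad_Romance.txt'  # hypothetical song_key-style name
    # Step 2: -9 for maximum compression, -k to keep the original .txt around
    subprocess.check_call(['gzip', '-9', '-k', src])
    # Step 3: infgen reads the gzip stream on stdin
    with open(src + '.gz') as gz, open(src + '.gz.infgen', 'w') as out:
        subprocess.check_call(['infgen'], stdin=gz, stdout=out)
    # Step 4: parse the infgen output into (uncompressed, pseudo-compressed) sizes
    with open(src + '.gz.infgen') as f:
        uncomp, pseudosize = parse_infgen.parse_ratio(f)
    print "ratio = {:.2f}".format(float(uncomp) / pseudosize)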
22 | -------------------------------------------------------------------------------- /billboard_scrape.py: -------------------------------------------------------------------------------- 1 | import billboard 2 | import time 3 | import pickle 4 | import datetime 5 | 6 | chartname = 'hot-100' 7 | DATE_FMT = '%Y-%m-%d' 8 | SLEEPYTIME = 1 9 | 10 | class SongDB(object): 11 | def __init__(self, path): 12 | self.path = path 13 | try: 14 | f = open(path) 15 | self.db = pickle.load(f) 16 | except IOError: 17 | self.db = {} 18 | 19 | def add_song(self, song, date): 20 | if song.artist not in self.db: 21 | self.db[song.artist] = {} 22 | 23 | artist_songs = self.db[song.artist] 24 | if song.title in artist_songs: 25 | extant = artist_songs[song.title] 26 | # Not strictly necessary if we're guaranteed to always 27 | # iterate in reverse chrono order, but doesn't hurt to 28 | # be safe. 29 | extant.weeks = max(extant.weeks, song.weeks) 30 | extant.peakPos = min(extant.peakPos, song.peakPos) 31 | extant.earliest = min(extant.earliest, date) 32 | else: 33 | song.earliest = date 34 | artist_songs[song.title] = song 35 | 36 | def save(self): 37 | with open(self.path, 'w') as f: 38 | pickle.dump(self.db, f) 39 | 40 | def size(self): 41 | n = 0 42 | for artist_songs in self.db.itervalues(): 43 | n += len(artist_songs) 44 | return n 45 | 46 | 47 | 48 | path = chartname + '.pickle' 49 | chartpicklename = chartname + '-chartdata.pickle' 50 | try: 51 | with open(chartpicklename) as f: 52 | charts = pickle.load(f) 53 | date = charts[-1].previousDate 54 | except IOError: 55 | charts = [] 56 | date = None 57 | 58 | db = SongDB(path) 59 | i = 0 60 | lim = float('inf') 61 | # TODO: load pickled charts 62 | try: 63 | while 1: 64 | chart = billboard.ChartData(chartname, date) 65 | dt = datetime.datetime.strptime(chart.date, DATE_FMT).date() 66 | 67 | for song in chart: 68 | db.add_song(song, dt) 69 | charts.append(chart) 70 | time.sleep(SLEEPYTIME) 71 | 72 | i += 1 73 | if not chart.previousDate or i >= lim: 74 | break 75 | if (i % 26) == 0: 76 | print date 77 | date = chart.previousDate 78 | except: # gotta catchemall 79 | if i == 0: 80 | raise 81 | print "Uh oh. Got unexpected exception. Saving whatever we've accumulated before bailing" 82 | db.save() 83 | with open(chartpicklename, 'w') as f: 84 | pickle.dump(charts, f) 85 | raise 86 | 87 | 88 | db.save() 89 | print "Saved db with {} songs to {}".format(db.size(), path) 90 | with open(chartpicklename, 'w') as f: 91 | pickle.dump(charts, f) 92 | 93 | -------------------------------------------------------------------------------- /lyrics_scrape.py: -------------------------------------------------------------------------------- 1 | # TODO: if you rerun this later, make sure you check against filenames in "bad_lyrics" dir, 2 | # to avoid scraping them twice 3 | import pickle 4 | import time 5 | import os 6 | 7 | import common 8 | import Lyrics 9 | 10 | PICKLE_NAME = 'hot-100.pickle' 11 | LYRICS_DIR = 'lyrics' 12 | SLEEPYTIME = 1 13 | EXT = '.txt' 14 | MAX_CHART_POS = 1000 # Only scrape songs that charted at least this high 15 | 16 | def unicode_unfuck(s): 17 | return ''.join(map(lambda c: chr(ord(c)), s)) 18 | 19 | def load_extant(d): 20 | keys = set() 21 | for fname in os.listdir(d): 22 | if fname.endswith(EXT): 23 | keys.add(fname[:-len(EXT)]) 24 | return keys 25 | 26 | with open(PICKLE_NAME) as f: 27 | db = pickle.load(f) 28 | 29 | i = 0 30 | lim = float('inf') 31 | # nvm. probably better just to use os.path.exists each time. 
we need to sleep
32 | # between requests anyways, so who cares if it's slower
33 | #extant = load_extant(LYRICS_DIR)
34 | malencoded = 0
35 | unchanged = 0
36 | with open('song_404s.txt', 'a+') as skips_file:
37 |     bad_keys = set([line.split('\t')[-1].strip() for line in skips_file])
38 |     skips_file.seek(0)
39 |     for artist in db:
40 |         for song in db[artist].itervalues():
41 |             if song.peakPos > MAX_CHART_POS:
42 |                 continue
43 |             k = common.song_key(song)
44 |             if k in bad_keys:
45 |                 continue
46 |             #if k in extant:
47 |             #    continue
48 |             path = os.path.join(LYRICS_DIR, k + EXT)
49 |             if os.path.exists(path):
50 |                 continue
51 |             try:
52 |                 lyrics, url = Lyrics.get_lyrics2(song)
53 |                 time.sleep(SLEEPYTIME)
54 |             except Lyrics.LyricsNotFoundException:
55 |                 time.sleep(SLEEPYTIME)
56 |                 print "Failed to find lyrics for {}".format(song)
57 |                 try:
58 |                     skips_file.write('\t'.join([song.artist, song.title, k]) + '\n')
59 |                 except UnicodeEncodeError:
60 |                     malencoded += 1
61 |                     continue
62 |
63 |                 #skipped.add( (song.artist, song.title) )
64 |                 continue
65 |             except Lyrics.URLNotChangedException:
66 |                 unchanged += 1
67 |                 continue
68 |             if len(lyrics) < 5:
69 |                 print "WARNING: Got suspiciously short lyrics for {} ({})".format(song, url)
70 |                 skips_file.write('\t' + '\t'.join([song.artist, song.title, k]) + '\n')
71 |                 continue
72 |             with open(path, 'w') as f:
73 |                 try:
74 |                     f.write(lyrics)
75 |                 except UnicodeEncodeError:
76 |                     # Blah blah fishcakes. Somehow got into a situation where, like, if there are multi-byte
77 |                     # unicode code points in the lyrics, we get each byte encoded in utf-8, rather than the
78 |                     # whole thing. TODO: should probably file a bug on... someone
79 |                     lyrics = unicode_unfuck(lyrics)
80 |                     f.write(lyrics)
81 |             i += 1
82 |             if i >= lim:
83 |                 break
84 |             if i % 100 == 0:
85 |                 print '.',
86 |         if i >= lim:
87 |             break
88 |
89 | print "Skipped {} malencoded songs".format(malencoded)
90 |
--------------------------------------------------------------------------------
/Lyrics.py:
--------------------------------------------------------------------------------
1 | # Modified version of Lyrics.py from this repo: https://github.com/bhrigu123/Instant-Lyrics
2 | # TODO: submit a patch?
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import sys
6 | import re
7 |
8 | try:
9 |     from urllib.parse import quote_plus
10 | except ImportError:
11 |     from urllib import quote_plus
12 |
13 | class LyricsNotFoundException(Exception):
14 |     pass
15 |
16 | class URLNotChangedException(Exception):  # caught by lyrics_scrape.py; nothing raises it yet
17 |     pass
18 |
19 | def get_metrolyrics(url):
20 |     resp = requests.get(url, headers={
21 |         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel '
22 |                       'Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, '
23 |                       'like Gecko) Chrome/55.0.2883.95 Safari/537.36'
24 |     })
25 |     if resp.status_code == 404:
26 |         raise LyricsNotFoundException
27 |     lyrics_html = resp.text
28 |
29 |     soup = BeautifulSoup(lyrics_html, "lxml")
30 |     raw_lyrics = soup.findAll('p', attrs={'class': 'verse'})
31 |     try:
32 |         final_lyrics = unicode.join(u'\n', map(unicode, raw_lyrics))
33 |     except NameError:
34 |         final_lyrics = str.join(u'\n', map(str, raw_lyrics))
35 |
36 |     final_lyrics = final_lyrics.replace('<br/>', '\n')  # line breaks within a verse
37 |     final_lyrics = final_lyrics.replace('<p class="verse">', ' ')
38 |     final_lyrics = final_lyrics.replace('</p>
', ' ') 39 | return (final_lyrics, url) 40 | 41 | def get_lyrics2(song): 42 | # Using google isn't really scalable. Looks like they're pretty serious about 43 | # detecting and blocking scrapers. 44 | # Have to just guess the URL for now :/ 45 | artist = song.artist.lower() 46 | # metrolyrics quirk. if artist is foo ft bar, url seems to always just have foo 47 | cleaved = False 48 | for feat in [' featuring', ' &', ' feat.']: 49 | feati = artist.find(feat) 50 | if feati != -1: 51 | artist = artist[:feati] 52 | cleaved = True 53 | if cleaved: 54 | if ',' in artist: 55 | artist = artist.split(',')[0].strip() 56 | if artist == 'n sync': 57 | artist = 'nsync' 58 | if artist == 'p!nk': 59 | artist = 'pink' 60 | title = song.title.lower().replace(' & ', ' and ') 61 | fragment = title + ' lyrics ' + artist 62 | # Lowercase islands seem to come up a lot in song titles like 63 | # "It Wasn t Me", or "I ll Be There" 64 | fragment = fragment\ 65 | .replace("'", "")\ 66 | .replace(' s ', 's ')\ 67 | .replace(' t ', 't ')\ 68 | .replace(' ll ', 'll ')\ 69 | .replace('-', '')\ 70 | .replace('#', '')\ 71 | .replace(".", "")\ 72 | .replace("& ", "")\ 73 | .replace('?', '')\ 74 | .replace('f**k', 'fuck') 75 | 76 | fragment = re.sub('\s+', ' ', fragment) 77 | fragment = fragment.replace(' ', '-') 78 | 79 | try: 80 | url = 'http://www.metrolyrics.com/{}.html'.format(fragment) 81 | except UnicodeEncodeError: 82 | raise LyricsNotFoundException 83 | return get_metrolyrics(url) 84 | 85 | def get_lyrics(song_name): 86 | 87 | song_name += ' site:metrolyrics.com' 88 | name = quote_plus(song_name) 89 | hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11' 90 | '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 91 | 'Accept-Language': 'en-US,en;q=0.8', 92 | 'Connection': 'keep-alive'} 93 | 94 | url = 'http://www.google.com/search?q=' + name 95 | 96 | result = requests.get(url, headers=hdr).text 97 | offset = 0 98 | lyrics_found = False 99 | while not lyrics_found: 100 | domain = 'http://www.metrolyrics.com' 101 | link_start = result.find(domain, offset) 102 | if link_start == -1: 103 | with open('err.html', 'w') as f: 104 | #result = ''.join(map(lambda c: chr(ord(c)), unicode(result)) 105 | f.write(result.encode('utf-8')) 106 | raise LyricsNotFoundException 107 | link_end = result.find('html', link_start + 1) 108 | offset = link_start+1 109 | 110 | link = result[link_start:link_end + 4] 111 | if 'lyrics' in link[len(domain):]: 112 | lyrics_found = True 113 | return get_metrolyrics(link) 114 | 115 | 116 | 117 | if __name__ == '__main__': 118 | song = ' '.join(sys.argv[1:]) 119 | lyrics = get_lyrics(song) 120 | print lyrics 121 | -------------------------------------------------------------------------------- /retry_lyrics_scrape.py: -------------------------------------------------------------------------------- 1 | # TODO: if you rerun this later, make sure you check against filenames in "bad_lyrics" dir, 2 | # to avoid scraping them twice 3 | import pickle 4 | import time 5 | import os 6 | import re 7 | 8 | import common 9 | import Lyrics 10 | 11 | PICKLE_NAME = 'hot-100.pickle' 12 | LYRICS_DIR = 'lyrics' 13 | SLEEPYTIME = 1 14 | EXT = '.txt' 15 | 16 | def unicode_unfuck(s): 17 | return ''.join(map(lambda c: chr(ord(c)), s)) 18 | 19 | def load_extant(d): 20 | keys = set() 21 | for fname in os.listdir(d): 22 | if fname.endswith(EXT): 23 | keys.add(fname[:-len(EXT)]) 24 | return keys 25 | 26 | class FakeSong(object): 27 | def __init__(self, artist, title): 28 | self.artist = artist 29 | 
self.title = title 30 | 31 | def __str__(self): 32 | return '{} - {}'.format(self.artist, self.title) 33 | 34 | artists_renamed = { 35 | 'Beyonce': 'Beyonce Knowles', 36 | 'Janet': 'Janet Jackson', 37 | 'India.Arie': 'India Arie', 38 | 'James Brown And The Famous Flames': 'James Brown', 39 | "Go-Go's": "The Gogos", 40 | } 41 | andy_artists = [ 42 | 'Peter And Gordon', 'Blood, Sweat & Tears', 'Captain & Tennille', 43 | 'Crosby, Stills & Nash', 44 | ] 45 | 46 | for andy in andy_artists: 47 | canon = andy.replace(' & ', ' ') 48 | canon = canon.replace(' And ', ' ') 49 | artists_renamed[andy] = canon 50 | 51 | def transformed_songs(song): 52 | trans = [] 53 | cp = lambda: FakeSong(song.artist, song.title) 54 | artist = song.artist 55 | title = song.title 56 | if '#' in artist or '#' in title: 57 | yield song 58 | if artist.startswith('The '): 59 | s = cp() 60 | s.artist = artist[len('The '):] 61 | yield s 62 | if artist.startswith('Gladys Knight '): 63 | s = cp() 64 | s.artist = 'Gladys Knight' 65 | yield s 66 | if artist == 'Earth, Wind & Fire': 67 | s = cp() 68 | s.artist = 'Earth Wind Fire' 69 | yield s 70 | if artist == 'Big & Rich': 71 | s = cp() 72 | s.artist = 'Big Rich' 73 | yield s 74 | if artist == 'Peaches & Herb': 75 | s = cp() 76 | s.artist = 'Peaches Herb' 77 | yield s 78 | if artist == 'Maroon5': 79 | s = cp() 80 | s.artist = 'Maroon 5' 81 | yield s 82 | if 'B****' in title: 83 | s = cp() 84 | s.title = title.replace('B****', 'Bitch') 85 | yield s 86 | if artist in artists_renamed: 87 | s = cp() 88 | s.artist = artists_renamed[artist] 89 | yield s 90 | 91 | # cause it has no parens. yuk yuk. 92 | orphaned = re.sub('\(.*\)', '', title) 93 | if orphaned != title: 94 | s = cp() 95 | s.title = orphaned 96 | yield s 97 | if artist.endswith(' s'): 98 | s = cp() 99 | s.artist = artist[:-2]+'s' 100 | yield s 101 | 102 | with open(PICKLE_NAME) as f: 103 | db = pickle.load(f) 104 | 105 | # nvm. probably better just to use os.path.exists each time. we need to sleep 106 | # between requests anyways, so who cares if it's slower 107 | #extant = load_extant(LYRICS_DIR) 108 | malencoded = 0 109 | with open('song_404s.txt') as to_retry: 110 | bad_keys = set([line.split('\t')[-1].strip() for line in to_retry]) 111 | 112 | with open('still_404s.txt', 'w') as skips_file: 113 | for artist in db: 114 | for orig_song in db[artist].itervalues(): 115 | k = common.song_key(orig_song) 116 | if k not in bad_keys: 117 | continue 118 | path = os.path.join(LYRICS_DIR, k + EXT) 119 | found = False 120 | for song in transformed_songs(orig_song): 121 | #print "Transformed {} to {}".format(orig_song, song) 122 | try: 123 | lyrics, url = Lyrics.get_lyrics2(song) 124 | time.sleep(SLEEPYTIME) 125 | except Lyrics.LyricsNotFoundException: 126 | time.sleep(SLEEPYTIME) 127 | continue 128 | if len(lyrics) < 5: 129 | continue 130 | else: 131 | found = True 132 | break 133 | if not found: 134 | try: 135 | skips_file.write('\t'.join([orig_song.artist, orig_song.title, k]) + '\n') 136 | except UnicodeEncodeError: 137 | malencoded += 1 138 | else: 139 | print "Success! {}".format(orig_song) 140 | with open(path, 'w') as f: 141 | try: 142 | f.write(lyrics) 143 | except UnicodeEncodeError: 144 | # Blah blah fishcakes. Somehow got into a situation where, like, if there are multi-byte 145 | # unicode code points in the lyrics, we get each byte encoded in utf-8, rather than the 146 | # whole thing. TODO: should probably file a bug on... 
someone 147 | lyrics = unicode_unfuck(lyrics) 148 | f.write(lyrics) 149 | 150 | print "Skipped {} malencoded songs".format(malencoded) 151 | -------------------------------------------------------------------------------- /badromance_infgen.txt: -------------------------------------------------------------------------------- 1 | ! infgen 2.4 output 2 | ! 3 | gzip 4 | ! 5 | last 6 | dynamic 7 | ! stats table 56:1 8 | litlen 10 6 9 | litlen 32 4 10 | litlen 33 8 11 | litlen 39 6 12 | litlen 40 7 13 | litlen 41 7 14 | litlen 44 7 15 | litlen 45 6 16 | litlen 65 8 17 | litlen 66 9 18 | litlen 67 9 19 | litlen 71 9 20 | litlen 73 7 21 | litlen 74 9 22 | litlen 76 9 23 | litlen 79 9 24 | litlen 82 8 25 | litlen 84 9 26 | litlen 87 8 27 | litlen 89 9 28 | litlen 97 4 29 | litlen 98 7 30 | litlen 99 6 31 | litlen 100 5 32 | litlen 101 4 33 | litlen 102 7 34 | litlen 103 6 35 | litlen 104 5 36 | litlen 105 5 37 | litlen 106 8 38 | litlen 107 7 39 | litlen 108 5 40 | litlen 109 6 41 | litlen 110 5 42 | litlen 111 5 43 | litlen 112 8 44 | litlen 114 5 45 | litlen 115 5 46 | litlen 116 5 47 | litlen 117 6 48 | litlen 118 7 49 | litlen 119 6 50 | litlen 120 9 51 | litlen 121 6 52 | litlen 122 9 53 | litlen 256 9 54 | litlen 257 4 55 | litlen 258 5 56 | litlen 259 5 57 | litlen 260 6 58 | litlen 262 7 59 | litlen 263 9 60 | litlen 264 7 61 | litlen 265 8 62 | litlen 266 6 63 | litlen 267 6 64 | litlen 268 6 65 | litlen 269 6 66 | litlen 270 6 67 | litlen 271 9 68 | litlen 272 9 69 | litlen 273 9 70 | litlen 274 8 71 | litlen 275 8 72 | litlen 276 7 73 | litlen 277 7 74 | litlen 278 9 75 | litlen 279 8 76 | litlen 280 9 77 | litlen 281 8 78 | litlen 283 9 79 | dist 2 5 80 | dist 3 6 81 | dist 4 5 82 | dist 5 7 83 | dist 6 7 84 | dist 7 6 85 | dist 8 3 86 | dist 9 5 87 | dist 10 3 88 | dist 11 4 89 | dist 12 4 90 | dist 13 5 91 | dist 14 4 92 | dist 15 5 93 | dist 16 5 94 | dist 17 4 95 | dist 18 4 96 | dist 19 3 97 | dist 20 6 98 | dist 21 4 99 | literal 10 'Oh-o 100 | match 10 3 101 | literal '! 
102 | match 14 16 103 | match 5 19 104 | literal 10 'Caught in a bad romance 105 | literal 10 106 | match 61 61 107 | literal 10 'Rah r 108 | match 3 4 109 | literal 'ah- 110 | match 5 3 111 | match 3 45 112 | literal 'Ro m 113 | match 4 22 114 | literal 'o- 115 | match 3 7 116 | match 4 4 117 | literal ' 118 | literal 10 'Gaga 119 | match 3 77 120 | literal 'la 121 | match 3 3 122 | match 3 35 123 | literal 'Want your 124 | match 59 78 125 | match 35 79 126 | literal 'I w 127 | match 9 26 128 | literal 'ugly 129 | match 13 18 130 | literal 'disease 131 | match 14 21 132 | literal 'everything 133 | literal 10 'As lo 134 | match 3 9 135 | literal 'as it's fre 136 | match 15 46 137 | literal 'lov 138 | match 3 18 139 | literal 'L 140 | match 3 6 141 | literal '-l 142 | match 8 5 143 | match 20 34 144 | match 14 120 145 | literal 'rama 146 | literal 10 'The touch of 147 | match 6 25 148 | literal 'hand 149 | match 15 62 150 | literal 'eather studded kiss 151 | match 4 390 152 | match 3 21 153 | literal ' s 154 | match 18 46 155 | match 19 142 156 | literal ', 157 | match 17 34 158 | literal '( 159 | match 32 34 160 | literal ') 161 | match 3 177 162 | literal 'You know 163 | match 3 107 164 | literal 'at 165 | match 11 34 166 | match 3 278 167 | match 3 119 168 | literal 'y 169 | match 15 30 170 | literal 'nee 171 | match 6 21 172 | match 8 144 173 | literal 'it 174 | match 5 395 175 | match 4 71 176 | match 28 405 177 | match 5 144 178 | match 17 200 179 | literal 'r 180 | match 3 389 181 | literal 'nge 182 | match 5 62 183 | match 5 30 184 | literal 'me could write 185 | match 16 644 186 | literal '( 187 | match 13 690 188 | match 3 536 189 | literal '!) 190 | match 23 102 191 | literal 'All 192 | match 10 19 193 | literal 'rs 194 | match 48 106 195 | match 63 812 196 | match 14 168 197 | match 10 63 198 | match 84 876 199 | match 47 798 200 | literal 'horror 201 | match 15 800 202 | literal 'esign 203 | literal 10 '' 204 | match 3 146 205 | match 3 807 206 | match 3 20 207 | literal ''r 208 | match 4 275 209 | literal 'criminal 210 | match 13 801 211 | match 5 50 212 | match 3 23 213 | match 68 801 214 | literal 'psycho, 215 | match 6 13 216 | match 3 424 217 | literal 'tigo shtick 218 | match 10 206 219 | match 4 289 220 | literal 'my 221 | match 3 450 222 | literal 'ar wind 223 | match 3 653 224 | literal 10 'Baby 225 | match 8 168 226 | literal 's 227 | match 3 46 228 | match 19 765 229 | match 140 799 230 | literal ', 231 | match 111 794 232 | match 104 793 233 | match 51 731 234 | match 143 795 235 | match 78 1750 236 | literal 'Wal 237 | match 3 651 238 | literal 'w 239 | match 3 6 240 | literal ' fashion 241 | match 3 33 242 | literal 'by 243 | match 3 19 244 | literal 'ork 245 | match 4 524 246 | literal 'm 247 | match 4 392 248 | match 5 556 249 | literal 'bit 250 | match 3 1553 251 | literal 'crazy 252 | match 5 56 253 | literal '- 254 | match 115 55 255 | literal 'pass 256 | match 18 55 257 | literal 'I'm 258 | match 3 375 259 | match 5 1796 260 | match 6 56 261 | match 4 31 262 | match 20 694 263 | match 5 771 264 | match 21 694 265 | match 18 43 266 | literal 'I don't 267 | match 4 24 268 | literal 'n 269 | match 3 468 270 | literal 'e 271 | match 3 96 272 | literal 'iends 273 | match 3 88 274 | literal '(J'veux 275 | match 3 1828 276 | match 3 495 277 | literal 'm 278 | match 5 1948 279 | literal 't je 280 | match 6 21 281 | literal 'a 282 | match 4 86 283 | match 3 352 284 | match 3 1801 285 | literal 'j 286 | match 15 40 287 | literal ') 288 | match 26 87 289 | match 4 973 
290 | match 24 113
291 | match 52 52
292 | match 22 498
293 | literal '(c
294 | match 22 680
295 | literal ')
296 | match 11 255
297 | match 218 1018
298 | match 23 774
299 | match 25 956
300 | match 23 48
301 | match 65 88
302 | match 78 1044
303 | end
304 | ! stats literals 5.4 bits each (1772/331)
305 | ! stats matches 88.6% (128 x 20.2)
306 | ! stats inout 534:0 (459) 2916 0
307 | ! stats total inout 534:0 (459) 2916
308 | ! stats total block average 2916.0 uncompressed
309 | ! stats total block average 459.0 symbols
310 | ! stats total literals 5.4 bits each
311 | ! stats total matches 88.6% (128 x 20.2)
312 | !
313 | crc
314 | length
315 |
--------------------------------------------------------------------------------
/normalizer.py:
--------------------------------------------------------------------------------
1 | """
2 | Thierry Bertin-Mahieux (2011) Columbia University
3 | tb2332@columbia.edu
4 |
5 |
6 | This code contains functions to normalize an artist name,
7 | and possibly a song title.
8 | This is intended to do metadata matching.
9 | It is mostly an elaborate hack, I never did an extensive search of
10 | all problematic name matches.
11 | Code developed using Python 2.6 on a Ubuntu machine, using UTF-8
12 |
13 | This is part of the Million Song Dataset project from
14 | LabROSA (Columbia University) and The Echo Nest.
15 |
16 |
17 | Copyright 2011, Thierry Bertin-Mahieux
18 |
19 | This program is free software: you can redistribute it and/or modify
20 | it under the terms of the GNU General Public License as published by
21 | the Free Software Foundation, either version 3 of the License, or
22 | (at your option) any later version.
23 |
24 | This program is distributed in the hope that it will be useful,
25 | but WITHOUT ANY WARRANTY; without even the implied warranty of
26 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 | GNU General Public License for more details.
28 |
29 | You should have received a copy of the GNU General Public License
30 | along with this program. If not, see <http://www.gnu.org/licenses/>.
31 | """
32 |
33 | import os
34 | import re
35 | import sys
36 | import unicodedata
37 | import itertools
38 | import Levenshtein # http://pypi.python.org/pypi/python-Levenshtein/
39 |
40 |
41 | # ROTATION SYMBOLS (A and B => B and A)
42 | rotation_symbols = ['\|', '/', '&', ',', '\+', ';', '_']#, '\-']
43 | rotation_words = ['and', 'y', 'et', 'vs', 'vs.', 'v', 'with', 'feat',
44 |                   'feat.', 'featuring', 'presents', 'ft.', 'pres.']
45 |
46 | # SYMBOLS TO REMOVE AT THE BEGINNING
47 | stub_to_remove = ['dj', 'dj.', 'mc', 'm.c.', 'mc.', 'the', 'los', 'les']
48 |
49 | # SYMBOLS TO REMOVE AT THE END
50 | end_to_remove1 = ['big band', 'trio', 'quartet', 'ensemble', 'orchestra']
51 | end_to_remove2 = ['band']
52 |
53 | # COMPILED REGULAR EXPRESSION
54 | # white spaces
55 | re_space = re.compile(r'\s')
56 | # non alphanumeric
57 | re_nonalphanum = re.compile(r'\W')
58 | # rotation symbols
59 | re_rotsymbols = re.compile('\s*?' 
+ '|'.join(rotation_symbols) + '\s*?') 60 | # rotation words 61 | re_rotwords = re.compile(r'\s(' + '|'.join(rotation_words) + ')\s') 62 | # stub to remove 63 | re_remstub = re.compile('(' + '|'.join(stub_to_remove) + ')\s(.*)') 64 | # ending to remove 65 | re_remending1 = re.compile('(.*)\s(' + '|'.join(end_to_remove1) + ')') 66 | re_remending2 = re.compile('(.*)\s(' + '|'.join(end_to_remove2) + ')') 67 | # quotes to remove 68 | re_remquotes = re.compile('(.+)\s(".+?")\s(.+)') 69 | # parenthesis to remove 70 | re_remparenthesis = re.compile('(.+)\s(\(.+?\))\s*(.*)') 71 | # brackets to remove 72 | re_rembrackets = re.compile('(.+)\s(\[.+?\])\s*(.*)') 73 | 74 | 75 | def char_is_ascii(c): 76 | """ 77 | Check if a unicode character, e.g. u'A', u'1' or u'\u0301' is ASCII 78 | """ 79 | #return ord(c) < 128 80 | # the following should be faster, according to: 81 | #http://stackoverflow.com/questions/196345/how-to-check-if-a-string-in-python-is-in-ascii 82 | return c < u"\x7F" 83 | 84 | 85 | def remove_non_ascii(s): 86 | """ 87 | Normalize characters in unicode string 's' that are not ASCII, 88 | try to transform accented characters to non accented version. 89 | Otherwise, remove non-ascii chars 90 | """ 91 | decomposition = unicodedata.normalize('NFKD', s) 92 | return filter(lambda x: char_is_ascii(x), decomposition) 93 | 94 | 95 | def to_lower_case(s): 96 | """ 97 | transform a unicode string 's' to lowercase 98 | ok, this one is trivial, I know 99 | """ 100 | return s.lower() 101 | 102 | 103 | def remove_spaces(s): 104 | """ 105 | Remove all possible spaces in the unicode string s 106 | """ 107 | return re_space.sub('', s) 108 | 109 | 110 | def replace_rotation_symbols(s): 111 | """ 112 | Mostly, replace '&' by 'and' 113 | """ 114 | return re_rotsymbols.sub(' and ', s) 115 | 116 | 117 | def remove_stub(s): 118 | """ 119 | Remove a questionable beginning, e.g. dj 120 | otherwise return string at is 121 | """ 122 | m = re_remstub.match(s) 123 | if not m: 124 | return s 125 | return m.groups()[1] 126 | 127 | 128 | def remove_endings(s): 129 | """ 130 | Remove questionable endings, e.g. 'band' 131 | """ 132 | m = re_remending1.match(s) 133 | if m: 134 | s = m.groups()[0] 135 | m = re_remending2.match(s) 136 | if m: 137 | s = m.groups()[0] 138 | return s 139 | 140 | 141 | def remove_quotes(s): 142 | """ 143 | Remove the quote, like Thierry "The Awesomest" BM 144 | """ 145 | m = re_remquotes.match(s) 146 | if not m: 147 | return s 148 | parts = m.groups() 149 | assert len(parts) == 3 150 | return parts[0] + ' ' + parts[2] 151 | 152 | 153 | def remove_parenthesis(s): 154 | """ 155 | Remove parenthesis, like Thierry (Coolest guy) 156 | """ 157 | m = re_remparenthesis.match(s) 158 | if not m: 159 | return s 160 | parts = m.groups() 161 | assert len(parts) >= 2 162 | if len(parts) == 2: 163 | return parts[0] 164 | return parts[0] + ' ' + parts[2] 165 | 166 | 167 | def remove_brackets(s): 168 | """ 169 | Remove brackets, like Thierry [Coolest guy] 170 | """ 171 | m = re_rembrackets.match(s) 172 | if not m: 173 | return s 174 | parts = m.groups() 175 | assert len(parts) >= 2 176 | if len(parts) == 2: 177 | return parts[0] 178 | return parts[0] + ' ' + parts[2] 179 | 180 | 181 | def normalize_no_rotation(s): 182 | """ 183 | We normalize a name that is supposed to contain no 184 | rotation term ('and', 'y', ...) 
185 | """ 186 | # remove beginning 187 | s = remove_stub(s) 188 | # remove ends 189 | s = remove_endings(s) 190 | # remove () 191 | s = remove_parenthesis(s) 192 | # remove "" 193 | s = remove_quotes(s) 194 | return s 195 | 196 | 197 | def split_rotation_words(s): 198 | """ 199 | Split a name using the rotation words: 'and', 'vs', 'y', 'et', ... 200 | then create all possible permutations 201 | """ 202 | parts = re_rotwords.split(s) 203 | parts = filter(lambda p: not p in rotation_words, parts)[:5] 204 | results = set() 205 | # keep only the individual elems (risky?) 206 | for p in parts: 207 | results.add(p) 208 | # create all permutations 209 | permutations = itertools.permutations(parts) 210 | #maxperm = 30 211 | #count_perm = 0 212 | for perm in permutations: 213 | #count_perm += 1 214 | #if count_perm > maxperm: 215 | # break 216 | results.add(' '.join(perm)) 217 | # redo the same but remove the stub first for all parts 218 | parts = map(lambda p: normalize_no_rotation(p), parts) 219 | for p in parts: 220 | results.add(p) 221 | permutations = itertools.permutations(parts) 222 | for perm in permutations: 223 | results.add(' '.join(perm)) 224 | # done 225 | return results 226 | 227 | 228 | def remove_nonalphanumeric(s): 229 | """ 230 | Remove usual punctuation signs: ! , ? : ; . ' etc 231 | Also, we transform long spaces into normal ones 232 | """ 233 | # split around non-alphanum chars 234 | parts = re_nonalphanum.split(s) 235 | # remove empty spots 236 | parts = filter(lambda p: p, parts) 237 | # rejoin with regular space ' ' 238 | return ' '.join(parts) 239 | 240 | 241 | def normalize_artist(s): 242 | """ 243 | Return a set of normalized versions of that artist name 244 | """ 245 | # normalized versions 246 | results = set() 247 | # lower case 248 | s = to_lower_case(s) 249 | results.add(s) 250 | # remove non-ascii chars (try to replace them) 251 | s = remove_non_ascii(s) 252 | results.add(s) 253 | # try removing parenthesis before, in case there's an & in it 254 | s2 = remove_parenthesis(s) 255 | results.add(s2) 256 | # replace rotation symbols 257 | s = replace_rotation_symbols(s) 258 | # split and permute according to rotation words 259 | permutations = split_rotation_words(s) 260 | results.update(permutations) 261 | # remove non-alphanumeric and normalize spaces 262 | results = map(lambda s: remove_nonalphanumeric(s), results) 263 | # remove all spaces 264 | results = map(lambda s: remove_spaces(s), results) 265 | # done (and remove dupes) 266 | return set(results) 267 | 268 | 269 | def normalize_title(s): 270 | """ 271 | Return a set of normalized versions of that title 272 | """ 273 | # normalized versions 274 | results = set() 275 | # lower case 276 | s = to_lower_case(s) 277 | results.add(s) 278 | # remove non-ascii chars (try to replace them) 279 | s = remove_non_ascii(s) 280 | results.add(s) 281 | # try removing parenthesis 282 | s = remove_parenthesis(s) 283 | results.add(s) 284 | # try removing brackets 285 | s = remove_brackets(s) 286 | results.add(s) 287 | # remove non-alphanumeric and normalize spaces 288 | results = map(lambda s: remove_nonalphanumeric(s), results) 289 | # remove all spaces 290 | results = map(lambda s: remove_spaces(s), results) 291 | # done (and remove dupes) 292 | return set(results) 293 | 294 | 295 | def same_artist(name1, name2): 296 | """ 297 | Compare two artists: 298 | - edit distance 299 | - if one name is contained in the other 300 | - by normalizing the names 301 | Return True if it's the same artist, False otherwise 302 | """ 303 | # 
trivial 304 | n1 = to_lower_case(name1) 305 | n2 = to_lower_case(name2) 306 | if n1 == n2: 307 | return True 308 | # edit distance 309 | if len(n1) >= 10 or len(n2) >= 10: 310 | if Levenshtein.distance(n1, n2) <= 2: 311 | return True 312 | # n1 contains n2? or the other way around 313 | if len(n1) >= 10 and len(n2) >= 10: 314 | if len(n1) > len(n2): 315 | if n1.find(n2) >= 0: 316 | return True 317 | else: 318 | if n2.find(n1) >= 0: 319 | return True 320 | # compare by normalizing names 321 | normalized1 = normalize_artist(n1) 322 | normalized2 = normalize_artist(n2) 323 | if len(normalized1.intersection(normalized2)) > 0: 324 | return True 325 | return False 326 | 327 | 328 | def same_title(title1, title2): 329 | """ 330 | Compare two titles: 331 | - edit distance 332 | - if one name is contained in the other 333 | - by normalizing the title 334 | Return True if it's the same title, False otherwise 335 | """ 336 | # trivial 337 | t1 = to_lower_case(title1) 338 | t2 = to_lower_case(title2) 339 | if t1 == t2: 340 | return True 341 | # edit distance 342 | if len(t1) >= 10 or len(t2) >= 10: 343 | if Levenshtein.distance(t1, t2) <= 2: 344 | return True 345 | # n1 contains n2? or the other way around 346 | if len(t1) >= 10 and len(t2) >= 10: 347 | if len(t1) > len(t2): 348 | if t1.find(t2) >= 0: 349 | return True 350 | else: 351 | if t2.find(t1) >= 0: 352 | return True 353 | # compare by normalizing names 354 | normalized1 = normalize_title(t1) 355 | normalized2 = normalize_title(t2) 356 | if len(normalized1.intersection(normalized2)) > 0: 357 | return True 358 | return False 359 | --------------------------------------------------------------------------------
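
A quick illustration of the matching helpers at the end of normalizer.py (an illustrative Python 2 session; inputs should be unicode strings, and python-Levenshtein must be installed):

    >>> from normalizer import same_artist, same_title
    >>> same_artist(u'Simon & Garfunkel', u'Garfunkel and Simon')
    True
    >>> same_title(u"Ain't No Mountain High Enough", u'Aint No Mountain High Enough')
    True

The first comparison succeeds via normalization: '&' is rewritten to 'and', the name is split on rotation words, and all permutations of the pieces are compared. The second never reaches normalization; it is caught by the cheap edit-distance check (distance 1, within the threshold of 2).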