├── .gitignore
├── filter_square_brackets.sh
├── filter_whisps.sh
├── prose
│   └── README.md
├── notebook_helpers.py
├── parse_infgen.py
├── pop_prose_splits.py
├── common.py
├── add_genre.py
├── god_frame.py
├── README.md
├── billboard_scrape.py
├── lyrics_scrape.py
├── Lyrics.py
├── retry_lyrics_scrape.py
├── badromance_infgen.txt
└── normalizer.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | *.pickle
3 | *.pyc
--------------------------------------------------------------------------------
/filter_square_brackets.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | bads=`grep '\[' lyrics/*.txt | cut -d ':' -f 1 | sort | uniq`
4 | nbad=`echo "$bads" | wc -l`
5 | echo "Moving $nbad braced lyrics files"
6 | for bad in $bads
7 | do
8 |     mv $bad bad_lyrics/braces/
9 |     rm $bad.gz
10 | done
--------------------------------------------------------------------------------
/filter_whisps.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Filter out lyrics files that are too small
4 |
5 | for song in lyrics/*.txt
6 | do
7 |     chars=`wc -c < "$song"`
8 |     if (( chars < 3 ))
9 |     then
10 |         mv "$song" bad_lyrics/whisps/
11 |     elif (( chars < 40 ))
12 |     then
13 |         mv "$song" bad_lyrics/questionable/
14 |     fi
15 | done
--------------------------------------------------------------------------------
/prose/README.md:
--------------------------------------------------------------------------------
1 | A few prose samples to compare against. (For answering the question: do pop lyrics compress better than other natural-language documents?)
2 |
3 | - poynton.txt: The Spoils of Poynton by Henry James
4 | - hansard.txt: excerpt from debates in the Canadian House of Commons from 2001 (source: http://www.isi.edu/natural-language/download/hansard/)
5 | - comments.txt: some Reddit comments from 2006-02
--------------------------------------------------------------------------------
/notebook_helpers.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import math
3 |
4 | import common
5 |
6 | BIAS_ADJUSTED_RATIO = 1
7 | #BIAS = 34.775287769
8 | # 10 for the header, 8 for the footer, 1 for the block prefix (really 3 bits, but I'm rounding up)
9 | BIAS = 10 + 8 + 1
10 | BASE = 2
11 |
12 | def get_frame(having_lyrics=False):
13 |     om = common.get_omnisong()
14 |     # Set this in both cases just for flexibility if I wanna turn adjustment on and off
15 |     om['raw_ratio'] = om['raw'] / om['comp']
16 |     if BIAS_ADJUSTED_RATIO:
17 |         om['unbiased_ratio'] = om['raw'] / (om['comp']-BIAS)
18 |     else:
19 |         om['unbiased_ratio'] = om['raw'] / om['comp']
20 |     om['year'] = om['date'].apply(lambda d: d.year)
21 |     om['yearf'] = om['date'].apply(lambda d: d.year + d.month/12 + d.day/365)
22 |     # Ratio = the one calculated using infgen (set in god_frame.py)
23 |     om['rscore'] = om['ratio'].apply(lambda x: math.log(x, BASE))
24 |     if having_lyrics:
25 |         om = om[(om['raw'] > 2) & om['scraped']].copy()
26 |     return om
27 |
28 | def get_lyrics_frame():
29 |     return get_frame(True)
--------------------------------------------------------------------------------
/parse_infgen.py:
--------------------------------------------------------------------------------
1 | """
2 | Parse infgen output to compute a pre-Huffman compression ratio.
3 | """
4 |
5 | from __future__ import division
6 | import re
7 |
8 | fname = 'badromance_infgen.txt'
9 |
10 | def parse_ratio(f, verbose=False):
11 |     matches = 0
12 |     n_literals = 0
13 |     n_symbols = 0
14 |     uncomp = 0  # uncompressed size, from the block-average stats line
15 |     for line in f:
16 |         if line.startswith('match'):
17 |             _, length, dist = line.split()
18 |             matches += 1
19 |
20 |         m = re.match(r'! stats literals \d\.\d bits each \(\d+/(\d+)\)', line)
21 |         if m:
22 |             n_literals = int(m.group(1))
23 |
24 |         m = re.match(r'! stats total inout \d+:\d+ \((\d+)\)', line)
25 |         if m:
26 |             n_symbols = int(m.group(1))
27 |
28 |         m = re.match(r'! stats total block average (\d+)\.\d uncompressed', line)
29 |         if m:
30 |             uncomp = int(m.group(1))
31 |
32 |     if verbose:
33 |         print "{} matches, {} literals, {} symbols".format(matches, n_literals, n_symbols)
34 |         print "Uncompressed size = {} bytes".format(uncomp)
35 |     assert matches + n_literals == n_symbols
36 |
37 |     # 1 byte per literal, 3 bytes per match.
38 |     pseudosize = matches * 3 + n_literals
39 |     ratio = uncomp / pseudosize
40 |     if verbose:
41 |         print "{} / {} = {:.2f}".format(uncomp, pseudosize, ratio)
42 |     return (uncomp, pseudosize)
43 |
44 | if __name__ == '__main__':
45 |     with open(fname) as f:
46 |         parse_ratio(f, verbose=True)
47 |
--------------------------------------------------------------------------------
/pop_prose_splits.py:
--------------------------------------------------------------------------------
1 | import common
2 | import os
3 |
4 | N_SONGS = 100
5 | MIN_CHART_POS = 10
6 | PROSE_SOURCES = ['poynton', 'comments', 'hansard']
7 |
8 | def get_recent_keys(n=N_SONGS, min_pos=MIN_CHART_POS):
9 |     charts = common.get_chartdata()
10 |     found = set()
11 |     for chart in charts:
12 |         for song in chart[:min_pos]:
13 |             k = common.song_key(song)
14 |             if k in found:
15 |                 continue
16 |             if common.have_lyrics(song):
17 |                 found.add(k)
18 |                 if len(found) >= n:
19 |                     break
20 |
21 |         if len(found) >= n:
22 |             break
23 |     return found
24 |
25 | if __name__ == '__main__':
26 |     song_keys = get_recent_keys()
27 |     print "Loaded song keys to match against"
28 |     prosedir = 'prose'
29 |     prosefiles = {src: open(os.path.join(prosedir, src+'.txt'))
30 |                   for src in PROSE_SOURCES}
31 |     try:
32 |         os.mkdir(os.path.join(prosedir, 'fragments'))
33 |     except OSError:
34 |         pass
35 |     for prose_src in prosefiles:
36 |         try:
37 |             os.mkdir(os.path.join(prosedir, 'fragments', prose_src))
38 |         except OSError:
39 |             pass
40 |
41 |     for i, song_key in enumerate(song_keys):
42 |         fname = str(i)
43 |         size = os.path.getsize(os.path.join(common.LYRICS_DIR, song_key+'.txt'))
44 |         for prose_src, prosefile in prosefiles.iteritems():
45 |             acc = ''
46 |             # Accumulate prose lines until the fragment is within 5 bytes of (or exceeds) the lyrics file's size
47 |             while len(acc) < size - 5:
48 |                 acc += prosefile.readline()
49 |             with open(os.path.join(prosedir, 'fragments', prose_src, fname), 'w') as f:
50 |                 f.write(acc)
51 |
52 |
--------------------------------------------------------------------------------
/common.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import os
3 | import parse_infgen
4 |
5 | CHARTDATA_FILENAME = 'hot-100-chartdata.pickle'
6 | DB_FILENAME = 'hot-100.pickle'
7 | LYRICS_DIR = 'lyrics'
8 | OMNI_PICKLE_NAME = 'omnisongs.pickle'
9 |
10 | class NotScrapedException(Exception):
11 |     pass
12 |
13 | def have_lyrics(song):
14 |     k = song_key(song)
15 |     path = os.path.join(LYRICS_DIR, k+'.txt')
16 |     return os.path.exists(path)
17 |
18 | def song_key(song):
19 |     k = song.artist[:15] + '-' + 
song.title[:20] 20 | k = k.replace('/', '') 21 | return k.replace(' ', '_') 22 | 23 | def get_songdb(): 24 | with open(DB_FILENAME) as f: 25 | db = pickle.load(f) 26 | return db 27 | 28 | def get_omnisong(): 29 | with open(OMNI_PICKLE_NAME) as f: 30 | om = pickle.load(f) 31 | return om 32 | 33 | def get_chartdata(): 34 | with open(CHARTDATA_FILENAME) as f: 35 | cd = pickle.load(f) 36 | return cd 37 | 38 | def get_sizes(song_or_key): 39 | if isinstance(song_or_key, basestring): 40 | k = song_or_key 41 | else: 42 | k = song_key(song_or_key) 43 | path = os.path.join(LYRICS_DIR, k+'.txt') 44 | try: 45 | raw = os.path.getsize(path) 46 | except OSError: 47 | raise NotScrapedException 48 | comp = os.path.getsize(path+'.gz') 49 | return (raw, comp) 50 | 51 | def get_inf_sizes(song_or_key): 52 | """Return raw/compressed sizes used when calculating the infgen-based 53 | compression ratio. Raw size will be the same as above (i.e. just the 54 | number you'd get from `wc -c` on the text file). 55 | The compressed size will be an approximation of the size of the LZ-77 56 | compressed data *before* Huffman coding. Assumes 1 byte per literal, 3 57 | bytes per match.""" 58 | if isinstance(song_or_key, basestring): 59 | k = song_or_key 60 | else: 61 | k = song_key(song_or_key) 62 | path = os.path.join(LYRICS_DIR, k+'.txt.gz.infgen') 63 | with open(path) as f: 64 | return parse_infgen.parse_ratio(f) 65 | -------------------------------------------------------------------------------- /add_genre.py: -------------------------------------------------------------------------------- 1 | import common 2 | import os 3 | import pandas as pd 4 | import normalizer 5 | import time 6 | 7 | GENRE_FILE = 'msd_tagtraum_cd2.cls' 8 | SAVE = True 9 | 10 | omni = common.get_omnisong() 11 | omni['genre'] = None 12 | 13 | def build_trackid_to_genre(): 14 | f = open(GENRE_FILE) 15 | ttg = {} 16 | for line in f: 17 | if line.startswith('#'): 18 | continue 19 | fields = line.split('\t') 20 | trackid = fields[0] 21 | # Has a majority genre and an optional "minority genre". always take the majority. 
22 |         genre = fields[1].strip()
23 |         ttg[trackid] = genre
24 |     f.close()
25 |     return ttg
26 |
27 | def song_key(title, artist, bb=False):
28 |     if bb:
29 |         # billboard specific transforms
30 |         for clitic in ['s', 't', 'll']:
31 |             title = title.replace(' {} '.format(clitic), '{} '.format(clitic))
32 |     return tuple(map(normalizer.normalize_no_rotation, [title, artist]))
33 |
34 |
35 | t0 = time.time()
36 | # Build a mapping from existing title/artist pairs to index
37 | normalized_lookup = {}
38 | for i, (title, artist) in enumerate(omni[ ['title', 'artist'] ].values):
39 |     k = song_key(title, artist, bb=True)
40 |     normalized_lookup[k] = i
41 | print "Built normalized lookup in {:.1f} seconds".format(time.time()-t0)
42 | t0 = time.time()
43 |
44 | ttg = build_trackid_to_genre()
45 | print "Built genre lookup in {:.1f} seconds".format(time.time()-t0)
46 |
47 | found = 0
48 | with open('unique_tracks.txt') as f:
49 |     for line in f:
50 |         trackid, _, artist, title = line.split('<SEP>')
51 |         try:
52 |             genre = ttg[trackid]
53 |         except KeyError:
54 |             continue
55 |         k = song_key(title.strip(), artist)
56 |         try:
57 |             i = normalized_lookup[k]
58 |         except KeyError:
59 |             continue
60 |         omni.loc[i, 'genre'] = genre
61 |         found += 1
62 |
63 | print "Found {} genre labels out of {} songs".format(found, len(omni))
64 |
65 | if SAVE:
66 |     omni.to_pickle(common.OMNI_PICKLE_NAME)
67 |
--------------------------------------------------------------------------------
/god_frame.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import pandas as pd
3 | import os
4 | import re
5 |
6 | import common
7 |
8 | # this title should be a saint
9 | def canonize_title(title):
10 |     # Collapse whitespace, then restore the apostrophes Billboard strips from clitics.
11 |     trans = re.sub(r'\s+', ' ', title)
12 |     clitics = ['ll', 's', 't']
13 |     for clitic in clitics:
14 |         trans = trans.replace(' '+clitic, "'"+clitic)
15 |     return trans
16 |
17 | db = common.get_songdb()
18 | rows = []
19 | merged = 0
20 | for artist_discog in db.itervalues():
21 |     title_to_row = {}
22 |     for title, song in artist_discog.iteritems():
23 |         try:
24 |             raw, comp = common.get_sizes(song)
25 |             scraped = True
26 |             inf_raw, inf_comp = common.get_inf_sizes(song)
27 |             ratio = inf_raw / inf_comp
28 |             assert raw == inf_raw, "{} != {}".format(raw, inf_raw)
29 |         except common.NotScrapedException:
30 |             raw = comp = None
31 |             scraped = False
32 |             inf_comp = ratio = None
33 |         canon_title = canonize_title(title)
34 |         if canon_title not in title_to_row:
35 |             row = dict(artist=song.artist, title=canon_title, date=song.earliest,
36 |                        peak=song.peakPos, scraped=scraped,
37 |                        raw=raw, comp=comp, icomp=inf_comp, ratio=ratio,
38 |                        )
39 |             title_to_row[canon_title] = row
40 |         else:
41 |             # Got a dupe. Merge them.
42 |             merged += 1
43 |             extant = title_to_row[canon_title]
44 |             extant['peak'] = min(extant['peak'], song.peakPos)
45 |             extant['scraped'] = extant['scraped'] or scraped
46 |             extant['date'] = min(extant['date'], song.earliest)
47 |
48 |     rows.extend(title_to_row.values())
49 |
50 | print "Merged {} duplicate rows".format(merged)
51 |
52 | df = pd.DataFrame(rows)
53 | df['date'] = pd.to_datetime(df['date'])
54 | # Blargh. Can't do this with nullable col. http://stackoverflow.com/a/21290084/262271
55 | #df['raw'] = df['raw'].astype(int)
56 | #df['comp'] = df['comp'].astype(int)
57 | print "Saving god frame with shape {}".format(df.shape)
58 | df.to_pickle(common.OMNI_PICKLE_NAME)
59 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Investigating repetition in pop music. Interested in questions like:
2 |
3 | - Has pop music been getting more (or less) repetitive over time?
4 | - Which songs/artists/genres are the most/least repetitive?
5 |
6 | I'm measuring the repetitiveness of a song by how well gzip can compress its lyrics. (Which sounds cheeky, but I think it can actually be justified when you look at how Lempel-Ziv compression works.)
7 |
8 | The investigations from this repo were a precursor to a visual essay I did for Pudding.cool: [Are Pop Lyrics Getting More Repetitive?](https://pudding.cool/2017/05/song-repetition). The code for that essay lives at https://github.com/polygraph-cool/song-repetition
9 |
10 | ## Brief overview of calculating lyric compressibility
11 |
12 | 1. Put lyrics in text files (making sure they're ASCII encoded)
13 | 2. Compress those text files using gzip (I used the -9 flag to maximize compression)
14 | 3. Run [infgen](https://github.com/madler/infgen) on each gzip file, redirecting the output to a file. (See `badromance_infgen.txt` for an example of what one of these files looks like.)
15 | 4. Run `parse_ratio` from `parse_infgen.py` on those files. It returns a tuple of the original (uncompressed) and compressed sizes, in bytes/characters. Dividing the first by the second gives the compression ratio.
16 |
17 | Roughly speaking, `parse_ratio` calculates the compressed size using only the Lempel-Ziv part of the DEFLATE compression performed by gzip (and not the Huffman coding part); infgen is what lets us separate those two steps. The compressed size is calculated by treating a 'match' (i.e. a pointer back to an earlier portion of the text which is repeated) as costing 3 bytes. This is close to reality, and it gives intuitively reasonable results for my purposes. You can increase the cost (to put more emphasis on longer repeated sequences, and to avoid spurious matches on short character sequences) or decrease it - it shouldn't have a huge effect. There's a worked example at the end of this README.
18 |
19 | ### Lazy version
20 |
21 | Run steps 1 and 2 above, then just look at the ratio between the file sizes of the original (text) files and the gzip files. The disadvantage is that this also incorporates the Huffman coding step (which is not relevant to the natural sense of 'repetitiveness' of song lyrics) and adds a constant amount of overhead (from the gzip header and the Huffman tables), which can distort the rankings for very short texts. But overall, the rankings you get with this method will be pretty close to the ones from the more principled method above.
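
### Worked example

For the Bad Romance example in `badromance_infgen.txt`, infgen reports 2916 uncompressed bytes, 128 matches, and 331 literals. The pseudo-compressed size is therefore 128 * 3 + 331 = 715 bytes, and the compression ratio is 2916 / 715 ≈ 4.08.

Here's a sketch of the whole pipeline for one song, in Python 2 like the rest of the repo (the file name is hypothetical, but follows `common.song_key`'s naming scheme; it assumes `gzip` and the `infgen` binary are on your PATH):

    import subprocess
    import parse_infgen

    src = 'lyrics/Lady_Gaga-Bad_Romance.txt'  # hypothetical song_key-style name
    # Step 2: -9 for maximum compression, -k to keep the original .txt around
    subprocess.check_call(['gzip', '-9', '-k', src])
    # Step 3: infgen reads the gzip stream on stdin
    with open(src + '.gz') as gz, open(src + '.gz.infgen', 'w') as out:
        subprocess.check_call(['infgen'], stdin=gz, stdout=out)
    # Step 4: parse the infgen output into (uncompressed, pseudo-compressed) sizes
    with open(src + '.gz.infgen') as f:
        uncomp, pseudosize = parse_infgen.parse_ratio(f)
    print "ratio = {:.2f}".format(float(uncomp) / pseudosize)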
22 | -------------------------------------------------------------------------------- /billboard_scrape.py: -------------------------------------------------------------------------------- 1 | import billboard 2 | import time 3 | import pickle 4 | import datetime 5 | 6 | chartname = 'hot-100' 7 | DATE_FMT = '%Y-%m-%d' 8 | SLEEPYTIME = 1 9 | 10 | class SongDB(object): 11 | def __init__(self, path): 12 | self.path = path 13 | try: 14 | f = open(path) 15 | self.db = pickle.load(f) 16 | except IOError: 17 | self.db = {} 18 | 19 | def add_song(self, song, date): 20 | if song.artist not in self.db: 21 | self.db[song.artist] = {} 22 | 23 | artist_songs = self.db[song.artist] 24 | if song.title in artist_songs: 25 | extant = artist_songs[song.title] 26 | # Not strictly necessary if we're guaranteed to always 27 | # iterate in reverse chrono order, but doesn't hurt to 28 | # be safe. 29 | extant.weeks = max(extant.weeks, song.weeks) 30 | extant.peakPos = min(extant.peakPos, song.peakPos) 31 | extant.earliest = min(extant.earliest, date) 32 | else: 33 | song.earliest = date 34 | artist_songs[song.title] = song 35 | 36 | def save(self): 37 | with open(self.path, 'w') as f: 38 | pickle.dump(self.db, f) 39 | 40 | def size(self): 41 | n = 0 42 | for artist_songs in self.db.itervalues(): 43 | n += len(artist_songs) 44 | return n 45 | 46 | 47 | 48 | path = chartname + '.pickle' 49 | chartpicklename = chartname + '-chartdata.pickle' 50 | try: 51 | with open(chartpicklename) as f: 52 | charts = pickle.load(f) 53 | date = charts[-1].previousDate 54 | except IOError: 55 | charts = [] 56 | date = None 57 | 58 | db = SongDB(path) 59 | i = 0 60 | lim = float('inf') 61 | # TODO: load pickled charts 62 | try: 63 | while 1: 64 | chart = billboard.ChartData(chartname, date) 65 | dt = datetime.datetime.strptime(chart.date, DATE_FMT).date() 66 | 67 | for song in chart: 68 | db.add_song(song, dt) 69 | charts.append(chart) 70 | time.sleep(SLEEPYTIME) 71 | 72 | i += 1 73 | if not chart.previousDate or i >= lim: 74 | break 75 | if (i % 26) == 0: 76 | print date 77 | date = chart.previousDate 78 | except: # gotta catchemall 79 | if i == 0: 80 | raise 81 | print "Uh oh. Got unexpected exception. Saving whatever we've accumulated before bailing" 82 | db.save() 83 | with open(chartpicklename, 'w') as f: 84 | pickle.dump(charts, f) 85 | raise 86 | 87 | 88 | db.save() 89 | print "Saved db with {} songs to {}".format(db.size(), path) 90 | with open(chartpicklename, 'w') as f: 91 | pickle.dump(charts, f) 92 | 93 | -------------------------------------------------------------------------------- /lyrics_scrape.py: -------------------------------------------------------------------------------- 1 | # TODO: if you rerun this later, make sure you check against filenames in "bad_lyrics" dir, 2 | # to avoid scraping them twice 3 | import pickle 4 | import time 5 | import os 6 | 7 | import common 8 | import Lyrics 9 | 10 | PICKLE_NAME = 'hot-100.pickle' 11 | LYRICS_DIR = 'lyrics' 12 | SLEEPYTIME = 1 13 | EXT = '.txt' 14 | MAX_CHART_POS = 1000 # Only scrape songs that charted at least this high 15 | 16 | def unicode_unfuck(s): 17 | return ''.join(map(lambda c: chr(ord(c)), s)) 18 | 19 | def load_extant(d): 20 | keys = set() 21 | for fname in os.listdir(d): 22 | if fname.endswith(EXT): 23 | keys.add(fname[:-len(EXT)]) 24 | return keys 25 | 26 | with open(PICKLE_NAME) as f: 27 | db = pickle.load(f) 28 | 29 | i = 0 30 | lim = float('inf') 31 | # nvm. probably better just to use os.path.exists each time. 
we need to sleep
32 | # between requests anyways, so who cares if it's slower
33 | #extant = load_extant(LYRICS_DIR)
34 | malencoded = 0
35 | unchanged = 0
36 | with open('song_404s.txt', 'a+') as skips_file:
37 |     bad_keys = set([line.split('\t')[-1].strip() for line in skips_file])
38 |     skips_file.seek(0)
39 |     for artist in db:
40 |         for song in db[artist].itervalues():
41 |             if song.peakPos > MAX_CHART_POS:
42 |                 continue
43 |             k = common.song_key(song)
44 |             if k in bad_keys:
45 |                 continue
46 |             #if k in extant:
47 |             #    continue
48 |             path = os.path.join(LYRICS_DIR, k + EXT)
49 |             if os.path.exists(path):
50 |                 continue
51 |             try:
52 |                 lyrics, url = Lyrics.get_lyrics2(song)
53 |                 time.sleep(SLEEPYTIME)
54 |             except Lyrics.LyricsNotFoundException:
55 |                 time.sleep(SLEEPYTIME)
56 |                 print "Failed to find lyrics for {}".format(song)
57 |                 try:
58 |                     skips_file.write('\t'.join([song.artist, song.title, k]) + '\n')
59 |                 except UnicodeEncodeError:
60 |                     malencoded += 1
61 |                     continue
62 |
63 |                 #skipped.add( (song.artist, song.title) )
64 |                 continue
65 |             except Lyrics.URLNotChangedException:
66 |                 unchanged += 1
67 |                 continue
68 |             if len(lyrics) < 5:
69 |                 print "WARNING: Got suspiciously short lyrics for {} ({})".format(song, url)
70 |                 skips_file.write('\t' + '\t'.join([song.artist, song.title, k]) + '\n')
71 |                 continue
72 |             with open(path, 'w') as f:
73 |                 try:
74 |                     f.write(lyrics)
75 |                 except UnicodeEncodeError:
76 |                     # Blah blah fishcakes. Somehow got into a situation where, like, if there are multi-byte
77 |                     # unicode code points in the lyrics, we get each byte encoded in utf-8, rather than the
78 |                     # whole thing. TODO: should probably file a bug on... someone
79 |                     lyrics = unicode_unfuck(lyrics)
80 |                     f.write(lyrics)
81 |             i += 1
82 |             if i >= lim:
83 |                 break
84 |             if i % 100 == 0:
85 |                 print '.',
86 |         if i >= lim:
87 |             break
88 |
89 | print "Skipped {} malencoded songs".format(malencoded)
90 |
--------------------------------------------------------------------------------
/Lyrics.py:
--------------------------------------------------------------------------------
1 | # Modified version of Lyrics.py from this repo: https://github.com/bhrigu123/Instant-Lyrics
2 | # TODO: submit a patch?
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import sys
6 | import re
7 |
8 | try:
9 |     from urllib.parse import quote_plus
10 | except ImportError:
11 |     from urllib import quote_plus
12 |
13 | class LyricsNotFoundException(Exception):
14 |     pass
15 |
16 | class URLNotChangedException(Exception):  # caught by lyrics_scrape.py; nothing raises it yet
17 |     pass
18 |
19 | def get_metrolyrics(url):
20 |     resp = requests.get(url, headers={
21 |         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel '
22 |                       'Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, '
23 |                       'like Gecko) Chrome/55.0.2883.95 Safari/537.36'
24 |     })
25 |     if resp.status_code == 404:
26 |         raise LyricsNotFoundException
27 |     lyrics_html = resp.text
28 |
29 |     soup = BeautifulSoup(lyrics_html, "lxml")
30 |     raw_lyrics = soup.findAll('p', attrs={'class': 'verse'})
31 |     try:
32 |         final_lyrics = unicode.join(u'\n', map(unicode, raw_lyrics))
33 |     except NameError:
34 |         final_lyrics = str.join(u'\n', map(str, raw_lyrics))
35 |
36 |     final_lyrics = final_lyrics.replace('<br/>', '\n')  # line breaks within a verse
37 |     final_lyrics = final_lyrics.replace('<p class="verse">', ' ')
38 |     final_lyrics = final_lyrics.replace('</p>
', ' ') 39 | return (final_lyrics, url) 40 | 41 | def get_lyrics2(song): 42 | # Using google isn't really scalable. Looks like they're pretty serious about 43 | # detecting and blocking scrapers. 44 | # Have to just guess the URL for now :/ 45 | artist = song.artist.lower() 46 | # metrolyrics quirk. if artist is foo ft bar, url seems to always just have foo 47 | cleaved = False 48 | for feat in [' featuring', ' &', ' feat.']: 49 | feati = artist.find(feat) 50 | if feati != -1: 51 | artist = artist[:feati] 52 | cleaved = True 53 | if cleaved: 54 | if ',' in artist: 55 | artist = artist.split(',')[0].strip() 56 | if artist == 'n sync': 57 | artist = 'nsync' 58 | if artist == 'p!nk': 59 | artist = 'pink' 60 | title = song.title.lower().replace(' & ', ' and ') 61 | fragment = title + ' lyrics ' + artist 62 | # Lowercase islands seem to come up a lot in song titles like 63 | # "It Wasn t Me", or "I ll Be There" 64 | fragment = fragment\ 65 | .replace("'", "")\ 66 | .replace(' s ', 's ')\ 67 | .replace(' t ', 't ')\ 68 | .replace(' ll ', 'll ')\ 69 | .replace('-', '')\ 70 | .replace('#', '')\ 71 | .replace(".", "")\ 72 | .replace("& ", "")\ 73 | .replace('?', '')\ 74 | .replace('f**k', 'fuck') 75 | 76 | fragment = re.sub('\s+', ' ', fragment) 77 | fragment = fragment.replace(' ', '-') 78 | 79 | try: 80 | url = 'http://www.metrolyrics.com/{}.html'.format(fragment) 81 | except UnicodeEncodeError: 82 | raise LyricsNotFoundException 83 | return get_metrolyrics(url) 84 | 85 | def get_lyrics(song_name): 86 | 87 | song_name += ' site:metrolyrics.com' 88 | name = quote_plus(song_name) 89 | hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11' 90 | '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 91 | 'Accept-Language': 'en-US,en;q=0.8', 92 | 'Connection': 'keep-alive'} 93 | 94 | url = 'http://www.google.com/search?q=' + name 95 | 96 | result = requests.get(url, headers=hdr).text 97 | offset = 0 98 | lyrics_found = False 99 | while not lyrics_found: 100 | domain = 'http://www.metrolyrics.com' 101 | link_start = result.find(domain, offset) 102 | if link_start == -1: 103 | with open('err.html', 'w') as f: 104 | #result = ''.join(map(lambda c: chr(ord(c)), unicode(result)) 105 | f.write(result.encode('utf-8')) 106 | raise LyricsNotFoundException 107 | link_end = result.find('html', link_start + 1) 108 | offset = link_start+1 109 | 110 | link = result[link_start:link_end + 4] 111 | if 'lyrics' in link[len(domain):]: 112 | lyrics_found = True 113 | return get_metrolyrics(link) 114 | 115 | 116 | 117 | if __name__ == '__main__': 118 | song = ' '.join(sys.argv[1:]) 119 | lyrics = get_lyrics(song) 120 | print lyrics 121 | -------------------------------------------------------------------------------- /retry_lyrics_scrape.py: -------------------------------------------------------------------------------- 1 | # TODO: if you rerun this later, make sure you check against filenames in "bad_lyrics" dir, 2 | # to avoid scraping them twice 3 | import pickle 4 | import time 5 | import os 6 | import re 7 | 8 | import common 9 | import Lyrics 10 | 11 | PICKLE_NAME = 'hot-100.pickle' 12 | LYRICS_DIR = 'lyrics' 13 | SLEEPYTIME = 1 14 | EXT = '.txt' 15 | 16 | def unicode_unfuck(s): 17 | return ''.join(map(lambda c: chr(ord(c)), s)) 18 | 19 | def load_extant(d): 20 | keys = set() 21 | for fname in os.listdir(d): 22 | if fname.endswith(EXT): 23 | keys.add(fname[:-len(EXT)]) 24 | return keys 25 | 26 | class FakeSong(object): 27 | def __init__(self, artist, title): 28 | self.artist = artist 29 | 
self.title = title 30 | 31 | def __str__(self): 32 | return '{} - {}'.format(self.artist, self.title) 33 | 34 | artists_renamed = { 35 | 'Beyonce': 'Beyonce Knowles', 36 | 'Janet': 'Janet Jackson', 37 | 'India.Arie': 'India Arie', 38 | 'James Brown And The Famous Flames': 'James Brown', 39 | "Go-Go's": "The Gogos", 40 | } 41 | andy_artists = [ 42 | 'Peter And Gordon', 'Blood, Sweat & Tears', 'Captain & Tennille', 43 | 'Crosby, Stills & Nash', 44 | ] 45 | 46 | for andy in andy_artists: 47 | canon = andy.replace(' & ', ' ') 48 | canon = canon.replace(' And ', ' ') 49 | artists_renamed[andy] = canon 50 | 51 | def transformed_songs(song): 52 | trans = [] 53 | cp = lambda: FakeSong(song.artist, song.title) 54 | artist = song.artist 55 | title = song.title 56 | if '#' in artist or '#' in title: 57 | yield song 58 | if artist.startswith('The '): 59 | s = cp() 60 | s.artist = artist[len('The '):] 61 | yield s 62 | if artist.startswith('Gladys Knight '): 63 | s = cp() 64 | s.artist = 'Gladys Knight' 65 | yield s 66 | if artist == 'Earth, Wind & Fire': 67 | s = cp() 68 | s.artist = 'Earth Wind Fire' 69 | yield s 70 | if artist == 'Big & Rich': 71 | s = cp() 72 | s.artist = 'Big Rich' 73 | yield s 74 | if artist == 'Peaches & Herb': 75 | s = cp() 76 | s.artist = 'Peaches Herb' 77 | yield s 78 | if artist == 'Maroon5': 79 | s = cp() 80 | s.artist = 'Maroon 5' 81 | yield s 82 | if 'B****' in title: 83 | s = cp() 84 | s.title = title.replace('B****', 'Bitch') 85 | yield s 86 | if artist in artists_renamed: 87 | s = cp() 88 | s.artist = artists_renamed[artist] 89 | yield s 90 | 91 | # cause it has no parens. yuk yuk. 92 | orphaned = re.sub('\(.*\)', '', title) 93 | if orphaned != title: 94 | s = cp() 95 | s.title = orphaned 96 | yield s 97 | if artist.endswith(' s'): 98 | s = cp() 99 | s.artist = artist[:-2]+'s' 100 | yield s 101 | 102 | with open(PICKLE_NAME) as f: 103 | db = pickle.load(f) 104 | 105 | # nvm. probably better just to use os.path.exists each time. we need to sleep 106 | # between requests anyways, so who cares if it's slower 107 | #extant = load_extant(LYRICS_DIR) 108 | malencoded = 0 109 | with open('song_404s.txt') as to_retry: 110 | bad_keys = set([line.split('\t')[-1].strip() for line in to_retry]) 111 | 112 | with open('still_404s.txt', 'w') as skips_file: 113 | for artist in db: 114 | for orig_song in db[artist].itervalues(): 115 | k = common.song_key(orig_song) 116 | if k not in bad_keys: 117 | continue 118 | path = os.path.join(LYRICS_DIR, k + EXT) 119 | found = False 120 | for song in transformed_songs(orig_song): 121 | #print "Transformed {} to {}".format(orig_song, song) 122 | try: 123 | lyrics, url = Lyrics.get_lyrics2(song) 124 | time.sleep(SLEEPYTIME) 125 | except Lyrics.LyricsNotFoundException: 126 | time.sleep(SLEEPYTIME) 127 | continue 128 | if len(lyrics) < 5: 129 | continue 130 | else: 131 | found = True 132 | break 133 | if not found: 134 | try: 135 | skips_file.write('\t'.join([orig_song.artist, orig_song.title, k]) + '\n') 136 | except UnicodeEncodeError: 137 | malencoded += 1 138 | else: 139 | print "Success! {}".format(orig_song) 140 | with open(path, 'w') as f: 141 | try: 142 | f.write(lyrics) 143 | except UnicodeEncodeError: 144 | # Blah blah fishcakes. Somehow got into a situation where, like, if there are multi-byte 145 | # unicode code points in the lyrics, we get each byte encoded in utf-8, rather than the 146 | # whole thing. TODO: should probably file a bug on... 
someone 147 | lyrics = unicode_unfuck(lyrics) 148 | f.write(lyrics) 149 | 150 | print "Skipped {} malencoded songs".format(malencoded) 151 | -------------------------------------------------------------------------------- /badromance_infgen.txt: -------------------------------------------------------------------------------- 1 | ! infgen 2.4 output 2 | ! 3 | gzip 4 | ! 5 | last 6 | dynamic 7 | ! stats table 56:1 8 | litlen 10 6 9 | litlen 32 4 10 | litlen 33 8 11 | litlen 39 6 12 | litlen 40 7 13 | litlen 41 7 14 | litlen 44 7 15 | litlen 45 6 16 | litlen 65 8 17 | litlen 66 9 18 | litlen 67 9 19 | litlen 71 9 20 | litlen 73 7 21 | litlen 74 9 22 | litlen 76 9 23 | litlen 79 9 24 | litlen 82 8 25 | litlen 84 9 26 | litlen 87 8 27 | litlen 89 9 28 | litlen 97 4 29 | litlen 98 7 30 | litlen 99 6 31 | litlen 100 5 32 | litlen 101 4 33 | litlen 102 7 34 | litlen 103 6 35 | litlen 104 5 36 | litlen 105 5 37 | litlen 106 8 38 | litlen 107 7 39 | litlen 108 5 40 | litlen 109 6 41 | litlen 110 5 42 | litlen 111 5 43 | litlen 112 8 44 | litlen 114 5 45 | litlen 115 5 46 | litlen 116 5 47 | litlen 117 6 48 | litlen 118 7 49 | litlen 119 6 50 | litlen 120 9 51 | litlen 121 6 52 | litlen 122 9 53 | litlen 256 9 54 | litlen 257 4 55 | litlen 258 5 56 | litlen 259 5 57 | litlen 260 6 58 | litlen 262 7 59 | litlen 263 9 60 | litlen 264 7 61 | litlen 265 8 62 | litlen 266 6 63 | litlen 267 6 64 | litlen 268 6 65 | litlen 269 6 66 | litlen 270 6 67 | litlen 271 9 68 | litlen 272 9 69 | litlen 273 9 70 | litlen 274 8 71 | litlen 275 8 72 | litlen 276 7 73 | litlen 277 7 74 | litlen 278 9 75 | litlen 279 8 76 | litlen 280 9 77 | litlen 281 8 78 | litlen 283 9 79 | dist 2 5 80 | dist 3 6 81 | dist 4 5 82 | dist 5 7 83 | dist 6 7 84 | dist 7 6 85 | dist 8 3 86 | dist 9 5 87 | dist 10 3 88 | dist 11 4 89 | dist 12 4 90 | dist 13 5 91 | dist 14 4 92 | dist 15 5 93 | dist 16 5 94 | dist 17 4 95 | dist 18 4 96 | dist 19 3 97 | dist 20 6 98 | dist 21 4 99 | literal 10 'Oh-o 100 | match 10 3 101 | literal '! 
102 | match 14 16 103 | match 5 19 104 | literal 10 'Caught in a bad romance 105 | literal 10 106 | match 61 61 107 | literal 10 'Rah r 108 | match 3 4 109 | literal 'ah- 110 | match 5 3 111 | match 3 45 112 | literal 'Ro m 113 | match 4 22 114 | literal 'o- 115 | match 3 7 116 | match 4 4 117 | literal ' 118 | literal 10 'Gaga 119 | match 3 77 120 | literal 'la 121 | match 3 3 122 | match 3 35 123 | literal 'Want your 124 | match 59 78 125 | match 35 79 126 | literal 'I w 127 | match 9 26 128 | literal 'ugly 129 | match 13 18 130 | literal 'disease 131 | match 14 21 132 | literal 'everything 133 | literal 10 'As lo 134 | match 3 9 135 | literal 'as it's fre 136 | match 15 46 137 | literal 'lov 138 | match 3 18 139 | literal 'L 140 | match 3 6 141 | literal '-l 142 | match 8 5 143 | match 20 34 144 | match 14 120 145 | literal 'rama 146 | literal 10 'The touch of 147 | match 6 25 148 | literal 'hand 149 | match 15 62 150 | literal 'eather studded kiss 151 | match 4 390 152 | match 3 21 153 | literal ' s 154 | match 18 46 155 | match 19 142 156 | literal ', 157 | match 17 34 158 | literal '( 159 | match 32 34 160 | literal ') 161 | match 3 177 162 | literal 'You know 163 | match 3 107 164 | literal 'at 165 | match 11 34 166 | match 3 278 167 | match 3 119 168 | literal 'y 169 | match 15 30 170 | literal 'nee 171 | match 6 21 172 | match 8 144 173 | literal 'it 174 | match 5 395 175 | match 4 71 176 | match 28 405 177 | match 5 144 178 | match 17 200 179 | literal 'r 180 | match 3 389 181 | literal 'nge 182 | match 5 62 183 | match 5 30 184 | literal 'me could write 185 | match 16 644 186 | literal '( 187 | match 13 690 188 | match 3 536 189 | literal '!) 190 | match 23 102 191 | literal 'All 192 | match 10 19 193 | literal 'rs 194 | match 48 106 195 | match 63 812 196 | match 14 168 197 | match 10 63 198 | match 84 876 199 | match 47 798 200 | literal 'horror 201 | match 15 800 202 | literal 'esign 203 | literal 10 '' 204 | match 3 146 205 | match 3 807 206 | match 3 20 207 | literal ''r 208 | match 4 275 209 | literal 'criminal 210 | match 13 801 211 | match 5 50 212 | match 3 23 213 | match 68 801 214 | literal 'psycho, 215 | match 6 13 216 | match 3 424 217 | literal 'tigo shtick 218 | match 10 206 219 | match 4 289 220 | literal 'my 221 | match 3 450 222 | literal 'ar wind 223 | match 3 653 224 | literal 10 'Baby 225 | match 8 168 226 | literal 's 227 | match 3 46 228 | match 19 765 229 | match 140 799 230 | literal ', 231 | match 111 794 232 | match 104 793 233 | match 51 731 234 | match 143 795 235 | match 78 1750 236 | literal 'Wal 237 | match 3 651 238 | literal 'w 239 | match 3 6 240 | literal ' fashion 241 | match 3 33 242 | literal 'by 243 | match 3 19 244 | literal 'ork 245 | match 4 524 246 | literal 'm 247 | match 4 392 248 | match 5 556 249 | literal 'bit 250 | match 3 1553 251 | literal 'crazy 252 | match 5 56 253 | literal '- 254 | match 115 55 255 | literal 'pass 256 | match 18 55 257 | literal 'I'm 258 | match 3 375 259 | match 5 1796 260 | match 6 56 261 | match 4 31 262 | match 20 694 263 | match 5 771 264 | match 21 694 265 | match 18 43 266 | literal 'I don't 267 | match 4 24 268 | literal 'n 269 | match 3 468 270 | literal 'e 271 | match 3 96 272 | literal 'iends 273 | match 3 88 274 | literal '(J'veux 275 | match 3 1828 276 | match 3 495 277 | literal 'm 278 | match 5 1948 279 | literal 't je 280 | match 6 21 281 | literal 'a 282 | match 4 86 283 | match 3 352 284 | match 3 1801 285 | literal 'j 286 | match 15 40 287 | literal ') 288 | match 26 87 289 | match 4 973 
290 | match 24 113
291 | match 52 52
292 | match 22 498
293 | literal '(c
294 | match 22 680
295 | literal ')
296 | match 11 255
297 | match 218 1018
298 | match 23 774
299 | match 25 956
300 | match 23 48
301 | match 65 88
302 | match 78 1044
303 | end
304 | ! stats literals 5.4 bits each (1772/331)
305 | ! stats matches 88.6% (128 x 20.2)
306 | ! stats inout 534:0 (459) 2916 0
307 | ! stats total inout 534:0 (459) 2916
308 | ! stats total block average 2916.0 uncompressed
309 | ! stats total block average 459.0 symbols
310 | ! stats total literals 5.4 bits each
311 | ! stats total matches 88.6% (128 x 20.2)
312 | !
313 | crc
314 | length
315 |
--------------------------------------------------------------------------------
/normalizer.py:
--------------------------------------------------------------------------------
1 | """
2 | Thierry Bertin-Mahieux (2011) Columbia University
3 | tb2332@columbia.edu
4 |
5 |
6 | This code contains functions to normalize an artist name,
7 | and possibly a song title.
8 | This is intended to do metadata matching.
9 | It is mostly an elaborate hack, I never did an extensive search of
10 | all problematic name matches.
11 | Code developed using Python 2.6 on a Ubuntu machine, using UTF-8
12 |
13 | This is part of the Million Song Dataset project from
14 | LabROSA (Columbia University) and The Echo Nest.
15 |
16 |
17 | Copyright 2011, Thierry Bertin-Mahieux
18 |
19 | This program is free software: you can redistribute it and/or modify
20 | it under the terms of the GNU General Public License as published by
21 | the Free Software Foundation, either version 3 of the License, or
22 | (at your option) any later version.
23 |
24 | This program is distributed in the hope that it will be useful,
25 | but WITHOUT ANY WARRANTY; without even the implied warranty of
26 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 | GNU General Public License for more details.
28 |
29 | You should have received a copy of the GNU General Public License
30 | along with this program. If not, see <http://www.gnu.org/licenses/>.
31 | """
32 |
33 | import os
34 | import re
35 | import sys
36 | import unicodedata
37 | import itertools
38 | import Levenshtein # http://pypi.python.org/pypi/python-Levenshtein/
39 |
40 |
41 | # ROTATION SYMBOLS (A and B => B and A)
42 | rotation_symbols = ['\|', '/', '&', ',', '\+', ';', '_']#, '\-']
43 | rotation_words = ['and', 'y', 'et', 'vs', 'vs.', 'v', 'with', 'feat',
44 |                   'feat.', 'featuring', 'presents', 'ft.', 'pres.']
45 |
46 | # SYMBOLS TO REMOVE AT THE BEGINNING
47 | stub_to_remove = ['dj', 'dj.', 'mc', 'm.c.', 'mc.', 'the', 'los', 'les']
48 |
49 | # SYMBOLS TO REMOVE AT THE END
50 | end_to_remove1 = ['big band', 'trio', 'quartet', 'ensemble', 'orchestra']
51 | end_to_remove2 = ['band']
52 |
53 | # COMPILED REGULAR EXPRESSION
54 | # white spaces
55 | re_space = re.compile(r'\s')
56 | # non alphanumeric
57 | re_nonalphanum = re.compile(r'\W')
58 | # rotation symbols
59 | re_rotsymbols = re.compile('\s*?' 
+ '|'.join(rotation_symbols) + '\s*?') 60 | # rotation words 61 | re_rotwords = re.compile(r'\s(' + '|'.join(rotation_words) + ')\s') 62 | # stub to remove 63 | re_remstub = re.compile('(' + '|'.join(stub_to_remove) + ')\s(.*)') 64 | # ending to remove 65 | re_remending1 = re.compile('(.*)\s(' + '|'.join(end_to_remove1) + ')') 66 | re_remending2 = re.compile('(.*)\s(' + '|'.join(end_to_remove2) + ')') 67 | # quotes to remove 68 | re_remquotes = re.compile('(.+)\s(".+?")\s(.+)') 69 | # parenthesis to remove 70 | re_remparenthesis = re.compile('(.+)\s(\(.+?\))\s*(.*)') 71 | # brackets to remove 72 | re_rembrackets = re.compile('(.+)\s(\[.+?\])\s*(.*)') 73 | 74 | 75 | def char_is_ascii(c): 76 | """ 77 | Check if a unicode character, e.g. u'A', u'1' or u'\u0301' is ASCII 78 | """ 79 | #return ord(c) < 128 80 | # the following should be faster, according to: 81 | #http://stackoverflow.com/questions/196345/how-to-check-if-a-string-in-python-is-in-ascii 82 | return c < u"\x7F" 83 | 84 | 85 | def remove_non_ascii(s): 86 | """ 87 | Normalize characters in unicode string 's' that are not ASCII, 88 | try to transform accented characters to non accented version. 89 | Otherwise, remove non-ascii chars 90 | """ 91 | decomposition = unicodedata.normalize('NFKD', s) 92 | return filter(lambda x: char_is_ascii(x), decomposition) 93 | 94 | 95 | def to_lower_case(s): 96 | """ 97 | transform a unicode string 's' to lowercase 98 | ok, this one is trivial, I know 99 | """ 100 | return s.lower() 101 | 102 | 103 | def remove_spaces(s): 104 | """ 105 | Remove all possible spaces in the unicode string s 106 | """ 107 | return re_space.sub('', s) 108 | 109 | 110 | def replace_rotation_symbols(s): 111 | """ 112 | Mostly, replace '&' by 'and' 113 | """ 114 | return re_rotsymbols.sub(' and ', s) 115 | 116 | 117 | def remove_stub(s): 118 | """ 119 | Remove a questionable beginning, e.g. dj 120 | otherwise return string at is 121 | """ 122 | m = re_remstub.match(s) 123 | if not m: 124 | return s 125 | return m.groups()[1] 126 | 127 | 128 | def remove_endings(s): 129 | """ 130 | Remove questionable endings, e.g. 'band' 131 | """ 132 | m = re_remending1.match(s) 133 | if m: 134 | s = m.groups()[0] 135 | m = re_remending2.match(s) 136 | if m: 137 | s = m.groups()[0] 138 | return s 139 | 140 | 141 | def remove_quotes(s): 142 | """ 143 | Remove the quote, like Thierry "The Awesomest" BM 144 | """ 145 | m = re_remquotes.match(s) 146 | if not m: 147 | return s 148 | parts = m.groups() 149 | assert len(parts) == 3 150 | return parts[0] + ' ' + parts[2] 151 | 152 | 153 | def remove_parenthesis(s): 154 | """ 155 | Remove parenthesis, like Thierry (Coolest guy) 156 | """ 157 | m = re_remparenthesis.match(s) 158 | if not m: 159 | return s 160 | parts = m.groups() 161 | assert len(parts) >= 2 162 | if len(parts) == 2: 163 | return parts[0] 164 | return parts[0] + ' ' + parts[2] 165 | 166 | 167 | def remove_brackets(s): 168 | """ 169 | Remove brackets, like Thierry [Coolest guy] 170 | """ 171 | m = re_rembrackets.match(s) 172 | if not m: 173 | return s 174 | parts = m.groups() 175 | assert len(parts) >= 2 176 | if len(parts) == 2: 177 | return parts[0] 178 | return parts[0] + ' ' + parts[2] 179 | 180 | 181 | def normalize_no_rotation(s): 182 | """ 183 | We normalize a name that is supposed to contain no 184 | rotation term ('and', 'y', ...) 
185 | """ 186 | # remove beginning 187 | s = remove_stub(s) 188 | # remove ends 189 | s = remove_endings(s) 190 | # remove () 191 | s = remove_parenthesis(s) 192 | # remove "" 193 | s = remove_quotes(s) 194 | return s 195 | 196 | 197 | def split_rotation_words(s): 198 | """ 199 | Split a name using the rotation words: 'and', 'vs', 'y', 'et', ... 200 | then create all possible permutations 201 | """ 202 | parts = re_rotwords.split(s) 203 | parts = filter(lambda p: not p in rotation_words, parts)[:5] 204 | results = set() 205 | # keep only the individual elems (risky?) 206 | for p in parts: 207 | results.add(p) 208 | # create all permutations 209 | permutations = itertools.permutations(parts) 210 | #maxperm = 30 211 | #count_perm = 0 212 | for perm in permutations: 213 | #count_perm += 1 214 | #if count_perm > maxperm: 215 | # break 216 | results.add(' '.join(perm)) 217 | # redo the same but remove the stub first for all parts 218 | parts = map(lambda p: normalize_no_rotation(p), parts) 219 | for p in parts: 220 | results.add(p) 221 | permutations = itertools.permutations(parts) 222 | for perm in permutations: 223 | results.add(' '.join(perm)) 224 | # done 225 | return results 226 | 227 | 228 | def remove_nonalphanumeric(s): 229 | """ 230 | Remove usual punctuation signs: ! , ? : ; . ' etc 231 | Also, we transform long spaces into normal ones 232 | """ 233 | # split around non-alphanum chars 234 | parts = re_nonalphanum.split(s) 235 | # remove empty spots 236 | parts = filter(lambda p: p, parts) 237 | # rejoin with regular space ' ' 238 | return ' '.join(parts) 239 | 240 | 241 | def normalize_artist(s): 242 | """ 243 | Return a set of normalized versions of that artist name 244 | """ 245 | # normalized versions 246 | results = set() 247 | # lower case 248 | s = to_lower_case(s) 249 | results.add(s) 250 | # remove non-ascii chars (try to replace them) 251 | s = remove_non_ascii(s) 252 | results.add(s) 253 | # try removing parenthesis before, in case there's an & in it 254 | s2 = remove_parenthesis(s) 255 | results.add(s2) 256 | # replace rotation symbols 257 | s = replace_rotation_symbols(s) 258 | # split and permute according to rotation words 259 | permutations = split_rotation_words(s) 260 | results.update(permutations) 261 | # remove non-alphanumeric and normalize spaces 262 | results = map(lambda s: remove_nonalphanumeric(s), results) 263 | # remove all spaces 264 | results = map(lambda s: remove_spaces(s), results) 265 | # done (and remove dupes) 266 | return set(results) 267 | 268 | 269 | def normalize_title(s): 270 | """ 271 | Return a set of normalized versions of that title 272 | """ 273 | # normalized versions 274 | results = set() 275 | # lower case 276 | s = to_lower_case(s) 277 | results.add(s) 278 | # remove non-ascii chars (try to replace them) 279 | s = remove_non_ascii(s) 280 | results.add(s) 281 | # try removing parenthesis 282 | s = remove_parenthesis(s) 283 | results.add(s) 284 | # try removing brackets 285 | s = remove_brackets(s) 286 | results.add(s) 287 | # remove non-alphanumeric and normalize spaces 288 | results = map(lambda s: remove_nonalphanumeric(s), results) 289 | # remove all spaces 290 | results = map(lambda s: remove_spaces(s), results) 291 | # done (and remove dupes) 292 | return set(results) 293 | 294 | 295 | def same_artist(name1, name2): 296 | """ 297 | Compare two artists: 298 | - edit distance 299 | - if one name is contained in the other 300 | - by normalizing the names 301 | Return True if it's the same artist, False otherwise 302 | """ 303 | # 
trivial 304 | n1 = to_lower_case(name1) 305 | n2 = to_lower_case(name2) 306 | if n1 == n2: 307 | return True 308 | # edit distance 309 | if len(n1) >= 10 or len(n2) >= 10: 310 | if Levenshtein.distance(n1, n2) <= 2: 311 | return True 312 | # n1 contains n2? or the other way around 313 | if len(n1) >= 10 and len(n2) >= 10: 314 | if len(n1) > len(n2): 315 | if n1.find(n2) >= 0: 316 | return True 317 | else: 318 | if n2.find(n1) >= 0: 319 | return True 320 | # compare by normalizing names 321 | normalized1 = normalize_artist(n1) 322 | normalized2 = normalize_artist(n2) 323 | if len(normalized1.intersection(normalized2)) > 0: 324 | return True 325 | return False 326 | 327 | 328 | def same_title(title1, title2): 329 | """ 330 | Compare two titles: 331 | - edit distance 332 | - if one name is contained in the other 333 | - by normalizing the title 334 | Return True if it's the same title, False otherwise 335 | """ 336 | # trivial 337 | t1 = to_lower_case(title1) 338 | t2 = to_lower_case(title2) 339 | if t1 == t2: 340 | return True 341 | # edit distance 342 | if len(t1) >= 10 or len(t2) >= 10: 343 | if Levenshtein.distance(t1, t2) <= 2: 344 | return True 345 | # n1 contains n2? or the other way around 346 | if len(t1) >= 10 and len(t2) >= 10: 347 | if len(t1) > len(t2): 348 | if t1.find(t2) >= 0: 349 | return True 350 | else: 351 | if t2.find(t1) >= 0: 352 | return True 353 | # compare by normalizing names 354 | normalized1 = normalize_title(t1) 355 | normalized2 = normalize_title(t2) 356 | if len(normalized1.intersection(normalized2)) > 0: 357 | return True 358 | return False 359 | --------------------------------------------------------------------------------
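
A quick illustration of the matching helpers at the end of normalizer.py (an illustrative Python 2 session; inputs should be unicode strings, and python-Levenshtein must be installed):

    >>> from normalizer import same_artist, same_title
    >>> same_artist(u'Simon & Garfunkel', u'Garfunkel and Simon')
    True
    >>> same_title(u"Ain't No Mountain High Enough", u'Aint No Mountain High Enough')
    True

The first comparison succeeds via normalization: '&' is rewritten to 'and', the name is split on rotation words, and all permutations of the pieces are compared. The second never reaches normalization; it is caught by the cheap edit-distance check (distance 1, within the threshold of 2).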