├── .gitignore ├── README.md ├── artist_ramp.py ├── cramp.py ├── density.py ├── lmatch.py ├── ramp.py ├── tiny.dat ├── tools.py ├── track.py └── util ├── README.md ├── dump_loudness.py ├── make_plots.py ├── nsort.py ├── nsort2.py ├── plotter └── sdir.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.out 3 | *.gz 4 | data/* 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | msd-examples 2 | ============ 3 | Example code for processing the Million Song Dataset. This repository contains 4 | map-reduce jobs and utilities for working with the dataset. 5 | 6 | The Million Song Dataset is a freely-available collection of audio features and metadata for a million contemporary popular 7 | music tracks, available at: 8 | 9 | http://labrosa.ee.columbia.edu/millionsong/pages/getting-dataset 10 | 11 | Its purposes are: 12 | 13 | * To encourage research on algorithms that scale to commercial sizes 14 | * To provide a reference dataset for evaluating research 15 | * As a shortcut alternative to creating a large dataset with The Echo Nest's API 16 | * To help new researchers get started in the MIR field 17 | 18 | The core of the dataset is the feature analysis and metadata for one million songs, provided by The Echo Nest. The dataset 19 | does not include any audio, only the derived features. Note, however, that sample audio can be fetched from services like 20 | 7digital, using code we provide. Additional datasets have been attached to the Million Song Dataset; so far they contain lyrics and cover songs. The Million Song Dataset started as a collaborative project between The Echo Nest and LabROSA. 21 | It was supported in part by the NSF. 22 | 23 | These examples depend on mrjob, a Python library for running MapReduce jobs on Hadoop or Amazon Web Services. See 24 | https://github.com/Yelp/mrjob and http://packages.python.org/mrjob/. 25 | 26 | 27 | MSD Data on S3 28 | ============== 29 | These examples use MSD data that has been loaded on to S3 at s3://tbmmsd. There are around 330 files, each with about 3,000 30 | tracks of data (one track per line), where each line is represented by 54 fields as described here: 31 | 32 | http://labrosa.ee.columbia.edu/millionsong/pages/field-list 33 | 34 | except that in the flat file format, the 'track id' field has been moved from field 52 to the first field. 35 | 36 | In the repository you will find tiny.dat, which contains data for 20 tracks. 37 | 38 | 39 | 40 | 41 | Map-reduce jobs 42 | =============== 43 | 44 | Density 45 | ------- 46 | Finds the most dense and the least dense songs. 47 | 48 | density.py 49 | 50 | 51 | ### Local Usage: 52 | 53 | python density.py tiny.dat 54 | 55 | 56 | ### EC2 Usage 57 | This will run the job on Amazon Elastic MapReduce using 100 small instances. Note that you have to 58 | add the track.py code to t.tar.gz with: 59 | 60 | % tar cvfz t.tar.gz track.py 61 | 62 | To run the job on 100 CPUs on all of the MSD use: 63 | 64 | % python density.py --num-ec2-instances 100 --python-archive t.tar.gz -r emr 's3://tbmmsd/*.tsv.*' > output.dat 65 | 66 | 67 | (Of course, you will need to set up your Amazon credentials. See http://packages.python.org/mrjob/writing-and-running.html#running-on-emr )
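
Parsing tracks locally
======================
If you want to poke at the data outside of a map-reduce job, track.py can be used on
its own to parse the flat file format. Here is a minimal sketch (run it from the
repository root, so that tiny.dat is found):

    import track

    # load_tracks parses each line of the flat file into a dictionary;
    # see track.py for the full set of fields
    for t in track.load_tracks('tiny.dat'):
        print t['artist_name'], '-', t['title'], t['duration'], 'seconds'

The output of a local map-reduce run is tab-separated key/value lines, so it should
pipe straight into the sorting utility in util:

    % python density.py tiny.dat | python util/nsort.py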
68 | 69 | 70 | -------------------------------------------------------------------------------- /artist_ramp.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """ 4 | A map-reduce that calculates the average slow build 5 | for each artist 6 | """ 7 | 8 | from mrjob.job import MRJob 9 | import track 10 | from itertools import imap 11 | import unicodedata 12 | 13 | # note: YIELD_ALL is not used by this job 14 | 15 | 16 | YIELD_ALL = False 17 | 18 | class MRRamp(MRJob): 19 | """ A map-reduce job that calculates the ramp factor """ 20 | 21 | def mapper(self, _, line): 22 | """ The mapper loads a track and yields its ramp score, keyed by artist """ 23 | t = track.load_track(line) 24 | if t and t['duration'] > 60 and len(t['segments']) > 20: 25 | segments = t['segments'] 26 | half_track = t['duration'] / 2 27 | first_half = 0 28 | second_half = 0 29 | first_count = 0 30 | second_count = 0 31 | 32 | xdata = [] 33 | ydata = [] 34 | for i in xrange(len(segments)): 35 | seg = segments[i] 36 | 37 | # bail out if we have a really long quiet segment 38 | # these are usually surprise hidden tracks 39 | 40 | if seg['loudness_max'] < -40 and seg['duration'] > 30: 41 | return 42 | 43 | seg_loudness = seg['loudness_max'] * seg['duration'] 44 | 45 | if seg['start'] + seg['duration'] <= half_track: 46 | seg_loudness = seg['loudness_max'] * seg['duration'] 47 | first_half += seg_loudness 48 | first_count += 1 49 | elif seg['start'] < half_track and seg['start'] + seg['duration'] > half_track: 50 | # this is the nasty segment that spans the song midpoint. 51 | # apportion the loudness appropriately 52 | first_seg_loudness = seg['loudness_max'] * (half_track - seg['start']) 53 | first_half += first_seg_loudness 54 | first_count += 1 55 | 56 | second_seg_loudness = seg['loudness_max'] * (seg['duration'] - (half_track - seg['start'])) 57 | second_half += second_seg_loudness 58 | second_count += 1 59 | else: 60 | seg_loudness = seg['loudness_max'] * seg['duration'] 61 | second_half += seg_loudness 62 | second_count += 1 63 | 64 | xdata.append( seg['start'] ) 65 | ydata.append( seg['loudness_max'] ) 66 | 67 | # only yield data if we've had sufficient segments in the first 68 | # and second half of the track.
(This is to avoid the proverbial 69 | # hidden tracks that have extreme amounts of leading or trailing 70 | # silence.) 71 | if first_count > 10 and second_count > 10: 72 | correlation = pearsonr(xdata, ydata) 73 | ramp_factor = second_half / half_track - first_half / half_track 74 | score = correlation * ramp_factor 75 | yield (t['artist_id'], t['artist_name']), (score, t['track_id']) 76 | 77 | def reducer(self, key, val): 78 | count = 0 79 | sum = 0 80 | best = -60 81 | best_id = None 82 | 83 | for score, trid in val: 84 | sum += score 85 | count += 1 86 | if score > best: 87 | best = score 88 | best_id = trid 89 | avg = sum / count 90 | 91 | if count > 5 and avg > 5: 92 | yield key, (avg, count, best, best_id) 93 | 94 | 95 | def pearsonr(x, y): 96 | # Assume len(x) == len(y) 97 | n = len(x) 98 | sum_x = sum(x) 99 | sum_y = sum(y) 100 | sum_x_sq = sum(map(lambda x: pow(x, 2), x)) 101 | sum_y_sq = sum(map(lambda x: pow(x, 2), y)) 102 | psum = sum(imap(lambda x, y: x * y, x, y)) 103 | num = psum - (sum_x * sum_y/n) 104 | den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5) 105 | if den == 0: 106 | return 0 107 | return num / den 108 | 109 | 110 | if __name__ == '__main__': 111 | MRRamp.run() 112 | -------------------------------------------------------------------------------- /cramp.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | A map-reduce that calculates the difference in 4 | average volume between the first and the second 5 | half of the song. 6 | """ 7 | 8 | from mrjob.job import MRJob 9 | import track 10 | from itertools import imap 11 | import unicodedata 12 | 13 | # if YIELD_ALL is true, we yield all ramp factors, otherwise, 14 | # we yield just the extremes 15 | 16 | YIELD_ALL = False 17 | 18 | class MRRamp(MRJob): 19 | """ A map-reduce job that calculates the ramp factor """ 20 | 21 | def mapper(self, _, line): 22 | """ The mapper loads a track and yields its ramp factor """ 23 | t = track.load_track(line) 24 | if t and t['duration'] > 60 and len(t['segments']) > 20: 25 | segments = t['segments'] 26 | half_track = t['duration'] / 2 27 | first_half = 0 28 | second_half = 0 29 | first_count = 0 30 | second_count = 0 31 | 32 | xdata = [] 33 | ydata = [] 34 | for i in xrange(len(segments)): 35 | seg = segments[i] 36 | 37 | # bail out if we have a really long quiet segment 38 | # these are usually surprise hidden tracks 39 | 40 | if seg['loudness_max'] < -40 and seg['duration'] > 30: 41 | return 42 | 43 | seg_loudness = seg['loudness_max'] * seg['duration'] 44 | 45 | if seg['start'] + seg['duration'] <= half_track: 46 | seg_loudness = seg['loudness_max'] * seg['duration'] 47 | first_half += seg_loudness 48 | first_count += 1 49 | elif seg['start'] < half_track and seg['start'] + seg['duration'] > half_track: 50 | # this is the nasty segment that spans the song midpoint. 51 | # apportion the loudness appropriately 52 | first_seg_loudness = seg['loudness_max'] * (half_track - seg['start']) 53 | first_half += first_seg_loudness 54 | first_count += 1 55 | 56 | second_seg_loudness = seg['loudness_max'] * (seg['duration'] - (half_track - seg['start'])) 57 | second_half += second_seg_loudness 58 | second_count += 1 59 | else: 60 | seg_loudness = seg['loudness_max'] * seg['duration'] 61 | second_half += seg_loudness 62 | second_count += 1 63 | 64 | xdata.append( seg['start'] ) 65 | ydata.append( seg['loudness_max'] ) 66 | 67 | # only yield data if we've had sufficient segments in the first 68 | # and second half of the track.
(This is to avoid the proverbial 69 | # hidden tracks that have extreme amounts of leading or trailing 70 | # silence.) 71 | 72 | correlation = pearsonr(xdata, ydata) 73 | if first_count > 10 and second_count > 10: 74 | ramp_factor = second_half / half_track - first_half / half_track 75 | #if YIELD_ALL or ramp_factor > 10 or ramp_factor < -10: 76 | if YIELD_ALL or (ramp_factor > 10 and correlation > .5): 77 | yield (t['artist_name'], t['title'], t['track_id'], correlation), ramp_factor 78 | 79 | # no need for a reducer 80 | #def reducer(self, key, val): 81 | #yield (key, sum(val)) 82 | 83 | 84 | def pearsonr(x, y): 85 | # Assume len(x) == len(y) 86 | n = len(x) 87 | sum_x = sum(x) 88 | sum_y = sum(y) 89 | sum_x_sq = sum(map(lambda x: pow(x, 2), x)) 90 | sum_y_sq = sum(map(lambda x: pow(x, 2), y)) 91 | psum = sum(imap(lambda x, y: x * y, x, y)) 92 | num = psum - (sum_x * sum_y/n) 93 | den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5) 94 | if den == 0: 95 | return 0 96 | return num / den 97 | 98 | 99 | def test(): 100 | x = [1,2,3,4,5,6,7,8,9,10] 101 | y = [10, 20, 35, 45, 47, 60, 70, 87, 91, 100] 102 | print pearsonr(x,y) 103 | 104 | 105 | if __name__ == '__main__': 106 | MRRamp.run() 107 | -------------------------------------------------------------------------------- /density.py: -------------------------------------------------------------------------------- 1 | """ 2 | A map-reduce that calculates the density for each 3 | of a set of tracks. The track density is the average 4 | number of segments per second for a track. 5 | """ 6 | 7 | from mrjob.job import MRJob 8 | import track 9 | 10 | # if YIELD_ALL is true, we yield all densities, otherwise, 11 | # we yield just the extremes 12 | 13 | YIELD_ALL = True 14 | 15 | class MRDensity(MRJob): 16 | """ A map-reduce job that calculates the density """ 17 | 18 | def mapper(self, _, line): 19 | """ The mapper loads a track and yields its density """ 20 | t = track.load_track(line) 21 | if t: 22 | if t['tempo'] > 0: 23 | density = len(t['segments']) / t['duration'] 24 | # only output extreme densities (unless YIELD_ALL is set) 25 | if YIELD_ALL or density > 8 or density < .5: 26 | yield (t['artist_name'], t['title'], t['song_id']), density 27 | 28 | # no need for a reducer 29 | #def reducer(self, key, val): 30 | #yield (key, sum(val)) 31 | 32 | if __name__ == '__main__': 33 | MRDensity.run() 34 | -------------------------------------------------------------------------------- /lmatch.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """ 4 | A map-reduce that matches the loudness curve of each 5 | track against a target shape (a sine wave), yielding 6 | either the sampled curve or its distance from the target.
7 | """ 8 | 9 | from mrjob.job import MRJob 10 | import track 11 | from itertools import imap 12 | import math 13 | import tools 14 | import sys 15 | 16 | # if YIELD_ALL is true, we yield all densities, otherwise, 17 | # we yield just the extremes 18 | 19 | YIELD_ALL = False 20 | 21 | class MRlmatch(MRJob): 22 | """ A map-reduce job that calculates the ramp factor """ 23 | 24 | DUMP = False 25 | SIZE = 64 26 | VECTOR = True 27 | #MATCH = tools.rnormalize(tools.scale(tools.sin2wave(SIZE), 60, -60), -60, 0) 28 | MATCH = tools.rnormalize(tools.scale(tools.sinwave(SIZE), 60, -60), -60, 0) 29 | 30 | def mapper(self, _, line): 31 | """ The mapper loads a track and yields its ramp factor """ 32 | t = track.load_track(line) 33 | segments = t['segments'] 34 | duration = t['duration'] 35 | xdata = [] 36 | ydata = [] 37 | for i in xrange(len(segments)): 38 | seg = segments[i] 39 | sloudness = seg['loudness_max'] 40 | sstart = seg['start'] + seg['loudness_max_time'] 41 | xdata.append( sstart ) 42 | ydata.append( sloudness ) 43 | 44 | if duration > 20: 45 | idata = tools.interpolate(xdata, ydata, int(duration) * 10) 46 | smooth = tools.smooth(idata, 20) 47 | samp = tools.sample(smooth, self.SIZE) 48 | ndata = tools.rnormalize(samp, -60, 0) 49 | if self.DUMP: 50 | for i, (x, y) in enumerate(zip(self.MATCH, ndata)): 51 | print i, x, y 52 | if self.VECTOR: 53 | yield (t['artist_name'], t['title'], t['track_id']), ndata 54 | else: 55 | distance = tools.distance(self.MATCH, ndata) 56 | yield (t['artist_name'], t['title'], t['track_id']), distance 57 | 58 | 59 | # no need for a reducer 60 | #def reducer(self, key, val): 61 | #yield (key, sum(val)) 62 | 63 | 64 | def dump(): 65 | data = tools.rnormalize(tools.scale(tools.sin2wave(256), 60, -60), -60, 0) 66 | for d in data: 67 | print d 68 | 69 | if __name__ == '__main__': 70 | #dump() 71 | MRlmatch.run() 72 | -------------------------------------------------------------------------------- /ramp.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | A map-reduce that calculates the difference in 4 | average volume between the first and the second 5 | half of the song. 6 | """ 7 | 8 | from mrjob.job import MRJob 9 | import track 10 | 11 | # if YIELD_ALL is true, we yield all densities, otherwise, 12 | # we yield just the extremes 13 | 14 | YIELD_ALL = False 15 | 16 | class MRRamp(MRJob): 17 | """ A map-reduce job that calculates the ramp factor """ 18 | 19 | def mapper(self, _, line): 20 | """ The mapper loads a track and yields its ramp factor """ 21 | t = track.load_track(line) 22 | if t and t['duration'] > 60 and len(t['segments']) > 20: 23 | segments = t['segments'] 24 | half_track = t['duration'] / 2 25 | first_half = 0 26 | second_half = 0 27 | first_count = 0 28 | second_count = 0 29 | 30 | for i in xrange(len(segments)): 31 | seg = segments[i] 32 | 33 | # bail out if we have a really long quiet segment 34 | # these are usually surprise hidden tracks 35 | 36 | if seg['loudness_max'] < -40 and seg['duration'] > 30: 37 | return 38 | 39 | seg_loudness = seg['loudness_max'] * seg['duration'] 40 | 41 | if seg['start'] + seg['duration'] <= half_track: 42 | seg_loudness = seg['loudness_max'] * seg['duration'] 43 | first_half += seg_loudness 44 | first_count += 1 45 | elif seg['start'] < half_track and seg['start'] + seg['duration'] > half_track: 46 | # this is the nasty segment that spans the song midpoint. 
47 | # apportion the loudness appropriately 48 | first_seg_loudness = seg['loudness_max'] * (half_track - seg['start']) 49 | first_half += first_seg_loudness 50 | first_count += 1 51 | 52 | second_seg_loudness = seg['loudness_max'] * (seg['duration'] - (half_track - seg['start'])) 53 | second_half += second_seg_loudness 54 | second_count += 1 55 | else: 56 | seg_loudness = seg['loudness_max'] * seg['duration'] 57 | second_half += seg_loudness 58 | second_count += 1 59 | 60 | # only yield data if we've had sufficient segments in the first 61 | # and second half of the track. (This is to avoid the proverbial 62 | # hidden tracks that have extreme amounts of leading or trailing 63 | # silence.) 64 | 65 | if first_count > 10 and second_count > 10: 66 | ramp_factor = second_half / half_track - first_half / half_track 67 | if YIELD_ALL or ramp_factor > 10 or ramp_factor < -10: 68 | yield (t['artist_name'], t['title'], t['track_id']), ramp_factor 69 | 70 | # no need for a reducer 71 | #def reducer(self, key, val): 72 | #yield (key, sum(val)) 73 | 74 | if __name__ == '__main__': 75 | MRRamp.run() 76 | -------------------------------------------------------------------------------- /tools.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | import time 4 | from itertools import imap # needed by pearsonr below 5 | 6 | def interpolate(xdata, ydata, points): 7 | results = [] 8 | duration = xdata[-1] 9 | inc = duration / float(points) 10 | ctime = 0 11 | cindex = 0 12 | 13 | last_index = 0 14 | for i in xrange(points): 15 | for j in xrange(last_index, len(xdata) - 1): 16 | #print 'xd', xdata[j], xdata[j+1], ctime 17 | if ctime < xdata[j+1]: 18 | break 19 | last_index = j 20 | 21 | frac = (ctime - xdata[j]) / (xdata[j+1] - xdata[j]) 22 | y = frac * (ydata[j+1] - ydata[j]) + ydata[j] 23 | #print 'ct', ctime, xdata[j], xdata[j+1] 24 | results.append(y) 25 | 26 | ctime += inc 27 | return results 28 | 29 | 30 | def smooth(data, fsize=10): 31 | out = [] 32 | filter = [] 33 | # TODO: make this a centered filter 34 | for d in data: 35 | filter.append(d) 36 | if len(filter) > fsize: 37 | filter.pop(0) 38 | out.append( sum(filter) / len(filter)) 39 | return out 40 | 41 | def sample(data, size): 42 | results = [] 43 | jump = float(len(data)) / float(size) 44 | for i in xrange(size): 45 | index = int(round(i * jump)) 46 | index = min(len(data) - 1, index) 47 | results.append(data[index]) 48 | return results 49 | 50 | 51 | def normalize(data, range = 1): 52 | max_data = max(data) 53 | min_data = min(data) 54 | 55 | out = [ range * (x - min_data) / (max_data - min_data) for x in data] 56 | return out 57 | 58 | def rnormalize(data, min_data = 0, max_data = 1, range = 1): 59 | data = clamp(data, min_data, max_data) 60 | out = [ range * (x - min_data) / (max_data - min_data) for x in data] 61 | return out 62 | 63 | 64 | def clamp(data, min_data, max_data): 65 | results = [] 66 | for d in data: 67 | d = min(d, max_data) 68 | d = max(d, min_data) 69 | results.append(d) 70 | return results 71 | 72 | 73 | def distance(d1, d2): 74 | if len(d1) != len(d2): 75 | raise ValueError 76 | sum = 0 77 | for p1, p2 in zip(d1, d2): 78 | d = p2 - p1 79 | dd = d * d 80 | sum += dd 81 | #print p1, p2, dd, sum 82 | return math.sqrt(sum) 83 | 84 | def qdistance(d1, d2): 85 | if len(d1) != len(d2): 86 | raise ValueError 87 | sum = 0 88 | for p1, p2 in zip(d1, d2): 89 | d = p2 - p1 90 | dd = d * d 91 | sum += dd 92 | #print p1, p2, dd, sum 93 | return sum 94 | 95 | 96 | 97 | def pearsonr(x, y): 98 | # Assume len(x) == len(y)
99 | n = len(x) 100 | sum_x = sum(x) 101 | sum_y = sum(y) 102 | sum_x_sq = sum(map(lambda x: pow(x, 2), x)) 103 | sum_y_sq = sum(map(lambda x: pow(x, 2), y)) 104 | psum = sum(imap(lambda x, y: x * y, x, y)) 105 | num = psum - (sum_x * sum_y/n) 106 | den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5) 107 | if den == 0: 108 | return 0 109 | return num / den 110 | 111 | 112 | 113 | def sinwave(size): 114 | results = [] 115 | inc = 3.14159 / size 116 | angle = 0 117 | for x in xrange(size): 118 | results.append( math.sin(angle) ) 119 | angle += inc 120 | 121 | return normalize(results) 122 | 123 | def sin2wave(size): 124 | results = [] 125 | inc = 3.14159 / size 126 | angle = -3.14159/2 127 | for x in xrange(size): 128 | results.append( math.sin(angle) ) 129 | angle += inc 130 | 131 | return normalize(results) 132 | 133 | def sin3wave(size): 134 | results = [] 135 | inc = 4 * 3.14159 / size 136 | angle = -3.14159/2 137 | for x in xrange(size): 138 | results.append( math.sin(angle) ) 139 | angle += inc 140 | 141 | return normalize(results) 142 | 143 | def coswave(size): 144 | results = [] 145 | inc = 3.14159 / size 146 | angle = 0 147 | for x in xrange(size): 148 | results.append( math.cos(angle) ) 149 | angle += inc 150 | 151 | return normalize(results) 152 | 153 | def ramp(start = 0, inc = 1, size=10): 154 | results = [] 155 | val = start 156 | for x in xrange(size): 157 | results.append( val ) 158 | val += inc 159 | return results 160 | 161 | def add_noise(data, range): 162 | results = [] 163 | for d in data: 164 | results.append(d + random.triangular(-range, range)) 165 | return results 166 | 167 | def scale(data, scale, offset = 0): 168 | results = [] 169 | for d in data: 170 | results.append(d * scale + offset) 171 | return results 172 | 173 | def dump(d): 174 | for i in d: 175 | print i 176 | 177 | def timing(size = 32,count = 1000000): 178 | start = time.time() 179 | sin2 = sin2wave(size) 180 | cos = coswave(size) 181 | 182 | for i in xrange(count): 183 | qdistance(sin2, cos) 184 | 185 | end = time.time() 186 | 187 | print end - start 188 | 189 | 190 | 191 | 192 | def timing2(size = 32, count = 1000000): 193 | start = time.time() 194 | ramp1 = ramp(0, 1, size) 195 | ramp2 = ramp(0, 2, size) 196 | 197 | for i in xrange(count): 198 | qdistance(ramp1, ramp2) 199 | 200 | end = time.time() 201 | 202 | print end - start 203 | 204 | def test(): 205 | xdata = ramp(0, .3, 1000) 206 | ydata = sin2wave(1000) 207 | ydata = add_noise(ydata, .5) 208 | idata = interpolate(xdata, ydata, 1400) 209 | sdata = smooth(idata, 20) 210 | samp = sample(sdata, 256) 211 | ndata = normalize(samp) 212 | 213 | sin2 = sin2wave(256) 214 | cos = coswave(256) 215 | sin = sinwave(256) 216 | flat = normalize(ramp(0, .01, 256)) 217 | 218 | print "# sin2wav", distance(ndata, sin2) 219 | print "# coswav", distance(ndata, cos) 220 | print "# sinwav", distance(ndata, sin) 221 | print "# flat", distance(ndata, flat) 222 | 223 | for i, (a, b, c, d, e) in enumerate(zip(ndata, sin2, cos, sin, flat)): 224 | print i, a,b,c,d,e 225 | 226 | 227 | 228 | if __name__ == '__main__': 229 | #timing() 230 | #timing2() 231 | dump(sin3wave(100)) 232 | #test() 233 | -------------------------------------------------------------------------------- /track.py: -------------------------------------------------------------------------------- 1 | 2 | """ Processes track data from the Million Song Dataset. Specifically, this 3 | file contains functions that load the flat-file format of tracks for the 4 | MSD.
The format is one track per line, where each line is represented by 54 5 | fields as described here: 6 | 7 | http://labrosa.ee.columbia.edu/millionsong/pages/field-list 8 | 9 | except that in the flat file format, the 'track id' field has been moved 10 | from field 52 to the first field. 11 | 12 | A track is represented as a dictionary. 13 | """ 14 | 15 | import sys 16 | import pprint 17 | 18 | def load_track(line): 19 | """ Loads a track from a single line """ 20 | t = {} 21 | 22 | f = line.split('\t') 23 | if len(f) == 54: 24 | t['track_id'] = f[0] 25 | t['analysis_sample_rate'] = f[1] 26 | t['artist_7digitalid'] = f[2] 27 | t['artist_familiarity'] = float(f[3]) 28 | t['artist_hotttnesss'] = float(f[4]) 29 | t['artist_id'] = f[5] 30 | t['artist_latitude'] = float(f[6]) 31 | t['artist_location'] = f[7] 32 | t['artist_longitude'] = float(f[8]) 33 | t['artist_mbid'] = f[9] 34 | 35 | tag_words = f[10].split(',') 36 | tag_count = f[11].split(',') 37 | mbtags = [ (w, int(c)) for w,c in zip(tag_words, tag_count) if len(w) > 0] 38 | t['artist_mbtags'] = mbtags 39 | 40 | t['artist_name'] = f[12] 41 | t['artist_playmeid'] = int(f[13]) 42 | 43 | artist_terms = f[14].split(',') 44 | artist_terms_freq = f[15].split(',') 45 | artist_terms_weight = f[16].split(',') 46 | t['artist_terms'] = [ (term, float(freq), float(weight)) \ 47 | for term ,freq, weight in zip(artist_terms, artist_terms_freq, artist_terms_weight) if len(term) > 0] 48 | 49 | t['audio_md5'] = f[17] 50 | 51 | bars_confidence = f[18].split(',') 52 | bars_start = f[19].split(',') 53 | t['bars'] = [ (float(start), float(conf)) \ 54 | for start, conf in zip(bars_start, bars_confidence) if len(start) > 0 ] 55 | 56 | beats_confidence = f[20].split(',') 57 | beats_start = f[21].split(',') 58 | t['beats'] = [ (float(start), float(conf)) \ 59 | for start, conf in zip(beats_start, beats_confidence) if len(start) > 0 ] 60 | 61 | t['danceability'] = float(f[22]) 62 | t['duration'] = float(f[23]) 63 | t['end_of_fade_in'] = float(f[24]) 64 | t['energy'] = float(f[25]) 65 | t['key'] = (int(f[26]), float(f[27])) 66 | t['loudness'] = float(f[28]) 67 | t['mode'] = (int(f[29]), float(f[30])) 68 | t['release'] = f[31] 69 | t['release_7digitalid'] = f[32] 70 | srid = f[32].zfill(10) 71 | t['cover_art'] = 'http://cdn.7static.com/static/img/sleeveart/%s/%s/%s/%s_200.jpg' \ 72 | % (srid[0:2], srid[2:5], srid[5:8], srid) 73 | 74 | sections_confidence = f[33].split(',') 75 | sections_start = f[34].split(',') 76 | t['sections'] = [ (float(start), float(conf)) \ 77 | for start, conf in zip(sections_start, sections_confidence) if len(start) > 0 ] 78 | 79 | seg_confidence = f[35].split(',') 80 | seg_loudness_max = f[36].split(',') 81 | seg_loudness_max_time = f[37].split(',') 82 | seg_loudness_max_start = f[38].split(',') 83 | seg_pitches = f[39].split(',') 84 | seg_start = f[40].split(',')[:-1] 85 | seg_timbre = f[41].split(',') 86 | 87 | PITCH_COUNT = 12 88 | TIMBRE_COUNT = 12 89 | t['segments'] = [] 90 | for i, sstart in enumerate(seg_start): 91 | if len(sstart) > 0: 92 | seg = {} 93 | seg['start'] = float(sstart) 94 | seg['confidence'] = float(seg_confidence[i]) 95 | seg['loudness_max'] = float(seg_loudness_max[i]) 96 | seg['loudness_max_time'] = float(seg_loudness_max_time[i]) 97 | seg['loudness_start'] = float(seg_loudness_max_start[i]) 98 | seg['pitch'] =[ float(p) for p in seg_pitches[i * PITCH_COUNT: i * PITCH_COUNT + PITCH_COUNT]] 99 | seg['timbre'] =[ float(p) for p in seg_timbre[i * TIMBRE_COUNT: i * TIMBRE_COUNT + TIMBRE_COUNT]] 100 | 
t['segments'].append(seg) 101 | if i < len(seg_start) - 1: 102 | seg['duration'] = float(seg_start[i + 1]) - seg['start'] 103 | else: 104 | seg['duration'] = t['duration'] - seg['start'] 105 | 106 | t['similar_artists'] = [s for s in f[42].split(',') if len(s) > 0] 107 | t['song_hotttnesss'] = float(f[43]) 108 | t['song_id'] = f[44] 109 | t['start_of_fade_out'] = float(f[45]) 110 | 111 | tatums_confidence = f[46].split(',') 112 | tatums_start = f[47].split(',') 113 | t['tatums'] = [ (float(start), float(conf)) \ 114 | for start, conf in zip(tatums_start, tatums_confidence) if len(start) > 0 ] 115 | t['tempo'] = float(f[48]) 116 | t['time_signature'] = (int(f[49]), float(f[50])) 117 | t['title'] = f[51] 118 | t['track_7digitalid'] = int(f[52]) 119 | t['preview'] = 'http://previews.7digital.com/clips/34/%d.clip.mp3' % (int(f[52]), ) 120 | t['year'] = int(f[53]) 121 | return t 122 | else: 123 | print >>sys.stderr, 'mismatched fields, found', len(f), 'should have 54' 124 | return None 125 | 126 | 127 | 128 | 129 | def load_tracks(path): 130 | """ Loads a list of tracks from a file """ 131 | 132 | tracks = [] 133 | file = open(path) 134 | for which, line in enumerate(file): 135 | track = load_track(line) 136 | if track is not None: 137 | track['path'] = path 138 | track['line'] = which 139 | tracks.append(track) 140 | file.close() 141 | return tracks 142 | 143 | def process_tracks(path, func): 144 | """ Applies func(track) to each track found in path """ 145 | file = open(path) 146 | for which, line in enumerate(file): 147 | track = load_track(line) 148 | if track is not None: 149 | track['path'] = path 150 | track['line'] = which 151 | func(track) 152 | file.close() 153 | 154 | 155 | def dump(track): 156 | """ Dumps some data from a track for debugging """ 157 | print track['line'], track['track_id'], track['artist_id'], len(track['artist_mbtags']), \ 158 | len(track['artist_terms'] ), len(track['bars']), len(track['beats']), track['title'], \ 159 | track['key'], track['mode'], len(track['segments']) 160 | for seg in track['segments']: 161 | print ' ', seg['start'], seg['duration'], track['duration'] 162 | print 163 | 164 | 165 | if __name__ == '__main__': 166 | process_tracks(sys.argv[1], dump) 167 | -------------------------------------------------------------------------------- /util/README.md: -------------------------------------------------------------------------------- 1 | Utilities 2 | --------- 3 | A set of utility scripts to help process output data from the map-reduce jobs 4 | 5 | nsort.py - sorts map-reduce output numerically by value 6 | nsort2.py - sorts map-reduce output whose values are JSON lists 7 | dump_loudness.py - dumps the loudness for each segment of a track, suitable for plotting 8 | make_plots.py - renders loudness plots with gnuplot 9 | sdir.py - builds an HTML page from a set of loudness plots 10 | plotter - writes a gnuplot script for a single track's loudness data 11 | -------------------------------------------------------------------------------- /util/dump_loudness.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | from itertools import imap 4 | from pyechonest import track 5 | import spotimeta 6 | import time 7 | WEIGHT = 10 8 | 9 | def dump_loudness(id, file=sys.stdout): 10 | t = track.track_from_id(id) 11 | title = t.title + ' by ' + t.artist 12 | spotify_id = get_spotify_id(t.artist, t.title) 13 | 14 | print >>file, "# ID ", id 15 | print >>file, "#", title 16 | print >>file, "# ARTIST ", t.artist 17 | print >>file, "# TITLE ", t.title 18 | print >>file, "# SONG_ID ", t.song_id 19 | print >>file, "# SPOT_ID ", spotify_id 20 | print >>file, "#" 21 | 22 | weighted = [] 23 | half_track = t.duration / 2 24 | first_half = 0 25 | second_half = 0 26 | 27 | xdata = [] 28 | ydata = [] 29 | 30 | for seg in t.segments: 31
| sstart = seg['start'] 32 | sloudness = min(seg['loudness_max'], 0) 33 | sloudness = max(sloudness, -60) 34 | sduration = seg['duration'] 35 | send = sstart + sduration 36 | 37 | weighted.append(sloudness) 38 | if len(weighted) > WEIGHT: 39 | weighted.pop(0) 40 | avg = sum(weighted) / len(weighted) 41 | 42 | if send <= half_track: 43 | seg_loudness = sloudness * sduration 44 | first_half += seg_loudness 45 | elif sstart < half_track and send > half_track: 46 | # this is the nasty segment that spans the song midpoint. 47 | # apportion the loudness appropriately 48 | first_seg_loudness = sloudness * (half_track - sstart) 49 | first_half += first_seg_loudness 50 | 51 | second_seg_loudness = sloudness * (send - half_track) 52 | second_half += second_seg_loudness 53 | else: 54 | seg_loudness = sloudness * sduration 55 | second_half += seg_loudness 56 | 57 | xdata.append( sstart ) 58 | ydata.append( sloudness ) 59 | 60 | ramp_factor = second_half / half_track - first_half / half_track 61 | #print >>file, seg['start'], sloudness, avg, first_half, second_half, ramp_factor 62 | #print >>file, "%8.6f %9.4f %9.4f %12.6f %12.6f %12.6f" % (sstart, sloudness, avg, first_half, second_half, ramp_factor) 63 | print >>file, "%8.6f %9.4f %9.4f" % (sstart, sloudness, avg) 64 | 65 | correlation = pearsonr(xdata, ydata) 66 | print >>file, "#" 67 | print >>file, "#", 'ramp factor', ramp_factor 68 | print >>file, "#", 'correlation', correlation 69 | print >>file, "#", 'first', first_half / half_track 70 | print >>file, "#", 'second', second_half / half_track 71 | print >>file, "#" 72 | 73 | return title, ramp_factor, first_half / half_track, second_half / half_track 74 | 75 | def pearsonr(x, y): 76 | # Assume len(x) == len(y) 77 | n = len(x) 78 | sum_x = sum(x) 79 | sum_y = sum(y) 80 | sum_x_sq = sum(map(lambda x: pow(x, 2), x)) 81 | sum_y_sq = sum(map(lambda x: pow(x, 2), y)) 82 | psum = sum(imap(lambda x, y: x * y, x, y)) 83 | num = psum - (sum_x * sum_y/n) 84 | den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5) 85 | if den == 0: 86 | return 0 87 | return num / den 88 | 89 | 90 | def spotify_search(name): 91 | retries = 5 92 | while retries > 0: 93 | try: 94 | results = spotimeta.search_track(name) 95 | return results 96 | except spotimeta.ServerError: 97 | print >> sys.stderr, " ... retrying spotify for ", name 98 | time.sleep(5) 99 | retries -= 1 100 | return None 101 | 102 | def get_spotify_id(artist, title): 103 | name = artist + ' ' + title 104 | name = norm(name) 105 | 106 | search = spotify_search(name) 107 | if search and search["total_results"] > 0: 108 | best = search['result'][0] 109 | result = best['href'] 110 | print >> sys.stderr, ' found', result, best['artist']['name'], best['name'] 111 | return result 112 | else: 113 | print >> sys.stderr, "Couldn't spotifind", name 114 | return None 115 | 116 | def norm(name): 117 | s = name 118 | s = s.replace(".", "") 119 | s = s.lower() 120 | s = re.sub(r'&', ' and ', s) 121 | s = re.sub(r' +', ' ', s) 122 | s = s.strip() 123 | 124 | # if we've normalized away everything 125 | # keep it. 
126 | if len(s) == 0: 127 | s = name 128 | return s 129 | 130 | 131 | 132 | if __name__ == '__main__': 133 | dump_loudness(sys.argv[1]) 134 | -------------------------------------------------------------------------------- /util/make_plots.py: -------------------------------------------------------------------------------- 1 | # 30.8926305957 ["David Coverdale", "Into The Light", "TRPNADQ128F422DB22"] 2 | 3 | import simplejson as json 4 | import sys 5 | import os 6 | import sdir 7 | import dump_loudness 8 | 9 | plotter_text = """ 10 | set terminal postscript landscape size 8.4,5.9 11 | set output "foo.plt" 12 | set style data line 13 | set key bottom 14 | set xlabel "seconds" 15 | set ylabel "decibels" 16 | set title " %s " 17 | plot "%s.out" using 1:2 with line linecolor rgb "#aaffaa" title "raw loudness" 18 | replot "%s.out" using 1:2 with line lt -1 linecolor rgb "blue" smooth bezier title "smoothed loudness" 19 | replot %f with line linecolor rgb "#777777" title "" 20 | replot %f with line linecolor rgb "#777777" title "" 21 | set key title "score:%.2f ramp:%.2f corr:%.2f" 22 | set terminal postscript landscape size 8.4,5.9 23 | set output "plot.ps" 24 | replot 25 | """ 26 | 27 | def make_plot(id, force=False): 28 | plot_path = os.path.join("plots", id + ".png") 29 | if force or not os.path.exists(plot_path): 30 | print "Creating plot", plot_path 31 | name, sid, spot, ramp, first, second, correlation = sdir.get_plot_info(id) 32 | score = ramp * correlation 33 | title = name.replace('"', "'") 34 | plotter = open("plotter.gplt", "w") 35 | print >>plotter, plotter_text % (title, id, id, first, second, score, ramp, correlation) 36 | plotter.close() 37 | os.system("gnuplot plotter.gplt") 38 | os.system("convert -rotate 90 plot.ps %s" % (plot_path,)) 39 | else: 40 | print >>sys.stderr, " plot exists, skipping...", plot_path 41 | 42 | def create_plot(id): 43 | force = False 44 | path = id + ".out" 45 | if not os.path.exists(path): 46 | f = open(path, "w") 47 | dump_loudness.dump_loudness(id, f) 48 | f.close() 49 | force = True 50 | else: 51 | print >>sys.stderr, " data exists, skipping...", id 52 | make_plot(id, force) 53 | 54 | 55 | def create_plots(): 56 | # parses input in the form 57 | # # 30.8926305957 ["David Coverdale", "Into The Light", "TRPNADQ128F422DB22"] 58 | for which, line in enumerate(sys.stdin): 59 | fields = line.strip().split() 60 | slist = ' '.join(fields[1:]) 61 | vals = json.loads(slist) 62 | id = vals[2] 63 | print >>sys.stderr, which, id 64 | create_plot(id) 65 | 66 | 67 | 68 | if __name__ == '__main__': 69 | if len(sys.argv) > 1: 70 | for id in sys.argv[1:]: 71 | create_plot(id) 72 | else: 73 | create_plots() 74 | 75 | -------------------------------------------------------------------------------- /util/nsort.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | 4 | 5 | lines = [] 6 | for line in sys.stdin: 7 | fields = line.strip().split('\t') 8 | if len(fields) == 2: 9 | key, sval = fields 10 | if len(sval) > 0: 11 | lines.append( (key, float(sval)) ) 12 | 13 | lines.sort(reverse=True, key=lambda s:s[1]) 14 | 15 | 16 | for key, val in lines: 17 | print val, key 18 | -------------------------------------------------------------------------------- /util/nsort2.py: -------------------------------------------------------------------------------- 1 | 2 | import simplejson as json 3 | import sys 4 | 5 | 6 | lines = [] 7 | which = 0 8 | 9 | if len(sys.argv) > 1: 10 | which = int(sys.argv[1]) 11 | 12 | for line in sys.stdin: 13 |
vals = json.loads(sval) 14 | if len(fields) == 2: 15 | key, sval = fields 16 | if len(sval) > 0: 17 | vals = json.loads(sval) 18 | lines.append( (key, vals) ) 19 | 20 | lines.sort(reverse=True, key=lambda s:s[1][which]) 21 | 22 | 23 | for key, val in lines: 24 | print val, key 25 | -------------------------------------------------------------------------------- /util/plotter: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ID TRANELS128F425B25C 3 | # I'll Kill You by Konami Kukeiha Club 4 | 5 | ID=`awk ' $2 == "ID" { $1= ""; $2 = ""; print }' data/$1.out` 6 | title=`awk ' NR==2 { $1 = ""; print }' data/$1.out` 7 | 8 | rm -f plot 9 | echo plot \"$1.out\" using 1:2 with line linecolor rgb \"#eeeeee\" title \"\" >> plot 10 | echo replot \"$1.out\" using 1:3 with line linecolor rgb \"blue\" title \"loudness\" >> plot 11 | echo set xlabel \"Time seconds\" >> plot 12 | echo set ylabel \"decibels\" >> plot 13 | echo set title \"$title\" >> plot 14 | echo set terminal ps >> plot 15 | echo set output \"$ID.ps\" >> plot 16 | -------------------------------------------------------------------------------- /util/sdir.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | import re 4 | import sys 5 | import os 6 | 7 | 8 | def get_plot_info(id): 9 | path = id + ".out" 10 | tid = '' 11 | sid = '' 12 | spot = None 13 | name = '' 14 | factor = 0 15 | first = 0 16 | second = 0 17 | correlation = 0 18 | 19 | if os.path.exists(path): 20 | f = open(path) 21 | for which, line in enumerate(f): 22 | if which == 1: 23 | name = line.strip().replace("# ", "") 24 | elif line[0] == '#': 25 | fields = line.strip().split() 26 | if len(fields) >= 4 and fields[1] == 'ramp': 27 | factor = float(fields[3]) 28 | if len(fields) >= 3 and fields[1] == 'first': 29 | first = float(fields[2]) 30 | if len(fields) >= 3 and fields[1] == 'second': 31 | second = float(fields[2]) 32 | if len(fields) >= 3 and fields[1] == 'correlation': 33 | correlation = float(fields[2]) 34 | if len(fields) >= 3 and fields[1] == 'ID': 35 | tid = fields[2] 36 | if len(fields) >= 3 and fields[1] == 'SONG_ID': 37 | sid = fields[2] 38 | if len(fields) >= 3 and fields[1] == 'SPOT_ID' and fields[2] != 'None': 39 | spot = fields[2] 40 | 41 | if tid != id: 42 | print >> sys.stderr, "Mismatched ID ", id, tid 43 | return name, sid, spot, factor, first, second, correlation 44 | else: 45 | return None 46 | 47 | 48 | def make_entry(id): 49 | info = get_plot_info(id) 50 | if info: 51 | name, sid, spot_id, factor, first, second, correlation = info 52 | image = id + ".png" 53 | 54 | title = name 55 | 56 | if spot_id: 57 | print '<td>', 58 | print '<a href="%s">' % (spot_id,), 59 | print '<img src="%s" title="%s"></a></td>' % (image, title) 60 | else: 61 | pass 62 | # print '<td><img src="%s" title="%s"></td>' % (image, title) 63 | else: 64 | print >>sys.stderr, "Can't open info for", id 65 | 66 | def make_plot_page(index): 67 | for l in index: 68 | make_entry(l.strip()) 69 | 70 | 71 | def header(): 72 | print "<html>" 73 | print "<head>" 74 | print "<title>Loudness curves from the Million Song Dataset</title>" 75 | print "</head>" 76 | print "<body>" 77 | print "<h1>Loudness curves from the Million Song Dataset</h1>" 78 | print "<table>" 79 | print "<tr>" 80 | 81 | def footer(): 82 | print "</tr></table>" 83 | print "</body></html>" 84 | 85 | def build_page(index): 86 | header() 87 | for count, l in enumerate(open(index)): 88 | l = l.strip() 89 | print >> sys.stderr, count, l 90 | if len(l) > 0: 91 | if l[0] == '#': 92 | print "</tr><tr><th>", l[1:], "</th></tr><tr>" 93 | elif l.startswith("TR"): 94 | make_entry(l) 95 | footer() 96 | 97 | 98 | def qd_page(): 99 | header() 100 | for l in sys.stdin: 101 | plot = l.strip() 102 | print '<td><img src="%s"></td>' % (plot,), 103 | footer() 104 | 105 | def dump_ids(): 106 | # parses input in the form 107 | # # 30.8926305957 ["David Coverdale", "Into The Light", "TRPNADQ128F422DB22"] 108 | for which, line in enumerate(sys.stdin): 109 | fields = line.strip().split() 110 | slist = ' '.join(fields[1:]) 111 | vals = json.loads(slist) 112 | id = vals[2] 113 | print id 114 | 115 | 116 | if __name__ == '__main__': 117 | if len(sys.argv) > 1: 118 | if sys.argv[1] == '--dump': 119 | dump_ids() 120 | else: 121 | build_page(sys.argv[1]) 122 | else: 123 | qd_page() 124 | 125 | --------------------------------------------------------------------------------