├── .gitignore
├── README.md
├── artist_ramp.py
├── cramp.py
├── density.py
├── lmatch.py
├── ramp.py
├── tiny.dat
├── tools.py
├── track.py
└── util
├── README.md
├── dump_loudness.py
├── make_plots.py
├── nsort.py
├── nsort2.py
├── plotter
└── sdir.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.out
3 | *.gz
4 | data/*
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | msd-examples
2 | ===========
3 | Example code for processing the Million Song Dataset. This repository contains
4 | map-reduce jobs and utilities for working with the dataset.
5 |
6 | The Million Song Dataset is a freely-available collection of audio features and metadata for a million contemporary popular
7 | music tracks available at:
8 |
9 | http://labrosa.ee.columbia.edu/millionsong/pages/getting-dataset
10 |
11 | Its purposes are:
12 |
13 | * To encourage research on algorithms that scale to commercial sizes
14 | * To provide a reference dataset for evaluating research
15 | * As a shortcut alternative to creating a large dataset with The Echo Nest's API
16 | * To help new researchers get started in the MIR field
17 |
18 | The core of the dataset is the feature analysis and metadata for one million songs, provided by The Echo Nest. The dataset
19 | does not include any audio, only the derived features. Note, however, that sample audio can be fetched from services like
20 | 7digital, using code we provide. Additional datasets have been attached to the Million Song Dataset; so far these include lyrics and cover songs. The Million Song Dataset started as a collaborative project between The Echo Nest and LabROSA.
21 | It was supported in part by the NSF.
22 |
23 | These examples depend on mrjob, a Python library for running MapReduce jobs on Hadoop or Amazon Web Services. See
24 | https://github.com/Yelp/mrjob and http://packages.python.org/mrjob/.
25 |
26 |
27 | MSD Data on S3
28 | ==============
29 | These examples use MSD data that has been loaded onto S3 at s3://tbmmsd. There are around 330 files, each holding data
30 | for about 3,000 tracks (one track per line), where each line is represented by 54 fields as described here:
31 |
32 | http://labrosa.ee.columbia.edu/millionsong/pages/field-list
33 |
34 | except that in the flat file format, the 'track id' field has been moved from field 52 to the first field.
35 |
36 | In the repository you will find tiny.dat, which contains data for 20 tracks.
37 |
38 |
39 |
40 |
41 | Map-reduce jobs
42 | ===============
43 |
44 | Density
45 | ------
46 | Finds the most dense and the least dense songs.
47 |
48 | density.py
49 |
50 |
51 | ### Local Usage:
52 |
53 | python density.py tiny.dat
54 |
55 |
56 | ### EC2 Usage
57 | This runs the job on Amazon Elastic MapReduce with 100 small instances. Note that you have to
58 | add the track.py code to t.tar.gz with:
59 |
60 | % tar cvfz t.tar.gz track.py
61 |
62 | To run the job on 100 CPUs on all of the MSD use:
63 |
64 | % python density.py --num-ec2-instances 100 --python-archive t.tar.gz -r emr 's3://tbmmsd/*.tsv.*' > output.dat
65 |
66 |
67 | (Of course, you will need to set up your Amazon credentials. See http://packages.python.org/mrjob/writing-and-running.html#running-on-emr )
68 |
69 |
70 |
--------------------------------------------------------------------------------
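
To sanity-check the flat-file format described above, you can load a line of
tiny.dat with the repository's own track.py loader. A minimal sketch, run from
the repository root (field names follow track.py):

    import track

    line = open('tiny.dat').readline()
    t = track.load_track(line)   # returns None on a malformed line
    if t:
        print t['track_id'], t['artist_name'], '-', t['title']
        print 'duration:', t['duration'], 'segments:', len(t['segments'])
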
/artist_ramp.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | """
4 | A map-reduce that calculates the average slow build
5 | for each artist
6 | """
7 |
8 | from mrjob.job import MRJob
9 | import track
10 | from itertools import imap
11 | import unicodedata
12 |
13 | # if YIELD_ALL is true, we yield all scores, otherwise
14 | # we yield just the extremes (unused in this job)
15 |
16 | YIELD_ALL = False
17 |
18 | class MRRamp(MRJob):
19 | """ A map-reduce job that calculates the ramp factor """
20 |
21 | def mapper(self, _, line):
22 | """ The mapper loads a track and yields its ramp factor """
23 | t = track.load_track(line)
24 | if t and t['duration'] > 60 and len(t['segments']) > 20:
25 | segments = t['segments']
26 | half_track = t['duration'] / 2
27 | first_half = 0
28 | second_half = 0
29 | first_count = 0
30 | second_count = 0
31 |
32 | xdata = []
33 | ydata = []
34 | for i in xrange(len(segments)):
35 | seg = segments[i]
36 |
37 | # bail out if we have a really long quiet segment
38 | # these are usually surprise hidden tracks
39 |
40 | if seg['loudness_max'] < -40 and seg['duration'] > 30:
41 | return
42 |
43 | seg_loudness = seg['loudness_max'] * seg['duration']
44 |
45 | if seg['start'] + seg['duration'] <= half_track:
46 | seg_loudness = seg['loudness_max'] * seg['duration']
47 | first_half += seg_loudness
48 | first_count += 1
49 | elif seg['start'] < half_track and seg['start'] + seg['duration'] > half_track:
50 | # this is the nasty segment that spans the song midpoint.
51 | # apportion the loudness appropriately
52 | first_seg_loudness = seg['loudness_max'] * (half_track - seg['start'])
53 | first_half += first_seg_loudness
54 | first_count += 1
55 |
56 | second_seg_loudness = seg['loudness_max'] * (seg['duration'] - (half_track - seg['start']))
57 | second_half += second_seg_loudness
58 | second_count += 1
59 | else:
60 | seg_loudness = seg['loudness_max'] * seg['duration']
61 | second_half += seg_loudness
62 | second_count += 1
63 |
64 | xdata.append( seg['start'] )
65 | ydata.append( seg['loudness_max'] )
66 |
67 |             # only yield if we've seen sufficient segments in both halves
68 |             # of the track (this avoids the proverbial hidden tracks with
69 |             # extreme amounts of leading or trailing silence)
70 | 
71 |             if first_count > 10 and second_count > 10:
72 |                 correlation = pearsonr(xdata, ydata)
73 |                 ramp_factor = second_half / half_track - first_half / half_track
74 |                 score = correlation * ramp_factor
75 |                 yield (t['artist_id'], t['artist_name']), (score, t['track_id'])
76 |
77 | def reducer(self, key, val):
78 |         count = 0
79 |         total = 0
80 |         best = -60
81 |         best_id = None
82 | 
83 |         for score, trid in val:
84 |             total += score
85 |             count += 1
86 |             if score > best:
87 |                 best = score
88 |                 best_id = trid
89 |         avg = total / count
90 | 
91 |         if count > 5 and avg > 5:
92 |             yield key, (avg, count, best, best_id)
93 |
94 |
95 | def pearsonr(x, y):
96 | # Assume len(x) == len(y)
97 | n = len(x)
98 | sum_x = sum(x)
99 | sum_y = sum(y)
100 | sum_x_sq = sum(map(lambda x: pow(x, 2), x))
101 | sum_y_sq = sum(map(lambda x: pow(x, 2), y))
102 | psum = sum(imap(lambda x, y: x * y, x, y))
103 | num = psum - (sum_x * sum_y/n)
104 | den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5)
105 | if den == 0:
106 | return 0
107 | return num / den
108 |
109 |
110 | if __name__ == '__main__':
111 | MRRamp.run()
112 |
--------------------------------------------------------------------------------
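
The subtlest branch in the mapper above is the one that apportions a segment
spanning the song midpoint. A standalone sketch of that arithmetic, with
made-up numbers for illustration:

    # apportion a midpoint-spanning segment's loudness between the halves
    half_track = 120.0                          # midpoint of a 240-second track
    seg = {'start': 118.0, 'duration': 6.0, 'loudness_max': -10.0}

    first_part = half_track - seg['start']      # 2.0 seconds before the midpoint
    second_part = seg['duration'] - first_part  # 4.0 seconds after the midpoint
    print seg['loudness_max'] * first_part      # -20.0 added to first_half
    print seg['loudness_max'] * second_part     # -40.0 added to second_half
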
/cramp.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | A map-reduce that calculates the difference in average volume
4 | between the first and the second half of the song, keeping songs
5 | whose loudness also correlates well with time.
6 | """
7 |
8 | from mrjob.job import MRJob
9 | import track
10 | from itertools import imap
11 | import unicodedata
12 |
13 | # if YIELD_ALL is true, we yield all ramp factors, otherwise,
14 | # we yield just the extremes
15 |
16 | YIELD_ALL = False
17 |
18 | class MRRamp(MRJob):
19 | """ A map-reduce job that calculates the ramp factor """
20 |
21 | def mapper(self, _, line):
22 | """ The mapper loads a track and yields its ramp factor """
23 | t = track.load_track(line)
24 | if t and t['duration'] > 60 and len(t['segments']) > 20:
25 | segments = t['segments']
26 | half_track = t['duration'] / 2
27 | first_half = 0
28 | second_half = 0
29 | first_count = 0
30 | second_count = 0
31 |
32 | xdata = []
33 | ydata = []
34 | for i in xrange(len(segments)):
35 | seg = segments[i]
36 |
37 | # bail out if we have a really long quiet segment
38 | # these are usually surprise hidden tracks
39 |
40 | if seg['loudness_max'] < -40 and seg['duration'] > 30:
41 | return
42 |
43 | seg_loudness = seg['loudness_max'] * seg['duration']
44 |
45 | if seg['start'] + seg['duration'] <= half_track:
46 | seg_loudness = seg['loudness_max'] * seg['duration']
47 | first_half += seg_loudness
48 | first_count += 1
49 | elif seg['start'] < half_track and seg['start'] + seg['duration'] > half_track:
50 | # this is the nasty segment that spans the song midpoint.
51 | # apportion the loudness appropriately
52 | first_seg_loudness = seg['loudness_max'] * (half_track - seg['start'])
53 | first_half += first_seg_loudness
54 | first_count += 1
55 |
56 | second_seg_loudness = seg['loudness_max'] * (seg['duration'] - (half_track - seg['start']))
57 | second_half += second_seg_loudness
58 | second_count += 1
59 | else:
60 | seg_loudness = seg['loudness_max'] * seg['duration']
61 | second_half += seg_loudness
62 | second_count += 1
63 |
64 | xdata.append( seg['start'] )
65 | ydata.append( seg['loudness_max'] )
66 |
67 |             # only yield data if we've had sufficient segments in the first
68 |             # and second half of the track. (This is to avoid the proverbial
69 |             # hidden tracks that have extreme amounts of leading or trailing
70 |             # silence.)
71 |
72 | correlation = pearsonr(xdata, ydata)
73 | if first_count > 10 and second_count > 10:
74 | ramp_factor = second_half / half_track - first_half / half_track
75 | #if YIELD_ALL or ramp_factor > 10 or ramp_factor < -10:
76 |             if YIELD_ALL or (ramp_factor > 10 and correlation > .5):
77 | yield (t['artist_name'], t['title'], t['track_id'], correlation), ramp_factor
78 |
79 | # no need for a reducer
80 | #def reducer(self, key, val):
81 | #yield (key, sum(val))
82 |
83 |
84 | def pearsonr(x, y):
85 | # Assume len(x) == len(y)
86 | n = len(x)
87 | sum_x = sum(x)
88 | sum_y = sum(y)
89 | sum_x_sq = sum(map(lambda x: pow(x, 2), x))
90 | sum_y_sq = sum(map(lambda x: pow(x, 2), y))
91 | psum = sum(imap(lambda x, y: x * y, x, y))
92 | num = psum - (sum_x * sum_y/n)
93 | den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5)
94 | if den == 0:
95 | return 0
96 | return num / den
97 |
98 |
99 | def test():
100 | x = [1,2,3,4,5,6,7,8,9,10]
101 | y = [10, 20, 35, 45, 47, 60, 70, 87, 91, 100]
102 | print pearsonr(x,y)
103 |
104 |
105 | if __name__ == '__main__':
106 | MRRamp.run()
107 |
--------------------------------------------------------------------------------
/density.py:
--------------------------------------------------------------------------------
1 | """
2 | A map-reduce that calculates the density for each
3 | of a set of tracks. The track density is the average
4 | number of segments per second for a track.
5 | """
6 |
7 | from mrjob.job import MRJob
8 | import track
9 |
10 | # if YIELD_ALL is true, we yield all densities, otherwise,
11 | # we yield just the extremes
12 |
13 | YIELD_ALL = True
14 |
15 | class MRDensity(MRJob):
16 | """ A map-reduce job that calculates the density """
17 |
18 | def mapper(self, _, line):
19 | """ The mapper loads a track and yields its density """
20 | t = track.load_track(line)
21 | if t:
22 | if t['tempo'] > 0:
23 | density = len(t['segments']) / t['duration']
24 |                 # only output extreme densities unless YIELD_ALL is set
25 | if YIELD_ALL or density > 8 or density < .5:
26 | yield (t['artist_name'], t['title'], t['song_id']), density
27 |
28 | # no need for a reducer
29 | #def reducer(self, key, val):
30 | #yield (key, sum(val))
31 |
32 | if __name__ == '__main__':
33 | MRDensity.run()
34 |
--------------------------------------------------------------------------------
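
Besides the command lines shown in the README, a job can be driven from
Python. A minimal sketch using the runner API of the mrjob releases this
repository targets (mrjob 0.3-era; later versions renamed some of these calls):

    from density import MRDensity

    job = MRDensity(args=['tiny.dat'])
    with job.make_runner() as runner:
        runner.run()
        for line in runner.stream_output():
            key, value = job.parse_output_line(line)
            print key, value
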
/lmatch.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | """
4 | A map-reduce that matches the loudness curve of each
5 | track against a template shape (a sine wave), yielding
6 | either the sampled curve or its distance from the template.
7 | """
8 |
9 | from mrjob.job import MRJob
10 | import track
11 | from itertools import imap
12 | import math
13 | import tools
14 | import sys
15 |
16 | # if YIELD_ALL is true, we yield everything, otherwise
17 | # we yield just the extremes (unused in this job)
18 |
19 | YIELD_ALL = False
20 |
21 | class MRlmatch(MRJob):
22 |     """ A map-reduce job that matches loudness curves against a template """
23 |
24 | DUMP = False
25 | SIZE = 64
26 | VECTOR = True
27 | #MATCH = tools.rnormalize(tools.scale(tools.sin2wave(SIZE), 60, -60), -60, 0)
28 | MATCH = tools.rnormalize(tools.scale(tools.sinwave(SIZE), 60, -60), -60, 0)
29 |
30 | def mapper(self, _, line):
31 |         """ The mapper loads a track and yields its loudness-curve match """
32 |         t = track.load_track(line)
33 |         if not t:
34 |             return
35 |         segments = t['segments']
34 | duration = t['duration']
35 | xdata = []
36 | ydata = []
37 | for i in xrange(len(segments)):
38 | seg = segments[i]
39 | sloudness = seg['loudness_max']
40 | sstart = seg['start'] + seg['loudness_max_time']
41 | xdata.append( sstart )
42 | ydata.append( sloudness )
43 |
44 | if duration > 20:
45 | idata = tools.interpolate(xdata, ydata, int(duration) * 10)
46 | smooth = tools.smooth(idata, 20)
47 | samp = tools.sample(smooth, self.SIZE)
48 | ndata = tools.rnormalize(samp, -60, 0)
49 | if self.DUMP:
50 | for i, (x, y) in enumerate(zip(self.MATCH, ndata)):
51 | print i, x, y
52 | if self.VECTOR:
53 | yield (t['artist_name'], t['title'], t['track_id']), ndata
54 | else:
55 | distance = tools.distance(self.MATCH, ndata)
56 | yield (t['artist_name'], t['title'], t['track_id']), distance
57 |
58 |
59 | # no need for a reducer
60 | #def reducer(self, key, val):
61 | #yield (key, sum(val))
62 |
63 |
64 | def dump():
65 | data = tools.rnormalize(tools.scale(tools.sin2wave(256), 60, -60), -60, 0)
66 | for d in data:
67 | print d
68 |
69 | if __name__ == '__main__':
70 | #dump()
71 | MRlmatch.run()
72 |
--------------------------------------------------------------------------------
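
The interpolate -> smooth -> sample -> rnormalize pipeline is easiest to see
outside the job. A sketch that scores a synthetic sine-shaped loudness curve
against the MATCH template, using only helpers from tools.py (the distance
should come out small, since the shapes agree):

    import tools

    SIZE = 64
    match = tools.rnormalize(tools.scale(tools.sinwave(SIZE), 60, -60), -60, 0)

    # fake a 180-second track whose loudness follows a noisy sine
    xdata = tools.ramp(0, 180.0 / 255, 256)   # segment start times
    ydata = tools.add_noise(tools.scale(tools.sinwave(256), 60, -60), 3)

    idata = tools.interpolate(xdata, ydata, 180 * 10)
    ndata = tools.rnormalize(tools.sample(tools.smooth(idata, 20), SIZE), -60, 0)
    print tools.distance(match, ndata)
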
/ramp.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | A map-reduce that calculates the difference in
4 | average volume between the first and the second
5 | half of the song.
6 | """
7 |
8 | from mrjob.job import MRJob
9 | import track
10 |
11 | # if YIELD_ALL is true, we yield all ramp factors, otherwise,
12 | # we yield just the extremes
13 |
14 | YIELD_ALL = False
15 |
16 | class MRRamp(MRJob):
17 | """ A map-reduce job that calculates the ramp factor """
18 |
19 | def mapper(self, _, line):
20 | """ The mapper loads a track and yields its ramp factor """
21 | t = track.load_track(line)
22 | if t and t['duration'] > 60 and len(t['segments']) > 20:
23 | segments = t['segments']
24 | half_track = t['duration'] / 2
25 | first_half = 0
26 | second_half = 0
27 | first_count = 0
28 | second_count = 0
29 |
30 | for i in xrange(len(segments)):
31 | seg = segments[i]
32 |
33 | # bail out if we have a really long quiet segment
34 | # these are usually surprise hidden tracks
35 |
36 | if seg['loudness_max'] < -40 and seg['duration'] > 30:
37 | return
38 |
39 | seg_loudness = seg['loudness_max'] * seg['duration']
40 |
41 | if seg['start'] + seg['duration'] <= half_track:
42 | seg_loudness = seg['loudness_max'] * seg['duration']
43 | first_half += seg_loudness
44 | first_count += 1
45 | elif seg['start'] < half_track and seg['start'] + seg['duration'] > half_track:
46 | # this is the nasty segment that spans the song midpoint.
47 | # apportion the loudness appropriately
48 | first_seg_loudness = seg['loudness_max'] * (half_track - seg['start'])
49 | first_half += first_seg_loudness
50 | first_count += 1
51 |
52 | second_seg_loudness = seg['loudness_max'] * (seg['duration'] - (half_track - seg['start']))
53 | second_half += second_seg_loudness
54 | second_count += 1
55 | else:
56 | seg_loudness = seg['loudness_max'] * seg['duration']
57 | second_half += seg_loudness
58 | second_count += 1
59 |
60 |             # only yield data if we've had sufficient segments in the first
61 |             # and second half of the track. (This is to avoid the proverbial
62 |             # hidden tracks that have extreme amounts of leading or trailing
63 |             # silence.)
64 |
65 | if first_count > 10 and second_count > 10:
66 | ramp_factor = second_half / half_track - first_half / half_track
67 | if YIELD_ALL or ramp_factor > 10 or ramp_factor < -10:
68 | yield (t['artist_name'], t['title'], t['track_id']), ramp_factor
69 |
70 | # no need for a reducer
71 | #def reducer(self, key, val):
72 | #yield (key, sum(val))
73 |
74 | if __name__ == '__main__':
75 | MRRamp.run()
76 |
--------------------------------------------------------------------------------
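
As a concrete check of the metric, worked by hand rather than taken from the
job's output: a 240-second track averaging -20 dB in its first half and -5 dB
in its second has a ramp factor of 15, comfortably past the > 10 threshold.

    # ramp factor for a track that is 15 dB louder in its second half
    half_track = 240.0 / 2
    first_half = -20.0 * half_track    # loudness * seconds, first half
    second_half = -5.0 * half_track    # loudness * seconds, second half
    print second_half / half_track - first_half / half_track   # 15.0
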
/tools.py:
--------------------------------------------------------------------------------
1 | import math
2 | import random
3 | import time
4 | from itertools import imap   # needed by pearsonr below
4 |
5 |
6 | def interpolate(xdata, ydata, points):
7 | results = []
8 | duration = xdata[-1]
9 | inc = duration / float(points)
10 | ctime = 0
11 | cindex = 0
12 |
13 | last_index = 0
14 | for i in xrange(points):
15 | for j in xrange(last_index, len(xdata) - 1):
16 | #print 'xd', xdata[j], xdata[j+1], ctime
17 | if ctime < xdata[j+1]:
18 | break
19 | last_index = j
20 |
21 | frac = (ctime - xdata[j]) / (xdata[j+1] - xdata[j])
22 | y = frac * (ydata[j+1] - ydata[j]) + ydata[j]
23 | #print 'ct', ctime, xdata[j], xdata[j+1]
24 | results.append(y)
25 |
26 | ctime += inc
27 | return results
28 |
29 |
30 | def smooth(data, fsize=10):
31 | out = []
32 | filter = []
33 | # bug, make this be a centering filter
34 | for d in data:
35 | filter.append(d)
36 | if len(filter) > fsize:
37 | filter.pop(0)
38 | out.append( sum(filter) / len(filter))
39 | return out
40 |
41 | def sample(data, size):
42 | results = []
43 | jump = float(len(data)) / float(size)
44 | for i in xrange(size):
45 | index = int(round(i * jump))
46 | index = min(len(data) - 1, index)
47 | results.append(data[index])
48 | return results
49 |
50 |
51 | def normalize(data, range = 1):
52 | max_data = max(data)
53 | min_data = min(data)
54 |
55 | out = [ range * (x - min_data) / (max_data - min_data) for x in data]
56 | return out
57 |
58 | def rnormalize(data, min_data = 0, max_data = 1, range = 1):
59 | data = clamp(data, min_data, max_data)
60 | out = [ range * (x - min_data) / (max_data - min_data) for x in data]
61 | return out
62 |
63 |
64 | def clamp(data, min_data, max_data):
65 | results = []
66 | for d in data:
67 | d = min(d, max_data)
68 | d = max(d, min_data)
69 | results.append(d)
70 | return results
71 |
72 |
73 | def distance(d1, d2):
74 |     if len(d1) != len(d2):
75 | raise ValueError
76 | sum = 0
77 | for p1, p2 in zip(d1, d2):
78 | d = p2 - p1
79 | dd = d * d
80 | sum += dd
81 | #print p1, p2, dd, sum
82 | return math.sqrt(sum)
83 |
84 | def qdistance(d1, d2):
85 |     if len(d1) != len(d2):
86 | raise ValueError
87 | sum = 0
88 | for p1, p2 in zip(d1, d2):
89 | d = p2 - p1
90 | dd = d * d
91 | sum += dd
92 | #print p1, p2, dd, sum
93 | return sum
94 |
95 |
96 |
97 | def pearsonr(x, y):
98 | # Assume len(x) == len(y)
99 | n = len(x)
100 | sum_x = sum(x)
101 | sum_y = sum(y)
102 | sum_x_sq = sum(map(lambda x: pow(x, 2), x))
103 | sum_y_sq = sum(map(lambda x: pow(x, 2), y))
104 | psum = sum(imap(lambda x, y: x * y, x, y))
105 | num = psum - (sum_x * sum_y/n)
106 | den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5)
107 | if den == 0:
108 | return 0
109 | return num / den
110 |
111 |
112 |
113 | def sinwave(size):
114 | results = []
115 | inc = 3.14159 / size
116 | angle = 0
117 | for x in xrange(size):
118 | results.append( math.sin(angle) )
119 | angle += inc
120 |
121 | return normalize(results)
122 |
123 | def sin2wave(size):
124 | results = []
125 | inc = 3.14159 / size
126 | angle = -3.14159/2
127 | for x in xrange(size):
128 | results.append( math.sin(angle) )
129 | angle += inc
130 |
131 | return normalize(results)
132 |
133 | def sin3wave(size):
134 | results = []
135 | inc = 4 * 3.14159 / size
136 | angle = -3.14159/2
137 | for x in xrange(size):
138 | results.append( math.sin(angle) )
139 | angle += inc
140 |
141 | return normalize(results)
142 |
143 | def coswave(size):
144 | results = []
145 | inc = 3.14159 / size
146 | angle = 0
147 | for x in xrange(size):
148 | results.append( math.cos(angle) )
149 | angle += inc
150 |
151 | return normalize(results)
152 |
153 | def ramp(start = 0, inc = 1, size=10):
154 | results = []
155 | val = start
156 | for x in xrange(size):
157 | results.append( val )
158 | val += inc
159 | return results
160 |
161 | def add_noise(data, range):
162 | results = []
163 | for d in data:
164 | results.append(d + random.triangular(-range, range))
165 | return results
166 |
167 | def scale(data, scale, offset = 0):
168 | results = []
169 | for d in data:
170 | results.append(d * scale + offset)
171 | return results
172 |
173 | def dump(d):
174 | for i in d:
175 | print i
176 |
177 | def timing(size = 32,count = 1000000):
178 | start = time.time()
179 | sin2 = sin2wave(size)
180 | cos = coswave(size)
181 |
182 | for i in xrange(count):
183 | qdistance(sin2, cos)
184 |
185 | end = time.time()
186 |
187 | print end - start
188 |
189 |
190 |
191 |
192 | def timing2(size = 32, count = 1000000):
193 | start = time.time()
194 | ramp1 = ramp(0, 1, size)
195 | ramp2 = ramp(0, 2, size)
196 |
197 | for i in xrange(count):
198 | qdistance(ramp1, ramp2)
199 |
200 | end = time.time()
201 |
202 | print end - start
203 |
204 | def test():
205 |     xdata = ramp(0, .3, 1000)
206 | ydata = sin2wave(1000)
207 | ydata = add_noise(ydata, .5)
208 | idata = interpolate(xdata, ydata, 1400)
209 | sdata = smooth(idata, 20)
210 | samp = sample(sdata, 256)
211 | ndata = normalize(samp)
212 |
213 | sin2 = sin2wave(256)
214 | cos = coswave(256)
215 | sin = sinwave(256)
216 | flat = normalize(ramp(0, .01, 256))
217 |
218 | print "# sin2wav", distance(ndata, sin2)
219 | print "# coswav", distance(ndata, cos)
220 | print "# sinwav", distance(ndata, sin)
221 | print "# flat", distance(ndata, flat)
222 |
223 | for i, (a, b, c, d, e) in enumerate(zip(ndata, sin2, cos, sin, flat)):
224 | print i, a,b,c,d,e
225 |
226 |
227 |
228 | if __name__ == '__main__':
229 | #timing()
230 | #timing2()
231 | dump(sin3wave(100))
232 | #test()
233 |
--------------------------------------------------------------------------------
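
The difference between normalize and rnormalize above is worth spelling out:
normalize scales to the data's own extremes, while rnormalize clamps to a
fixed range first. A small example (float inputs, since this is Python 2):

    import tools

    print tools.normalize([-80.0, -30.0, 5.0])            # [0.0, 0.588..., 1.0]
    print tools.rnormalize([-80.0, -30.0, 5.0], -60, 0)   # [0.0, 0.5, 1.0]
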
/track.py:
--------------------------------------------------------------------------------
1 |
2 | """ Processes track data from the Million Song Dataset. Specifically, this
3 | file contains functions that load the flat-file format of tracks for the
4 | MSD. The format is one track per line, where each line is represented by 54
5 | fields as described here:
6 |
7 | http://labrosa.ee.columbia.edu/millionsong/pages/field-list
8 |
9 | except that in the flat file format, the 'track id' field has been moved
10 | from field 52 to the first field.
11 |
12 | A track is represented as a dictionary.
13 | """
14 |
15 | import sys
16 | import pprint
17 |
18 | def load_track(line):
19 | """ Loads a track from a single line """
20 | t = {}
21 |
22 | f = line.split('\t')
23 | if len(f) == 54:
24 | t['track_id'] = f[0]
25 | t['analysis_sample_rate'] = f[1]
26 | t['artist_7digitalid'] = f[2]
27 | t['artist_familiarity'] = float(f[3])
28 | t['artist_hotttnesss'] = float(f[4])
29 | t['artist_id'] = f[5]
30 | t['artist_latitude'] = float(f[6])
31 | t['artist_location'] = f[7]
32 | t['artist_longitude'] = float(f[8])
33 | t['artist_mbid'] = f[9]
34 |
35 | tag_words = f[10].split(',')
36 | tag_count = f[11].split(',')
37 | mbtags = [ (w, int(c)) for w,c in zip(tag_words, tag_count) if len(w) > 0]
38 | t['artist_mbtags'] = mbtags
39 |
40 | t['artist_name'] = f[12]
41 | t['artist_playmeid'] = int(f[13])
42 |
43 | artist_terms = f[14].split(',')
44 | artist_terms_freq = f[15].split(',')
45 | artist_terms_weight = f[16].split(',')
46 | t['artist_terms'] = [ (term, float(freq), float(weight)) \
47 | for term ,freq, weight in zip(artist_terms, artist_terms_freq, artist_terms_weight) if len(term) > 0]
48 |
49 | t['audio_md5'] = f[17]
50 |
51 | bars_confidence = f[18].split(',')
52 | bars_start = f[19].split(',')
53 | t['bars'] = [ (float(start), float(conf)) \
54 | for start, conf in zip(bars_start, bars_confidence) if len(start) > 0 ]
55 |
56 | beats_confidence = f[20].split(',')
57 | beats_start = f[21].split(',')
58 | t['beats'] = [ (float(start), float(conf)) \
59 | for start, conf in zip(beats_start, beats_confidence) if len(start) > 0 ]
60 |
61 | t['danceability'] = float(f[22])
62 | t['duration'] = float(f[23])
63 | t['end_of_fade_in'] = float(f[24])
64 | t['energy'] = float(f[25])
65 | t['key'] = (int(f[26]), float(f[27]))
66 | t['loudness'] = float(f[28])
67 | t['mode'] = (int(f[29]), float(f[30]))
68 | t['release'] = f[31]
69 | t['release_7digitalid'] = f[32]
70 | srid = f[32].zfill(10)
71 | t['cover_art'] = 'http://cdn.7static.com/static/img/sleeveart/%s/%s/%s/%s_200.jpg' \
72 | % (srid[0:2], srid[2:5], srid[5:8], srid)
73 |
74 | sections_confidence = f[33].split(',')
75 | sections_start = f[34].split(',')
76 | t['sections'] = [ (float(start), float(conf)) \
77 | for start, conf in zip(sections_start, sections_confidence) if len(start) > 0 ]
78 |
79 | seg_confidence = f[35].split(',')
80 | seg_loudness_max = f[36].split(',')
81 | seg_loudness_max_time = f[37].split(',')
82 | seg_loudness_max_start = f[38].split(',')
83 | seg_pitches = f[39].split(',')
84 | seg_start = f[40].split(',')[:-1]
85 | seg_timbre = f[41].split(',')
86 |
87 | PITCH_COUNT = 12
88 | TIMBRE_COUNT = 12
89 | t['segments'] = []
90 | for i, sstart in enumerate(seg_start):
91 | if len(sstart) > 0:
92 | seg = {}
93 | seg['start'] = float(sstart)
94 | seg['confidence'] = float(seg_confidence[i])
95 | seg['loudness_max'] = float(seg_loudness_max[i])
96 | seg['loudness_max_time'] = float(seg_loudness_max_time[i])
97 | seg['loudness_start'] = float(seg_loudness_max_start[i])
98 | seg['pitch'] =[ float(p) for p in seg_pitches[i * PITCH_COUNT: i * PITCH_COUNT + PITCH_COUNT]]
99 | seg['timbre'] =[ float(p) for p in seg_timbre[i * TIMBRE_COUNT: i * TIMBRE_COUNT + TIMBRE_COUNT]]
100 | t['segments'].append(seg)
101 | if i < len(seg_start) - 1:
102 | seg['duration'] = float(seg_start[i + 1]) - seg['start']
103 | else:
104 | seg['duration'] = t['duration'] - seg['start']
105 |
106 | t['similar_artists'] = [s for s in f[42].split(',') if len(s) > 0]
107 | t['song_hotttnesss'] = float(f[43])
108 | t['song_id'] = f[44]
109 | t['start_of_fade_out'] = float(f[45])
110 |
111 | tatums_confidence = f[46].split(',')
112 | tatums_start = f[47].split(',')
113 | t['tatums'] = [ (float(start), float(conf)) \
114 | for start, conf in zip(tatums_start, tatums_confidence) if len(start) > 0 ]
115 | t['tempo'] = float(f[48])
116 | t['time_signature'] = (int(f[49]), float(f[50]))
117 | t['title'] = f[51]
118 | t['track_7digitalid'] = int(f[52])
119 | t['preview'] = 'http://previews.7digital.com/clips/34/%d.clip.mp3' % (int(f[52]), )
120 | t['year'] = int(f[53])
121 | return t
122 | else:
123 |         print >> sys.stderr, 'mismatched fields, found', len(f), 'should have 54'
124 | return None
125 |
126 |
127 |
128 |
129 | def load_tracks(path):
130 |     """ Loads a list of tracks from a file """
131 |
132 | tracks = []
133 | file = open(path)
134 | for which, line in enumerate(file):
135 | track = load_track(line)
136 |         if track is not None:
137 | track['path'] = path
138 | track['line'] = which
139 | tracks.append(track)
140 | file.close()
141 | return tracks
142 |
143 | def process_tracks(path, func):
144 | """ applies func(track) to each track found in path """
145 | file = open(path)
146 | for which, line in enumerate(file):
147 | track = load_track(line)
148 |         if track is not None:
149 | track['path'] = path
150 | track['line'] = which
151 | func(track)
152 | file.close()
153 |
154 |
155 | def dump(track):
156 | """ Dumps some data from a track for debugging """
157 | print track['line'], track['track_id'], track['artist_id'], len(track['artist_mbtags']), \
158 | len(track['artist_terms'] ), len(track['bars']), len(track['beats']), track['title'], \
159 | track['key'], track['mode'], len(track['segments'])
160 | for seg in track['segments']:
161 | print ' ', seg['start'], seg['duration'], track['duration']
162 | print
163 |
164 |
165 | if __name__ == '__main__':
166 | process_tracks(sys.argv[1], dump)
167 |
--------------------------------------------------------------------------------
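
A minimal sketch of using the loader directly, assuming tiny.dat from the
repository root:

    import track

    for t in track.load_tracks('tiny.dat'):
        density = len(t['segments']) / t['duration']
        print t['track_id'], round(density, 2), t['title']
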
/util/README.md:
--------------------------------------------------------------------------------
1 | Utilities
2 | ---------
3 | A set of utility scripts to help process the output data from the map-reduce jobs.
4 |
5 | nsort.py - sorts map-reduce output numerically by value
6 | dump_loudness.py - dumps the loudness for each segment of a track, suitable for plotting
7 |
--------------------------------------------------------------------------------
/util/dump_loudness.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import re
3 | import time   # used by the retry loop in spotify_search
4 | from itertools import imap
5 | from pyechonest import track
6 | import spotimeta
6 |
7 | WEIGHT = 10
8 |
9 | def dump_loudness(id, file=sys.stdout):
10 | t = track.track_from_id(id)
11 | title = t.title + ' by ' + t.artist
12 | spotify_id = get_spotify_id(t.artist, t.title)
13 |
14 | print >>file, "# ID ", id
15 | print >>file, "#", title
16 | print >>file, "# ARTIST ", t.artist
17 | print >>file, "# TITLE ", t.title
18 | print >>file, "# SONG_ID ", t.song_id
19 | print >>file, "# SPOT_ID ", spotify_id
20 | print >>file, "#"
21 |
22 | weighted = []
23 | half_track = t.duration / 2
24 | first_half = 0
25 | second_half = 0
26 |
27 | xdata = []
28 | ydata = []
29 |
30 | for seg in t.segments:
31 | sstart = seg['start']
32 | sloudness = min(seg['loudness_max'], 0)
33 | sloudness = max(sloudness, -60)
34 | sduration = seg['duration']
35 | send = sstart + sduration
36 |
37 | weighted.append(sloudness)
38 | if len(weighted) > WEIGHT:
39 | weighted.pop(0)
40 | avg = sum(weighted) / len(weighted)
41 |
42 | if send <= half_track:
43 | seg_loudness = sloudness * sduration
44 | first_half += seg_loudness
45 | elif sstart < half_track and send > half_track:
46 | # this is the nasty segment that spans the song midpoint.
47 | # apportion the loudness appropriately
48 | first_seg_loudness = sloudness * (half_track - sstart)
49 | first_half += first_seg_loudness
50 |
51 | second_seg_loudness = sloudness * (send - half_track)
52 | second_half += second_seg_loudness
53 | else:
54 | seg_loudness = sloudness * sduration
55 | second_half += seg_loudness
56 |
57 | xdata.append( sstart )
58 | ydata.append( sloudness )
59 |
60 | ramp_factor = second_half / half_track - first_half / half_track
61 | #print >>file, seg['start'], sloudness, avg, first_half, second_half, ramp_factor
62 | #print >>file, "%8.6f %9.4f %9.4f %12.6f %12.6f %12.6f" % (sstart, sloudness, avg, first_half, second_half, ramp_factor)
63 | print >>file, "%8.6f %9.4f %9.4f" % (sstart, sloudness, avg)
64 |
65 | correlation = pearsonr(xdata, ydata)
66 | print >>file, "#"
67 | print >>file, "#", 'ramp factor', ramp_factor
68 | print >>file, "#", 'correlation', correlation
69 | print >>file, "#", 'first', first_half / half_track
70 | print >>file, "#", 'second', second_half / half_track
71 | print >>file, "#"
72 |
73 | return title, ramp_factor, first_half / half_track, second_half / half_track
74 |
75 | def pearsonr(x, y):
76 | # Assume len(x) == len(y)
77 | n = len(x)
78 | sum_x = sum(x)
79 | sum_y = sum(y)
80 | sum_x_sq = sum(map(lambda x: pow(x, 2), x))
81 | sum_y_sq = sum(map(lambda x: pow(x, 2), y))
82 | psum = sum(imap(lambda x, y: x * y, x, y))
83 | num = psum - (sum_x * sum_y/n)
84 | den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5)
85 | if den == 0:
86 | return 0
87 | return num / den
88 |
89 |
90 | def spotify_search(name):
91 | retries = 5
92 | while retries > 0:
93 | try:
94 | results = spotimeta.search_track(name)
95 | return results
96 | except spotimeta.ServerError:
97 | print >> sys.stderr, " ... retrying spotify for ", name
98 | time.sleep(5)
99 | retries -= 1
100 | return None
101 |
102 | def get_spotify_id(artist, title):
103 | name = artist + ' ' + title
104 | name = norm(name)
105 |
106 | search = spotify_search(name)
107 | if search and search["total_results"] > 0:
108 | best = search['result'][0]
109 | result = best['href']
110 | print >> sys.stderr, ' found', result, best['artist']['name'], best['name']
111 | return result
112 | else:
113 | print >> sys.stderr, "Couldn't spotifind", name
114 | return None
115 |
116 | def norm(name):
117 | s = name
118 | s = s.replace(".", "")
119 | s = s.lower()
120 | s = re.sub(r'&', ' and ', s)
121 | s = re.sub(r' +', ' ', s)
122 | s = s.strip()
123 |
124 | # if we've normalized away everything
125 | # keep it.
126 | if len(s) == 0:
127 | s = name
128 | return s
129 |
130 |
131 |
132 | if __name__ == '__main__':
133 | dump_loudness(sys.argv[1])
134 |
--------------------------------------------------------------------------------
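
The norm() helper's behavior is easiest to see by example (importing the
module pulls in pyechonest and spotimeta, so both must be installed):

    from dump_loudness import norm

    print norm("Mr. Big & Co.")   # -> 'mr big and co'
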
/util/make_plots.py:
--------------------------------------------------------------------------------
1 | # 30.8926305957 ["David Coverdale", "Into The Light", "TRPNADQ128F422DB22"]
2 |
3 | import simplejson as json
4 | import sys
5 | import os
6 | import sdir
7 | import dump_loudness
8 |
9 | plotter_text = """
10 | set terminal postscript landscape size 8.4,5.9
11 | set output "foo.plt"
12 | set style data line
13 | set key bottom
14 | set xlabel "seconds"
15 | set ylabel "decibels"
16 | set title " %s "
17 | plot "%s.out" using 1:2 with line linecolor rgb "#aaffaa" title "raw loudness"
18 | replot "%s.out" using 1:2 with line lt -1 linecolor rgb "blue" smooth bezier title "smoothed loudness"
19 | replot %f with line linecolor rgb "#777777" title ""
20 | replot %f with line linecolor rgb "#777777" title ""
21 | set key title "score:%.2f ramp:%.2f corr:%.2f"
22 | set terminal postscript landscape size 8.4,5.9
23 | set output "plot.ps"
24 | replot
25 | """
26 |
27 | def make_plot(id, force=False):
28 | plot_path = os.path.join("plots", id + ".png")
29 | if force or not os.path.exists(plot_path):
30 | print "Creating plot", plot_path
31 |         name, sid, spot, ramp, first, second, correlation = sdir.get_plot_info(id)
32 | score = ramp * correlation
33 | title = name.replace('"', "'")
34 | plotter = open("plotter.gplt", "w")
35 | print >>plotter, plotter_text % (title, id, id, first, second, score, ramp, correlation)
36 | plotter.close()
37 | os.system("gnuplot plotter.gplt")
38 | os.system("convert -rotate 90 plot.ps %s" % (plot_path,))
39 | else:
40 | print >>sys.stderr, " plot exists, skipping...", plot_path
41 |
42 | def create_plot(id):
43 | force = False
44 | path = id + ".out"
45 | if not os.path.exists(path):
46 | f = open(path, "w")
47 | dump_loudness.dump_loudness(id, f)
48 | f.close()
49 | force = True
50 | else:
51 | print >>sys.stderr, " data exists, skipping...", id
52 | make_plot(id, force)
53 |
54 |
55 | def create_plots():
56 | # parses input in the form
57 | # # 30.8926305957 ["David Coverdale", "Into The Light", "TRPNADQ128F422DB22"]
58 | for which, line in enumerate(sys.stdin):
59 | fields = line.strip().split()
60 | slist = ' '.join(fields[1:])
61 |         vals = json.loads(slist)
62 |         id = vals[2]
63 | print >>sys.stderr, which, id
64 | create_plot(id)
65 |
66 |
67 |
68 | if __name__ == '__main__':
69 | if len(sys.argv) > 1:
70 | for id in sys.argv[1:]:
71 | create_plot(id)
72 | else:
73 | create_plots()
74 |
75 |
--------------------------------------------------------------------------------
/util/nsort.py:
--------------------------------------------------------------------------------
1 |
2 | import sys
3 |
4 |
5 | lines = []
6 | for line in sys.stdin:
7 | fields = line.strip().split('\t')
8 | if len(fields) == 2:
9 | key, sval = fields
10 | if len(sval) > 0:
11 | lines.append( (key, float(sval)) )
12 |
13 | lines.sort(reverse=True, key=lambda s:s[1])
14 |
15 |
16 | for key, val in lines:
17 | print val, key
18 |
--------------------------------------------------------------------------------
/util/nsort2.py:
--------------------------------------------------------------------------------
1 |
2 | import simplejson as json
3 | import sys
4 |
5 |
6 | lines = []
7 | which = 0
8 |
9 | if len(sys.argv) > 1:
10 | which = int(sys.argv[1])
11 |
12 | for line in sys.stdin:
13 | fields = line.strip().split('\t')
14 | if len(fields) == 2:
15 | key, sval = fields
16 | if len(sval) > 0:
17 |             vals = json.loads(sval)
18 |             lines.append( (key, vals) )
19 |
20 | lines.sort(reverse=True, key=lambda s:s[1][which])
21 |
22 |
23 | for key, val in lines:
24 | print val, key
25 |
--------------------------------------------------------------------------------
/util/plotter:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # ID TRANELS128F425B25C
3 | # I'll Kill You by Konami Kukeiha Club
4 |
5 | ID=`awk ' $2 == "ID" { $1= ""; $2 = ""; print }' data/$1.out`
6 | title=`awk ' NR==2 { $1 = ""; print }' data/$1.out`
7 |
8 | rm -f plot
9 | echo plot \"$1.out\" using 1:2 with line linecolor rgb \"#eeeeee\" title \"\" >> plot
10 | echo replot \"$1.out\" using 1:3 with line linecolor rgb \"blue\" title \"loudness\" >> plot
11 | echo set xlabel \"Time seconds\" >> plot
12 | echo set ylabel \"decibels\" >> plot
13 | echo set title \"$title\" >> plot
14 | echo set terminal ps >> plot
15 | echo set output \"$ID.ps\" >> plot
16 |
--------------------------------------------------------------------------------
/util/sdir.py:
--------------------------------------------------------------------------------
1 | import time
2 | import json
3 | import re
4 | import sys
5 | import os
6 |
7 |
8 | def get_plot_info(id):
9 | path = id + ".out"
10 | tid = ''
11 | sid = ''
12 | spot = None
13 | name = ''
14 | factor = 0
15 | first = 0
16 | second = 0
17 | correlation = 0
18 |
19 | if os.path.exists(path):
20 | f = open(path)
21 | for which, line in enumerate(f):
22 | if which == 1:
23 | name = line.strip().replace("# ", "")
24 | elif line[0] == '#':
25 | fields = line.strip().split()
26 | if len(fields) >= 4 and fields[1] == 'ramp':
27 | factor = float(fields[3])
28 | if len(fields) >= 3 and fields[1] == 'first':
29 | first = float(fields[2])
30 | if len(fields) >= 3 and fields[1] == 'second':
31 | second = float(fields[2])
32 | if len(fields) >= 3 and fields[1] == 'correlation':
33 | correlation = float(fields[2])
34 | if len(fields) >= 3 and fields[1] == 'ID':
35 | tid = fields[2]
36 | if len(fields) >= 3 and fields[1] == 'SONG_ID':
37 | sid = fields[2]
38 |                 if len(fields) >= 3 and fields[1] == 'SPOT_ID' and fields[2] != 'None':
39 | spot = fields[2]
40 |
41 |         if tid != id:
42 | print >> sys.stderr, "Mismatched ID ", id, tid
43 | return name, sid, spot, factor, first, second, correlation
44 | else:
45 | return None
46 |
47 |
48 | def make_entry(id):
49 | info = get_plot_info(id)
50 | if info:
51 | name, sid, spot_id, factor, first, second, correlation = info
52 | image = id + ".png"
53 |
54 | title = name
55 |
56 |         if spot_id:
57 |             # the HTML string literals here were stripped during extraction;
58 |             # this reconstruction links the plot image to the Spotify track
59 |             print '<a href="%s">' % spot_id,
60 |             print '<img src="%s" alt="%s">' % (image, title),
61 |             print '</a>'
62 | 
63 | else:
64 | print >>sys.stderr, "Can't open info for", id
65 |
66 | def make_plot_page(index):
67 | for l in index:
68 |         make_entry(l.strip())   # was process_plot(), which is not defined
69 |
70 |
71 | def header():
72 | print ""
73 | print "