├── .gitignore
├── README.md
├── artist_ramp.py
├── cramp.py
├── density.py
├── lmatch.py
├── ramp.py
├── tiny.dat
├── tools.py
├── track.py
└── util
    ├── README.md
    ├── dump_loudness.py
    ├── make_plots.py
    ├── nsort.py
    ├── nsort2.py
    ├── plotter
    └── sdir.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.out
3 | *.gz
4 | data/*
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | msd-examples
 2 | ===========
 3 | Example code for processing the Million Song Database. This repostory contains
 4 | code that can be used to process the million song dataset.
 5 | 
 6 | The Million Song Dataset is a freely-available collection of audio features and metadata for a million contemporary popular
 7 | music tracks available at:
 8 | 
 9 |  http://labrosa.ee.columbia.edu/millionsong/pages/getting-dataset
10 | 
11 |  Its purposes are:
12 | 
13 |  * To encourage research on algorithms that scale to commercial sizes
14 |  * To provide a reference dataset for evaluating research
15 |  * As a shortcut alternative to creating a large dataset with The Echo Nests API
16 |  * To help new researchers get started in the MIR field
17 | 
18 | The core of the dataset is the feature analysis and metadata for one million songs, provided by The Echo Nest. The dataset
19 | does not include any audio, only the derived features. Note, however, that sample audio can be fetched from services like
20 | 7digital, using code we provide.  Additional datasets have been attached to the Million Song Dataset, so far they contain lyrics and cover songs.  The Million Song Dataset started as a collaborative project between The Echo Nest and LabROSA. 
21 | It was supported in part by the NSF.
22 | 
23 | These examples depend on mrjob, a python library for running MapReduce jobs on Hadoop or Amazon web services.  See
24 | https://github.com/Yelp/mrjob and http://packages.python.org/mrjob/.
25 | 
26 | 
27 | MSD Data on S3
28 | ==============
29 | These examples use MSD data that has been loaded on to S3 at s3://tbmmsd.  There are around 330 files each with about 3000
30 | sets track data each (one set per line) where each line is represented by 54 fields as described here:  
31 |     
32 |  http://labrosa.ee.columbia.edu/millionsong/pages/field-list
33 | 
34 | except that in the flat file format, the 'track id' field has been moved from field 52 to the first field.
35 | 
36 | In the repository you will find tiny.dat which contains data for 20 tracks.
37 | 
38 | 
39 | 
40 | 
41 | Map-reduce jobs
42 | ===============
43 | 
44 | Density
45 | ------
46 | Finds the most dense and the least dense songs
47 | 
48 | density.py
49 | 
50 | 
51 | ### Local Usage:
52 | 
53 |     python density.py tiny.dat
54 | 
55 | 
56 | ### EC2 Usage
57 | This will run the job on EC2 Map reduce on 100 small instances. Note that you have to 
58 | add the track.py code to t.tar.gz with:
59 | 
60 |     % tar cvfz t.tar.gz track.py
61 | 
62 | To run the job on 100 CPUs on all of the MSD use:     
63 | 
64 |     %  python density.py --num-ec2-instances 100 --python-archive t.tar.gz -r emr 's3://tbmmsd/*.tsv.*' > output.dat
65 | 
66 | 
67 | (Of course you will need to setup your Amazon credentials. See http://packages.python.org/mrjob/writing-and-running.html#running-on-emr )
68 | 
69 | 
70 | 


--------------------------------------------------------------------------------
/artist_ramp.py:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | """
  4 |     A map-reduce that calculates the average slow build
  5 |     for each artist
  6 | """
  7 | 
  8 | from mrjob.job import MRJob
  9 | import track
 10 | from itertools import imap
 11 | import unicodedata
 12 | 
 13 | # if YIELD_ALL is true, we yield all densities, otherwise,
 14 | # we yield just the extremes
 15 | 
 16 | YIELD_ALL = False
 17 | 
 18 | class MRRamp(MRJob):
 19 |     """ A  map-reduce job that calculates the ramp factor """
 20 | 
 21 |     def mapper(self, _, line):
 22 |         """ The mapper loads a track and yields its ramp factor """
 23 |         t = track.load_track(line)
 24 |         if t and t['duration'] > 60 and len(t['segments']) > 20:
 25 |             segments = t['segments']
 26 |             half_track = t['duration'] / 2
 27 |             first_half = 0
 28 |             second_half = 0
 29 |             first_count = 0
 30 |             second_count = 0
 31 | 
 32 |             xdata = []
 33 |             ydata = []
 34 |             for i in xrange(len(segments)):
 35 |                 seg = segments[i]
 36 | 
 37 |                 # bail out if we have a really long quiet segment
 38 |                 # these are usually surprise hidden tracks
 39 | 
 40 |                 if seg['loudness_max'] < -40 and seg['duration'] > 30:
 41 |                     return
 42 | 
 43 |                 seg_loudness = seg['loudness_max'] * seg['duration']
 44 | 
 45 |                 if seg['start'] + seg['duration'] <= half_track:
 46 |                     seg_loudness = seg['loudness_max'] * seg['duration']
 47 |                     first_half += seg_loudness
 48 |                     first_count += 1
 49 |                 elif seg['start'] < half_track and seg['start'] + seg['duration'] > half_track:
 50 |                     # this is the nasty segment that spans the song midpoint.
 51 |                     # apportion the loudness appropriately
 52 |                     first_seg_loudness = seg['loudness_max'] * (half_track - seg['start'])
 53 |                     first_half += first_seg_loudness
 54 |                     first_count += 1
 55 | 
 56 |                     second_seg_loudness = seg['loudness_max'] * (seg['duration'] - (half_track - seg['start']))
 57 |                     second_half += second_seg_loudness
 58 |                     second_count += 1
 59 |                 else:
 60 |                     seg_loudness = seg['loudness_max'] * seg['duration']
 61 |                     second_half += seg_loudness
 62 |                     second_count += 1
 63 | 
 64 |                 xdata.append( seg['start'] )
 65 |                 ydata.append( seg['loudness_max'] )
 66 | 
 67 |             # only yield data if we've had sufficient segments in the first
 68 |             # and second half of the track. (This is to avoid the proverbial
 69 |             # hidden tracks that have extreme amounts of leading or tailing
 70 |             # silene
 71 | 
 72 |             correlation = pearsonr(xdata, ydata)
 73 |             ramp_factor = second_half / half_track - first_half / half_track
 74 |             score = correlation * ramp_factor
 75 |             yield (t['artist_id'], t['artist_name']), (score, t['track_id'])
 76 | 
 77 |     def reducer(self, key, val):
 78 |         count = 0
 79 |         sum = 0
 80 |         best = -60
 81 |         best_id = None
 82 | 
 83 |         for score, trid in val:
 84 |             sum += score
 85 |             count += 1
 86 |             if score > best:
 87 |                 best = score
 88 |                 best_id = trid
 89 |         avg = sum / count
 90 | 
 91 |         if count > 5 and avg > 5:
 92 |             yield key,  (avg, count, best, best_id)
 93 | 
 94 | 
 95 | def pearsonr(x, y):
 96 |   # Assume len(x) == len(y)
 97 |     n = len(x)
 98 |     sum_x = sum(x)
 99 |     sum_y = sum(y)
100 |     sum_x_sq = sum(map(lambda x: pow(x, 2), x))
101 |     sum_y_sq = sum(map(lambda x: pow(x, 2), y))
102 |     psum = sum(imap(lambda x, y: x * y, x, y))
103 |     num = psum - (sum_x * sum_y/n)
104 |     den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5)
105 |     if den == 0: 
106 |         return 0
107 |     return num / den
108 | 
109 | 
110 | if __name__ == '__main__':
111 |     MRRamp.run()
112 | 


--------------------------------------------------------------------------------
/cramp.py:
--------------------------------------------------------------------------------
  1 | 
  2 | """
  3 |     A map-reduce that calculates the difference in 
  4 |     average volume between the first and the second
  5 |     half of the song.
  6 | """
  7 | 
  8 | from mrjob.job import MRJob
  9 | import track
 10 | from itertools import imap
 11 | import unicodedata
 12 | 
 13 | # if YIELD_ALL is true, we yield all densities, otherwise,
 14 | # we yield just the extremes
 15 | 
 16 | YIELD_ALL = False
 17 | 
 18 | class MRRamp(MRJob):
 19 |     """ A  map-reduce job that calculates the ramp factor """
 20 | 
 21 |     def mapper(self, _, line):
 22 |         """ The mapper loads a track and yields its ramp factor """
 23 |         t = track.load_track(line)
 24 |         if t and t['duration'] > 60 and len(t['segments']) > 20:
 25 |             segments = t['segments']
 26 |             half_track = t['duration'] / 2
 27 |             first_half = 0
 28 |             second_half = 0
 29 |             first_count = 0
 30 |             second_count = 0
 31 | 
 32 |             xdata = []
 33 |             ydata = []
 34 |             for i in xrange(len(segments)):
 35 |                 seg = segments[i]
 36 | 
 37 |                 # bail out if we have a really long quiet segment
 38 |                 # these are usually surprise hidden tracks
 39 | 
 40 |                 if seg['loudness_max'] < -40 and seg['duration'] > 30:
 41 |                     return
 42 | 
 43 |                 seg_loudness = seg['loudness_max'] * seg['duration']
 44 | 
 45 |                 if seg['start'] + seg['duration'] <= half_track:
 46 |                     seg_loudness = seg['loudness_max'] * seg['duration']
 47 |                     first_half += seg_loudness
 48 |                     first_count += 1
 49 |                 elif seg['start'] < half_track and seg['start'] + seg['duration'] > half_track:
 50 |                     # this is the nasty segment that spans the song midpoint.
 51 |                     # apportion the loudness appropriately
 52 |                     first_seg_loudness = seg['loudness_max'] * (half_track - seg['start'])
 53 |                     first_half += first_seg_loudness
 54 |                     first_count += 1
 55 | 
 56 |                     second_seg_loudness = seg['loudness_max'] * (seg['duration'] - (half_track - seg['start']))
 57 |                     second_half += second_seg_loudness
 58 |                     second_count += 1
 59 |                 else:
 60 |                     seg_loudness = seg['loudness_max'] * seg['duration']
 61 |                     second_half += seg_loudness
 62 |                     second_count += 1
 63 | 
 64 |                 xdata.append( seg['start'] )
 65 |                 ydata.append( seg['loudness_max'] )
 66 | 
 67 |             # only yield data if we've had sufficient segments in the first
 68 |             # and second half of the track. (This is to avoid the proverbial
 69 |             # hidden tracks that have extreme amounts of leading or tailing
 70 |             # silene
 71 | 
 72 |             correlation = pearsonr(xdata, ydata)
 73 |             if first_count > 10 and second_count > 10:
 74 |                 ramp_factor = second_half / half_track - first_half / half_track
 75 |                 #if YIELD_ALL or ramp_factor > 10 or ramp_factor < -10:
 76 |                 if YIELD_ALL or ramp_factor > 10 and correlation > .5:
 77 |                     yield (t['artist_name'], t['title'], t['track_id'], correlation), ramp_factor
 78 | 
 79 |     # no need for a reducer
 80 |     #def reducer(self, key, val):
 81 |         #yield (key, sum(val))
 82 | 
 83 | 
 84 | def pearsonr(x, y):
 85 |   # Assume len(x) == len(y)
 86 |     n = len(x)
 87 |     sum_x = sum(x)
 88 |     sum_y = sum(y)
 89 |     sum_x_sq = sum(map(lambda x: pow(x, 2), x))
 90 |     sum_y_sq = sum(map(lambda x: pow(x, 2), y))
 91 |     psum = sum(imap(lambda x, y: x * y, x, y))
 92 |     num = psum - (sum_x * sum_y/n)
 93 |     den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5)
 94 |     if den == 0: 
 95 |         return 0
 96 |     return num / den
 97 | 
 98 | 
 99 | def test():
100 |     x = [1,2,3,4,5,6,7,8,9,10]
101 |     y = [10, 20, 35, 45, 47, 60, 70, 87, 91, 100]
102 |     print pearsonr(x,y)
103 | 
104 | 
105 | if __name__ == '__main__':
106 |     MRRamp.run()
107 | 


--------------------------------------------------------------------------------
/density.py:
--------------------------------------------------------------------------------
 1 | """
 2 |     A map-reduce that calculates the density for each
 3 |     of a set of tracks.  The track density is the average
 4 |     number of segments per segment for a track.
 5 | """
 6 | 
 7 | from mrjob.job import MRJob
 8 | import track
 9 | 
10 | # if YIELD_ALL is true, we yield all densities, otherwise,
11 | # we yield just the extremes
12 | 
13 | YIELD_ALL = True
14 | 
15 | class MRDensity(MRJob):
16 |     """ A  map-reduce job that calculates the density """
17 | 
18 |     def mapper(self, _, line):
19 |         """ The mapper loads a track and yields its density """
20 |         t = track.load_track(line)
21 |         if t:
22 |             if t['tempo'] > 0:
23 |                 density = len(t['segments']) / t['duration']
24 |                 #only output extreme density
25 |                 if YIELD_ALL or density > 8 or density < .5:
26 |                     yield (t['artist_name'], t['title'], t['song_id']), density
27 | 
28 |     # no need for a reducer
29 |     #def reducer(self, key, val):
30 |         #yield (key, sum(val))
31 | 
32 | if __name__ == '__main__':
33 |     MRDensity.run()
34 | 


--------------------------------------------------------------------------------
/lmatch.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | """
 4 |     A map-reduce that calculates the difference in 
 5 |     average volume between the first and the second
 6 |     half of the song.
 7 | """
 8 | 
 9 | from mrjob.job import MRJob
10 | import track
11 | from itertools import imap
12 | import math
13 | import tools
14 | import sys
15 | 
16 | # if YIELD_ALL is true, we yield all densities, otherwise,
17 | # we yield just the extremes
18 | 
19 | YIELD_ALL = False
20 | 
21 | class MRlmatch(MRJob):
22 |     """ A  map-reduce job that calculates the ramp factor """
23 | 
24 |     DUMP = False
25 |     SIZE = 64
26 |     VECTOR = True
27 |     #MATCH = tools.rnormalize(tools.scale(tools.sin2wave(SIZE), 60, -60), -60, 0)
28 |     MATCH = tools.rnormalize(tools.scale(tools.sinwave(SIZE), 60, -60), -60, 0)
29 | 
30 |     def mapper(self, _, line):
31 |         """ The mapper loads a track and yields its ramp factor """
32 |         t = track.load_track(line)
33 |         segments = t['segments']
34 |         duration = t['duration']
35 |         xdata = []
36 |         ydata = []
37 |         for i in xrange(len(segments)):
38 |             seg = segments[i]
39 |             sloudness = seg['loudness_max']
40 |             sstart = seg['start'] + seg['loudness_max_time']
41 |             xdata.append( sstart )
42 |             ydata.append( sloudness )
43 | 
44 |         if duration > 20:
45 |             idata = tools.interpolate(xdata, ydata, int(duration) * 10)
46 |             smooth = tools.smooth(idata, 20)
47 |             samp = tools.sample(smooth, self.SIZE)
48 |             ndata = tools.rnormalize(samp, -60, 0)
49 |             if self.DUMP:
50 |                 for i, (x, y) in enumerate(zip(self.MATCH, ndata)):
51 |                     print i, x, y
52 |             if self.VECTOR:
53 |                 yield (t['artist_name'], t['title'], t['track_id']), ndata
54 |             else:
55 |                 distance = tools.distance(self.MATCH, ndata)
56 |                 yield (t['artist_name'], t['title'], t['track_id']), distance
57 | 
58 | 
59 |     # no need for a reducer
60 |     #def reducer(self, key, val):
61 |         #yield (key, sum(val))
62 | 
63 | 
64 | def dump():
65 |     data = tools.rnormalize(tools.scale(tools.sin2wave(256), 60, -60), -60, 0)
66 |     for d in data:
67 |         print d
68 | 
69 | if __name__ == '__main__':
70 |     #dump()
71 |     MRlmatch.run()
72 | 


--------------------------------------------------------------------------------
/ramp.py:
--------------------------------------------------------------------------------
 1 | 
 2 | """
 3 |     A map-reduce that calculates the difference in 
 4 |     average volume between the first and the second
 5 |     half of the song.
 6 | """
 7 | 
 8 | from mrjob.job import MRJob
 9 | import track
10 | 
11 | # if YIELD_ALL is true, we yield all densities, otherwise,
12 | # we yield just the extremes
13 | 
14 | YIELD_ALL = False
15 | 
16 | class MRRamp(MRJob):
17 |     """ A  map-reduce job that calculates the ramp factor """
18 | 
19 |     def mapper(self, _, line):
20 |         """ The mapper loads a track and yields its ramp factor """
21 |         t = track.load_track(line)
22 |         if t and t['duration'] > 60 and len(t['segments']) > 20:
23 |             segments = t['segments']
24 |             half_track = t['duration'] / 2
25 |             first_half = 0
26 |             second_half = 0
27 |             first_count = 0
28 |             second_count = 0
29 | 
30 |             for i in xrange(len(segments)):
31 |                 seg = segments[i]
32 | 
33 |                 # bail out if we have a really long quiet segment
34 |                 # these are usually surprise hidden tracks
35 | 
36 |                 if seg['loudness_max'] < -40 and seg['duration'] > 30:
37 |                     return
38 | 
39 |                 seg_loudness = seg['loudness_max'] * seg['duration']
40 | 
41 |                 if seg['start'] + seg['duration'] <= half_track:
42 |                     seg_loudness = seg['loudness_max'] * seg['duration']
43 |                     first_half += seg_loudness
44 |                     first_count += 1
45 |                 elif seg['start'] < half_track and seg['start'] + seg['duration'] > half_track:
46 |                     # this is the nasty segment that spans the song midpoint.
47 |                     # apportion the loudness appropriately
48 |                     first_seg_loudness = seg['loudness_max'] * (half_track - seg['start'])
49 |                     first_half += first_seg_loudness
50 |                     first_count += 1
51 | 
52 |                     second_seg_loudness = seg['loudness_max'] * (seg['duration'] - (half_track - seg['start']))
53 |                     second_half += second_seg_loudness
54 |                     second_count += 1
55 |                 else:
56 |                     seg_loudness = seg['loudness_max'] * seg['duration']
57 |                     second_half += seg_loudness
58 |                     second_count += 1
59 | 
60 |             # only yield data if we've had sufficient segments in the first
61 |             # and second half of the track. (This is to avoid the proverbial
62 |             # hidden tracks that have extreme amounts of leading or tailing
63 |             # silene
64 | 
65 |             if first_count > 10 and second_count > 10:
66 |                 ramp_factor = second_half / half_track - first_half / half_track
67 |                 if YIELD_ALL or ramp_factor > 10 or ramp_factor < -10:
68 |                     yield (t['artist_name'], t['title'], t['track_id']), ramp_factor
69 | 
70 |     # no need for a reducer
71 |     #def reducer(self, key, val):
72 |         #yield (key, sum(val))
73 | 
74 | if __name__ == '__main__':
75 |     MRRamp.run()
76 | 


--------------------------------------------------------------------------------
/tools.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import random
  3 | import time
  4 | 
  5 | 
  6 | def interpolate(xdata, ydata, points):
  7 |     results = []
  8 |     duration = xdata[-1]
  9 |     inc = duration / float(points)
 10 |     ctime = 0
 11 |     cindex = 0
 12 | 
 13 |     last_index = 0
 14 |     for i in xrange(points):
 15 |         for j in xrange(last_index, len(xdata) - 1):
 16 |             #print 'xd', xdata[j], xdata[j+1], ctime
 17 |             if ctime < xdata[j+1]:
 18 |                 break
 19 |         last_index = j
 20 | 
 21 |         frac = (ctime - xdata[j]) / (xdata[j+1] - xdata[j])
 22 |         y = frac * (ydata[j+1] - ydata[j]) + ydata[j]
 23 |         #print 'ct', ctime, xdata[j], xdata[j+1]
 24 |         results.append(y)
 25 | 
 26 |         ctime += inc
 27 |     return results
 28 | 
 29 | 
 30 | def smooth(data, fsize=10):
 31 |     out = []
 32 |     filter = []
 33 |     # bug, make this be a centering filter
 34 |     for d in data:
 35 |         filter.append(d)
 36 |         if len(filter) > fsize:
 37 |             filter.pop(0)
 38 |         out.append( sum(filter) / len(filter))
 39 |     return out
 40 | 
 41 | def sample(data, size):
 42 |     results = []
 43 |     jump = float(len(data)) / float(size)
 44 |     for i in xrange(size):
 45 |         index = int(round(i * jump))
 46 |         index = min(len(data) - 1, index)
 47 |         results.append(data[index])
 48 |     return results
 49 | 
 50 |     
 51 | def normalize(data, range = 1):
 52 |     max_data = max(data)
 53 |     min_data = min(data)
 54 | 
 55 |     out = [ range * (x - min_data) / (max_data - min_data) for x in data]
 56 |     return out
 57 | 
 58 | def rnormalize(data, min_data = 0, max_data = 1, range = 1):
 59 |     data = clamp(data, min_data, max_data)
 60 |     out = [ range * (x - min_data) / (max_data - min_data) for x in data]
 61 |     return out
 62 | 
 63 | 
 64 | def clamp(data, min_data, max_data):
 65 |     results = []
 66 |     for d in data:
 67 |         d = min(d, max_data)
 68 |         d = max(d, min_data)
 69 |         results.append(d)
 70 |     return results
 71 | 
 72 | 
 73 | def distance(d1, d2):
 74 |     if len(d1) <> len(d2):
 75 |         raise ValueError
 76 |     sum = 0
 77 |     for p1, p2 in zip(d1, d2):
 78 |         d = p2 - p1
 79 |         dd = d * d
 80 |         sum += dd
 81 |         #print p1, p2, dd, sum
 82 |     return math.sqrt(sum)
 83 | 
 84 | def qdistance(d1, d2):
 85 |     if len(d1) <> len(d2):
 86 |         raise ValueError
 87 |     sum = 0
 88 |     for p1, p2 in zip(d1, d2):
 89 |         d = p2 - p1
 90 |         dd = d * d
 91 |         sum += dd
 92 |         #print p1, p2, dd, sum
 93 |     return sum
 94 | 
 95 |     
 96 | 
 97 | def pearsonr(x, y):
 98 |   # Assume len(x) == len(y)
 99 |     n = len(x)
100 |     sum_x = sum(x)
101 |     sum_y = sum(y)
102 |     sum_x_sq = sum(map(lambda x: pow(x, 2), x))
103 |     sum_y_sq = sum(map(lambda x: pow(x, 2), y))
104 |     psum = sum(imap(lambda x, y: x * y, x, y))
105 |     num = psum - (sum_x * sum_y/n)
106 |     den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5)
107 |     if den == 0: 
108 |         return 0
109 |     return num / den
110 | 
111 | 
112 | 
113 | def sinwave(size):
114 |     results = []
115 |     inc = 3.14159 / size
116 |     angle = 0
117 |     for x in xrange(size):
118 |         results.append( math.sin(angle) )
119 |         angle += inc
120 | 
121 |     return normalize(results)
122 | 
123 | def sin2wave(size):
124 |     results = []
125 |     inc = 3.14159 / size
126 |     angle = -3.14159/2
127 |     for x in xrange(size):
128 |         results.append( math.sin(angle) )
129 |         angle += inc
130 | 
131 |     return normalize(results)
132 | 
133 | def sin3wave(size):
134 |     results = []
135 |     inc = 4 * 3.14159 / size
136 |     angle = -3.14159/2
137 |     for x in xrange(size):
138 |         results.append( math.sin(angle) )
139 |         angle += inc
140 | 
141 |     return normalize(results)
142 | 
143 | def coswave(size):
144 |     results = []
145 |     inc = 3.14159 / size
146 |     angle = 0
147 |     for x in xrange(size):
148 |         results.append( math.cos(angle) )
149 |         angle += inc
150 | 
151 |     return normalize(results)
152 | 
153 | def ramp(start = 0, inc = 1, size=10):
154 |     results = []
155 |     val = start
156 |     for x in xrange(size):
157 |         results.append( val )
158 |         val += inc
159 |     return results
160 |   
161 | def  add_noise(data, range):
162 |     results = []
163 |     for d in data:
164 |         results.append(d + random.triangular(-range, range))
165 |     return results
166 | 
167 | def  scale(data, scale, offset = 0):
168 |     results = []
169 |     for d in data:
170 |         results.append(d * scale + offset)
171 |     return results
172 | 
173 | def dump(d):
174 |     for i in d:
175 |         print i
176 | 
177 | def timing(size = 32,count = 1000000):
178 |     start = time.time()
179 |     sin2 = sin2wave(size)
180 |     cos = coswave(size)
181 | 
182 |     for i in xrange(count):
183 |        qdistance(sin2, cos)
184 | 
185 |     end = time.time()
186 | 
187 |     print end - start
188 | 
189 | 
190 | 
191 | 
192 | def timing2(size = 32, count = 1000000):
193 |     start = time.time()
194 |     ramp1 = ramp(0, 1, size)
195 |     ramp2 = ramp(0, 2, size)
196 | 
197 |     for i in xrange(count):
198 |        qdistance(ramp1, ramp2)
199 | 
200 |     end = time.time()
201 | 
202 |     print end - start
203 | 
204 | def test():
205 |     xdata = aamp(0, .3, 1000)
206 |     ydata = sin2wave(1000)
207 |     ydata = add_noise(ydata, .5)
208 |     idata = interpolate(xdata, ydata, 1400)
209 |     sdata = smooth(idata, 20)
210 |     samp = sample(sdata, 256)
211 |     ndata = normalize(samp)
212 | 
213 |     sin2 = sin2wave(256)
214 |     cos = coswave(256)
215 |     sin = sinwave(256)
216 |     flat = normalize(ramp(0, .01, 256))
217 | 
218 |     print "# sin2wav", distance(ndata, sin2)
219 |     print "# coswav", distance(ndata, cos)
220 |     print "# sinwav", distance(ndata, sin)
221 |     print "# flat", distance(ndata, flat)
222 | 
223 |     for i, (a, b, c, d, e) in enumerate(zip(ndata, sin2, cos, sin, flat)):
224 |         print i, a,b,c,d,e
225 | 
226 | 
227 | 
228 | if __name__ == '__main__':
229 |     #timing()
230 |     #timing2()
231 |     dump(sin3wave(100))
232 |     #test()
233 | 


--------------------------------------------------------------------------------
/track.py:
--------------------------------------------------------------------------------
  1 | 
  2 | """ Processes track data from the Million Song Database.  Specifically, this
  3 |     file contains functions that load the flat-file format of tracks for the
  4 |     MSD. The format is one track per line, where each line is represented by 54
  5 |     fields as described here:  
  6 |     
  7 |     http://labrosa.ee.columbia.edu/millionsong/pages/field-list
  8 | 
  9 |     except that in the flat file format, the 'track id' field has been moved
 10 |     from field 52 to the first field.
 11 | 
 12 |     A track is represented as a dictionary.
 13 | """
 14 | 
 15 | import sys
 16 | import pprint
 17 | 
 18 | def load_track(line):
 19 |     """ Loads a track from a single line """
 20 |     t = {}
 21 | 
 22 |     f = line.split('\t')
 23 |     if len(f) == 54:
 24 |         t['track_id'] = f[0]
 25 |         t['analysis_sample_rate'] = f[1]
 26 |         t['artist_7digitalid'] = f[2]
 27 |         t['artist_familiarity'] = float(f[3])
 28 |         t['artist_hotttnesss'] = float(f[4])
 29 |         t['artist_id'] = f[5]
 30 |         t['artist_latitude'] = float(f[6])
 31 |         t['artist_location'] = f[7]
 32 |         t['artist_longitude'] = float(f[8])
 33 |         t['artist_mbid'] = f[9]
 34 | 
 35 |         tag_words = f[10].split(',')
 36 |         tag_count = f[11].split(',')
 37 |         mbtags = [  (w, int(c))  for w,c in zip(tag_words, tag_count) if len(w) > 0]
 38 |         t['artist_mbtags'] = mbtags
 39 | 
 40 |         t['artist_name'] = f[12]
 41 |         t['artist_playmeid'] = int(f[13])
 42 | 
 43 |         artist_terms = f[14].split(',')
 44 |         artist_terms_freq = f[15].split(',')
 45 |         artist_terms_weight = f[16].split(',')
 46 |         t['artist_terms'] = [  (term, float(freq), float(weight)) \
 47 |             for term ,freq, weight in zip(artist_terms, artist_terms_freq, artist_terms_weight) if len(term) > 0]
 48 | 
 49 |         t['audio_md5'] = f[17]
 50 | 
 51 |         bars_confidence = f[18].split(',')
 52 |         bars_start = f[19].split(',')
 53 |         t['bars'] = [ (float(start), float(conf)) \
 54 |             for start, conf in zip(bars_start, bars_confidence) if len(start) > 0 ]
 55 | 
 56 |         beats_confidence = f[20].split(',')
 57 |         beats_start = f[21].split(',')
 58 |         t['beats'] = [ (float(start), float(conf)) \
 59 |             for start, conf in zip(beats_start, beats_confidence) if len(start) > 0 ]
 60 | 
 61 |         t['danceability'] = float(f[22])
 62 |         t['duration'] = float(f[23])
 63 |         t['end_of_fade_in'] = float(f[24])
 64 |         t['energy'] = float(f[25])
 65 |         t['key'] = (int(f[26]), float(f[27]))
 66 |         t['loudness'] = float(f[28])
 67 |         t['mode'] = (int(f[29]), float(f[30]))
 68 |         t['release'] = f[31]
 69 |         t['release_7digitalid'] = f[32]
 70 |         srid = f[32].zfill(10)
 71 |         t['cover_art'] = 'http://cdn.7static.com/static/img/sleeveart/%s/%s/%s/%s_200.jpg' \
 72 |             % (srid[0:2], srid[2:5], srid[5:8],  srid)
 73 | 
 74 |         sections_confidence = f[33].split(',')
 75 |         sections_start = f[34].split(',')
 76 |         t['sections'] = [ (float(start), float(conf)) \
 77 |             for start, conf in zip(sections_start, sections_confidence) if len(start) > 0 ]
 78 | 
 79 |         seg_confidence = f[35].split(',')
 80 |         seg_loudness_max = f[36].split(',')
 81 |         seg_loudness_max_time = f[37].split(',')
 82 |         seg_loudness_max_start = f[38].split(',')
 83 |         seg_pitches = f[39].split(',')
 84 |         seg_start = f[40].split(',')[:-1]
 85 |         seg_timbre = f[41].split(',')
 86 | 
 87 |         PITCH_COUNT = 12
 88 |         TIMBRE_COUNT = 12
 89 |         t['segments'] = []
 90 |         for i, sstart in enumerate(seg_start):
 91 |             if len(sstart) > 0:
 92 |                 seg = {}
 93 |                 seg['start'] = float(sstart)
 94 |                 seg['confidence'] = float(seg_confidence[i])
 95 |                 seg['loudness_max'] = float(seg_loudness_max[i])
 96 |                 seg['loudness_max_time'] = float(seg_loudness_max_time[i])
 97 |                 seg['loudness_start'] = float(seg_loudness_max_start[i])
 98 |                 seg['pitch'] =[ float(p) for p in seg_pitches[i * PITCH_COUNT: i * PITCH_COUNT + PITCH_COUNT]]
 99 |                 seg['timbre'] =[ float(p) for p in seg_timbre[i * TIMBRE_COUNT: i * TIMBRE_COUNT + TIMBRE_COUNT]]
100 |                 t['segments'].append(seg)
101 |                 if i < len(seg_start) - 1:
102 |                     seg['duration'] = float(seg_start[i + 1]) - seg['start']
103 |                 else:
104 |                     seg['duration'] = t['duration']  - seg['start']
105 | 
106 |         t['similar_artists'] = [s for s in f[42].split(',') if len(s) > 0]
107 |         t['song_hotttnesss'] = float(f[43])
108 |         t['song_id'] = f[44]
109 |         t['start_of_fade_out'] = float(f[45])
110 | 
111 |         tatums_confidence = f[46].split(',')
112 |         tatums_start = f[47].split(',')
113 |         t['tatums'] = [ (float(start), float(conf)) \
114 |             for start, conf in zip(tatums_start, tatums_confidence) if len(start) > 0 ]
115 |         t['tempo'] = float(f[48])
116 |         t['time_signature'] = (int(f[49]), float(f[50]))
117 |         t['title'] = f[51]
118 |         t['track_7digitalid'] = int(f[52])
119 |         t['preview'] = 'http://previews.7digital.com/clips/34/%d.clip.mp3' % (int(f[52]), )
120 |         t['year'] = int(f[53])
121 |         return t
122 |     else:
123 |         print 'mismatched fields, found', len(f), 'should have 54'
124 |         return None
125 | 
126 | 
127 | 
128 | 
129 | def load_tracks(path):
130 |     """ Loads a list of track from a file """
131 | 
132 |     tracks = []
133 |     file = open(path)
134 |     for which, line in enumerate(file):
135 |         track = load_track(line)
136 |         if track <> None:
137 |             track['path'] = path
138 |             track['line'] = which
139 |             tracks.append(track)
140 |     file.close()
141 |     return tracks
142 | 
143 | def process_tracks(path, func):
144 |     """ applies func(track) to each track found in path  """
145 |     file = open(path)
146 |     for which, line in enumerate(file):
147 |         track = load_track(line)
148 |         if track <> None:
149 |             track['path'] = path
150 |             track['line'] = which
151 |             func(track)
152 |     file.close()
153 | 
154 | 
155 | def dump(track):
156 |     """ Dumps some data from a track for debugging """
157 |     print track['line'], track['track_id'], track['artist_id'],  len(track['artist_mbtags']), \
158 |         len(track['artist_terms'] ), len(track['bars']), len(track['beats']), track['title'], \
159 |         track['key'], track['mode'], len(track['segments'])
160 |     for seg in track['segments']:
161 |         print '   ', seg['start'], seg['duration'], track['duration']
162 |     print
163 | 
164 | 
165 | if __name__ == '__main__':
166 |     process_tracks(sys.argv[1], dump)
167 | 


--------------------------------------------------------------------------------
/util/README.md:
--------------------------------------------------------------------------------
1 | Utilities
2 | ---------
3 | A set of utility scripts to help process output data from the map reduce jobs
4 | 
5 |     nsort.py - sorts the data
6 |     dump_loudness.py - dumps the loudness for each segment of a track suitable for plotting
7 | 


--------------------------------------------------------------------------------
/util/dump_loudness.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import re
  3 | from itertools import imap
  4 | from pyechonest import track
  5 | import spotimeta
  6 | 
  7 | WEIGHT = 10
  8 | 
  9 | def dump_loudness(id, file=sys.stdout):
 10 |     t = track.track_from_id(id)
 11 |     title = t.title + ' by ' + t.artist
 12 |     spotify_id = get_spotify_id(t.artist, t.title)
 13 | 
 14 |     print >>file, "# ID ", id
 15 |     print >>file, "#", title
 16 |     print >>file, "# ARTIST ", t.artist
 17 |     print >>file, "# TITLE ", t.title
 18 |     print >>file, "# SONG_ID ", t.song_id
 19 |     print >>file, "# SPOT_ID ", spotify_id
 20 |     print >>file, "#"
 21 | 
 22 |     weighted = []
 23 |     half_track = t.duration / 2
 24 |     first_half = 0
 25 |     second_half = 0
 26 | 
 27 |     xdata = []
 28 |     ydata = []
 29 | 
 30 |     for seg in t.segments:
 31 |         sstart = seg['start']
 32 |         sloudness = min(seg['loudness_max'], 0)
 33 |         sloudness = max(sloudness, -60)
 34 |         sduration = seg['duration']
 35 |         send = sstart + sduration
 36 | 
 37 |         weighted.append(sloudness)
 38 |         if len(weighted) > WEIGHT:
 39 |             weighted.pop(0)
 40 |         avg = sum(weighted) / len(weighted)
 41 | 
 42 |         if send <= half_track:
 43 |             seg_loudness = sloudness * sduration
 44 |             first_half += seg_loudness
 45 |         elif sstart < half_track and send > half_track:
 46 |             # this is the nasty segment that spans the song midpoint.
 47 |             # apportion the loudness appropriately
 48 |             first_seg_loudness = sloudness * (half_track - sstart)
 49 |             first_half += first_seg_loudness
 50 | 
 51 |             second_seg_loudness = sloudness * (send - half_track)
 52 |             second_half += second_seg_loudness
 53 |         else:
 54 |             seg_loudness = sloudness * sduration
 55 |             second_half += seg_loudness
 56 | 
 57 |         xdata.append( sstart )
 58 |         ydata.append( sloudness )
 59 | 
 60 |         ramp_factor = second_half / half_track - first_half / half_track
 61 |         #print >>file, seg['start'], sloudness, avg, first_half, second_half, ramp_factor
 62 |         #print >>file, "%8.6f %9.4f %9.4f %12.6f %12.6f %12.6f" % (sstart, sloudness, avg, first_half, second_half, ramp_factor)
 63 |         print >>file, "%8.6f %9.4f %9.4f" % (sstart, sloudness, avg)
 64 | 
 65 |     correlation = pearsonr(xdata, ydata)
 66 |     print >>file, "#"
 67 |     print >>file, "#", 'ramp factor', ramp_factor
 68 |     print >>file, "#", 'correlation', correlation
 69 |     print >>file, "#", 'first', first_half / half_track
 70 |     print >>file, "#", 'second', second_half / half_track
 71 |     print >>file, "#"
 72 | 
 73 |     return title, ramp_factor, first_half / half_track, second_half / half_track
 74 | 
 75 | def pearsonr(x, y):
 76 |   # Assume len(x) == len(y)
 77 |     n = len(x)
 78 |     sum_x = sum(x)
 79 |     sum_y = sum(y)
 80 |     sum_x_sq = sum(map(lambda x: pow(x, 2), x))
 81 |     sum_y_sq = sum(map(lambda x: pow(x, 2), y))
 82 |     psum = sum(imap(lambda x, y: x * y, x, y))
 83 |     num = psum - (sum_x * sum_y/n)
 84 |     den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5)
 85 |     if den == 0: 
 86 |         return 0
 87 |     return num / den
 88 | 
 89 | 
 90 | def spotify_search(name):
 91 |     retries = 5
 92 |     while retries > 0:
 93 |         try:
 94 |             results = spotimeta.search_track(name)
 95 |             return results
 96 |         except spotimeta.ServerError:
 97 |             print >> sys.stderr, "      ... retrying spotify for ", name
 98 |             time.sleep(5)
 99 |             retries -= 1
100 |     return None
101 |             
102 | def get_spotify_id(artist, title):
103 |     name = artist + ' ' + title
104 |     name = norm(name)
105 | 
106 |     search = spotify_search(name)
107 |     if search and search["total_results"] > 0:
108 |         best = search['result'][0]
109 |         result = best['href']
110 |         print >> sys.stderr, '    found', result, best['artist']['name'], best['name']
111 |         return result
112 |     else:
113 |         print >> sys.stderr, "Couldn't spotifind", name
114 |         return None
115 | 
116 | def norm(name):
117 |     s = name
118 |     s = s.replace(".", "")
119 |     s = s.lower()
120 |     s = re.sub(r'&', ' and ', s)
121 |     s = re.sub(r' +', ' ', s)
122 |     s = s.strip()
123 | 
124 |     # if we've normalized away everything
125 |     # keep it.
126 |     if len(s) == 0:
127 |         s = name
128 |     return s
129 | 
130 | 
131 | 
132 | if __name__ == '__main__':
133 |     dump_loudness(sys.argv[1])
134 | 


--------------------------------------------------------------------------------
/util/make_plots.py:
--------------------------------------------------------------------------------
 1 | # 30.8926305957 ["David Coverdale", "Into The Light", "TRPNADQ128F422DB22"]
 2 | 
 3 | import simplejson as json
 4 | import sys
 5 | import os
 6 | import sdir
 7 | import dump_loudness
 8 | 
 9 | plotter_text = """
10 | set terminal postscript landscape size 8.4,5.9
11 | set output "foo.plt"
12 | set style data line
13 | set key bottom
14 | set xlabel "seconds"
15 | set ylabel "decibels"
16 | set title " %s "
17 | plot "%s.out" using 1:2 with line linecolor rgb "#aaffaa" title "raw loudness"
18 | replot "%s.out" using 1:2 with line lt -1 linecolor rgb "blue" smooth bezier title "smoothed loudness"
19 | replot %f with line linecolor rgb "#777777" title ""
20 | replot %f with line linecolor rgb "#777777" title ""
21 | set key title "score:%.2f ramp:%.2f corr:%.2f"
22 | set terminal postscript landscape size 8.4,5.9
23 | set output "plot.ps"
24 | replot
25 | """
26 | 
27 | def make_plot(id, force=False):
28 |     plot_path = os.path.join("plots", id + ".png")
29 |     if force or not os.path.exists(plot_path):
30 |         print "Creating plot", plot_path
31 |         name, ramp, first, second, correlation = sdir.get_plot_info(id)
32 |         score = ramp * correlation
33 |         title = name.replace('"', "'")
34 |         plotter = open("plotter.gplt", "w")
35 |         print >>plotter, plotter_text % (title, id, id, first, second, score, ramp, correlation)
36 |         plotter.close()
37 |         os.system("gnuplot plotter.gplt")
38 |         os.system("convert -rotate 90 plot.ps %s" % (plot_path,))
39 |     else:
40 |         print >>sys.stderr, "   plot exists, skipping...", plot_path
41 | 
42 | def create_plot(id):
43 |     force = False
44 |     path = id + ".out"
45 |     if not os.path.exists(path):
46 |         f = open(path, "w")
47 |         dump_loudness.dump_loudness(id, f)
48 |         f.close()
49 |         force = True
50 |     else:
51 |         print >>sys.stderr, "   data exists, skipping...", id
52 |     make_plot(id, force)
53 | 
54 | 
55 | def create_plots():
56 |     # parses input in the form
57 |     # # 30.8926305957 ["David Coverdale", "Into The Light", "TRPNADQ128F422DB22"]
58 |     for which, line in  enumerate(sys.stdin):
59 |         fields = line.strip().split()
60 |         slist = ' '.join(fields[1:])
61 |         list = json.loads(slist)
62 |         id = list[2]
63 |         print >>sys.stderr, which, id
64 |         create_plot(id)
65 | 
66 | 
67 | 
68 | if __name__ == '__main__':
69 |     if len(sys.argv) > 1:
70 |         for id in sys.argv[1:]:
71 |             create_plot(id)
72 |     else:
73 |         create_plots()
74 |     
75 | 


--------------------------------------------------------------------------------
/util/nsort.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import sys
 3 | 
 4 | 
 5 | lines = []
 6 | for line in sys.stdin:
 7 |     fields = line.strip().split('\t')
 8 |     if len(fields) == 2:
 9 |         key, sval = fields
10 |         if len(sval) > 0:
11 |             lines.append( (key, float(sval)) )
12 | 
13 | lines.sort(reverse=True, key=lambda s:s[1])
14 | 
15 | 
16 | for key, val in lines:
17 |     print val, key
18 | 


--------------------------------------------------------------------------------
/util/nsort2.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import simplejson as json
 3 | import sys
 4 | 
 5 | 
 6 | lines = []
 7 | which = 0
 8 | 
 9 | if len(sys.argv) > 1:
10 |     which = int(sys.argv[1])
11 | 
12 | for line in sys.stdin:
13 |     fields = line.strip().split('\t')
14 |     if len(fields) == 2:
15 |         key, sval = fields
16 |         if len(sval) > 0:
17 |             list = json.loads(sval)
18 |             lines.append( (key, list))
19 | 
20 | lines.sort(reverse=True, key=lambda s:s[1][which])
21 | 
22 | 
23 | for key, val in lines:
24 |     print val, key
25 | 


--------------------------------------------------------------------------------
/util/plotter:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # ID  TRANELS128F425B25C
 3 | # I'll Kill You by Konami Kukeiha Club
 4 | 
 5 | ID=`awk ' $2 == "ID"  { $1= ""; $2 = ""; print }' data/$1.out`
 6 | title=`awk ' NR==2 { $1 = ""; print }' data/$1.out`
 7 | 
 8 | rm plot
 9 | echo plot \"$1.out\" using 1:2 with line linecolor rgb \"#eeeeee\" title \"\" >> plot
10 | echo replot "$1.out" using 1:3 with line linecolor rgb \"blue\" title \"loudness\" >> plot
11 | echo set xlabel \"Time seconds\" >> plot
12 | echo set ylabel \"decibels\" >> plot
13 | echo set title \"$title\" >> plot
14 | echo set terminal ps >> plot
15 | echo set output \"$ID.ps\" >> plot
16 | 


--------------------------------------------------------------------------------
/util/sdir.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import json
  3 | import re
  4 | import sys
  5 | import os
  6 | 
  7 | 
  8 | def get_plot_info(id):
  9 |     path = id + ".out"
 10 |     tid = ''
 11 |     sid = ''
 12 |     spot = None
 13 |     name = ''
 14 |     factor = 0
 15 |     first = 0
 16 |     second = 0
 17 |     correlation = 0
 18 | 
 19 |     if os.path.exists(path):
 20 |         f = open(path)
 21 |         for which, line in enumerate(f):
 22 |             if which == 1:
 23 |                 name = line.strip().replace("# ", "")
 24 |             elif line[0] == '#':
 25 |                 fields = line.strip().split()
 26 |                 if len(fields) >= 4 and fields[1] == 'ramp':
 27 |                     factor = float(fields[3])
 28 |                 if len(fields) >= 3 and fields[1] == 'first':
 29 |                     first = float(fields[2])
 30 |                 if len(fields) >= 3 and fields[1] == 'second':
 31 |                     second = float(fields[2])
 32 |                 if len(fields) >= 3 and fields[1] == 'correlation':
 33 |                     correlation = float(fields[2])
 34 |                 if len(fields) >= 3 and fields[1] == 'ID':
 35 |                     tid = fields[2]
 36 |                 if len(fields) >= 3 and fields[1] == 'SONG_ID':
 37 |                     sid = fields[2]
 38 |                 if len(fields) >= 3 and fields[1] == 'SPOT_ID' and fields[2] <> 'None':
 39 |                     spot = fields[2]
 40 | 
 41 |         if tid <> id:
 42 |             print >> sys.stderr, "Mismatched ID ", id, tid
 43 |         return name, sid, spot, factor, first, second, correlation
 44 |     else:
 45 |         return None
 46 | 
 47 | 
 48 | def make_entry(id):
 49 |     info = get_plot_info(id)
 50 |     if info:
 51 |         name, sid, spot_id, factor, first, second, correlation = info
 52 |         image = id + ".png"
 53 | 
 54 |         title = name
 55 | 
 56 |         if spot_id:
 57 |             print '<a href=\"' + spot_id + '">',
 58 |             print '<img src="plots/' + image + '">',
 59 |             print '</a>'
 60 |         else:
 61 |             pass
 62 |             # print '<img src="plots/' + image + '">'
 63 |     else:
 64 |         print >>sys.stderr, "Can't open info for", id
 65 |     
 66 | def make_plot_page(index):
 67 |     for l in index:
 68 |         process_plot(l.strip());
 69 | 
 70 | 
 71 | def header():
 72 |     print "<html>"
 73 |     print "<head>"
 74 |     print "<title>"
 75 |     print "Loudness curves from the Million Song Dataset"
 76 |     print "</title>"
 77 |     print "</head>"
 78 |     print "<body>"
 79 |     print "<h1> Loudness curves from the Million Song Dataset </h1>"
 80 | 
 81 | def footer():
 82 |     print "<hr>"
 83 |     print "</body>"
 84 | 
 85 | def build_page(index):
 86 |     header()
 87 |     for count, l in enumerate(open(index)):
 88 |         l = l.strip()
 89 |         print >> sys.stderr, count, l
 90 |         if len(l) > 0:
 91 |             if l[0] == '#':
 92 |                 print "<h2>", l[1:], "</h2>"
 93 |             elif l.startswith("TR"):
 94 |                 make_entry(l)
 95 |     footer()
 96 | 
 97 | 
 98 | def qd_page():
 99 |     header()
100 |     for l in sys.stdin:
101 |         plot = l.strip()
102 |         print '<img src="' + plot + '">',
103 |     footer()
104 | 
105 | def dump_ids():
106 |     # parses input in the form
107 |     # # 30.8926305957 ["David Coverdale", "Into The Light", "TRPNADQ128F422DB22"]
108 |     for which, line in  enumerate(sys.stdin):
109 |         fields = line.strip().split()
110 |         slist = ' '.join(fields[1:])
111 |         list = json.loads(slist)
112 |         id = list[2]
113 |         print id
114 | 
115 | 
116 | if __name__ == '__main__':
117 |     if len(sys.argv) > 1:
118 |         if sys.argv[1] == '--dump':
119 |             dump_ids()
120 |         else:
121 |             build_page(sys.argv[1])
122 |     else:
123 |         qd_page()
124 | 
125 | 


--------------------------------------------------------------------------------