├── LICENSE
├── README.md
├── Results.pdf
├── Shazam Industrial Audio Search.pdf
├── fingerprint.py
├── main.py
├── peakpicker.py
└── tdft.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Bryant Moquist
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Shazam 
2 | ======
3 | 
4 | #####Music Identification Program based on Shazam's methods
5 | 
6 | 
7 | Drawing on the methods described in "An Industrial Strength Audio Search Algorithm" by Avery Li-Chun Wang of Shazam, this program implements basic music identification capabilities. Results.pdf describes the effectiveness of the program using spectrograms and charts when tested on a small library of songs.
8 | 


--------------------------------------------------------------------------------
/Results.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bmoquist/Shazam/78915b4c4c216027867e47005de867d1a71d6efa/Results.pdf


--------------------------------------------------------------------------------
/Shazam Industrial Audio Search.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bmoquist/Shazam/78915b4c4c216027867e47005de867d1a71d6efa/Shazam Industrial Audio Search.pdf


--------------------------------------------------------------------------------
/fingerprint.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Hash and Acoustic Fingerprint Functions
 3 | Bryant Moquist
 4 | '''
 5 | 
 6 | import numpy as np
 7 | 
 8 | def findAdjPts(index,A,delay_time,delta_time,delta_freq):
 9 |     "Find the three closest adjacent points to the anchor point"    
10 |     adjPts = []
11 |     low_x = A[index][0]+delay_time
12 |     high_x = low_x+delta_time
13 |     low_y = A[index][1]-delta_freq/2
14 |     high_y = A[index][1]+delta_freq/2
15 |     
16 |     for i in A:
17 |         if ((i[0]>low_x and i[0]<high_x) and (i[1]>low_y and i[1]<high_y)):
18 |             adjPts.append(i)
19 |             
20 |     return adjPts
21 |     
22 | def hashPeaks(A,songID,delay_time,delta_time,delta_freq):
23 |     "Create a matrix of peaks hashed as: [[freq_anchor, freq_other, delta_time], time_anchor, songID]"
24 |     hashMatrix = np.zeros((len(A)*100,5))  #Assume size limitation
25 |     index = 0
26 |     numPeaks = len(A)
27 |     for i in range(0,numPeaks):
28 |         adjPts = findAdjPts(i,A,delay_time,delta_time,delta_freq)
29 |         adjNum=len(adjPts)
30 |         for j in range(0,adjNum):
31 |             hashMatrix[index][0] = A[i][1]
32 |             hashMatrix[index][1] = adjPts[j][1]
33 |             hashMatrix[index][2] = adjPts[j][0]-A[i][0]
34 |             hashMatrix[index][3] = A[i][0]
35 |             hashMatrix[index][4] = songID
36 |             index=index+1
37 |     
38 |     hashMatrix = hashMatrix[~np.all(hashMatrix==0,axis=1)]
39 |     hashMatrix = np.sort(hashMatrix,axis=0)
40 |         
41 |     return hashMatrix
42 | 
43 | def hashSamplePeaks(A,delay_time,delta_time,delta_freq):
44 |     "Create a matrix of peaks hashed as: [[freq_anchor, freq_other, delta_time],time_anchor]"
45 |     hashMatrix = np.zeros((len(A)*100,4))
46 |     index = 0
47 |     numPeaks = len(A)
48 |     for i in range(0,numPeaks):
49 |         adjPts = findAdjPts(i,A,delay_time,delta_time,delta_freq)
50 |         adjNum = len(adjPts)
51 |         for j in range(0,adjNum):
52 |             hashMatrix[index][0] = A[i][1]
53 |             hashMatrix[index][1] = adjPts[j][1]
54 |             hashMatrix[index][2] = adjPts[j][0]-A[i][0]
55 |             hashMatrix[index][3] = A[i][0]
56 |             index=index+1
57 | 
58 |     hashMatrix = hashMatrix[~np.all(hashMatrix==0,axis=1)]
59 |     hashMatrix = np.sort(hashMatrix,axis=0)
60 |         
61 |     return hashMatrix
62 | 
63 | def findTimePairs(hash_database,sample_hash,deltaTime,deltaFreq):
64 |     "Find the matching pairs between sample audio file and the songs in the database"
65 | 
66 |     timePairs = []
67 | 
68 |     for i in sample_hash:
69 |         for j in hash_database:
70 |             if(i[0] > (j[0]-deltaFreq) and i[0] < (j[0] + deltaFreq)):
71 |                 if(i[1] > (j[1]-deltaFreq) and i[1] < (j[1] + deltaFreq)):
72 |                     if(i[2] > (j[2]-deltaTime) and i[2] < (j[2] + deltaTime)):
73 |                         timePairs.append((j[3],i[3],j[4]))
74 |                     else:
75 |                         continue
76 |                 else:
77 |                     continue
78 |             else:
79 |                 continue
80 |             
81 |     return timePairs


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | ''' 
  2 | Music Identification Program (a.k.a. Shazam/Soundhound) 
  3 | Proof of Concept 
  4 | Bryant Moquist
  5 | '''
  6 | from __future__ import print_function
  7 | import scipy, pylab
  8 | from scipy.io.wavfile import read
  9 | import sys
 10 | import peakpicker as pp
 11 | import fingerprint as fhash
 12 | import matplotlib
 13 | import numpy as np
 14 | import tdft
 15 | 
 16 | if __name__ == '__main__':
 17 | 
 18 |     num_inputs = len(sys.argv)
 19 | 
 20 |     #Song files to be hashed into database
 21 |     songs = []
 22 |     songnames = []
 23 |     separator = '.'
 24 | 
 25 |     for i in range(1,num_inputs):
 26 |     	songs.append(read(sys.argv[i]))
 27 |     	name = sys.argv[i].split(separator,1)[0]
 28 |     	songnames.append(name)
 29 | 
 30 |     #TDFT parameters
 31 |     windowsize = 0.008     #set the window size  (0.008s = 64 samples)
 32 |     windowshift = 0.004    #set the window shift (0.004s = 32 samples)
 33 |     fftsize = 1024         #set the fft size (if srate = 8000, 1024 --> 513 freq. bins separated by 7.797 Hz from 0 to 4000Hz) 
 34 |     
 35 |     #Peak picking dimensions 
 36 |     f_dim1 = 30
 37 |     t_dim1 = 80 
 38 |     f_dim2 = 10
 39 |     t_dim2 = 20
 40 |     percentile = 70
 41 |     base = 70 # lowest frequency bin used (peaks below are too common/not as useful for identification)
 42 |     high_peak_threshold = 75
 43 |     low_peak_threshold = 60
 44 | 
 45 |     #Hash parameters
 46 |     delay_time = 250      # 250*0.004 = 1 second
 47 |     delta_time = 250*3    # 750*0.004 = 3 seconds
 48 |     delta_freq = 128      # 128*7.797Hz = approx 1000Hz
 49 |     
 50 |     #Time pair parameters
 51 |     TPdelta_freq = 4
 52 |     TPdelta_time = 2
 53 | 
 54 |     #Construct the audio database of hashes
 55 |     database = np.zeros((1,5))
 56 |     spectrodata = []
 57 |     peaksdata = []
 58 | 
 59 |     for i in range(0,len(songs)):
 60 | 
 61 |     	print('Analyzing '+str(songnames[i]))
 62 |     	srate = songs[i][0]  #sample rate in samples/second
 63 |     	audio = songs[i][1]  #audio data    	
 64 |     	spectrogram = tdft.tdft(audio, srate, windowsize, windowshift, fftsize)
 65 |     	time = spectrogram.shape[0]
 66 |     	freq = spectrogram.shape[1]
 67 | 
 68 |     	threshold = pp.find_thres(spectrogram, percentile, base)
 69 | 
 70 |     	print('The size of the spectrogram is time: '+str(time)+' and freq: '+str(freq))
 71 |     	spectrodata.append(spectrogram)
 72 | 
 73 |     	peaks = pp.peak_pick(spectrogram,f_dim1,t_dim1,f_dim2,t_dim2,threshold,base)
 74 | 
 75 |     	print('The initial number of peaks is:'+str(len(peaks)))
 76 |     	peaks = pp.reduce_peaks(peaks, fftsize, high_peak_threshold, low_peak_threshold)
 77 | 
 78 |     	print('The reduced number of peaks is:'+str(len(peaks)))
 79 |     	peaksdata.append(peaks)
 80 | 
 81 |     	#Calculate the hashMatrix for the database song file
 82 |     	songid = i
 83 |     	hashMatrix = fhash.hashPeaks(peaks,songid,delay_time,delta_time,delta_freq)
 84 | 
 85 |         #Add to the song hash matrix to the database
 86 |     	database = np.concatenate((database,hashMatrix),axis=0)
 87 | 
 88 | 
 89 |     print('The dimensions of the database hash matrix: '+str(database.shape))
 90 |     database = database[np.lexsort((database[:,2],database[:,1],database[:,0]))]
 91 | 
 92 |     # Audio sample to be analyzed and identified
 93 | 
 94 |     print('Please enter an audio sample file to identify: ')
 95 |     userinput = raw_input('---> ')
 96 |     sample = read(userinput)
 97 |     userinput = userinput.split(separator,1)[0]
 98 | 
 99 |     print('Analyzing the audio sample: '+str(userinput))
100 |     srate = sample[0]  #sample rate in samples/second
101 |     audio = sample[1]  #audio data    	
102 |     spectrogram = tdft.tdft(audio, srate, windowsize, windowshift, fftsize)
103 |     time = spectrogram.shape[0]
104 |     freq = spectrogram.shape[1]
105 |     
106 |     print('The size of the spectrogram is time: '+str(time)+' and freq: '+str(freq))
107 | 
108 |     threshold = pp.find_thres(spectrogram, percentile, base)
109 |     
110 |     peaks = pp.peak_pick(spectrogram,f_dim1,t_dim1,f_dim2,t_dim2,threshold,base)
111 |     
112 |     print('The initial number of peaks is:'+str(len(peaks)))
113 |     peaks = pp.reduce_peaks(peaks, fftsize, high_peak_threshold, low_peak_threshold)
114 |     print('The reduced number of peaks is:'+str(len(peaks)))
115 | 
116 |     #Store information for the spectrogram graph
117 |     samplePeaks = peaks
118 |     sampleSpectro = spectrogram
119 | 
120 |     hashSample = fhash.hashSamplePeaks(peaks,delay_time,delta_time,delta_freq)
121 |     print('The dimensions of the hash matrix of the sample: '+str(hashSample.shape))
122 | 
123 |     print('Attempting to identify the sample audio clip.')
124 |     timepairs = fhash.findTimePairs(database, hashSample, TPdelta_freq, TPdelta_time)
125 | 
126 |     #Compute number of matches by song id to determine a match
127 |     numSongs = len(songs)
128 |     songbins= np.zeros(numSongs)
129 |     numOffsets = len(timepairs)
130 |     offsets = np.zeros(numOffsets)
131 |     index = 0
132 |     for i in timepairs:
133 |     	offsets[index]=i[0]-i[1]
134 |     	index = index+1
135 |     	songbins[i[2]] += 1
136 | 
137 |     # Identify the song
138 |     print('The sample song is: '+str(songnames[np.argmax(songbins)]))
139 | 
140 |     # Plots 
141 |     fig = []
142 | 
143 |     # Plot the magnitude spectrograms
144 |     for i in range(0,numSongs):
145 |         fig1 = pylab.figure(i)
146 |         peaks = peaksdata[i]
147 |         pylab.imshow(spectrodata[i].T,origin='lower', aspect='auto', interpolation='nearest')
148 |         pylab.scatter(*zip(*peaks), marker='.', color='blue')
149 |         pylab.title(str(songnames[i])+' Spectrogram and Selected Peaks') 
150 |         pylab.xlabel('Time')    
151 |         pylab.ylabel('Frequency Bin')
152 |         fig.append(fig1)
153 |     
154 |     #Show the figures
155 |     for i in fig:
156 |         i.show()
157 | 
158 |     fig2 = pylab.figure(1002)
159 |     pylab.imshow(sampleSpectro.T,origin='lower', aspect='auto', interpolation='nearest')
160 |     pylab.scatter(*zip(*samplePeaks), marker='.', color='blue')
161 |     pylab.title('Sample File: '+str(userinput)+' Spectrogram and Selected Peaks') 
162 |     pylab.xlabel('Time')    
163 |     pylab.ylabel('Frequency Bin')
164 |     fig2.show()
165 | 
166 |     fig3 = pylab.figure(1003)
167 |     ax = fig3.add_subplot(111)
168 | 
169 |     ind = np.arange(numSongs)
170 |     width = 0.35
171 |     rects1 = ax.bar(ind,songbins,width,color='blue',align='center')
172 |     ax.set_ylabel('Number of Matches')
173 |     ax.set_xticks(ind)
174 |     xtickNames = ax.set_xticklabels(songnames)
175 |     matplotlib.pyplot.setp(xtickNames)
176 |     pylab.title('Song Identification') 
177 |     fig3.show()
178 | 
179 |     pylab.show()
180 | 
181 |     print('The sample song is: '+str(songnames[np.argmax(songbins)]))
182 | 
183 | 


--------------------------------------------------------------------------------
/peakpicker.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Peak Picking Functions
  3 | Bryant Moquist
  4 | '''
  5 | import numpy as np
  6 | 
  7 | def find_thres(spectrogram,percentile,base):
  8 |     "Find the peak picking threshold for a particular spectrogram"
  9 |     dim = spectrogram.shape
 10 |     window = spectrogram[0:dim[0],base:dim[1]]
 11 |     threshold = np.percentile(window, percentile)
 12 | 
 13 |     return threshold
 14 | 
 15 | def peak_pick (S,f_dim1,t_dim1,f_dim2,t_dim2,threshold,base):
 16 |     "Selects local peaks in a spectrogram and returns a list of tuples (time, freq, amplitude)" 
 17 |     "S is spectrogram matrix"
 18 |     "f_dim1,f_dim2,t_dim1,and t_dim2 are freq x time dimensions of the sliding window for first and second passes"
 19 |     "threshold is the minimum amplitude required to be a peak"
 20 |     "base is the lowest frequency bin considered"
 21 | 
 22 |     a = len(S) #num of time bins
 23 |     b = len(S[1]) #num of frequency bins
 24 | 
 25 |     peaks = []
 26 |     t_coords = []
 27 |     f_coords = []
 28 | 
 29 |     "Determine the time x frequency window to analyze"
 30 |     for i in range(0,a,t_dim1):
 31 |         for j in range(base,b,f_dim1):
 32 |             if i + t_dim1 < a and j + f_dim1 < b:
 33 |                 window = S[i:i+t_dim1,j:j+f_dim1]
 34 |             elif i + t_dim1 < a and j + f_dim1 >= b:
 35 |                 window = S[i:i+t_dim1,j:b]
 36 |             elif i + t_dim1 >= a and j + f_dim1 < b:
 37 |                 window = S[i:a,j:j+f_dim1]
 38 |             else:
 39 |                 window = S[i:a,j:b]
 40 | 
 41 |             "Check if the largest value in the window is greater than the threshold"
 42 |             if np.amax(window) >= threshold:
 43 |                 row, col = np.unravel_index(np.argmax(window), window.shape) # pulls coordinates of max value from window
 44 |                 t_coords.append(i+row)
 45 |                 f_coords.append(j+col) 
 46 |      
 47 |     "Iterates through coordinates selected above to make sure that each of those points is in fact a local peak"    
 48 |     for k in range(0,len(f_coords)):
 49 |         fmin = f_coords[k] - f_dim2
 50 |         fmax = f_coords[k] + f_dim2
 51 |         tmin = t_coords[k] - t_dim2
 52 |         tmax = t_coords[k] + t_dim2
 53 |         if fmin < base:
 54 |             fmin = base
 55 |         if fmax > b:
 56 |             fmax = b
 57 |         if tmin < 0:
 58 |             tmin = 0
 59 |         if tmax > a:
 60 |             tmax = a
 61 |         window = S[tmin:tmax,fmin:fmax] #window centered around current coordinate pair
 62 | 
 63 |         "Break when the window is empty"
 64 |         if not window.size:
 65 |             continue
 66 | 
 67 |         "Eliminates coordinates that are not local peaks by setting their coordinates to -1"
 68 |         if S[t_coords[k],f_coords[k]] < np.amax(window):
 69 |             t_coords[k] = -1
 70 |             f_coords[k] = -1
 71 | 
 72 |     "Removes all -1 coordinate pairs"
 73 |     f_coords[:] = (value for value in f_coords if value != -1)
 74 |     t_coords[:] = (value for value in t_coords if value != -1)
 75 | 
 76 |     for x in range(0, len(f_coords)):
 77 |         peaks.append((t_coords[x], f_coords[x], S[t_coords[x], f_coords[x]]))
 78 | 
 79 |     return peaks
 80 | 
 81 | def reduce_peaks(peaks,fftsize,high_peak_threshold,low_peak_threshold):
 82 |     "Reduce the number of peaks by separating into high and low frequency regions and thresholding."
 83 |     #Separate regions ensure better spread of peaks. 
 84 |     low_peaks = []
 85 |     high_peaks = []
 86 | 
 87 |     for item in peaks:
 88 |         if(item[1]>(fftsize/4)):
 89 |             high_peaks.append(item)
 90 |         else:
 91 |             low_peaks.append(item)
 92 |     
 93 |     #Eliminate peaks based on respective thresholds in the low and high frequency regions.  
 94 |     reduced_peaks = []
 95 |     for item in peaks:
 96 |         if(item[1]>(fftsize/4)):
 97 |             if(item[2]>np.percentile(high_peaks,high_peak_threshold,axis=0)[2]):
 98 |                 reduced_peaks.append(item)
 99 |             else:
100 |                 continue
101 |         else:
102 |             if(item[2]>np.percentile(low_peaks,low_peak_threshold,axis=0)[2]):
103 |                 reduced_peaks.append(item)
104 |             else:
105 |                 continue
106 | 
107 |     return reduced_peaks


--------------------------------------------------------------------------------
/tdft.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Time Dependent Fourier Transform (Spectrogram)
 3 | Bryant Moquist
 4 | '''
 5 | 
 6 | import scipy
 7 | import numpy as np
 8 | 
 9 | def tdft(audio, srate, windowsize, windowshift,fftsize):
10 | 
11 |     """Calculate the real valued fast Fourier transform of a segment of audio multiplied by a 
12 |     a Hamming window.  Then, convert to decibels by multiplying by 20*log10.  Repeat for all
13 |     segments of the audio."""
14 |     
15 |     windowsamp = int(windowsize*srate)
16 |     shift = int(windowshift*srate)
17 |     window = scipy.hamming(windowsamp)
18 |     spectrogram = scipy.array([20*scipy.log10(abs(np.fft.rfft(window*audio[i:i+windowsamp],fftsize))) 
19 |                      for i in range(0, len(audio)-windowsamp, shift)])
20 |     return spectrogram


--------------------------------------------------------------------------------