├── LICENSE ├── README.md ├── Results.pdf ├── Shazam Industrial Audio Search.pdf ├── fingerprint.py ├── main.py ├── peakpicker.py └── tdft.py /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Bryant Moquist 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Shazam 2 | ====== 3 | 4 | #####Music Identification Program based on Shazam's methods 5 | 6 | 7 | Drawing on the methods described in "An Industrial Strength Audio Search Algorithm" by Avery Li-Chun Wang of Shazam, this program implements basic music identification capabilities. Results.pdf describes the effectiveness of the program using spectrograms and charts when tested on a small library of songs. 
8 | -------------------------------------------------------------------------------- /Results.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bmoquist/Shazam/78915b4c4c216027867e47005de867d1a71d6efa/Results.pdf -------------------------------------------------------------------------------- /Shazam Industrial Audio Search.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bmoquist/Shazam/78915b4c4c216027867e47005de867d1a71d6efa/Shazam Industrial Audio Search.pdf -------------------------------------------------------------------------------- /fingerprint.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Hash and Acoustic Fingerprint Functions 3 | Bryant Moquist 4 | ''' 5 | 6 | import numpy as np 7 | 8 | def findAdjPts(index,A,delay_time,delta_time,delta_freq): 9 | "Find the three closest adjacent points to the anchor point" 10 | adjPts = [] 11 | low_x = A[index][0]+delay_time 12 | high_x = low_x+delta_time 13 | low_y = A[index][1]-delta_freq/2 14 | high_y = A[index][1]+delta_freq/2 15 | 16 | for i in A: 17 | if ((i[0]>low_x and i[0]low_y and i[1] (j[0]-deltaFreq) and i[0] < (j[0] + deltaFreq)): 71 | if(i[1] > (j[1]-deltaFreq) and i[1] < (j[1] + deltaFreq)): 72 | if(i[2] > (j[2]-deltaTime) and i[2] < (j[2] + deltaTime)): 73 | timePairs.append((j[3],i[3],j[4])) 74 | else: 75 | continue 76 | else: 77 | continue 78 | else: 79 | continue 80 | 81 | return timePairs -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Music Identification Program (a.k.a. 
Shazam/Soundhound) 3 | Proof of Concept 4 | Bryant Moquist 5 | ''' 6 | from __future__ import print_function 7 | import scipy, pylab 8 | from scipy.io.wavfile import read 9 | import sys 10 | import peakpicker as pp 11 | import fingerprint as fhash 12 | import matplotlib 13 | import numpy as np 14 | import tdft 15 | 16 | if __name__ == '__main__': 17 | 18 | num_inputs = len(sys.argv) 19 | 20 | #Song files to be hashed into database 21 | songs = [] 22 | songnames = [] 23 | separator = '.' 24 | 25 | for i in range(1,num_inputs): 26 | songs.append(read(sys.argv[i])) 27 | name = sys.argv[i].split(separator,1)[0] 28 | songnames.append(name) 29 | 30 | #TDFT parameters 31 | windowsize = 0.008 #set the window size (0.008s = 64 samples) 32 | windowshift = 0.004 #set the window shift (0.004s = 32 samples) 33 | fftsize = 1024 #set the fft size (if srate = 8000, 1024 --> 513 freq. bins separated by 7.797 Hz from 0 to 4000Hz) 34 | 35 | #Peak picking dimensions 36 | f_dim1 = 30 37 | t_dim1 = 80 38 | f_dim2 = 10 39 | t_dim2 = 20 40 | percentile = 70 41 | base = 70 # lowest frequency bin used (peaks below are too common/not as useful for identification) 42 | high_peak_threshold = 75 43 | low_peak_threshold = 60 44 | 45 | #Hash parameters 46 | delay_time = 250 # 250*0.004 = 1 second 47 | delta_time = 250*3 # 750*0.004 = 3 seconds 48 | delta_freq = 128 # 128*7.797Hz = approx 1000Hz 49 | 50 | #Time pair parameters 51 | TPdelta_freq = 4 52 | TPdelta_time = 2 53 | 54 | #Construct the audio database of hashes 55 | database = np.zeros((1,5)) 56 | spectrodata = [] 57 | peaksdata = [] 58 | 59 | for i in range(0,len(songs)): 60 | 61 | print('Analyzing '+str(songnames[i])) 62 | srate = songs[i][0] #sample rate in samples/second 63 | audio = songs[i][1] #audio data 64 | spectrogram = tdft.tdft(audio, srate, windowsize, windowshift, fftsize) 65 | time = spectrogram.shape[0] 66 | freq = spectrogram.shape[1] 67 | 68 | threshold = pp.find_thres(spectrogram, percentile, base) 69 | 70 | print('The 
size of the spectrogram is time: '+str(time)+' and freq: '+str(freq)) 71 | spectrodata.append(spectrogram) 72 | 73 | peaks = pp.peak_pick(spectrogram,f_dim1,t_dim1,f_dim2,t_dim2,threshold,base) 74 | 75 | print('The initial number of peaks is:'+str(len(peaks))) 76 | peaks = pp.reduce_peaks(peaks, fftsize, high_peak_threshold, low_peak_threshold) 77 | 78 | print('The reduced number of peaks is:'+str(len(peaks))) 79 | peaksdata.append(peaks) 80 | 81 | #Calculate the hashMatrix for the database song file 82 | songid = i 83 | hashMatrix = fhash.hashPeaks(peaks,songid,delay_time,delta_time,delta_freq) 84 | 85 | #Add to the song hash matrix to the database 86 | database = np.concatenate((database,hashMatrix),axis=0) 87 | 88 | 89 | print('The dimensions of the database hash matrix: '+str(database.shape)) 90 | database = database[np.lexsort((database[:,2],database[:,1],database[:,0]))] 91 | 92 | # Audio sample to be analyzed and identified 93 | 94 | print('Please enter an audio sample file to identify: ') 95 | userinput = raw_input('---> ') 96 | sample = read(userinput) 97 | userinput = userinput.split(separator,1)[0] 98 | 99 | print('Analyzing the audio sample: '+str(userinput)) 100 | srate = sample[0] #sample rate in samples/second 101 | audio = sample[1] #audio data 102 | spectrogram = tdft.tdft(audio, srate, windowsize, windowshift, fftsize) 103 | time = spectrogram.shape[0] 104 | freq = spectrogram.shape[1] 105 | 106 | print('The size of the spectrogram is time: '+str(time)+' and freq: '+str(freq)) 107 | 108 | threshold = pp.find_thres(spectrogram, percentile, base) 109 | 110 | peaks = pp.peak_pick(spectrogram,f_dim1,t_dim1,f_dim2,t_dim2,threshold,base) 111 | 112 | print('The initial number of peaks is:'+str(len(peaks))) 113 | peaks = pp.reduce_peaks(peaks, fftsize, high_peak_threshold, low_peak_threshold) 114 | print('The reduced number of peaks is:'+str(len(peaks))) 115 | 116 | #Store information for the spectrogram graph 117 | samplePeaks = peaks 118 | sampleSpectro 
= spectrogram 119 | 120 | hashSample = fhash.hashSamplePeaks(peaks,delay_time,delta_time,delta_freq) 121 | print('The dimensions of the hash matrix of the sample: '+str(hashSample.shape)) 122 | 123 | print('Attempting to identify the sample audio clip.') 124 | timepairs = fhash.findTimePairs(database, hashSample, TPdelta_freq, TPdelta_time) 125 | 126 | #Compute number of matches by song id to determine a match 127 | numSongs = len(songs) 128 | songbins= np.zeros(numSongs) 129 | numOffsets = len(timepairs) 130 | offsets = np.zeros(numOffsets) 131 | index = 0 132 | for i in timepairs: 133 | offsets[index]=i[0]-i[1] 134 | index = index+1 135 | songbins[i[2]] += 1 136 | 137 | # Identify the song 138 | print('The sample song is: '+str(songnames[np.argmax(songbins)])) 139 | 140 | # Plots 141 | fig = [] 142 | 143 | # Plot the magnitude spectrograms 144 | for i in range(0,numSongs): 145 | fig1 = pylab.figure(i) 146 | peaks = peaksdata[i] 147 | pylab.imshow(spectrodata[i].T,origin='lower', aspect='auto', interpolation='nearest') 148 | pylab.scatter(*zip(*peaks), marker='.', color='blue') 149 | pylab.title(str(songnames[i])+' Spectrogram and Selected Peaks') 150 | pylab.xlabel('Time') 151 | pylab.ylabel('Frequency Bin') 152 | fig.append(fig1) 153 | 154 | #Show the figures 155 | for i in fig: 156 | i.show() 157 | 158 | fig2 = pylab.figure(1002) 159 | pylab.imshow(sampleSpectro.T,origin='lower', aspect='auto', interpolation='nearest') 160 | pylab.scatter(*zip(*samplePeaks), marker='.', color='blue') 161 | pylab.title('Sample File: '+str(userinput)+' Spectrogram and Selected Peaks') 162 | pylab.xlabel('Time') 163 | pylab.ylabel('Frequency Bin') 164 | fig2.show() 165 | 166 | fig3 = pylab.figure(1003) 167 | ax = fig3.add_subplot(111) 168 | 169 | ind = np.arange(numSongs) 170 | width = 0.35 171 | rects1 = ax.bar(ind,songbins,width,color='blue',align='center') 172 | ax.set_ylabel('Number of Matches') 173 | ax.set_xticks(ind) 174 | xtickNames = ax.set_xticklabels(songnames) 175 | 
'''
Peak Picking Functions
Bryant Moquist
'''
import numpy as np


def find_thres(spectrogram, percentile, base):
    """Return the peak-picking amplitude threshold for a spectrogram.

    The threshold is the given percentile of all amplitudes in the frequency
    bins at or above `base`; bins below `base` carry very common
    low-frequency energy and would skew the statistic.
    """
    window = spectrogram[:, base:]
    return np.percentile(window, percentile)


def peak_pick(S, f_dim1, t_dim1, f_dim2, t_dim2, threshold, base):
    """Select local peaks in a spectrogram.

    S is the spectrogram matrix (time x frequency).  f_dim1/t_dim1 and
    f_dim2/t_dim2 are the freq x time dimensions of the sliding window for
    the first (candidate) and second (verification) passes.  `threshold` is
    the minimum amplitude required to be a peak and `base` the lowest
    frequency bin considered.  Returns a list of (time, freq, amplitude)
    tuples.
    """
    # S.shape is used rather than len(S[1]): len(S[1]) reads row 1 and
    # crashes on a spectrogram with fewer than two time frames.
    a = S.shape[0]  # number of time bins
    b = S.shape[1]  # number of frequency bins

    t_coords = []
    f_coords = []

    # First pass: take the maximum of each coarse time x frequency tile as a
    # peak candidate.  Slicing clamps at the array edge, so min() reproduces
    # the original four-way boundary cases.
    for i in range(0, a, t_dim1):
        for j in range(base, b, f_dim1):
            window = S[i:min(i + t_dim1, a), j:min(j + f_dim1, b)]
            if np.amax(window) >= threshold:
                # Coordinates of the max value within the window
                row, col = np.unravel_index(np.argmax(window), window.shape)
                t_coords.append(i + row)
                f_coords.append(j + col)

    # Second pass: verify each candidate against a finer window centered on
    # it; candidates that are not the local maximum are marked with -1.
    for k in range(len(f_coords)):
        fmin = max(f_coords[k] - f_dim2, base)
        fmax = min(f_coords[k] + f_dim2, b)
        tmin = max(t_coords[k] - t_dim2, 0)
        tmax = min(t_coords[k] + t_dim2, a)
        window = S[tmin:tmax, fmin:fmax]

        # Skip degenerate (empty) windows
        if not window.size:
            continue

        if S[t_coords[k], f_coords[k]] < np.amax(window):
            t_coords[k] = -1
            f_coords[k] = -1

    # Drop all -1 sentinel coordinate pairs
    f_coords[:] = (value for value in f_coords if value != -1)
    t_coords[:] = (value for value in t_coords if value != -1)

    return [(t_coords[x], f_coords[x], S[t_coords[x], f_coords[x]])
            for x in range(len(f_coords))]


def reduce_peaks(peaks, fftsize, high_peak_threshold, low_peak_threshold):
    """Reduce the number of peaks by thresholding low/high frequency regions.

    Splitting at fftsize/4 ensures a better spread of surviving peaks.
    Returns the subset of `peaks` whose amplitude exceeds its region's
    percentile threshold, in the original order.
    """
    split = fftsize / 4  # boundary between the low and high frequency regions
    low_peaks = [item for item in peaks if item[1] <= split]
    high_peaks = [item for item in peaks if item[1] > split]

    # Compute each region's amplitude percentile once (the original
    # recomputed np.percentile inside the per-item loop: accidental O(n^2)).
    # An empty region is never consulted, so None is a safe placeholder.
    high_cut = (np.percentile(high_peaks, high_peak_threshold, axis=0)[2]
                if high_peaks else None)
    low_cut = (np.percentile(low_peaks, low_peak_threshold, axis=0)[2]
               if low_peaks else None)

    reduced_peaks = []
    for item in peaks:
        cut = high_cut if item[1] > split else low_cut
        if item[2] > cut:
            reduced_peaks.append(item)

    return reduced_peaks
'''
Time Dependent Fourier Transform (Spectrogram)
Bryant Moquist
'''

import numpy as np


def tdft(audio, srate, windowsize, windowshift, fftsize):
    """Calculate the real-valued FFT of each Hamming-windowed segment of
    `audio` and convert the magnitudes to decibels (20*log10).

    audio       -- 1-D array of samples
    srate       -- sample rate in samples/second
    windowsize  -- analysis window length in seconds
    windowshift -- hop between consecutive windows in seconds
    fftsize     -- FFT length (zero-padded); output has fftsize//2 + 1 bins

    Returns a (num_frames, fftsize//2 + 1) spectrogram matrix.
    """
    windowsamp = int(windowsize * srate)
    shift = int(windowshift * srate)
    # np.hamming replaces scipy.hamming: the NumPy-alias functions
    # (scipy.hamming, scipy.array, scipy.log10) were removed from the
    # top-level scipy namespace in SciPy 1.12; the NumPy versions are
    # numerically identical.
    window = np.hamming(windowsamp)
    spectrogram = np.array(
        [20 * np.log10(np.abs(np.fft.rfft(window * audio[i:i + windowsamp], fftsize)))
         for i in range(0, len(audio) - windowsamp, shift)])
    return spectrogram