├── .gitignore
├── README.md
├── __init__.py
├── f0estimate.py
└── utilities.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.swo
*.swp
*.pyc
testdata

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Multiple Fundamental Frequency Estimation
-----------------------------------------

Multiple F0 estimation based on Anssi Klapuri's 2006 paper, "Multiple Fundamental Frequency Estimation by Summing Harmonic Amplitudes". Implemented in Python using NumPy.

Current Status
--------------

I should point out that my implementation never achieved the results stated in Klapuri's paper. I suspect the issue lies in the spectral estimation of each detected fundamental frequency and its harmonics, a step the paper describes only vaguely. I have since moved institutions and no longer have the time to troubleshoot. If anyone would like to take a crack at it, there is a good code base here to work with.

Authors
-------
Gregory Burlet 2012

--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gburlet/multi-f0-estimation/2cc06e3ea60fc1cb07a8a7f0247431affcf736a3/__init__.py
--------------------------------------------------------------------------------
/f0estimate.py:
--------------------------------------------------------------------------------
from __future__ import division
import os
import argparse
import numpy as np
from scikits.audiolab import wavread
from scipy.signal import get_window

from utilities import nextpow2
from pymei import MeiDocument, MeiElement, XmlExport

# set up command line argument structure
parser = argparse.ArgumentParser(description='Estimate the pitches in an audio file.')
parser.add_argument('-fin', '--filein', help='input file')
parser.add_argument('-fout', '--fileout', help='output file')
parser.add_argument('-v', '--verbose', help='increase output verbosity', action='store_true')

class F0Estimate:

    # sorted list of frequencies used to find the closest pitch
    # name and octave to the fundamental frequency estimate.
    # frequencies range from 65Hz to 2100Hz.
    frequencies = np.array([2**(n/12.0)*440 for n in range(-33,28)])

    # sorted list of pitch names and octaves, which correspond to the
    # frequency elements in the frequencies list.
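    # (e.g. frequencies[0] = 2**(-33/12)*440 ~ 65.41 Hz corresponds to
    # notes[0] = C2, and frequencies[-1] ~ 2093 Hz corresponds to C7)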
    # for enharmonic notes, choose sharps instead of flats
    pnames = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
    notes = [{'pname': p, 'oct': octave} for octave in range(2, 7) for p in pnames]
    notes.append({'pname': 'C', 'oct': 7})

    def __init__(self, **kwargs):
        # maximum number of simultaneous notes
        self._max_poly = kwargs.get('max_poly', 6)

        # minimum and maximum fundamental frequency (Hz) to detect
        self._min_f0 = kwargs.get('min_f0', 65)
        self._max_f0 = kwargs.get('max_f0', 2100)

        # analysis frame length (s)
        self._frame_len_sec = kwargs.get('frame_len_sec', 0.093)
        if self._frame_len_sec not in (0.046, 0.093):
            raise ValueError('Analysis frame length must be 46ms or 93ms')

        self._window_func = kwargs.get('window_func', 'hanning')

        # bin width of the estimated spectrum of each partial
        # of the detected fundamental
        self._partial_width = kwargs.get('partial_width', 10)

        '''
        Derived parameters
        '''
        # these parameter values are from the 2006 paper
        if self._frame_len_sec == 0.046:
            self._alpha = 27
            self._beta = 320
            self._d = 1.0
        else:
            self._alpha = 52
            self._beta = 320
            self._d = 0.89
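
    # A hypothetical construction mirroring the defaults (93 ms frames, for
    # which the 2006 paper gives alpha=52, beta=320, d=0.89):
    #
    #   estimator = F0Estimate(max_poly=6, min_f0=65, max_f0=2100,
    #                          frame_len_sec=0.093)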

    def estimate_f0s(self, audio_path):
        if not os.path.exists(audio_path):
            raise ValueError('Invalid audio path')

        x, fs, _ = wavread(audio_path)

        # mix down to mono if the input is multi-channel
        if x.ndim > 1:
            _, n_channels = x.shape
            x = x.sum(axis=1)/n_channels

        X = self._stft(x, fs)

        # Section 2.1: spectrally whiten the signal to suppress timbral information
        Y = self._spectral_whitening(X, fs)

        # perform iterative estimation of the fundamental periods in the audio file
        f0_estimations = self._iterative_est(Y, fs)

        # get notes which correspond to these frequency estimates
        notes = []
        for frame_ests in f0_estimations:
            notes.append([self._freq_to_note(f) for f in frame_ests])

        return f0_estimations, notes

    def _freq_to_note(self, freq):
        i_note = np.argmin(np.abs(F0Estimate.frequencies-freq))
        return F0Estimate.notes[i_note]

    def _stft(self, x, fs):
        '''
        Calculate the short-time Fourier transform of the signal. Each frame
        is windowed and zero-padded to (at least) twice its length.
        Hop size = window length (no overlap).
        '''

        frame_len_samps = int(fs * self._frame_len_sec)
        win = get_window(self._window_func, frame_len_samps)

        # zero-pad to twice the length of the frame
        K = int(nextpow2(2*frame_len_samps))
        X = np.array([np.fft.fft(win*x[i:i+frame_len_samps], K)
                      for i in xrange(0, len(x)-frame_len_samps, frame_len_samps)])

        return X

    def _spectral_whitening(self, X, fs, nu=0.33):
        '''
        Spectrally flatten ('whiten') the given input signal in the frequency domain,
        with the intention of suppressing timbral information.

        PARAMETERS
        ----------
        X (T, K): frequency domain input signal with T frames and FFT of length K
        fs: sampling rate of the input signal
        nu (float): amount of spectral whitening
        '''

        T, K = X.shape
        nyquist_freq = fs/2
        nyquist_bin = K>>1

        # calculate centre frequencies c_b (Hz) of subbands on the critical-band scale
        # c_b = 229 * (10^[(b+1)/21.4] - 1)
        # calculate one subband below and above the range to get the head and tail
        # frequencies of the triangle windows
        c = []  # centre frequencies of critical bands
        b = 0   # critical-band index
        while True:
            centre_freq = 229*(10**((b+1)/21.4)-1)
            if centre_freq < nyquist_freq:
                c.append(centre_freq)
                b += 1
            else:
                break

        c = np.asarray(c)
        c_bins = np.asarray(np.floor(c*K/fs), int)

        # subband compression coefficients -> gamma (T, K/2)
        gamma = np.zeros([T, nyquist_bin])

        # for each subband
        for b in xrange(1, len(c_bins)-1):
            H = np.zeros(nyquist_bin)

            left = c_bins[b-1]
            centre = c_bins[b]
            right = c_bins[b+1]

            # construct the triangular power response for each subband
            H[left:centre+1] = np.linspace(0, 1, centre - left + 1)
            H[centre:right+1] = np.linspace(1, 0, right - centre + 1)

            # multiply by 2, since energy is symmetric about the nyquist rate
            gamma[:,centre] = np.sqrt((2/K)*np.sum(H*(np.abs(X[:,:nyquist_bin])**2), axis=1))**(nu-1)

            # interpolate between the previous centre bin and the current centre bin
            # for each STFT frame
            for t in xrange(T):
                gamma[t,left:centre] = np.linspace(gamma[t,left], gamma[t,centre], centre - left)
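        # (in the paper's terms: gamma_b = sigma_b**(nu-1), where sigma_b is the
        # square root of the triangle-weighted subband power computed above;
        # nu=0.33 gives partial whitening, while nu=1 would leave the spectrum
        # unchanged)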
        # calculate the whitened spectrum. Only half of the spectrum needs to
        # be stored for analysis, since the bin energy is symmetric about the
        # nyquist frequency
        Y = gamma * X[:,:nyquist_bin]

        return Y

    def _iterative_est(self, Y, fs):
        f0_estimations = []

        T = Y.shape[0]
        # for each STFT frame
        for t in xrange(T):
            # residual magnitude spectrum of the analysis frame
            Y_t_R = np.abs(Y[t,:])

            # fundamental frequency estimates for the current frame
            f0_frame_estimations = []

            # keep track of saliences of period estimates in this frame
            S = -1
            salience_hats = []

            # while there are fundamentals to estimate and the maximum
            # polyphony is not exceeded
            while len(salience_hats) < self._max_poly:
                tau_hat, salience_hat, Y_t_D = self._search_smax(Y_t_R, fs, tau_prec=0.5)
                salience_hats.append(salience_hat)
                f0_frame_estimations.append(fs/tau_hat)

                cur_S = self._calc_S(salience_hats)
                if cur_S <= S:
                    # the normalized salience stopped increasing, so the newest
                    # estimate is spurious: discard it and stop searching
                    salience_hats.pop()
                    f0_frame_estimations.pop()
                    break
                else:
                    # subtract the detected spectrum from the residual spectrum
                    Y_t_R -= self._d*Y_t_D
                    Y_t_R[Y_t_R < 0] = 0

                    S = cur_S

            f0_estimations.append(f0_frame_estimations)

        return f0_estimations

    def _calc_S(self, salience_hats, gamma=0.7):
        '''
        Calculate a normalized sum of saliences to determine whether searching
        for more fundamentals in the spectrum is necessary.
        '''

        j = len(salience_hats)
        S = sum(salience_hats)/(j**gamma)

        return S

    def _search_smax(self, Y_t_R, fs, tau_prec=1.0):
        Q = 0       # index of the new block
        q_best = 0  # index of the best block

        tau_low = [round(fs/self._max_f0)]  # in samples/cycle
        tau_up = [round(fs/self._min_f0)]   # in samples/cycle
        smax = [0]

        while tau_up[q_best] - tau_low[q_best] > tau_prec:
            # split the best block and compute new limits
            Q += 1
            tau_low.append((tau_low[q_best] + tau_up[q_best])/2)
            tau_up.append(tau_up[q_best])
            tau_up[q_best] = tau_low[Q]

            # compute new saliences for the two block halves
            for q in [q_best, Q]:
                salience, _ = self._calc_salience(Y_t_R, fs, tau_low[q], tau_up[q])
                if q == q_best:
                    smax[q_best] = salience
                else:
                    smax.append(salience)

            q_best = np.argmax(smax)

        # estimated fundamental period of the frame
        tau_hat = (tau_low[q_best] + tau_up[q_best])/2

        # calculate the spectrum of the detected fundamental period and harmonics
        salience_hat, harmonics = self._calc_salience(Y_t_R, fs, tau_low[q_best], tau_up[q_best])
        K = len(Y_t_R)<<1
        Y_t_D = self._calc_harmonic_spec(fs, K, harmonics)

        return tau_hat, salience_hat, Y_t_D

    def _calc_salience(self, Y_t_R, fs, tau_low, tau_up):
        salience = 0

        tau = (tau_low + tau_up)/2
        delta_tau = tau_up - tau_low

        # calculate the number of harmonics under the nyquist frequency;
        # the statement below is equivalent to floor((fs/2)/f0)
        num_harmonics = int(np.floor(tau/2))

        # calculate all harmonic weights
        harmonic_numbers = np.arange(num_harmonics) + 1
        g = (fs/tau_low + self._alpha) / (harmonic_numbers*fs/tau_up + self._beta)

        # calculate lower and upper bounds of each partial vicinity
        nyquist_bin = len(Y_t_R)
        K = nyquist_bin<<1
        lb_vicinity = K/(tau + delta_tau/2)
        ub_vicinity = K/(tau - delta_tau/2)

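        # (any fundamental period in the block [tau_low, tau_up] places its
        # m-th harmonic within bins [m*lb_vicinity, m*ub_vicinity]; the loop
        # below takes the strongest residual-spectrum bin in each vicinity)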
        # for each harmonic
        harmonics = []
        for m in xrange(1, num_harmonics+1):
            harmonic_lb = int(round(m*lb_vicinity))
            harmonic_ub = int(min(round(m*ub_vicinity), nyquist_bin))
            harmonic_bin = np.argmax(Y_t_R[harmonic_lb-1:harmonic_ub]) + harmonic_lb-1
            harmonic_amp = Y_t_R[harmonic_bin]
            w_harmonic_amp = g[m-1] * harmonic_amp

            # save the properties of this fundamental period and harmonics
            harmonics.append({'bin': harmonic_bin, 'amp': w_harmonic_amp})

            salience += w_harmonic_amp

        return salience, harmonics

    def _calc_harmonic_spec(self, fs, K, harmonics):
        nyquist_bin = K>>1
        # initialize spectrum of detected harmonics
        Y_t_D = np.zeros(nyquist_bin)

        # calculate the partial spectrum for each harmonic
        # Klapuri PhD thesis, page 62 and (Klapuri, 2006) Section 2.5.
        # Even with these sources, the algorithm for estimating the
        # spectrum of the fundamental and partials is rather unclear.
        frame_len_samps = int(fs * self._frame_len_sec)
        win = get_window(self._window_func, frame_len_samps)
        window_spec = np.abs(np.fft.fft(win, K))
        partial_spectrum = np.hstack((window_spec[self._partial_width::-1],
                                      window_spec[1:self._partial_width+1]))
        # normalize the spectrum
        partial_spectrum /= np.max(partial_spectrum)

        for h in harmonics:
            h_lb = max(0, h['bin']-self._partial_width)
            h_ub = min(nyquist_bin-1, h['bin']+self._partial_width)

            # translate the spectrum of the window function to the position of the harmonic
            Y_t_D[h_lb:h_ub+1] = h['amp']*partial_spectrum[h_lb-h['bin']+self._partial_width:
                                                           h_ub-h['bin']+self._partial_width+1]
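        # (with the default partial_width of 10, each detected harmonic has
        # contributed a 2*10+1 = 21-bin translated copy of the window spectrum,
        # scaled by its weighted amplitude; overlapping vicinities are
        # overwritten rather than summed)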

        return Y_t_D

    def collapse_notes(self, notes):
        '''
        Collapse consecutive identical notes (notes that span more than
        one analysis frame) into a single note event.
        '''

        notes_c = []
        prev_frame = []
        for frame_n in notes:
            # remove duplicate notes within the frame
            if len(frame_n) > 1:
                n_set = set([n['pname']+str(n['oct']) for n in frame_n])
                frame_n = [{'pname': n[:-1], 'oct': int(n[-1])} for n in n_set]

            # if the polyphony differs from the previous frame, add to notes
            if len(frame_n) != len(prev_frame):
                notes_c.append(frame_n)
            elif not np.all([n1['pname'] == n2['pname'] and n1['oct'] == n2['oct']
                             for n1, n2 in zip(prev_frame, frame_n)]):
                notes_c.append(frame_n)

            prev_frame = frame_n

        return notes_c
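
    # A hypothetical illustration of collapse_notes: four frames
    # [[C4], [C4], [C4, E4], [C4, E4]] collapse to [[C4], [C4, E4]];
    # only changes in the sounding pitch set are kept, so note
    # durations are not preserved.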

    def write_mei(self, notes, output_path=None):
        # begin constructing the MEI document
        meidoc = MeiDocument()
        mei = MeiElement('mei')
        meidoc.setRootElement(mei)
        mei_head = MeiElement('meiHead')
        mei.addChild(mei_head)

        music = MeiElement('music')
        body = MeiElement('body')
        mdiv = MeiElement('mdiv')
        score = MeiElement('score')
        score_def = MeiElement('scoreDef')

        # assume a 4/4 time signature
        meter_count = 4
        meter_unit = 4
        score_def.addAttribute('meter.count', str(meter_count))
        score_def.addAttribute('meter.unit', str(meter_unit))

        staff_def = MeiElement('staffDef')
        staff_def.addAttribute('n', '1')
        staff_def.addAttribute('label.full', 'Electric Guitar')
        staff_def.addAttribute('clef.shape', 'TAB')

        instr_def = MeiElement('instrDef')
        instr_def.addAttribute('n', 'Electric_Guitar')
        instr_def.addAttribute('midi.channel', '1')
        instr_def.addAttribute('midi.instrnum', '28')

        mei.addChild(music)
        music.addChild(body)
        body.addChild(mdiv)
        mdiv.addChild(score)
        score.addChild(score_def)
        score_def.addChild(staff_def)
        staff_def.addChild(instr_def)

        section = MeiElement('section')
        score.addChild(section)
        # another scoreDef for the section
        score_def = MeiElement('scoreDef')
        score_def.addAttribute('meter.count', str(meter_count))
        score_def.addAttribute('meter.unit', str(meter_unit))
        section.addChild(score_def)

        # start writing pitches to the document
        note_container = None
        for i, frame_n in enumerate(notes):
            if i % meter_count == 0:
                measure = MeiElement('measure')
                measure.addAttribute('n', str(int(i/meter_count + 1)))
                staff = MeiElement('staff')
                staff.addAttribute('n', '1')
                layer = MeiElement('layer')
                layer.addAttribute('n', '1')
                section.addChild(measure)
                measure.addChild(staff)
                staff.addChild(layer)
                note_container = layer

            if len(frame_n) > 1:
                chord = MeiElement('chord')
                for n in frame_n:
                    note = MeiElement('note')
                    pname = n['pname'][0].upper()
                    note.addAttribute('pname', pname)
                    note.addAttribute('oct', str(n['oct']))
                    if len(n['pname']) > 1 and n['pname'][1] == '#':
                        # there is an accidental
                        note.addAttribute('accid.ges', 's')
                    note.addAttribute('dur', str(meter_unit))
                    chord.addChild(note)
                note_container.addChild(chord)
            else:
                n = frame_n[0]
                note = MeiElement('note')
                pname = n['pname'][0].upper()
                note.addAttribute('pname', pname)
                note.addAttribute('oct', str(n['oct']))
                if len(n['pname']) > 1 and n['pname'][1] == '#':
                    # there is an accidental
                    note.addAttribute('accid.ges', 's')
                note.addAttribute('dur', str(meter_unit))
                note_container.addChild(note)

        if output_path is not None:
            XmlExport.meiDocumentToFile(meidoc, output_path)
        else:
            return XmlExport.meiDocumentToText(meidoc)

if __name__ == '__main__':
    # parse command line arguments
    args = parser.parse_args()

    input_path = args.filein
    if not os.path.exists(input_path):
        raise ValueError('The input file does not exist')

    output_path = args.fileout

    # check that the file extensions are correct for this type of conversion
    _, input_ext = os.path.splitext(input_path)
    if input_ext != '.wav':
        raise ValueError('Input path must be a wav file')
    _, output_ext = os.path.splitext(output_path)
    if output_ext != '.mei':
        raise ValueError('Output path must have the file extension .mei')

    freq_est = F0Estimate(max_poly=6)
    f0_estimates, notes = freq_est.estimate_f0s(input_path)
    notes_c = freq_est.collapse_notes(notes)
    freq_est.write_mei(notes_c, output_path)

--------------------------------------------------------------------------------
/utilities.py:
--------------------------------------------------------------------------------
'''
Utility functions
'''

import numpy as np

def nextpow2(x):
    # smallest power of two greater than or equal to x
    return 2**np.ceil(np.log2(x))

--------------------------------------------------------------------------------