├── .gitignore
├── COPYING
├── MFCC.py
├── README.md
└── smacpy.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | *.pyc
3 | wavs
4 | 
5 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
 1 | smacpy
 2 | Copyright (c) 2012 Dan Stowell and Queen Mary University of London
 3 | 
 4 | incorporating MFCC.py
 5 | Copyright (c) 2009 Gyorgy Fazekas and Queen Mary University of London
 6 | 
 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 8 | 
 9 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
10 | 
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
12 | 
13 | 


--------------------------------------------------------------------------------
/MFCC.py:
--------------------------------------------------------------------------------
  1 | '''MFCC.py
  2 | 
  3 | Calculation of MFCC coefficients from frequency-domain data
  4 | 
  5 | Adapted from the Vampy example plugin "PyMFCC" by Gyorgy Fazekas
  6 | http://code.soundsoftware.ac.uk/projects/vampy/repository/entry/Example%20VamPy%20plugins/PyMFCC.py
  7 | 
  8 | Centre for Digital Music, Queen Mary University of London.
  9 | Copyright (C) 2009 Gyorgy Fazekas, QMUL.
 10 | '''
 11 | 
 12 | import sys,numpy
 13 | from numpy import abs,log,exp,floor,sum,sqrt,cos,hstack
 14 | from numpy.fft import *
 15 | 
 16 | class melScaling(object):
 17 | 
 18 | 	def __init__(self,sampleRate,inputSize,numBands,minHz = 0,maxHz = None):
 19 | 		'''Initialise frequency warping and DCT matrix. 
 20 | 		Parameters:
 21 | 		sampleRate: audio sample rate
 22 | 		inputSize: length of magnitude spectrum (half of FFT size assumed)
 23 | 		numBands: number of mel Bands (MFCCs)
 24 | 		minHz: lower bound of warping  (default = DC)
 25 | 		maxHz: higher bound of warping (default = Nyquist frequency)
 26 | 		'''
 27 | 		self.sampleRate = sampleRate
 28 | 		self.NqHz = sampleRate / 2.0
 29 | 		self.minHz = minHz
 30 | 		if maxHz is None : maxHz = self.NqHz
 31 | 		self.maxHz = maxHz
 32 | 		self.inputSize = inputSize
 33 | 		self.numBands = numBands
 34 | 		self.valid = False
 35 | 		self.updated = False
 36 | 		
 37 | 	def update(self): 
 38 | 		# make sure this will run only once 
 39 | 		# if called from a vamp process
 40 | 		if self.updated: return self.valid
 41 | 		self.updated = True
 42 | 		self.valid = False
 43 | 		print('Updating parameters and recalculating filters: ')
 44 | 		print('Nyquist: ',self.NqHz)
 45 | 		
 46 | 		if self.maxHz > self.NqHz : 
 47 | 			raise Exception('Maximum frequency must be smaller than the Nyquist frequency')
 48 | 		
 49 | 		self.maxMel = 1000*log(1+self.maxHz/700.0)/log(1+1000.0/700.0)
 50 | 		self.minMel = 1000*log(1+self.minHz/700.0)/log(1+1000.0/700.0)
 51 | 		print('minHz:%s\nmaxHz:%s\nminMel:%s\nmaxMel:%s\n' \
 52 | 		%(self.minHz,self.maxHz,self.minMel,self.maxMel))
 53 | 		self.filterMatrix = self.getFilterMatrix(self.inputSize,self.numBands)
 54 | 		self.DCTMatrix = self.getDCTMatrix(self.numBands)
 55 | 		self.valid = True
 56 | 		return self.valid
 57 | 				
 58 | 	def getFilterCentres(self,inputSize,numBands):
 59 | 		'''Calculate Mel filter centres around FFT bins.
 60 | 		This function calculates two extra bands at the edges for
 61 | 		finding the starting and end point of the first and last 
 62 | 		actual filters.'''
 63 | 		centresMel = numpy.array(range(numBands+2)) * (self.maxMel-self.minMel)/(numBands+1) + self.minMel
 64 | 		centresBin = numpy.floor(0.5 + 700.0*inputSize*(exp(centresMel*log(1+1000.0/700.0)/1000.0)-1)/self.NqHz)
 65 | 		return numpy.array(centresBin,int)
 66 | 		
 67 | 	def getFilterMatrix(self,inputSize,numBands):
 68 | 		'''Compose the Mel scaling matrix.'''
 69 | 		filterMatrix = numpy.zeros((numBands,inputSize))
 70 | 		self.filterCentres = self.getFilterCentres(inputSize,numBands)
 71 | 		for i in range(numBands) :
 72 | 			start,centre,end = self.filterCentres[i:i+3]
 73 | 			self.setFilter(filterMatrix[i],start,centre,end)
 74 | 		return filterMatrix.transpose()
 75 | 
 76 | 	def setFilter(self,filt,filterStart,filterCentre,filterEnd):
 77 | 		'''Calculate a single Mel filter.'''
 78 | 		k1 = numpy.float32(filterCentre-filterStart)
 79 | 		k2 = numpy.float32(filterEnd-filterCentre)
 80 | 		up = (numpy.array(range(filterStart,filterCentre))-filterStart)/k1
 81 | 		dn = (filterEnd-numpy.array(range(filterCentre,filterEnd)))/k2
 82 | 		filt[filterStart:filterCentre] = up
 83 | 		filt[filterCentre:filterEnd] = dn
 84 | 
 85 | 	def warpSpectrum(self,magnitudeSpectrum):
 86 | 		'''Compute the Mel scaled spectrum.'''
 87 | 		return numpy.dot(magnitudeSpectrum,self.filterMatrix)
 88 | 		
 89 | 	def getDCTMatrix(self,size):
 90 | 		'''Calculate the square DCT transform matrix. Results are 
 91 | 		equivalent to Matlab dctmtx(n) with 64 bit precision.'''
 92 | 		DCTmx = numpy.array(range(size),numpy.float64).repeat(size).reshape(size,size)
 93 | 		DCTmxT = numpy.pi * (DCTmx.transpose()+0.5) / size
 94 | 		DCTmxT = (1.0/sqrt( size / 2.0)) * cos(DCTmx * DCTmxT)
 95 | 		DCTmxT[0] = DCTmxT[0] * (sqrt(2.0)/2.0)
 96 | 		return DCTmxT
 97 | 		
 98 | 	def dct(self,data_matrix):
 99 | 		'''Compute DCT of input matrix.'''
100 | 		return numpy.dot(self.DCTMatrix,data_matrix)
101 | 		
102 | 	def getMFCCs(self,warpedSpectrum,cn=True):
103 | 		'''Compute MFCC coefficients from Mel warped magnitude spectrum.'''
104 | 		mfccs=self.dct(numpy.log(numpy.clip(warpedSpectrum, 1e-9, numpy.inf)))
105 | 		if cn is False : mfccs[0] = 0.0
106 | 		return mfccs
107 | 
108 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | smacpy - simple-minded audio classifier in python
 3 | =================================================
 4 | 
 5 | Copyright (c) 2012 Dan Stowell and Queen Mary University of London
 6 | (incorporating code Copyright (c) 2009 Gyorgy Fazekas and Queen Mary University of London)
 7 | - for licence information see the file named COPYING.
 8 | 
 9 | This is a classifier that you can train on a set of labelled audio files, and then it predicts a label for further audio files.
10 | It is designed with two main aims:
11 | 
12 | 1. to provide a baseline against which to test more advanced audio classifiers;
13 | 2. to provide a simple code example of a classifier which people are free to build on.
14 | 
It uses a workflow which was very common before the age of deep learning, and might still be useful for low-complexity audio tasks: take an audio clip as input, convert it frame-by-frame into MFCCs, and model the MFCC "bag of frames" with a GMM.
16 | 
17 | Requirements
18 | ------------
19 | * Python 2.7 or later, or Python 3
20 | * Python modules:
21 |     * numpy
22 |     * [librosa](http://librosa.org/)
    * [scikit-learn](https://scikit-learn.org/)
24 | 
25 | It has been tested on python 2.7 and 3.8 (on Ubuntu).
26 | 
27 | 
28 | Usage example 1: commandline
29 | -------------
30 | If you invoke the script from the commandline (e.g. "python smacpy.py") it will assume there is a folder called "wavs"
31 | and inside that folder are multiple WAV files, each of which has an underscore in the filename,
32 | and the class label is the text BEFORE the underscore.
33 | It will train a model using the wavs, and then test it on the same wavs (dividing the collection up so it can do a "crossvalidated" test).
34 | 
35 | To train and test on different folders, you can run it like this:
36 | 
37 | 	python smacpy.py -t trainwavs -T testwavs
38 | 
39 | 
40 | Usage example 2: from your own code
41 | -------------
42 | In this hypothetical example we train on four audio files, labelled as either 'usa' or 'uk', and then test on a separate audio file of someone called hubert:
43 | 
44 | 	from smacpy import Smacpy
45 | 	model = Smacpy("wavs/training", {'karen01.wav':'usa', 'john01.wav':'uk', 'steve02.wav':'usa', 'joe03.wav':'uk'})
46 | 	model.classify('wavs/testing/hubert01.wav')
47 | 
48 | 


--------------------------------------------------------------------------------
/smacpy.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | #
  3 | # smacpy - simple-minded audio classifier in python
  4 | # 
  5 | # Copyright (c) 2012 Dan Stowell and Queen Mary University of London
  6 | # 
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
  8 | # 
  9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 10 | # 
 11 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 12 | 
 13 | import os.path
 14 | import numpy as np
 15 | import argparse
 16 | from glob import glob
 17 | import librosa
 18 | from sklearn.mixture import GaussianMixture as GMM
 19 | 
 20 | from MFCC import melScaling
 21 | 
 22 | #######################################################################
 23 | # some settings
 24 | framelen = 1024
 25 | fs = 44100.0
 26 | verbose = True
 27 | 
 28 | #######################################################################
 29 | # main class
 30 | 
 31 | class Smacpy:
 32 | 	"""Smacpy - simple-minded audio classifier in python. See the README file for more details.
 33 | 
 34 | 	USAGE EXAMPLE:
 35 | 	In this hypothetical example we train on four audio files, labelled as either 'usa' or 'uk', and then test on a separate audio file of someone called hubert:
 36 | 
 37 | 	from smacpy import Smacpy
 38 | 	model = Smacpy("wavs/training", {'karen01.wav':'usa', 'john01.wav':'uk', 'steve02.wav':'usa', 'joe03.wav':'uk'})
 39 | 	model.classify('wavs/testing/hubert01.wav')
 40 | 
 41 | 	Note for developers: this code should aim to be understandable, and not too long. Don't add too much functionality, or efficiency ;)
 42 | 	"""
 43 | 
 44 | 	def __init__(self, wavfolder, trainingdata):
 45 | 		"""Initialise the classifier and train it on some WAV files.
 46 | 		'wavfolder' is the base folder, to be prepended to all WAV paths.
 47 | 		'trainingdata' is a dictionary of wavpath:label pairs."""
 48 | 
 49 | 		self.mfccMaker = melScaling(int(fs), int(framelen/2), 40)
 50 | 		self.mfccMaker.update()
 51 | 
 52 | 		allfeatures = {wavpath:self.file_to_features(os.path.join(wavfolder, wavpath)) for wavpath in trainingdata}
 53 | 
 54 | 		# Determine the normalisation stats, and remember them
 55 | 		allconcat = np.vstack(list(allfeatures.values()))
 56 | 		self.means = np.mean(allconcat, 0)
 57 | 		self.invstds = np.std(allconcat, 0)
 58 | 		for i,val in enumerate(self.invstds):
 59 | 			if val == 0.0:
 60 | 				self.invstds[i] = 1.0
 61 | 			else:
 62 | 				self.invstds[i] = 1.0 / val
 63 | 
 64 | 		# For each label, compile a normalised concatenated list of features
 65 | 		aggfeatures = {}
 66 | 		for wavpath, features in allfeatures.items():
 67 | 			label = trainingdata[wavpath]
 68 | 			normed = self.__normalise(features)
 69 | 			if label not in aggfeatures:
 70 | 				aggfeatures[label] = normed
 71 | 			else:
 72 | 				aggfeatures[label] = np.vstack((aggfeatures[label], normed))
 73 | 
 74 | 		# For each label's aggregated features, train a GMM and remember it
 75 | 		self.gmms = {}
 76 | 		for label, aggf in aggfeatures.items():
 77 | 			if verbose: print("    Training a GMM for label %s, using data of shape %s" % (label, str(np.shape(aggf))))
 78 | 			self.gmms[label] = GMM(n_components=10) # , cvtype='full')
 79 | 			self.gmms[label].fit(aggf)
 80 | 		if verbose: print("  Trained %i classes from %i input files" % (len(self.gmms), len(trainingdata)))
 81 | 
 82 | 	def __normalise(self, data):
 83 | 		"Normalises data using the mean and stdev of the training data - so that everything is on a common scale."
 84 | 		return (data - self.means) * self.invstds
 85 | 
 86 | 	def classify(self, wavpath):
 87 | 		"Specify the path to an audio file, and this returns the max-likelihood class, as a string label."
 88 | 		features = self.__normalise(self.file_to_features(wavpath))
 89 | 		# For each label GMM, find the overall log-likelihood and choose the strongest
 90 | 		bestlabel = ''
 91 | 		bestll = -9e99
 92 | 		for label, gmm in self.gmms.items():
 93 | 			ll = gmm.score_samples(features)[0]
 94 | 			ll = np.sum(ll)
 95 | 			if ll > bestll:
 96 | 				bestll = ll
 97 | 				bestlabel = label
 98 | 		return bestlabel
 99 | 
100 | 	def file_to_features(self, wavpath):
101 | 		"Reads through a mono WAV file, converting each frame to the required features. Returns a 2D array."
102 | 		if verbose: print("Reading %s" % wavpath)
103 | 		if not os.path.isfile(wavpath): raise ValueError("path %s not found" % wavpath)
104 | 		
105 | 		audiodata, _ = librosa.load(wavpath, sr=fs, mono=True)
106 | 		window = np.hamming(framelen)
107 | 		features = []
108 | 		chunkpos = 0
109 | 		while(True):
110 | 			try:
111 | 				chunk = audiodata[chunkpos:chunkpos+framelen]
112 | 				if len(chunk) != framelen:
113 | 					#print("Not read sufficient samples - assuming end of file")
114 | 					break
115 | 				framespectrum = np.fft.fft(window * chunk)
116 | 				magspec = abs(framespectrum[:int(framelen/2)])
117 | 
118 | 				# do the frequency warping and MFCC computation
119 | 				melSpectrum = self.mfccMaker.warpSpectrum(magspec)
120 | 				melCepstrum = self.mfccMaker.getMFCCs(melSpectrum,cn=True)
121 | 				melCepstrum = melCepstrum[1:]   # exclude zeroth coefficient
122 | 				melCepstrum = melCepstrum[:13] # limit to lower MFCCs
123 | 
124 | 				framefeatures = melCepstrum   # todo: include deltas? that can be your homework.
125 | 
126 | 				features.append(framefeatures)
127 | 				
128 | 				chunkpos += framelen
129 | 			except RuntimeError:
130 | 				break
131 | 		if verbose: print("  Data shape: %s" % str(np.array(features).shape))
132 | 		return np.array(features)
133 | 
134 | #######################################################################
def trainAndTest(trainpath, trainwavs, testpath, testwavs):
	"""Handy function for evaluating your code: trains a model, then tests it on wavs of known class.

	'trainwavs' and 'testwavs' are dictionaries of wavpath:label pairs, with the
	paths taken relative to 'trainpath' and 'testpath' respectively.
	Returns (numcorrect, numtotal, numclasses)."""
	print("TRAINING")
	classifier = Smacpy(trainpath, trainwavs)
	print("TESTING")
	hits = []
	for filename in testwavs:
		guess = classifier.classify(os.path.join(testpath, filename))
		if verbose:
			print(" inferred: %s" % guess)
		hits.append(guess == testwavs[filename])
	return (sum(hits), len(testwavs), len(classifier.gmms))
147 | 
148 | #######################################################################
149 | # If this file is invoked as a script, it carries out a simple runthrough
150 | # of training on some wavs, then testing, with classnames being the start of the filenames
if __name__ == '__main__':

	# Handle the command-line arguments for where the train/test data comes from:
	parser = argparse.ArgumentParser()
	parser.add_argument('-t', '--trainpath', default='wavs', help="Path to the WAV files used for training")
	parser.add_argument('-T', '--testpath',                  help="Path to the WAV files used for testing")
	parser.add_argument('-q', dest='quiet', action='store_true', help="Be less verbose, don't output much text during processing")
	group = parser.add_mutually_exclusive_group()
	group.add_argument('-c', '--charsplit',  default='_',    help="Character used to split filenames: anything BEFORE this character is the class")
	group.add_argument('-n', '--numchars' ,  default=0  ,    help="Instead of splitting using 'charsplit', use this fixed number of characters from the start of the filename", type=int)
	args = vars(parser.parse_args())
	verbose = not args['quiet']

	# No separate test folder given: test on the training folder (triggers crossvalidation below)
	if args['testpath'] is None:
		args['testpath'] = args['trainpath']

	# Build up lists of the training and testing WAV files:
	wavsfound = {'trainpath':{}, 'testpath':{}}
	for onepath in ['trainpath', 'testpath']:
		pattern = os.path.join(args[onepath], '*.wav')
		# glob() returns files in arbitrary order; sorting keeps the fold assignment below reproducible
		for wavpath in sorted(glob(pattern)):
			if args['numchars'] != 0:
				label = os.path.basename(wavpath)[:args['numchars']]
			else:
				label = os.path.basename(wavpath).split(args['charsplit'])[0]
			shortwavpath = os.path.relpath(wavpath, args[onepath])
			wavsfound[onepath][shortwavpath] = label
		if not wavsfound[onepath]:
			raise RuntimeError("Found no files using this pattern: %s" % pattern)
		if verbose:
			print("Class-labels and filenames to be used from %s:" % onepath)
			for wavpath,label in sorted(wavsfound[onepath].items()):
				print(" %s: \t %s" % (label, wavpath))

	if args['testpath'] != args['trainpath']:
		# Separate train-and-test collections
		ncorrect, ntotal, nclasses = trainAndTest(args['trainpath'], wavsfound['trainpath'], args['testpath'], wavsfound['testpath'])
		print("Got %i correct out of %i (trained on %i classes)" % (ncorrect, ntotal, nclasses))
	else:
		# This runs "stratified leave-one-out crossvalidation": test multiple times by leaving one-of-each-class out and training on the rest.
		# First we need to build a list of files grouped by each classlabel
		labelsinuse = sorted(set(wavsfound['trainpath'].values()))
		grouped = {label:[] for label in labelsinuse}
		for wavpath,label in wavsfound['trainpath'].items():
			grouped[label].append(wavpath)
		# A class with a single file cannot be both trained on and tested, so leave it out
		for label in labelsinuse:
			if len(grouped[label])==1:
				print("   Warning: class '%s' has only 1 item, thus we will not use it in our crossvalidation test" % label)
		grouped = {label:wavpaths for label,wavpaths in grouped.items() if len(wavpaths) > 1}
		if not grouped:
			# without this check, min() below fails with an unhelpful "empty sequence" error
			raise RuntimeError("Crossvalidation needs at least one class with 2 or more files in %s" % args['trainpath'])
		numfolds = min(len(collection) for collection in grouped.values())
		# Each "fold" will be a collection of one item of each label.
		# Note: files beyond the first 'numfolds' of a class are used neither for training nor for testing.
		folds = [{wavpaths[index]:label for label,wavpaths in grouped.items()} for index in range(numfolds)]
		totcorrect, tottotal = (0,0)
		# Then we go through, each time training on all-but-one and testing on the one left out
		for index in range(numfolds):
			print("Fold %i of %i" % (index+1, numfolds))
			chosenfold = folds[index]
			alltherest = {}
			for whichfold, otherfold in enumerate(folds):
				if whichfold != index:
					alltherest.update(otherfold)
			ncorrect, ntotal, nclasses = trainAndTest(args['trainpath'], alltherest, args['trainpath'], chosenfold)
			totcorrect += ncorrect
			tottotal   += ntotal
		print("Got %i correct out of %i (using stratified leave-one-out crossvalidation, %i folds)" % (totcorrect, tottotal, numfolds))
219 | 
220 | 


--------------------------------------------------------------------------------