├── .gitignore
├── README.md
├── cnn.py
├── data
│   └── README.md
├── feat_extract.py
├── nn.py
├── predict
│   └── README.md
└── svm.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
*.h5
*.npy
*.ogg
*.png
env/*
.vscode/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Environmental Sound Classification using Deep Learning

> A project from a Digital Signal Processing course

## Dependencies

- Python 3.6
- numpy
- librosa
- pysoundfile
- sounddevice
- matplotlib
- scikit-learn
- tensorflow
- keras

## Dataset

The dataset can be downloaded from [Dataverse](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/YDEPUT) or [GitHub](https://github.com/karoldvl/ESC-50).

I'd recommend using ESC-10 for convenience.

Example:

```
├── 001 - Cat
│   ├── cat_1.ogg
│   ├── cat_2.ogg
│   ├── cat_3.ogg
│   ...
...
└── 002 - Dog
    ├── dog_barking_0.ogg
    ├── dog_barking_1.ogg
    ├── dog_barking_2.ogg
    ...
```

## Feature Extraction

Put audio files (`.wav` untested) under the `data` directory and run the following command:

`python feat_extract.py`

Features and labels will be generated and saved as `feat.npy` and `label.npy` in the current directory (plus `predict_feat.npy` and `predict_filenames.npy` for any files under `predict/`).

## Classify with SVM

Make sure `scikit-learn` is installed and that `feat.npy` and `label.npy` are in the same directory. Run `svm.py` to train the classifier and print its test accuracy.

## Classify with Multilayer Perceptron

Install `tensorflow` and `keras` first. Run `nn.py` to train and test the network.

## Classify with Convolutional Neural Network

- Run `cnn.py -t` to train and test a CNN. Optionally set the number of training epochs with `-e` (default: 500).
- Predict files by either:
  - Putting target files under the `predict/` directory and running `cnn.py -p`
  - Recording on the fly with `cnn.py -P`
--------------------------------------------------------------------------------
/cnn.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding= UTF-8
#
# Author: Fing
# Date : 2017-12-03
#

import feat_extract
from feat_extract import *
import time
import argparse
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.optimizers import SGD
import os
import os.path as op
from sklearn.model_selection import train_test_split

def train(args):
    if not op.exists('feat.npy') or not op.exists('label.npy'):
        if input('No feature/labels found. Run feat_extract.py first? (Y/n)').lower() in ['y', 'yes', '']:
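            # feat_extract.main() walks the class folders under data/, stacks the
            # 193-dimensional feature vectors and writes feat.npy / label.npy
            # (and refreshes predict_feat.npy / predict_filenames.npy for any
            # files placed under predict/), so training can be retried below.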
            feat_extract.main()
            train(args)
    else:
        X = np.load('feat.npy')
        y = np.load('label.npy').ravel()

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=233)

        # Count the number of sub-directories in 'data'
        class_count = len(next(os.walk('data/'))[1])

        # Build the Neural Network
        model = Sequential()
        model.add(Conv1D(64, 3, activation='relu', input_shape=(193, 1)))
        model.add(Conv1D(64, 3, activation='relu'))
        model.add(MaxPooling1D(3))
        model.add(Conv1D(128, 3, activation='relu'))
        model.add(Conv1D(128, 3, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dropout(0.5))
        model.add(Dense(class_count, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

        # Convert labels to one-hot vectors
        y_train = keras.utils.to_categorical(y_train, num_classes=class_count)
        y_test = keras.utils.to_categorical(y_test, num_classes=class_count)

        X_train = np.expand_dims(X_train, axis=2)
        X_test = np.expand_dims(X_test, axis=2)

        start = time.time()
        model.fit(X_train, y_train, batch_size=args.batch_size, epochs=args.epochs)
        score, acc = model.evaluate(X_test, y_test, batch_size=16)

        print('Test score:', score)
        print('Test accuracy:', acc)
        print('Training took: %d seconds' % int(time.time() - start))
        model.save(args.model)

def predict(args):
    if op.exists(args.model):
        model = keras.models.load_model(args.model)
        predict_feat_path = 'predict_feat.npy'
        predict_filenames = 'predict_filenames.npy'
        filenames = np.load(predict_filenames)
        X_predict = np.load(predict_feat_path)
        X_predict = np.expand_dims(X_predict, axis=2)
        pred = model.predict_classes(X_predict)
        for pair in list(zip(filenames, pred)): print(pair)
    elif input('Model not found. Train network first? (Y/n)').lower() in ['y', 'yes', '']:
        train(args)
        predict(args)

def real_time_predict(args):
    import sounddevice as sd
    import soundfile as sf
    import queue
    import librosa
    import sys
    if op.exists(args.model):
        model = keras.models.load_model(args.model)
        while True:
            try:
                features = np.empty((0,193))
                start = time.time()
                mfccs, chroma, mel, contrast,tonnetz = extract_feature()
                ext_features = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
                features = np.vstack([features,ext_features])
                features = np.expand_dims(features, axis=2)
                pred = model.predict_classes(features)
                for p in pred:
                    print(p)
                if args.verbose: print('Time elapsed in real time feature extraction: ', time.time() - start)
                sys.stdout.flush()
            except KeyboardInterrupt: parser.exit(0)
            except Exception as e: parser.exit(type(e).__name__ + ': ' + str(e))
    elif input('Model not found. Train network first? (y/N)') in ['y', 'yes']:
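        # No saved model at args.model: fall back to training one with the
        # current command-line settings, then retry real-time prediction.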
        train(args)
        real_time_predict(args)


def main(args):
    if args.train: train(args)
    elif args.predict: predict(args)
    elif args.real_time_predict: real_time_predict(args)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-t', '--train', action='store_true', help='train neural network with extracted features')
    parser.add_argument('-m', '--model', metavar='path', default='trained_model.h5', help='use this model path on train and predict operations')
    parser.add_argument('-e', '--epochs', metavar='N', default=500, help='epochs to train', type=int)
    parser.add_argument('-p', '--predict', action='store_true', help='predict files in ./predict folder')
    parser.add_argument('-P', '--real-time-predict', action='store_true', help='predict sound in real time')
    parser.add_argument('-v', '--verbose', action='store_true', help='verbose print')
    parser.add_argument('-s', '--log-speed', action='store_true', help='performance profiling')
    parser.add_argument('-b', '--batch-size', metavar='size', default=64, help='batch size', type=int)
    args = parser.parse_args()
    main(args)
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
# Download ESC dataset

The ESC-10/50 dataset can be downloaded from [Dataverse](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/YDEPUT) or [GitHub](https://github.com/karoldvl/ESC-50).
--------------------------------------------------------------------------------
/feat_extract.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding= UTF-8
#
# Author: Fing
# Date : 2017-12-03
#

import code
import glob
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import soundfile as sf
import sounddevice as sd
import queue

def extract_feature(file_name=None):
    if file_name:
        print('Extracting', file_name)
        X, sample_rate = sf.read(file_name, dtype='float32')
    else:
        device_info = sd.query_devices(None, 'input')
        sample_rate = int(device_info['default_samplerate'])
        q = queue.Queue()
        def callback(i,f,t,s): q.put(i.copy())
        data = []
        with sd.InputStream(samplerate=sample_rate, callback=callback):
            while True:
                if len(data) < 100000: data.extend(q.get())
                else: break
        X = np.array(data)

    if X.ndim > 1: X = X[:,0]
    X = X.T

    # short term fourier transform
    stft = np.abs(librosa.stft(X))

    # mfcc (mel-frequency cepstrum)
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)

    # chroma
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)

    # melspectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T,axis=0)

    # spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)

    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
    return mfccs,chroma,mel,contrast,tonnetz

def parse_audio_files(parent_dir,file_ext='*.ogg'):
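    """Extract a 193-dimensional feature vector for every audio file found in
    the class sub-directories of `parent_dir`, and return (features, labels),
    where each label is the index of the file's sub-directory in sorted order."""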
    sub_dirs = os.listdir(parent_dir)
    sub_dirs.sort()
    features, labels = np.empty((0,193)), np.empty(0)
    for label, sub_dir in enumerate(sub_dirs):
        if os.path.isdir(os.path.join(parent_dir, sub_dir)):
            for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
                try: mfccs, chroma, mel, contrast,tonnetz = extract_feature(fn)
                except Exception as e:
                    print("[Error] extract feature error in %s. %s" % (fn,e))
                    continue
                ext_features = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
                features = np.vstack([features,ext_features])
                # labels = np.append(labels, fn.split('/')[1])
                labels = np.append(labels, label)
            print("extract %s features done" % (sub_dir))
    return np.array(features), np.array(labels, dtype=int)

def parse_predict_files(parent_dir,file_ext='*.ogg'):
    features = np.empty((0,193))
    filenames = []
    for fn in glob.glob(os.path.join(parent_dir, file_ext)):
        mfccs, chroma, mel, contrast,tonnetz = extract_feature(fn)
        ext_features = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
        features = np.vstack([features,ext_features])
        filenames.append(fn)
        print("extract %s features done" % fn)
    return np.array(features), np.array(filenames)

def main():
    # Get features and labels
    features, labels = parse_audio_files('data')
    np.save('feat.npy', features)
    np.save('label.npy', labels)

    # Predict new
    features, filenames = parse_predict_files('predict')
    np.save('predict_feat.npy', features)
    np.save('predict_filenames.npy', filenames)

if __name__ == '__main__': main()
--------------------------------------------------------------------------------
/nn.py:
--------------------------------------------------------------------------------
# coding= UTF-8
#
# Author: Fing
# Date : 2017-12-03
#

import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from sklearn.model_selection import train_test_split

# Prepare the data
X = np.load('feat.npy')
y = np.load('label.npy').ravel()

# Labels produced by feat_extract.py are 0-based class indices
num_classes = int(np.max(y)) + 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# Build the Neural Network
model = Sequential()
model.add(Dense(512, activation='relu', input_dim=193))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Convert labels to one-hot vectors
y_train = keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes=num_classes)

# Train and test
model.fit(X_train, y_train, epochs=1000, batch_size=64)
score, acc = model.evaluate(X_test, y_test, batch_size=32)
print('Test score:', score)
print('Test accuracy:', acc)
--------------------------------------------------------------------------------
/predict/README.md:
--------------------------------------------------------------------------------
# Prediction folder

Target files to be predicted go here.
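
For example (hypothetical filenames; `feat_extract.parse_predict_files` only picks up `*.ogg` files by default):

```
predict/
├── unknown_sound_1.ogg
├── unknown_sound_2.ogg
└── unknown_sound_3.ogg
```

After adding files, run `python feat_extract.py` again so their features end up in `predict_feat.npy` / `predict_filenames.npy`, then run `python cnn.py -p` from the repository root; each filename is printed next to its predicted class index.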
--------------------------------------------------------------------------------
/svm.py:
--------------------------------------------------------------------------------
# coding= UTF-8
#
# Author: Fing
# Date : 2017-12-03
#

import numpy as np
import sklearn
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Load data from numpy file
X = np.load('feat.npy')
y = np.load('label.npy').ravel()

# Split data into training and test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# Simple SVM
print('fitting...')
clf = SVC(C=20.0, gamma=0.00001)
clf.fit(X_train, y_train)
acc = clf.score(X_test, y_test)
print("acc=%0.3f" % acc)

# Grid search for best parameters
# Set the parameters by cross-validation
# tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4, 1e-5],
#                      'C': [1, 10 ,20,30,40,50]}]
# # ,
# #  {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# scores = ['precision', 'recall']

# for score in scores:
#     print("# Tuning hyper-parameters for %s" % score)
#     print('')

#     clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
#                        scoring='%s_macro' % score)
#     clf.fit(X_train, y_train)

#     print("Best parameters set found on development set:")
#     print('')
#     print(clf.best_params_)
#     print('')
#     print("Grid scores on development set:")
#     print('')
#     means = clf.cv_results_['mean_test_score']
#     stds = clf.cv_results_['std_test_score']
#     for mean, std, params in zip(means, stds, clf.cv_results_['params']):
#         print("%0.3f (+/-%0.03f) for %r"
#               % (mean, std * 2, params))
#     print('')

#     print("Detailed classification report:")
#     print('')
#     print("The model is trained on the full development set.")
#     print("The scores are computed on the full evaluation set.")
#     print('')
#     y_true, y_pred = y_test, clf.predict(X_test)
#     print(classification_report(y_true, y_pred))
#     print('')
--------------------------------------------------------------------------------