├── utils
│   ├── __init__.py
│   └── batch_generator.py
├── requirements.txt
├── model
│   ├── fetch_model.sh
│   └── download_model.txt
├── dataset
│   ├── ruhe_silence.wav
│   ├── fenster_window.wav
│   ├── schreien_scream.wav
│   └── datasets.txt
├── LICENSE
├── AED_spec.py
├── AED_test.py
├── AED_eval.py
├── README.md
└── AED_train.py

/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy
scipy
scikit-learn
python_speech_features
pydub
matplotlib
--------------------------------------------------------------------------------
/model/fetch_model.sh:
--------------------------------------------------------------------------------
wget https://box.tu-chemnitz.de/index.php/s/8vkQqXbUjVWlt5m/download -O AED_Example_Run_model.pkl
--------------------------------------------------------------------------------
/dataset/ruhe_silence.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/AcousticEventDetection/master/dataset/ruhe_silence.wav
--------------------------------------------------------------------------------
/dataset/fenster_window.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/AcousticEventDetection/master/dataset/fenster_window.wav
--------------------------------------------------------------------------------
/dataset/schreien_scream.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/AcousticEventDetection/master/dataset/schreien_scream.wav
--------------------------------------------------------------------------------
/model/download_model.txt:
--------------------------------------------------------------------------------
You can download a pre-trained model here: https://box.tu-chemnitz.de/index.php/s/8vkQqXbUjVWlt5m
--------------------------------------------------------------------------------
/dataset/datasets.txt:
--------------------------------------------------------------------------------
You can use publicly available datasets such as UrbanSound8K for training. Additionally, you can visit freesound.org to download open-source sound files. We provide three test samples from the dataset used in the paper. The pre-trained model we provide is trained on the 20 classes described in the paper.
--------------------------------------------------------------------------------
/utils/batch_generator.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

#Loading images with CPU background threads during GPU forward passes saves a lot of time
#Credit: J. Schlüter (https://github.com/Lasagne/Lasagne/issues/12)
def threadedBatchGenerator(generator, num_cached=10):

    import Queue
    queue = Queue.Queue(maxsize=num_cached)
    sentinel = object()  #guaranteed unique reference

    #define producer (putting items into queue)
    def producer():
        for item in generator:
            queue.put(item)
        queue.put(sentinel)

    #start producer (in a background thread)
    import threading
    thread = threading.Thread(target=producer)
    thread.daemon = True
    thread.start()

    #run as consumer (read items from queue, in current thread)
    item = queue.get()
    while item is not sentinel:
        yield item
        queue.task_done()
        item = queue.get()
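
#Minimal usage sketch (added example, not part of the original module):
#any generator can be wrapped; the dummy generator below is a hypothetical
#stand-in for getNextImageBatch() from AED_train.py.
if __name__ == "__main__":

    def dummyBatches(n=5):
        for i in xrange(n):
            yield i

    for batch in threadedBatchGenerator(dummyBatches(), num_cached=10):
        print "GOT BATCH:", batch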
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Stefan Kahl

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/AED_spec.py:
--------------------------------------------------------------------------------
import os
import traceback
import operator

import numpy as np
import cv2

import python_speech_features as psf
import scipy.io.wavfile as wave
from scipy import interpolate

######################################################
src_dir = 'dataset/train/wav/'
spec_dir = 'dataset/train/spec/'

SPEC_LENGTH = 3 #seconds
SPEC_OVERLAP = 2 #seconds

######################################################
def getSpecSettings(seconds):

    #recommended settings for spectrogram extraction
    settings = {2:[0.015, 0.0068],
                3:[0.02, 0.00585],
                5:[0.05, 0.0097],
                10:[0.05, 0.0195],
                30:[0.05, 0.0585]}

    winlen = settings[seconds][0]
    winstep = settings[seconds][1]

    nfft = 511

    return winlen, winstep, nfft

def changeSampleRate(sig, rate):

    duration = sig.shape[0] / rate

    time_old = np.linspace(0, duration, sig.shape[0])
    time_new = np.linspace(0, duration, int(sig.shape[0] * 44100 / rate))

    interpolator = interpolate.interp1d(time_old, sig.T)
    new_audio = interpolator(time_new).T

    sig = np.round(new_audio).astype(sig.dtype)

    return sig, 44100

def getSpecFromSignal(sig, rate, seconds=SPEC_LENGTH):

    #get settings
    winlen, winstep, nfft = getSpecSettings(seconds)

    #get frames
    winfunc = lambda x: np.ones((x,))
    frames = psf.sigproc.framesig(sig, winlen * rate, winstep * rate, winfunc)

    #Magnitude Spectrogram
    magspec = np.rot90(psf.sigproc.magspec(frames, nfft))

    #normalize to values from 0 to 1
    magspec -= magspec.min(axis=None)
    magspec /= magspec.max(axis=None)

    #adjust shape if signal is too short
    magspec = magspec[:256, :512]
    temp = np.zeros((256, 512), dtype="float32")
    temp[:magspec.shape[0], :magspec.shape[1]] = magspec
    magspec = temp.copy()
    magspec = cv2.resize(magspec, (512, 256))

    #DEBUG: show
    #cv2.imshow('SPEC', magspec)
    #cv2.waitKey(-1)

    return magspec

def splitSignal(sig, rate, seconds=SPEC_LENGTH, overlap=SPEC_OVERLAP):

    #split signal with overlap
    sig_splits = []
    for i in xrange(0, len(sig), int((seconds - overlap) * rate)):
        split = sig[i:i + seconds * rate]
        if len(split) >= 1 * rate:
            sig_splits.append(split)

    #is signal too short for segmentation?
    if len(sig_splits) == 0:
        sig_splits.append(sig)

    return sig_splits

def getMultiSpec(path, seconds=SPEC_LENGTH, overlap=SPEC_OVERLAP):

    #open wav file
    (rate, sig) = wave.read(path)
    print "SAMPLE RATE:", rate,

    #adjust to different sample rates
    if rate != 44100:
        sig, rate = changeSampleRate(sig, rate)

    #split signal into chunks
    sig_splits = splitSignal(sig, rate, seconds, overlap)

    #calculate spectrogram for every split
    for sig in sig_splits:

        magspec = getSpecFromSignal(sig, rate, seconds)

        yield magspec

######################################################
if __name__ == "__main__":

    events = [src_dir + event + '/' for event in sorted(os.listdir(src_dir))]
    print "NUMBER OF EVENTS:", len(events)

    #parse wave files for every event class
    for event in events:
        total_specs = 0

        #get wav files for event class
        wav_files = [event + wav for wav in sorted(os.listdir(event))]

        #parse wav files
        for wav in wav_files:

            #stats
            spec_cnt = 0
            print wav,

            try:
                #extract specs from wav file
                for spec in getMultiSpec(wav):

                    #output dir for specs
                    dst_dir = spec_dir + event.split("/")[-2] + "/"
                    if not os.path.exists(dst_dir):
                        os.makedirs(dst_dir)

                    #save spec
                    cv2.imwrite(dst_dir + wav.split("/")[-1].rsplit(".")[0] + "_" + str(spec_cnt) + ".png", spec * 255.0)
                    spec_cnt += 1
                    total_specs += 1

                print "SPECS:", spec_cnt

            except:
                print spec_cnt, "ERROR"
                traceback.print_exc()
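
#Minimal usage sketch (added remark, commented out so the batch run above
#stays unchanged): extracting spectrograms from a single file works the same
#way, e.g. for one of the provided samples:
#
#   for spec in getMultiSpec('dataset/schreien_scream.wav'):
#       print spec.shape   #-> (256, 512)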
--------------------------------------------------------------------------------
/AED_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

print "HANDLING IMPORTS..."

import time
import operator
import argparse

import traceback
import numpy as np
import pickle

import theano

from lasagne import random as lasagne_random
from lasagne import layers as l

import AED_spec as spectrogram

print "...DONE!"

######################## CONFIG #########################
#Fixed random seed
RANDOM_SEED = 1337
RANDOM = np.random.RandomState(RANDOM_SEED)
lasagne_random.set_rng(RANDOM)

#Pre-trained model params
MODEL_PATH = 'model/'
TRAINED_MODEL = 'AED_Example_Run_model.pkl'

################### ARGUMENT PARSER #####################
def parse_args():

    parser = argparse.ArgumentParser(description='Acoustic Event Classification')
    parser.add_argument('--filenames', dest='filenames', help='paths to sample wav files for testing as list or single string', type=str, default='')
    parser.add_argument('--modelname', dest='modelname', help='name of pre-trained model', type=str, default=None)
    parser.add_argument('--speclength', dest='spec_length', help='spectrogram length in seconds', type=int, default=3)
    parser.add_argument('--overlap', dest='spec_overlap', help='spectrogram overlap in seconds', type=int, default=2)
    parser.add_argument('--results', dest='num_results', help='number of results', type=int, default=5)
    parser.add_argument('--confidence', dest='min_confidence', help='confidence threshold', type=float, default=0.01)

    args = parser.parse_args()

    #single test file or list of files?
    if isinstance(args.filenames, basestring):
        args.filenames = [args.filenames]

    return args
#################### MODEL LOAD ########################
def loadModel(filename):
    print "IMPORTING MODEL...",
    net_filename = MODEL_PATH + filename

    with open(net_filename, 'rb') as f:
        data = pickle.load(f)

    #for evaluation, we want to load the complete model architecture and trained classes
    net = data['net']
    classes = data['classes']
    im_size = data['im_size']
    im_dim = data['im_dim']

    print "DONE!"

    return net, classes, im_size, im_dim

################# PREDICTION FUNCTION ####################
def getPredictionFunction(net):
    net_output = l.get_output(net, deterministic=True)

    print "COMPILING THEANO TEST FUNCTION...",
    start = time.time()
    test_net = theano.function([l.get_all_layers(net)[0].input_var], net_output, allow_input_downcast=True)
    print "DONE! (", int(time.time() - start), "s )"

    return test_net

################# PREDICTION POOLING ####################
def predictionPooling(p):

    #You can test different prediction pooling strategies here
    #We only use average pooling
    if p.ndim == 2:
        p_pool = np.mean(p, axis=0)
    else:
        p_pool = p

    return p_pool
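
#A possible alternative to test here (added sketch, not used in the paper):
#mix average and max pooling over the per-spec predictions, e.g.
#
#   p_pool = 0.5 * np.mean(p, axis=0) + 0.5 * np.max(p, axis=0)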
(", int(time.time() - start), "s )" 77 | 78 | return test_net 79 | 80 | ################# PREDICTION POOLING #################### 81 | def predictionPooling(p): 82 | 83 | #You can test different prediction pooling strategies here 84 | #We only use average pooling 85 | if p.ndim == 2: 86 | p_pool = np.mean(p, axis=0) 87 | else: 88 | p_pool = p 89 | 90 | return p_pool 91 | 92 | ####################### PREDICT ######################### 93 | def predict(img): 94 | 95 | #transpose image if dim=3 96 | try: 97 | img = np.transpose(img, (2, 0, 1)) 98 | except: 99 | pass 100 | 101 | #reshape image 102 | img = img.reshape(-1, IM_DIM, IM_SIZE[1], IM_SIZE[0]) 103 | 104 | #calling the test function returns the net output 105 | prediction = TEST_NET(img)[0] 106 | 107 | return prediction 108 | 109 | ####################### TESTING ######################### 110 | def testFile(path, spec_length, spec_overlap, num_results, confidence_threshold=0.01): 111 | 112 | #time 113 | start = time.time() 114 | 115 | #extract spectrograms from wav-file and process them 116 | predictions = [] 117 | spec_cnt = 0 118 | for spec in spectrogram.getMultiSpec(path, seconds=spec_length, overlap=spec_overlap): 119 | 120 | #make prediction 121 | p = predict(spec) 122 | spec_cnt += 1 123 | 124 | #stack predictions 125 | if len(predictions): 126 | predictions = np.vstack([predictions, p]) 127 | else: 128 | predictions = p 129 | 130 | #prediction pooling 131 | p_pool = predictionPooling(predictions) 132 | 133 | #get class labels for predictions 134 | p_labels = {} 135 | for i in range(p_pool.shape[0]): 136 | if p_pool[i] >= confidence_threshold: 137 | p_labels[CLASSES[i]] = p_pool[i] 138 | 139 | #sort by confidence and limit results (None returns all results) 140 | p_sorted = sorted(p_labels.items(), key=operator.itemgetter(1), reverse=True)[:num_results] 141 | 142 | #take time again 143 | dur = time.time() - start 144 | 145 | return p_sorted, spec_cnt, dur 146 | 147 | #################### EXAMPLE USAGE ###################### 148 | if __name__ == "__main__": 149 | 150 | #adjust config 151 | args = parse_args() 152 | 153 | #load model 154 | if args.modelname: 155 | TRAINED_MODEL = args.modelname 156 | NET, CLASSES, IM_SIZE, IM_DIM = loadModel(TRAINED_MODEL) 157 | 158 | #compile test function 159 | TEST_NET = getPredictionFuntion(NET) 160 | 161 | #do testing 162 | for fname in args.filenames: 163 | print 'TESTING:', fname 164 | pred, cnt, dur = testFile(fname, args.spec_length, args.spec_overlap, args.num_results, args.min_confidence) 165 | print 'TOP PREDICTION(S):' 166 | for p in pred: 167 | print '\t', p[0], int(p[1] * 100), '%' 168 | print 'PREDICTION FOR', cnt, 'SPECS TOOK', int(dur * 1000), 'ms (', int(dur / cnt * 1000) , 'ms/spec', ')', '\n' 169 | 170 | -------------------------------------------------------------------------------- /AED_eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | print "HANDLING IMPORTS..." 4 | 5 | import warnings 6 | warnings.filterwarnings('ignore') 7 | 8 | import os 9 | import time 10 | import operator 11 | 12 | import traceback 13 | import numpy as np 14 | import pickle 15 | 16 | import theano 17 | 18 | from lasagne import random as lasagne_random 19 | from lasagne import layers as l 20 | 21 | import scipy.io.wavfile as wave 22 | 23 | import AED_spec as spectrogram 24 | import utils.batch_generator as bg 25 | 26 | print "...DONE!" 
--------------------------------------------------------------------------------
/AED_eval.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

print "HANDLING IMPORTS..."

import warnings
warnings.filterwarnings('ignore')

import os
import time
import operator

import traceback
import numpy as np
import pickle

import theano

from lasagne import random as lasagne_random
from lasagne import layers as l

import scipy.io.wavfile as wave

import AED_spec as spectrogram
import utils.batch_generator as bg

print "...DONE!"

######################## CONFIG #########################
#Fixed random seed
RANDOM_SEED = 1337
RANDOM = np.random.RandomState(RANDOM_SEED)
lasagne_random.set_rng(RANDOM)

#Dataset params
TEST_DIR = 'dataset/test/'

#Pre-trained model params
MODEL_PATH = 'model/'
TRAINED_MODEL = 'AED_Example_Run_model.pkl'

#Testing params
BATCH_SIZE = 32
SPEC_LENGTH = 3
SPEC_OVERLAP = 2
CONFIDENCE_THRESHOLD = 0.0001
MAX_PREDICTIONS = 10

################### AUDIO PROCESSING ####################
def parseTestSet():

    #get list of test files
    test = []
    test_classes = [os.path.join(TEST_DIR, tc) for tc in sorted(os.listdir(TEST_DIR))]
    for tc in test_classes:
        test += [os.path.join(tc, fpath) for fpath in os.listdir(tc)]

    #get class label for every test sample
    gt = {}
    for path in test:
        label = path.split('/')[-2]
        gt[path] = label

    #stats
    #print classes
    print "NUMBER OF CLASSES:", len(test_classes)
    print "NUMBER OF TEST SAMPLES:", len(test)

    return test, gt

TEST, GT = parseTestSet()

#################### BATCH HANDLING #####################
def getSignalChunk(sig, rate):

    #split signal into chunks
    sig_splits = spectrogram.splitSignal(sig, rate, SPEC_LENGTH, SPEC_OVERLAP)

    #get batch-sized chunks of image paths
    for i in xrange(0, len(sig_splits), BATCH_SIZE):
        yield sig_splits[i:i+BATCH_SIZE]

def getNextSpecBatch(path):

    #open wav file
    (rate, sig) = wave.read(path)

    #change sample rate if needed
    if rate != 44100:
        sig, rate = spectrogram.changeSampleRate(sig, rate)

    #fill batches
    for sig_chunk in getSignalChunk(sig, rate):

        #allocate numpy arrays for image data and targets
        s_b = np.zeros((BATCH_SIZE, IM_DIM, IM_SIZE[1], IM_SIZE[0]), dtype='float32')

        ib = 0
        for s in sig_chunk:

            #load spectrogram data from sig
            spec = spectrogram.getSpecFromSignal(s, rate, SPEC_LENGTH)

            #reshape spec
            spec = spec.reshape(-1, IM_DIM, IM_SIZE[1], IM_SIZE[0])

            #pack into batch array
            s_b[ib] = spec
            ib += 1

        #trim to actual size
        s_b = s_b[:ib]

        #yield batch
        yield s_b
(", int(time.time() - start), "s )" 143 | 144 | return test_net 145 | 146 | ################# PREDICTION POOLING #################### 147 | def predictionPooling(p): 148 | 149 | #You can test different prediction pooling strategies here 150 | #We only use average pooling 151 | if p.ndim == 2: 152 | p_pool = np.mean(p, axis=0) 153 | else: 154 | p_pool = p 155 | 156 | return p_pool 157 | 158 | ####################### TESTING ######################### 159 | #test model 160 | print "TESTING MODEL..." 161 | 162 | #load model 163 | NET, CLASSES, IM_SIZE, IM_DIM = loadModel(filename=TRAINED_MODEL) 164 | 165 | #get test function 166 | test_net = getPredictionFuntion(NET) 167 | 168 | pr = [] 169 | pcnt = 1 170 | ecnt = 0 171 | acc = [] 172 | #test every sample from test collection 173 | for path in TEST: 174 | 175 | #status 176 | print pcnt, path.replace(TEST_DIR, ''), 177 | 178 | try: 179 | 180 | #make predictions for batches of spectrograms 181 | predictions = [] 182 | for spec_batch in bg.threadedBatchGenerator(getNextSpecBatch(path)): 183 | 184 | #predict 185 | p = test_net(spec_batch) 186 | 187 | #stack predictions 188 | if len(predictions): 189 | predictions = np.vstack([predictions, p]) 190 | else: 191 | predictions = p 192 | 193 | #prediction pooling 194 | p_pool = predictionPooling(predictions) 195 | 196 | #get class labels for predictions 197 | p_labels = {} 198 | for i in range(p_pool.shape[0]): 199 | p_labels[CLASSES[i]] = p_pool[i] 200 | 201 | #sort by confidence 202 | p_sorted = sorted(p_labels.items(), key=operator.itemgetter(1), reverse=True)[:MAX_PREDICTIONS] 203 | 204 | #calculate avg precision 205 | for i in range(len(p_sorted)): 206 | if p_sorted[i][0] == GT[path]: 207 | pr.append(1.0 / float(i + 1)) 208 | if i + 1 == 1: 209 | acc.append(1) 210 | else: 211 | acc.append(0) 212 | break 213 | 214 | print 'LABEL:', p_sorted[0], 'AVGP:', pr[-1] 215 | 216 | except KeyboardInterrupt: 217 | break 218 | except: 219 | print "ERROR" 220 | #pr.append(0.0) 221 | traceback.print_exc() 222 | ecnt += 1 223 | continue 224 | 225 | pcnt += 1 226 | 227 | print "TESTING DONE!" 228 | print "ERRORS:", ecnt, "/", pcnt - 1 229 | print "MAP:", np.mean(pr) 230 | print "ACCURACY:", np.mean(acc) 231 | 232 | 233 | 234 | 235 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Acoustic Event Classification Using Convolutional Neural Networks 2 | By [Stefan Kahl](http://medien.informatik.tu-chemnitz.de/skahl/about/), [Hussein Hussein](https://www.tu-chemnitz.de/informatik/HomePages/Medieninformatik/team.php.en), [Etienne Fabian](https://www.intenta.de/en/home.html), [Jan Schloßhauer](https://www.intenta.de/en/home.html), [Danny Kowerko](https://www.tu-chemnitz.de/informatik/mc/staff.php.en), and [Maximilian Eibl](https://www.tu-chemnitz.de/informatik/HomePages/Medieninformatik/team.php.en) 3 | 4 | ## Introduction 5 | This code repo complements our submission to the [INFORMATIK 2017 Workshop WS34](https://informatik2017.de/ws34-dlhd/). This is a refined version of our original code described in the paper. We added comments, removed some of the boilerplate code and added testing functionality. If you have any questions or problems running the scripts, don't hesitate to contact us. 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Acoustic Event Classification Using Convolutional Neural Networks
By [Stefan Kahl](http://medien.informatik.tu-chemnitz.de/skahl/about/), [Hussein Hussein](https://www.tu-chemnitz.de/informatik/HomePages/Medieninformatik/team.php.en), [Etienne Fabian](https://www.intenta.de/en/home.html), [Jan Schloßhauer](https://www.intenta.de/en/home.html), [Danny Kowerko](https://www.tu-chemnitz.de/informatik/mc/staff.php.en), and [Maximilian Eibl](https://www.tu-chemnitz.de/informatik/HomePages/Medieninformatik/team.php.en)

## Introduction
This code repo complements our submission to the [INFORMATIK 2017 Workshop WS34](https://informatik2017.de/ws34-dlhd/). It is a refined version of the original code described in the paper: we added comments, removed some of the boilerplate code, and added testing functionality. If you have any questions or problems running the scripts, don't hesitate to contact us.

Contact: [Stefan Kahl](http://medien.informatik.tu-chemnitz.de/skahl/about/), [Technische Universität Chemnitz](https://www.tu-chemnitz.de/index.html.en), [Media Informatics](https://www.tu-chemnitz.de/informatik/Medieninformatik/index.php.en)

E-Mail: stefan.kahl@informatik.tu-chemnitz.de

This project is licensed under the terms of the MIT license.

Please cite the paper in your publications if it helps your research.

You can download the submission here: [2017_INFORMATIK_AED_CNN.pdf](https://box.tu-chemnitz.de/index.php/s/sfW010bbLEsP4Kw) (unpublished draft version)

## Installation
This is a Theano/Lasagne implementation in Python for the classification of acoustic events based on deep features. The code is tested on Ubuntu 14.04 LTS but should work with other distributions as well.

First, you need to install Python 2.7 and the CUDA toolkit for GPU acceleration. After that, you can clone the project and run the Python package tool pip to install most of the relevant dependencies:

```
git clone https://github.com/kahst/AcousticEventDetection.git
cd AcousticEventDetection
sudo pip install -r requirements.txt
```

We use OpenCV for image processing; you can install the cv2 package for Python by running this command:

```
sudo apt-get install python-opencv
```

Finally, you need to install Theano and Lasagne:

```
sudo pip install -r https://raw.githubusercontent.com/Lasagne/Lasagne/master/requirements.txt
sudo pip install https://github.com/Lasagne/Lasagne/archive/master.zip
```

You should follow the Lasagne installation instructions for more details:
http://lasagne.readthedocs.io/en/latest/user/installation.html

## Training
In order to train a model based on your own dataset or any other publicly available dataset (e.g. [UrbanSound8K](https://serv.cusp.nyu.edu/projects/urbansounddataset/index.html)), you need to follow a few simple steps: First, organize your dataset with subfolders as class labels. Second, extract spectrograms from all audio files using the script AED_spec.py. After that, you are ready to train your model. Finally, you can either evaluate a model using AED_eval.py or make predictions for any sound file using AED_test.py.

### Dataset
The training script uses subfolders as class names, and you should provide the following directory structure:

```
dataset
¦
+---event1
¦   ¦   file011.wav
¦   ¦   file012.wav
¦   ¦   ...
¦
+---event2
¦   ¦   file021.wav
¦   ¦   file022.wav
¦   ¦   ...
¦
+---...
```
### Extracting Spectrograms
We decided to use magnitude spectrograms with a resolution of 512x256 pixels, which represent three-second chunks of audio signal. You can generate spectrograms for your sorted dataset with the script AED_spec.py. You can switch to different settings for the spectrograms by editing the file.

Extracting spectrograms might take a while. Eventually, you should end up with a directory containing subfolders named after acoustic events, which we will use as class names during training.
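The relevant defaults sit at the top of AED_spec.py (shown here for reference; adjust the paths to your sorted dataset before running):

```
src_dir = 'dataset/train/wav/'   #input wav files, subfolders as class labels
spec_dir = 'dataset/train/spec/' #output folder for the spectrogram PNGs
SPEC_LENGTH = 3 #seconds
SPEC_OVERLAP = 2 #seconds
```

Then start the extraction with:

```
python AED_spec.py
```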
### Training a Model
You can train your own model using either publicly available training data or your own sound recordings. All you need are spectrograms of the recordings. Before training, you should review the following settings, which you can find in the AED_train.py file:

- `DATASET_PATH` containing the spectrograms (subfolders as class names).

- `MIN_SAMPLES_PER_CLASS` increasing the number of spectrograms per acoustic event class in order to counter class imbalances (Default = -1).

- `MAX_SAMPLES_PER_CLASS` limiting the number of spectrograms per acoustic event class (Default = None - no limit).

- `VAL_SPLIT` which defines the fraction of spectrograms you want to use for monitoring the training process (Default = 0.1).

- `MULTI_LABEL` to switch between softmax outputs (False) or sigmoid outputs (True); activates batch augmentation (multiple targets per spec).

- `IM_SIZE` defining the size of input images; spectrograms will be scaled accordingly (Default = 512x256 pixels).

- `IM_AUGMENTATION` selecting different techniques for dataset augmentation.

- `BATCH_SIZE` defining the number of images per batch; reduce the batch size to fit the model into less GPU memory (Default = 32).

- `LEARNING_RATE` for scheduling the learning rate; use `LR_DESCENT = True` for linear interpolation and `LR_DESCENT = False` for steps.

- `PRETRAINED_MODEL` if you want to use a pickle file of a previously trained model; set `LOAD_OUTPUT_LAYER = False` if the model output size differs (you can download a pre-trained model [here](https://box.tu-chemnitz.de/index.php/s/8vkQqXbUjVWlt5m)).

- `SNAPSHOT_EPOCHS` in order to continuously save model snapshots; select `[-1]` to save after every epoch; the best model params will be saved automatically after training.

There are a lot more options - most should be self-explanatory. If you have any questions regarding the settings or the training process, feel free to contact us.

Note: In order to keep results reproducible with fixed random seeds, you need to update your .theanorc file with the following lines:

```
[dnn.conv]
algo_bwd_filter=deterministic
algo_bwd_data=deterministic
```

Depending on your GPU, training might take a while...
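Starting a run is then a single call (a sketch; the log below is abbreviated, and the actual numbers depend on your data and hardware):

```
python AED_train.py
HANDLING IMPORTS...
...DONE!
CLASSES: ...
TRAINING IMAGES: ...
VALIDATION IMAGES: ...
START TRAINING...
EPOCH 1: [====                ] BATCHES .../... (...%) - ... min REMAINING
TRAIN LOSS: ... VAL LOSS: ... VAL ACCURACY: ... % LR: ... TIME: ... s
```

Snapshots are written to the model folder as AED_Example_Run_model_epoch_10.pkl etc. for the epochs listed in `SNAPSHOT_EPOCHS`.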
## Evaluation
After training, you can test models and evaluate them on your local validation split. To do so, you need to adjust the settings in AED_eval.py to match your task. The most important settings are:

- `TEST_DIR` defining the path to your test data. Again, we use subfolders as class labels (ground truth).

- `TRAINED_MODEL` where you specify the pickle file of your pre-trained model and the corresponding model architecture.

- `SPEC_LENGTH` and `SPEC_OVERLAP`, which you should choose according to your training data. Increasing the overlap might reduce the prediction error due to more predictions per file.

- `CONFIDENCE_THRESHOLD` and `MAX_PREDICTIONS` can be used to limit the number of predictions returned.

Note: Test data should be organized like the training data, with subfolders as class names. Feel free to use different ground truth annotations; all you need to do is edit the script accordingly.

## Testing
If you want to make predictions for a single, unlabeled wav-file, you can call the script AED_test.py via the command shell. We provide some example files in the dataset folder. You can use this script as is, no training required. Simply follow these steps:

1. Download the pre-trained model:
```
sh model/fetch_model.sh
```
2. Execute the script:
```
python AED_test.py --filenames 'dataset/schreien_scream.wav' --modelname 'AED_Example_Run_model.pkl' --overlap 2 --results 5 --confidence 0.01
```
If everything goes well, you should see an output just like this:

```
HANDLING IMPORTS... DONE!
IMPORTING MODEL... DONE!
COMPILING THEANO TEST FUNCTION... DONE! ( 2 s )
TESTING: dataset/schreien_scream.wav
SAMPLE RATE: 44100 TOP PREDICTION(S):
	schreien 99 %
PREDICTION FOR 4 SPECS TOOK 57 ms ( 14 ms/spec )
```
Note: You do not need to specify values for overlap, results, and confidence; those are optional. You can define a list of wav-files for prediction. To do so, run the script using `--filenames ['file1.wav', 'file2.wav', ...]`.

This repo might not suffice for real-world applications, but you should be able to adapt the testing script to your specific needs.

We will keep this repo updated and will provide more testing functionality in the future.
--------------------------------------------------------------------------------
/AED_train.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

print "HANDLING IMPORTS..."

import sys
import os
import time
import operator
import math

import numpy as np
import matplotlib.pyplot as plt
import cv2

from scipy import interpolate
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
import itertools

import pickle

import theano
import theano.tensor as T

from lasagne import random as lasagne_random
from lasagne import layers as l
from lasagne import nonlinearities
from lasagne import init
from lasagne import objectives
from lasagne import updates
from lasagne import regularization

from utils import batch_generator as bg

print "...DONE!"

sys.setrecursionlimit(10000)

######################## CONFIG #########################
#Fixed random seed
RANDOM_SEED = 1337
RANDOM = np.random.RandomState(RANDOM_SEED)
lasagne_random.set_rng(RANDOM)

#Dataset params
DATASET_PATH = 'dataset/train/spec/'
MIN_SAMPLES_PER_CLASS = -1
MAX_SAMPLES_PER_CLASS = None
SORT_CLASSES_ALPHABETICALLY = True
VAL_SPLIT = 0.1
USE_CACHE = False

#Multi-label params
MULTI_LABEL = False
VAL_HAS_MULTI_LABEL = False
MEAN_TARGETS_PER_IMAGE = 3

#Image params
IM_SIZE = (512, 256) #(width, height)
IM_DIM = 1
IM_AUGMENTATION = {#'type':[probability, value]
                   'roll':[0.5, (0.0, 0.05)],
                   #'noise':[0.1, 0.01],
                   #'brightness':[0.5, (0.25, 1.25)],
                   #'crop':[0.5, 0.07],
                   #'flip': [0.25, 1]
                  }

#General model params
DROPOUT = 0.5
NONLINEARITY = nonlinearities.rectify
INIT_GAIN = math.sqrt(2)

#Training params
BATCH_SIZE = 32
LEARNING_RATE = {0:0.001, 55:0.000001} #epoch:lr
LR_DESCENT = True
L2_WEIGHT = 0 #1e-4
OPTIMIZER = 'adam' #'adam' or 'nesterov'
EPOCHS = 55
RANDOMIZE_TRAIN_SET = True

#Confusion matrix params
CONFMATRIX_MAX_CLASSES = 20
NORMALIZE_CONFMATRIX = True

#Model import/export params
MODEL_PATH = 'model/'
PRETRAINED_MODEL = None #'pretrained_model.pkl'
LOAD_OUTPUT_LAYER = True
EPOCH_START = 1
RUN_NAME = 'Example_Run'
SIMPLE_LOG_MODE = True
SNAPSHOT_EPOCHS = [10, 20, 30, 40, 50] #[-1] saves after every epoch
SAVE_AFTER_INTERRUPT = True
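
#Note on the schedule above (added remark): with LR_DESCENT = True the rate is
#interpolated linearly between the epochs given in LEARNING_RATE, so around
#epoch 28 it is roughly 0.0005; with LR_DESCENT = False it simply steps from
#0.001 to 0.000001 at epoch 55.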
################### DATASET HANDLING ####################
def parseDataset():

    #we use subfolders as class labels
    classes = [folder for folder in sorted(os.listdir(DATASET_PATH))]
    if not SORT_CLASSES_ALPHABETICALLY:
        classes = shuffle(classes, random_state=RANDOM)

    #now we enlist all image paths for each class
    images = []
    sample_count = {}
    for c in classes:
        c_images = [os.path.join(DATASET_PATH, c, path) for path in os.listdir(os.path.join(DATASET_PATH, c))][:MAX_SAMPLES_PER_CLASS]
        sample_count[c] = len(c_images)
        images += c_images

        #Do we want to correct class imbalance?
        #This will affect validation scores as we use some samples in TRAIN and VAL
        while sample_count[c] < MIN_SAMPLES_PER_CLASS:
            images += [c_images[RANDOM.randint(0, len(c_images))]]
            sample_count[c] += 1

    #shuffle image paths
    images = shuffle(images, random_state=RANDOM)

    #validation split
    vsplit = int(len(images) * VAL_SPLIT)
    train = images[:-vsplit]
    val = images[-vsplit:]

    #show classes if needed for testing
    #print classes

    #show some stats
    print "CLASSES:", len(classes)
    print "CLASS LABELS:", sorted(sample_count.items(), key=operator.itemgetter(1))
    print "TRAINING IMAGES:", len(train)
    print "VALIDATION IMAGES:", len(val)

    return classes, train, val

#parse dataset
CLASSES, TRAIN, VAL = parseDataset()
NUM_CLASSES = len(CLASSES)

#################### BATCH HANDLING #####################
CACHE = {}
def openImage(path, useCache=USE_CACHE):

    global CACHE

    #using a dict {path:image} cache saves some time after the first epoch
    #but may consume a lot of RAM
    if path in CACHE:
        return CACHE[path]
    else:

        #open image
        img = cv2.imread(path)

        #DEBUG
        try:
            h, w = img.shape[:2]
        except:
            print "IMAGE NONE-TYPE:", path

        #original image dimensions
        try:
            h, w, d = img.shape

            #to gray?
            if IM_DIM == 1:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        except:
            h, w = img.shape

            #to color?
            if IM_DIM == 3:
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

        #resize to conv input size
        img = cv2.resize(img, (IM_SIZE[0], IM_SIZE[1]))

        #convert to floats between 0 and 1
        img = np.asarray(img / 255., dtype='float32')

        if useCache:
            CACHE[path] = img
        return img

def imageAugmentation(img):

    AUG = IM_AUGMENTATION

    #Random crop (without padding)
    if 'crop' in AUG and RANDOM.choice([True, False], p=[AUG['crop'][0], 1 - AUG['crop'][0]]):
        h, w = img.shape[:2]
        cropw = RANDOM.randint(1, int(float(w) * AUG['crop'][1]))
        croph = RANDOM.randint(1, int(float(h) * AUG['crop'][1]))
        img = img[croph:-croph, cropw:-cropw]
        img = cv2.resize(img, (IM_SIZE[0], IM_SIZE[1]))

    #Flip - 1 = horizontal, 0 = vertical
    if 'flip' in AUG and RANDOM.choice([True, False], p=[AUG['flip'][0], 1 - AUG['flip'][0]]):
        img = cv2.flip(img, AUG['flip'][1])

    #Wrap shift (roll up/down and left/right)
    if 'roll' in AUG and RANDOM.choice([True, False], p=[AUG['roll'][0], 1 - AUG['roll'][0]]):
        img = np.roll(img, int(img.shape[0] * (RANDOM.uniform(-AUG['roll'][1][1], AUG['roll'][1][1]))), axis=0)
        img = np.roll(img, int(img.shape[1] * (RANDOM.uniform(-AUG['roll'][1][0], AUG['roll'][1][0]))), axis=1)

    #subtract/add mean
    if 'mean' in AUG and RANDOM.choice([True, False], p=[AUG['mean'][0], 1 - AUG['mean'][0]]):
        img += np.mean(img) * AUG['mean'][1]

    #gaussian noise
    if 'noise' in AUG and RANDOM.choice([True, False], p=[AUG['noise'][0], 1 - AUG['noise'][0]]):
        img += RANDOM.normal(0.0, RANDOM.uniform(0, AUG['noise'][1]**0.5), img.shape)
        img = np.clip(img, 0.0, 1.0)

    #adjust brightness
    if 'brightness' in AUG and RANDOM.choice([True, False], p=[AUG['brightness'][0], 1 - AUG['brightness'][0]]):
        img *= RANDOM.uniform(AUG['brightness'][1][0], AUG['brightness'][1][1])
        img = np.clip(img, 0.0, 1.0)

    #show
    #cv2.imshow("AUG", img)#.reshape(IM_SIZE[1], IM_SIZE[0], IM_DIM))
    #cv2.waitKey(-1)

    return img
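
#Quick visual sanity check for the augmentations (added sketch; the path is a
#hypothetical example and depends on your extracted spectrograms):
#
#   img = openImage('dataset/train/spec/schreien/schreien_scream_0.png')
#   img = imageAugmentation(img)
#   cv2.imshow("AUG", img)
#   cv2.waitKey(-1)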
def loadImageAndTarget(path, doAugmentation=True):

    #here we open the image
    img = openImage(path)

    #image augmentation?
    if IM_AUGMENTATION is not None and doAugmentation:
        img = imageAugmentation(img)

    #we want to use subfolders as class labels
    label = path.split("/")[-2]

    #we need to get the index of our label from CLASSES
    index = CLASSES.index(label)

    #allocate array for target
    target = np.zeros((NUM_CLASSES), dtype='float32')

    #we set our target array = 1.0 at our label index, all other entries remain 0.0
    target[index] = 1.0

    #transpose image if dim=3
    try:
        img = np.transpose(img, (2, 0, 1))
    except:
        pass

    #we need a 4D-vector for our image and a 2D-vector for our targets
    img = img.reshape(-1, IM_DIM, IM_SIZE[1], IM_SIZE[0])
    target = target.reshape(-1, NUM_CLASSES)

    return img, target

def getAugmentedBatches(x, y):

    #augment batch until desired number of target labels per image is reached
    while np.mean(np.sum(y, axis=1)) < MEAN_TARGETS_PER_IMAGE:

        #get two images to combine (we try to prevent i == j, which could result in infinite loops, with excluding ranges)
        i = RANDOM.choice(range(1, x.shape[0] - 1))
        j = RANDOM.choice(range(0, i) + range(i + 1, x.shape[0]))

        #add images
        x[i] += x[j]

        #re-normalize new image
        x[i] -= x[i].min(axis=None)
        x[i] /= x[i].max(axis=None)

        #combine targets (makes this task a multi-label classification!)
        y[i] = np.logical_or(y[i], y[j])

        #TODO: We still might end up in an infinite loop
        #and should add a break in case something is fishy

        #show
        #cv2.imshow("BA", x[i].reshape(IM_SIZE[1], IM_SIZE[0], IM_DIM))
        #cv2.waitKey(-1)

    return x, y

def getDatasetChunk(split):

    #get batch-sized chunks of image paths
    for i in xrange(0, len(split), BATCH_SIZE):
        yield split[i:i+BATCH_SIZE]

def getNextImageBatch(split=TRAIN, doAugmentation=True, batchAugmentation=MULTI_LABEL):

    #fill batch
    for chunk in getDatasetChunk(split):

        #allocate numpy arrays for image data and targets
        x_b = np.zeros((BATCH_SIZE, IM_DIM, IM_SIZE[1], IM_SIZE[0]), dtype='float32')
        y_b = np.zeros((BATCH_SIZE, NUM_CLASSES), dtype='float32')

        ib = 0
        for path in chunk:

            try:

                #load image data and class label from path
                x, y = loadImageAndTarget(path, doAugmentation)

                #pack into batch array
                x_b[ib] = x
                y_b[ib] = y
                ib += 1

            except:
                continue

        #trim to actual size
        x_b = x_b[:ib]
        y_b = y_b[:ib]

        #batch augmentation?
        if batchAugmentation and x_b.shape[0] >= BATCH_SIZE // 2:
            x_b, y_b = getAugmentedBatches(x_b, y_b)

        #instead of return, we use yield
        yield x_b, y_b
################## BUILDING THE MODEL ###################
def buildModel():

    print "BUILDING MODEL TYPE..."

    #default settings
    filters = 64
    first_stride = 2
    last_filter_multiplier = 16

    #input layer
    net = l.InputLayer((None, IM_DIM, IM_SIZE[1], IM_SIZE[0]))

    #conv layers
    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters, filter_size=7, pad='same', stride=first_stride, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 2, filter_size=5, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 4, filter_size=3, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 8, filter_size=3, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * last_filter_multiplier, filter_size=3, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    print "\tFINAL POOL OUT SHAPE:", l.get_output_shape(net)

    #dense layers
    net = l.batch_norm(l.DenseLayer(net, 512, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.DropoutLayer(net, DROPOUT)
    net = l.batch_norm(l.DenseLayer(net, 512, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.DropoutLayer(net, DROPOUT)

    #classification layer
    if MULTI_LABEL:
        net = l.DenseLayer(net, NUM_CLASSES, nonlinearity=nonlinearities.sigmoid, W=init.HeNormal(gain=1))
    else:
        net = l.DenseLayer(net, NUM_CLASSES, nonlinearity=nonlinearities.softmax, W=init.HeNormal(gain=1))

    print "...DONE!"

    #model stats
    print "MODEL HAS", (sum(hasattr(layer, 'W') for layer in l.get_all_layers(net))), "WEIGHTED LAYERS"
    print "MODEL HAS", l.count_params(net), "PARAMS"

    return net

NET = buildModel()
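
#Added remark: with the default 512x256 input, the first strided convolution
#and each of the five pooling stages halve the resolution (a factor of 64 in
#total), so the final pool output printed above should be (None, 1024, 4, 8) -
#a rough sanity check when changing IM_SIZE or the filter settings.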
################## MODEL SAVE/LOAD ####################
BEST_MODEL = None
BEST_EPOCH = 0
def saveModel(epoch, model=None):
    print "EXPORTING MODEL...",
    if model is None:
        model = NET
    net_filename = MODEL_PATH + "AED_" + RUN_NAME + "_model_epoch_" + str(epoch) + ".pkl"
    if not os.path.exists(MODEL_PATH):
        os.makedirs(MODEL_PATH)
    with open(net_filename, 'wb') as f:

        #We want to save the model architecture with all params and trained classes
        data = {'net': model, 'classes':CLASSES, 'run_name': RUN_NAME, 'epoch':epoch, 'im_size':IM_SIZE, 'im_dim':IM_DIM}
        pickle.dump(data, f)

    print "DONE!"

def loadModel(filename):
    print "IMPORTING MODEL PARAMS...",
    net_filename = MODEL_PATH + filename

    with open(net_filename, 'rb') as f:
        data = pickle.load(f)

    #for training, we only want to load the model params
    net = data['net']
    params = l.get_all_param_values(net)
    if LOAD_OUTPUT_LAYER:
        l.set_all_param_values(NET, params)
    else:
        l.set_all_param_values(l.get_all_layers(NET)[:-1], params[:-2])

    print "DONE!"

if PRETRAINED_MODEL is not None:
    loadModel(PRETRAINED_MODEL)

#################### LOSS FUNCTION ######################
def calc_loss(prediction, targets):

    #categorical crossentropy is the best choice for a multi-class softmax output
    loss = T.mean(objectives.categorical_crossentropy(prediction, targets))

    return loss

def calc_loss_multi(prediction, targets):

    #we need to clip predictions when calculating the log-loss
    prediction = T.clip(prediction, 0.0000001, 0.9999999)

    #binary crossentropy is the best choice for a multi-label sigmoid output
    loss = T.mean(objectives.binary_crossentropy(prediction, targets))

    return loss

#theano variable for the class targets
targets = T.matrix('targets', dtype=theano.config.floatX)

#get the network output
prediction = l.get_output(NET)

#we use the L2 norm for regularization
l2_reg = regularization.regularize_layer_params(NET, regularization.l2) * L2_WEIGHT

#calculate the loss
if MULTI_LABEL:
    loss = calc_loss_multi(prediction, targets) + l2_reg
else:
    loss = calc_loss(prediction, targets) + l2_reg

################# ACCURACY FUNCTION #####################
def calc_accuracy(prediction, targets):

    #we can use the lasagne objective categorical_accuracy to determine the top-1 single-label accuracy
    a = T.mean(objectives.categorical_accuracy(prediction, targets, top_k=1))

    return a

def calc_accuracy_multi(prediction, targets):

    #we can use the lasagne objective binary_accuracy to determine the multi-label accuracy
    a = T.mean(objectives.binary_accuracy(prediction, targets))

    return a

#calculate accuracy
if MULTI_LABEL and VAL_HAS_MULTI_LABEL:
    accuracy = calc_accuracy_multi(prediction, targets)
else:
    accuracy = calc_accuracy(prediction, targets)

####################### UPDATES #########################
#we use dynamic learning rates which change after some epochs
lr_dynamic = T.scalar(name='learning_rate')

#get all trainable parameters (weights) of our net
params = l.get_all_params(NET, trainable=True)

#we use the adam update
if OPTIMIZER == 'adam':
    param_updates = updates.adam(loss, params, learning_rate=lr_dynamic, beta1=0.5)
elif OPTIMIZER == 'nesterov':
    param_updates = updates.nesterov_momentum(loss, params, learning_rate=lr_dynamic, momentum=0.9)

#################### TRAIN FUNCTION ######################
#the theano train function takes images and class targets as input
print "COMPILING THEANO TRAIN FUNCTION...",
start = time.time()
train_net = theano.function([l.get_all_layers(NET)[0].input_var, targets, lr_dynamic], loss, updates=param_updates)
print "DONE! (", int(time.time() - start), "s )"

################# PREDICTION FUNCTION ####################
#we need the prediction function to calculate the validation accuracy
#this way we can test the net during/after training
net_output = l.get_output(NET, deterministic=True)

print "COMPILING THEANO TEST FUNCTION...",
start = time.time()
test_net = theano.function([l.get_all_layers(NET)[0].input_var, targets], [net_output, loss, accuracy])
print "DONE! (", int(time.time() - start), "s )"
(", int(time.time() - start), "s )" 505 | 506 | ################## CONFUSION MATRIX ##################### 507 | cmatrix = [] 508 | def clearConfusionMatrix(): 509 | 510 | global cmatrix 511 | 512 | #allocate empty matrix 513 | cmatrix = np.zeros((NUM_CLASSES, NUM_CLASSES), dtype='int32') 514 | 515 | def updateConfusionMatrix(p, t): 516 | 517 | global cmatrix 518 | 519 | #get class indices for prediction and target 520 | targets = np.argmax(t, axis=1) 521 | predictions = np.argmax(p, axis=1) 522 | 523 | #add up confusion matrices of validation batches 524 | cmatrix += confusion_matrix(targets, predictions, labels=range(0, NUM_CLASSES)) 525 | 526 | def showConfusionMatrix(epoch): 527 | 528 | #new figure 529 | plt.figure(0, figsize=(35, 35), dpi=72) 530 | plt.clf() 531 | 532 | #get additional metrics 533 | pr, re, f1 = calculateMetrics() 534 | 535 | #normalize? 536 | if NORMALIZE_CONFMATRIX: 537 | global cmatrix 538 | cmatrix = np.around(cmatrix.astype('float') / cmatrix.sum(axis=1)[:, np.newaxis] * 100.0, decimals=1) 539 | 540 | #show matrix 541 | plt.imshow(cmatrix[:CONFMATRIX_MAX_CLASSES, :CONFMATRIX_MAX_CLASSES], interpolation='nearest', cmap=plt.cm.Blues) 542 | plt.title('Confusion Matrix\n' + 543 | RUN_NAME + ' - Epoch ' + str(epoch) + 544 | '\nTrain Samples: ' + str(len(TRAIN)) + ' Validation Samples: ' + str(len(VAL)) + 545 | '\nmP: ' + str(np.mean(pr)) + ' mF1: ' + str( np.mean(f1)), fontsize=32) 546 | 547 | #tick marks 548 | tick_marks = np.arange(min(CONFMATRIX_MAX_CLASSES, NUM_CLASSES)) 549 | plt.xticks(tick_marks, CLASSES[:CONFMATRIX_MAX_CLASSES], rotation=90) 550 | plt.yticks(tick_marks, CLASSES[:CONFMATRIX_MAX_CLASSES]) 551 | 552 | #labels 553 | thresh = cmatrix.max() / 2. 554 | for i, j in itertools.product(range(min(CONFMATRIX_MAX_CLASSES, cmatrix.shape[0])), range(min(CONFMATRIX_MAX_CLASSES, cmatrix.shape[1]))): 555 | plt.text(j, i, cmatrix[i, j], 556 | horizontalalignment="center", verticalalignment="center", 557 | color="white" if cmatrix[i, j] > thresh else "black", fontsize=32) 558 | 559 | #axes labels 560 | plt.tight_layout() 561 | plt.ylabel('Target label', fontsize=32) 562 | plt.xlabel('Predicted label', fontsize=32) 563 | 564 | #fontsize 565 | plt.rc('font', size=32) 566 | 567 | #save plot 568 | global cmcnt 569 | if not os.path.exists('confmatrix'): 570 | os.makedirs('confmatrix') 571 | plt.savefig('confmatrix/' + RUN_NAME + '_' + str(epoch) + '.png') 572 | 573 | def calculateMetrics(): 574 | 575 | #allocate arrays 576 | pr = [] 577 | re = [] 578 | f1 = [] 579 | 580 | #parse rows and columns of confusion matrix 581 | for i in range(0, cmatrix.shape[0]): 582 | 583 | #true positives, false positves, false negatives 584 | tp = float(cmatrix[i][i]) 585 | fp = float(np.sum(cmatrix, axis=1)[i] - tp) 586 | fn = float(np.sum(cmatrix, axis=0)[i] - tp) 587 | 588 | #precision 589 | if tp > 0 or fp > 0: 590 | p = tp / (tp + fp) 591 | else: 592 | p = 0 593 | pr.append(p) 594 | 595 | #recall 596 | if tp > 0 or fn > 0: 597 | r = tp / (tp + fn) 598 | else: 599 | r = 0 600 | re.append(r) 601 | 602 | #f1 measure 603 | if p > 0 or r > 0: 604 | f = 2 * ((p * r) / (p + r)) 605 | else: 606 | f = 0 607 | f1.append(f) 608 | 609 | return pr, re, f1 610 | 611 | ###################### PROGRESS ######################### 612 | batches_per_epoch = len(TRAIN + VAL) // BATCH_SIZE + 1 613 | avg_duration = [] 614 | last_update = -1 615 | def showProgress(stat, duration, current, end=batches_per_epoch, update_interval=5, simple_mode=False): 616 | 617 | #epochs might take a lot of time, so we want 
###################### PROGRESS #########################
batches_per_epoch = len(TRAIN + VAL) // BATCH_SIZE + 1
avg_duration = []
last_update = -1
def showProgress(stat, duration, current, end=batches_per_epoch, update_interval=5, simple_mode=False):

    #epochs might take a lot of time, so we want some kind of progress bar
    #this approach is not very sophisticated, but it does the job :)
    #you should use simple_mode=True if run with IDLE and simple_mode=False if run on the command line

    global avg_duration
    global last_update

    #time left
    avg_duration.append(duration)
    avg_duration = avg_duration[-10:]
    r = int(abs(end - current) * np.mean(avg_duration) / 60) + 1

    #percentage
    p = int(current / float(end) * 100)
    progress = ""
    for s in xrange(update_interval, 100, update_interval):
        if s <= p:
            progress += "="
        else:
            progress += " "

    #status line
    if p > last_update and p % update_interval == 0 or last_update == -1:
        if simple_mode:
            if current == 1:
                print stat.upper() + ": [",
            else:
                print "=",
            if current == end:
                print "]",
        else:
            print stat.upper() + ": [" + progress + "] BATCHES " + str(current) + "/" + str(end) + " (" + str(p) + "%) - " + str(r) + " min REMAINING\r",
        last_update = p

###################### TRAINING #########################
print "START TRAINING..."
train_loss = []
val_loss = []
val_accuracy = []
max_acc = -1
lr = LEARNING_RATE[LEARNING_RATE.keys()[0]]
SAVE_MODEL_AFTER_TRAINING = True

#train for some epochs...
for epoch in range(EPOCH_START, EPOCHS + 1):

    try:

        #start timer
        start = time.time()

        #reset confusion matrix
        clearConfusionMatrix()

        #adjust learning rate (interpolate or steps)
        if LR_DESCENT:
            lr_keys = np.array(LEARNING_RATE.keys() + [EPOCHS], dtype='float32')
            lr_values = np.array(LEARNING_RATE.values() + [LEARNING_RATE.values()[-1]], dtype='float32')
            lr_func = interpolate.interp1d(lr_keys, lr_values, kind='linear')
            lr = np.float32(lr_func(max(LEARNING_RATE.keys()[0], epoch - 1)))
        else:
            if epoch in LEARNING_RATE:
                lr = LEARNING_RATE[epoch]

        #shuffle dataset (this way we get "new" batches every epoch)
        if RANDOMIZE_TRAIN_SET:
            TRAIN = shuffle(TRAIN, random_state=RANDOM)

        #time
        bstart = time.time()
        last_update = -1

        #iterate over train split batches and calculate mean loss for epoch
        t_l = []
        bcnt = 0
        for image_batch, target_batch in bg.threadedBatchGenerator(getNextImageBatch()):

            #calling the training function returns the current loss
            loss = train_net(image_batch, target_batch, lr)
            t_l.append(loss)

            bcnt += 1

            #show progress
            showProgress("EPOCH " + str(epoch), (time.time() - bstart), bcnt, simple_mode=SIMPLE_LOG_MODE)
            bstart = time.time()

        #we validate our net every epoch and pass our validation split through as well
        v_l = []
        v_a = []
        for image_batch, target_batch in bg.threadedBatchGenerator(getNextImageBatch(VAL, False, VAL_HAS_MULTI_LABEL)):

            #calling the test function returns the net output, loss and accuracy
            prediction_batch, loss, acc = test_net(image_batch, target_batch)
            v_l.append(loss)
            v_a.append(acc)

            #save predictions and targets for confusion matrix
            updateConfusionMatrix(prediction_batch, target_batch)

            bcnt += 1

            #show progress
            showProgress("EPOCH " + str(epoch), (time.time() - bstart), bcnt, simple_mode=SIMPLE_LOG_MODE)
            bstart = time.time()

        #stop timer
        end = time.time()

        #calculate stats for epoch
        train_loss.append(np.mean(t_l))
        val_loss.append(np.mean(v_l))
        val_accuracy.append(np.mean(v_a))

        #print stats for epoch
        print "TRAIN LOSS:", train_loss[-1],
        print "VAL LOSS:", val_loss[-1],
        print "VAL ACCURACY:", (int(val_accuracy[-1] * 1000) / 10.0), "%",
        print "LR:", lr,
        print "TIME:", (int((end - start) * 10) / 10.0), "s"

        #log max accuracy and save best params
        acc = (int(val_accuracy[-1] * 1000) / 10.0)
        if acc >= max_acc:
            max_acc = acc
            BEST_MODEL = NET
            BEST_EPOCH = epoch

        #show confusion matrix
        showConfusionMatrix(epoch)

        #save snapshot?
        if epoch in SNAPSHOT_EPOCHS or SNAPSHOT_EPOCHS[0] == -1:
            saveModel(epoch)

    except KeyboardInterrupt:
        SAVE_MODEL_AFTER_TRAINING = SAVE_AFTER_INTERRUPT
        break

print "TRAINING DONE!"
print "MAX ACC: ", max_acc

#save best model params
if SAVE_MODEL_AFTER_TRAINING:
    saveModel(BEST_EPOCH, BEST_MODEL)
--------------------------------------------------------------------------------