├── utils
│   ├── __init__.py
│   └── batch_generator.py
├── requirements.txt
├── model
│   ├── fetch_model.sh
│   └── download_model.txt
├── dataset
│   ├── ruhe_silence.wav
│   ├── fenster_window.wav
│   ├── schreien_scream.wav
│   └── datasets.txt
├── LICENSE
├── AED_spec.py
├── AED_test.py
├── AED_eval.py
├── README.md
└── AED_train.py
/utils/__init__.py:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy
scipy
scikit-learn
python_speech_features
pydub
matplotlib
--------------------------------------------------------------------------------
/model/fetch_model.sh:
--------------------------------------------------------------------------------
wget https://box.tu-chemnitz.de/index.php/s/8vkQqXbUjVWlt5m/download -O AED_Example_Run_model.pkl
--------------------------------------------------------------------------------
/dataset/ruhe_silence.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/AcousticEventDetection/master/dataset/ruhe_silence.wav
--------------------------------------------------------------------------------
/dataset/fenster_window.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/AcousticEventDetection/master/dataset/fenster_window.wav
--------------------------------------------------------------------------------
/dataset/schreien_scream.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/AcousticEventDetection/master/dataset/schreien_scream.wav
--------------------------------------------------------------------------------
/model/download_model.txt:
--------------------------------------------------------------------------------
You can download a pre-trained model here: https://box.tu-chemnitz.de/index.php/s/8vkQqXbUjVWlt5m
--------------------------------------------------------------------------------
/dataset/datasets.txt:
--------------------------------------------------------------------------------
You can use publicly available datasets such as UrbanSound8K for training. Additionally, you can visit freesound.org to download open-source sound files. We provide three test samples from the dataset used in the paper. The pre-trained model we provide was trained on the 20 classes described in the paper.
--------------------------------------------------------------------------------
/utils/batch_generator.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#Loading images with CPU background threads during GPU forward passes saves a lot of time
#Credit: J. Schlüter (https://github.com/Lasagne/Lasagne/issues/12)
import Queue
import threading

def threadedBatchGenerator(generator, num_cached=10):

    queue = Queue.Queue(maxsize=num_cached)
    sentinel = object()  # guaranteed unique reference

    #define producer (putting items into queue)
    def producer():
        for item in generator:
            queue.put(item)
        queue.put(sentinel)

    #start producer (in a background thread)
    thread = threading.Thread(target=producer)
    thread.daemon = True
    thread.start()

    #run as consumer (read items from queue, in current thread)
    item = queue.get()
    while item is not sentinel:
        yield item
        queue.task_done()
        item = queue.get()
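
#example usage (a minimal sketch): wrap any batch generator to prefetch
#batches in a background thread while the GPU is busy, e.g. in AED_train.py:
#  for x_b, y_b in threadedBatchGenerator(getNextImageBatch()):
#      loss = train_net(x_b, y_b, lr)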
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Stefan Kahl

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/AED_spec.py:
--------------------------------------------------------------------------------
import os
import traceback
import operator

import numpy as np
import cv2

import python_speech_features as psf
import scipy.io.wavfile as wave
from scipy import interpolate

######################################################
src_dir = 'dataset/train/wav/'
spec_dir = 'dataset/train/spec/'

SPEC_LENGTH = 3 #seconds
SPEC_OVERLAP = 2 #seconds

######################################################
def getSpecSettings(seconds):

    #recommended settings for spectrogram extraction
    settings = {2:[0.015, 0.0068],
                3:[0.02, 0.00585],
                5:[0.05, 0.0097],
                10:[0.05, 0.0195],
                30:[0.05, 0.0585]}

    winlen = settings[seconds][0]
    winstep = settings[seconds][1]

    #a 511-point FFT yields 511 // 2 + 1 = 256 frequency bins, matching the 256 pixel spectrogram height
    nfft = 511

    return winlen, winstep, nfft


def changeSampleRate(sig, rate):

    #resample the signal to 44.1 kHz via simple linear interpolation of the waveform
    duration = sig.shape[0] / float(rate)

    time_old = np.linspace(0, duration, sig.shape[0])
    time_new = np.linspace(0, duration, int(sig.shape[0] * 44100 / rate))

    interpolator = interpolate.interp1d(time_old, sig.T)
    new_audio = interpolator(time_new).T

    sig = np.round(new_audio).astype(sig.dtype)

    return sig, 44100

def getSpecFromSignal(sig, rate, seconds=SPEC_LENGTH):

    #get settings
    winlen, winstep, nfft = getSpecSettings(seconds)

    #get frames
    winfunc = lambda x: np.ones((x,))
    frames = psf.sigproc.framesig(sig, winlen*rate, winstep*rate, winfunc)

    #Magnitude Spectrogram
    magspec = np.rot90(psf.sigproc.magspec(frames, nfft))

    #normalize to values from 0 to 1
    magspec -= magspec.min(axis=None)
    magspec /= magspec.max(axis=None)

    #adjust shape if signal is too short
    magspec = magspec[:256, :512]
    temp = np.zeros((256, 512), dtype="float32")
    temp[:magspec.shape[0], :magspec.shape[1]] = magspec
    magspec = temp.copy()
    magspec = cv2.resize(magspec, (512, 256))

    #DEBUG: show
    #cv2.imshow('SPEC', magspec)
    #cv2.waitKey(-1)

    return magspec

def splitSignal(sig, rate, seconds=SPEC_LENGTH, overlap=SPEC_OVERLAP):

    #split signal with overlap
    sig_splits = []
    for i in xrange(0, len(sig), int((seconds - overlap) * rate)):
        split = sig[i:i + seconds * rate]
        if len(split) >= 1 * rate:
            sig_splits.append(split)

    #is signal too short for segmentation?
    if len(sig_splits) == 0:
        sig_splits.append(sig)

    return sig_splits

def getMultiSpec(path, seconds=SPEC_LENGTH, overlap=SPEC_OVERLAP):

    #open wav file
    (rate, sig) = wave.read(path)
    print "SAMPLE RATE:", rate,

    #adjust to different sample rates
    if rate != 44100:
        sig, rate = changeSampleRate(sig, rate)

    #split signal into chunks
    sig_splits = splitSignal(sig, rate, seconds, overlap)

    #calculate spectrogram for every split
    for sig in sig_splits:

        magspec = getSpecFromSignal(sig, rate, seconds)

        yield magspec

######################################################
if __name__ == "__main__":

    events = [src_dir + event + '/' for event in sorted(os.listdir(src_dir))]
    print "NUMBER OF EVENTS:", len(events)

    #parse wave files for every event class
    for event in events:
        total_specs = 0

        #get wav files for event class
        wav_files = [event + wav for wav in sorted(os.listdir(event))]

        #parse wav files
        for wav in wav_files:

            #stats
            spec_cnt = 0
            print wav,

            try:
                #extract specs from wav file
                for spec in getMultiSpec(wav):

                    #output dir for specs
                    dst_dir = spec_dir + event.split("/")[-2] + "/"
                    if not os.path.exists(dst_dir):
                        os.makedirs(dst_dir)

                    #save spec
                    cv2.imwrite(dst_dir + wav.split("/")[-1].rsplit(".")[0] + "_" + str(spec_cnt) + ".png", spec * 255.0)
                    spec_cnt += 1
                    total_specs += 1

                print "SPECS:", spec_cnt

            except:
                print spec_cnt, "ERROR"
                traceback.print_exc()
                pass
--------------------------------------------------------------------------------
/AED_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

print "HANDLING IMPORTS..."

import time
import operator
import argparse

import traceback
import numpy as np
import pickle

import theano

from lasagne import random as lasagne_random
from lasagne import layers as l

import AED_spec as spectrogram

print "...DONE!"

######################## CONFIG #########################
#Fixed random seed
RANDOM_SEED = 1337
RANDOM = np.random.RandomState(RANDOM_SEED)
lasagne_random.set_rng(RANDOM)

#Pre-trained model params
MODEL_PATH = 'model/'
TRAINED_MODEL = 'AED_Example_Run_model.pkl'

################### ARGUMENT PARSER #####################
def parse_args():

    parser = argparse.ArgumentParser(description='Acoustic Event Classification')
    parser.add_argument('--filenames', dest='filenames', help='paths to sample wav files for testing as list or single string', type=str, default='')
    parser.add_argument('--modelname', dest='modelname', help='name of pre-trained model', type=str, default=None)
    parser.add_argument('--speclength', dest='spec_length', help='spectrogram length in seconds', type=int, default=3)
    parser.add_argument('--overlap', dest='spec_overlap', help='spectrogram overlap in seconds', type=int, default=2)
    parser.add_argument('--results', dest='num_results', help='number of results', type=int, default=5)
    parser.add_argument('--confidence', dest='min_confidence', help='confidence threshold', type=float, default=0.01)

    args = parser.parse_args()

    #single test file or list of files?
    if isinstance(args.filenames, basestring):
        args.filenames = [args.filenames]

    return args

#################### MODEL LOAD ########################
def loadModel(filename):
    print "IMPORTING MODEL...",
    net_filename = MODEL_PATH + filename

    with open(net_filename, 'rb') as f:
        data = pickle.load(f)

    #for evaluation, we want to load the complete model architecture and trained classes
    net = data['net']
    classes = data['classes']
    im_size = data['im_size']
    im_dim = data['im_dim']

    print "DONE!"

    return net, classes, im_size, im_dim

################# PREDICTION FUNCTION ####################
def getPredictionFunction(net):
    net_output = l.get_output(net, deterministic=True)

    print "COMPILING THEANO TEST FUNCTION...",
    start = time.time()
    test_net = theano.function([l.get_all_layers(net)[0].input_var], net_output, allow_input_downcast=True)
    print "DONE! (", int(time.time() - start), "s )"

    return test_net

################# PREDICTION POOLING ####################
def predictionPooling(p):

    #You can test different prediction pooling strategies here
    #We only use average pooling
    #e.g. max pooling over all specs would be: p_pool = np.amax(p, axis=0)
    if p.ndim == 2:
        p_pool = np.mean(p, axis=0)
    else:
        p_pool = p

    return p_pool

####################### PREDICT #########################
def predict(img):

    #transpose image if dim=3
    try:
        img = np.transpose(img, (2, 0, 1))
    except:
        pass

    #reshape image
    img = img.reshape(-1, IM_DIM, IM_SIZE[1], IM_SIZE[0])

    #calling the test function returns the net output
    prediction = TEST_NET(img)[0]

    return prediction

####################### TESTING #########################
def testFile(path, spec_length, spec_overlap, num_results, confidence_threshold=0.01):

    #time
    start = time.time()

    #extract spectrograms from wav-file and process them
    predictions = []
    spec_cnt = 0
    for spec in spectrogram.getMultiSpec(path, seconds=spec_length, overlap=spec_overlap):

        #make prediction
        p = predict(spec)
        spec_cnt += 1

        #stack predictions
        if len(predictions):
            predictions = np.vstack([predictions, p])
        else:
            predictions = p

    #prediction pooling
    p_pool = predictionPooling(predictions)

    #get class labels for predictions
    p_labels = {}
    for i in range(p_pool.shape[0]):
        if p_pool[i] >= confidence_threshold:
            p_labels[CLASSES[i]] = p_pool[i]

    #sort by confidence and limit results (None returns all results)
    p_sorted = sorted(p_labels.items(), key=operator.itemgetter(1), reverse=True)[:num_results]

    #take time again
    dur = time.time() - start

    return p_sorted, spec_cnt, dur

#################### EXAMPLE USAGE ######################
if __name__ == "__main__":

    #adjust config
    args = parse_args()

    #load model
    if args.modelname:
        TRAINED_MODEL = args.modelname
    NET, CLASSES, IM_SIZE, IM_DIM = loadModel(TRAINED_MODEL)

    #compile test function
    TEST_NET = getPredictionFunction(NET)

    #do testing
    for fname in args.filenames:
        print 'TESTING:', fname
        pred, cnt, dur = testFile(fname, args.spec_length, args.spec_overlap, args.num_results, args.min_confidence)
        print 'TOP PREDICTION(S):'
        for p in pred:
            print '\t', p[0], int(p[1] * 100), '%'
        print 'PREDICTION FOR', cnt, 'SPECS TOOK', int(dur * 1000), 'ms (', int(dur / cnt * 1000), 'ms/spec', ')', '\n'
--------------------------------------------------------------------------------
/AED_eval.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

print "HANDLING IMPORTS..."

import warnings
warnings.filterwarnings('ignore')

import os
import time
import operator

import traceback
import numpy as np
import pickle

import theano

from lasagne import random as lasagne_random
from lasagne import layers as l

import scipy.io.wavfile as wave

import AED_spec as spectrogram
import utils.batch_generator as bg

print "...DONE!"

######################## CONFIG #########################
#Fixed random seed
RANDOM_SEED = 1337
RANDOM = np.random.RandomState(RANDOM_SEED)
lasagne_random.set_rng(RANDOM)

#Dataset params
TEST_DIR = 'dataset/test/'

#Pre-trained model params
MODEL_PATH = 'model/'
TRAINED_MODEL = 'AED_Example_Run_model.pkl'

#Testing params
BATCH_SIZE = 32
SPEC_LENGTH = 3
SPEC_OVERLAP = 2
CONFIDENCE_THRESHOLD = 0.0001
MAX_PREDICTIONS = 10

################### AUDIO PROCESSING ####################
def parseTestSet():

    #get list of test files
    test = []
    test_classes = [os.path.join(TEST_DIR, tc) for tc in sorted(os.listdir(TEST_DIR))]
    for tc in test_classes:
        test += [os.path.join(tc, fpath) for fpath in os.listdir(tc)]

    #get class label for every test sample
    gt = {}
    for path in test:
        label = path.split('/')[-2]
        gt[path] = label

    #stats
    #print classes
    print "NUMBER OF CLASSES:", len(test_classes)
    print "NUMBER OF TEST SAMPLES:", len(test)

    return test, gt

TEST, GT = parseTestSet()

#################### BATCH HANDLING #####################
def getSignalChunk(sig, rate):

    #split signal into chunks
    sig_splits = spectrogram.splitSignal(sig, rate, SPEC_LENGTH, SPEC_OVERLAP)

    #get batch-sized chunks of signal splits
    for i in xrange(0, len(sig_splits), BATCH_SIZE):
        yield sig_splits[i:i+BATCH_SIZE]

def getNextSpecBatch(path):

    #open wav file
    (rate, sig) = wave.read(path)

    #change sample rate if needed
    if rate != 44100:
        sig, rate = spectrogram.changeSampleRate(sig, rate)

    #fill batches
    for sig_chunk in getSignalChunk(sig, rate):

        #allocate numpy array for spectrogram data
        s_b = np.zeros((BATCH_SIZE, IM_DIM, IM_SIZE[1], IM_SIZE[0]), dtype='float32')

        ib = 0
        for s in sig_chunk:

            #load spectrogram data from sig
            spec = spectrogram.getSpecFromSignal(s, rate, SPEC_LENGTH)

            #reshape spec
            spec = spec.reshape(-1, IM_DIM, IM_SIZE[1], IM_SIZE[0])

            #pack into batch array
            s_b[ib] = spec
            ib += 1

        #trim to actual size
        s_b = s_b[:ib]

        #yield batch
        yield s_b

#################### MODEL LOAD ########################
def loadModel(filename):
    print "IMPORTING MODEL...",
    net_filename = MODEL_PATH + filename

    with open(net_filename, 'rb') as f:
        data = pickle.load(f)

    #for evaluation, we want to load the complete model architecture and trained classes
    net = data['net']
    classes = data['classes']
    im_size = data['im_size']
    im_dim = data['im_dim']

    print "DONE!"

    return net, classes, im_size, im_dim

################# PREDICTION FUNCTION ####################
def getPredictionFunction(net):
    net_output = l.get_output(net, deterministic=True)

    print "COMPILING THEANO TEST FUNCTION...",
    start = time.time()
    test_net = theano.function([l.get_all_layers(net)[0].input_var], net_output, allow_input_downcast=True)
    print "DONE! (", int(time.time() - start), "s )"

    return test_net

################# PREDICTION POOLING ####################
def predictionPooling(p):

    #You can test different prediction pooling strategies here
    #We only use average pooling
    #e.g. max pooling over all specs would be: p_pool = np.amax(p, axis=0)
    if p.ndim == 2:
        p_pool = np.mean(p, axis=0)
    else:
        p_pool = p

    return p_pool

####################### TESTING #########################
#test model
print "TESTING MODEL..."

#load model
NET, CLASSES, IM_SIZE, IM_DIM = loadModel(filename=TRAINED_MODEL)

#get test function
test_net = getPredictionFunction(NET)

pr = []
pcnt = 1
ecnt = 0
acc = []
#test every sample from test collection
for path in TEST:

    #status
    print pcnt, path.replace(TEST_DIR, ''),

    try:

        #make predictions for batches of spectrograms
        predictions = []
        for spec_batch in bg.threadedBatchGenerator(getNextSpecBatch(path)):

            #predict
            p = test_net(spec_batch)

            #stack predictions
            if len(predictions):
                predictions = np.vstack([predictions, p])
            else:
                predictions = p

        #prediction pooling
        p_pool = predictionPooling(predictions)

        #get class labels for predictions
        p_labels = {}
        for i in range(p_pool.shape[0]):
            p_labels[CLASSES[i]] = p_pool[i]

        #sort by confidence
        p_sorted = sorted(p_labels.items(), key=operator.itemgetter(1), reverse=True)[:MAX_PREDICTIONS]

        #calculate avg precision (reciprocal rank of the correct label)
        found = False
        for i in range(len(p_sorted)):
            if p_sorted[i][0] == GT[path]:
                pr.append(1.0 / float(i + 1))
                acc.append(1 if i == 0 else 0)
                found = True
                break

        #count a miss if the correct label is not among the top predictions
        if not found:
            pr.append(0.0)
            acc.append(0)

        print 'LABEL:', p_sorted[0], 'AVGP:', pr[-1]

    except KeyboardInterrupt:
        break
    except:
        print "ERROR"
        #pr.append(0.0)
        traceback.print_exc()
        ecnt += 1
        continue

    pcnt += 1

print "TESTING DONE!"
print "ERRORS:", ecnt, "/", pcnt - 1
print "MAP:", np.mean(pr)
print "ACCURACY:", np.mean(acc)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Acoustic Event Classification Using Convolutional Neural Networks
By [Stefan Kahl](http://medien.informatik.tu-chemnitz.de/skahl/about/), [Hussein Hussein](https://www.tu-chemnitz.de/informatik/HomePages/Medieninformatik/team.php.en), [Etienne Fabian](https://www.intenta.de/en/home.html), [Jan Schloßhauer](https://www.intenta.de/en/home.html), [Danny Kowerko](https://www.tu-chemnitz.de/informatik/mc/staff.php.en), and [Maximilian Eibl](https://www.tu-chemnitz.de/informatik/HomePages/Medieninformatik/team.php.en)

## Introduction
This code repo complements our submission to the [INFORMATIK 2017 Workshop WS34](https://informatik2017.de/ws34-dlhd/). It is a refined version of our original code described in the paper: we added comments, removed some of the boilerplate code, and added testing functionality. If you have any questions or problems running the scripts, don't hesitate to contact us.

Contact: [Stefan Kahl](http://medien.informatik.tu-chemnitz.de/skahl/about/), [Technische Universität Chemnitz](https://www.tu-chemnitz.de/index.html.en), [Media Informatics](https://www.tu-chemnitz.de/informatik/Medieninformatik/index.php.en)

E-Mail: stefan.kahl@informatik.tu-chemnitz.de

This project is licensed under the terms of the MIT license.

Please cite the paper in your publications if it helps your research.

You can download the submission here: [2017_INFORMATIK_AED_CNN.pdf](https://box.tu-chemnitz.de/index.php/s/sfW010bbLEsP4Kw) (Unpublished draft version)

## Installation
This is a Theano/Lasagne implementation in Python for the classification of acoustic events based on deep features. The code was tested on Ubuntu 14.04 LTS but should work with other distributions as well.

First, you need to install Python 2.7 and the CUDA-Toolkit for GPU acceleration. After that, you can clone the project and run the Python package tool PIP to install most of the relevant dependencies:

```
git clone https://github.com/kahst/AcousticEventDetection.git
cd AcousticEventDetection
sudo pip install -r requirements.txt
```

We use OpenCV for image processing; you can install the cv2 package for Python by running this command:

```
sudo apt-get install python-opencv
```

Finally, you need to install Theano and Lasagne:
```
sudo pip install -r https://raw.githubusercontent.com/Lasagne/Lasagne/master/requirements.txt
sudo pip install https://github.com/Lasagne/Lasagne/archive/master.zip
```

You should follow the Lasagne installation instructions for more details:
http://lasagne.readthedocs.io/en/latest/user/installation.html

## Training
In order to train a model based on your own dataset or any other publicly available dataset (e.g. [UrbanSound8K](https://serv.cusp.nyu.edu/projects/urbansounddataset/index.html)), you need to follow some simple steps: First, organize your dataset with subfolders as class labels. Second, extract spectrograms from all audio files using the script AED_spec.py. After that, you are ready to train your model with AED_train.py. Finally, you can either evaluate a model using AED_eval.py or make predictions for any sound file using AED_test.py.
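
In short, the whole pipeline boils down to three script calls (a sketch; the paths follow the defaults used by the scripts):

```
python AED_spec.py   # 1) extract spectrograms from dataset/train/wav/
python AED_train.py  # 2) train a model on the extracted spectrograms
python AED_eval.py   # 3) evaluate the trained model on dataset/test/
```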

### Dataset
The training script uses subfolders as class names, so you should provide the following directory structure:

```
dataset
│
├───event1
│   │   file011.wav
│   │   file012.wav
│   │   ...
│
├───event2
│   │   file021.wav
│   │   file022.wav
│   │   ...
│
├───...
```
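
The training script later derives the class labels directly from these folder names; e.g., this sketch mirrors what parseDataset() in AED_train.py does:

```
import os
classes = sorted(os.listdir('dataset/train/spec/'))
```
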
### Extracting Spectrograms
We decided to use magnitude spectrograms with a resolution of 512x256 pixels, each representing a three-second chunk of the audio signal. You can generate spectrograms for your sorted dataset with the script AED_spec.py. You can switch to different settings for the spectrograms by editing the file.

Extracting spectrograms might take a while. Eventually, you should end up with a directory containing subfolders named after the acoustic events, which we will use as class names during training.
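
You can also extract spectrograms for a single file from Python; this minimal sketch (input path and output file names are just examples) uses the same function AED_spec.py uses internally:

```
import cv2
import AED_spec as spectrogram

#extract 3-second spectrograms with 2 seconds of overlap (the script defaults)
for i, spec in enumerate(spectrogram.getMultiSpec('dataset/schreien_scream.wav')):
    cv2.imwrite('spec_' + str(i) + '.png', spec * 255.0)
```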

### Training a Model
You can train your own model using either publicly available training data or your own sound recordings. All you need are spectrograms of the recordings. Before training, you should review the following settings, which you can find in the AED_train.py file (see the configuration sketch below):

- `DATASET_PATH` containing the spectrograms (subfolders as class names).

- `MIN_SAMPLES_PER_CLASS` increasing the number of spectrograms per acoustic event class in order to counter class imbalances (Default = -1).

- `MAX_SAMPLES_PER_CLASS` limiting the number of spectrograms per acoustic event class (Default = None - No limit).

- `VAL_SPLIT` which defines the fraction of spectrograms you would like to use for monitoring the training process (Default = 0.1).

- `MULTI_LABEL` to switch between softmax outputs (False) or sigmoid outputs (True); True also activates batch augmentation (multiple targets per spec).

- `IM_SIZE` defining the size of input images; spectrograms will be scaled accordingly (Default = 512x256 pixels).

- `IM_AUGMENTATION` selecting different techniques for dataset augmentation.

- `BATCH_SIZE` defining the number of images per batch; reduce the batch size to fit the model into less GPU memory (Default = 32).

- `LEARNING_RATE` for scheduling the learning rate; use `LR_DESCENT = True` for linear interpolation and `LR_DESCENT = False` for steps.

- `PRETRAINED_MODEL` if you want to use a pickle file of a previously trained model; set `LOAD_OUTPUT_LAYER = False` if the model output size differs (you can download a pre-trained model [here](https://box.tu-chemnitz.de/index.php/s/8vkQqXbUjVWlt5m)).

- `SNAPSHOT_EPOCHS` in order to continuously save model snapshots; select `[-1]` to save after every epoch; the best model params will be saved automatically after training.

There are a lot more options - most should be self-explanatory. If you have any questions regarding the settings or the training process, feel free to contact us.
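
A configuration for a first run might look like this sketch (values taken from the defaults in AED_train.py; adjust them to your dataset):

```
DATASET_PATH = 'dataset/train/spec/'
MIN_SAMPLES_PER_CLASS = -1                #no oversampling
MAX_SAMPLES_PER_CLASS = None              #no limit
VAL_SPLIT = 0.1                           #10% of the spectrograms for validation
IM_SIZE = (512, 256)                      #(width, height)
BATCH_SIZE = 32
LEARNING_RATE = {0: 0.001, 55: 0.000001}  #epoch:lr
LR_DESCENT = True                         #interpolate between scheduled learning rates
```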

Note: In order to keep results reproducible with fixed random seeds, you need to update your .theanorc file with the following lines:

```
[dnn.conv]
algo_bwd_filter=deterministic
algo_bwd_data=deterministic
```

Depending on your GPU, training might take a while...

## Evaluation
After training, you can test models and evaluate them on your local validation split. To do so, you need to adjust the settings in AED_eval.py to match your task. The most important settings are:

- `TEST_DIR` defining the path to your test data. Again, we use subfolders as class labels (Ground Truth).

- `TRAINED_MODEL` where you specify the pickle file of your pre-trained model and the corresponding model architecture.

- `SPEC_LENGTH` and `SPEC_OVERLAP` which you should choose according to your training data. Increasing the overlap might reduce the prediction error due to more predictions per file.

- `CONFIDENCE_THRESHOLD` and `MAX_PREDICTIONS` can be used to limit the number of predictions returned.

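AED_eval.py takes no command line arguments; once the settings match your setup, you simply run:

```
python AED_eval.py
```
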
Note: Test data should be organized like the training data, with subfolders as class names. Feel free to use different ground truth annotations; all you need to do is edit the script accordingly.

## Testing
If you want to make predictions for a single, unlabeled wav-file, you can call the script AED_test.py via the command shell. We provide some example files in the dataset folder. You can use this script as is; no training required. Simply follow these steps:

1. Download the pre-trained model:
```
sh model/fetch_model.sh
```
2. Execute the script:
```
python AED_test.py --filenames 'dataset/schreien_scream.wav' --modelname 'AED_Example_Run_model.pkl' --overlap 4 --results 5 --confidence 0.01
```
If everything goes well, you should see an output just like this:

```
HANDLING IMPORTS... DONE!
IMPORTING MODEL... DONE!
COMPILING THEANO TEST FUNCTION... DONE! ( 2 s )
TESTING: dataset/schreien_scream.wav
SAMPLE RATE: 44100 TOP PREDICTION(S):
        schreien 99 %
PREDICTION FOR 4 SPECS TOOK 57 ms ( 14 ms/spec )
```
Note: You do not need to specify values for overlap, results and confidence; those are optional. You can define a list of wav-files for prediction. To do so, run the script using `--filenames ['file1.wav', 'file2.wav', ...]`.

This repo might not suffice for real-world applications, but you should be able to adapt the testing script to your specific needs.

We will keep this repo updated and will provide more testing functionality in the future.

--------------------------------------------------------------------------------
/AED_train.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

print "HANDLING IMPORTS..."

import sys
import os
import time
import operator
import math

import numpy as np
import matplotlib.pyplot as plt
import cv2

from scipy import interpolate
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
import itertools

import pickle

import theano
import theano.tensor as T

from lasagne import random as lasagne_random
from lasagne import layers as l
from lasagne import nonlinearities
from lasagne import init
from lasagne import objectives
from lasagne import updates
from lasagne import regularization

from utils import batch_generator as bg

print "...DONE!"

sys.setrecursionlimit(10000)
######################## CONFIG #########################
#Fixed random seed
RANDOM_SEED = 1337
RANDOM = np.random.RandomState(RANDOM_SEED)
lasagne_random.set_rng(RANDOM)

#Dataset params
DATASET_PATH = 'dataset/train/spec/'
MIN_SAMPLES_PER_CLASS = -1
MAX_SAMPLES_PER_CLASS = None
SORT_CLASSES_ALPHABETICALLY = True
VAL_SPLIT = 0.1
USE_CACHE = False

#Multi-Label Params
MULTI_LABEL = False
VAL_HAS_MULTI_LABEL = False
MEAN_TARGETS_PER_IMAGE = 3

#Image params
IM_SIZE = (512, 256) #(width, height)
IM_DIM = 1
IM_AUGMENTATION = {#'type':[probability, value]
                   'roll':[0.5, (0.0, 0.05)],
                   #'noise':[0.1, 0.01],
                   #'brightness':[0.5, (0.25, 1.25)],
                   #'crop':[0.5, 0.07],
                   #'flip': [0.25, 1]
                  }

#General model params
DROPOUT = 0.5
NONLINEARITY = nonlinearities.rectify
INIT_GAIN = math.sqrt(2)

#Training params
BATCH_SIZE = 32
LEARNING_RATE = {0:0.001, 55:0.000001} #epoch:lr
LR_DESCENT = True
L2_WEIGHT = 0 #1e-4
OPTIMIZER = 'adam' #'adam' or 'nesterov'
EPOCHS = 55
RANDOMIZE_TRAIN_SET = True

#Confusion matrix params
CONFMATRIX_MAX_CLASSES = 20
NORMALIZE_CONFMATRIX = True

#Model import/export params
MODEL_PATH = 'model/'
PRETRAINED_MODEL = None #'pretrained_model.pkl'
LOAD_OUTPUT_LAYER = True
EPOCH_START = 1
RUN_NAME = 'Example_Run'
SIMPLE_LOG_MODE = True
SNAPSHOT_EPOCHS = [10, 20, 30, 40, 50] #[-1] saves after every epoch
SAVE_AFTER_INTERRUPT = True

################### DATASET HANDLING ####################
def parseDataset():

    #we use subfolders as class labels
    classes = [folder for folder in sorted(os.listdir(DATASET_PATH))]
    if not SORT_CLASSES_ALPHABETICALLY:
        classes = shuffle(classes, random_state=RANDOM)

    #now we enlist all image paths for each class
    images = []
    sample_count = {}
    for c in classes:
        c_images = [os.path.join(DATASET_PATH, c, path) for path in os.listdir(os.path.join(DATASET_PATH, c))][:MAX_SAMPLES_PER_CLASS]
        sample_count[c] = len(c_images)
        images += c_images

        #Do we want to correct class imbalance?
        #This will affect validation scores as we use some samples in TRAIN and VAL
        while sample_count[c] < MIN_SAMPLES_PER_CLASS:
            images += [c_images[RANDOM.randint(0, len(c_images))]]
            sample_count[c] += 1

    #shuffle image paths
    images = shuffle(images, random_state=RANDOM)

    #validation split
    vsplit = int(len(images) * VAL_SPLIT)
    train = images[:-vsplit]
    val = images[-vsplit:]

    #show classes if needed for testing
    #print classes

    #show some stats
    print "CLASSES:", len(classes)
    print "CLASS LABELS:", sorted(sample_count.items(), key=operator.itemgetter(1))
    print "TRAINING IMAGES:", len(train)
    print "VALIDATION IMAGES:", len(val)

    return classes, train, val

#parse dataset
CLASSES, TRAIN, VAL = parseDataset()
NUM_CLASSES = len(CLASSES)
#################### BATCH HANDLING #####################
CACHE = {}
def openImage(path, useCache=USE_CACHE):

    global CACHE

    #using a dict {path:image} cache saves some time after first epoch
    #but may consume a lot of RAM
    if path in CACHE:
        return CACHE[path]
    else:

        #open image
        img = cv2.imread(path)

        #missing or broken file?
        if img is None:
            print "IMAGE NONE-TYPE:", path
            return None

        #color image, but grayscale input expected?
        if len(img.shape) == 3 and IM_DIM == 1:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        #grayscale image, but color input expected?
        elif len(img.shape) == 2 and IM_DIM == 3:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

        #resize to conv input size
        img = cv2.resize(img, (IM_SIZE[0], IM_SIZE[1]))

        #convert to floats between 0 and 1
        img = np.asarray(img / 255., dtype='float32')

        if useCache:
            CACHE[path] = img
        return img

def imageAugmentation(img):

    AUG = IM_AUGMENTATION

    #Random Crop (without padding)
    if 'crop' in AUG and RANDOM.choice([True, False], p=[AUG['crop'][0], 1 - AUG['crop'][0]]):
        h, w = img.shape[:2]
        cropw = RANDOM.randint(1, int(float(w) * AUG['crop'][1]))
        croph = RANDOM.randint(1, int(float(h) * AUG['crop'][1]))
        img = img[croph:-croph, cropw:-cropw]
        img = cv2.resize(img, (IM_SIZE[0], IM_SIZE[1]))

    #Flip - 1 = Horizontal, 0 = Vertical
    if 'flip' in AUG and RANDOM.choice([True, False], p=[AUG['flip'][0], 1 - AUG['flip'][0]]):
        img = cv2.flip(img, AUG['flip'][1])

    #Wrap shift (roll up/down and left/right)
    if 'roll' in AUG and RANDOM.choice([True, False], p=[AUG['roll'][0], 1 - AUG['roll'][0]]):
        img = np.roll(img, int(img.shape[0] * (RANDOM.uniform(-AUG['roll'][1][1], AUG['roll'][1][1]))), axis=0)
        img = np.roll(img, int(img.shape[1] * (RANDOM.uniform(-AUG['roll'][1][0], AUG['roll'][1][0]))), axis=1)

    #substract/add mean
    if 'mean' in AUG and RANDOM.choice([True, False], p=[AUG['mean'][0], 1 - AUG['mean'][0]]):
        img += np.mean(img) * AUG['mean'][1]

    #gaussian noise
    if 'noise' in AUG and RANDOM.choice([True, False], p=[AUG['noise'][0], 1 - AUG['noise'][0]]):
        img += RANDOM.normal(0.0, RANDOM.uniform(0, AUG['noise'][1]**0.5), img.shape)
        img = np.clip(img, 0.0, 1.0)

    #adjust brightness
    if 'brightness' in AUG and RANDOM.choice([True, False], p=[AUG['brightness'][0], 1 - AUG['brightness'][0]]):
        img *= RANDOM.uniform(AUG['brightness'][1][0], AUG['brightness'][1][1])
        img = np.clip(img, 0.0, 1.0)

    #show
    #cv2.imshow("AUG", img)#.reshape(IM_SIZE[1], IM_SIZE[0], IM_DIM))
    #cv2.waitKey(-1)

    return img

def loadImageAndTarget(path, doAugmentation=True):

    #here we open the image
    img = openImage(path)

    #image augmentation?
    if IM_AUGMENTATION != None and doAugmentation:
        img = imageAugmentation(img)

    #we want to use subfolders as class labels
    label = path.split("/")[-2]

    #we need to get the index of our label from CLASSES
    index = CLASSES.index(label)

    #allocate array for target
    target = np.zeros((NUM_CLASSES), dtype='float32')

    #we set our target array = 1.0 at our label index, all other entries remain 0.0
    target[index] = 1.0

    #transpose image if dim=3
    try:
        img = np.transpose(img, (2, 0, 1))
    except:
        pass

    #we need a 4D-vector for our image and a 2D-vector for our targets
    img = img.reshape(-1, IM_DIM, IM_SIZE[1], IM_SIZE[0])
    target = target.reshape(-1, NUM_CLASSES)

    return img, target

def getAugmentedBatches(x, y):

    #augment batch until desired number of target labels per image is reached
    while np.mean(np.sum(y, axis=1)) < MEAN_TARGETS_PER_IMAGE:

        #get two images to combine (we try to prevent i == j (which could result in infinite loops) with excluding ranges)
        i = RANDOM.choice(range(1, x.shape[0] - 1))
        j = RANDOM.choice(range(0, i) + range(i + 1, x.shape[0]))

        #add images
        x[i] += x[j]

        #re-normalize new image
        x[i] -= x[i].min(axis=None)
        x[i] /= x[i].max(axis=None)

        #combine targets (makes this task a multi-label classification!)
        y[i] = np.logical_or(y[i], y[j])

        #TODO: We still might end up in an infinite loop
        #and should add a break in case something is fishy

        #show
        #cv2.imshow("BA", x[i].reshape(IM_SIZE[1], IM_SIZE[0], IM_DIM))
        #cv2.waitKey(-1)

    return x, y

def getDatasetChunk(split):

    #get batch-sized chunks of image paths
    for i in xrange(0, len(split), BATCH_SIZE):
        yield split[i:i+BATCH_SIZE]

def getNextImageBatch(split=TRAIN, doAugmentation=True, batchAugmentation=MULTI_LABEL):

    #fill batch
    for chunk in getDatasetChunk(split):

        #allocate numpy arrays for image data and targets
        x_b = np.zeros((BATCH_SIZE, IM_DIM, IM_SIZE[1], IM_SIZE[0]), dtype='float32')
        y_b = np.zeros((BATCH_SIZE, NUM_CLASSES), dtype='float32')

        ib = 0
        for path in chunk:

            try:

                #load image data and class label from path
                x, y = loadImageAndTarget(path, doAugmentation)

                #pack into batch array
                x_b[ib] = x
                y_b[ib] = y
                ib += 1

            except:
                continue

        #trim to actual size
        x_b = x_b[:ib]
        y_b = y_b[:ib]

        #batch augmentation?
        if batchAugmentation and x_b.shape[0] >= BATCH_SIZE // 2:
            x_b, y_b = getAugmentedBatches(x_b, y_b)

        #instead of return, we use yield
        yield x_b, y_b

################## BUILDING THE MODEL ###################
def buildModel():

    print "BUILDING MODEL TYPE..."

    #default settings
    filters = 64
    first_stride = 2
    last_filter_multiplier = 16

    #input layer
    net = l.InputLayer((None, IM_DIM, IM_SIZE[1], IM_SIZE[0]))

    #conv layers
    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters, filter_size=7, pad='same', stride=first_stride, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 2, filter_size=5, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 4, filter_size=3, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 8, filter_size=3, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * last_filter_multiplier, filter_size=3, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    print "\tFINAL POOL OUT SHAPE:", l.get_output_shape(net)

    #dense layers
    net = l.batch_norm(l.DenseLayer(net, 512, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.DropoutLayer(net, DROPOUT)
    net = l.batch_norm(l.DenseLayer(net, 512, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.DropoutLayer(net, DROPOUT)

    #Classification Layer
    if MULTI_LABEL:
        net = l.DenseLayer(net, NUM_CLASSES, nonlinearity=nonlinearities.sigmoid, W=init.HeNormal(gain=1))
    else:
        net = l.DenseLayer(net, NUM_CLASSES, nonlinearity=nonlinearities.softmax, W=init.HeNormal(gain=1))

    print "...DONE!"

    #model stats
    print "MODEL HAS", (sum(hasattr(layer, 'W') for layer in l.get_all_layers(net))), "WEIGHTED LAYERS"
    print "MODEL HAS", l.count_params(net), "PARAMS"

    return net

NET = buildModel()

################## MODEL SAVE/LOAD ####################
BEST_MODEL = None
BEST_EPOCH = 0
def saveModel(epoch, model=None):
    print "EXPORTING MODEL...",
    if model is None:
        model = NET
    net_filename = MODEL_PATH + "AED_" + RUN_NAME + "_model_epoch_" + str(epoch) + ".pkl"
    if not os.path.exists(MODEL_PATH):
        os.makedirs(MODEL_PATH)
    with open(net_filename, 'wb') as f:

        #We want to save the model architecture with all params and trained classes
        data = {'net': model, 'classes':CLASSES, 'run_name': RUN_NAME, 'epoch':epoch, 'im_size':IM_SIZE, 'im_dim':IM_DIM}
        pickle.dump(data, f)

    print "DONE!"

def loadModel(filename):
    print "IMPORTING MODEL PARAMS...",
    net_filename = MODEL_PATH + filename

    with open(net_filename, 'rb') as f:
        data = pickle.load(f)

    #for training, we only want to load the model params
    net = data['net']
    params = l.get_all_param_values(net)
    if LOAD_OUTPUT_LAYER:
        l.set_all_param_values(NET, params)
    else:
        l.set_all_param_values(l.get_all_layers(NET)[:-1], params[:-2])

    print "DONE!"

if PRETRAINED_MODEL is not None:
    loadModel(PRETRAINED_MODEL)

#################### LOSS FUNCTION ######################
def calc_loss(prediction, targets):

    #categorical crossentropy is the best choice for a multi-class softmax output
    loss = T.mean(objectives.categorical_crossentropy(prediction, targets))

    return loss

def calc_loss_multi(prediction, targets):

    #we need to clip predictions when calculating the log-loss
    prediction = T.clip(prediction, 0.0000001, 0.9999999)

    #binary crossentropy is the best choice for a multi-label sigmoid output
    loss = T.mean(objectives.binary_crossentropy(prediction, targets))

    return loss

#theano variable for the class targets
targets = T.matrix('targets', dtype=theano.config.floatX)

#get the network output
prediction = l.get_output(NET)

#we use L2 Norm for regularization
l2_reg = regularization.regularize_layer_params(NET, regularization.l2) * L2_WEIGHT

#calculate the loss
if MULTI_LABEL:
    loss = calc_loss_multi(prediction, targets) + l2_reg
else:
    loss = calc_loss(prediction, targets) + l2_reg

################# ACCURACY FUNCTION #####################
def calc_accuracy(prediction, targets):

    #we can use the lasagne objective categorical_accuracy to determine the top1 single label accuracy
    a = T.mean(objectives.categorical_accuracy(prediction, targets, top_k=1))

    return a

def calc_accuracy_multi(prediction, targets):

    #we can use the lasagne objective binary_accuracy to determine the multi label accuracy
    a = T.mean(objectives.binary_accuracy(prediction, targets))

    return a

#calculate accuracy
if MULTI_LABEL and VAL_HAS_MULTI_LABEL:
    accuracy = calc_accuracy_multi(prediction, targets)
else:
    accuracy = calc_accuracy(prediction, targets)

####################### UPDATES #########################
#we use dynamic learning rates which change after some epochs
lr_dynamic = T.scalar(name='learning_rate')

#get all trainable parameters (weights) of our net
params = l.get_all_params(NET, trainable=True)

#we use the adam update
if OPTIMIZER == 'adam':
    param_updates = updates.adam(loss, params, learning_rate=lr_dynamic, beta1=0.5)
elif OPTIMIZER == 'nesterov':
    param_updates = updates.nesterov_momentum(loss, params, learning_rate=lr_dynamic, momentum=0.9)

#################### TRAIN FUNCTION ######################
#the theano train function takes images and class targets as input
print "COMPILING THEANO TRAIN FUNCTION...",
start = time.time()
train_net = theano.function([l.get_all_layers(NET)[0].input_var, targets, lr_dynamic], loss, updates=param_updates)
print "DONE! (", int(time.time() - start), "s )"

################# PREDICTION FUNCTION ####################
#we need the prediction function to calculate the validation accuracy
#this way we can test the net during/after training
net_output = l.get_output(NET, deterministic=True)

print "COMPILING THEANO TEST FUNCTION...",
start = time.time()
test_net = theano.function([l.get_all_layers(NET)[0].input_var, targets], [net_output, loss, accuracy])
print "DONE! (", int(time.time() - start), "s )"

################## CONFUSION MATRIX #####################
cmatrix = []
def clearConfusionMatrix():

    global cmatrix

    #allocate empty matrix
    cmatrix = np.zeros((NUM_CLASSES, NUM_CLASSES), dtype='int32')

def updateConfusionMatrix(p, t):

    global cmatrix

    #get class indices for prediction and target
    targets = np.argmax(t, axis=1)
    predictions = np.argmax(p, axis=1)

    #add up confusion matrices of validation batches
    cmatrix += confusion_matrix(targets, predictions, labels=range(0, NUM_CLASSES))

def showConfusionMatrix(epoch):

    #new figure
    plt.figure(0, figsize=(35, 35), dpi=72)
    plt.clf()

    #get additional metrics
    pr, re, f1 = calculateMetrics()

    #normalize?
    if NORMALIZE_CONFMATRIX:
        global cmatrix
        cmatrix = np.around(cmatrix.astype('float') / cmatrix.sum(axis=1)[:, np.newaxis] * 100.0, decimals=1)

    #show matrix
    plt.imshow(cmatrix[:CONFMATRIX_MAX_CLASSES, :CONFMATRIX_MAX_CLASSES], interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix\n' +
              RUN_NAME + ' - Epoch ' + str(epoch) +
              '\nTrain Samples: ' + str(len(TRAIN)) + ' Validation Samples: ' + str(len(VAL)) +
              '\nmP: ' + str(np.mean(pr)) + ' mF1: ' + str(np.mean(f1)), fontsize=32)

    #tick marks
    tick_marks = np.arange(min(CONFMATRIX_MAX_CLASSES, NUM_CLASSES))
    plt.xticks(tick_marks, CLASSES[:CONFMATRIX_MAX_CLASSES], rotation=90)
    plt.yticks(tick_marks, CLASSES[:CONFMATRIX_MAX_CLASSES])

    #labels
    thresh = cmatrix.max() / 2.
    for i, j in itertools.product(range(min(CONFMATRIX_MAX_CLASSES, cmatrix.shape[0])), range(min(CONFMATRIX_MAX_CLASSES, cmatrix.shape[1]))):
        plt.text(j, i, cmatrix[i, j],
                 horizontalalignment="center", verticalalignment="center",
                 color="white" if cmatrix[i, j] > thresh else "black", fontsize=32)

    #axes labels
    plt.tight_layout()
    plt.ylabel('Target label', fontsize=32)
    plt.xlabel('Predicted label', fontsize=32)

    #fontsize
    plt.rc('font', size=32)

    #save plot
    if not os.path.exists('confmatrix'):
        os.makedirs('confmatrix')
    plt.savefig('confmatrix/' + RUN_NAME + '_' + str(epoch) + '.png')

def calculateMetrics():

    #allocate arrays
    pr = []
    re = []
    f1 = []

    #parse rows and columns of confusion matrix
    #confusion_matrix(targets, predictions): rows = true labels, columns = predicted labels
    for i in range(0, cmatrix.shape[0]):

        #true positives, false positives, false negatives
        tp = float(cmatrix[i][i])
        fp = float(np.sum(cmatrix, axis=0)[i] - tp) #column sum = all samples predicted as class i
        fn = float(np.sum(cmatrix, axis=1)[i] - tp) #row sum = all samples with true class i

        #precision
        if tp > 0 or fp > 0:
            p = tp / (tp + fp)
        else:
            p = 0
        pr.append(p)

        #recall
        if tp > 0 or fn > 0:
            r = tp / (tp + fn)
        else:
            r = 0
        re.append(r)

        #f1 measure
        if p > 0 or r > 0:
            f = 2 * ((p * r) / (p + r))
        else:
            f = 0
        f1.append(f)

    return pr, re, f1

###################### PROGRESS #########################
batches_per_epoch = len(TRAIN + VAL) // BATCH_SIZE + 1
avg_duration = []
last_update = -1
def showProgress(stat, duration, current, end=batches_per_epoch, update_interval=5, simple_mode=False):

    #epochs might take a lot of time, so we want some kind of progress bar
    #this approach is not very sophisticated, but it does the job :)
    #you should use simple_mode=True if run with IDLE and simple_mode=False if run on command line

    global avg_duration
    global last_update

    #time left
    avg_duration.append(duration)
    avg_duration = avg_duration[-10:]
    r = int(abs(end - current) * np.mean(avg_duration) / 60) + 1

    #percentage
    p = int(current / float(end) * 100)
    progress = ""
    for s in xrange(update_interval, 100, update_interval):
        if s <= p:
            progress += "="
        else:
            progress += " "

    #status line
    if p > last_update and p % update_interval == 0 or last_update == -1:
        if simple_mode:
            if current == 1:
                print stat.upper() + ": [",
            else:
                print "=",
            if current == end:
                print "]",
        else:
            print stat.upper() + ": [" + progress + "] BATCHES " + str(current) + "/" + str(end) + " (" + str(p) + "%) - " + str(r) + " min REMAINING\r",
        last_update = p

###################### TRAINING #########################
print "START TRAINING..."
train_loss = []
val_loss = []
val_accuracy = []
max_acc = -1
lr = LEARNING_RATE[min(LEARNING_RATE.keys())]
SAVE_MODEL_AFTER_TRAINING = True

#train for some epochs...
for epoch in range(EPOCH_START, EPOCHS + 1):

    try:

        #start timer
        start = time.time()

        #reset confusion matrix
        clearConfusionMatrix()

        #adjust learning rate (interpolate or steps)
        if LR_DESCENT:

            #process the schedule in sorted key order (dict order is not guaranteed)
            lr_epochs = sorted(LEARNING_RATE.keys())
            lr_keys = np.array(lr_epochs + [EPOCHS], dtype='float32')
            lr_values = np.array([LEARNING_RATE[e] for e in lr_epochs] + [LEARNING_RATE[lr_epochs[-1]]], dtype='float32')
            lr_func = interpolate.interp1d(lr_keys, lr_values, kind='linear')
            lr = np.float32(lr_func(max(lr_epochs[0], epoch - 1)))
        else:
            if epoch in LEARNING_RATE:
                lr = LEARNING_RATE[epoch]

        #shuffle dataset (this way we get "new" batches every epoch)
        if RANDOMIZE_TRAIN_SET:
            TRAIN = shuffle(TRAIN, random_state=RANDOM)

        #time
        bstart = time.time()
        last_update = -1

        #iterate over train split batches and calculate mean loss for epoch
        t_l = []
        bcnt = 0
        for image_batch, target_batch in bg.threadedBatchGenerator(getNextImageBatch()):

            #calling the training functions returns the current loss
            loss = train_net(image_batch, target_batch, lr)
            t_l.append(loss)

            bcnt += 1

            #show progress
            showProgress("EPOCH " + str(epoch), (time.time() - bstart), bcnt, simple_mode=SIMPLE_LOG_MODE)
            bstart = time.time()

        #we validate our net every epoch and pass our validation split through as well
        v_l = []
        v_a = []
        for image_batch, target_batch in bg.threadedBatchGenerator(getNextImageBatch(VAL, False, VAL_HAS_MULTI_LABEL)):

            #calling the test function returns the net output, loss and accuracy
            prediction_batch, loss, acc = test_net(image_batch, target_batch)
            v_l.append(loss)
            v_a.append(acc)

            #save predictions and targets for confusion matrix
            updateConfusionMatrix(prediction_batch, target_batch)

            bcnt += 1

            #show progress
            showProgress("EPOCH " + str(epoch), (time.time() - bstart), bcnt, simple_mode=SIMPLE_LOG_MODE)
            bstart = time.time()

        #stop timer
        end = time.time()

        #calculate stats for epoch
        train_loss.append(np.mean(t_l))
        val_loss.append(np.mean(v_l))
        val_accuracy.append(np.mean(v_a))

        #print stats for epoch
        print "TRAIN LOSS:", train_loss[-1],
        print "VAL LOSS:", val_loss[-1],
        print "VAL ACCURACY:", (int(val_accuracy[-1] * 1000) / 10.0), "%",
        print "LR:", lr,
        print "TIME:", (int((end - start) * 10) / 10.0), "s"

        #log max accuracy and save best params
        acc = (int(val_accuracy[-1] * 1000) / 10.0)
        if acc >= max_acc:
            max_acc = acc
            BEST_MODEL = NET
            BEST_EPOCH = epoch

        #show confusion matrix
        showConfusionMatrix(epoch)

        #save snapshot?
        if epoch in SNAPSHOT_EPOCHS or SNAPSHOT_EPOCHS[0] == -1:
            saveModel(epoch)

    except KeyboardInterrupt:
        SAVE_MODEL_AFTER_TRAINING = SAVE_AFTER_INTERRUPT
        break

print "TRAINING DONE!"
print "MAX ACC: ", max_acc

#save best model params
if SAVE_MODEL_AFTER_TRAINING:
    saveModel(BEST_EPOCH, BEST_MODEL)
--------------------------------------------------------------------------------