├── .dockerignore
├── pretrained
│   ├── musan-full-20201031-tf
│   │   ├── saved_model.pb
│   │   └── variables
│   │       ├── variables.index
│   │       └── variables.data-00000-of-00001
│   └── musan-balanced-20200330-tf
│       ├── saved_model.pb
│       └── variables
│           ├── variables.index
│           ├── variables.data-00000-of-00002
│           └── variables.data-00001-of-00002
├── requirements.txt
├── Dockerfile
├── reader.py
├── LICENSE
├── .gitignore
├── writer.py
├── smoothing.py
├── README.md
├── classifier.py
├── run.py
├── feature.py
└── evaluation.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | saved_models/**
2 | 
--------------------------------------------------------------------------------
/pretrained/musan-full-20201031-tf/saved_model.pb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-full-20201031-tf/saved_model.pb
--------------------------------------------------------------------------------
/pretrained/musan-balanced-20200330-tf/saved_model.pb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-balanced-20200330-tf/saved_model.pb
--------------------------------------------------------------------------------
/pretrained/musan-full-20201031-tf/variables/variables.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-full-20201031-tf/variables/variables.index
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | librosa==0.7.2
2 | numba==0.48
3 | numpy<1.19.0
4 | scipy>=1.4.1
5 | scikit-learn>=0.22.1
6 | ffmpeg-python==0.2.0
7 | sphfile>=1.0.0
8 | beautifulsoup4~=4.9
9 | tensorflow==2.0.3
10 | 
--------------------------------------------------------------------------------
/pretrained/musan-balanced-20200330-tf/variables/variables.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-balanced-20200330-tf/variables/variables.index
--------------------------------------------------------------------------------
/pretrained/musan-full-20201031-tf/variables/variables.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-full-20201031-tf/variables/variables.data-00000-of-00001
--------------------------------------------------------------------------------
/pretrained/musan-balanced-20200330-tf/variables/variables.data-00000-of-00002:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-balanced-20200330-tf/variables/variables.data-00000-of-00002
--------------------------------------------------------------------------------
/pretrained/musan-balanced-20200330-tf/variables/variables.data-00001-of-00002:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-balanced-20200330-tf/variables/variables.data-00001-of-00002 -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.0.1 2 | 3 | RUN apt-get update && apt-get install -y libsndfile1 ffmpeg 4 | 5 | RUN useradd -d /segmenter -m segmenter && chown -R segmenter /segmenter 6 | USER segmenter 7 | COPY . /segmenter 8 | WORKDIR /segmenter 9 | RUN pip install -r requirements.txt 10 | 11 | CMD python run.py -s pretrained/$(ls pretrained/ | sort | tail -1) data > data/segmented.tsv 12 | -------------------------------------------------------------------------------- /reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | 5 | 6 | def read_sph(r, f): 7 | from sphfile import SPHFile; 8 | sph = SPHFile(os.path.join(r, f)) 9 | with tempfile.NamedTemporaryFile(prefix=f, delete=False) as tempf: 10 | sph.write_wav(tempf.name) 11 | return os.path.dirname(tempf.name), os.path.basename(tempf.name) 12 | 13 | 14 | def read_audios(data_dir, file_ext=['wav', 'mp3', 'sph'], file_per_dir=sys.maxsize): 15 | for r, ds, fs in os.walk(data_dir): 16 | for f in fs[:file_per_dir]: 17 | if f.split('.')[-1] in file_ext: 18 | if f.endswith('.sph'): 19 | yield read_sph(r, f) 20 | else: 21 | yield r, f 22 | 23 | 24 | # quick testing 25 | if __name__ == '__main__': 26 | files = read_audios(sys.argv[1]) 27 | while True: 28 | try: 29 | print(next(files)) 30 | except StopIteration: 31 | break 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Kyeongmin Rim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/** 2 | _models/** 3 | saved_models/** 4 | *.results.txt 5 | # linux 6 | *~ 7 | .directory # KDE directory preferences 8 | .Trash-* # Linux trash folder which might appear on any partition or disk 9 | 10 | # macos 11 | .DS_Store 12 | .AppleDouble 13 | .LSOverride 14 | Icon # Icon must end with two \r 15 | ._* # Thumbnails 16 | .DocumentRevisions-V100 # Files that might appear in the root of a volume 17 | .fseventsd 18 | .Spotlight-V100 19 | .TemporaryItems 20 | .Trashes 21 | .VolumeIcon.icns 22 | .AppleDB # Directories potentially created on remote AFP share 23 | .AppleDesktop 24 | Network Trash Folder 25 | Temporary Items 26 | .apdisk 27 | 28 | # windows 29 | Thumbs.db 30 | ehthumbs.db 31 | Desktop.ini 32 | $RECYCLE.BIN/ 33 | *.cab # Windows Installer files 34 | *.msi 35 | *.msm 36 | *.msp 37 | *.lnk # Windows shortcuts 38 | 39 | # idea 40 | .idea 41 | *.iml 42 | out 43 | gen 44 | 45 | # sqlite 46 | *.db 47 | *.sqlite3 48 | 49 | 50 | # java 51 | *.class 52 | .mtj.tmp/ # Mobile Tools for Java (J2ME) 53 | target/ # Package Files # 54 | *.jar 55 | *.war 56 | *.ear 57 | hs_err_pid* # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 58 | 59 | # python 60 | **/*.pyc 61 | **/__pycache__ 62 | 63 | # shared folders 64 | dropbox_shared*/ 65 | github_shared*/ 66 | gdrive_shared*/ 67 | 68 | # external references 69 | \!rel_*/ 70 | \!ref_*/ 71 | 72 | # ctag generated file 73 | tags 74 | .tags 75 | -------------------------------------------------------------------------------- /writer.py: -------------------------------------------------------------------------------- 1 | import ffmpeg 2 | import os 3 | import feature 4 | 5 | PRECISION = 1000 // feature.FRAME_SIZE 6 | 7 | def index_frames(predictions): 8 | speech = False 9 | segments = {} 10 | cur_speech_segment_started = 0 11 | for f_num, frame in enumerate(predictions): 12 | if speech and frame == 1: 13 | segments[cur_speech_segment_started] = f_num - 1 14 | speech = False 15 | elif not speech and frame == 0: 16 | cur_speech_segment_started = f_num 17 | speech = True 18 | if speech: 19 | segments[cur_speech_segment_started] = len(predictions) - 1 20 | return segments, len(predictions) 21 | 22 | 23 | def print_durations(indexed_speech_segements, input_audio_fname, total_len=None): 24 | speech_sum = 0 25 | print(input_audio_fname, end='\t', flush=True) 26 | for start, end in indexed_speech_segements.items(): 27 | print(f'{start / PRECISION}\t{end / PRECISION}', end='\t', flush=True) 28 | speech_sum += (end - start) 29 | 30 | if total_len is not None: 31 | print(f"speech_ratio: {(speech_sum / total_len):.2%} ({speech_sum} / {total_len})", end='', flush=True) 32 | print('', flush=True) 33 | 34 | 35 | def slice_speech(indexed_speech_segements, input_audio_fname): 36 | output_dirname = input_audio_fname[:-4] 37 | if not os.path.exists(output_dirname): 38 | os.makedirs(output_dirname) 39 | elif not os.path.isdir(output_dirname): 40 | raise IOError(f'{output_dirname} file exists and thus output directory cannot be created.') 41 | else: 42 | for f in os.listdir(output_dirname): 43 | os.remove(os.path.join(output_dirname, f)) 44 | 45 | for start, end in indexed_speech_segements.items(): 46 | start = start / PRECISION 47 | end = end / PRECISION 48 | output_fname = f'{output_dirname.split(os.sep)[-1]}.{str(start)}.wav' 49 | in_stream = 
ffmpeg.input(input_audio_fname, f=input_audio_fname[-3:], ss=start, t=end-start)
50 |         in_stream.output(os.path.join(output_dirname, output_fname)).run(overwrite_output=True)
51 | 
--------------------------------------------------------------------------------
/smoothing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def minimum_change_support(predictions: np.ndarray, minimum_window_size=300):
5 |     for i in range(1, len(predictions)):
6 |         cur_label = predictions[i]
7 |         minimum_window = predictions[max(0, i - minimum_window_size):i]
8 |         if cur_label != 0 and np.sum(minimum_window == cur_label) < (len(minimum_window) // 2):
9 |             predictions[i] = predictions[i - 1]
10 | 
11 | 
12 | def mode_smooth(predictions: np.ndarray, smooth_window=20):
13 |     from scipy import stats
14 |     for i in range(len(predictions)):
15 |         s = max(0, i - smooth_window)
16 |         e = min(len(predictions), i + 1 + smooth_window)
17 |         predictions[i] = stats.mode(predictions[s:e])[0]
18 | 
19 | 
20 | def trim_short_speech(predictions: np.ndarray, threshold=200):
21 |     i = 0
22 |     while i < len(predictions):
23 |         if predictions[i] == 0:
24 |             next_nonzeros = np.where(predictions[i:] == 1)[0]
25 |             if len(next_nonzeros) == 0:  # no more flips left
26 |                 break
27 |             speech_len = next_nonzeros[0]
28 |             # print(i, speech_len)
29 |             if speech_len < threshold:
30 |                 predictions[i:i + speech_len] = 1
31 |             i += speech_len
32 |         else:
33 |             i += 1
34 | 
35 | 
36 | def trim_short_noises(predictions: np.ndarray, threshold=300):
37 |     i = 0
38 |     cur = predictions[0]
39 |     while i < len(predictions):
40 |         if predictions[i] == 1:
41 |             next_speeches = np.where(predictions[i:] == 0)[0]
42 |             if len(next_speeches) == 0:  # no more flips left
43 |                 break
44 |             noise_len = next_speeches[0]
45 |             # print(i, noise_len)
46 |             if noise_len < threshold:
47 |                 predictions[i:i + noise_len] = 0
48 |             i += noise_len
49 |         else:
50 |             i += 1
51 | 
52 | 
53 | def smooth(predictions):
54 |     # assumes frame size to be a hundredth of a second (10ms)
55 |     # smoothing happens in-place
56 |     # mode_smooth(predictions)
57 |     # minimum_change_support(predictions)
58 |     trim_short_noises(predictions, threshold=100)
59 |     trim_short_speech(predictions)
60 |     return predictions
61 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Acoustic Classification & Segmentation
2 | 
3 | A simple audio segmenter that isolates the speech portions of audio streams. It uses a simple feedforward MLP for classification (implemented using `tensorflow`) and heuristic smoothing methods to increase the recall of speech segments.
4 | 
5 | 
6 | ## Requirements
7 | 
8 | * System packages: [`ffmpeg`](http://ffmpeg.org/download.html)
9 | * Python packages:
10 |   * `librosa`
11 |   * `tensorflow` or `tensorflow-gpu` `>=2.0.0`
12 |   * `numpy`
13 |   * `scipy`
14 |   * `scikit-learn`
15 |   * `ffmpeg-python`
16 | 
17 | ## Training
18 | 
19 | ### Pretrained model
20 | 
21 | We provide a [pretrained model](pretrained/). The model is trained on the [MUSAN corpus](https://www.openslr.org/17/), using binary labels (`speech` vs. `nonspeech`). The model is then serialized using the [`tensorflow::SavedModel` format](https://www.tensorflow.org/guide/keras/save_and_serialize#export_to_savedmodel). Because of the distribution bias in the corpus (there are many more speech recordings in the training data), we randomly resampled 10 ms frames from the speech examples to match their number to the negative examples. In doing so, the language distribution among the resampled speech examples was NOT deliberately balanced.
22 | 
23 | ### Training pipeline
24 | 
25 | To train your own model, invoke `run.py` with the `-t` flag and pass the directory where the training data is stored. You might also want to take a look at the `extract_all` function in [`feature.py`](feature.py) to change how the labels are read in, if you are using corpora other than MUSAN.
26 | 
27 | ## Segmentation
28 | 
29 | To run the segmenter over audio files, invoke `run.py` with the `-s` flag and pass 1) the model path (feel free to use the pretrained model if needed) and 2) the directory where the audio files are stored. Currently it will process all `mp3` and `wav` files in the target directory. If you want to process other types of audio files, add to or change the `file_ext` list in the `read_audios` function of [`reader.py`](reader.py).
30 | 
31 | The processed results are stored as `segmented.tsv`, a tab-separated file, in the target directory. Each row of the file represents the result for a single audio file, and the columns are as follows:
32 | * the first column shows the file path
33 | * the last column shows the ratio of speech in the file
34 | * the columns in between are paired as start and end points (in seconds) of speech segments.
35 | 
36 | ### Using docker
37 | 
38 | We also provide a [`Dockerfile`](Dockerfile). If you want to run the segmenter as a docker container (without worrying about dependencies), build an image from this project directory using the `Dockerfile` and run it with the target directory mounted to `/segmenter/data`. Just MAKE SURE that the target directory is writable by others (`chmod o+w $TARGET_DIR`), because a non-root user will be running the processor inside the container. For example,
39 | 
40 | ```bash
41 | git clone https://github.com/keighrim/audio-segmentation.git
42 | cd audio-segmentation
43 | chmod -R o+w $HOME/audio-files && docker build . -t audioseg && docker run --rm -v $HOME/audio-files:/segmenter/data -it audioseg
44 | ```
45 | 
46 | Once the process is done, you'll find a `segmented.tsv` file in the local target directory.
47 | 
--------------------------------------------------------------------------------
/classifier.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from sklearn.model_selection import train_test_split
4 | 
5 | BATCH_SIZE = 1024
6 | RANDOM_SEED = 123
7 | LEARNING_RATE = 0.001
8 | np.random.seed(RANDOM_SEED)
9 | tf.random.set_seed(RANDOM_SEED)
10 | 
11 | 
12 | def train_pipeline(X: np.ndarray, Y: np.ndarray):
13 |     tr_ds, te_ds, num_cats = prep_data_pipeline(X, Y, downsample=True)
14 |     model = train(tr_ds, num_cats)
15 |     test(model, te_ds)
16 |     return persist_model(model, '_models')
17 | 
18 | 
19 | def predict_pipeline(audio_fpath, model, raw_prob=False):
20 |     import feature
21 |     import os
22 |     if type(audio_fpath) != str:
23 |         audio_fpath = os.path.join(*audio_fpath)
24 |     feats = feature.extract(audio_fpath)
25 |     predictions = model.predict(feats)
26 |     if not raw_prob:
27 |         predictions = np.argmax(predictions, axis=1)
28 |     return predictions
29 | 
30 | 
31 | def prep_data_pipeline(X, Y, downsample=False):
32 |     # current implementation only considers binary classification (speech vs.
nonspeech) 33 | negs = np.where(Y != 0)[0] 34 | poss = np.where(Y == 0)[0] 35 | if downsample: 36 | # we know for sure that negative examples (nonspeech) are much smaller than the positives, so trim positives instances 37 | np.random.shuffle(poss) 38 | poss = poss[:len(negs)] 39 | 40 | # because both poss and negs are 1d array, should use hstack to concat them 41 | data_idxs = np.hstack((poss, negs)) 42 | X_tr, X_te, Y_tr, Y_te = train_test_split(X[data_idxs], Y[data_idxs], test_size=0.1, shuffle=True) 43 | (traind, num_cats), (testd, _) = to_tf_dataset(X_tr, Y_tr), to_tf_dataset(X_te, Y_te) 44 | return traind, testd, num_cats 45 | 46 | 47 | def to_tf_dataset(X, Y): 48 | Y_onehot = tf.keras.utils.to_categorical(Y, dtype='int16') 49 | num_cats = Y_onehot.shape[1] 50 | ds = tf.data.Dataset.from_tensor_slices((X, Y_onehot)).batch(BATCH_SIZE) 51 | return ds, num_cats 52 | 53 | 54 | def train(dataset, num_cats): 55 | model = tf.keras.models.Sequential([ 56 | tf.keras.layers.Dense(units=30, activation='sigmoid'), 57 | tf.keras.layers.Dense(units=20, activation='sigmoid'), 58 | tf.keras.layers.Dense(units=10, activation='sigmoid'), 59 | tf.keras.layers.Dense(units=num_cats, activation='softmax'), 60 | ]) 61 | optimizer = tf.keras.optimizers.Adam(LEARNING_RATE) 62 | if num_cats < 2: 63 | raise ValueError("Number of acoustic categories must be more than one.") 64 | elif num_cats == 2: 65 | loss_fn = tf.losses.BinaryCrossentropy(from_logits=True) 66 | else: 67 | loss_fn = tf.losses.CategoricalCrossentropy(from_logits=True) 68 | model.compile(loss=loss_fn, optimizer=optimizer, metrics=['accuracy']) 69 | model.fit(dataset, epochs=20) 70 | return model 71 | 72 | 73 | def test(model, dataset): 74 | model.evaluate(dataset, verbose=2) 75 | 76 | 77 | def predict(model, data): 78 | return model.predict(data) 79 | 80 | 81 | def persist_model(model, persist_dir): 82 | import datetime 83 | import os 84 | timestamp = datetime.datetime.today().strftime('%Y%m%d-%H%M') 85 | model_path = os.path.join(persist_dir, timestamp) 86 | model.save(model_path, save_format='tf') 87 | return model_path 88 | 89 | 90 | def load_model(model_path): 91 | return tf.keras.models.load_model(model_path) 92 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import evaluation 6 | 7 | if __name__ == '__main__': 8 | 9 | import argparse 10 | parser = argparse.ArgumentParser( 11 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 12 | description=__doc__ 13 | ) 14 | parser.add_argument( 15 | '-d', '--download', 16 | action='store_true', 17 | help='Flag to download input datasets; train and test sets for ' 18 | 'classifier as well as train set for word embedding.' 19 | ) 20 | 21 | parser.add_argument( 22 | '-t', '--train', 23 | default='', 24 | action='store', 25 | nargs='?', 26 | help='Flag to invoke training pipeline. Must pass an argument that ' 27 | 'points to the training data. If the arg is a directory, training ' 28 | 'features will be extracted from all wav files in the directory - ' 29 | 'extracted features will be stored as a npz file in _model ' 30 | 'directory for future uses. If the arg is a npz file, training ' 31 | 'features stored in the file will be unpacked and be used.' 32 | ) 33 | parser.add_argument( 34 | '-s', '--segment', 35 | default='', 36 | action='store', 37 | nargs=2, 38 | help='Flag to invoke segmentation pipeline. 
First arg to specify model ' 39 | 'path, and second to specify directory where wave files are. ' 40 | ) 41 | parser.add_argument( 42 | '-e', '--evaluate', 43 | default='', 44 | action='store', 45 | nargs=2, 46 | help='Evaluate a model (first arg) against HUB4 annotation (second arg). ' 47 | 'HUB4 annotations (txt) and audio files (sph) must be located in ' 48 | 'a single directory. ' 49 | ) 50 | parser.add_argument( 51 | '-o', '--out', 52 | default='', 53 | action='store_true', 54 | help='Only valid with \'segment\' flag. When given, new wav files are ' 55 | 'generated from an input audio file, each stores a single \'speech\' ' 56 | 'segment. Newly generated files are stored in a subdirectory named ' 57 | 'after the full audio file, and suffixed with starting position ' 58 | 'in seconds (to two decimal places).' 59 | ) 60 | parser.add_argument( 61 | '-n', '--numfiles', 62 | default=sys.maxsize, 63 | action='store', 64 | type=int, 65 | help='Valid with \'segment\' and \'evaluate\' flags. When given, ' 66 | 'the number of files in data directory to process or evaluate will ' 67 | 'be limited to the given value.' 68 | ) 69 | 70 | if len(sys.argv) == 1: 71 | parser.print_help(sys.stderr) 72 | sys.exit(1) 73 | args = parser.parse_args() 74 | import reader, feature, classifier, smoothing, writer 75 | if args.train: 76 | if args.train.endswith('.npz'): 77 | import numpy 78 | npzarrays = numpy.load(args.train) 79 | X, Y = npzarrays['xs'], npzarrays['ys'] 80 | else: 81 | X, Y = feature.extract_all(reader.read_audios(args.train), train=True, binary_class=True, persist=True) 82 | model_path = classifier.train_pipeline(X, Y) 83 | print("============") 84 | print("model saved at " + model_path) 85 | print("============") 86 | 87 | if args.segment: 88 | model = classifier.load_model(args.segment[0]) 89 | for wav in reader.read_audios(args.segment[1], file_per_dir=args.numfiles): 90 | predicted = classifier.predict_pipeline(wav, model) 91 | smoothed = smoothing.smooth(predicted) 92 | speech_portions, total_frames = writer.index_frames(smoothed) 93 | audio_fname = os.path.join(*wav) 94 | writer.print_durations(speech_portions, audio_fname, total_frames) 95 | if args.out: 96 | print('writing files') 97 | writer.slice_speech(speech_portions, audio_fname) 98 | 99 | if args.evaluate: 100 | model = classifier.load_model(args.evaluate[0]) 101 | evaluation.evaluate_files(args.evaluate[1], model, args.numfiles) 102 | 103 | -------------------------------------------------------------------------------- /feature.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import librosa 4 | import numpy as np 5 | 6 | labels = {'speech': 0, 'music': 1, 'noise': 2} 7 | FRAME_SIZE = 10 # milliseconds 8 | CONTEXT_FRAMES = 0 9 | ZCR=False 10 | MFCC_NUM=40 11 | 12 | 13 | def extract(wav_fname, frame_size=FRAME_SIZE, context_frames=CONTEXT_FRAMES, zcr=ZCR, mfcc_num=MFCC_NUM, verbose=False, **kwargs): 14 | # will sample 16000 points per second 15 | audio, sr = librosa.load(wav_fname, sr=16000, **kwargs) 16 | if verbose: 17 | print(f'feature extracting: {wav_fname}\t', end='', flush=True) 18 | feats = spectral_feats(audio, frame_size, sr, mfcc_num, zcr) 19 | if context_frames > 0: 20 | if verbose: 21 | print(f'(normalizing)\t', end='', flush=True) 22 | feats = temporal_feats(feats, context_frames) 23 | if verbose: 24 | print(f'length: {librosa.get_duration(audio, sr)} secs, frames: {feats.shape}', end='', flush=True) 25 | print('', flush=True) 26 | return feats 27 | 28 | 29 | 
def spectral_feats(audio, frame_size, samplerate, mfcc_num, zcr): 30 | frame_sliding_size = samplerate // (1000 // frame_size) 31 | feats = librosa.feature.mfcc(y=audio, sr=samplerate, n_mfcc=mfcc_num, hop_length=frame_sliding_size) 32 | if zcr: 33 | zcrs = librosa.feature.zero_crossing_rate(y=audio, hop_length=frame_sliding_size) 34 | feats = np.concatenate((feats, zcrs), axis=0) 35 | # transpose so that rows are time frames 36 | return feats.T 37 | 38 | 39 | def temporal_feats(spectral_feats, context_frames): 40 | last_frame = len(spectral_feats) 41 | temporalized_frames = None 42 | for i in range(last_frame): 43 | # +1 to the end as array slicing is exclusive 44 | context = spectral_feats[max(0, i - context_frames):min(last_frame, i + context_frames) + 1] 45 | means = np.mean(context, axis=0) 46 | vars = np.var(context, axis=0) 47 | stds = np.std(context, axis=0) 48 | temporalized_frame = np.concatenate((means, vars, stds), axis=0) 49 | if temporalized_frames is None: 50 | temporalized_frames = np.empty((0, len(temporalized_frame))) 51 | temporalized_frames = np.vstack([temporalized_frames, temporalized_frame]) 52 | return temporalized_frames 53 | 54 | 55 | def cmvn(mfccs): 56 | raise NotImplementedError 57 | 58 | 59 | def index_label(label_str, binary=True): 60 | if label_str in labels: 61 | label_idx = labels[label_str] 62 | if binary: 63 | label_idx = min(1, label_idx) 64 | return label_idx 65 | else: 66 | return -1 67 | 68 | 69 | def extract_all(wav_paths, frame_size=FRAME_SIZE, context_frames=CONTEXT_FRAMES, zcr=ZCR, mfcc_num=MFCC_NUM, train=False, binary_class=True, persist=False): 70 | """ 71 | 72 | :param wav_paths: A list of audio files to extract. Must be (parent_dir, audio_filename) tuples. 73 | :param frame_size: The size of minimal time unit for a spectral feature (should be in milliseconds) 74 | :param context_frames: Number of adjacent frames to be used for extraction of temporal features. When 0, no temporal features will be used. Note that the context in both directions will be used (2 * N frames). 75 | :param train: When true, it will try to obtain gold labels from the file name. Otherwise, labels will remain None 76 | :param binary_class: When true, all non-zero labels are collapsed into 1 and treated as False (0 = True) 77 | :param persist: When true, store extracted features in to _models directory. 78 | :return: Two numpy arrays. First is a feature matrix (#frames * #features), seconds is a label array (#frames). #feature = (#mfcc_num + (zcr? 1:0) ) * (context==0? 
1:3)
79 |     """
80 |     features = None
81 |     labels = np.empty(0)
82 |     label = None
83 |     for wav_dir, wav_fname in wav_paths:
84 |         if train:
85 |             label_str = wav_fname.split('-')[0]
86 |             label = index_label(label_str, binary_class)
87 |         full_fname = os.path.join(wav_dir, wav_fname)
88 |         feature = extract(full_fname, frame_size=frame_size, context_frames=context_frames, zcr=zcr, mfcc_num=mfcc_num)
89 |         labels = np.append(labels, [label] * len(feature))
90 |         if features is None:
91 |             features = np.empty((0, feature.shape[1]))
92 |         features = np.vstack([features, feature])
93 |     data = (np.array(features), np.array(labels, dtype=np.int))
94 |     if persist:
95 |         import datetime
96 |         timestamp = datetime.datetime.today().strftime('%Y%m%d-%H%M')
97 |         np.savez(f'_models/{timestamp}.features.{features.shape[1]}d', xs=data[0], ys=data[1])
98 |     return data
99 | 
--------------------------------------------------------------------------------
/evaluation.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import numpy as np
4 | from bs4 import BeautifulSoup as bs
5 | from sklearn import metrics
6 | 
7 | import classifier
8 | import feature
9 | import smoothing
10 | 
11 | 
12 | def read_hub4_annotation(annotation_fname):
13 |     if type(annotation_fname) != str:
14 |         annotation_fname = os.path.join(*annotation_fname)
15 |     segmentation = {'filename': "", 'speech': [], 'unannotated': []}
16 |     with open(annotation_fname) as annotation:
17 |         tree = bs(annotation, 'lxml')
18 |         episode = tree.find('episode')
19 |         segmentation['filename'] = episode['filename']
20 |         for section in tree.find_all('section'):
21 |             # according to the guidelines, filler and sports_report sections are
22 |             # not transcribed - and they should not have any 'segment' tags
23 |             # also, we found that "Local_News" and "Commercial" type sections do not have segments in most cases
24 |             # (only 5 out of 118 "Local_News" sections had at least one segment, and 5 out of 553 "Commercial" sections had at least one segment)
25 |             # so we decided to treat local_news & commercial as unannotated, too
26 |             if section['type'].lower() in ('filler', 'commercial', 'local_news', 'sports_report'):
27 |                 segmentation['unannotated'].append((float(section['s_time']), float(section['e_time'])))
28 |             for segment in section.find_all('segment'):
29 |                 segmentation['speech'].append((float(segment['s_time']), float(segment['e_time'])))
30 |     # account for unannotated portions at the start of the file
31 |     # if len(segmentation['unannotated']) == 0 or segmentation['unannotated'][0][0] > segmentation['speech'][0][0]:
32 |     #     segmentation['unannotated'].insert(0, (0.0, segmentation['speech'][0][0]))
33 | 
34 |     return segmentation
35 | 
36 | 
37 | def to_nparray(segment_dict, audio_duration, frame_size=feature.FRAME_SIZE):
38 |     """
39 |     Converts XML annotation of audio segmentation into a numpy array
40 | 
41 |     :param audio_duration: duration of the audio in milliseconds
42 |     :param frame_size: size of a "frame" in milliseconds; a frame is the time slice that each cell of the array represents.
43 | :param segment_dict: dictionary where speech segmentation annotation is encoded 44 | 45 | :return: 46 | """ 47 | # 0 = speech 48 | # 1 = non-speech 49 | # -1 = unannotated 50 | a = np.ones(audio_duration//frame_size) 51 | 52 | def to_frame_num(start_end_tuple): 53 | return list(map(lambda x: int(x*1000) // frame_size, start_end_tuple)) 54 | 55 | for speech_seg in segment_dict['speech']: 56 | start, end = to_frame_num(speech_seg) 57 | a[start:end] = 0 58 | for unannotated_seg in segment_dict['unannotated']: 59 | start, end = to_frame_num(unannotated_seg) 60 | a[start:end] = -1 61 | 62 | # smooth out short non-speeches 63 | # https://github.com/brandeis-llc/acoustic-classification-segmentation/blob/v1/evaluation.py#L94-L96 64 | smoothing.trim_short_noises(a, 1000 // frame_size) # 3000 ms = 3 seconds 65 | 66 | # account for unannotated portions at the start of the file 67 | # https://github.com/brandeis-llc/acoustic-classification-segmentation/blob/v1/evaluation.py#L46-L49 68 | start, _ = to_frame_num(segment_dict['speech'][0]) 69 | a[0:start] = -1 70 | 71 | # do not check for remaining non-speaking sections, as multiple minutes of unannotated (but caught by the segmenter) commercials are often at the end of the file 72 | # https://github.com/brandeis-llc/acoustic-classification-segmentation/blob/v1/evaluation.py#L273 73 | _, end = to_frame_num(segment_dict['speech'][-1]) 74 | a[end:] = -1 75 | 76 | print(f'annotation loaded - ' 77 | f'unannotated: {len(np.where(a == -1)[0]) / len(a):.2%}, ' 78 | f'speech: {len(np.where(a == 0)[0]) / len(a):.2%}, ' 79 | f'non-speech: {len(np.where(a == 1)[0]) / len(a):.2%}') 80 | return a 81 | 82 | 83 | def p_r_f(hub4_array, predictions): 84 | """ 85 | predictions must be 1d array of labels, not k-d raw probabilities 86 | """ 87 | annotated_idx = np.where(hub4_array != -1)[0] 88 | return metrics.precision_recall_fscore_support(hub4_array[annotated_idx], predictions[annotated_idx], pos_label=1, average='binary') 89 | 90 | 91 | def roc(hub4_array, predictions): 92 | annotated_idx = np.where(hub4_array != -1)[0] 93 | return metrics.roc_curve(hub4_array[annotated_idx], predictions[annotated_idx][:,0], pos_label=1) 94 | 95 | 96 | def evaluate_file(sph_fname, txt_fname, classifier_model): 97 | probs = classifier.predict_pipeline(sph_fname, classifier_model, raw_prob=True) 98 | duration = probs.shape[0] * feature.FRAME_SIZE # number of frames * frame size 99 | y_hats = np.argmax(probs, axis=1) 100 | y_hats = smoothing.smooth(y_hats) 101 | ys = to_nparray(read_hub4_annotation(txt_fname), duration) 102 | return probs, y_hats, ys, p_r_f(ys, y_hats) 103 | 104 | 105 | def evaluate_files(hub4_dir, model, numfiles): 106 | import reader 107 | all_probabilities = np.empty((0,2)) 108 | all_predictions = np.empty((0,)) 109 | all_annotations = np.empty((0,)) 110 | for sph_path in reader.read_audios(hub4_dir, file_ext=['sph'], file_per_dir=numfiles): 111 | base_fname = sph_path[1].split('.')[0] 112 | probs, predictions, annotations, scores = evaluate_file(os.path.join(*sph_path), os.path.join(hub4_dir, base_fname + '.txt'), model) 113 | all_probabilities = np.vstack((all_probabilities, probs)) 114 | all_predictions = np.hstack((all_predictions, predictions)) 115 | all_annotations = np.hstack((all_annotations, annotations)) 116 | print(sph_path[1], scores, flush=True) 117 | print('TOTAL', p_r_f(all_annotations, all_predictions), flush=True) 118 | 119 | --------------------------------------------------------------------------------
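
For reference, the modules above chain together as follows when segmenting a directory of audio files. This is a minimal sketch that mirrors the `-s` branch of `run.py`; the model directory and `data` path below are only examples (any SavedModel directory and any directory of `wav`/`mp3` files will do):

```python
import os

import classifier
import reader
import smoothing
import writer

MODEL_DIR = 'pretrained/musan-full-20201031-tf'  # example: one of the bundled pretrained models
DATA_DIR = 'data'                                # example: a directory containing wav/mp3 files

model = classifier.load_model(MODEL_DIR)
for wav in reader.read_audios(DATA_DIR):
    # per-frame labels (0 = speech), then heuristic smoothing of short label flips
    predicted = classifier.predict_pipeline(wav, model)
    smoothed = smoothing.smooth(predicted)
    # collapse frames into (start, end) speech segments and print one TSV row
    speech_portions, total_frames = writer.index_frames(smoothed)
    writer.print_durations(speech_portions, os.path.join(*wav), total_frames)
```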