├── .dockerignore
├── pretrained
│   ├── musan-full-20201031-tf
│   │   ├── saved_model.pb
│   │   └── variables
│   │       ├── variables.index
│   │       └── variables.data-00000-of-00001
│   └── musan-balanced-20200330-tf
│       ├── saved_model.pb
│       └── variables
│           ├── variables.index
│           ├── variables.data-00000-of-00002
│           └── variables.data-00001-of-00002
├── requirements.txt
├── Dockerfile
├── reader.py
├── LICENSE
├── .gitignore
├── writer.py
├── smoothing.py
├── README.md
├── classifier.py
├── run.py
├── feature.py
└── evaluation.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | saved_models/**
2 | 
--------------------------------------------------------------------------------
/pretrained/musan-full-20201031-tf/saved_model.pb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-full-20201031-tf/saved_model.pb
--------------------------------------------------------------------------------
/pretrained/musan-balanced-20200330-tf/saved_model.pb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-balanced-20200330-tf/saved_model.pb
--------------------------------------------------------------------------------
/pretrained/musan-full-20201031-tf/variables/variables.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-full-20201031-tf/variables/variables.index
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | librosa==0.7.2
2 | numba==0.48
3 | numpy<1.19.0
4 | scipy>=1.4.1
5 | scikit-learn>=0.22.1
6 | ffmpeg-python==0.2.0
7 | sphfile>=1.0.0
8 | beautifulsoup4~=4.9
9 | tensorflow==2.0.3
10 | 
--------------------------------------------------------------------------------
/pretrained/musan-balanced-20200330-tf/variables/variables.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-balanced-20200330-tf/variables/variables.index
--------------------------------------------------------------------------------
/pretrained/musan-full-20201031-tf/variables/variables.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-full-20201031-tf/variables/variables.data-00000-of-00001
--------------------------------------------------------------------------------
/pretrained/musan-balanced-20200330-tf/variables/variables.data-00000-of-00002:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-balanced-20200330-tf/variables/variables.data-00000-of-00002
--------------------------------------------------------------------------------
/pretrained/musan-balanced-20200330-tf/variables/variables.data-00001-of-00002:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/acoustic-classification-segmentation/master/pretrained/musan-balanced-20200330-tf/variables/variables.data-00001-of-00002 -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.0.1 2 | 3 | RUN apt-get update && apt-get install -y libsndfile1 ffmpeg 4 | 5 | RUN useradd -d /segmenter -m segmenter && chown -R segmenter /segmenter 6 | USER segmenter 7 | COPY . /segmenter 8 | WORKDIR /segmenter 9 | RUN pip install -r requirements.txt 10 | 11 | CMD python run.py -s pretrained/$(ls pretrained/ | sort | tail -1) data > data/segmented.tsv 12 | -------------------------------------------------------------------------------- /reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | 5 | 6 | def read_sph(r, f): 7 | from sphfile import SPHFile; 8 | sph = SPHFile(os.path.join(r, f)) 9 | with tempfile.NamedTemporaryFile(prefix=f, delete=False) as tempf: 10 | sph.write_wav(tempf.name) 11 | return os.path.dirname(tempf.name), os.path.basename(tempf.name) 12 | 13 | 14 | def read_audios(data_dir, file_ext=['wav', 'mp3', 'sph'], file_per_dir=sys.maxsize): 15 | for r, ds, fs in os.walk(data_dir): 16 | for f in fs[:file_per_dir]: 17 | if f.split('.')[-1] in file_ext: 18 | if f.endswith('.sph'): 19 | yield read_sph(r, f) 20 | else: 21 | yield r, f 22 | 23 | 24 | # quick testing 25 | if __name__ == '__main__': 26 | files = read_audios(sys.argv[1]) 27 | while True: 28 | try: 29 | print(next(files)) 30 | except StopIteration: 31 | break 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Kyeongmin Rim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/** 2 | _models/** 3 | saved_models/** 4 | *.results.txt 5 | # linux 6 | *~ 7 | .directory # KDE directory preferences 8 | .Trash-* # Linux trash folder which might appear on any partition or disk 9 | 10 | # macos 11 | .DS_Store 12 | .AppleDouble 13 | .LSOverride 14 | Icon # Icon must end with two \r 15 | ._* # Thumbnails 16 | .DocumentRevisions-V100 # Files that might appear in the root of a volume 17 | .fseventsd 18 | .Spotlight-V100 19 | .TemporaryItems 20 | .Trashes 21 | .VolumeIcon.icns 22 | .AppleDB # Directories potentially created on remote AFP share 23 | .AppleDesktop 24 | Network Trash Folder 25 | Temporary Items 26 | .apdisk 27 | 28 | # windows 29 | Thumbs.db 30 | ehthumbs.db 31 | Desktop.ini 32 | $RECYCLE.BIN/ 33 | *.cab # Windows Installer files 34 | *.msi 35 | *.msm 36 | *.msp 37 | *.lnk # Windows shortcuts 38 | 39 | # idea 40 | .idea 41 | *.iml 42 | out 43 | gen 44 | 45 | # sqlite 46 | *.db 47 | *.sqlite3 48 | 49 | 50 | # java 51 | *.class 52 | .mtj.tmp/ # Mobile Tools for Java (J2ME) 53 | target/ # Package Files # 54 | *.jar 55 | *.war 56 | *.ear 57 | hs_err_pid* # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 58 | 59 | # python 60 | **/*.pyc 61 | **/__pycache__ 62 | 63 | # shared folders 64 | dropbox_shared*/ 65 | github_shared*/ 66 | gdrive_shared*/ 67 | 68 | # external references 69 | \!rel_*/ 70 | \!ref_*/ 71 | 72 | # ctag generated file 73 | tags 74 | .tags 75 | -------------------------------------------------------------------------------- /writer.py: -------------------------------------------------------------------------------- 1 | import ffmpeg 2 | import os 3 | import feature 4 | 5 | PRECISION = 1000 // feature.FRAME_SIZE 6 | 7 | def index_frames(predictions): 8 | speech = False 9 | segments = {} 10 | cur_speech_segment_started = 0 11 | for f_num, frame in enumerate(predictions): 12 | if speech and frame == 1: 13 | segments[cur_speech_segment_started] = f_num - 1 14 | speech = False 15 | elif not speech and frame == 0: 16 | cur_speech_segment_started = f_num 17 | speech = True 18 | if speech: 19 | segments[cur_speech_segment_started] = len(predictions) - 1 20 | return segments, len(predictions) 21 | 22 | 23 | def print_durations(indexed_speech_segements, input_audio_fname, total_len=None): 24 | speech_sum = 0 25 | print(input_audio_fname, end='\t', flush=True) 26 | for start, end in indexed_speech_segements.items(): 27 | print(f'{start / PRECISION}\t{end / PRECISION}', end='\t', flush=True) 28 | speech_sum += (end - start) 29 | 30 | if total_len is not None: 31 | print(f"speech_ratio: {(speech_sum / total_len):.2%} ({speech_sum} / {total_len})", end='', flush=True) 32 | print('', flush=True) 33 | 34 | 35 | def slice_speech(indexed_speech_segements, input_audio_fname): 36 | output_dirname = input_audio_fname[:-4] 37 | if not os.path.exists(output_dirname): 38 | os.makedirs(output_dirname) 39 | elif not os.path.isdir(output_dirname): 40 | raise IOError(f'{output_dirname} file exists and thus output directory cannot be created.') 41 | else: 42 | for f in os.listdir(output_dirname): 43 | os.remove(os.path.join(output_dirname, f)) 44 | 45 | for start, end in indexed_speech_segements.items(): 46 | start = start / PRECISION 47 | end = end / PRECISION 48 | output_fname = f'{output_dirname.split(os.sep)[-1]}.{str(start)}.wav' 49 | in_stream = 
ffmpeg.input(input_audio_fname, f=input_audio_fname[-3:], ss=start, t=end-start)
50 |         in_stream.output(os.path.join(output_dirname, output_fname)).run(overwrite_output=True)
51 | 
--------------------------------------------------------------------------------
/smoothing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def minimum_change_support(predictions: np.ndarray, minimum_window_size=300):
5 |     for i in range(1, len(predictions)):
6 |         cur_label = predictions[i]
7 |         minimum_window = predictions[max(0, i - minimum_window_size):i]
8 |         if cur_label != 0 and np.sum(minimum_window == cur_label) < (len(minimum_window) // 2):
9 |             predictions[i] = predictions[i - 1]
10 | 
11 | 
12 | def mode_smooth(predictions: np.ndarray, smooth_window=20):
13 |     from scipy import stats
14 |     for i in range(len(predictions)):
15 |         s = max(0, i - smooth_window)
16 |         e = min(len(predictions), i + 1 + smooth_window)
17 |         predictions[i] = stats.mode(predictions[s:e])[0]
18 | 
19 | 
20 | def trim_short_speech(predictions: np.ndarray, threshold=200):
21 |     i = 0
22 |     while i < len(predictions):
23 |         if predictions[i] == 0:
24 |             next_nonzeros = np.where(predictions[i:] == 1)[0]
25 |             if len(next_nonzeros) == 0:  # no more flips left
26 |                 break
27 |             speech_len = next_nonzeros[0]
28 |             # print(i, speech_len)
29 |             if speech_len < threshold:
30 |                 predictions[i:i + speech_len] = 1
31 |             i += speech_len
32 |         else:
33 |             i += 1
34 | 
35 | 
36 | def trim_short_noises(predictions: np.ndarray, threshold=300):
37 |     i = 0
38 |     cur = predictions[0]
39 |     while i < len(predictions):
40 |         if predictions[i] == 1:
41 |             next_speeches = np.where(predictions[i:] == 0)[0]
42 |             if len(next_speeches) == 0:  # no more flips left
43 |                 break
44 |             noise_len = next_speeches[0]
45 |             # print(i, noise_len)
46 |             if noise_len < threshold:
47 |                 predictions[i:i + noise_len] = 0
48 |             i += noise_len
49 |         else:
50 |             i += 1
51 | 
52 | 
53 | def smooth(predictions):
54 |     # assumes frame size to be a hundredth of a second (10ms)
55 |     # smoothing happens in-place
56 |     # mode_smooth(predictions)
57 |     # minimum_change_support(predictions)
58 |     trim_short_noises(predictions, threshold=100)
59 |     trim_short_speech(predictions)
60 |     return predictions
61 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Acoustic Classification & Segmentation
2 | 
3 | A simple audio segmenter that isolates the speech portions of audio streams. It uses a simple feedforward MLP for classification (implemented using `tensorflow`) and heuristic smoothing methods to increase the recall of speech segments.
4 | 
5 | 
6 | ## Requirements
7 | 
8 | * System packages: [`ffmpeg`](http://ffmpeg.org/download.html)
9 | * Python packages:
10 |   * `librosa`
11 |   * `tensorflow` or `tensorflow-gpu` `>=2.0.0`
12 |   * `numpy`
13 |   * `scipy`
14 |   * `scikit-learn`
15 |   * `ffmpeg-python`
16 | 
17 | ## Training
18 | 
19 | ### Pretrained model
20 | 
21 | We provide a [pretrained model](pretrained/). The model is trained on the [MUSAN corpus](https://www.openslr.org/17/), using binary labels (`speech` vs. `nonspeech`). The model is then serialized using the [`tensorflow::SavedModel` format](https://www.tensorflow.org/guide/keras/save_and_serialize#export_to_savedmodel). Because of the distribution bias in the corpus (there are many more speech recordings in the training data), we randomly resampled 10 ms frames from the speech examples to match their number to the negative examples. In doing so, the language distribution among the resampled speech examples was NOT deliberately balanced.
22 | 
23 | ### Training pipeline
24 | 
25 | To train your own model, invoke `run.py` with the `-t` flag and pass the directory where the training data is stored. You might also want to take a look at the `extract_all` function in [`feature.py`](feature.py) to change how the labels are read in, if you are using corpora other than MUSAN.
26 | 
27 | ## Segmentation
28 | 
29 | To run the segmenter over audio files, invoke `run.py` with the `-s` flag and pass 1) the model path (feel free to use the pretrained model if needed) and 2) the directory where the audio files are stored. Currently it will process all `mp3` and `wav` files in the target directory. If you want to process other types of audio files, add to or change the `file_ext` list in the `read_audios` function of [`reader.py`](reader.py).
30 | 
31 | The processed results are stored as `segmented.tsv`, a tab-separated file, in the target directory. Each row of the file represents the result for a single audio file, and the columns are as follows:
32 | * the first column shows the file path
33 | * the last column shows the ratio of speech in the file
34 | * the columns in between are paired as start and end points (in seconds) of speech segments.
35 | 
36 | ### Using docker
37 | 
38 | We also provide a [`Dockerfile`](Dockerfile). If you want to run the segmenter as a docker container (without worrying about dependencies), build an image from this project directory using the `Dockerfile` and run it with the target directory mounted to `/segmenter/data`. Just MAKE SURE that the target directory is writable by others (`chmod o+w $TARGET_DIR`), because a non-root user will be running the processor inside the container. For example,
39 | 
40 | ```bash
41 | git clone https://github.com/keighrim/audio-segmentation.git
42 | cd audio-segmentation
43 | chmod -R o+w $HOME/audio-files && docker build . -t audioseg && docker run --rm -v $HOME/audio-files:/segmenter/data -it audioseg
44 | ```
45 | 
46 | Once the process is done, you'll find a `segmented.tsv` file in the local target directory.
47 | 
--------------------------------------------------------------------------------
/classifier.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from sklearn.model_selection import train_test_split
4 | 
5 | BATCH_SIZE = 1024
6 | RANDOM_SEED = 123
7 | LEARNING_RATE = 0.001
8 | np.random.seed(RANDOM_SEED)
9 | tf.random.set_seed(RANDOM_SEED)
10 | 
11 | 
12 | def train_pipeline(X: np.ndarray, Y: np.ndarray):
13 |     tr_ds, te_ds, num_cats = prep_data_pipeline(X, Y, downsample=True)
14 |     model = train(tr_ds, num_cats)
15 |     test(model, te_ds)
16 |     return persist_model(model, '_models')
17 | 
18 | 
19 | def predict_pipeline(audio_fpath, model, raw_prob=False):
20 |     import feature
21 |     import os
22 |     if type(audio_fpath) != str:
23 |         audio_fpath = os.path.join(*audio_fpath)
24 |     feats = feature.extract(audio_fpath)
25 |     predictions = model.predict(feats)
26 |     if not raw_prob:
27 |         predictions = np.argmax(predictions, axis=1)
28 |     return predictions
29 | 
30 | 
31 | def prep_data_pipeline(X, Y, downsample=False):
32 |     # current implementation only considers binary classification (speech vs.
nonspeech) 33 | negs = np.where(Y != 0)[0] 34 | poss = np.where(Y == 0)[0] 35 | if downsample: 36 | # we know for sure that negative examples (nonspeech) are much smaller than the positives, so trim positives instances 37 | np.random.shuffle(poss) 38 | poss = poss[:len(negs)] 39 | 40 | # because both poss and negs are 1d array, should use hstack to concat them 41 | data_idxs = np.hstack((poss, negs)) 42 | X_tr, X_te, Y_tr, Y_te = train_test_split(X[data_idxs], Y[data_idxs], test_size=0.1, shuffle=True) 43 | (traind, num_cats), (testd, _) = to_tf_dataset(X_tr, Y_tr), to_tf_dataset(X_te, Y_te) 44 | return traind, testd, num_cats 45 | 46 | 47 | def to_tf_dataset(X, Y): 48 | Y_onehot = tf.keras.utils.to_categorical(Y, dtype='int16') 49 | num_cats = Y_onehot.shape[1] 50 | ds = tf.data.Dataset.from_tensor_slices((X, Y_onehot)).batch(BATCH_SIZE) 51 | return ds, num_cats 52 | 53 | 54 | def train(dataset, num_cats): 55 | model = tf.keras.models.Sequential([ 56 | tf.keras.layers.Dense(units=30, activation='sigmoid'), 57 | tf.keras.layers.Dense(units=20, activation='sigmoid'), 58 | tf.keras.layers.Dense(units=10, activation='sigmoid'), 59 | tf.keras.layers.Dense(units=num_cats, activation='softmax'), 60 | ]) 61 | optimizer = tf.keras.optimizers.Adam(LEARNING_RATE) 62 | if num_cats < 2: 63 | raise ValueError("Number of acoustic categories must be more than one.") 64 | elif num_cats == 2: 65 | loss_fn = tf.losses.BinaryCrossentropy(from_logits=True) 66 | else: 67 | loss_fn = tf.losses.CategoricalCrossentropy(from_logits=True) 68 | model.compile(loss=loss_fn, optimizer=optimizer, metrics=['accuracy']) 69 | model.fit(dataset, epochs=20) 70 | return model 71 | 72 | 73 | def test(model, dataset): 74 | model.evaluate(dataset, verbose=2) 75 | 76 | 77 | def predict(model, data): 78 | return model.predict(data) 79 | 80 | 81 | def persist_model(model, persist_dir): 82 | import datetime 83 | import os 84 | timestamp = datetime.datetime.today().strftime('%Y%m%d-%H%M') 85 | model_path = os.path.join(persist_dir, timestamp) 86 | model.save(model_path, save_format='tf') 87 | return model_path 88 | 89 | 90 | def load_model(model_path): 91 | return tf.keras.models.load_model(model_path) 92 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import evaluation 6 | 7 | if __name__ == '__main__': 8 | 9 | import argparse 10 | parser = argparse.ArgumentParser( 11 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 12 | description=__doc__ 13 | ) 14 | parser.add_argument( 15 | '-d', '--download', 16 | action='store_true', 17 | help='Flag to download input datasets; train and test sets for ' 18 | 'classifier as well as train set for word embedding.' 19 | ) 20 | 21 | parser.add_argument( 22 | '-t', '--train', 23 | default='', 24 | action='store', 25 | nargs='?', 26 | help='Flag to invoke training pipeline. Must pass an argument that ' 27 | 'points to the training data. If the arg is a directory, training ' 28 | 'features will be extracted from all wav files in the directory - ' 29 | 'extracted features will be stored as a npz file in _model ' 30 | 'directory for future uses. If the arg is a npz file, training ' 31 | 'features stored in the file will be unpacked and be used.' 32 | ) 33 | parser.add_argument( 34 | '-s', '--segment', 35 | default='', 36 | action='store', 37 | nargs=2, 38 | help='Flag to invoke segmentation pipeline. 
First arg to specify model ' 39 | 'path, and second to specify directory where wave files are. ' 40 | ) 41 | parser.add_argument( 42 | '-e', '--evaluate', 43 | default='', 44 | action='store', 45 | nargs=2, 46 | help='Evaluate a model (first arg) against HUB4 annotation (second arg). ' 47 | 'HUB4 annotations (txt) and audio files (sph) must be located in ' 48 | 'a single directory. ' 49 | ) 50 | parser.add_argument( 51 | '-o', '--out', 52 | default='', 53 | action='store_true', 54 | help='Only valid with \'segment\' flag. When given, new wav files are ' 55 | 'generated from an input audio file, each stores a single \'speech\' ' 56 | 'segment. Newly generated files are stored in a subdirectory named ' 57 | 'after the full audio file, and suffixed with starting position ' 58 | 'in seconds (to two decimal places).' 59 | ) 60 | parser.add_argument( 61 | '-n', '--numfiles', 62 | default=sys.maxsize, 63 | action='store', 64 | type=int, 65 | help='Valid with \'segment\' and \'evaluate\' flags. When given, ' 66 | 'the number of files in data directory to process or evaluate will ' 67 | 'be limited to the given value.' 68 | ) 69 | 70 | if len(sys.argv) == 1: 71 | parser.print_help(sys.stderr) 72 | sys.exit(1) 73 | args = parser.parse_args() 74 | import reader, feature, classifier, smoothing, writer 75 | if args.train: 76 | if args.train.endswith('.npz'): 77 | import numpy 78 | npzarrays = numpy.load(args.train) 79 | X, Y = npzarrays['xs'], npzarrays['ys'] 80 | else: 81 | X, Y = feature.extract_all(reader.read_audios(args.train), train=True, binary_class=True, persist=True) 82 | model_path = classifier.train_pipeline(X, Y) 83 | print("============") 84 | print("model saved at " + model_path) 85 | print("============") 86 | 87 | if args.segment: 88 | model = classifier.load_model(args.segment[0]) 89 | for wav in reader.read_audios(args.segment[1], file_per_dir=args.numfiles): 90 | predicted = classifier.predict_pipeline(wav, model) 91 | smoothed = smoothing.smooth(predicted) 92 | speech_portions, total_frames = writer.index_frames(smoothed) 93 | audio_fname = os.path.join(*wav) 94 | writer.print_durations(speech_portions, audio_fname, total_frames) 95 | if args.out: 96 | print('writing files') 97 | writer.slice_speech(speech_portions, audio_fname) 98 | 99 | if args.evaluate: 100 | model = classifier.load_model(args.evaluate[0]) 101 | evaluation.evaluate_files(args.evaluate[1], model, args.numfiles) 102 | 103 | -------------------------------------------------------------------------------- /feature.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import librosa 4 | import numpy as np 5 | 6 | labels = {'speech': 0, 'music': 1, 'noise': 2} 7 | FRAME_SIZE = 10 # milliseconds 8 | CONTEXT_FRAMES = 0 9 | ZCR=False 10 | MFCC_NUM=40 11 | 12 | 13 | def extract(wav_fname, frame_size=FRAME_SIZE, context_frames=CONTEXT_FRAMES, zcr=ZCR, mfcc_num=MFCC_NUM, verbose=False, **kwargs): 14 | # will sample 16000 points per second 15 | audio, sr = librosa.load(wav_fname, sr=16000, **kwargs) 16 | if verbose: 17 | print(f'feature extracting: {wav_fname}\t', end='', flush=True) 18 | feats = spectral_feats(audio, frame_size, sr, mfcc_num, zcr) 19 | if context_frames > 0: 20 | if verbose: 21 | print(f'(normalizing)\t', end='', flush=True) 22 | feats = temporal_feats(feats, context_frames) 23 | if verbose: 24 | print(f'length: {librosa.get_duration(audio, sr)} secs, frames: {feats.shape}', end='', flush=True) 25 | print('', flush=True) 26 | return feats 27 | 28 | 29 | 
def spectral_feats(audio, frame_size, samplerate, mfcc_num, zcr): 30 | frame_sliding_size = samplerate // (1000 // frame_size) 31 | feats = librosa.feature.mfcc(y=audio, sr=samplerate, n_mfcc=mfcc_num, hop_length=frame_sliding_size) 32 | if zcr: 33 | zcrs = librosa.feature.zero_crossing_rate(y=audio, hop_length=frame_sliding_size) 34 | feats = np.concatenate((feats, zcrs), axis=0) 35 | # transpose so that rows are time frames 36 | return feats.T 37 | 38 | 39 | def temporal_feats(spectral_feats, context_frames): 40 | last_frame = len(spectral_feats) 41 | temporalized_frames = None 42 | for i in range(last_frame): 43 | # +1 to the end as array slicing is exclusive 44 | context = spectral_feats[max(0, i - context_frames):min(last_frame, i + context_frames) + 1] 45 | means = np.mean(context, axis=0) 46 | vars = np.var(context, axis=0) 47 | stds = np.std(context, axis=0) 48 | temporalized_frame = np.concatenate((means, vars, stds), axis=0) 49 | if temporalized_frames is None: 50 | temporalized_frames = np.empty((0, len(temporalized_frame))) 51 | temporalized_frames = np.vstack([temporalized_frames, temporalized_frame]) 52 | return temporalized_frames 53 | 54 | 55 | def cmvn(mfccs): 56 | raise NotImplementedError 57 | 58 | 59 | def index_label(label_str, binary=True): 60 | if label_str in labels: 61 | label_idx = labels[label_str] 62 | if binary: 63 | label_idx = min(1, label_idx) 64 | return label_idx 65 | else: 66 | return -1 67 | 68 | 69 | def extract_all(wav_paths, frame_size=FRAME_SIZE, context_frames=CONTEXT_FRAMES, zcr=ZCR, mfcc_num=MFCC_NUM, train=False, binary_class=True, persist=False): 70 | """ 71 | 72 | :param wav_paths: A list of audio files to extract. Must be (parent_dir, audio_filename) tuples. 73 | :param frame_size: The size of minimal time unit for a spectral feature (should be in milliseconds) 74 | :param context_frames: Number of adjacent frames to be used for extraction of temporal features. When 0, no temporal features will be used. Note that the context in both directions will be used (2 * N frames). 75 | :param train: When true, it will try to obtain gold labels from the file name. Otherwise, labels will remain None 76 | :param binary_class: When true, all non-zero labels are collapsed into 1 and treated as False (0 = True) 77 | :param persist: When true, store extracted features in to _models directory. 78 | :return: Two numpy arrays. First is a feature matrix (#frames * #features), seconds is a label array (#frames). #feature = (#mfcc_num + (zcr? 1:0) ) * (context==0? 
1:3)
79 |     """
80 |     features = None
81 |     labels = np.empty(0)
82 |     label = None
83 |     for wav_dir, wav_fname in wav_paths:
84 |         if train:
85 |             label_str = wav_fname.split('-')[0]
86 |             label = index_label(label_str, binary_class)
87 |         full_fname = os.path.join(wav_dir, wav_fname)
88 |         feature = extract(full_fname, frame_size=frame_size, context_frames=context_frames, zcr=zcr, mfcc_num=mfcc_num)
89 |         labels = np.append(labels, [label] * len(feature))
90 |         if features is None:
91 |             features = np.empty((0, feature.shape[1]))
92 |         features = np.vstack([features, feature])
93 |     data = (np.array(features), np.array(labels, dtype=np.int))
94 |     if persist:
95 |         import datetime
96 |         timestamp = datetime.datetime.today().strftime('%Y%m%d-%H%M')
97 |         np.savez(f'_models/{timestamp}.features.{features.shape[1]}d', xs=data[0], ys=data[1])
98 |     return data
99 | 
--------------------------------------------------------------------------------
/evaluation.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import numpy as np
4 | from bs4 import BeautifulSoup as bs
5 | from sklearn import metrics
6 | 
7 | import classifier
8 | import feature
9 | import smoothing
10 | 
11 | 
12 | def read_hub4_annotation(annotation_fname):
13 |     if type(annotation_fname) != str:
14 |         annotation_fname = os.path.join(*annotation_fname)
15 |     segmentation = {'filename': "", 'speech': [], 'unannotated': []}
16 |     with open(annotation_fname) as annotation:
17 |         tree = bs(annotation, 'lxml')
18 |         episode = tree.find('episode')
19 |         segmentation['filename'] = episode['filename']
20 |         for section in tree.find_all('section'):
21 |             # according to the guidelines, filler and sports_report sections are
22 |             # not transcribed - and they should not have any 'segment' tags
23 |             # also, we found that "Local_News" and "Commercial" type sections do not have segments in most cases
24 |             # (only 5 out of 118 "Local_News" sections had at least one segment, and 5 out of 553 "Commercial" sections had at least one segment)
25 |             # so we decided to treat local_news & commercial as unannotated, too
26 |             if section['type'].lower() in ('filler', 'commercial', 'local_news', 'sports_report'):
27 |                 segmentation['unannotated'].append((float(section['s_time']), float(section['e_time'])))
28 |             for segment in section.find_all('segment'):
29 |                 segmentation['speech'].append((float(segment['s_time']), float(segment['e_time'])))
30 |     # account for unannotated portions at the start of the file
31 |     # if len(segmentation['unannotated']) == 0 or segmentation['unannotated'][0][0] > segmentation['speech'][0][0]:
32 |     #     segmentation['unannotated'].insert(0, (0.0, segmentation['speech'][0][0]))
33 | 
34 |     return segmentation
35 | 
36 | 
37 | def to_nparray(segment_dict, audio_duration, frame_size=feature.FRAME_SIZE):
38 |     """
39 |     Converts XML annotation of audio segmentation into a numpy array
40 | 
41 |     :param audio_duration: duration of the audio in milliseconds
42 |     :param frame_size: size of a "frame" in milliseconds; a frame is the time slice that each cell of the array represents.
43 | :param segment_dict: dictionary where speech segmentation annotation is encoded 44 | 45 | :return: 46 | """ 47 | # 0 = speech 48 | # 1 = non-speech 49 | # -1 = unannotated 50 | a = np.ones(audio_duration//frame_size) 51 | 52 | def to_frame_num(start_end_tuple): 53 | return list(map(lambda x: int(x*1000) // frame_size, start_end_tuple)) 54 | 55 | for speech_seg in segment_dict['speech']: 56 | start, end = to_frame_num(speech_seg) 57 | a[start:end] = 0 58 | for unannotated_seg in segment_dict['unannotated']: 59 | start, end = to_frame_num(unannotated_seg) 60 | a[start:end] = -1 61 | 62 | # smooth out short non-speeches 63 | # https://github.com/brandeis-llc/acoustic-classification-segmentation/blob/v1/evaluation.py#L94-L96 64 | smoothing.trim_short_noises(a, 1000 // frame_size) # 3000 ms = 3 seconds 65 | 66 | # account for unannotated portions at the start of the file 67 | # https://github.com/brandeis-llc/acoustic-classification-segmentation/blob/v1/evaluation.py#L46-L49 68 | start, _ = to_frame_num(segment_dict['speech'][0]) 69 | a[0:start] = -1 70 | 71 | # do not check for remaining non-speaking sections, as multiple minutes of unannotated (but caught by the segmenter) commercials are often at the end of the file 72 | # https://github.com/brandeis-llc/acoustic-classification-segmentation/blob/v1/evaluation.py#L273 73 | _, end = to_frame_num(segment_dict['speech'][-1]) 74 | a[end:] = -1 75 | 76 | print(f'annotation loaded - ' 77 | f'unannotated: {len(np.where(a == -1)[0]) / len(a):.2%}, ' 78 | f'speech: {len(np.where(a == 0)[0]) / len(a):.2%}, ' 79 | f'non-speech: {len(np.where(a == 1)[0]) / len(a):.2%}') 80 | return a 81 | 82 | 83 | def p_r_f(hub4_array, predictions): 84 | """ 85 | predictions must be 1d array of labels, not k-d raw probabilities 86 | """ 87 | annotated_idx = np.where(hub4_array != -1)[0] 88 | return metrics.precision_recall_fscore_support(hub4_array[annotated_idx], predictions[annotated_idx], pos_label=1, average='binary') 89 | 90 | 91 | def roc(hub4_array, predictions): 92 | annotated_idx = np.where(hub4_array != -1)[0] 93 | return metrics.roc_curve(hub4_array[annotated_idx], predictions[annotated_idx][:,0], pos_label=1) 94 | 95 | 96 | def evaluate_file(sph_fname, txt_fname, classifier_model): 97 | probs = classifier.predict_pipeline(sph_fname, classifier_model, raw_prob=True) 98 | duration = probs.shape[0] * feature.FRAME_SIZE # number of frames * frame size 99 | y_hats = np.argmax(probs, axis=1) 100 | y_hats = smoothing.smooth(y_hats) 101 | ys = to_nparray(read_hub4_annotation(txt_fname), duration) 102 | return probs, y_hats, ys, p_r_f(ys, y_hats) 103 | 104 | 105 | def evaluate_files(hub4_dir, model, numfiles): 106 | import reader 107 | all_probabilities = np.empty((0,2)) 108 | all_predictions = np.empty((0,)) 109 | all_annotations = np.empty((0,)) 110 | for sph_path in reader.read_audios(hub4_dir, file_ext=['sph'], file_per_dir=numfiles): 111 | base_fname = sph_path[1].split('.')[0] 112 | probs, predictions, annotations, scores = evaluate_file(os.path.join(*sph_path), os.path.join(hub4_dir, base_fname + '.txt'), model) 113 | all_probabilities = np.vstack((all_probabilities, probs)) 114 | all_predictions = np.hstack((all_predictions, predictions)) 115 | all_annotations = np.hstack((all_annotations, annotations)) 116 | print(sph_path[1], scores, flush=True) 117 | print('TOTAL', p_r_f(all_annotations, all_predictions), flush=True) 118 | 119 | --------------------------------------------------------------------------------
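
For reference, the modules above chain together as follows when segmenting a directory of audio files. This is a minimal sketch that mirrors the `-s` branch of `run.py`; the model directory and `data` path below are only examples (any SavedModel directory and any directory of `wav`/`mp3` files will do):

```python
import os

import classifier
import reader
import smoothing
import writer

MODEL_DIR = 'pretrained/musan-full-20201031-tf'  # example: one of the bundled pretrained models
DATA_DIR = 'data'                                # example: a directory containing wav/mp3 files

model = classifier.load_model(MODEL_DIR)
for wav in reader.read_audios(DATA_DIR):
    # per-frame labels (0 = speech), then heuristic smoothing of short label flips
    predicted = classifier.predict_pipeline(wav, model)
    smoothed = smoothing.smooth(predicted)
    # collapse frames into (start, end) speech segments and print one TSV row
    speech_portions, total_frames = writer.index_frames(smoothed)
    writer.print_durations(speech_portions, os.path.join(*wav), total_frames)
```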