├── example
│   ├── test.mp4
│   ├── train_bad.mp4
│   ├── train_good.mp4
│   └── example_script.py
├── loki
│   ├── functions
│   │   ├── __init__.py
│   │   ├── evaluation.py
│   │   └── helper.py
│   ├── models
│   │   ├── vggish_tensorflow
│   │   │   ├── __init__.py
│   │   │   ├── credits.md
│   │   │   ├── vggish_params.py
│   │   │   ├── wrappers.py
│   │   │   ├── vggish_input.py
│   │   │   ├── vggish_smoke_test.py
│   │   │   ├── vggish_postprocess.py
│   │   │   ├── vggish_slim.py
│   │   │   ├── vggish_inference_demo.py
│   │   │   ├── README.md
│   │   │   ├── vggish_train_demo.py
│   │   │   └── mel_features.py
│   │   ├── __init__.py
│   │   ├── util.py
│   │   ├── volume.py
│   │   └── neural_networks.py
│   ├── processing
│   │   ├── __init__.py
│   │   ├── features.py
│   │   └── load.py
│   └── __init__.py
├── .gitignore
├── README.md
└── LICENSE.md
/example/test.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TensorDuck/videogame_highlights/HEAD/example/test.mp4
--------------------------------------------------------------------------------
/example/train_bad.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TensorDuck/videogame_highlights/HEAD/example/train_bad.mp4
--------------------------------------------------------------------------------
/example/train_good.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TensorDuck/videogame_highlights/HEAD/example/train_good.mp4
--------------------------------------------------------------------------------
/loki/functions/__init__.py:
--------------------------------------------------------------------------------
1 | """Contains analysis and pipeline functions"""
2 | 
3 | from . import evaluation
4 | from . import helper
5 | 
--------------------------------------------------------------------------------
/loki/models/vggish_tensorflow/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize this subfolder as a package"""
2 | 
3 | from .wrappers import CreateVGGishNetwork, EmbeddingsFromVGGish
4 | 
--------------------------------------------------------------------------------
/loki/processing/__init__.py:
--------------------------------------------------------------------------------
1 | """Methods and classes for loading, writing, and featurizing videos"""
2 | 
3 | from .load import VideoClips, append_clips
4 | from .features import compute_decibels
5 | 
--------------------------------------------------------------------------------
/loki/models/__init__.py:
--------------------------------------------------------------------------------
1 | """Classes and methods for making models that analyze videos"""
2 | 
3 | from .volume import VolumeModel, VolumeClassifier
4 | from .neural_networks import NeuralNetworkClassifier
5 | 
--------------------------------------------------------------------------------
/loki/__init__.py:
--------------------------------------------------------------------------------
1 | from .processing import VideoClips, append_clips, compute_decibels
2 | from .models import VolumeModel, VolumeClassifier, NeuralNetworkClassifier
3 | 
4 | from .functions import evaluation
5 | from .functions import helper
6 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore Everything
2 | *
3 | 
4 | # But not these files
5 | !.gitignore
6 | !*.py
7 | !*.md
8 | !build/add_path.sh
9 | !test.mp4
10 | !train_bad.mp4
11 | !train_good.mp4
12 | !LICENSE.md
13 | 
14 | # ... even if they are in subdirectories
15 | !*/
16 | 
--------------------------------------------------------------------------------
/loki/models/vggish_tensorflow/credits.md:
--------------------------------------------------------------------------------
1 | Credits
2 | =======
3 | This application uses Open Source components. You can find the source code of their open source projects along with license information below. We acknowledge and are grateful to these developers for their contributions to open source.
4 | 
5 | Project: vggish https://github.com/tensorflow/models/tree/master/research/audioset/vggish
6 | 
7 | Copyright 2016 The TensorFlow Authors. All rights reserved.
8 | 
9 | License: Apache License 2.0 https://github.com/tensorflow/models/blob/master/LICENSE
10 | 
--------------------------------------------------------------------------------
/loki/processing/features.py:
--------------------------------------------------------------------------------
1 | """Classes and functions for generating features"""
2 | from librosa import power_to_db
3 | import numpy as np
4 | 
5 | def compute_decibels(data, freq=44100):
6 |     """Compute the total decibels from an audio waveform
7 | 
8 |     Compute the power by taking the square of the waveform. If the
9 |     audio is binaural, then sum up the power of each audio channel.
10 | 
11 |     Arguments:
12 |     ----------
13 |     data -- loki.VideoClips
14 |         Object containing the VideoClips
15 | 
16 |     Keyword Arguments:
17 |     ------------------
18 |     freq -- int -- default=44100
19 |         Frequency at which to extract the audio.
20 | 
21 |     Return:
22 |     -------
23 |     decibels -- list[np.ndarray]:
24 |         The loudness over time of each inputted clip.
25 |     """
26 |     decibels = []
27 | 
28 |     all_audio = data.compute_audio_waveform(freq=freq)
29 | 
30 |     for binaural in all_audio:
31 |         power = binaural ** 2 # square for the power
32 |         #sum up the channels if the audio is dual-channel
33 |         if power.ndim == 2:
34 |             power = np.sum(power, axis=1)
35 | 
36 |         decibel = power_to_db(power)
37 |         decibels.append(decibel)
38 | 
39 |     return decibels
40 | 
--------------------------------------------------------------------------------
/loki/functions/evaluation.py:
--------------------------------------------------------------------------------
1 | """Useful functions for evaluating a model's performance"""
2 | import sklearn.metrics as skmet
3 | 
4 | def get_confusion_matrix(actual, predicted):
5 |     """Get the confusion matrix and statistics
6 | 
7 |     This is a helper function that leverages sklearn to collect the
8 |     desired statistics.
9 | 
10 |     Arguments:
11 |     ----------
12 |     actual -- np.ndarray:
13 |         The actual classes.
14 |     predicted -- np.ndarray:
15 |         The inferred classes
16 | 
17 |     Return:
18 |     -------
19 |     results -- dict:
20 |         Contains the cm (confusion matrix), accuracy, precision and
21 |         recall.
22 |     """
23 |     cm = skmet.confusion_matrix(actual, predicted)
24 | 
25 |     accuracy = (cm[0,0] + cm[1,1]) / len(actual)
26 | 
27 |     precision = cm[1,1] / (cm[0,1] + cm[1,1]) # true positives over false positives and true positives
28 |     recall = cm[1,1] / (cm[1,0] + cm[1,1]) # true positives over false negatives and true positives
29 | 
30 |     results = {'cm':cm, 'accuracy':accuracy, 'precision':precision, 'recall':recall}
31 | 
32 |     return results
33 | 
34 | def print_confusion_matrix(actual, predicted):
35 |     """Print out the confusion matrix
36 | 
37 |     Arguments:
38 |     ----------
39 |     actual -- np.ndarray:
40 |         The actual classes.
41 |     predicted -- np.ndarray:
42 |         The inferred classes
43 |     """
44 |     results = get_confusion_matrix(actual, predicted)
45 | 
46 |     print("Confusion Matrix:")
47 |     print(results['cm'])
48 |     print(f"Accuracy: {results['accuracy']}")
49 |     print(f"Precision: {results['precision']}")
50 |     print(f"Recall: {results['recall']}")
51 | 
--------------------------------------------------------------------------------
/example/example_script.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess as sb
3 | import time
4 | import numpy
5 | import matplotlib.pyplot as plt
6 | 
7 | import loki
8 | 
9 | if __name__ == "__main__":
10 |     cwd = os.getcwd()
11 | 
12 |     print("##################################################")
13 |     print("Load Files")
14 |     print("##################################################")
15 |     #Best practice is to give full paths
16 |     train_files = [f"{cwd}/train_good.mp4", f"{cwd}/train_bad.mp4"]
17 |     test_files = [f"{cwd}/test.mp4"]
18 | 
19 |     train_targets = [1, 0]
20 | 
21 |     train_videos = loki.VideoClips(train_files)
22 |     test_videos = loki.VideoClips(test_files)
23 | 
24 |     #get a trained neural network classifier
25 |     print("##################################################")
26 |     print("Begin Training")
27 |     print("##################################################")
28 |     nnclass = loki.helper.train_classifier(train_videos, train_targets, test_clips=train_videos, test_targets=train_targets, n_epochs=100, class_weights=None, batch_size=None)
29 |     #save the neural network
30 |     nnclass.save("example_nn")
31 | 
32 |     #perform inference on the training data
33 |     train_audio = train_videos.compute_audio_waveform(mono=True)
34 |     inferred = nnclass.infer(train_audio)
35 |     loki.evaluation.print_confusion_matrix(train_targets, inferred)
36 | 
37 |     print("##################################################")
38 |     print("Analyze test.mp4")
39 |     print("##################################################")
40 | 
41 |     #single channel for Loki
42 |     test_audio = test_videos.compute_audio_waveform(mono=True)
43 |     #interest at each time-step
44 |     x_trace, y_trace = nnclass.get_trace(test_audio)
45 |     n_trace = len(x_trace[0])
46 |     print("Time Interest")
47 |     print("------------------")
48 |     for i in range(n_trace):
49 |         print(f"{x_trace[0][i]:.2f} {y_trace[0][i]:.4f}")
50 | 
51 |     print("##################################################")
52 |     print("Find The Most Interesting 1-Second Clip from test.mp4")
53 |     print("##################################################")
54 |     #Use helper function to find the most relevant 0.96-second section
55 |     results = loki.helper.find_best_clip(test_files, 0.96, nn_checkpoint="example_nn")
56 |     #Use helper function to find the most relevant 2.88-second section
57 |     results = loki.helper.find_best_clip(test_files, 3*0.96, nn_checkpoint="example_nn")
58 | 
59 |     print(results)
60 | 
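As a point of reference for the `loki.functions.evaluation` statistics used in the script above, the following minimal sketch (with hypothetical toy labels; the values are illustrative only) shows how accuracy, precision, and recall fall out of the sklearn confusion matrix returned by `get_confusion_matrix`:

```python
import numpy as np
from loki.functions import evaluation

# Hypothetical labels: 1 = interesting clip, 0 = not interesting.
actual = np.array([1, 1, 1, 0, 0, 0])
predicted = np.array([1, 1, 0, 0, 0, 1])

results = evaluation.get_confusion_matrix(actual, predicted)

# sklearn orders the matrix as [[TN, FP], [FN, TP]], so here:
# cm = [[2, 1], [1, 2]]
# accuracy  = (TN + TP) / N   = (2 + 2) / 6 = 0.667
# precision = TP / (TP + FP)  = 2 / (2 + 1) = 0.667
# recall    = TP / (TP + FN)  = 2 / (2 + 1) = 0.667
print(results['cm'])
print(results['accuracy'], results['precision'], results['recall'])
```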
-------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_params.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | """Global parameters for the VGGish model. 23 | 24 | See vggish_slim.py for more information. 25 | """ 26 | 27 | # Architectural constants. 28 | NUM_FRAMES = 96 # Frames in input mel-spectrogram patch. 29 | NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. 30 | EMBEDDING_SIZE = 128 # Size of embedding layer. 31 | 32 | # Hyperparameters used in feature and example generation. 33 | SAMPLE_RATE = 16000 34 | STFT_WINDOW_LENGTH_SECONDS = 0.025 35 | STFT_HOP_LENGTH_SECONDS = 0.010 36 | NUM_MEL_BINS = NUM_BANDS 37 | MEL_MIN_HZ = 125 38 | MEL_MAX_HZ = 7500 39 | LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. 40 | EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames 41 | EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. 42 | 43 | # Parameters used for embedding postprocessing. 44 | PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' 45 | PCA_MEANS_NAME = 'pca_means' 46 | QUANTIZE_MIN_VAL = -2.0 47 | QUANTIZE_MAX_VAL = +2.0 48 | 49 | # Hyperparameters used in training. 50 | INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. 51 | LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. 52 | ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. 53 | 54 | # Names of ops, tensors, and features. 55 | INPUT_OP_NAME = 'vggish/input_features' 56 | INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' 57 | OUTPUT_OP_NAME = 'vggish/embedding' 58 | OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' 59 | AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' 60 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/wrappers.py: -------------------------------------------------------------------------------- 1 | """This file contains wrappers for the VGGish methods 2 | 3 | Large parts of this file was copied from the colab for the VGGish 4 | method, see: 5 | https://colab.research.google.com/drive/1TbX92UL9sYWbdwdGE0rJ9owmezB-Rl1C 6 | """ 7 | import tensorflow as tf 8 | 9 | from . import vggish_slim 10 | from . import vggish_params 11 | from . import vggish_input 12 | 13 | def CreateVGGishNetwork(sess, checkpoint_path, hop_size=0.96): # Hop size is in seconds. 14 | """Define VGGish model, load the checkpoint, and return a dictionary 15 | that points to the different tensors defined by the model. 
16 | """ 17 | vggish_slim.define_vggish_slim() 18 | vggish_params.EXAMPLE_HOP_SECONDS = hop_size 19 | 20 | vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path) 21 | 22 | features_tensor = sess.graph.get_tensor_by_name( 23 | vggish_params.INPUT_TENSOR_NAME) 24 | embedding_tensor = sess.graph.get_tensor_by_name( 25 | vggish_params.OUTPUT_TENSOR_NAME) 26 | 27 | layers = {'conv1': 'vggish/conv1/Relu', 28 | 'pool1': 'vggish/pool1/MaxPool', 29 | 'conv2': 'vggish/conv2/Relu', 30 | 'pool2': 'vggish/pool2/MaxPool', 31 | 'conv3': 'vggish/conv3/conv3_2/Relu', 32 | 'pool3': 'vggish/pool3/MaxPool', 33 | 'conv4': 'vggish/conv4/conv4_2/Relu', 34 | 'pool4': 'vggish/pool4/MaxPool', 35 | 'fc1': 'vggish/fc1/fc1_2/Relu', 36 | 'fc2': 'vggish/fc2/Relu', 37 | 'embedding': 'vggish/embedding', 38 | 'features': 'vggish/input_features', 39 | } 40 | g = tf.get_default_graph() 41 | for k in layers: 42 | layers[k] = g.get_tensor_by_name( layers[k] + ':0') 43 | 44 | return {'features': features_tensor, 45 | 'embedding': embedding_tensor, 46 | 'layers': layers, 47 | } 48 | 49 | def EmbeddingsFromVGGish(sess, vgg, x, sr): 50 | '''Run the VGGish model, starting with a sound (x) at sample rate 51 | (sr). Return a dictionary of embeddings from the different layers 52 | of the model.''' 53 | # Produce a batch of log mel spectrogram examples. 54 | input_batch = vggish_input.waveform_to_examples(x, sr) 55 | # print('Log Mel Spectrogram example: ', input_batch[0]) 56 | 57 | layer_names = vgg['layers'].keys() 58 | tensors = [vgg['layers'][k] for k in layer_names] 59 | 60 | results = sess.run(tensors, 61 | feed_dict={vgg['features']: input_batch}) 62 | 63 | resdict = {} 64 | for i, k in enumerate(layer_names): 65 | resdict[k] = results[i] 66 | 67 | return resdict 68 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_input.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | """Compute input examples for VGGish from audio waveform.""" 23 | 24 | import numpy as np 25 | import resampy 26 | 27 | from . import mel_features 28 | from . import vggish_params 29 | 30 | import soundfile as sf 31 | 32 | 33 | def waveform_to_examples(data, sample_rate): 34 | """Converts audio waveform into an array of examples for VGGish. 35 | 36 | Args: 37 | data: np.array of either one dimension (mono) or two dimensions 38 | (multi-channel, with the outer dimension representing channels). 
39 | Each sample is generally expected to lie in the range [-1.0, +1.0], 40 | although this is not required. 41 | sample_rate: Sample rate of data. 42 | 43 | Returns: 44 | 3-D np.array of shape [num_examples, num_frames, num_bands] which represents 45 | a sequence of examples, each of which contains a patch of log mel 46 | spectrogram, covering num_frames frames of audio and num_bands mel frequency 47 | bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. 48 | """ 49 | # Convert to mono. 50 | if len(data.shape) > 1: 51 | data = np.mean(data, axis=1) 52 | # Resample to the rate assumed by VGGish. 53 | if sample_rate != vggish_params.SAMPLE_RATE: 54 | data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) 55 | 56 | # Compute log mel spectrogram features. 57 | log_mel = mel_features.log_mel_spectrogram( 58 | data, 59 | audio_sample_rate=vggish_params.SAMPLE_RATE, 60 | log_offset=vggish_params.LOG_OFFSET, 61 | window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, 62 | hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, 63 | num_mel_bins=vggish_params.NUM_MEL_BINS, 64 | lower_edge_hertz=vggish_params.MEL_MIN_HZ, 65 | upper_edge_hertz=vggish_params.MEL_MAX_HZ) 66 | 67 | # Frame features into examples. 68 | features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS 69 | example_window_length = int(round( 70 | vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) 71 | example_hop_length = int(round( 72 | vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) 73 | log_mel_examples = mel_features.frame( 74 | log_mel, 75 | window_length=example_window_length, 76 | hop_length=example_hop_length) 77 | return log_mel_examples 78 | 79 | 80 | def wavfile_to_examples(wav_file): 81 | """Convenience wrapper around waveform_to_examples() for a common WAV format. 82 | 83 | Args: 84 | wav_file: String path to a file, or a file-like object. The file 85 | is assumed to contain WAV audio data with signed 16-bit PCM samples. 86 | 87 | Returns: 88 | See waveform_to_examples. 89 | """ 90 | wav_data, sr = sf.read(wav_file, dtype='int16') 91 | assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype 92 | samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] 93 | return waveform_to_examples(samples, sr) 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # videogame_highlights 2 | Machine Learning to automatically generate highlights from videogame streams 3 | 4 | Getting Started 5 | =============== 6 | 7 | Prerequisites 8 | ------------- 9 | This package was written and tested for `Python 3.7.3` compiled using `GCC 7.3.0` in a `conda` environment. 10 | The following packages (and their dependencies) would need to be installed. 11 | The versions listed have been tested in my environment and works, but likely any more recent or backwards compatible version of those packages would also work. 
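After installing the packages listed below, a quick sanity check of import names and versions can help confirm the environment (a minimal, optional sketch; note that `pytorch-cpu`, `scikit-learn`, and `pysoundfile` import as `torch`, `sklearn`, and `soundfile`, and that `ffmpeg` is a system binary checked with `ffmpeg -version` rather than a Python import):

```
import importlib

# Import names for the conda/pip packages listed below; ffmpeg is not a
# Python module and is therefore not included here.
modules = ["numpy", "scipy", "tensorflow", "torch", "sklearn",
           "resampy", "six", "librosa", "moviepy", "soundfile"]
for name in modules:
    try:
        module = importlib.import_module(name)
        print(f"{name}: {getattr(module, '__version__', 'unknown')}")
    except ImportError as err:
        print(f"{name}: MISSING ({err})")
```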
12 | 
13 | *Install with conda*
14 | - [`numpy=1.16.3`](http://www.numpy.org/)
15 | - [`scipy=1.2.1`](http://www.scipy.org/)
16 | - [`tensorflow=1.13.1`](http://www.tensorflow.org/)
17 | - [`pytorch-cpu=1.1.0`](https://pytorch.org/)
18 | - [`scikit-learn=0.21.1`](https://scikit-learn.org/)
19 | - [`resampy=0.2.1`](http://resampy.readthedocs.io/en/latest/)
20 | - [`six=1.12.0`](https://pythonhosted.org/six/)
21 | - [`librosa=0.6.3`](https://librosa.github.io/librosa/)
22 | - [`ffmpeg=4.1.3`](https://ffmpeg.org/)
23 | 
24 | *Install with pip*
25 | - [`moviepy=1.0.0`](https://zulko.github.io/moviepy/)
26 | - [`pysoundfile=0.9.0.post1`](https://pysoundfile.readthedocs.io/)
27 | 
28 | 
29 | Loki Installation
30 | -----------------
31 | 
32 | Once the dependencies are installed, do:
33 | ```
34 | cd build
35 | source add_path.sh
36 | ```
37 | 
38 | The `add_path.sh` script adds the relevant directories to the PYTHONPATH variable.
39 | It also sets the environment variables needed to locate the required checkpoint files.
40 | It also checks for the required VGGish checkpoint file and downloads it with `wget` if it is not found.
41 | The checkpoint must be downloaded inside the build directory for the neural network classifier to work.
42 | It is available in TensorFlow checkpoint format at: [VGGish model checkpoint](https://storage.googleapis.com/audioset/vggish_model.ckpt).
43 | 
44 | The methods in the `loki` analysis package should then be usable as:
45 | 
46 | ```
47 | import loki
48 | 
49 | clips = loki.VideoClips(["example.mp4"])
50 | ```
51 | 
52 | The module `loki.functions.helper` provides convenience functions for processing video files and outputting trained models.
53 | 
54 | Example
55 | =======
56 | To run the example and test the `loki` package, do:
57 | 
58 | ```
59 | cd example
60 | python -m example_script
61 | ```
62 | If there are no errors, this example script will train a model to identify when there is a loud banging sound in a video.
63 | It demonstrates this in the following steps:
64 | 1. Load the local .mp4 files.
65 | 2. Train a neural network classifier on the video data to identify interesting moments. In this case, "interesting" means banging on the tin lid.
66 | 3. Perform inference on the training data and print out the confusion matrix.
67 | 4. Compute an interest-versus-time trace for the test mp4 file.
68 | 5. Find the most interesting 1-second segment and 3-second segment in the test mp4 file.
69 | 
70 | To understand how the more primitive classes work, look in `loki.functions.helper` to see how the helper functions use those classes.
71 | 
72 | Developer Notes
73 | ===============
74 | The .gitignore file ignores all files by default. If you want to add a
75 | new file or filetype to the repo, you must amend the .gitignore file.
76 | 
77 | Acknowledgements
78 | ================
79 | This project was developed as a consulting project at the Insight Artificial Intelligence Program.
80 | I am grateful for the support and guidance the Insight community provided.
81 | I also want to thank the company I consulted with, [Visor](https://visor.gg/), for providing video files to train and test the model.
82 | 
83 | This application uses Open Source components, specifically the files contained in `loki/models/vggish_tensorflow/`. You can find the source code of their open source projects along with license information below. We acknowledge and are grateful to these developers for their contributions to open source.
84 | 85 | Project: vggish https://github.com/tensorflow/models/tree/master/research/audioset/vggish 86 | 87 | Copyright 2016 The TensorFlow Authors. All rights reserved. 88 | 89 | License: Apache License 2.0 https://github.com/tensorflow/models/blob/master/LICENSE 90 | -------------------------------------------------------------------------------- /loki/models/util.py: -------------------------------------------------------------------------------- 1 | """Utility functions for the models sub-package""" 2 | import numpy as np 3 | 4 | def sort_scores_and_remove_overlap(n_top, scores, clip_indices): 5 | """Sort based on the inputted scores and return the n_top scores. 6 | 7 | Overlap is determined where the scene with the highest score is 8 | kept. Subsequent scenes with overlapping time indices are then 9 | ignored. This process is repeated until n_top non-overlapping scenes 10 | are found. 11 | 12 | Scores can be any value that characterizes the interest level of a 13 | scene, with the assumption that higher scores = higher interest. 14 | For example, this could be the average volume of a scene or some 15 | inferred interest level from some classifier. 16 | 17 | Arguments: 18 | ---------- 19 | n_top -- int: 20 | The number of top scoring scenes to return. 21 | scores -- np.ndarray(N,): 22 | The score for each of the N scenes, where higher numbers 23 | translate to more relevant scenes. 24 | clip_indices -- list([int, float, float]): 25 | List of video indices, and time stamps in seconds for each 26 | scene. 27 | 28 | Return: 29 | ------- 30 | best_scores -- np.ndarray(float): 31 | The score for the corresponding scene. 32 | best_scenes -- np.ndarray(float(n_top,3)): 33 | The highest scoring scenes formatted as 34 | [video index, start time, stop time] 35 | """ 36 | #argsort sorts lowest to highest so negate the score 37 | sort_indices = np.argsort(scores * -1) 38 | n_scenes = len(sort_indices) 39 | 40 | #use a while loop until n_top are found, hopefully this is short 41 | n_found = 0 #count number of non-overlapping scores found 42 | scene_index = 0 #keep track of number of scenes 43 | best_scenes = np.zeros((n_top,3)) 44 | best_scores = [] 45 | while n_found < n_top and scene_index < n_scenes: 46 | #terminate the while loop if every scene is checked. 47 | this_idx = sort_indices[scene_index] 48 | this_scene = clip_indices[this_idx] 49 | this_score = scores[this_idx] 50 | #check if overlapping 51 | if not is_overlapping(best_scenes, this_scene): 52 | best_scenes[n_found,:] = this_scene 53 | best_scores.append(this_score) 54 | #increment found index by 1 55 | n_found += 1 56 | 57 | #increment scene_index by 1 58 | scene_index += 1 59 | 60 | return best_scores, best_scenes 61 | 62 | def is_overlapping(all_scenes, check_scene): 63 | """Check the check_scene against all_scenes for overlap 64 | 65 | check_overlap() returns True if there is any overlap with previous 66 | scenes. The Format of each check_scene and elements in all_scenes is 67 | the same. The first element is an integer that denotes the video 68 | index the scene is from. The next two elements are floats that 69 | denote the start and stop times respectively. Therefore, check_scene 70 | is not overlapping if its from a different video than a scene in 71 | all_scenes. If they are from the same video, then they are not 72 | overlapping if check_scene finishes before or starts after the 73 | other scene. 74 | 75 | Arguments: 76 | ---------- 77 | all_scenes -- list(list([int, float, float])): 78 | List of all scenes to check against. 
79 | check_scene -- list([int, float, float]): 80 | The scene you want to check for. 81 | 82 | Return: 83 | ------- 84 | bool 85 | """ 86 | for scene in all_scenes: 87 | #check if it's the same video 88 | if scene[0] == check_scene[0]: 89 | #check if there's any overlap 90 | #first two check if the `scene` happens after `check_scene` 91 | #last two checks if the `scene` happens before `check_scene` 92 | #If all the checks are true, then keep going 93 | #If one of the checks fail, break from loop and return True 94 | if not (scene[1] > check_scene[1] and scene[1] > check_scene[2] and scene[2] < check_scene[1] and scene[2] < check_scene[2]): 95 | return True 96 | 97 | #If the function gets here, there is no overlap 98 | return False 99 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_smoke_test.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | """A smoke test for VGGish. 23 | 24 | This is a simple smoke test of a local install of VGGish and its associated 25 | downloaded files. We create a synthetic sound, extract log mel spectrogram 26 | features, run them through VGGish, post-process the embedding ouputs, and 27 | check some simple statistics of the results, allowing for variations that 28 | might occur due to platform/version differences in the libraries we use. 29 | 30 | Usage: 31 | - Download the VGGish checkpoint and PCA parameters into the same directory as 32 | the VGGish source code. If you keep them elsewhere, update the checkpoint_path 33 | and pca_params_path variables below. 34 | - Run: 35 | $ python vggish_smoke_test.py 36 | """ 37 | 38 | from __future__ import print_function 39 | 40 | import numpy as np 41 | import tensorflow as tf 42 | 43 | import vggish_input 44 | import vggish_params 45 | import vggish_postprocess 46 | import vggish_slim 47 | 48 | print('\nTesting your install of VGGish\n') 49 | 50 | # Paths to downloaded VGGish files. 51 | checkpoint_path = 'vggish_model.ckpt' 52 | pca_params_path = 'vggish_pca_params.npz' 53 | 54 | # Relative tolerance of errors in mean and standard deviation of embeddings. 55 | rel_error = 0.1 # Up to 10% 56 | 57 | # Generate a 1 kHz sine wave at 44.1 kHz (we use a high sampling rate 58 | # to test resampling to 16 kHz during feature extraction). 
59 | num_secs = 3 60 | freq = 1000 61 | sr = 44100 62 | t = np.linspace(0, num_secs, int(num_secs * sr)) 63 | x = np.sin(2 * np.pi * freq * t) 64 | 65 | # Produce a batch of log mel spectrogram examples. 66 | input_batch = vggish_input.waveform_to_examples(x, sr) 67 | print('Log Mel Spectrogram example: ', input_batch[0]) 68 | np.testing.assert_equal( 69 | input_batch.shape, 70 | [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS]) 71 | 72 | # Define VGGish, load the checkpoint, and run the batch through the model to 73 | # produce embeddings. 74 | with tf.Graph().as_default(), tf.Session() as sess: 75 | vggish_slim.define_vggish_slim() 76 | vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path) 77 | 78 | features_tensor = sess.graph.get_tensor_by_name( 79 | vggish_params.INPUT_TENSOR_NAME) 80 | embedding_tensor = sess.graph.get_tensor_by_name( 81 | vggish_params.OUTPUT_TENSOR_NAME) 82 | [embedding_batch] = sess.run([embedding_tensor], 83 | feed_dict={features_tensor: input_batch}) 84 | print('VGGish embedding: ', embedding_batch[0]) 85 | expected_embedding_mean = 0.131 86 | expected_embedding_std = 0.238 87 | np.testing.assert_allclose( 88 | [np.mean(embedding_batch), np.std(embedding_batch)], 89 | [expected_embedding_mean, expected_embedding_std], 90 | rtol=rel_error) 91 | 92 | # Postprocess the results to produce whitened quantized embeddings. 93 | pproc = vggish_postprocess.Postprocessor(pca_params_path) 94 | postprocessed_batch = pproc.postprocess(embedding_batch) 95 | print('Postprocessed VGGish embedding: ', postprocessed_batch[0]) 96 | expected_postprocessed_mean = 123.0 97 | expected_postprocessed_std = 75.0 98 | np.testing.assert_allclose( 99 | [np.mean(postprocessed_batch), np.std(postprocessed_batch)], 100 | [expected_postprocessed_mean, expected_postprocessed_std], 101 | rtol=rel_error) 102 | 103 | print('\nLooks Good To Me!\n') 104 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_postprocess.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | """Post-process embeddings from VGGish.""" 23 | 24 | import numpy as np 25 | 26 | import vggish_params 27 | 28 | 29 | class Postprocessor(object): 30 | """Post-processes VGGish embeddings. 31 | 32 | The initial release of AudioSet included 128-D VGGish embeddings for each 33 | segment of AudioSet. 
These released embeddings were produced by applying 34 | a PCA transformation (technically, a whitening transform is included as well) 35 | and 8-bit quantization to the raw embedding output from VGGish, in order to 36 | stay compatible with the YouTube-8M project which provides visual embeddings 37 | in the same format for a large set of YouTube videos. This class implements 38 | the same PCA (with whitening) and quantization transformations. 39 | """ 40 | 41 | def __init__(self, pca_params_npz_path): 42 | """Constructs a postprocessor. 43 | 44 | Args: 45 | pca_params_npz_path: Path to a NumPy-format .npz file that 46 | contains the PCA parameters used in postprocessing. 47 | """ 48 | params = np.load(pca_params_npz_path) 49 | self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME] 50 | # Load means into a column vector for easier broadcasting later. 51 | self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1) 52 | assert self._pca_matrix.shape == ( 53 | vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), ( 54 | 'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,)) 55 | assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), ( 56 | 'Bad PCA means shape: %r' % (self._pca_means.shape,)) 57 | 58 | def postprocess(self, embeddings_batch): 59 | """Applies postprocessing to a batch of embeddings. 60 | 61 | Args: 62 | embeddings_batch: An nparray of shape [batch_size, embedding_size] 63 | containing output from the embedding layer of VGGish. 64 | 65 | Returns: 66 | An nparray of the same shape as the input but of type uint8, 67 | containing the PCA-transformed and quantized version of the input. 68 | """ 69 | assert len(embeddings_batch.shape) == 2, ( 70 | 'Expected 2-d batch, got %r' % (embeddings_batch.shape,)) 71 | assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, ( 72 | 'Bad batch shape: %r' % (embeddings_batch.shape,)) 73 | 74 | # Apply PCA. 75 | # - Embeddings come in as [batch_size, embedding_size]. 76 | # - Transpose to [embedding_size, batch_size]. 77 | # - Subtract pca_means column vector from each column. 78 | # - Premultiply by PCA matrix of shape [output_dims, input_dims] 79 | # where both are are equal to embedding_size in our case. 80 | # - Transpose result back to [batch_size, embedding_size]. 81 | pca_applied = np.dot(self._pca_matrix, 82 | (embeddings_batch.T - self._pca_means)).T 83 | 84 | # Quantize by: 85 | # - clipping to [min, max] range 86 | clipped_embeddings = np.clip( 87 | pca_applied, vggish_params.QUANTIZE_MIN_VAL, 88 | vggish_params.QUANTIZE_MAX_VAL) 89 | # - convert to 8-bit in range [0.0, 255.0] 90 | quantized_embeddings = ( 91 | (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) * 92 | (255.0 / 93 | (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL))) 94 | # - cast 8-bit float to uint8 95 | quantized_embeddings = quantized_embeddings.astype(np.uint8) 96 | 97 | return quantized_embeddings 98 | -------------------------------------------------------------------------------- /loki/processing/load.py: -------------------------------------------------------------------------------- 1 | """Class and methods for handling loading of video files""" 2 | from moviepy.editor import VideoFileClip 3 | import librosa 4 | 5 | def append_clips(first, second): 6 | """Append two different VideoClips objects 7 | 8 | Arguments: 9 | ---------- 10 | first -- loki.VideoClips: 11 | These filenames will go first. 12 | second -- loki.VideoClips: 13 | These filenames will follow the filenames in first. 
14 | 15 | Return: 16 | ------- 17 | vclips -- loki.VideoClips: 18 | A new VideoClips object with both sets of filenames stored. 19 | """ 20 | #collect the filenames 21 | all_filenames = [] 22 | for fil in first.filenames: 23 | all_filenames.append(fil) 24 | for fil in second.filenames: 25 | all_filenames.append(fil) 26 | 27 | #make the new VideoClips, does not support saving audio information 28 | vclips = VideoClips(all_filenames) 29 | 30 | return vclips 31 | 32 | class VideoClips(): 33 | """Load multiple videos and write out relevant clips/audio 34 | 35 | VideoClips stores a series of video files and provides methods for 36 | editing and outputting clips from the video. 37 | 38 | Arguments: 39 | ---------- 40 | filenames -- list(str): 41 | List of video filenames to load. 42 | 43 | Public Methods: 44 | --------------- 45 | save_clips(): 46 | Save subclips of the loaded videofiles. 47 | """ 48 | 49 | def __init__(self, filenames): 50 | #save the filenames and avoid pass by reference errors 51 | self.filenames = filenames[:] 52 | 53 | #these are attributes resultant from later analysis 54 | self.audio_freq = None 55 | self.audios = None 56 | 57 | @property 58 | def nclips(self): 59 | return len(self.filenames) 60 | 61 | def write_clips(self, time_stamps, write_fps=12, write_ext=".mp4", write_names=None): 62 | """Write selected clips to a file 63 | 64 | Save out N clips from the previously stored video clips. 65 | 66 | Arguments: 67 | ---------- 68 | time_stamps -- Nx3 list or np.ndarray: 69 | Nx3 List giving the video index, followed by the start and 70 | stop times in seconds. 71 | 72 | Keyword Arguments: 73 | ------------------ 74 | write_fps -- int -- default=12: 75 | Frames per a second to write out. 76 | write_ext -- str -- default="mp4": 77 | File extension format to save with. 78 | write_names -- list(str) -- default=None: 79 | List of len(N) to write output files to. If None, a default 80 | name format will be used. 81 | """ 82 | 83 | #If write_names was not given, use a generic name output format 84 | if write_names is None: 85 | write_names = [] 86 | for stamp in time_stamps: 87 | vid_idx = int(stamp[0]) 88 | start_t = stamp[1] 89 | end_t = stamp[2] 90 | write_names.append(f"vid{vid_idx}_{start_t}-{end_t}{write_ext}") 91 | 92 | #Iterate over time_stamps and write out the specified clips 93 | for i_count, stamp in enumerate(time_stamps): 94 | this_vid = VideoFileClip(self.filenames[0]) 95 | clip = this_vid.subclip(stamp[1], stamp[2]) 96 | clip.write_videofile(write_names[i_count], fps=write_fps) 97 | clip.close() 98 | 99 | def compute_audio_waveform(self, freq=44100, mono=False): 100 | """Compute the binaural audio time series 101 | 102 | For each video stored, extract the binaural audio. This audio 103 | is then stored in the attribute self.audios, but also returns 104 | the list for use in further functions. 105 | 106 | Keyword Arguments: 107 | ------------------ 108 | freq -- int -- default=44100: 109 | Frequency of the computed sound in Hz. Default is 44.1 kHz. 110 | mono -- bool -- default=False: 111 | If True, return mono-channel instead of binaural audio. 112 | 113 | Return: 114 | ------- 115 | audios -- list(np.ndarray): 116 | Return a list of audio waveforms. 
117 | """ 118 | 119 | self.audio_freq = 44100 120 | #only extract audio once (saves time) 121 | if self.audios is None: 122 | self.audios = [] 123 | for fname in self.filenames: 124 | clip = VideoFileClip(fname) 125 | audio = clip.audio 126 | wav = audio.to_soundarray(fps=freq) 127 | clip.close() 128 | #convert to mono-channel 129 | if mono: 130 | #librosa requires shape (2,N), moviepy gives shape (N,2) 131 | wav = librosa.to_mono(wav.transpose()) 132 | self.audios.append(wav) 133 | 134 | return self.audios 135 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_slim.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | """Defines the 'VGGish' model used to generate AudioSet embedding features. 23 | 24 | The public AudioSet release (https://research.google.com/audioset/download.html) 25 | includes 128-D features extracted from the embedding layer of a VGG-like model 26 | that was trained on a large Google-internal YouTube dataset. Here we provide 27 | a TF-Slim definition of the same model, without any dependences on libraries 28 | internal to Google. We call it 'VGGish'. 29 | 30 | Note that we only define the model up to the embedding layer, which is the 31 | penultimate layer before the final classifier layer. We also provide various 32 | hyperparameter values (in vggish_params.py) that were used to train this model 33 | internally. 34 | 35 | For comparison, here is TF-Slim's VGG definition: 36 | https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py 37 | """ 38 | 39 | import tensorflow as tf 40 | from . import vggish_params as params 41 | 42 | slim = tf.contrib.slim 43 | 44 | 45 | def define_vggish_slim(training=False): 46 | """Defines the VGGish TensorFlow model. 47 | 48 | All ops are created in the current default graph, under the scope 'vggish/'. 49 | 50 | The input is a placeholder named 'vggish/input_features' of type float32 and 51 | shape [batch_size, num_frames, num_bands] where batch_size is variable and 52 | num_frames and num_bands are constants, and [num_frames, num_bands] represents 53 | a log-mel-scale spectrogram patch covering num_bands frequency bands and 54 | num_frames time frames (where each frame step is usually 10ms). This is 55 | produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET). 
56 | The output is an op named 'vggish/embedding' which produces the activations of 57 | a 128-D embedding layer, which is usually the penultimate layer when used as 58 | part of a full model with a final classifier layer. 59 | 60 | Args: 61 | training: If true, all parameters are marked trainable. 62 | 63 | Returns: 64 | The op 'vggish/embeddings'. 65 | """ 66 | # Defaults: 67 | # - All weights are initialized to N(0, INIT_STDDEV). 68 | # - All biases are initialized to 0. 69 | # - All activations are ReLU. 70 | # - All convolutions are 3x3 with stride 1 and SAME padding. 71 | # - All max-pools are 2x2 with stride 2 and SAME padding. 72 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 73 | weights_initializer=tf.truncated_normal_initializer( 74 | stddev=params.INIT_STDDEV), 75 | biases_initializer=tf.zeros_initializer(), 76 | activation_fn=tf.nn.relu, 77 | trainable=training), \ 78 | slim.arg_scope([slim.conv2d], 79 | kernel_size=[3, 3], stride=1, padding='SAME'), \ 80 | slim.arg_scope([slim.max_pool2d], 81 | kernel_size=[2, 2], stride=2, padding='SAME'), \ 82 | tf.variable_scope('vggish'): 83 | # Input: a batch of 2-D log-mel-spectrogram patches. 84 | features = tf.placeholder( 85 | tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS), 86 | name='input_features') 87 | # Reshape to 4-D so that we can convolve a batch with conv2d(). 88 | net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1]) 89 | 90 | # The VGG stack of alternating convolutions and max-pools. 91 | net = slim.conv2d(net, 64, scope='conv1') 92 | net = slim.max_pool2d(net, scope='pool1') 93 | net = slim.conv2d(net, 128, scope='conv2') 94 | net = slim.max_pool2d(net, scope='pool2') 95 | net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3') 96 | net = slim.max_pool2d(net, scope='pool3') 97 | net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4') 98 | net = slim.max_pool2d(net, scope='pool4') 99 | 100 | # Flatten before entering fully-connected layers 101 | net = slim.flatten(net) 102 | net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1') 103 | # The embedding layer. 104 | net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2') 105 | return tf.identity(net, name='embedding') 106 | 107 | 108 | def load_vggish_slim_checkpoint(session, checkpoint_path): 109 | """Loads a pre-trained VGGish-compatible checkpoint. 110 | 111 | This function can be used as an initialization function (referred to as 112 | init_fn in TensorFlow documentation) which is called in a Session after 113 | initializating all variables. When used as an init_fn, this will load 114 | a pre-trained checkpoint that is compatible with the VGGish model 115 | definition. Only variables defined by VGGish will be loaded. 116 | 117 | Args: 118 | session: an active TensorFlow session. 119 | checkpoint_path: path to a file containing a checkpoint that is 120 | compatible with the VGGish model definition. 121 | """ 122 | # Get the list of names of all VGGish variables that exist in 123 | # the checkpoint (i.e., all inference-mode VGGish variables). 124 | with tf.Graph().as_default(): 125 | define_vggish_slim(training=False) 126 | vggish_var_names = [v.name for v in tf.global_variables()] 127 | 128 | # Get the list of all currently existing variables that match 129 | # the list of variable names we just computed. 130 | vggish_vars = [v for v in tf.global_variables() if v.name in vggish_var_names] 131 | 132 | # Use a Saver to restore just the variables selected above. 
133 | saver = tf.train.Saver(vggish_vars, name='vggish_load_pretrained', 134 | write_version=1) 135 | saver.restore(session, checkpoint_path) 136 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_inference_demo.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | r"""A simple demonstration of running VGGish in inference mode. 23 | 24 | This is intended as a toy example that demonstrates how the various building 25 | blocks (feature extraction, model definition and loading, postprocessing) work 26 | together in an inference context. 27 | 28 | A WAV file (assumed to contain signed 16-bit PCM samples) is read in, converted 29 | into log mel spectrogram examples, fed into VGGish, the raw embedding output is 30 | whitened and quantized, and the postprocessed embeddings are optionally written 31 | in a SequenceExample to a TFRecord file (using the same format as the embedding 32 | features released in AudioSet). 33 | 34 | Usage: 35 | # Run a WAV file through the model and print the embeddings. The model 36 | # checkpoint is loaded from vggish_model.ckpt and the PCA parameters are 37 | # loaded from vggish_pca_params.npz in the current directory. 38 | $ python vggish_inference_demo.py --wav_file /path/to/a/wav/file 39 | 40 | # Run a WAV file through the model and also write the embeddings to 41 | # a TFRecord file. The model checkpoint and PCA parameters are explicitly 42 | # passed in as well. 43 | $ python vggish_inference_demo.py --wav_file /path/to/a/wav/file \ 44 | --tfrecord_file /path/to/tfrecord/file \ 45 | --checkpoint /path/to/model/checkpoint \ 46 | --pca_params /path/to/pca/params 47 | 48 | # Run a built-in input (a sine wav) through the model and print the 49 | # embeddings. Associated model files are read from the current directory. 50 | $ python vggish_inference_demo.py 51 | """ 52 | 53 | from __future__ import print_function 54 | 55 | import numpy as np 56 | from scipy.io import wavfile 57 | import six 58 | import tensorflow as tf 59 | 60 | import vggish_input 61 | import vggish_params 62 | import vggish_postprocess 63 | import vggish_slim 64 | 65 | flags = tf.app.flags 66 | 67 | flags.DEFINE_string( 68 | 'wav_file', None, 69 | 'Path to a wav file. Should contain signed 16-bit PCM samples. 
' 70 | 'If none is provided, a synthetic sound is used.') 71 | 72 | flags.DEFINE_string( 73 | 'checkpoint', 'vggish_model.ckpt', 74 | 'Path to the VGGish checkpoint file.') 75 | 76 | flags.DEFINE_string( 77 | 'pca_params', 'vggish_pca_params.npz', 78 | 'Path to the VGGish PCA parameters file.') 79 | 80 | flags.DEFINE_string( 81 | 'tfrecord_file', None, 82 | 'Path to a TFRecord file where embeddings will be written.') 83 | 84 | FLAGS = flags.FLAGS 85 | 86 | 87 | def main(_): 88 | # In this simple example, we run the examples from a single audio file through 89 | # the model. If none is provided, we generate a synthetic input. 90 | if FLAGS.wav_file: 91 | wav_file = FLAGS.wav_file 92 | else: 93 | # Write a WAV of a sine wav into an in-memory file object. 94 | num_secs = 5 95 | freq = 1000 96 | sr = 44100 97 | t = np.linspace(0, num_secs, int(num_secs * sr)) 98 | x = np.sin(2 * np.pi * freq * t) 99 | # Convert to signed 16-bit samples. 100 | samples = np.clip(x * 32768, -32768, 32767).astype(np.int16) 101 | wav_file = six.BytesIO() 102 | wavfile.write(wav_file, sr, samples) 103 | wav_file.seek(0) 104 | examples_batch = vggish_input.wavfile_to_examples(wav_file) 105 | print(examples_batch) 106 | 107 | # Prepare a postprocessor to munge the model embeddings. 108 | pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params) 109 | 110 | # If needed, prepare a record writer to store the postprocessed embeddings. 111 | writer = tf.python_io.TFRecordWriter( 112 | FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None 113 | 114 | with tf.Graph().as_default(), tf.Session() as sess: 115 | # Define the model in inference mode, load the checkpoint, and 116 | # locate input and output tensors. 117 | vggish_slim.define_vggish_slim(training=False) 118 | vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint) 119 | features_tensor = sess.graph.get_tensor_by_name( 120 | vggish_params.INPUT_TENSOR_NAME) 121 | embedding_tensor = sess.graph.get_tensor_by_name( 122 | vggish_params.OUTPUT_TENSOR_NAME) 123 | 124 | # Run inference and postprocessing. 125 | [embedding_batch] = sess.run([embedding_tensor], 126 | feed_dict={features_tensor: examples_batch}) 127 | print(embedding_batch) 128 | postprocessed_batch = pproc.postprocess(embedding_batch) 129 | print(postprocessed_batch) 130 | 131 | # Write the postprocessed embeddings as a SequenceExample, in a similar 132 | # format as the features released in AudioSet. Each row of the batch of 133 | # embeddings corresponds to roughly a second of audio (96 10ms frames), and 134 | # the rows are written as a sequence of bytes-valued features, where each 135 | # feature value contains the 128 bytes of the whitened quantized embedding. 
136 | seq_example = tf.train.SequenceExample( 137 | feature_lists=tf.train.FeatureLists( 138 | feature_list={ 139 | vggish_params.AUDIO_EMBEDDING_FEATURE_NAME: 140 | tf.train.FeatureList( 141 | feature=[ 142 | tf.train.Feature( 143 | bytes_list=tf.train.BytesList( 144 | value=[embedding.tobytes()])) 145 | for embedding in postprocessed_batch 146 | ] 147 | ) 148 | } 149 | ) 150 | ) 151 | print(seq_example) 152 | if writer: 153 | writer.write(seq_example.SerializeToString()) 154 | 155 | if writer: 156 | writer.close() 157 | 158 | if __name__ == '__main__': 159 | tf.app.run() 160 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/README.md: -------------------------------------------------------------------------------- 1 | # VGGish 2 | 3 | The initial AudioSet release included 128-dimensional embeddings of each 4 | AudioSet segment produced from a VGG-like audio classification model that was 5 | trained on a large YouTube dataset (a preliminary version of what later became 6 | [YouTube-8M](https://research.google.com/youtube8m)). 7 | 8 | We provide a TensorFlow definition of this model, which we call __*VGGish*__, as 9 | well as supporting code to extract input features for the model from audio 10 | waveforms and to post-process the model embedding output into the same format as 11 | the released embedding features. 12 | 13 | ## Installation 14 | 15 | VGGish depends on the following Python packages: 16 | 17 | * [`numpy`](http://www.numpy.org/) 18 | * [`scipy`](http://www.scipy.org/) 19 | * [`resampy`](http://resampy.readthedocs.io/en/latest/) 20 | * [`tensorflow`](http://www.tensorflow.org/) 21 | * [`six`](https://pythonhosted.org/six/) 22 | * [`pysoundfile`](https://pysoundfile.readthedocs.io/) 23 | 24 | These are all easily installable via, e.g., `pip install numpy` (as in the 25 | example command sequence below). 26 | 27 | Any reasonably recent version of these packages should work. TensorFlow should 28 | be at least version 1.0. We have tested that everything works on Ubuntu and 29 | Windows 10 with Python 3.6.6, Numpy v1.15.4, SciPy v1.1.0, resampy v0.2.1, 30 | TensorFlow v1.3.0, Six v1.11.0 and PySoundFile 0.9.0. 31 | 32 | VGGish also requires downloading two data files: 33 | 34 | * [VGGish model checkpoint](https://storage.googleapis.com/audioset/vggish_model.ckpt), 35 | in TensorFlow checkpoint format. 36 | * [Embedding PCA parameters](https://storage.googleapis.com/audioset/vggish_pca_params.npz), 37 | in NumPy compressed archive format. 38 | 39 | After downloading these files into the same directory as this README, the 40 | installation can be tested by running `python vggish_smoke_test.py` which 41 | runs a known signal through the model and checks the output. 42 | 43 | Here's a sample installation and test session: 44 | 45 | ```shell 46 | # You can optionally install and test VGGish within a Python virtualenv, which 47 | # is useful for isolating changes from the rest of your system. For example, you 48 | # may have an existing version of some packages that you do not want to upgrade, 49 | # or you want to try Python 3 instead of Python 2. 
If you decide to use a 50 | # virtualenv, you can create one by running 51 | # $ virtualenv vggish # For Python 2 52 | # or 53 | # $ python3 -m venv vggish # For Python 3 54 | # and then enter the virtual environment by running 55 | # $ source vggish/bin/activate # Assuming you use bash 56 | # Leave the virtual environment at the end of the session by running 57 | # $ deactivate 58 | # Within the virtual environment, do not use 'sudo'. 59 | 60 | # Upgrade pip first. 61 | $ sudo python -m pip install --upgrade pip 62 | 63 | # Install dependences. Resampy needs to be installed after NumPy and SciPy 64 | # are already installed. 65 | $ sudo pip install numpy scipy 66 | $ sudo pip install resampy tensorflow six 67 | 68 | # Clone TensorFlow models repo into a 'models' directory. 69 | $ git clone https://github.com/tensorflow/models.git 70 | $ cd models/research/audioset 71 | # Download data files into same directory as code. 72 | $ curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt 73 | $ curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz 74 | 75 | # Installation ready, let's test it. 76 | $ python vggish_smoke_test.py 77 | # If we see "Looks Good To Me", then we're all set. 78 | ``` 79 | 80 | ## Usage 81 | 82 | VGGish can be used in two ways: 83 | 84 | * *As a feature extractor*: VGGish converts audio input features into a 85 | semantically meaningful, high-level 128-D embedding which can be fed as input 86 | to a downstream classification model. The downstream model can be shallower 87 | than usual because the VGGish embedding is more semantically compact than raw 88 | audio features. 89 | 90 | So, for example, you could train a classifier for 10 of the AudioSet classes 91 | by using the released embeddings as features. Then, you could use that 92 | trained classifier with any arbitrary audio input by running the audio through 93 | the audio feature extractor and VGGish model provided here, passing the 94 | resulting embedding features as input to your trained model. 95 | `vggish_inference_demo.py` shows how to produce VGGish embeddings from 96 | arbitrary audio. 97 | 98 | * *As part of a larger model*: Here, we treat VGGish as a "warm start" for the 99 | lower layers of a model that takes audio features as input and adds more 100 | layers on top of the VGGish embedding. This can be used to fine-tune VGGish 101 | (or parts thereof) if you have large datasets that might be very different 102 | from the typical YouTube video clip. `vggish_train_demo.py` shows how to add 103 | layers on top of VGGish and train the whole model. 104 | 105 | ## About the Model 106 | 107 | The VGGish code layout is as follows: 108 | 109 | * `vggish_slim.py`: Model definition in TensorFlow Slim notation. 110 | * `vggish_params.py`: Hyperparameters. 111 | * `vggish_input.py`: Converter from audio waveform into input examples. 112 | * `mel_features.py`: Audio feature extraction helpers. 113 | * `vggish_postprocess.py`: Embedding postprocessing. 114 | * `vggish_inference_demo.py`: Demo of VGGish in inference mode. 115 | * `vggish_train_demo.py`: Demo of VGGish in training mode. 116 | * `vggish_smoke_test.py`: Simple test of a VGGish installation 117 | 118 | ### Architecture 119 | 120 | See `vggish_slim.py` and `vggish_params.py`. 121 | 122 | VGGish is a variant of the [VGG](https://arxiv.org/abs/1409.1556) model, in 123 | particular Configuration A with 11 weight layers. 
Specifically, here are the 124 | changes we made: 125 | 126 | * The input size was changed to 96x64 for log mel spectrogram audio inputs. 127 | 128 | * We drop the last group of convolutional and maxpool layers, so we now have 129 | only four groups of convolution/maxpool layers instead of five. 130 | 131 | * Instead of a 1000-wide fully connected layer at the end, we use a 128-wide 132 | fully connected layer. This acts as a compact embedding layer. 133 | 134 | The model definition provided here defines layers up to and including the 135 | 128-wide embedding layer. 136 | 137 | ### Input: Audio Features 138 | 139 | See `vggish_input.py` and `mel_features.py`. 140 | 141 | VGGish was trained with audio features computed as follows: 142 | 143 | * All audio is resampled to 16 kHz mono. 144 | * A spectrogram is computed using magnitudes of the Short-Time Fourier Transform 145 | with a window size of 25 ms, a window hop of 10 ms, and a periodic Hann 146 | window. 147 | * A mel spectrogram is computed by mapping the spectrogram to 64 mel bins 148 | covering the range 125-7500 Hz. 149 | * A stabilized log mel spectrogram is computed by applying 150 | log(mel-spectrum + 0.01) where the offset is used to avoid taking a logarithm 151 | of zero. 152 | * These features are then framed into non-overlapping examples of 0.96 seconds, 153 | where each example covers 64 mel bands and 96 frames of 10 ms each. 154 | 155 | We provide our own NumPy implementation that produces features that are very 156 | similar to those produced by our internal production code. This results in 157 | embedding outputs that are closely match the embeddings that we have already 158 | released. Note that these embeddings will *not* be bit-for-bit identical to the 159 | released embeddings due to small differences between the feature computation 160 | code paths, and even between two different installations of VGGish with 161 | different underlying libraries and hardware. However, we expect that the 162 | embeddings will be equivalent in the context of a downstream classification 163 | task. 164 | 165 | ### Output: Embeddings 166 | 167 | See `vggish_postprocess.py`. 168 | 169 | The released AudioSet embeddings were postprocessed before release by applying a 170 | PCA transformation (which performs both PCA and whitening) as well as 171 | quantization to 8 bits per embedding element. This was done to be compatible 172 | with the [YouTube-8M](https://research.google.com/youtube8m) project which has 173 | released visual and audio embeddings for millions of YouTube videos in the same 174 | PCA/whitened/quantized format. 175 | 176 | We provide a Python implementation of the postprocessing which can be applied to 177 | batches of embeddings produced by VGGish. `vggish_inference_demo.py` shows how 178 | the postprocessor can be run after inference. 179 | 180 | If you don't need to use the released embeddings or YouTube-8M, then you could 181 | skip postprocessing and use raw embeddings. 182 | 183 | A [Colab](https://colab.research.google.com/) 184 | showing how to download the model and calculate the embeddings on your 185 | own sound data is available here: 186 | [AudioSet Embedding Colab](https://colab.research.google.com/drive/1TbX92UL9sYWbdwdGE0rJ9owmezB-Rl1C). 
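For reference, the sketch below shows roughly how the feature-extractor path described above is driven from Python in this repository, using the thin wrappers re-exported from `loki.models.vggish_tensorflow`. The checkpoint path, waveform, and sampling rate are placeholders, and the checkpoint is assumed to have been downloaded as described earlier.

```python
# Rough sketch only: extract 128-D embeddings from a mono waveform.
import numpy as np
import tensorflow as tf

from loki.models.vggish_tensorflow import CreateVGGishNetwork, EmbeddingsFromVGGish

sr = 44100                                   # placeholder sampling rate
waveform = np.random.uniform(-1, 1, 5 * sr)  # placeholder 5-second mono clip

tf.reset_default_graph()
with tf.Session() as sess:
    vgg = CreateVGGishNetwork(sess, "vggish_model.ckpt")
    result = EmbeddingsFromVGGish(sess, vgg, waveform, sr)

print(result["embedding"].shape)             # roughly one 128-D row per 0.96 s of audio
```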
187 | 188 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_train_demo.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | r"""A simple demonstration of running VGGish in training mode. 23 | 24 | This is intended as a toy example that demonstrates how to use the VGGish model 25 | definition within a larger model that adds more layers on top, and then train 26 | the larger model. If you let VGGish train as well, then this allows you to 27 | fine-tune the VGGish model parameters for your application. If you don't let 28 | VGGish train, then you use VGGish as a feature extractor for the layers above 29 | it. 30 | 31 | For this toy task, we are training a classifier to distinguish between three 32 | classes: sine waves, constant signals, and white noise. We generate synthetic 33 | waveforms from each of these classes, convert into shuffled batches of log mel 34 | spectrogram examples with associated labels, and feed the batches into a model 35 | that includes VGGish at the bottom and a couple of additional layers on top. We 36 | also plumb in labels that are associated with the examples, which feed a label 37 | loss used for training. 38 | 39 | Usage: 40 | # Run training for 100 steps using a model checkpoint in the default 41 | # location (vggish_model.ckpt in the current directory). Allow VGGish 42 | # to get fine-tuned. 43 | $ python vggish_train_demo.py --num_batches 100 44 | 45 | # Same as before but run for fewer steps and don't change VGGish parameters 46 | # and use a checkpoint in a different location 47 | $ python vggish_train_demo.py --num_batches 50 \ 48 | --train_vggish=False \ 49 | --checkpoint /path/to/model/checkpoint 50 | """ 51 | 52 | from __future__ import print_function 53 | 54 | from random import shuffle 55 | 56 | import numpy as np 57 | import tensorflow as tf 58 | 59 | import vggish_input 60 | import vggish_params 61 | import vggish_slim 62 | 63 | flags = tf.app.flags 64 | slim = tf.contrib.slim 65 | 66 | flags.DEFINE_integer( 67 | 'num_batches', 30, 68 | 'Number of batches of examples to feed into the model. Each batch is of ' 69 | 'variable size and contains shuffled examples of each class of audio.') 70 | 71 | flags.DEFINE_boolean( 72 | 'train_vggish', True, 73 | 'If True, allow VGGish parameters to change during training, thus ' 74 | 'fine-tuning VGGish. 
If False, VGGish parameters are fixed, thus using ' 75 | 'VGGish as a fixed feature extractor.') 76 | 77 | flags.DEFINE_string( 78 | 'checkpoint', 'vggish_model.ckpt', 79 | 'Path to the VGGish checkpoint file.') 80 | 81 | FLAGS = flags.FLAGS 82 | 83 | _NUM_CLASSES = 3 84 | 85 | 86 | def _get_examples_batch(): 87 | """Returns a shuffled batch of examples of all audio classes. 88 | 89 | Note that this is just a toy function because this is a simple demo intended 90 | to illustrate how the training code might work. 91 | 92 | Returns: 93 | a tuple (features, labels) where features is a NumPy array of shape 94 | [batch_size, num_frames, num_bands] where the batch_size is variable and 95 | each row is a log mel spectrogram patch of shape [num_frames, num_bands] 96 | suitable for feeding VGGish, while labels is a NumPy array of shape 97 | [batch_size, num_classes] where each row is a multi-hot label vector that 98 | provides the labels for corresponding rows in features. 99 | """ 100 | # Make a waveform for each class. 101 | num_seconds = 5 102 | sr = 44100 # Sampling rate. 103 | t = np.linspace(0, num_seconds, int(num_seconds * sr)) # Time axis. 104 | # Random sine wave. 105 | freq = np.random.uniform(100, 1000) 106 | sine = np.sin(2 * np.pi * freq * t) 107 | # Random constant signal. 108 | magnitude = np.random.uniform(-1, 1) 109 | const = magnitude * t 110 | # White noise. 111 | noise = np.random.normal(-1, 1, size=t.shape) 112 | 113 | # Make examples of each signal and corresponding labels. 114 | # Sine is class index 0, Const class index 1, Noise class index 2. 115 | sine_examples = vggish_input.waveform_to_examples(sine, sr) 116 | sine_labels = np.array([[1, 0, 0]] * sine_examples.shape[0]) 117 | const_examples = vggish_input.waveform_to_examples(const, sr) 118 | const_labels = np.array([[0, 1, 0]] * const_examples.shape[0]) 119 | noise_examples = vggish_input.waveform_to_examples(noise, sr) 120 | noise_labels = np.array([[0, 0, 1]] * noise_examples.shape[0]) 121 | 122 | # Shuffle (example, label) pairs across all classes. 123 | all_examples = np.concatenate((sine_examples, const_examples, noise_examples)) 124 | all_labels = np.concatenate((sine_labels, const_labels, noise_labels)) 125 | labeled_examples = list(zip(all_examples, all_labels)) 126 | shuffle(labeled_examples) 127 | 128 | # Separate and return the features and labels. 129 | features = [example for (example, _) in labeled_examples] 130 | labels = [label for (_, label) in labeled_examples] 131 | return (features, labels) 132 | 133 | 134 | def main(_): 135 | with tf.Graph().as_default(), tf.Session() as sess: 136 | # Define VGGish. 137 | embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish) 138 | 139 | # Define a shallow classification model and associated training ops on top 140 | # of VGGish. 141 | with tf.variable_scope('mymodel'): 142 | # Add a fully connected layer with 100 units. 143 | num_units = 100 144 | fc = slim.fully_connected(embeddings, num_units) 145 | 146 | # Add a classifier layer at the end, consisting of parallel logistic 147 | # classifiers, one per class. This allows for multi-class tasks. 148 | logits = slim.fully_connected( 149 | fc, _NUM_CLASSES, activation_fn=None, scope='logits') 150 | tf.sigmoid(logits, name='prediction') 151 | 152 | # Add training ops. 
153 |     with tf.variable_scope('train'):
154 |       global_step = tf.Variable(
155 |           0, name='global_step', trainable=False,
156 |           collections=[tf.GraphKeys.GLOBAL_VARIABLES,
157 |                        tf.GraphKeys.GLOBAL_STEP])
158 | 
159 |       # Labels are assumed to be fed as a batch multi-hot vectors, with
160 |       # a 1 in the position of each positive class label, and 0 elsewhere.
161 |       labels = tf.placeholder(
162 |           tf.float32, shape=(None, _NUM_CLASSES), name='labels')
163 | 
164 |       # Cross-entropy label loss.
165 |       xent = tf.nn.sigmoid_cross_entropy_with_logits(
166 |           logits=logits, labels=labels, name='xent')
167 |       loss = tf.reduce_mean(xent, name='loss_op')
168 |       tf.summary.scalar('loss', loss)
169 | 
170 |       # We use the same optimizer and hyperparameters as used to train VGGish.
171 |       optimizer = tf.train.AdamOptimizer(
172 |           learning_rate=vggish_params.LEARNING_RATE,
173 |           epsilon=vggish_params.ADAM_EPSILON)
174 |       optimizer.minimize(loss, global_step=global_step, name='train_op')
175 | 
176 |     # Initialize all variables in the model, and then load the pre-trained
177 |     # VGGish checkpoint.
178 |     sess.run(tf.global_variables_initializer())
179 |     vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
180 | 
181 |     # Locate all the tensors and ops we need for the training loop.
182 |     features_tensor = sess.graph.get_tensor_by_name(
183 |         vggish_params.INPUT_TENSOR_NAME)
184 |     labels_tensor = sess.graph.get_tensor_by_name('mymodel/train/labels:0')
185 |     global_step_tensor = sess.graph.get_tensor_by_name(
186 |         'mymodel/train/global_step:0')
187 |     loss_tensor = sess.graph.get_tensor_by_name('mymodel/train/loss_op:0')
188 |     train_op = sess.graph.get_operation_by_name('mymodel/train/train_op')
189 | 
190 |     # The training loop.
191 |     for _ in range(FLAGS.num_batches):
192 |       (features, labels) = _get_examples_batch()
193 |       [num_steps, loss, _] = sess.run(
194 |           [global_step_tensor, loss_tensor, train_op],
195 |           feed_dict={features_tensor: features, labels_tensor: labels})
196 |       print('Step %d: loss %g' % (num_steps, loss))
197 | 
198 | if __name__ == '__main__':
199 |   tf.app.run()
200 | 
--------------------------------------------------------------------------------
/loki/models/volume.py:
--------------------------------------------------------------------------------
1 | """Class and methods for detecting loudest portions of videos"""
2 | import math
3 | import numpy as np
4 | import sklearn.metrics as skmet
5 | 
6 | from .util import sort_scores_and_remove_overlap
7 | 
8 | class VolumeClassifier():
9 |     """A classifier that classifies interesting scenes based on volume
10 | 
11 |     This classifier classes a scene as interesting if its average
12 |     volume is above a learned cutoff; scenes at or below the cutoff
13 |     are classified as uninteresting.
14 | 
15 |     Attributes:
16 |     -----------
17 |     volume_cutoff -- float:
18 |         Scenes with average volume above volume_cutoff are classified as
19 |         interesting. Those less than or equal to volume_cutoff are
20 |         classified as uninteresting.
21 |     """
22 | 
23 |     def __init__(self):
24 |         self.volume_cutoff = 0
25 | 
26 |     def save(self, savefile=None):
27 |         """Save the model's learned parameters
28 | 
29 |         Write out the volume_cutoff to a file. If no file is specified,
30 |         then write into volume_classifier_model.dat in the current
31 |         directory.
32 | 
33 |         Keyword Arguments:
34 |         ------------------
35 |         savefile -- str -- default=None:
36 |             The file name to write the parameters into.
37 |         """
38 |         if savefile is None:
39 |             savefile = "volume_classifier_model.dat"
40 |         f = open(savefile, "w")
41 |         f.write(f"volume_cutoff = {self.volume_cutoff}\n")
42 |         f.close()
43 | 
44 |     def load(self, savefile):
45 |         """Load a previous model's learned parameters
46 | 
47 |         Arguments:
48 |         ----------
49 |         savefile -- str:
50 |             The file name to load the parameters from.
51 |         """
52 |         f = open(savefile, "r")
53 | 
54 |         for line in f:
55 |             #load the parameters one by one
56 |             stuff = line.strip().split()
57 |             if stuff[0] == "volume_cutoff":
58 |                 self.volume_cutoff = float(stuff[2])
59 |         f.close()
60 | 
61 | 
62 |     def train(self, training_x, training_y):
63 |         """Train the volume classifier
64 | 
65 |         Train the volume classifier (currently a binary classifier).
66 |         Default loss function is the hamming loss which, for a binary
67 |         classifier, is one minus the accuracy.
68 |         The training here is simply finding the threshold that maximizes
69 |         the accuracy. It's not technically training but the method is
70 |         named such for consistency.
71 | 
72 |         Arguments
73 |         ---------
74 |         training_x -- list[np.ndarray]:
75 |             The volume (in decibels) of the training data. Of length N.
76 |         training_y -- np.ndarray or list:
77 |             The corresponding classes of the training data. Also of
78 |             length N. A class of 1 is interesting, a class of 0 is
79 |             uninteresting.
80 |         """
81 |         average_loudness = []
82 |         for audioclip in training_x:
83 |             average_loudness.append(audioclip.mean())
84 |         average_loudness = np.array(average_loudness)
85 | 
86 |         best_loss = 1  # hamming loss goes from 0 to 1
87 |         best_cutoff = None
88 | 
89 |         #Candidate cutoffs: midpoints between adjacent unique values
90 |         unique_values = np.unique(average_loudness)
91 |         possible_cutoffs = (unique_values[1:] + unique_values[:-1]) / 2.0
92 |         low_endvalue = np.min(unique_values) - 1.0  #a cutoff below every value
93 |         high_endvalue = np.max(unique_values)
94 | 
95 |         possible_cutoffs = np.append([low_endvalue, high_endvalue], possible_cutoffs)
96 | 
97 |         #check every possible cutoff
98 |         for cutoff in possible_cutoffs:
99 |             predicted_values = np.zeros(len(average_loudness))
100 |             predicted_values[np.where(average_loudness > cutoff)] = 1
101 |             loss = skmet.hamming_loss(training_y, predicted_values)
102 |             if loss < best_loss:
103 |                 best_loss = loss
104 |                 best_cutoff = cutoff
105 | 
106 |         self.volume_cutoff = best_cutoff
107 | 
108 |     def infer(self, test_x, score=False):
109 |         """Make an inference on the test data based on trained model
110 | 
111 |         For instances in test_x where the average volume is greater than
112 |         self.volume_cutoff, class it as 1 for interesting.
113 | 
114 |         Arguments:
115 |         ----------
116 |         test_x -- list[np.ndarray]:
117 |             The volume (in decibels) of the test data.
118 |         score -- bool -- default=False:
119 |             If False, return the classes based on the volume threshold.
120 |             If True, return the non-thresholded average volume of each
121 |             scene.
122 | 
123 |         Return:
124 |         -------
125 |         classified -- np.ndarray:
126 |             The resultant classes for the test data.
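To make the train/infer interface above concrete, here is a minimal usage sketch (not part of this file); the clip paths and labels are placeholders, and `VideoClips` and `compute_decibels` are the loaders exported at the top of the `loki` package:

```python
# Minimal sketch: fit the volume cutoff on labelled clips, then classify new clips.
import numpy as np
import loki

train_clips = loki.VideoClips(["train_good.mp4", "train_bad.mp4"])  # placeholder paths
train_targets = np.array([1, 0])                                    # 1 = interesting, 0 = boring

clf = loki.VolumeClassifier()
clf.train(loki.compute_decibels(train_clips), train_targets)

test_clips = loki.VideoClips(["test.mp4"])                          # placeholder path
print(clf.infer(loki.compute_decibels(test_clips)))                 # array of 0/1 classes
```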
127 |         """
128 |         classified = []
129 |         for audioclip in test_x:
130 |             avg_volume = audioclip.mean()
131 |             if score:
132 |                 classified.append(avg_volume)
133 |             else:
134 |                 if avg_volume > self.volume_cutoff:
135 |                     classified.append(1)
136 |                 else:
137 |                     classified.append(0)
138 | 
139 |         return np.array(classified)
140 | 
141 | 
142 | class VolumeModel():
143 |     """Find the loudest sections in a set of videos
144 | 
145 |     Keyword Arguments:
146 |     ------------------
147 |     search_length -- float -- default=10.0:
148 |         Desired clip size in seconds.
149 | 
150 |     search_increment -- float -- default=1.0:
151 |         Desired shift to apply to search window in seconds.
152 | 
153 |     """
154 | 
155 |     def __init__(self, search_length=10.0, search_increment=1.0):
156 |         self.search_length = search_length
157 |         self.search_increment = search_increment
158 | 
159 |     def predict(self, loudness, freq=44100, n_predict=1):
160 |         """Find the loudest section in the inputted video clips
161 | 
162 |         Take the input loudness generated from a video and search over
163 |         every volume array to determine the clips with the overall
164 |         loudest moments. Return the video index and time index
165 |         corresponding to the overall loudest portion.
166 | 
167 |         Note, the inputted loudness arrays are not expected to be the
168 |         same length, since video lengths vary a great deal. As long as
169 |         every video is at least as long as self.search_length, the
170 |         outputted loudest segments will all have that standard length.
171 | 
172 |         Arguments:
173 |         ----------
174 |         loudness -- list[np.ndarray(float)]:
175 |             List of the total volume over time of multiple video clips.
176 | 
177 |         Keyword Arguments:
178 |         ------------------
179 |         freq -- int -- default=44100
180 |             Frequency in Hz to extract the audio over.
181 |         n_predict -- int -- default=1
182 |             Return the top n_predict non-overlapping scenes.
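A short sketch of how this search is typically driven (the clip path is a placeholder; `compute_decibels` supplies the per-clip loudness arrays):

```python
# Sketch: return the three loudest non-overlapping ~10-second windows.
import loki

clips = loki.VideoClips(["test.mp4"])      # placeholder path
loudness = loki.compute_decibels(clips)    # list of per-clip decibel arrays

model = loki.VolumeModel(search_length=10.0, search_increment=1.0)
scores, scenes = model.predict(loudness, freq=44100, n_predict=3)
# each row of scenes is [clip index, start time (s), end time (s)]
print(scores, scenes)
```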
183 |         """
184 | 
185 |         #Define search windows in array index lengths
186 |         search_window = math.floor(self.search_length * freq)
187 |         search_jump = math.floor(self.search_increment * freq)
188 | 
189 |         #store the loudest section and increment
190 |         all_loudness_scores = np.zeros(0)
191 |         all_scenes = np.zeros((0,3))
192 | 
193 |         #check each audio clip
194 |         for audio_idx, audioclip in enumerate(loudness):
195 |             #check if clip is longer than search window
196 |             if len(audioclip) <= search_window:
197 |                 #if the clip is shorter than the window, use the whole clip
198 |                 avg_loudness = np.sum(audioclip) / float(len(audioclip))
199 |                 clip_increment = np.array([audio_idx, 0, len(audioclip)/freq]).reshape((1,3))
200 |                 #append to lists
201 |                 all_loudness_scores = np.append(all_loudness_scores, avg_loudness)
202 |                 all_scenes = np.append(all_scenes, clip_increment, axis=0)
203 |             else:
204 |                 #otherwise, slide the search window across the clip
205 |                 start_indices = range(0, len(audioclip) - search_window, search_jump)
206 |                 #Increment over every window
207 |                 for start_idx in start_indices:
208 |                     end_idx = start_idx + search_window
209 |                     avg_loudness = np.sum(audioclip[start_idx:end_idx]) / float(search_window)
210 |                     clip_increment = np.array([audio_idx, start_idx/freq, end_idx/freq]).reshape((1,3))
211 |                     #append to lists
212 |                     all_loudness_scores = np.append(all_loudness_scores, avg_loudness)
213 |                     all_scenes = np.append(all_scenes, clip_increment, axis=0)
214 | 
215 |         #return the top scores
216 |         top_scores, top_scenes = sort_scores_and_remove_overlap(n_predict, all_loudness_scores, all_scenes)
217 | 
218 |         return top_scores, top_scenes
219 | 
--------------------------------------------------------------------------------
/loki/functions/helper.py:
--------------------------------------------------------------------------------
1 | """Contains helper functions for loading/training/evaluation"""
2 | import os
3 | import numpy as np
4 | 
5 | from .. import processing
6 | from .. import models
7 | from . import evaluation
8 | 
9 | def load_clips_from_dir(target_dir=None):
10 |     """Make a VideoClips object with all files in a dir
11 | 
12 |     Keyword Arguments:
13 |     ------------------
14 |     target_dir -- str -- default=None:
15 |         The target dir to load the files from.
16 | 
17 |     Return:
18 |     -------
19 |     clips -- loki.VideoClips:
20 |         A VideoClips loader with all the files in target_dir.
21 |     """
22 |     cwd = os.getcwd()
23 |     if target_dir is None:
24 |         #default load from current directory
25 |         target_dir = cwd
26 |     else:
27 |         #make sure you use the full path
28 |         os.chdir(target_dir)
29 |         target_dir = os.getcwd()
30 |         os.chdir(cwd)
31 | 
32 |     #grab every file and alphabetize
33 |     all_files = os.listdir(target_dir)
34 |     all_files.sort()
35 | 
36 |     #append the fullpath to every file
37 |     fullpath_files = []
38 |     for fil in all_files:
39 |         fullpath_files.append(f"{target_dir}/{fil}")
40 | 
41 |     clips = processing.VideoClips(fullpath_files)
42 | 
43 |     return clips
44 | 
45 | def average_over_window(data, n_average):
46 |     """Compute a sliding window average over data
47 | 
48 |     Given a window size in indices, compute the average value of data
49 |     over that window sliding by one index.
50 | 
51 |     Arguments:
52 |     ----------
53 |     data -- np.ndarray:
54 |         1-D array of length N to compute averages over.
55 |     n_average -- int:
56 |         Size of the window.
57 | 
58 |     Return:
59 |     -------
60 |     new_data -- np.ndarray:
61 |         1-D array of length N - n_average + 1. Each index represents the
62 |         average over n_average consecutive elements.
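The windowing described above can be cross-checked against NumPy's convolution; the snippet below is only an illustration of the expected output, not part of the helper module:

```python
# Equivalent windowed average via convolution (illustrative cross-check only).
import numpy as np

data = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
n_average = 2
print(np.convolve(data, np.ones(n_average) / n_average, mode="valid"))  # [0.5 1.5 2.5 3.5]
```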
63 | 
64 |     Example:
65 |     --------
66 |     Given data = [0, 1, 2, 3, 4]
67 |     n_average = 2
68 |     Then the average trace over a window of 2 is:
69 |     [0.5, 1.5, 2.5, 3.5]
70 |     """
71 | 
72 |     new_data = np.array(data[:len(data) - n_average + 1], dtype=float)
73 |     for i in range(1, n_average):
74 |         new_data += data[i:len(data) - n_average + 1 + i]
75 |     new_data /= n_average
76 | 
77 |     return new_data
78 | 
79 | def find_best_clip(video_files, clip_length, nn_checkpoint="nn_model"):
80 |     """Find the best clip in a set of videos of specified duration
81 | 
82 |     Search over every video_file and compute a windowed average of the
83 |     interest level every second. The clip section with the largest
84 |     average interest is then returned, as well as the original
85 |     non-averaged trace of interest level for each inputted video file.
86 | 
87 |     Arguments:
88 |     ----------
89 |     video_files -- list[str]:
90 |         List of N video files to calculate interest levels for.
91 |     clip_length -- float:
92 |         Length of the desired highlight clip in seconds.
93 | 
94 |     Keyword Arguments:
95 |     ------------------
96 |     nn_checkpoint -- str -- default='nn_model':
97 |         Location of the loki.NeuralNetworkClassifier checkpoint file.
98 | 
99 |     Return:
100 |     -------
101 |     results -- dict:
102 |         A dictionary containing the best_clip, x_trace, and y_trace
103 |         entries described below.
104 | 
105 | 
106 |         best_clip -- list:
107 |             Contains the best clip section. The first element is the
108 |             video file containing the best clip. The second and third
109 |             element is the start and stop time respectively.
110 |         x_trace -- list[np.ndarray]:
111 |             List of N arrays giving the times for the center of each
112 |             averaging window.
113 |         y_trace -- list[np.ndarray]:
114 |             List of N arrays giving the average interest level over each
115 |             window.
116 |     """
117 |     #0.96 is the length of time VGGish processes as a single embedding
118 |     clip_size = int(np.ceil(clip_length / 0.96))
119 |     nnclass = models.NeuralNetworkClassifier()
120 |     nnclass.load(nn_checkpoint)
121 |     vclips = processing.VideoClips(video_files)
122 |     big_audio = vclips.compute_audio_waveform()
123 | 
124 |     x_trace, y_trace = nnclass.get_trace(big_audio)
125 | 
126 |     #save the average interest level over each clip in a windowed avg
127 |     x_avg = []
128 |     y_avg = []
129 |     for x,y in zip(x_trace,y_trace):
130 |         x_avg.append(average_over_window(x, clip_size))
131 |         y_avg.append(average_over_window(y, clip_size))
132 | 
133 |     #find the most interesting segment
134 |     best_interest = 0
135 |     best_time = 0
136 |     best_file = None
137 |     for i,(x,y) in enumerate(zip(x_avg, y_avg)):
138 |         if np.max(y) > best_interest:
139 |             #found a better clip
140 |             best_interest = np.max(y)
141 |             #find time of peak interest. If multiple, keep only first
142 |             best_time = x[np.where(y == best_interest)][0]
143 |             best_file = video_files[i]
144 | 
145 |     half_clip = clip_length * 0.5
146 |     best_clip = [best_file, best_time - half_clip, best_time + half_clip]
147 | 
148 |     return {"best_clip":best_clip, "x_trace":x_trace, "y_trace":y_trace}
149 | 
150 | def train_classifier(train_clips, train_targets, test_clips=None, test_targets=None, classifier="nn", n_epochs=100, batch_size=None, class_weights=None):
151 |     """Get a trained classifier for audio data
152 | 
153 |     Return a trained classifier. If test_clips and test_targets are
154 |     given, then also compute and print out validation statistics
155 |     consisting of a confusion matrix, accuracy, precision and recall.
156 |     Default is to train a NeuralNetworkClassifier.
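Taken together, the helpers above form a small pipeline. The following is a hedged end-to-end sketch: the directory, file names, and labels are placeholders, and the 'nn' path also expects the SOUNDEMBEDDINGS environment variable to point at the VGGish checkpoint, as used in loki/models/neural_networks.py.

```python
# Illustrative pipeline: load clips, train a classifier, pull the best highlight.
import numpy as np
from loki.functions import helper

train_clips = helper.load_clips_from_dir("training_videos")   # placeholder directory
train_targets = np.array([1, 0, 1, 0])                        # placeholder: one label per clip

clf = helper.train_classifier(train_clips, train_targets, classifier="nn",
                              n_epochs=100, batch_size=32)
clf.save("nn_model")

result = helper.find_best_clip(["test.mp4"], clip_length=10.0, nn_checkpoint="nn_model")
print(result["best_clip"])   # [video file, start time (s), end time (s)]
```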
157 | 
158 |     Arguments:
159 |     ----------
160 |     train_clips -- loki.VideoClips:
161 |         The loaded video clips to use for training.
162 |     train_targets -- np.ndarray:
163 |         An array with the same number of elements as train_clips
164 |         classifying each clip as either interesting (1) or boring (0).
165 | 
166 |     Keyword Arguments:
167 |     ------------------
168 |     test_clips -- loki.VideoClips -- default=None:
169 |         The loaded video clips used for validation.
170 |     test_targets -- np.ndarray -- default=None:
171 |         An array with the same number of elements as test_clips
172 |         classifying each clip as either interesting (1) or boring (0).
173 |     classifier -- str -- default='nn':
174 |         Type of classifier to train. `nn` returns a
175 |         loki.NeuralNetworkClassifier while 'volume' returns a
176 |         loki.VolumeClassifier.
177 |     n_epochs -- int -- default=100:
178 |         Number of training epochs to run. Only used for classifier=`nn`.
179 |     batch_size -- int -- default=all:
180 |         Batch size of each training epoch. Default is all training
181 |         data at each epoch. Only used for classifier=`nn`.
182 |     class_weights -- np.ndarray -- default=None:
183 |         Relative weight of each class. This weight affects the
184 |         probability of picking each class when selecting the batch. Only
185 |         used for classifier=`nn`.
186 | 
187 |     Return:
188 |     -------
189 |     clf -- loki.NeuralNetworkClassifier or loki.VolumeClassifier:
190 |         A trained classifier.
191 |     """
192 |     if classifier == "volume":
193 |         clf = _train_volume_classifier(train_clips, train_targets)
194 |     elif classifier == "nn":
195 |         clf = _train_nn_classifier(train_clips, train_targets, n_epochs=n_epochs, class_weights=class_weights, batch_size=batch_size)
196 |     else:
197 |         raise ValueError("Invalid classifier specified. Keyword argument classifier must be either 'volume' or 'nn'.")
198 | 
199 |     if test_clips is not None and test_targets is not None:
200 |         #compute validation if test_clips is given
201 |         if classifier == "volume":
202 |             results = clf.infer(processing.compute_decibels(test_clips))
203 |         elif classifier == "nn":
204 |             results = clf.infer(test_clips.compute_audio_waveform(mono=True))
205 |         else:
206 |             raise ValueError("Invalid classifier specified. Keyword argument classifier must be either 'volume' or 'nn'.")
207 |         evaluation.print_confusion_matrix(test_targets, results)
208 | 
209 |     return clf
210 | 
211 | def _train_volume_classifier(train_clips, train_targets):
212 |     """Get a trained volume classifier
213 | 
214 |     Arguments:
215 |     ----------
216 |     See loki.functions.helper.train_classifier().
217 | 
218 |     Return:
219 |     -------
220 |     vclassifier -- loki.VolumeClassifier:
221 |         A trained volume classifier.
222 | 
223 |     """
224 |     vclassifier = models.VolumeClassifier()
225 | 
226 |     #extract the audio data
227 |     audio_data = processing.compute_decibels(train_clips)
228 |     #train the volume classifier
229 |     vclassifier.train(audio_data, train_targets)
230 | 
231 |     return vclassifier
232 | 
233 | def _train_nn_classifier(train_clips, train_targets, n_epochs=100, batch_size=None, class_weights=None):
234 |     """Get a trained neural network classifier
235 | 
236 |     Arguments:
237 |     ----------
238 |     See loki.functions.helper.train_classifier().
239 | 
240 |     Return:
241 |     -------
242 |     nclassifier -- loki.NeuralNetworkClassifier:
243 |         A trained neural network classifier.
244 |     """
245 |     nclassifier = models.NeuralNetworkClassifier()
246 | 
247 |     raw_audio = train_clips.compute_audio_waveform(mono=True)
248 |     nclassifier.train(raw_audio, train_targets, n_epochs=n_epochs, class_weights=class_weights, batch_size=batch_size)
249 | 
250 |     return nclassifier
251 | 
--------------------------------------------------------------------------------
/loki/models/neural_networks.py:
--------------------------------------------------------------------------------
1 | """This package contains methods for using neural networks"""
2 | import numpy as np
3 | import os
4 | import tensorflow as tf
5 | from .vggish_tensorflow import CreateVGGishNetwork, EmbeddingsFromVGGish
6 | import torch
7 | from torch.autograd import Variable
8 | import torch.optim as optim
9 | import torch.nn as nn
10 | import torch.nn.functional as F
11 | 
12 | def get_embeddings(x_list, sr):
13 |     """Get the sound embeddings from vgg-ish
14 | 
15 |     Use the pre-trained vggish network from TensorFlow in order to
16 |     extract embeddings from an audio clip. The vggish network does all
17 |     the preprocessing necessary on a raw audio input.
18 | 
19 |     Each x can be of arbitrary length, but the VGGish network was
20 |     trained on 0.96 second clips. As a result, the dimensions of each
21 |     output are going to be M x 128, where M = floor(time(x) / 0.96),
22 |     i.e. a 10 second clip produces a 10 x 128 output.
23 | 
24 |     Arguments:
25 |     ----------
26 |     x_list -- list[numpy.ndarray]:
27 |         List of traces of the sound wave (mono-channel)
28 |     sr -- int:
29 |         The sampling rate for the audio clip in Hz.
30 |     """
31 |     checkpoint_path = os.environ["SOUNDEMBEDDINGS"]
32 | 
33 |     all_embeddings = []
34 |     tf.reset_default_graph()
35 |     sess = tf.Session()
36 | 
37 |     vgg = CreateVGGishNetwork(sess, checkpoint_path)
38 | 
39 |     for x in x_list:
40 |         resdict = EmbeddingsFromVGGish(sess, vgg, x, sr)
41 |         all_embeddings.append(resdict['embedding'])
42 | 
43 |     sess.close()
44 | 
45 |     return all_embeddings
46 | 
47 | class SimpleNetwork(nn.Module):
48 |     """A pytorch implementation of a final classification layer
49 | 
50 |     This is a simple model where a single linear unit is added after the
51 |     embeddings layer from tensorflow. A sigmoid follows to infer the
52 |     binary class. The embeddings from VGGish are a 128-Dimensional
53 |     vector.
54 |     """
55 |     def __init__(self):
56 |         super(SimpleNetwork, self).__init__()
57 |         self.fc1 = nn.Linear(128,1)
58 |         self.fc2 = nn.Sigmoid()
59 | 
60 |     def forward(self, x):
61 |         y = self.fc1(x)
62 |         y = self.fc2(y)
63 |         return y
64 | 
65 | def stack_embeddings_and_targets(embeddings, targets=None):
66 |     """Stack multiple embeddings along the zeroth axis
67 | 
68 |     Arguments:
69 |     ----------
70 |     embeddings -- list(np.array):
71 |         A list of length L, with n_l x 128 dimensional embeddings.
72 | 
73 |     Keyword Arguments:
74 |     ------------------
75 |     targets -- np.ndarray -- default=None:
76 |         An array of target values. If given, each target is repeated
77 |         n_l times so it stays aligned with the stacked embeddings.
78 | 
79 |     Return:
80 |     -------
81 |     x -- np.ndarray:
82 |         A Nx128 dimensional array where N = SUM_l(n_l)
83 |     y -- np.ndarray:
84 |         A N-length array representing the stacked targets.
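A tiny worked example of the stacking described above, with assumed shapes (two clips of 3 and 5 one-second frames); it is illustrative only and not part of the module:

```python
# Two embeddings of shape (3, 128) and (5, 128) stack into (8, 128).
import numpy as np
from loki.models.neural_networks import stack_embeddings_and_targets

embeddings = [np.zeros((3, 128)), np.ones((5, 128))]
targets = np.array([0, 1])

x, y = stack_embeddings_and_targets(embeddings, targets=targets)
print(x.shape)   # (8, 128)
print(y)         # [0 0 0 1 1 1 1 1]
```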
85 |     """
86 |     x = np.zeros((0,128))
87 |     y = []
88 |     if targets is None:
89 |         targets = np.zeros(len(embeddings))
90 |     for tar,embed in zip(targets, embeddings):
91 |         n_frames = np.shape(embed)[0]
92 |         x = np.append(x, embed, axis=0)
93 |         for i in range(n_frames):
94 |             y.append(tar)
95 |     y = np.array(y)
96 | 
97 |     return x,y
98 | 
99 | class NeuralNetworkClassifier():
100 |     """Initialize a NN for learning on sound embeddings
101 | 
102 |     When training on clips longer than one second, their embeddings are
103 |     stacked such that you have an N x 128 dimensional array, where
104 |     N = SUM_i(clip_time_i)
105 |     for the clip times in seconds. It then trains a single 128-input
106 |     linear unit to classify a scene as interesting or not interesting.
107 | 
108 |     Keyword Arguments (for save()):
109 |     -------------------------------
110 |     save_dir -- str -- default='./nn_model':
111 |         File path to save the model's learned parameters into.
112 |     """
113 | 
114 |     def __init__(self):
115 |         self.model = SimpleNetwork()
116 | 
117 |     def save(self, save_dir="./nn_model"):
118 |         """Save the pytorch model"""
119 |         torch.save(self.model.state_dict(), save_dir)
120 | 
121 |     def load(self, target):
122 |         """Load the pytorch model"""
123 |         self.model.load_state_dict(torch.load(target))
124 | 
125 |     def train(self, training_x, training_y, n_epochs=100, batch_size=None, class_weights=None):
126 |         """Train the neural network
127 | 
128 |         Training is done on a per-second basis, not on whole clips.
129 |         Thus, clips are broken up into their constituent seconds in this
130 |         method. The batch_size is then effectively the number of seconds
131 |         of audio data to train on in each cycle. i.e. 10 clips of 10
132 |         seconds each with a batch_size=20 means you train on 20% of the
133 |         training_data in each epoch.
134 | 
135 |         Arguments:
136 |         ----------
137 |         training_x -- list(np.ndarray):
138 |             List of raw mono-audio traces sampled at 44.1kHz.
139 |         training_y -- np.ndarray:
140 |             Corresponding array of target classes for each audio pattern.
141 | 
142 |         Keyword Arguments:
143 |         ------------------
144 |         n_epochs -- int -- default=100:
145 |             Number of training epochs to run.
146 |         batch_size -- int -- default=all:
147 |             Batch size of each training epoch. Default is all training
148 |             data at each epoch.
149 |         class_weights -- np.ndarray -- default=None:
150 |             Relative weight of each class. This weight affects the
151 |             probability of picking each class when selecting the batch.
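For example, a call that oversamples the rarer interesting class when drawing each batch might look like the sketch below; the waveforms and labels are placeholders, and get_embeddings requires the SOUNDEMBEDDINGS environment variable to point at the VGGish checkpoint.

```python
# Illustrative call: weight class 1 three times as heavily when sampling batches.
import numpy as np
from loki.models import NeuralNetworkClassifier

sr = 44100
training_x = [np.random.uniform(-1, 1, 5 * sr) for _ in range(4)]   # placeholder waveforms
training_y = np.array([0, 0, 0, 1])

clf = NeuralNetworkClassifier()
clf.train(training_x, training_y, n_epochs=200, batch_size=8,
          class_weights=np.array([1.0, 3.0]))
```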
152 |         """
153 |         x_train, y_train = stack_embeddings_and_targets(get_embeddings(training_x, 44100), targets=training_y)
154 | 
155 |         if batch_size is None: #set default batch-size
156 |             batch_size = len(x_train)
157 |         if class_weights is None: #set default class_weights
158 |             class_weights = np.ones(2)
159 | 
160 |         #pmatrix is the probability of selecting each training example
161 |         #pmatrix is based on the class_weights
162 |         pmatrix = np.zeros(len(y_train))
163 |         pmatrix[np.where(y_train == 0)] = class_weights[0]
164 |         pmatrix[np.where(y_train == 1)] = class_weights[1]
165 |         pmatrix /= np.sum(pmatrix)
166 | 
167 |         #all_indices is used for np.random.choice later
168 |         all_indices = np.arange(len(x_train)).astype(int)
169 | 
170 |         #set pytorch optimizer and criterion
171 |         optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.5)
172 |         criterion = nn.MSELoss()
173 |         #Begin the training epochs
174 |         for epoch in range(n_epochs):
175 |             #select random training indices for each batch
176 |             random_indices = np.random.choice(all_indices, size=batch_size, replace=False, p=pmatrix)
177 |             total_loss = 0
178 |             #perform the pytorch training
179 |             for i in random_indices:
180 |                 X = Variable(torch.FloatTensor([x_train[i]]), requires_grad=True)
181 |                 Y = Variable(torch.FloatTensor([y_train[i]]))
182 |                 optimizer.zero_grad()
183 |                 outputs = self.model(X)
184 |                 loss = criterion(outputs, Y)
185 |                 total_loss += loss.item()
186 |                 loss.backward()
187 |                 optimizer.step()
188 | 
189 |             #print out the total loss every 10 epochs
190 |             if epoch % 10 == 0:
191 |                 print(f"Epoch {epoch} Loss: {total_loss}")
192 | 
193 |     def infer(self, test_x, threshold=0.5):
194 |         """Infer the classes on inputted audio waveforms
195 | 
196 |         A clip is interesting if the average interest level over the
197 |         whole clip is greater than the threshold (default 0.5).
198 | 
199 |         Arguments:
200 |         ----------
201 |         test_x -- list[np.ndarray]:
202 |             List of raw audio (mono-channel) waveforms.
203 |         threshold -- float -- default=0.5:
204 |             Threshold value for classifying into either class 1 or 0.
205 |             If None, then return the raw non-thresholded scores.
206 | 
207 |         Return:
208 |         -------
209 |         inferred -- np.ndarray:
210 |             Return the inferred classes.
211 |         """
212 |         embeddings_x = get_embeddings(test_x, 44100)
213 |         inferred = []
214 |         for x in embeddings_x:
215 |             y = self.model(torch.FloatTensor(x))
216 |             y_array = y.detach().numpy()
217 |             avg = y_array.mean()
218 |             if threshold is None:
219 |                 inferred.append(avg)
220 |             else:
221 |                 if avg > threshold: #threshold defaults to 0.5
222 |                     inferred.append(1)
223 |                 else:
224 |                     inferred.append(0)
225 | 
226 |         return np.array(inferred)
227 | 
228 |     def get_trace(self, test_x):
229 |         """Get a trace of the interest level every 0.96 seconds
230 | 
231 |         Inputted audio waveforms are binned to every 0.96 seconds and
232 |         then the interest level is inferred for each bin.
233 | 
234 |         Arguments:
235 |         ----------
236 |         test_x -- list[np.ndarray]:
237 |             List of N raw audio (mono-channel) waveforms.
238 | 
239 |         Return:
240 |         -------
241 |         x_traces -- list[np.ndarray]:
242 |             List of N arrays giving the time at the center of every
243 |             0.96s long bin that the interest score was inferred over.
244 |         traces -- list[np.ndarray]:
245 |             List of N arrays giving the interest level of the
246 |             corresponding time bin.
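As an illustration, the per-bin trace can be used directly to locate the moment of peak interest in a single waveform; the checkpoint path and waveform below are placeholders, and get_embeddings again expects the SOUNDEMBEDDINGS environment variable to be set.

```python
# Illustrative use of get_trace: find the 0.96 s bin with the highest interest.
import numpy as np
from loki.models import NeuralNetworkClassifier

clf = NeuralNetworkClassifier()
clf.load("nn_model")                               # placeholder checkpoint path
waveform = np.random.uniform(-1, 1, 10 * 44100)    # placeholder 10-second mono clip

x_traces, traces = clf.get_trace([waveform])
peak = int(np.argmax(traces[0]))
print(f"peak interest {traces[0][peak]:.2f} at t = {x_traces[0][peak]:.2f} s")
```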
247 | """ 248 | embeddings_x = get_embeddings(test_x, 44100) 249 | traces = [] 250 | x_traces = [] 251 | for x in embeddings_x: 252 | #perform the inference over the whole audio waveform at once 253 | y = self.model(torch.FloatTensor(x)) 254 | y_array = y.detach().numpy() 255 | traces.append(y_array[:,0]) 256 | 257 | #output the time stamps of each data point 258 | #time stamp is given at the center of each "bin" 259 | max_time = len(y_array) * 0.96 260 | time_stamps = np.arange(0.48, max_time, 0.96) 261 | x_traces.append(time_stamps) 262 | 263 | return x_traces, traces 264 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/mel_features.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | """Defines routines to compute mel spectrogram features from audio waveform.""" 23 | 24 | import numpy as np 25 | 26 | 27 | def frame(data, window_length, hop_length): 28 | """Convert array into a sequence of successive possibly overlapping frames. 29 | 30 | An n-dimensional array of shape (num_samples, ...) is converted into an 31 | (n+1)-D array of shape (num_frames, window_length, ...), where each frame 32 | starts hop_length points after the preceding one. 33 | 34 | This is accomplished using stride_tricks, so the original data is not 35 | copied. However, there is no zero-padding, so any incomplete frames at the 36 | end are not included. 37 | 38 | Args: 39 | data: np.array of dimension N >= 1. 40 | window_length: Number of samples in each frame. 41 | hop_length: Advance (in samples) between each window. 42 | 43 | Returns: 44 | (N+1)-D np.array with as many rows as there are complete frames that can be 45 | extracted. 46 | """ 47 | num_samples = data.shape[0] 48 | num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length)) 49 | shape = (num_frames, window_length) + data.shape[1:] 50 | strides = (data.strides[0] * hop_length,) + data.strides 51 | return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides) 52 | 53 | 54 | def periodic_hann(window_length): 55 | """Calculate a "periodic" Hann window. 56 | 57 | The classic Hann window is defined as a raised cosine that starts and 58 | ends on zero, and where every value appears twice, except the middle 59 | point for an odd-length window. Matlab calls this a "symmetric" window 60 | and np.hanning() returns it. 
However, for Fourier analysis, this 61 | actually represents just over one cycle of a period N-1 cosine, and 62 | thus is not compactly expressed on a length-N Fourier basis. Instead, 63 | it's better to use a raised cosine that ends just before the final 64 | zero value - i.e. a complete cycle of a period-N cosine. Matlab 65 | calls this a "periodic" window. This routine calculates it. 66 | 67 | Args: 68 | window_length: The number of points in the returned window. 69 | 70 | Returns: 71 | A 1D np.array containing the periodic hann window. 72 | """ 73 | return 0.5 - (0.5 * np.cos(2 * np.pi / window_length * 74 | np.arange(window_length))) 75 | 76 | 77 | def stft_magnitude(signal, fft_length, 78 | hop_length=None, 79 | window_length=None): 80 | """Calculate the short-time Fourier transform magnitude. 81 | 82 | Args: 83 | signal: 1D np.array of the input time-domain signal. 84 | fft_length: Size of the FFT to apply. 85 | hop_length: Advance (in samples) between each frame passed to FFT. 86 | window_length: Length of each block of samples to pass to FFT. 87 | 88 | Returns: 89 | 2D np.array where each row contains the magnitudes of the fft_length/2+1 90 | unique values of the FFT for the corresponding frame of input samples. 91 | """ 92 | frames = frame(signal, window_length, hop_length) 93 | # Apply frame window to each frame. We use a periodic Hann (cosine of period 94 | # window_length) instead of the symmetric Hann of np.hanning (period 95 | # window_length-1). 96 | window = periodic_hann(window_length) 97 | windowed_frames = frames * window 98 | return np.abs(np.fft.rfft(windowed_frames, int(fft_length))) 99 | 100 | 101 | # Mel spectrum constants and functions. 102 | _MEL_BREAK_FREQUENCY_HERTZ = 700.0 103 | _MEL_HIGH_FREQUENCY_Q = 1127.0 104 | 105 | 106 | def hertz_to_mel(frequencies_hertz): 107 | """Convert frequencies to mel scale using HTK formula. 108 | 109 | Args: 110 | frequencies_hertz: Scalar or np.array of frequencies in hertz. 111 | 112 | Returns: 113 | Object of same size as frequencies_hertz containing corresponding values 114 | on the mel scale. 115 | """ 116 | return _MEL_HIGH_FREQUENCY_Q * np.log( 117 | 1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ)) 118 | 119 | 120 | def spectrogram_to_mel_matrix(num_mel_bins=20, 121 | num_spectrogram_bins=129, 122 | audio_sample_rate=8000, 123 | lower_edge_hertz=125.0, 124 | upper_edge_hertz=3800.0): 125 | """Return a matrix that can post-multiply spectrogram rows to make mel. 126 | 127 | Returns a np.array matrix A that can be used to post-multiply a matrix S of 128 | spectrogram values (STFT magnitudes) arranged as frames x bins to generate a 129 | "mel spectrogram" M of frames x num_mel_bins. M = S A. 130 | 131 | The classic HTK algorithm exploits the complementarity of adjacent mel bands 132 | to multiply each FFT bin by only one mel weight, then add it, with positive 133 | and negative signs, to the two adjacent mel bands to which that bin 134 | contributes. Here, by expressing this operation as a matrix multiply, we go 135 | from num_fft multiplies per frame (plus around 2*num_fft adds) to around 136 | num_fft^2 multiplies and adds. However, because these are all presumably 137 | accomplished in a single call to np.dot(), it's not clear which approach is 138 | faster in Python. The matrix multiplication has the attraction of being more 139 | general and flexible, and much easier to read. 140 | 141 | Args: 142 | num_mel_bins: How many bands in the resulting mel spectrum. 
This is 143 | the number of columns in the output matrix. 144 | num_spectrogram_bins: How many bins there are in the source spectrogram 145 | data, which is understood to be fft_size/2 + 1, i.e. the spectrogram 146 | only contains the nonredundant FFT bins. 147 | audio_sample_rate: Samples per second of the audio at the input to the 148 | spectrogram. We need this to figure out the actual frequencies for 149 | each spectrogram bin, which dictates how they are mapped into mel. 150 | lower_edge_hertz: Lower bound on the frequencies to be included in the mel 151 | spectrum. This corresponds to the lower edge of the lowest triangular 152 | band. 153 | upper_edge_hertz: The desired top edge of the highest frequency band. 154 | 155 | Returns: 156 | An np.array with shape (num_spectrogram_bins, num_mel_bins). 157 | 158 | Raises: 159 | ValueError: if frequency edges are incorrectly ordered or out of range. 160 | """ 161 | nyquist_hertz = audio_sample_rate / 2. 162 | if lower_edge_hertz < 0.0: 163 | raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz) 164 | if lower_edge_hertz >= upper_edge_hertz: 165 | raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" % 166 | (lower_edge_hertz, upper_edge_hertz)) 167 | if upper_edge_hertz > nyquist_hertz: 168 | raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" % 169 | (upper_edge_hertz, nyquist_hertz)) 170 | spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins) 171 | spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz) 172 | # The i'th mel band (starting from i=1) has center frequency 173 | # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge 174 | # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in 175 | # the band_edges_mel arrays. 176 | band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz), 177 | hertz_to_mel(upper_edge_hertz), num_mel_bins + 2) 178 | # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins 179 | # of spectrogram values. 180 | mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins)) 181 | for i in range(num_mel_bins): 182 | lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3] 183 | # Calculate lower and upper slopes for every spectrogram bin. 184 | # Line segments are linear in the *mel* domain, not hertz. 185 | lower_slope = ((spectrogram_bins_mel - lower_edge_mel) / 186 | (center_mel - lower_edge_mel)) 187 | upper_slope = ((upper_edge_mel - spectrogram_bins_mel) / 188 | (upper_edge_mel - center_mel)) 189 | # .. then intersect them with each other and zero. 190 | mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope, 191 | upper_slope)) 192 | # HTK excludes the spectrogram DC bin; make sure it always gets a zero 193 | # coefficient. 194 | mel_weights_matrix[0, :] = 0.0 195 | return mel_weights_matrix 196 | 197 | 198 | def log_mel_spectrogram(data, 199 | audio_sample_rate=8000, 200 | log_offset=0.0, 201 | window_length_secs=0.025, 202 | hop_length_secs=0.010, 203 | **kwargs): 204 | """Convert waveform to a log magnitude mel-frequency spectrogram. 205 | 206 | Args: 207 | data: 1D np.array of waveform data. 208 | audio_sample_rate: The sampling rate of data. 209 | log_offset: Add this to values when taking log to avoid -Infs. 210 | window_length_secs: Duration of each window to analyze. 211 | hop_length_secs: Advance between successive analysis windows. 212 | **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix. 
213 | 214 | Returns: 215 | 2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank 216 | magnitudes for successive frames. 217 | """ 218 | window_length_samples = int(round(audio_sample_rate * window_length_secs)) 219 | hop_length_samples = int(round(audio_sample_rate * hop_length_secs)) 220 | fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0))) 221 | spectrogram = stft_magnitude( 222 | data, 223 | fft_length=fft_length, 224 | hop_length=hop_length_samples, 225 | window_length=window_length_samples) 226 | mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix( 227 | num_spectrogram_bins=spectrogram.shape[1], 228 | audio_sample_rate=audio_sample_rate, **kwargs)) 229 | return np.log(mel_spectrogram + log_offset) 230 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------
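Usage note (editorial sketch, not a file in the repository): the tail of `loki/models/vggish_tensorflow/mel_features.py` shown above turns an STFT magnitude spectrogram into log mel filterbank features — frame the waveform, take STFT magnitudes, project onto a mel filterbank via `spectrogram_to_mel_matrix`, then take the log with a small offset. The snippet below shows one plausible way to call that routine on a synthetic waveform. It assumes the upstream VGGish name `log_mel_spectrogram` for the function whose body is shown (the `def` line is not reproduced in this dump), assumes the keyword names `num_mel_bands`, `lower_edge_hertz`, and `upper_edge_hertz` forwarded through `**kwargs` to `spectrogram_to_mel_matrix`, and uses typical VGGish-style parameter values; treat it as an illustration under those assumptions rather than project code.

# Editorial sketch -- not part of the repository. Assumes the upstream VGGish
# function name `log_mel_spectrogram` and its `spectrogram_to_mel_matrix`
# keyword names, and that the loki package (plus the TensorFlow dependency its
# package __init__ pulls in) is importable.
import numpy as np

from loki.models.vggish_tensorflow import mel_features

sample_rate = 16000                              # 16 kHz mono audio, VGGish-style
t = np.arange(sample_rate) / sample_rate         # one second of sample times
waveform = 0.5 * np.sin(2 * np.pi * 440.0 * t)   # synthetic 440 Hz tone

log_mel = mel_features.log_mel_spectrogram(
    waveform,
    audio_sample_rate=sample_rate,
    log_offset=0.01,             # added before the log to avoid log(0) on silent frames
    window_length_secs=0.025,    # 25 ms analysis window
    hop_length_secs=0.010,       # 10 ms hop between successive frames
    num_mel_bands=64,            # forwarded via **kwargs to spectrogram_to_mel_matrix
    lower_edge_hertz=125.0,
    upper_edge_hertz=7500.0,
)

# Result is a 2D array of shape (num_frames, num_mel_bands), matching the
# docstring of the function shown above.
print(log_mel.shape)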