├── example
│   ├── test.mp4
│   ├── train_bad.mp4
│   ├── train_good.mp4
│   └── example_script.py
├── loki
│   ├── functions
│   │   ├── __init__.py
│   │   ├── evaluation.py
│   │   └── helper.py
│   ├── models
│   │   ├── vggish_tensorflow
│   │   │   ├── __init__.py
│   │   │   ├── credits.md
│   │   │   ├── vggish_params.py
│   │   │   ├── wrappers.py
│   │   │   ├── vggish_input.py
│   │   │   ├── vggish_smoke_test.py
│   │   │   ├── vggish_postprocess.py
│   │   │   ├── vggish_slim.py
│   │   │   ├── vggish_inference_demo.py
│   │   │   ├── README.md
│   │   │   ├── vggish_train_demo.py
│   │   │   └── mel_features.py
│   │   ├── __init__.py
│   │   ├── util.py
│   │   ├── volume.py
│   │   └── neural_networks.py
│   ├── processing
│   │   ├── __init__.py
│   │   ├── features.py
│   │   └── load.py
│   └── __init__.py
├── .gitignore
├── README.md
└── LICENSE.md
/example/test.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TensorDuck/videogame_highlights/HEAD/example/test.mp4
--------------------------------------------------------------------------------
/example/train_bad.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TensorDuck/videogame_highlights/HEAD/example/train_bad.mp4
--------------------------------------------------------------------------------
/example/train_good.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TensorDuck/videogame_highlights/HEAD/example/train_good.mp4
--------------------------------------------------------------------------------
/loki/functions/__init__.py:
--------------------------------------------------------------------------------
1 | """Contains analysis and pipeline functions"""
2 | 
3 | from . import evaluation
4 | from . import helper
5 | 
--------------------------------------------------------------------------------
/loki/models/vggish_tensorflow/__init__.py:
--------------------------------------------------------------------------------
1 | """Initialize this subfolder as a package"""
2 | 
3 | from .wrappers import CreateVGGishNetwork, EmbeddingsFromVGGish
4 | 
--------------------------------------------------------------------------------
/loki/processing/__init__.py:
--------------------------------------------------------------------------------
1 | """Methods and classes for loading, writing, and featurizing videos"""
2 | 
3 | from .load import VideoClips, append_clips
4 | from .features import compute_decibels
5 | 
--------------------------------------------------------------------------------
/loki/models/__init__.py:
--------------------------------------------------------------------------------
1 | """Classes and methods for making models that analyze videos"""
2 | 
3 | from .volume import VolumeModel, VolumeClassifier
4 | from .neural_networks import NeuralNetworkClassifier
5 | 
--------------------------------------------------------------------------------
/loki/__init__.py:
--------------------------------------------------------------------------------
1 | from .processing import VideoClips, append_clips, compute_decibels
2 | from .models import VolumeModel, VolumeClassifier, NeuralNetworkClassifier
3 | 
4 | from .functions import evaluation
5 | from .functions import helper
6 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore Everything
2 | *
3 | 
4 | # But not these files
5 | !.gitignore
6 | !*.py
7 | !*.md
8 | !build/add_path.sh
9 | !test.mp4
10 | !train_bad.mp4
11 | !train_good.mp4
12 | !LICENSE.md
13 | 
14 | # ... even if they are in subdirectories
15 | !*/
16 | 
--------------------------------------------------------------------------------
/loki/models/vggish_tensorflow/credits.md:
--------------------------------------------------------------------------------
1 | Credits
2 | =======
3 | This application uses Open Source components. You can find the source code of their open source projects along with license information below. We acknowledge and are grateful to these developers for their contributions to open source.
4 | 
5 | Project: vggish https://github.com/tensorflow/models/tree/master/research/audioset/vggish
6 | 
7 | Copyright 2016 The TensorFlow Authors. All rights reserved.
8 | 
9 | License: Apache License 2.0 https://github.com/tensorflow/models/blob/master/LICENSE
10 | 
--------------------------------------------------------------------------------
/loki/processing/features.py:
--------------------------------------------------------------------------------
1 | """Classes and functions for generating features"""
2 | from librosa import power_to_db
3 | import numpy as np
4 | 
5 | def compute_decibels(data, freq=44100):
6 |     """Compute the total decibels from an audio waveform
7 | 
8 |     Compute the power by taking the square of the waveform. If the
9 |     audio is binaural, then sum up the power of each audio channel.
10 | 
11 |     Arguments:
12 |     ----------
13 |     data -- loki.VideoClips
14 |         Object containing the VideoClips
15 | 
16 |     Keyword Arguments:
17 |     ------------------
18 |     freq -- int -- default=44100
19 |         Frequency at which to extract the audio.
20 | 
21 |     Return:
22 |     -------
23 |     decibels -- list[np.ndarray]:
24 |         The loudness over time of each inputted clip.
25 |     """
26 |     decibels = []
27 | 
28 |     all_audio = data.compute_audio_waveform(freq=freq)
29 | 
30 |     for binaural in all_audio:
31 |         power = binaural ** 2 # square for the power
32 |         #sum up the channels if the audio is dual-channel
33 |         if power.ndim == 2:
34 |             power = np.sum(power, axis=1)
35 | 
36 |         decibel = power_to_db(power)
37 |         decibels.append(decibel)
38 | 
39 |     return decibels
40 | 
--------------------------------------------------------------------------------
/loki/functions/evaluation.py:
--------------------------------------------------------------------------------
1 | """Useful functions for evaluating a model's performance"""
2 | import sklearn.metrics as skmet
3 | 
4 | def get_confusion_matrix(actual, predicted):
5 |     """Get the confusion matrix and statistics
6 | 
7 |     This is a helper function that leverages sklearn to collect the
8 |     desired statistics.
9 | 
10 |     Arguments:
11 |     ----------
12 |     actual -- np.ndarray:
13 |         The actual classes.
14 |     predicted -- np.ndarray:
15 |         The inferred classes
16 | 
17 |     Return:
18 |     -------
19 |     results -- dict:
20 |         Contains the cm (confusion matrix), accuracy, precision and
21 |         recall.
22 |     """
23 |     cm = skmet.confusion_matrix(actual, predicted)
24 | 
25 |     accuracy = (cm[0,0] + cm[1,1]) / len(actual)
26 | 
27 |     precision = cm[1,1] / (cm[0,1] + cm[1,1]) # true positives over false positives and true positives
28 |     recall = cm[1,1] / (cm[1,0] + cm[1,1]) # true positives over false negatives and true positives
29 | 
30 |     results = {'cm':cm, 'accuracy':accuracy, 'precision':precision, 'recall':recall}
31 | 
32 |     return results
33 | 
34 | def print_confusion_matrix(actual, predicted):
35 |     """Print out the confusion matrix
36 | 
37 |     Arguments:
38 |     ----------
39 |     actual -- np.ndarray:
40 |         The actual classes.
41 |     predicted -- np.ndarray:
42 |         The inferred classes
43 |     """
44 |     results = get_confusion_matrix(actual, predicted)
45 | 
46 |     print("Confusion Matrix:")
47 |     print(results['cm'])
48 |     print(f"Accuracy: {results['accuracy']}")
49 |     print(f"Precision: {results['precision']}")
50 |     print(f"Recall: {results['recall']}")
51 | 
--------------------------------------------------------------------------------
/example/example_script.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess as sb
3 | import time
4 | import numpy
5 | import matplotlib.pyplot as plt
6 | 
7 | import loki
8 | 
9 | if __name__ == "__main__":
10 |     cwd = os.getcwd()
11 | 
12 |     print("##################################################")
13 |     print("Load Files")
14 |     print("##################################################")
15 |     #Best practice is to give full paths
16 |     train_files = [f"{cwd}/train_good.mp4", f"{cwd}/train_bad.mp4"]
17 |     test_files = [f"{cwd}/test.mp4"]
18 | 
19 |     train_targets = [1, 0]
20 | 
21 |     train_videos = loki.VideoClips(train_files)
22 |     test_videos = loki.VideoClips(test_files)
23 | 
24 |     #get a trained neural network classifier
25 |     print("##################################################")
26 |     print("Begin Training")
27 |     print("##################################################")
28 |     nnclass = loki.helper.train_classifier(train_videos, train_targets, test_clips=train_videos, test_targets=train_targets, n_epochs=100, class_weights=None, batch_size=None)
29 |     #save the neural network
30 |     nnclass.save("example_nn")
31 | 
32 |     #perform inference on the training data
33 |     train_audio = train_videos.compute_audio_waveform(mono=True)
34 |     inferred = nnclass.infer(train_audio)
35 |     loki.evaluation.print_confusion_matrix(train_targets, inferred)
36 | 
37 |     print("##################################################")
38 |     print("Analyze test.mp4")
39 |     print("##################################################")
40 | 
41 |     #single channel for Loki
42 |     test_audio = test_videos.compute_audio_waveform(mono=True)
43 |     #interest at each time-step
44 |     x_trace, y_trace = nnclass.get_trace(test_audio)
45 |     n_trace = len(x_trace[0])
46 |     print("Time Interest")
47 |     print("------------------")
48 |     for i in range(n_trace):
49 |         print(f"{x_trace[0][i]:.2f} {y_trace[0][i]:.4f}")
50 | 
51 |     print("##################################################")
52 |     print("Find The Most Interesting 1-Second Clip from test.mp4")
53 |     print("##################################################")
54 |     #Use helper function to find the most relevant 0.96-second section
55 |     results = loki.helper.find_best_clip(test_files, 0.96, nn_checkpoint="example_nn")
56 |     #Use helper function to find the most relevant 2.88-second section
57 |     results = loki.helper.find_best_clip(test_files, 3*0.96, nn_checkpoint="example_nn")
58 | 
59 |     print(results)
60 | 
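As a point of reference for the `loki.functions.evaluation` statistics used in the script above, the following minimal sketch (with hypothetical toy labels; the values are illustrative only) shows how accuracy, precision, and recall fall out of the sklearn confusion matrix returned by `get_confusion_matrix`:

```python
import numpy as np
from loki.functions import evaluation

# Hypothetical labels: 1 = interesting clip, 0 = not interesting.
actual = np.array([1, 1, 1, 0, 0, 0])
predicted = np.array([1, 1, 0, 0, 0, 1])

results = evaluation.get_confusion_matrix(actual, predicted)

# sklearn orders the matrix as [[TN, FP], [FN, TP]], so here:
# cm = [[2, 1], [1, 2]]
# accuracy  = (TN + TP) / N   = (2 + 2) / 6 = 0.667
# precision = TP / (TP + FP)  = 2 / (2 + 1) = 0.667
# recall    = TP / (TP + FN)  = 2 / (2 + 1) = 0.667
print(results['cm'])
print(results['accuracy'], results['precision'], results['recall'])
```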
-------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_params.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | """Global parameters for the VGGish model. 23 | 24 | See vggish_slim.py for more information. 25 | """ 26 | 27 | # Architectural constants. 28 | NUM_FRAMES = 96 # Frames in input mel-spectrogram patch. 29 | NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. 30 | EMBEDDING_SIZE = 128 # Size of embedding layer. 31 | 32 | # Hyperparameters used in feature and example generation. 33 | SAMPLE_RATE = 16000 34 | STFT_WINDOW_LENGTH_SECONDS = 0.025 35 | STFT_HOP_LENGTH_SECONDS = 0.010 36 | NUM_MEL_BINS = NUM_BANDS 37 | MEL_MIN_HZ = 125 38 | MEL_MAX_HZ = 7500 39 | LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. 40 | EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames 41 | EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. 42 | 43 | # Parameters used for embedding postprocessing. 44 | PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' 45 | PCA_MEANS_NAME = 'pca_means' 46 | QUANTIZE_MIN_VAL = -2.0 47 | QUANTIZE_MAX_VAL = +2.0 48 | 49 | # Hyperparameters used in training. 50 | INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. 51 | LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. 52 | ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. 53 | 54 | # Names of ops, tensors, and features. 55 | INPUT_OP_NAME = 'vggish/input_features' 56 | INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' 57 | OUTPUT_OP_NAME = 'vggish/embedding' 58 | OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' 59 | AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' 60 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/wrappers.py: -------------------------------------------------------------------------------- 1 | """This file contains wrappers for the VGGish methods 2 | 3 | Large parts of this file was copied from the colab for the VGGish 4 | method, see: 5 | https://colab.research.google.com/drive/1TbX92UL9sYWbdwdGE0rJ9owmezB-Rl1C 6 | """ 7 | import tensorflow as tf 8 | 9 | from . import vggish_slim 10 | from . import vggish_params 11 | from . import vggish_input 12 | 13 | def CreateVGGishNetwork(sess, checkpoint_path, hop_size=0.96): # Hop size is in seconds. 14 | """Define VGGish model, load the checkpoint, and return a dictionary 15 | that points to the different tensors defined by the model. 
16 | """ 17 | vggish_slim.define_vggish_slim() 18 | vggish_params.EXAMPLE_HOP_SECONDS = hop_size 19 | 20 | vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path) 21 | 22 | features_tensor = sess.graph.get_tensor_by_name( 23 | vggish_params.INPUT_TENSOR_NAME) 24 | embedding_tensor = sess.graph.get_tensor_by_name( 25 | vggish_params.OUTPUT_TENSOR_NAME) 26 | 27 | layers = {'conv1': 'vggish/conv1/Relu', 28 | 'pool1': 'vggish/pool1/MaxPool', 29 | 'conv2': 'vggish/conv2/Relu', 30 | 'pool2': 'vggish/pool2/MaxPool', 31 | 'conv3': 'vggish/conv3/conv3_2/Relu', 32 | 'pool3': 'vggish/pool3/MaxPool', 33 | 'conv4': 'vggish/conv4/conv4_2/Relu', 34 | 'pool4': 'vggish/pool4/MaxPool', 35 | 'fc1': 'vggish/fc1/fc1_2/Relu', 36 | 'fc2': 'vggish/fc2/Relu', 37 | 'embedding': 'vggish/embedding', 38 | 'features': 'vggish/input_features', 39 | } 40 | g = tf.get_default_graph() 41 | for k in layers: 42 | layers[k] = g.get_tensor_by_name( layers[k] + ':0') 43 | 44 | return {'features': features_tensor, 45 | 'embedding': embedding_tensor, 46 | 'layers': layers, 47 | } 48 | 49 | def EmbeddingsFromVGGish(sess, vgg, x, sr): 50 | '''Run the VGGish model, starting with a sound (x) at sample rate 51 | (sr). Return a dictionary of embeddings from the different layers 52 | of the model.''' 53 | # Produce a batch of log mel spectrogram examples. 54 | input_batch = vggish_input.waveform_to_examples(x, sr) 55 | # print('Log Mel Spectrogram example: ', input_batch[0]) 56 | 57 | layer_names = vgg['layers'].keys() 58 | tensors = [vgg['layers'][k] for k in layer_names] 59 | 60 | results = sess.run(tensors, 61 | feed_dict={vgg['features']: input_batch}) 62 | 63 | resdict = {} 64 | for i, k in enumerate(layer_names): 65 | resdict[k] = results[i] 66 | 67 | return resdict 68 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_input.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | """Compute input examples for VGGish from audio waveform.""" 23 | 24 | import numpy as np 25 | import resampy 26 | 27 | from . import mel_features 28 | from . import vggish_params 29 | 30 | import soundfile as sf 31 | 32 | 33 | def waveform_to_examples(data, sample_rate): 34 | """Converts audio waveform into an array of examples for VGGish. 35 | 36 | Args: 37 | data: np.array of either one dimension (mono) or two dimensions 38 | (multi-channel, with the outer dimension representing channels). 
39 | Each sample is generally expected to lie in the range [-1.0, +1.0], 40 | although this is not required. 41 | sample_rate: Sample rate of data. 42 | 43 | Returns: 44 | 3-D np.array of shape [num_examples, num_frames, num_bands] which represents 45 | a sequence of examples, each of which contains a patch of log mel 46 | spectrogram, covering num_frames frames of audio and num_bands mel frequency 47 | bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. 48 | """ 49 | # Convert to mono. 50 | if len(data.shape) > 1: 51 | data = np.mean(data, axis=1) 52 | # Resample to the rate assumed by VGGish. 53 | if sample_rate != vggish_params.SAMPLE_RATE: 54 | data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) 55 | 56 | # Compute log mel spectrogram features. 57 | log_mel = mel_features.log_mel_spectrogram( 58 | data, 59 | audio_sample_rate=vggish_params.SAMPLE_RATE, 60 | log_offset=vggish_params.LOG_OFFSET, 61 | window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, 62 | hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, 63 | num_mel_bins=vggish_params.NUM_MEL_BINS, 64 | lower_edge_hertz=vggish_params.MEL_MIN_HZ, 65 | upper_edge_hertz=vggish_params.MEL_MAX_HZ) 66 | 67 | # Frame features into examples. 68 | features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS 69 | example_window_length = int(round( 70 | vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) 71 | example_hop_length = int(round( 72 | vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) 73 | log_mel_examples = mel_features.frame( 74 | log_mel, 75 | window_length=example_window_length, 76 | hop_length=example_hop_length) 77 | return log_mel_examples 78 | 79 | 80 | def wavfile_to_examples(wav_file): 81 | """Convenience wrapper around waveform_to_examples() for a common WAV format. 82 | 83 | Args: 84 | wav_file: String path to a file, or a file-like object. The file 85 | is assumed to contain WAV audio data with signed 16-bit PCM samples. 86 | 87 | Returns: 88 | See waveform_to_examples. 89 | """ 90 | wav_data, sr = sf.read(wav_file, dtype='int16') 91 | assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype 92 | samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] 93 | return waveform_to_examples(samples, sr) 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # videogame_highlights 2 | Machine Learning to automatically generate highlights from videogame streams 3 | 4 | Getting Started 5 | =============== 6 | 7 | Prerequisites 8 | ------------- 9 | This package was written and tested for `Python 3.7.3` compiled using `GCC 7.3.0` in a `conda` environment. 10 | The following packages (and their dependencies) would need to be installed. 11 | The versions listed have been tested in my environment and works, but likely any more recent or backwards compatible version of those packages would also work. 
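After installing the packages listed below, a quick sanity check of import names and versions can help confirm the environment (a minimal, optional sketch; note that `pytorch-cpu`, `scikit-learn`, and `pysoundfile` import as `torch`, `sklearn`, and `soundfile`, and that `ffmpeg` is a system binary checked with `ffmpeg -version` rather than a Python import):

```
import importlib

# Import names for the conda/pip packages listed below; ffmpeg is not a
# Python module and is therefore not included here.
modules = ["numpy", "scipy", "tensorflow", "torch", "sklearn",
           "resampy", "six", "librosa", "moviepy", "soundfile"]
for name in modules:
    try:
        module = importlib.import_module(name)
        print(f"{name}: {getattr(module, '__version__', 'unknown')}")
    except ImportError as err:
        print(f"{name}: MISSING ({err})")
```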
12 | 
13 | *Install with conda*
14 | - [`numpy=1.16.3`](http://www.numpy.org/)
15 | - [`scipy=1.2.1`](http://www.scipy.org/)
16 | - [`tensorflow=1.13.1`](http://www.tensorflow.org/)
17 | - [`pytorch-cpu=1.1.0`](https://pytorch.org/)
18 | - [`scikit-learn=0.21.1`](https://scikit-learn.org/)
19 | - [`resampy=0.2.1`](http://resampy.readthedocs.io/en/latest/)
20 | - [`six=1.12.0`](https://pythonhosted.org/six/)
21 | - [`librosa=0.6.3`](https://librosa.github.io/librosa/)
22 | - [`ffmpeg=4.1.3`](https://ffmpeg.org/)
23 | 
24 | *Install with pip*
25 | - [`moviepy=1.0.0`](https://zulko.github.io/moviepy/)
26 | - [`pysoundfile=0.9.0.post1`](https://pysoundfile.readthedocs.io/)
27 | 
28 | 
29 | Loki Installation
30 | -----------------
31 | 
32 | Once the dependencies are installed, do:
33 | ```
34 | cd build
35 | source add_path.sh
36 | ```
37 | 
38 | The `add_path.sh` script adds the relevant directories to the PYTHONPATH variable.
39 | It also sets the environment variables needed to locate the required checkpoint files.
40 | It also checks for the required VGGish checkpoint file and downloads it with `wget` if it is not found.
41 | The checkpoint must be downloaded inside the build directory for the neural network classifier to work.
42 | It is available in TensorFlow checkpoint format at: [VGGish model checkpoint](https://storage.googleapis.com/audioset/vggish_model.ckpt).
43 | 
44 | The methods in the `loki` analysis package should then be usable as:
45 | 
46 | ```
47 | import loki
48 | 
49 | clips = loki.VideoClips(["example.mp4"])
50 | ```
51 | 
52 | The module `loki.functions.helper` provides convenience functions for processing video files and outputting trained models.
53 | 
54 | Example
55 | =======
56 | To run the example and test the `loki` package, do:
57 | 
58 | ```
59 | cd example
60 | python -m example_script
61 | ```
62 | If there are no errors, this example script will train a model to identify when there is a loud banging sound in a video.
63 | It demonstrates this in the following steps:
64 | 1. Load the local .mp4 files.
65 | 2. Train a neural network classifier on the video data to identify interesting moments. In this case, "interesting" means banging on the tin lid.
66 | 3. Perform inference on the training data and print out the confusion matrix.
67 | 4. Compute an interest-versus-time trace for the test mp4 file.
68 | 5. Find the most interesting 1-second segment and 3-second segment in the test mp4 file.
69 | 
70 | To understand how the more primitive classes work, look in `loki.functions.helper` to see how the helper functions use those classes.
71 | 
72 | Developer Notes
73 | ===============
74 | The .gitignore file ignores all files by default. If you want to add a
75 | new file or filetype to the repo, you must amend the .gitignore file.
76 | 
77 | Acknowledgements
78 | ================
79 | This project was developed as a consulting project at the Insight Artificial Intelligence Program.
80 | I am grateful for the support and guidance the Insight community provided.
81 | I also want to thank the company I consulted with, [Visor](https://visor.gg/), for providing video files to train and test the model.
82 | 
83 | This application uses Open Source components, specifically the files contained in `loki/models/vggish_tensorflow/`. You can find the source code of their open source projects along with license information below. We acknowledge and are grateful to these developers for their contributions to open source.
84 | 85 | Project: vggish https://github.com/tensorflow/models/tree/master/research/audioset/vggish 86 | 87 | Copyright 2016 The TensorFlow Authors. All rights reserved. 88 | 89 | License: Apache License 2.0 https://github.com/tensorflow/models/blob/master/LICENSE 90 | -------------------------------------------------------------------------------- /loki/models/util.py: -------------------------------------------------------------------------------- 1 | """Utility functions for the models sub-package""" 2 | import numpy as np 3 | 4 | def sort_scores_and_remove_overlap(n_top, scores, clip_indices): 5 | """Sort based on the inputted scores and return the n_top scores. 6 | 7 | Overlap is determined where the scene with the highest score is 8 | kept. Subsequent scenes with overlapping time indices are then 9 | ignored. This process is repeated until n_top non-overlapping scenes 10 | are found. 11 | 12 | Scores can be any value that characterizes the interest level of a 13 | scene, with the assumption that higher scores = higher interest. 14 | For example, this could be the average volume of a scene or some 15 | inferred interest level from some classifier. 16 | 17 | Arguments: 18 | ---------- 19 | n_top -- int: 20 | The number of top scoring scenes to return. 21 | scores -- np.ndarray(N,): 22 | The score for each of the N scenes, where higher numbers 23 | translate to more relevant scenes. 24 | clip_indices -- list([int, float, float]): 25 | List of video indices, and time stamps in seconds for each 26 | scene. 27 | 28 | Return: 29 | ------- 30 | best_scores -- np.ndarray(float): 31 | The score for the corresponding scene. 32 | best_scenes -- np.ndarray(float(n_top,3)): 33 | The highest scoring scenes formatted as 34 | [video index, start time, stop time] 35 | """ 36 | #argsort sorts lowest to highest so negate the score 37 | sort_indices = np.argsort(scores * -1) 38 | n_scenes = len(sort_indices) 39 | 40 | #use a while loop until n_top are found, hopefully this is short 41 | n_found = 0 #count number of non-overlapping scores found 42 | scene_index = 0 #keep track of number of scenes 43 | best_scenes = np.zeros((n_top,3)) 44 | best_scores = [] 45 | while n_found < n_top and scene_index < n_scenes: 46 | #terminate the while loop if every scene is checked. 47 | this_idx = sort_indices[scene_index] 48 | this_scene = clip_indices[this_idx] 49 | this_score = scores[this_idx] 50 | #check if overlapping 51 | if not is_overlapping(best_scenes, this_scene): 52 | best_scenes[n_found,:] = this_scene 53 | best_scores.append(this_score) 54 | #increment found index by 1 55 | n_found += 1 56 | 57 | #increment scene_index by 1 58 | scene_index += 1 59 | 60 | return best_scores, best_scenes 61 | 62 | def is_overlapping(all_scenes, check_scene): 63 | """Check the check_scene against all_scenes for overlap 64 | 65 | check_overlap() returns True if there is any overlap with previous 66 | scenes. The Format of each check_scene and elements in all_scenes is 67 | the same. The first element is an integer that denotes the video 68 | index the scene is from. The next two elements are floats that 69 | denote the start and stop times respectively. Therefore, check_scene 70 | is not overlapping if its from a different video than a scene in 71 | all_scenes. If they are from the same video, then they are not 72 | overlapping if check_scene finishes before or starts after the 73 | other scene. 74 | 75 | Arguments: 76 | ---------- 77 | all_scenes -- list(list([int, float, float])): 78 | List of all scenes to check against. 
79 | check_scene -- list([int, float, float]): 80 | The scene you want to check for. 81 | 82 | Return: 83 | ------- 84 | bool 85 | """ 86 | for scene in all_scenes: 87 | #check if it's the same video 88 | if scene[0] == check_scene[0]: 89 | #check if there's any overlap 90 | #first two check if the `scene` happens after `check_scene` 91 | #last two checks if the `scene` happens before `check_scene` 92 | #If all the checks are true, then keep going 93 | #If one of the checks fail, break from loop and return True 94 | if not (scene[1] > check_scene[1] and scene[1] > check_scene[2] and scene[2] < check_scene[1] and scene[2] < check_scene[2]): 95 | return True 96 | 97 | #If the function gets here, there is no overlap 98 | return False 99 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_smoke_test.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | """A smoke test for VGGish. 23 | 24 | This is a simple smoke test of a local install of VGGish and its associated 25 | downloaded files. We create a synthetic sound, extract log mel spectrogram 26 | features, run them through VGGish, post-process the embedding ouputs, and 27 | check some simple statistics of the results, allowing for variations that 28 | might occur due to platform/version differences in the libraries we use. 29 | 30 | Usage: 31 | - Download the VGGish checkpoint and PCA parameters into the same directory as 32 | the VGGish source code. If you keep them elsewhere, update the checkpoint_path 33 | and pca_params_path variables below. 34 | - Run: 35 | $ python vggish_smoke_test.py 36 | """ 37 | 38 | from __future__ import print_function 39 | 40 | import numpy as np 41 | import tensorflow as tf 42 | 43 | import vggish_input 44 | import vggish_params 45 | import vggish_postprocess 46 | import vggish_slim 47 | 48 | print('\nTesting your install of VGGish\n') 49 | 50 | # Paths to downloaded VGGish files. 51 | checkpoint_path = 'vggish_model.ckpt' 52 | pca_params_path = 'vggish_pca_params.npz' 53 | 54 | # Relative tolerance of errors in mean and standard deviation of embeddings. 55 | rel_error = 0.1 # Up to 10% 56 | 57 | # Generate a 1 kHz sine wave at 44.1 kHz (we use a high sampling rate 58 | # to test resampling to 16 kHz during feature extraction). 
59 | num_secs = 3 60 | freq = 1000 61 | sr = 44100 62 | t = np.linspace(0, num_secs, int(num_secs * sr)) 63 | x = np.sin(2 * np.pi * freq * t) 64 | 65 | # Produce a batch of log mel spectrogram examples. 66 | input_batch = vggish_input.waveform_to_examples(x, sr) 67 | print('Log Mel Spectrogram example: ', input_batch[0]) 68 | np.testing.assert_equal( 69 | input_batch.shape, 70 | [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS]) 71 | 72 | # Define VGGish, load the checkpoint, and run the batch through the model to 73 | # produce embeddings. 74 | with tf.Graph().as_default(), tf.Session() as sess: 75 | vggish_slim.define_vggish_slim() 76 | vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path) 77 | 78 | features_tensor = sess.graph.get_tensor_by_name( 79 | vggish_params.INPUT_TENSOR_NAME) 80 | embedding_tensor = sess.graph.get_tensor_by_name( 81 | vggish_params.OUTPUT_TENSOR_NAME) 82 | [embedding_batch] = sess.run([embedding_tensor], 83 | feed_dict={features_tensor: input_batch}) 84 | print('VGGish embedding: ', embedding_batch[0]) 85 | expected_embedding_mean = 0.131 86 | expected_embedding_std = 0.238 87 | np.testing.assert_allclose( 88 | [np.mean(embedding_batch), np.std(embedding_batch)], 89 | [expected_embedding_mean, expected_embedding_std], 90 | rtol=rel_error) 91 | 92 | # Postprocess the results to produce whitened quantized embeddings. 93 | pproc = vggish_postprocess.Postprocessor(pca_params_path) 94 | postprocessed_batch = pproc.postprocess(embedding_batch) 95 | print('Postprocessed VGGish embedding: ', postprocessed_batch[0]) 96 | expected_postprocessed_mean = 123.0 97 | expected_postprocessed_std = 75.0 98 | np.testing.assert_allclose( 99 | [np.mean(postprocessed_batch), np.std(postprocessed_batch)], 100 | [expected_postprocessed_mean, expected_postprocessed_std], 101 | rtol=rel_error) 102 | 103 | print('\nLooks Good To Me!\n') 104 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_postprocess.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | """Post-process embeddings from VGGish.""" 23 | 24 | import numpy as np 25 | 26 | import vggish_params 27 | 28 | 29 | class Postprocessor(object): 30 | """Post-processes VGGish embeddings. 31 | 32 | The initial release of AudioSet included 128-D VGGish embeddings for each 33 | segment of AudioSet. 
These released embeddings were produced by applying 34 | a PCA transformation (technically, a whitening transform is included as well) 35 | and 8-bit quantization to the raw embedding output from VGGish, in order to 36 | stay compatible with the YouTube-8M project which provides visual embeddings 37 | in the same format for a large set of YouTube videos. This class implements 38 | the same PCA (with whitening) and quantization transformations. 39 | """ 40 | 41 | def __init__(self, pca_params_npz_path): 42 | """Constructs a postprocessor. 43 | 44 | Args: 45 | pca_params_npz_path: Path to a NumPy-format .npz file that 46 | contains the PCA parameters used in postprocessing. 47 | """ 48 | params = np.load(pca_params_npz_path) 49 | self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME] 50 | # Load means into a column vector for easier broadcasting later. 51 | self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1) 52 | assert self._pca_matrix.shape == ( 53 | vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), ( 54 | 'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,)) 55 | assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), ( 56 | 'Bad PCA means shape: %r' % (self._pca_means.shape,)) 57 | 58 | def postprocess(self, embeddings_batch): 59 | """Applies postprocessing to a batch of embeddings. 60 | 61 | Args: 62 | embeddings_batch: An nparray of shape [batch_size, embedding_size] 63 | containing output from the embedding layer of VGGish. 64 | 65 | Returns: 66 | An nparray of the same shape as the input but of type uint8, 67 | containing the PCA-transformed and quantized version of the input. 68 | """ 69 | assert len(embeddings_batch.shape) == 2, ( 70 | 'Expected 2-d batch, got %r' % (embeddings_batch.shape,)) 71 | assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, ( 72 | 'Bad batch shape: %r' % (embeddings_batch.shape,)) 73 | 74 | # Apply PCA. 75 | # - Embeddings come in as [batch_size, embedding_size]. 76 | # - Transpose to [embedding_size, batch_size]. 77 | # - Subtract pca_means column vector from each column. 78 | # - Premultiply by PCA matrix of shape [output_dims, input_dims] 79 | # where both are are equal to embedding_size in our case. 80 | # - Transpose result back to [batch_size, embedding_size]. 81 | pca_applied = np.dot(self._pca_matrix, 82 | (embeddings_batch.T - self._pca_means)).T 83 | 84 | # Quantize by: 85 | # - clipping to [min, max] range 86 | clipped_embeddings = np.clip( 87 | pca_applied, vggish_params.QUANTIZE_MIN_VAL, 88 | vggish_params.QUANTIZE_MAX_VAL) 89 | # - convert to 8-bit in range [0.0, 255.0] 90 | quantized_embeddings = ( 91 | (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) * 92 | (255.0 / 93 | (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL))) 94 | # - cast 8-bit float to uint8 95 | quantized_embeddings = quantized_embeddings.astype(np.uint8) 96 | 97 | return quantized_embeddings 98 | -------------------------------------------------------------------------------- /loki/processing/load.py: -------------------------------------------------------------------------------- 1 | """Class and methods for handling loading of video files""" 2 | from moviepy.editor import VideoFileClip 3 | import librosa 4 | 5 | def append_clips(first, second): 6 | """Append two different VideoClips objects 7 | 8 | Arguments: 9 | ---------- 10 | first -- loki.VideoClips: 11 | These filenames will go first. 12 | second -- loki.VideoClips: 13 | These filenames will follow the filenames in first. 
14 | 15 | Return: 16 | ------- 17 | vclips -- loki.VideoClips: 18 | A new VideoClips object with both sets of filenames stored. 19 | """ 20 | #collect the filenames 21 | all_filenames = [] 22 | for fil in first.filenames: 23 | all_filenames.append(fil) 24 | for fil in second.filenames: 25 | all_filenames.append(fil) 26 | 27 | #make the new VideoClips, does not support saving audio information 28 | vclips = VideoClips(all_filenames) 29 | 30 | return vclips 31 | 32 | class VideoClips(): 33 | """Load multiple videos and write out relevant clips/audio 34 | 35 | VideoClips stores a series of video files and provides methods for 36 | editing and outputting clips from the video. 37 | 38 | Arguments: 39 | ---------- 40 | filenames -- list(str): 41 | List of video filenames to load. 42 | 43 | Public Methods: 44 | --------------- 45 | save_clips(): 46 | Save subclips of the loaded videofiles. 47 | """ 48 | 49 | def __init__(self, filenames): 50 | #save the filenames and avoid pass by reference errors 51 | self.filenames = filenames[:] 52 | 53 | #these are attributes resultant from later analysis 54 | self.audio_freq = None 55 | self.audios = None 56 | 57 | @property 58 | def nclips(self): 59 | return len(self.filenames) 60 | 61 | def write_clips(self, time_stamps, write_fps=12, write_ext=".mp4", write_names=None): 62 | """Write selected clips to a file 63 | 64 | Save out N clips from the previously stored video clips. 65 | 66 | Arguments: 67 | ---------- 68 | time_stamps -- Nx3 list or np.ndarray: 69 | Nx3 List giving the video index, followed by the start and 70 | stop times in seconds. 71 | 72 | Keyword Arguments: 73 | ------------------ 74 | write_fps -- int -- default=12: 75 | Frames per a second to write out. 76 | write_ext -- str -- default="mp4": 77 | File extension format to save with. 78 | write_names -- list(str) -- default=None: 79 | List of len(N) to write output files to. If None, a default 80 | name format will be used. 81 | """ 82 | 83 | #If write_names was not given, use a generic name output format 84 | if write_names is None: 85 | write_names = [] 86 | for stamp in time_stamps: 87 | vid_idx = int(stamp[0]) 88 | start_t = stamp[1] 89 | end_t = stamp[2] 90 | write_names.append(f"vid{vid_idx}_{start_t}-{end_t}{write_ext}") 91 | 92 | #Iterate over time_stamps and write out the specified clips 93 | for i_count, stamp in enumerate(time_stamps): 94 | this_vid = VideoFileClip(self.filenames[0]) 95 | clip = this_vid.subclip(stamp[1], stamp[2]) 96 | clip.write_videofile(write_names[i_count], fps=write_fps) 97 | clip.close() 98 | 99 | def compute_audio_waveform(self, freq=44100, mono=False): 100 | """Compute the binaural audio time series 101 | 102 | For each video stored, extract the binaural audio. This audio 103 | is then stored in the attribute self.audios, but also returns 104 | the list for use in further functions. 105 | 106 | Keyword Arguments: 107 | ------------------ 108 | freq -- int -- default=44100: 109 | Frequency of the computed sound in Hz. Default is 44.1 kHz. 110 | mono -- bool -- default=False: 111 | If True, return mono-channel instead of binaural audio. 112 | 113 | Return: 114 | ------- 115 | audios -- list(np.ndarray): 116 | Return a list of audio waveforms. 
117 | """ 118 | 119 | self.audio_freq = 44100 120 | #only extract audio once (saves time) 121 | if self.audios is None: 122 | self.audios = [] 123 | for fname in self.filenames: 124 | clip = VideoFileClip(fname) 125 | audio = clip.audio 126 | wav = audio.to_soundarray(fps=freq) 127 | clip.close() 128 | #convert to mono-channel 129 | if mono: 130 | #librosa requires shape (2,N), moviepy gives shape (N,2) 131 | wav = librosa.to_mono(wav.transpose()) 132 | self.audios.append(wav) 133 | 134 | return self.audios 135 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_slim.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | """Defines the 'VGGish' model used to generate AudioSet embedding features. 23 | 24 | The public AudioSet release (https://research.google.com/audioset/download.html) 25 | includes 128-D features extracted from the embedding layer of a VGG-like model 26 | that was trained on a large Google-internal YouTube dataset. Here we provide 27 | a TF-Slim definition of the same model, without any dependences on libraries 28 | internal to Google. We call it 'VGGish'. 29 | 30 | Note that we only define the model up to the embedding layer, which is the 31 | penultimate layer before the final classifier layer. We also provide various 32 | hyperparameter values (in vggish_params.py) that were used to train this model 33 | internally. 34 | 35 | For comparison, here is TF-Slim's VGG definition: 36 | https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py 37 | """ 38 | 39 | import tensorflow as tf 40 | from . import vggish_params as params 41 | 42 | slim = tf.contrib.slim 43 | 44 | 45 | def define_vggish_slim(training=False): 46 | """Defines the VGGish TensorFlow model. 47 | 48 | All ops are created in the current default graph, under the scope 'vggish/'. 49 | 50 | The input is a placeholder named 'vggish/input_features' of type float32 and 51 | shape [batch_size, num_frames, num_bands] where batch_size is variable and 52 | num_frames and num_bands are constants, and [num_frames, num_bands] represents 53 | a log-mel-scale spectrogram patch covering num_bands frequency bands and 54 | num_frames time frames (where each frame step is usually 10ms). This is 55 | produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET). 
56 | The output is an op named 'vggish/embedding' which produces the activations of 57 | a 128-D embedding layer, which is usually the penultimate layer when used as 58 | part of a full model with a final classifier layer. 59 | 60 | Args: 61 | training: If true, all parameters are marked trainable. 62 | 63 | Returns: 64 | The op 'vggish/embeddings'. 65 | """ 66 | # Defaults: 67 | # - All weights are initialized to N(0, INIT_STDDEV). 68 | # - All biases are initialized to 0. 69 | # - All activations are ReLU. 70 | # - All convolutions are 3x3 with stride 1 and SAME padding. 71 | # - All max-pools are 2x2 with stride 2 and SAME padding. 72 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 73 | weights_initializer=tf.truncated_normal_initializer( 74 | stddev=params.INIT_STDDEV), 75 | biases_initializer=tf.zeros_initializer(), 76 | activation_fn=tf.nn.relu, 77 | trainable=training), \ 78 | slim.arg_scope([slim.conv2d], 79 | kernel_size=[3, 3], stride=1, padding='SAME'), \ 80 | slim.arg_scope([slim.max_pool2d], 81 | kernel_size=[2, 2], stride=2, padding='SAME'), \ 82 | tf.variable_scope('vggish'): 83 | # Input: a batch of 2-D log-mel-spectrogram patches. 84 | features = tf.placeholder( 85 | tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS), 86 | name='input_features') 87 | # Reshape to 4-D so that we can convolve a batch with conv2d(). 88 | net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1]) 89 | 90 | # The VGG stack of alternating convolutions and max-pools. 91 | net = slim.conv2d(net, 64, scope='conv1') 92 | net = slim.max_pool2d(net, scope='pool1') 93 | net = slim.conv2d(net, 128, scope='conv2') 94 | net = slim.max_pool2d(net, scope='pool2') 95 | net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3') 96 | net = slim.max_pool2d(net, scope='pool3') 97 | net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4') 98 | net = slim.max_pool2d(net, scope='pool4') 99 | 100 | # Flatten before entering fully-connected layers 101 | net = slim.flatten(net) 102 | net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1') 103 | # The embedding layer. 104 | net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2') 105 | return tf.identity(net, name='embedding') 106 | 107 | 108 | def load_vggish_slim_checkpoint(session, checkpoint_path): 109 | """Loads a pre-trained VGGish-compatible checkpoint. 110 | 111 | This function can be used as an initialization function (referred to as 112 | init_fn in TensorFlow documentation) which is called in a Session after 113 | initializating all variables. When used as an init_fn, this will load 114 | a pre-trained checkpoint that is compatible with the VGGish model 115 | definition. Only variables defined by VGGish will be loaded. 116 | 117 | Args: 118 | session: an active TensorFlow session. 119 | checkpoint_path: path to a file containing a checkpoint that is 120 | compatible with the VGGish model definition. 121 | """ 122 | # Get the list of names of all VGGish variables that exist in 123 | # the checkpoint (i.e., all inference-mode VGGish variables). 124 | with tf.Graph().as_default(): 125 | define_vggish_slim(training=False) 126 | vggish_var_names = [v.name for v in tf.global_variables()] 127 | 128 | # Get the list of all currently existing variables that match 129 | # the list of variable names we just computed. 130 | vggish_vars = [v for v in tf.global_variables() if v.name in vggish_var_names] 131 | 132 | # Use a Saver to restore just the variables selected above. 
133 | saver = tf.train.Saver(vggish_vars, name='vggish_load_pretrained', 134 | write_version=1) 135 | saver.restore(session, checkpoint_path) 136 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_inference_demo.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | r"""A simple demonstration of running VGGish in inference mode. 23 | 24 | This is intended as a toy example that demonstrates how the various building 25 | blocks (feature extraction, model definition and loading, postprocessing) work 26 | together in an inference context. 27 | 28 | A WAV file (assumed to contain signed 16-bit PCM samples) is read in, converted 29 | into log mel spectrogram examples, fed into VGGish, the raw embedding output is 30 | whitened and quantized, and the postprocessed embeddings are optionally written 31 | in a SequenceExample to a TFRecord file (using the same format as the embedding 32 | features released in AudioSet). 33 | 34 | Usage: 35 | # Run a WAV file through the model and print the embeddings. The model 36 | # checkpoint is loaded from vggish_model.ckpt and the PCA parameters are 37 | # loaded from vggish_pca_params.npz in the current directory. 38 | $ python vggish_inference_demo.py --wav_file /path/to/a/wav/file 39 | 40 | # Run a WAV file through the model and also write the embeddings to 41 | # a TFRecord file. The model checkpoint and PCA parameters are explicitly 42 | # passed in as well. 43 | $ python vggish_inference_demo.py --wav_file /path/to/a/wav/file \ 44 | --tfrecord_file /path/to/tfrecord/file \ 45 | --checkpoint /path/to/model/checkpoint \ 46 | --pca_params /path/to/pca/params 47 | 48 | # Run a built-in input (a sine wav) through the model and print the 49 | # embeddings. Associated model files are read from the current directory. 50 | $ python vggish_inference_demo.py 51 | """ 52 | 53 | from __future__ import print_function 54 | 55 | import numpy as np 56 | from scipy.io import wavfile 57 | import six 58 | import tensorflow as tf 59 | 60 | import vggish_input 61 | import vggish_params 62 | import vggish_postprocess 63 | import vggish_slim 64 | 65 | flags = tf.app.flags 66 | 67 | flags.DEFINE_string( 68 | 'wav_file', None, 69 | 'Path to a wav file. Should contain signed 16-bit PCM samples. 
' 70 | 'If none is provided, a synthetic sound is used.') 71 | 72 | flags.DEFINE_string( 73 | 'checkpoint', 'vggish_model.ckpt', 74 | 'Path to the VGGish checkpoint file.') 75 | 76 | flags.DEFINE_string( 77 | 'pca_params', 'vggish_pca_params.npz', 78 | 'Path to the VGGish PCA parameters file.') 79 | 80 | flags.DEFINE_string( 81 | 'tfrecord_file', None, 82 | 'Path to a TFRecord file where embeddings will be written.') 83 | 84 | FLAGS = flags.FLAGS 85 | 86 | 87 | def main(_): 88 | # In this simple example, we run the examples from a single audio file through 89 | # the model. If none is provided, we generate a synthetic input. 90 | if FLAGS.wav_file: 91 | wav_file = FLAGS.wav_file 92 | else: 93 | # Write a WAV of a sine wav into an in-memory file object. 94 | num_secs = 5 95 | freq = 1000 96 | sr = 44100 97 | t = np.linspace(0, num_secs, int(num_secs * sr)) 98 | x = np.sin(2 * np.pi * freq * t) 99 | # Convert to signed 16-bit samples. 100 | samples = np.clip(x * 32768, -32768, 32767).astype(np.int16) 101 | wav_file = six.BytesIO() 102 | wavfile.write(wav_file, sr, samples) 103 | wav_file.seek(0) 104 | examples_batch = vggish_input.wavfile_to_examples(wav_file) 105 | print(examples_batch) 106 | 107 | # Prepare a postprocessor to munge the model embeddings. 108 | pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params) 109 | 110 | # If needed, prepare a record writer to store the postprocessed embeddings. 111 | writer = tf.python_io.TFRecordWriter( 112 | FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None 113 | 114 | with tf.Graph().as_default(), tf.Session() as sess: 115 | # Define the model in inference mode, load the checkpoint, and 116 | # locate input and output tensors. 117 | vggish_slim.define_vggish_slim(training=False) 118 | vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint) 119 | features_tensor = sess.graph.get_tensor_by_name( 120 | vggish_params.INPUT_TENSOR_NAME) 121 | embedding_tensor = sess.graph.get_tensor_by_name( 122 | vggish_params.OUTPUT_TENSOR_NAME) 123 | 124 | # Run inference and postprocessing. 125 | [embedding_batch] = sess.run([embedding_tensor], 126 | feed_dict={features_tensor: examples_batch}) 127 | print(embedding_batch) 128 | postprocessed_batch = pproc.postprocess(embedding_batch) 129 | print(postprocessed_batch) 130 | 131 | # Write the postprocessed embeddings as a SequenceExample, in a similar 132 | # format as the features released in AudioSet. Each row of the batch of 133 | # embeddings corresponds to roughly a second of audio (96 10ms frames), and 134 | # the rows are written as a sequence of bytes-valued features, where each 135 | # feature value contains the 128 bytes of the whitened quantized embedding. 
136 | seq_example = tf.train.SequenceExample( 137 | feature_lists=tf.train.FeatureLists( 138 | feature_list={ 139 | vggish_params.AUDIO_EMBEDDING_FEATURE_NAME: 140 | tf.train.FeatureList( 141 | feature=[ 142 | tf.train.Feature( 143 | bytes_list=tf.train.BytesList( 144 | value=[embedding.tobytes()])) 145 | for embedding in postprocessed_batch 146 | ] 147 | ) 148 | } 149 | ) 150 | ) 151 | print(seq_example) 152 | if writer: 153 | writer.write(seq_example.SerializeToString()) 154 | 155 | if writer: 156 | writer.close() 157 | 158 | if __name__ == '__main__': 159 | tf.app.run() 160 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/README.md: -------------------------------------------------------------------------------- 1 | # VGGish 2 | 3 | The initial AudioSet release included 128-dimensional embeddings of each 4 | AudioSet segment produced from a VGG-like audio classification model that was 5 | trained on a large YouTube dataset (a preliminary version of what later became 6 | [YouTube-8M](https://research.google.com/youtube8m)). 7 | 8 | We provide a TensorFlow definition of this model, which we call __*VGGish*__, as 9 | well as supporting code to extract input features for the model from audio 10 | waveforms and to post-process the model embedding output into the same format as 11 | the released embedding features. 12 | 13 | ## Installation 14 | 15 | VGGish depends on the following Python packages: 16 | 17 | * [`numpy`](http://www.numpy.org/) 18 | * [`scipy`](http://www.scipy.org/) 19 | * [`resampy`](http://resampy.readthedocs.io/en/latest/) 20 | * [`tensorflow`](http://www.tensorflow.org/) 21 | * [`six`](https://pythonhosted.org/six/) 22 | * [`pysoundfile`](https://pysoundfile.readthedocs.io/) 23 | 24 | These are all easily installable via, e.g., `pip install numpy` (as in the 25 | example command sequence below). 26 | 27 | Any reasonably recent version of these packages should work. TensorFlow should 28 | be at least version 1.0. We have tested that everything works on Ubuntu and 29 | Windows 10 with Python 3.6.6, Numpy v1.15.4, SciPy v1.1.0, resampy v0.2.1, 30 | TensorFlow v1.3.0, Six v1.11.0 and PySoundFile 0.9.0. 31 | 32 | VGGish also requires downloading two data files: 33 | 34 | * [VGGish model checkpoint](https://storage.googleapis.com/audioset/vggish_model.ckpt), 35 | in TensorFlow checkpoint format. 36 | * [Embedding PCA parameters](https://storage.googleapis.com/audioset/vggish_pca_params.npz), 37 | in NumPy compressed archive format. 38 | 39 | After downloading these files into the same directory as this README, the 40 | installation can be tested by running `python vggish_smoke_test.py` which 41 | runs a known signal through the model and checks the output. 42 | 43 | Here's a sample installation and test session: 44 | 45 | ```shell 46 | # You can optionally install and test VGGish within a Python virtualenv, which 47 | # is useful for isolating changes from the rest of your system. For example, you 48 | # may have an existing version of some packages that you do not want to upgrade, 49 | # or you want to try Python 3 instead of Python 2. 
If you decide to use a 50 | # virtualenv, you can create one by running 51 | # $ virtualenv vggish # For Python 2 52 | # or 53 | # $ python3 -m venv vggish # For Python 3 54 | # and then enter the virtual environment by running 55 | # $ source vggish/bin/activate # Assuming you use bash 56 | # Leave the virtual environment at the end of the session by running 57 | # $ deactivate 58 | # Within the virtual environment, do not use 'sudo'. 59 | 60 | # Upgrade pip first. 61 | $ sudo python -m pip install --upgrade pip 62 | 63 | # Install dependences. Resampy needs to be installed after NumPy and SciPy 64 | # are already installed. 65 | $ sudo pip install numpy scipy 66 | $ sudo pip install resampy tensorflow six 67 | 68 | # Clone TensorFlow models repo into a 'models' directory. 69 | $ git clone https://github.com/tensorflow/models.git 70 | $ cd models/research/audioset 71 | # Download data files into same directory as code. 72 | $ curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt 73 | $ curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz 74 | 75 | # Installation ready, let's test it. 76 | $ python vggish_smoke_test.py 77 | # If we see "Looks Good To Me", then we're all set. 78 | ``` 79 | 80 | ## Usage 81 | 82 | VGGish can be used in two ways: 83 | 84 | * *As a feature extractor*: VGGish converts audio input features into a 85 | semantically meaningful, high-level 128-D embedding which can be fed as input 86 | to a downstream classification model. The downstream model can be shallower 87 | than usual because the VGGish embedding is more semantically compact than raw 88 | audio features. 89 | 90 | So, for example, you could train a classifier for 10 of the AudioSet classes 91 | by using the released embeddings as features. Then, you could use that 92 | trained classifier with any arbitrary audio input by running the audio through 93 | the audio feature extractor and VGGish model provided here, passing the 94 | resulting embedding features as input to your trained model. 95 | `vggish_inference_demo.py` shows how to produce VGGish embeddings from 96 | arbitrary audio. 97 | 98 | * *As part of a larger model*: Here, we treat VGGish as a "warm start" for the 99 | lower layers of a model that takes audio features as input and adds more 100 | layers on top of the VGGish embedding. This can be used to fine-tune VGGish 101 | (or parts thereof) if you have large datasets that might be very different 102 | from the typical YouTube video clip. `vggish_train_demo.py` shows how to add 103 | layers on top of VGGish and train the whole model. 104 | 105 | ## About the Model 106 | 107 | The VGGish code layout is as follows: 108 | 109 | * `vggish_slim.py`: Model definition in TensorFlow Slim notation. 110 | * `vggish_params.py`: Hyperparameters. 111 | * `vggish_input.py`: Converter from audio waveform into input examples. 112 | * `mel_features.py`: Audio feature extraction helpers. 113 | * `vggish_postprocess.py`: Embedding postprocessing. 114 | * `vggish_inference_demo.py`: Demo of VGGish in inference mode. 115 | * `vggish_train_demo.py`: Demo of VGGish in training mode. 116 | * `vggish_smoke_test.py`: Simple test of a VGGish installation 117 | 118 | ### Architecture 119 | 120 | See `vggish_slim.py` and `vggish_params.py`. 121 | 122 | VGGish is a variant of the [VGG](https://arxiv.org/abs/1409.1556) model, in 123 | particular Configuration A with 11 weight layers. 
Specifically, here are the 124 | changes we made: 125 | 126 | * The input size was changed to 96x64 for log mel spectrogram audio inputs. 127 | 128 | * We drop the last group of convolutional and maxpool layers, so we now have 129 | only four groups of convolution/maxpool layers instead of five. 130 | 131 | * Instead of a 1000-wide fully connected layer at the end, we use a 128-wide 132 | fully connected layer. This acts as a compact embedding layer. 133 | 134 | The model definition provided here defines layers up to and including the 135 | 128-wide embedding layer. 136 | 137 | ### Input: Audio Features 138 | 139 | See `vggish_input.py` and `mel_features.py`. 140 | 141 | VGGish was trained with audio features computed as follows: 142 | 143 | * All audio is resampled to 16 kHz mono. 144 | * A spectrogram is computed using magnitudes of the Short-Time Fourier Transform 145 | with a window size of 25 ms, a window hop of 10 ms, and a periodic Hann 146 | window. 147 | * A mel spectrogram is computed by mapping the spectrogram to 64 mel bins 148 | covering the range 125-7500 Hz. 149 | * A stabilized log mel spectrogram is computed by applying 150 | log(mel-spectrum + 0.01) where the offset is used to avoid taking a logarithm 151 | of zero. 152 | * These features are then framed into non-overlapping examples of 0.96 seconds, 153 | where each example covers 64 mel bands and 96 frames of 10 ms each. 154 | 155 | We provide our own NumPy implementation that produces features that are very 156 | similar to those produced by our internal production code. This results in 157 | embedding outputs that are closely match the embeddings that we have already 158 | released. Note that these embeddings will *not* be bit-for-bit identical to the 159 | released embeddings due to small differences between the feature computation 160 | code paths, and even between two different installations of VGGish with 161 | different underlying libraries and hardware. However, we expect that the 162 | embeddings will be equivalent in the context of a downstream classification 163 | task. 164 | 165 | ### Output: Embeddings 166 | 167 | See `vggish_postprocess.py`. 168 | 169 | The released AudioSet embeddings were postprocessed before release by applying a 170 | PCA transformation (which performs both PCA and whitening) as well as 171 | quantization to 8 bits per embedding element. This was done to be compatible 172 | with the [YouTube-8M](https://research.google.com/youtube8m) project which has 173 | released visual and audio embeddings for millions of YouTube videos in the same 174 | PCA/whitened/quantized format. 175 | 176 | We provide a Python implementation of the postprocessing which can be applied to 177 | batches of embeddings produced by VGGish. `vggish_inference_demo.py` shows how 178 | the postprocessor can be run after inference. 179 | 180 | If you don't need to use the released embeddings or YouTube-8M, then you could 181 | skip postprocessing and use raw embeddings. 182 | 183 | A [Colab](https://colab.research.google.com/) 184 | showing how to download the model and calculate the embeddings on your 185 | own sound data is available here: 186 | [AudioSet Embedding Colab](https://colab.research.google.com/drive/1TbX92UL9sYWbdwdGE0rJ9owmezB-Rl1C). 
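For reference, the sketch below shows roughly how the feature-extractor path described above is driven from Python in this repository, using the thin wrappers re-exported from `loki.models.vggish_tensorflow`. The checkpoint path, waveform, and sampling rate are placeholders, and the checkpoint is assumed to have been downloaded as described earlier.

```python
# Rough sketch only: extract 128-D embeddings from a mono waveform.
import numpy as np
import tensorflow as tf

from loki.models.vggish_tensorflow import CreateVGGishNetwork, EmbeddingsFromVGGish

sr = 44100                                   # placeholder sampling rate
waveform = np.random.uniform(-1, 1, 5 * sr)  # placeholder 5-second mono clip

tf.reset_default_graph()
with tf.Session() as sess:
    vgg = CreateVGGishNetwork(sess, "vggish_model.ckpt")
    result = EmbeddingsFromVGGish(sess, vgg, waveform, sr)

print(result["embedding"].shape)             # roughly one 128-D row per 0.96 s of audio
```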
187 | 188 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/vggish_train_demo.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | r"""A simple demonstration of running VGGish in training mode. 23 | 24 | This is intended as a toy example that demonstrates how to use the VGGish model 25 | definition within a larger model that adds more layers on top, and then train 26 | the larger model. If you let VGGish train as well, then this allows you to 27 | fine-tune the VGGish model parameters for your application. If you don't let 28 | VGGish train, then you use VGGish as a feature extractor for the layers above 29 | it. 30 | 31 | For this toy task, we are training a classifier to distinguish between three 32 | classes: sine waves, constant signals, and white noise. We generate synthetic 33 | waveforms from each of these classes, convert into shuffled batches of log mel 34 | spectrogram examples with associated labels, and feed the batches into a model 35 | that includes VGGish at the bottom and a couple of additional layers on top. We 36 | also plumb in labels that are associated with the examples, which feed a label 37 | loss used for training. 38 | 39 | Usage: 40 | # Run training for 100 steps using a model checkpoint in the default 41 | # location (vggish_model.ckpt in the current directory). Allow VGGish 42 | # to get fine-tuned. 43 | $ python vggish_train_demo.py --num_batches 100 44 | 45 | # Same as before but run for fewer steps and don't change VGGish parameters 46 | # and use a checkpoint in a different location 47 | $ python vggish_train_demo.py --num_batches 50 \ 48 | --train_vggish=False \ 49 | --checkpoint /path/to/model/checkpoint 50 | """ 51 | 52 | from __future__ import print_function 53 | 54 | from random import shuffle 55 | 56 | import numpy as np 57 | import tensorflow as tf 58 | 59 | import vggish_input 60 | import vggish_params 61 | import vggish_slim 62 | 63 | flags = tf.app.flags 64 | slim = tf.contrib.slim 65 | 66 | flags.DEFINE_integer( 67 | 'num_batches', 30, 68 | 'Number of batches of examples to feed into the model. Each batch is of ' 69 | 'variable size and contains shuffled examples of each class of audio.') 70 | 71 | flags.DEFINE_boolean( 72 | 'train_vggish', True, 73 | 'If True, allow VGGish parameters to change during training, thus ' 74 | 'fine-tuning VGGish. 
If False, VGGish parameters are fixed, thus using ' 75 | 'VGGish as a fixed feature extractor.') 76 | 77 | flags.DEFINE_string( 78 | 'checkpoint', 'vggish_model.ckpt', 79 | 'Path to the VGGish checkpoint file.') 80 | 81 | FLAGS = flags.FLAGS 82 | 83 | _NUM_CLASSES = 3 84 | 85 | 86 | def _get_examples_batch(): 87 | """Returns a shuffled batch of examples of all audio classes. 88 | 89 | Note that this is just a toy function because this is a simple demo intended 90 | to illustrate how the training code might work. 91 | 92 | Returns: 93 | a tuple (features, labels) where features is a NumPy array of shape 94 | [batch_size, num_frames, num_bands] where the batch_size is variable and 95 | each row is a log mel spectrogram patch of shape [num_frames, num_bands] 96 | suitable for feeding VGGish, while labels is a NumPy array of shape 97 | [batch_size, num_classes] where each row is a multi-hot label vector that 98 | provides the labels for corresponding rows in features. 99 | """ 100 | # Make a waveform for each class. 101 | num_seconds = 5 102 | sr = 44100 # Sampling rate. 103 | t = np.linspace(0, num_seconds, int(num_seconds * sr)) # Time axis. 104 | # Random sine wave. 105 | freq = np.random.uniform(100, 1000) 106 | sine = np.sin(2 * np.pi * freq * t) 107 | # Random constant signal. 108 | magnitude = np.random.uniform(-1, 1) 109 | const = magnitude * t 110 | # White noise. 111 | noise = np.random.normal(-1, 1, size=t.shape) 112 | 113 | # Make examples of each signal and corresponding labels. 114 | # Sine is class index 0, Const class index 1, Noise class index 2. 115 | sine_examples = vggish_input.waveform_to_examples(sine, sr) 116 | sine_labels = np.array([[1, 0, 0]] * sine_examples.shape[0]) 117 | const_examples = vggish_input.waveform_to_examples(const, sr) 118 | const_labels = np.array([[0, 1, 0]] * const_examples.shape[0]) 119 | noise_examples = vggish_input.waveform_to_examples(noise, sr) 120 | noise_labels = np.array([[0, 0, 1]] * noise_examples.shape[0]) 121 | 122 | # Shuffle (example, label) pairs across all classes. 123 | all_examples = np.concatenate((sine_examples, const_examples, noise_examples)) 124 | all_labels = np.concatenate((sine_labels, const_labels, noise_labels)) 125 | labeled_examples = list(zip(all_examples, all_labels)) 126 | shuffle(labeled_examples) 127 | 128 | # Separate and return the features and labels. 129 | features = [example for (example, _) in labeled_examples] 130 | labels = [label for (_, label) in labeled_examples] 131 | return (features, labels) 132 | 133 | 134 | def main(_): 135 | with tf.Graph().as_default(), tf.Session() as sess: 136 | # Define VGGish. 137 | embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish) 138 | 139 | # Define a shallow classification model and associated training ops on top 140 | # of VGGish. 141 | with tf.variable_scope('mymodel'): 142 | # Add a fully connected layer with 100 units. 143 | num_units = 100 144 | fc = slim.fully_connected(embeddings, num_units) 145 | 146 | # Add a classifier layer at the end, consisting of parallel logistic 147 | # classifiers, one per class. This allows for multi-class tasks. 148 | logits = slim.fully_connected( 149 | fc, _NUM_CLASSES, activation_fn=None, scope='logits') 150 | tf.sigmoid(logits, name='prediction') 151 | 152 | # Add training ops. 
153 |     with tf.variable_scope('train'):
154 |       global_step = tf.Variable(
155 |           0, name='global_step', trainable=False,
156 |           collections=[tf.GraphKeys.GLOBAL_VARIABLES,
157 |                        tf.GraphKeys.GLOBAL_STEP])
158 | 
159 |       # Labels are assumed to be fed as a batch multi-hot vectors, with
160 |       # a 1 in the position of each positive class label, and 0 elsewhere.
161 |       labels = tf.placeholder(
162 |           tf.float32, shape=(None, _NUM_CLASSES), name='labels')
163 | 
164 |       # Cross-entropy label loss.
165 |       xent = tf.nn.sigmoid_cross_entropy_with_logits(
166 |           logits=logits, labels=labels, name='xent')
167 |       loss = tf.reduce_mean(xent, name='loss_op')
168 |       tf.summary.scalar('loss', loss)
169 | 
170 |       # We use the same optimizer and hyperparameters as used to train VGGish.
171 |       optimizer = tf.train.AdamOptimizer(
172 |           learning_rate=vggish_params.LEARNING_RATE,
173 |           epsilon=vggish_params.ADAM_EPSILON)
174 |       optimizer.minimize(loss, global_step=global_step, name='train_op')
175 | 
176 |     # Initialize all variables in the model, and then load the pre-trained
177 |     # VGGish checkpoint.
178 |     sess.run(tf.global_variables_initializer())
179 |     vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
180 | 
181 |     # Locate all the tensors and ops we need for the training loop.
182 |     features_tensor = sess.graph.get_tensor_by_name(
183 |         vggish_params.INPUT_TENSOR_NAME)
184 |     labels_tensor = sess.graph.get_tensor_by_name('mymodel/train/labels:0')
185 |     global_step_tensor = sess.graph.get_tensor_by_name(
186 |         'mymodel/train/global_step:0')
187 |     loss_tensor = sess.graph.get_tensor_by_name('mymodel/train/loss_op:0')
188 |     train_op = sess.graph.get_operation_by_name('mymodel/train/train_op')
189 | 
190 |     # The training loop.
191 |     for _ in range(FLAGS.num_batches):
192 |       (features, labels) = _get_examples_batch()
193 |       [num_steps, loss, _] = sess.run(
194 |           [global_step_tensor, loss_tensor, train_op],
195 |           feed_dict={features_tensor: features, labels_tensor: labels})
196 |       print('Step %d: loss %g' % (num_steps, loss))
197 | 
198 | if __name__ == '__main__':
199 |   tf.app.run()
200 | 
--------------------------------------------------------------------------------
/loki/models/volume.py:
--------------------------------------------------------------------------------
1 | """Class and methods for detecting loudest portions of videos"""
2 | import math
3 | import numpy as np
4 | import sklearn.metrics as skmet
5 | 
6 | from .util import sort_scores_and_remove_overlap
7 | 
8 | class VolumeClassifier():
9 |     """A classifier that classifies interesting scenes based on volume
10 | 
11 |     This classifier classes a scene as interesting if its average
12 |     volume is above a learned cutoff; scenes at or below the cutoff
13 |     are classified as uninteresting.
14 | 
15 |     Attributes:
16 |     -----------
17 |     volume_cutoff -- float:
18 |         Scenes with average volume above volume_cutoff are classified as
19 |         interesting. Those less than or equal to volume_cutoff are
20 |         classified as uninteresting.
21 |     """
22 | 
23 |     def __init__(self):
24 |         self.volume_cutoff = 0
25 | 
26 |     def save(self, savefile=None):
27 |         """Save the model's learned parameters
28 | 
29 |         Write out the volume_cutoff to a file. If no file is specified,
30 |         then write into volume_classifier_model.dat in the current
31 |         directory.
32 | 
33 |         Keyword Arguments:
34 |         ------------------
35 |         savefile -- str -- default=None:
36 |             The file name to write the parameters into.
37 |         """
38 |         if savefile is None:
39 |             savefile = "volume_classifier_model.dat"
40 |         f = open(savefile, "w")
41 |         f.write(f"volume_cutoff = {self.volume_cutoff}\n")
42 |         f.close()
43 | 
44 |     def load(self, savefile):
45 |         """Load a previous model's learned parameters
46 | 
47 |         Arguments:
48 |         ----------
49 |         savefile -- str:
50 |             The file name to load the parameters from.
51 |         """
52 |         f = open(savefile, "r")
53 | 
54 |         for line in f:
55 |             #load the parameters one by one
56 |             stuff = line.strip().split()
57 |             if stuff[0] == "volume_cutoff":
58 |                 self.volume_cutoff = float(stuff[2])
59 |         f.close()
60 | 
61 | 
62 |     def train(self, training_x, training_y):
63 |         """Train the volume classifier
64 | 
65 |         Train the volume classifier (currently a binary classifier).
66 |         Default loss function is the hamming loss which, for a binary
67 |         classifier, is one minus the accuracy.
68 |         The training here is simply finding the threshold that maximizes
69 |         the accuracy. It's not technically training but the method is
70 |         named such for consistency.
71 | 
72 |         Arguments
73 |         ---------
74 |         training_x -- list[np.ndarray]:
75 |             The volume (in decibels) of the training data. Of length N.
76 |         training_y -- np.ndarray or list:
77 |             The corresponding classes of the training data. Also of
78 |             length N. A class of 1 is interesting, a class of 0 is
79 |             uninteresting.
80 |         """
81 |         average_loudness = []
82 |         for audioclip in training_x:
83 |             average_loudness.append(audioclip.mean())
84 |         average_loudness = np.array(average_loudness)
85 | 
86 |         best_loss = 1  # hamming loss goes from 0 to 1
87 |         best_cutoff = None
88 | 
89 |         #Candidate cutoffs: midpoints between adjacent unique values
90 |         unique_values = np.unique(average_loudness)
91 |         possible_cutoffs = (unique_values[1:] + unique_values[:-1]) / 2.0
92 |         low_endvalue = np.min(unique_values) - 1.0  #a cutoff below every value
93 |         high_endvalue = np.max(unique_values)
94 | 
95 |         possible_cutoffs = np.append([low_endvalue, high_endvalue], possible_cutoffs)
96 | 
97 |         #check every possible cutoff
98 |         for cutoff in possible_cutoffs:
99 |             predicted_values = np.zeros(len(average_loudness))
100 |             predicted_values[np.where(average_loudness > cutoff)] = 1
101 |             loss = skmet.hamming_loss(training_y, predicted_values)
102 |             if loss < best_loss:
103 |                 best_loss = loss
104 |                 best_cutoff = cutoff
105 | 
106 |         self.volume_cutoff = best_cutoff
107 | 
108 |     def infer(self, test_x, score=False):
109 |         """Make an inference on the test data based on trained model
110 | 
111 |         For instances in test_x where the average volume is greater than
112 |         self.volume_cutoff, class it as 1 for interesting.
113 | 
114 |         Arguments:
115 |         ----------
116 |         test_x -- list[np.ndarray]:
117 |             The volume (in decibels) of the test data.
118 |         score -- bool -- default=False:
119 |             If False, return the classes based on the volume threshold.
120 |             If True, return the non-thresholded average volume of each
121 |             scene.
122 | 
123 |         Return:
124 |         -------
125 |         classified -- np.ndarray:
126 |             The resultant classes for the test data.
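To make the train/infer interface above concrete, here is a minimal usage sketch (not part of this file); the clip paths and labels are placeholders, and `VideoClips` and `compute_decibels` are the loaders exported at the top of the `loki` package:

```python
# Minimal sketch: fit the volume cutoff on labelled clips, then classify new clips.
import numpy as np
import loki

train_clips = loki.VideoClips(["train_good.mp4", "train_bad.mp4"])  # placeholder paths
train_targets = np.array([1, 0])                                    # 1 = interesting, 0 = boring

clf = loki.VolumeClassifier()
clf.train(loki.compute_decibels(train_clips), train_targets)

test_clips = loki.VideoClips(["test.mp4"])                          # placeholder path
print(clf.infer(loki.compute_decibels(test_clips)))                 # array of 0/1 classes
```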
127 |         """
128 |         classified = []
129 |         for audioclip in test_x:
130 |             avg_volume = audioclip.mean()
131 |             if score:
132 |                 classified.append(avg_volume)
133 |             else:
134 |                 if avg_volume > self.volume_cutoff:
135 |                     classified.append(1)
136 |                 else:
137 |                     classified.append(0)
138 | 
139 |         return np.array(classified)
140 | 
141 | 
142 | class VolumeModel():
143 |     """Find the loudest sections in a set of videos
144 | 
145 |     Keyword Arguments:
146 |     ------------------
147 |     search_length -- float -- default=10.0:
148 |         Desired clip size in seconds.
149 | 
150 |     search_increment -- float -- default=1.0:
151 |         Desired shift to apply to search window in seconds.
152 | 
153 |     """
154 | 
155 |     def __init__(self, search_length=10.0, search_increment=1.0):
156 |         self.search_length = search_length
157 |         self.search_increment = search_increment
158 | 
159 |     def predict(self, loudness, freq=44100, n_predict=1):
160 |         """Find the loudest section in the inputted video clips
161 | 
162 |         Take the input loudness generated from a video and search over
163 |         every volume array to determine the clips with the overall
164 |         loudest moments. Return the video index and time index
165 |         corresponding to the overall loudest portion.
166 | 
167 |         Note, the inputted loudness arrays are not expected to be the
168 |         same length, since video lengths vary a great deal. As long as
169 |         every video is at least as long as self.search_length, the
170 |         outputted loudest segments will all have that standard length.
171 | 
172 |         Arguments:
173 |         ----------
174 |         loudness -- list[np.ndarray(float)]:
175 |             List of the total volume over time of multiple video clips.
176 | 
177 |         Keyword Arguments:
178 |         ------------------
179 |         freq -- int -- default=44100
180 |             Frequency in Hz to extract the audio over.
181 |         n_predict -- int -- default=1
182 |             Return the top n_predict non-overlapping scenes.
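A short sketch of how this search is typically driven (the clip path is a placeholder; `compute_decibels` supplies the per-clip loudness arrays):

```python
# Sketch: return the three loudest non-overlapping ~10-second windows.
import loki

clips = loki.VideoClips(["test.mp4"])      # placeholder path
loudness = loki.compute_decibels(clips)    # list of per-clip decibel arrays

model = loki.VolumeModel(search_length=10.0, search_increment=1.0)
scores, scenes = model.predict(loudness, freq=44100, n_predict=3)
# each row of scenes is [clip index, start time (s), end time (s)]
print(scores, scenes)
```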
183 |         """
184 | 
185 |         #Define search windows in array index lengths
186 |         search_window = math.floor(self.search_length * freq)
187 |         search_jump = math.floor(self.search_increment * freq)
188 | 
189 |         #store the loudest section and increment
190 |         all_loudness_scores = np.zeros(0)
191 |         all_scenes = np.zeros((0,3))
192 | 
193 |         #check each audio clip
194 |         for audio_idx, audioclip in enumerate(loudness):
195 |             #check if clip is longer than search window
196 |             if len(audioclip) <= search_window:
197 |                 #if the clip is shorter than the window, use the whole clip
198 |                 avg_loudness = np.sum(audioclip) / float(len(audioclip))
199 |                 clip_increment = np.array([audio_idx, 0, len(audioclip)/freq]).reshape((1,3))
200 |                 #append to lists
201 |                 all_loudness_scores = np.append(all_loudness_scores, avg_loudness)
202 |                 all_scenes = np.append(all_scenes, clip_increment, axis=0)
203 |             else:
204 |                 #otherwise, slide the search window across the clip
205 |                 start_indices = range(0, len(audioclip) - search_window, search_jump)
206 |                 #Increment over every window
207 |                 for start_idx in start_indices:
208 |                     end_idx = start_idx + search_window
209 |                     avg_loudness = np.sum(audioclip[start_idx:end_idx]) / float(search_window)
210 |                     clip_increment = np.array([audio_idx, start_idx/freq, end_idx/freq]).reshape((1,3))
211 |                     #append to lists
212 |                     all_loudness_scores = np.append(all_loudness_scores, avg_loudness)
213 |                     all_scenes = np.append(all_scenes, clip_increment, axis=0)
214 | 
215 |         #return the top scores
216 |         top_scores, top_scenes = sort_scores_and_remove_overlap(n_predict, all_loudness_scores, all_scenes)
217 | 
218 |         return top_scores, top_scenes
219 | 
--------------------------------------------------------------------------------
/loki/functions/helper.py:
--------------------------------------------------------------------------------
1 | """Contains helper functions for loading/training/evaluation"""
2 | import os
3 | import numpy as np
4 | 
5 | from .. import processing
6 | from .. import models
7 | from . import evaluation
8 | 
9 | def load_clips_from_dir(target_dir=None):
10 |     """Make a VideoClips object with all files in a dir
11 | 
12 |     Keyword Arguments:
13 |     ------------------
14 |     target_dir -- str -- default=None:
15 |         The target dir to load the files from.
16 | 
17 |     Return:
18 |     -------
19 |     clips -- loki.VideoClips:
20 |         A VideoClips loader with all the files in target_dir.
21 |     """
22 |     cwd = os.getcwd()
23 |     if target_dir is None:
24 |         #default load from current directory
25 |         target_dir = cwd
26 |     else:
27 |         #make sure you use the full path
28 |         os.chdir(target_dir)
29 |         target_dir = os.getcwd()
30 |         os.chdir(cwd)
31 | 
32 |     #grab every file and alphabetize
33 |     all_files = os.listdir(target_dir)
34 |     all_files.sort()
35 | 
36 |     #append the fullpath to every file
37 |     fullpath_files = []
38 |     for fil in all_files:
39 |         fullpath_files.append(f"{target_dir}/{fil}")
40 | 
41 |     clips = processing.VideoClips(fullpath_files)
42 | 
43 |     return clips
44 | 
45 | def average_over_window(data, n_average):
46 |     """Compute a sliding window average over data
47 | 
48 |     Given a window size in indices, compute the average value of data
49 |     over that window sliding by one index.
50 | 
51 |     Arguments:
52 |     ----------
53 |     data -- np.ndarray:
54 |         1-D array of length N to compute averages over.
55 |     n_average -- int:
56 |         Size of the window.
57 | 
58 |     Return:
59 |     -------
60 |     new_data -- np.ndarray:
61 |         1-D array of length N - n_average + 1. Each index represents the
62 |         average over n_average consecutive elements.
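The windowing described above can be cross-checked against NumPy's convolution; the snippet below is only an illustration of the expected output, not part of the helper module:

```python
# Equivalent windowed average via convolution (illustrative cross-check only).
import numpy as np

data = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
n_average = 2
print(np.convolve(data, np.ones(n_average) / n_average, mode="valid"))  # [0.5 1.5 2.5 3.5]
```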
63 | 
64 |     Example:
65 |     --------
66 |     Given data = [0, 1, 2, 3, 4]
67 |     n_average = 2
68 |     Then the average trace over a window of 2 is:
69 |     [0.5, 1.5, 2.5, 3.5]
70 |     """
71 | 
72 |     new_data = np.array(data[:len(data) - n_average + 1], dtype=float)
73 |     for i in range(1, n_average):
74 |         new_data += data[i:len(data) - n_average + 1 + i]
75 |     new_data /= n_average
76 | 
77 |     return new_data
78 | 
79 | def find_best_clip(video_files, clip_length, nn_checkpoint="nn_model"):
80 |     """Find the best clip in a set of videos of specified duration
81 | 
82 |     Search over every video_file and compute a windowed average of the
83 |     interest level every second. The clip section with the largest
84 |     average interest is then returned, as well as the original
85 |     non-averaged trace of interest level for each inputted video file.
86 | 
87 |     Arguments:
88 |     ----------
89 |     video_files -- list[str]:
90 |         List of N video files to calculate interest levels for.
91 |     clip_length -- float:
92 |         Length of the desired highlight clip in seconds.
93 | 
94 |     Keyword Arguments:
95 |     ------------------
96 |     nn_checkpoint -- str -- default='nn_model':
97 |         Location of the loki.NeuralNetworkClassifier checkpoint file.
98 | 
99 |     Return:
100 |     -------
101 |     results -- dict:
102 |         A dictionary containing the best_clip, x_trace, and y_trace
103 |         entries described below.
104 | 
105 | 
106 |         best_clip -- list:
107 |             Contains the best clip section. The first element is the
108 |             video file containing the best clip. The second and third
109 |             element is the start and stop time respectively.
110 |         x_trace -- list[np.ndarray]:
111 |             List of N arrays giving the times for the center of each
112 |             averaging window.
113 |         y_trace -- list[np.ndarray]:
114 |             List of N arrays giving the average interest level over each
115 |             window.
116 |     """
117 |     #0.96 is the length of time VGGish processes as a single embedding
118 |     clip_size = int(np.ceil(clip_length / 0.96))
119 |     nnclass = models.NeuralNetworkClassifier()
120 |     nnclass.load(nn_checkpoint)
121 |     vclips = processing.VideoClips(video_files)
122 |     big_audio = vclips.compute_audio_waveform()
123 | 
124 |     x_trace, y_trace = nnclass.get_trace(big_audio)
125 | 
126 |     #save the average interest level over each clip in a windowed avg
127 |     x_avg = []
128 |     y_avg = []
129 |     for x,y in zip(x_trace,y_trace):
130 |         x_avg.append(average_over_window(x, clip_size))
131 |         y_avg.append(average_over_window(y, clip_size))
132 | 
133 |     #find the most interesting segment
134 |     best_interest = 0
135 |     best_time = 0
136 |     best_file = None
137 |     for i,(x,y) in enumerate(zip(x_avg, y_avg)):
138 |         if np.max(y) > best_interest:
139 |             #found a better clip
140 |             best_interest = np.max(y)
141 |             #find time of peak interest. If multiple, keep only first
142 |             best_time = x[np.where(y == best_interest)][0]
143 |             best_file = video_files[i]
144 | 
145 |     half_clip = clip_length * 0.5
146 |     best_clip = [best_file, best_time - half_clip, best_time + half_clip]
147 | 
148 |     return {"best_clip":best_clip, "x_trace":x_trace, "y_trace":y_trace}
149 | 
150 | def train_classifier(train_clips, train_targets, test_clips=None, test_targets=None, classifier="nn", n_epochs=100, batch_size=None, class_weights=None):
151 |     """Get a trained classifier for audio data
152 | 
153 |     Return a trained classifier. If test_clips and test_targets are
154 |     given, then also compute and print out validation statistics
155 |     consisting of a confusion matrix, accuracy, precision and recall.
156 |     Default is to train a NeuralNetworkClassifier.
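Taken together, the helpers above form a small pipeline. The following is a hedged end-to-end sketch: the directory, file names, and labels are placeholders, and the 'nn' path also expects the SOUNDEMBEDDINGS environment variable to point at the VGGish checkpoint, as used in loki/models/neural_networks.py.

```python
# Illustrative pipeline: load clips, train a classifier, pull the best highlight.
import numpy as np
from loki.functions import helper

train_clips = helper.load_clips_from_dir("training_videos")   # placeholder directory
train_targets = np.array([1, 0, 1, 0])                        # placeholder: one label per clip

clf = helper.train_classifier(train_clips, train_targets, classifier="nn",
                              n_epochs=100, batch_size=32)
clf.save("nn_model")

result = helper.find_best_clip(["test.mp4"], clip_length=10.0, nn_checkpoint="nn_model")
print(result["best_clip"])   # [video file, start time (s), end time (s)]
```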
157 | 
158 |     Arguments:
159 |     ----------
160 |     train_clips -- loki.VideoClips:
161 |         The loaded video clips to use for training.
162 |     train_targets -- np.ndarray:
163 |         An array with the same number of elements as train_clips
164 |         classifying each clip as either interesting (1) or boring (0).
165 | 
166 |     Keyword Arguments:
167 |     ------------------
168 |     test_clips -- loki.VideoClips -- default=None:
169 |         The loaded video clips used for validation.
170 |     test_targets -- np.ndarray -- default=None:
171 |         An array with the same number of elements as test_clips
172 |         classifying each clip as either interesting (1) or boring (0).
173 |     classifier -- str -- default='nn':
174 |         Type of classifier to train. `nn` returns a
175 |         loki.NeuralNetworkClassifier while 'volume' returns a
176 |         loki.VolumeClassifier.
177 |     n_epochs -- int -- default=100:
178 |         Number of training epochs to run. Only used for classifier=`nn`.
179 |     batch_size -- int -- default=all:
180 |         Batch size of each training epoch. Default is all training
181 |         data at each epoch. Only used for classifier=`nn`.
182 |     class_weights -- np.ndarray -- default=None:
183 |         Relative weight of each class. This weight affects the
184 |         probability of picking each class when selecting the batch. Only
185 |         used for classifier=`nn`.
186 | 
187 |     Return:
188 |     -------
189 |     clf -- loki.NeuralNetworkClassifier or loki.VolumeClassifier:
190 |         A trained classifier.
191 |     """
192 |     if classifier == "volume":
193 |         clf = _train_volume_classifier(train_clips, train_targets)
194 |     elif classifier == "nn":
195 |         clf = _train_nn_classifier(train_clips, train_targets, n_epochs=n_epochs, class_weights=class_weights, batch_size=batch_size)
196 |     else:
197 |         raise ValueError("Invalid classifier specified. Keyword argument classifier must be either 'volume' or 'nn'.")
198 | 
199 |     if test_clips is not None and test_targets is not None:
200 |         #compute validation if test_clips is given
201 |         if classifier == "volume":
202 |             results = clf.infer(processing.compute_decibels(test_clips))
203 |         elif classifier == "nn":
204 |             results = clf.infer(test_clips.compute_audio_waveform(mono=True))
205 |         else:
206 |             raise ValueError("Invalid classifier specified. Keyword argument classifier must be either 'volume' or 'nn'.")
207 |         evaluation.print_confusion_matrix(test_targets, results)
208 | 
209 |     return clf
210 | 
211 | def _train_volume_classifier(train_clips, train_targets):
212 |     """Get a trained volume classifier
213 | 
214 |     Arguments:
215 |     ----------
216 |     See loki.functions.helper.train_classifier().
217 | 
218 |     Return:
219 |     -------
220 |     vclassifier -- loki.VolumeClassifier:
221 |         A trained volume classifier.
222 | 
223 |     """
224 |     vclassifier = models.VolumeClassifier()
225 | 
226 |     #extract the audio data
227 |     audio_data = processing.compute_decibels(train_clips)
228 |     #train the volume classifier
229 |     vclassifier.train(audio_data, train_targets)
230 | 
231 |     return vclassifier
232 | 
233 | def _train_nn_classifier(train_clips, train_targets, n_epochs=100, batch_size=None, class_weights=None):
234 |     """Get a trained neural network classifier
235 | 
236 |     Arguments:
237 |     ----------
238 |     See loki.functions.helper.train_classifier().
239 | 
240 |     Return:
241 |     -------
242 |     nclassifier -- loki.NeuralNetworkClassifier:
243 |         A trained neural network classifier.
244 |     """
245 |     nclassifier = models.NeuralNetworkClassifier()
246 | 
247 |     raw_audio = train_clips.compute_audio_waveform(mono=True)
248 |     nclassifier.train(raw_audio, train_targets, n_epochs=n_epochs, class_weights=class_weights, batch_size=batch_size)
249 | 
250 |     return nclassifier
251 | 
--------------------------------------------------------------------------------
/loki/models/neural_networks.py:
--------------------------------------------------------------------------------
1 | """This package contains methods for using neural networks"""
2 | import numpy as np
3 | import os
4 | import tensorflow as tf
5 | from .vggish_tensorflow import CreateVGGishNetwork, EmbeddingsFromVGGish
6 | import torch
7 | from torch.autograd import Variable
8 | import torch.optim as optim
9 | import torch.nn as nn
10 | import torch.nn.functional as F
11 | 
12 | def get_embeddings(x_list, sr):
13 |     """Get the sound embeddings from vgg-ish
14 | 
15 |     Use the pre-trained vggish network from TensorFlow in order to
16 |     extract embeddings from an audio clip. The vggish network does all
17 |     the preprocessing necessary on a raw audio input.
18 | 
19 |     Each x can be of arbitrary length, but the VGGish network was
20 |     trained on 0.96 second clips. As a result, the dimensions of each
21 |     output are going to be M x 128, where M = floor(time(x) / 0.96),
22 |     i.e. a 10 second clip produces a 10 x 128 output.
23 | 
24 |     Arguments:
25 |     ----------
26 |     x_list -- list[numpy.ndarray]:
27 |         List of traces of the sound wave (mono-channel)
28 |     sr -- int:
29 |         The sampling rate for the audio clip in Hz.
30 |     """
31 |     checkpoint_path = os.environ["SOUNDEMBEDDINGS"]
32 | 
33 |     all_embeddings = []
34 |     tf.reset_default_graph()
35 |     sess = tf.Session()
36 | 
37 |     vgg = CreateVGGishNetwork(sess, checkpoint_path)
38 | 
39 |     for x in x_list:
40 |         resdict = EmbeddingsFromVGGish(sess, vgg, x, sr)
41 |         all_embeddings.append(resdict['embedding'])
42 | 
43 |     sess.close()
44 | 
45 |     return all_embeddings
46 | 
47 | class SimpleNetwork(nn.Module):
48 |     """A pytorch implementation of a final classification layer
49 | 
50 |     This is a simple model where a single linear unit is added after the
51 |     embeddings layer from tensorflow. A sigmoid follows to infer the
52 |     binary class. The embeddings from VGGish are a 128-Dimensional
53 |     vector.
54 |     """
55 |     def __init__(self):
56 |         super(SimpleNetwork, self).__init__()
57 |         self.fc1 = nn.Linear(128,1)
58 |         self.fc2 = nn.Sigmoid()
59 | 
60 |     def forward(self, x):
61 |         y = self.fc1(x)
62 |         y = self.fc2(y)
63 |         return y
64 | 
65 | def stack_embeddings_and_targets(embeddings, targets=None):
66 |     """Stack multiple embeddings along the zeroth axis
67 | 
68 |     Arguments:
69 |     ----------
70 |     embeddings -- list(np.array):
71 |         A list of length L, with n_l x 128 dimensional embeddings.
72 | 
73 |     Keyword Arguments:
74 |     ------------------
75 |     targets -- np.ndarray -- default=None:
76 |         An array of target values. If given, each target is repeated
77 |         n_l times so it stays aligned with the stacked embeddings.
78 | 
79 |     Return:
80 |     -------
81 |     x -- np.ndarray:
82 |         A Nx128 dimensional array where N = SUM_l(n_l)
83 |     y -- np.ndarray:
84 |         A N-length array representing the stacked targets.
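A tiny worked example of the stacking described above, with assumed shapes (two clips of 3 and 5 one-second frames); it is illustrative only and not part of the module:

```python
# Two embeddings of shape (3, 128) and (5, 128) stack into (8, 128).
import numpy as np
from loki.models.neural_networks import stack_embeddings_and_targets

embeddings = [np.zeros((3, 128)), np.ones((5, 128))]
targets = np.array([0, 1])

x, y = stack_embeddings_and_targets(embeddings, targets=targets)
print(x.shape)   # (8, 128)
print(y)         # [0 0 0 1 1 1 1 1]
```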
85 |     """
86 |     x = np.zeros((0,128))
87 |     y = []
88 |     if targets is None:
89 |         targets = np.zeros(len(embeddings))
90 |     for tar,embed in zip(targets, embeddings):
91 |         n_frames = np.shape(embed)[0]
92 |         x = np.append(x, embed, axis=0)
93 |         for i in range(n_frames):
94 |             y.append(tar)
95 |     y = np.array(y)
96 | 
97 |     return x,y
98 | 
99 | class NeuralNetworkClassifier():
100 |     """Initialize a NN for learning on sound embeddings
101 | 
102 |     When training on clips longer than one second, their embeddings are
103 |     stacked such that you have an N x 128 dimensional array, where
104 |     N = SUM_i(clip_time_i)
105 |     for the clip times in seconds. It then trains a single 128-input
106 |     linear unit to classify a scene as interesting or not interesting.
107 | 
108 |     Keyword Arguments (for save()):
109 |     -------------------------------
110 |     save_dir -- str -- default='./nn_model':
111 |         File path to save the model's learned parameters into.
112 |     """
113 | 
114 |     def __init__(self):
115 |         self.model = SimpleNetwork()
116 | 
117 |     def save(self, save_dir="./nn_model"):
118 |         """Save the pytorch model"""
119 |         torch.save(self.model.state_dict(), save_dir)
120 | 
121 |     def load(self, target):
122 |         """Load the pytorch model"""
123 |         self.model.load_state_dict(torch.load(target))
124 | 
125 |     def train(self, training_x, training_y, n_epochs=100, batch_size=None, class_weights=None):
126 |         """Train the neural network
127 | 
128 |         Training is done on a per-second basis, not on whole clips.
129 |         Thus, clips are broken up into their constituent seconds in this
130 |         method. The batch_size is then effectively the number of seconds
131 |         of audio data to train on in each cycle. i.e. 10 clips of 10
132 |         seconds each with a batch_size=20 means you train on 20% of the
133 |         training_data in each epoch.
134 | 
135 |         Arguments:
136 |         ----------
137 |         training_x -- list(np.ndarray):
138 |             List of raw mono-audio traces sampled at 44.1kHz.
139 |         training_y -- np.ndarray:
140 |             Corresponding array of target classes for each audio pattern.
141 | 
142 |         Keyword Arguments:
143 |         ------------------
144 |         n_epochs -- int -- default=100:
145 |             Number of training epochs to run.
146 |         batch_size -- int -- default=all:
147 |             Batch size of each training epoch. Default is all training
148 |             data at each epoch.
149 |         class_weights -- np.ndarray -- default=None:
150 |             Relative weight of each class. This weight affects the
151 |             probability of picking each class when selecting the batch.
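For example, a call that oversamples the rarer interesting class when drawing each batch might look like the sketch below; the waveforms and labels are placeholders, and get_embeddings requires the SOUNDEMBEDDINGS environment variable to point at the VGGish checkpoint.

```python
# Illustrative call: weight class 1 three times as heavily when sampling batches.
import numpy as np
from loki.models import NeuralNetworkClassifier

sr = 44100
training_x = [np.random.uniform(-1, 1, 5 * sr) for _ in range(4)]   # placeholder waveforms
training_y = np.array([0, 0, 0, 1])

clf = NeuralNetworkClassifier()
clf.train(training_x, training_y, n_epochs=200, batch_size=8,
          class_weights=np.array([1.0, 3.0]))
```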
152 |         """
153 |         x_train, y_train = stack_embeddings_and_targets(get_embeddings(training_x, 44100), targets=training_y)
154 | 
155 |         if batch_size is None: #set default batch-size
156 |             batch_size = len(x_train)
157 |         if class_weights is None: #set default class_weights
158 |             class_weights = np.ones(2)
159 | 
160 |         #pmatrix is the probability of selecting each training example
161 |         #pmatrix is based on the class_weights
162 |         pmatrix = np.zeros(len(y_train))
163 |         pmatrix[np.where(y_train == 0)] = class_weights[0]
164 |         pmatrix[np.where(y_train == 1)] = class_weights[1]
165 |         pmatrix /= np.sum(pmatrix)
166 | 
167 |         #all_indices is used for np.random.choice later
168 |         all_indices = np.arange(len(x_train)).astype(int)
169 | 
170 |         #set pytorch optimizer and criterion
171 |         optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.5)
172 |         criterion = nn.MSELoss()
173 |         #Begin the training epochs
174 |         for epoch in range(n_epochs):
175 |             #select random training indices for each batch
176 |             random_indices = np.random.choice(all_indices, size=batch_size, replace=False, p=pmatrix)
177 |             total_loss = 0
178 |             #perform the pytorch training
179 |             for i in random_indices:
180 |                 X = Variable(torch.FloatTensor([x_train[i]]), requires_grad=True)
181 |                 Y = Variable(torch.FloatTensor([y_train[i]]))
182 |                 optimizer.zero_grad()
183 |                 outputs = self.model(X)
184 |                 loss = criterion(outputs, Y)
185 |                 total_loss += loss.item()
186 |                 loss.backward()
187 |                 optimizer.step()
188 | 
189 |             #print out the total loss every 10 epochs
190 |             if epoch % 10 == 0:
191 |                 print(f"Epoch {epoch} Loss: {total_loss}")
192 | 
193 |     def infer(self, test_x, threshold=0.5):
194 |         """Infer the classes on inputted audio waveforms
195 | 
196 |         A clip is interesting if the average interest level over the
197 |         whole clip is greater than the threshold (default 0.5).
198 | 
199 |         Arguments:
200 |         ----------
201 |         test_x -- list[np.ndarray]:
202 |             List of raw audio (mono-channel) waveforms.
203 |         threshold -- float -- default=0.5:
204 |             Threshold value for classifying into either class 1 or 0.
205 |             If None, then return the raw non-thresholded scores.
206 | 
207 |         Return:
208 |         -------
209 |         inferred -- np.ndarray:
210 |             Return the inferred classes.
211 |         """
212 |         embeddings_x = get_embeddings(test_x, 44100)
213 |         inferred = []
214 |         for x in embeddings_x:
215 |             y = self.model(torch.FloatTensor(x))
216 |             y_array = y.detach().numpy()
217 |             avg = y_array.mean()
218 |             if threshold is None:
219 |                 inferred.append(avg)
220 |             else:
221 |                 if avg > threshold: #threshold defaults to 0.5
222 |                     inferred.append(1)
223 |                 else:
224 |                     inferred.append(0)
225 | 
226 |         return np.array(inferred)
227 | 
228 |     def get_trace(self, test_x):
229 |         """Get a trace of the interest level every 0.96 seconds
230 | 
231 |         Inputted audio waveforms are binned to every 0.96 seconds and
232 |         then the interest level is inferred for each bin.
233 | 
234 |         Arguments:
235 |         ----------
236 |         test_x -- list[np.ndarray]:
237 |             List of N raw audio (mono-channel) waveforms.
238 | 
239 |         Return:
240 |         -------
241 |         x_traces -- list[np.ndarray]:
242 |             List of N arrays giving the time at the center of every
243 |             0.96s long bin that the interest score was inferred over.
244 |         traces -- list[np.ndarray]:
245 |             List of N arrays giving the interest level of the
246 |             corresponding time bin.
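As an illustration, the per-bin trace can be used directly to locate the moment of peak interest in a single waveform; the checkpoint path and waveform below are placeholders, and get_embeddings again expects the SOUNDEMBEDDINGS environment variable to be set.

```python
# Illustrative use of get_trace: find the 0.96 s bin with the highest interest.
import numpy as np
from loki.models import NeuralNetworkClassifier

clf = NeuralNetworkClassifier()
clf.load("nn_model")                               # placeholder checkpoint path
waveform = np.random.uniform(-1, 1, 10 * 44100)    # placeholder 10-second mono clip

x_traces, traces = clf.get_trace([waveform])
peak = int(np.argmax(traces[0]))
print(f"peak interest {traces[0][peak]:.2f} at t = {x_traces[0][peak]:.2f} s")
```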
247 | """ 248 | embeddings_x = get_embeddings(test_x, 44100) 249 | traces = [] 250 | x_traces = [] 251 | for x in embeddings_x: 252 | #perform the inference over the whole audio waveform at once 253 | y = self.model(torch.FloatTensor(x)) 254 | y_array = y.detach().numpy() 255 | traces.append(y_array[:,0]) 256 | 257 | #output the time stamps of each data point 258 | #time stamp is given at the center of each "bin" 259 | max_time = len(y_array) * 0.96 260 | time_stamps = np.arange(0.48, max_time, 0.96) 261 | x_traces.append(time_stamps) 262 | 263 | return x_traces, traces 264 | -------------------------------------------------------------------------------- /loki/models/vggish_tensorflow/mel_features.py: -------------------------------------------------------------------------------- 1 | """This file is not my own work and was copied from an open source 2 | repository by TensorFlow, located at: 3 | github.com/tensorflow/models/tree/master/research/audioset/vggish 4 | 5 | Only import statements were changed to work within this package. 6 | """ 7 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # ============================================================================== 21 | 22 | """Defines routines to compute mel spectrogram features from audio waveform.""" 23 | 24 | import numpy as np 25 | 26 | 27 | def frame(data, window_length, hop_length): 28 | """Convert array into a sequence of successive possibly overlapping frames. 29 | 30 | An n-dimensional array of shape (num_samples, ...) is converted into an 31 | (n+1)-D array of shape (num_frames, window_length, ...), where each frame 32 | starts hop_length points after the preceding one. 33 | 34 | This is accomplished using stride_tricks, so the original data is not 35 | copied. However, there is no zero-padding, so any incomplete frames at the 36 | end are not included. 37 | 38 | Args: 39 | data: np.array of dimension N >= 1. 40 | window_length: Number of samples in each frame. 41 | hop_length: Advance (in samples) between each window. 42 | 43 | Returns: 44 | (N+1)-D np.array with as many rows as there are complete frames that can be 45 | extracted. 46 | """ 47 | num_samples = data.shape[0] 48 | num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length)) 49 | shape = (num_frames, window_length) + data.shape[1:] 50 | strides = (data.strides[0] * hop_length,) + data.strides 51 | return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides) 52 | 53 | 54 | def periodic_hann(window_length): 55 | """Calculate a "periodic" Hann window. 56 | 57 | The classic Hann window is defined as a raised cosine that starts and 58 | ends on zero, and where every value appears twice, except the middle 59 | point for an odd-length window. Matlab calls this a "symmetric" window 60 | and np.hanning() returns it. 
However, for Fourier analysis, this 61 | actually represents just over one cycle of a period N-1 cosine, and 62 | thus is not compactly expressed on a length-N Fourier basis. Instead, 63 | it's better to use a raised cosine that ends just before the final 64 | zero value - i.e. a complete cycle of a period-N cosine. Matlab 65 | calls this a "periodic" window. This routine calculates it. 66 | 67 | Args: 68 | window_length: The number of points in the returned window. 69 | 70 | Returns: 71 | A 1D np.array containing the periodic hann window. 72 | """ 73 | return 0.5 - (0.5 * np.cos(2 * np.pi / window_length * 74 | np.arange(window_length))) 75 | 76 | 77 | def stft_magnitude(signal, fft_length, 78 | hop_length=None, 79 | window_length=None): 80 | """Calculate the short-time Fourier transform magnitude. 81 | 82 | Args: 83 | signal: 1D np.array of the input time-domain signal. 84 | fft_length: Size of the FFT to apply. 85 | hop_length: Advance (in samples) between each frame passed to FFT. 86 | window_length: Length of each block of samples to pass to FFT. 87 | 88 | Returns: 89 | 2D np.array where each row contains the magnitudes of the fft_length/2+1 90 | unique values of the FFT for the corresponding frame of input samples. 91 | """ 92 | frames = frame(signal, window_length, hop_length) 93 | # Apply frame window to each frame. We use a periodic Hann (cosine of period 94 | # window_length) instead of the symmetric Hann of np.hanning (period 95 | # window_length-1). 96 | window = periodic_hann(window_length) 97 | windowed_frames = frames * window 98 | return np.abs(np.fft.rfft(windowed_frames, int(fft_length))) 99 | 100 | 101 | # Mel spectrum constants and functions. 102 | _MEL_BREAK_FREQUENCY_HERTZ = 700.0 103 | _MEL_HIGH_FREQUENCY_Q = 1127.0 104 | 105 | 106 | def hertz_to_mel(frequencies_hertz): 107 | """Convert frequencies to mel scale using HTK formula. 108 | 109 | Args: 110 | frequencies_hertz: Scalar or np.array of frequencies in hertz. 111 | 112 | Returns: 113 | Object of same size as frequencies_hertz containing corresponding values 114 | on the mel scale. 115 | """ 116 | return _MEL_HIGH_FREQUENCY_Q * np.log( 117 | 1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ)) 118 | 119 | 120 | def spectrogram_to_mel_matrix(num_mel_bins=20, 121 | num_spectrogram_bins=129, 122 | audio_sample_rate=8000, 123 | lower_edge_hertz=125.0, 124 | upper_edge_hertz=3800.0): 125 | """Return a matrix that can post-multiply spectrogram rows to make mel. 126 | 127 | Returns a np.array matrix A that can be used to post-multiply a matrix S of 128 | spectrogram values (STFT magnitudes) arranged as frames x bins to generate a 129 | "mel spectrogram" M of frames x num_mel_bins. M = S A. 130 | 131 | The classic HTK algorithm exploits the complementarity of adjacent mel bands 132 | to multiply each FFT bin by only one mel weight, then add it, with positive 133 | and negative signs, to the two adjacent mel bands to which that bin 134 | contributes. Here, by expressing this operation as a matrix multiply, we go 135 | from num_fft multiplies per frame (plus around 2*num_fft adds) to around 136 | num_fft^2 multiplies and adds. However, because these are all presumably 137 | accomplished in a single call to np.dot(), it's not clear which approach is 138 | faster in Python. The matrix multiplication has the attraction of being more 139 | general and flexible, and much easier to read. 140 | 141 | Args: 142 | num_mel_bins: How many bands in the resulting mel spectrum. 
This is 143 | the number of columns in the output matrix. 144 | num_spectrogram_bins: How many bins there are in the source spectrogram 145 | data, which is understood to be fft_size/2 + 1, i.e. the spectrogram 146 | only contains the nonredundant FFT bins. 147 | audio_sample_rate: Samples per second of the audio at the input to the 148 | spectrogram. We need this to figure out the actual frequencies for 149 | each spectrogram bin, which dictates how they are mapped into mel. 150 | lower_edge_hertz: Lower bound on the frequencies to be included in the mel 151 | spectrum. This corresponds to the lower edge of the lowest triangular 152 | band. 153 | upper_edge_hertz: The desired top edge of the highest frequency band. 154 | 155 | Returns: 156 | An np.array with shape (num_spectrogram_bins, num_mel_bins). 157 | 158 | Raises: 159 | ValueError: if frequency edges are incorrectly ordered or out of range. 160 | """ 161 | nyquist_hertz = audio_sample_rate / 2. 162 | if lower_edge_hertz < 0.0: 163 | raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz) 164 | if lower_edge_hertz >= upper_edge_hertz: 165 | raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" % 166 | (lower_edge_hertz, upper_edge_hertz)) 167 | if upper_edge_hertz > nyquist_hertz: 168 | raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" % 169 | (upper_edge_hertz, nyquist_hertz)) 170 | spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins) 171 | spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz) 172 | # The i'th mel band (starting from i=1) has center frequency 173 | # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge 174 | # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in 175 | # the band_edges_mel arrays. 176 | band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz), 177 | hertz_to_mel(upper_edge_hertz), num_mel_bins + 2) 178 | # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins 179 | # of spectrogram values. 180 | mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins)) 181 | for i in range(num_mel_bins): 182 | lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3] 183 | # Calculate lower and upper slopes for every spectrogram bin. 184 | # Line segments are linear in the *mel* domain, not hertz. 185 | lower_slope = ((spectrogram_bins_mel - lower_edge_mel) / 186 | (center_mel - lower_edge_mel)) 187 | upper_slope = ((upper_edge_mel - spectrogram_bins_mel) / 188 | (upper_edge_mel - center_mel)) 189 | # .. then intersect them with each other and zero. 190 | mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope, 191 | upper_slope)) 192 | # HTK excludes the spectrogram DC bin; make sure it always gets a zero 193 | # coefficient. 194 | mel_weights_matrix[0, :] = 0.0 195 | return mel_weights_matrix 196 | 197 | 198 | def log_mel_spectrogram(data, 199 | audio_sample_rate=8000, 200 | log_offset=0.0, 201 | window_length_secs=0.025, 202 | hop_length_secs=0.010, 203 | **kwargs): 204 | """Convert waveform to a log magnitude mel-frequency spectrogram. 205 | 206 | Args: 207 | data: 1D np.array of waveform data. 208 | audio_sample_rate: The sampling rate of data. 209 | log_offset: Add this to values when taking log to avoid -Infs. 210 | window_length_secs: Duration of each window to analyze. 211 | hop_length_secs: Advance between successive analysis windows. 212 | **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix. 
213 | 214 | Returns: 215 | 2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank 216 | magnitudes for successive frames. 217 | """ 218 | window_length_samples = int(round(audio_sample_rate * window_length_secs)) 219 | hop_length_samples = int(round(audio_sample_rate * hop_length_secs)) 220 | fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0))) 221 | spectrogram = stft_magnitude( 222 | data, 223 | fft_length=fft_length, 224 | hop_length=hop_length_samples, 225 | window_length=window_length_samples) 226 | mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix( 227 | num_spectrogram_bins=spectrogram.shape[1], 228 | audio_sample_rate=audio_sample_rate, **kwargs)) 229 | return np.log(mel_spectrogram + log_offset) 230 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------
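Usage note (editorial sketch, not a file in the repository): the tail of `loki/models/vggish_tensorflow/mel_features.py` shown above turns an STFT magnitude spectrogram into log mel filterbank features — frame the waveform, take STFT magnitudes, project onto a mel filterbank via `spectrogram_to_mel_matrix`, then take the log with a small offset. The snippet below shows one plausible way to call that routine on a synthetic waveform. It assumes the upstream VGGish name `log_mel_spectrogram` for the function whose body is shown (the `def` line is not reproduced in this dump), assumes the keyword names `num_mel_bands`, `lower_edge_hertz`, and `upper_edge_hertz` forwarded through `**kwargs` to `spectrogram_to_mel_matrix`, and uses typical VGGish-style parameter values; treat it as an illustration under those assumptions rather than project code.

# Editorial sketch -- not part of the repository. Assumes the upstream VGGish
# function name `log_mel_spectrogram` and its `spectrogram_to_mel_matrix`
# keyword names, and that the loki package (plus the TensorFlow dependency its
# package __init__ pulls in) is importable.
import numpy as np

from loki.models.vggish_tensorflow import mel_features

sample_rate = 16000                              # 16 kHz mono audio, VGGish-style
t = np.arange(sample_rate) / sample_rate         # one second of sample times
waveform = 0.5 * np.sin(2 * np.pi * 440.0 * t)   # synthetic 440 Hz tone

log_mel = mel_features.log_mel_spectrogram(
    waveform,
    audio_sample_rate=sample_rate,
    log_offset=0.01,             # added before the log to avoid log(0) on silent frames
    window_length_secs=0.025,    # 25 ms analysis window
    hop_length_secs=0.010,       # 10 ms hop between successive frames
    num_mel_bands=64,            # forwarded via **kwargs to spectrogram_to_mel_matrix
    lower_edge_hertz=125.0,
    upper_edge_hertz=7500.0,
)

# Result is a 2D array of shape (num_frames, num_mel_bands), matching the
# docstring of the function shown above.
print(log_mel.shape)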