├── utils ├── __init__.py ├── gen_utils.py ├── cnn_utils.py └── ffnn_utils.py ├── example.jpg ├── flac2wav.sh ├── .gitignore ├── prep_cnn_data.sh ├── prep_ffnn_data.sh ├── ffnn ├── dataset.py ├── librispeech.py ├── model.py ├── eval.py └── train.py ├── cnn ├── model.py ├── eval.py └── train.py ├── main.py ├── requirements.txt └── README.md /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hernanrazo/human-voice-detection/HEAD/example.jpg -------------------------------------------------------------------------------- /flac2wav.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Converting .flac files to .wav" 4 | python -c 'from utils.gen_utils import flac2wav; flac2wav()' 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.swp 3 | *.swo 4 | *.pyc 5 | .DS_Store 6 | __pycache__/ 7 | haters/ 8 | 9 | data/ 10 | saved_models/ 11 | temp/ 12 | -------------------------------------------------------------------------------- /prep_cnn_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Creating data splits and melss .jpg files for all .wav files ..." 4 | python -c 'from utils.cnn_utils import prepare_dataset; prepare_dataset()' 5 | echo "done" 6 | -------------------------------------------------------------------------------- /prep_ffnn_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Creating data splits, annotations, and transforms for all .wav files ..." 4 | python -c 'from utils.ffnn_utils import prepare_dataset; prepare_dataset()' 5 | echo "done" 6 | -------------------------------------------------------------------------------- /ffnn/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | class FFNN_dataset(Dataset): 5 | 6 | def __init__(self, x, y): 7 | self.x = x 8 | self.y = y 9 | 10 | def __getitem__(self, index): 11 | return self.x[index], self.y[index] 12 | 13 | def __len__(self): 14 | return len(self.x) 15 | -------------------------------------------------------------------------------- /ffnn/librispeech.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import shutil 4 | ''' 5 | Moves all .flac files from the Librispeech dataset from its 6 | original file structure to one giant subdir. This ignores 7 | the other .txt files and labels that come with the original 8 | dataset. 9 | 10 | Ignore this if you are not using the LibriSpeech dataset or 11 | if you end up using the original file structure. 
12 | ''' 13 | 14 | def main(): 15 | 16 | root = str(os.getcwd()) + '/data/LibriSpeech/train-clean-100/' 17 | dest = str(os.getcwd()) + '/data/voice/flac/' 18 | 19 | for subdirs, dirs, files in os.walk(root): 20 | for file in files: 21 | if file.endswith('.flac'): 22 | print('Moving ' + str(file)) 23 | shutil.move(str(subdirs + os.sep + file), dest) 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /ffnn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # feed forward neural network 6 | class FFNN(nn.Module): 7 | def __init__(self): 8 | super(FFNN, self).__init__() 9 | self.l1 = nn.Linear(194, 1024) 10 | self.r1 = nn.ReLU() 11 | self.d1 = nn.Dropout(0.2) 12 | 13 | self.l2 = nn.Linear(1024, 512) 14 | self.r2 = nn.ReLU() 15 | self.d2 = nn.Dropout(0.2) 16 | 17 | self.l3 = nn.Linear(512, 128) 18 | self.r3 = nn.ReLU() 19 | self.d3 = nn.Dropout(0.2) 20 | 21 | self.l4 = nn.Linear(128, 2) 22 | self.out = nn.Sigmoid() 23 | 24 | 25 | def forward(self, x): 26 | l1 = self.l1(x) 27 | r1 = self.r1(l1) 28 | d1 = self.d1(r1) 29 | 30 | l2 = self.l2(d1) 31 | r2 = self.r2(l2) 32 | d2 = self.d2(r2) 33 | 34 | l3 = self.l3(d2) 35 | r3 = self.r3(l3) 36 | d3 = self.d3(r3) 37 | 38 | l4 = self.l4(d3) 39 | y = self.out(l4) 40 | return y 41 | -------------------------------------------------------------------------------- /cnn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class CNN(nn.Module): 6 | def __init__(self): 7 | super(CNN, self).__init__() 8 | self.conv1 = nn.Conv2d(3, 32, 5) 9 | self.bn1 = nn.BatchNorm2d(32) 10 | self.d1 = nn.Dropout(0.3) 11 | 12 | self.pool = nn.MaxPool2d(2, 2) 13 | 14 | self.conv2 = nn.Conv2d(32, 64, 5) 15 | self.bn2 = nn.BatchNorm2d(64) 16 | self.d2 = nn.Dropout(0.5) 17 | 18 | self.conv3 = nn.Conv2d(64, 128, 2) 19 | self.bn3 = nn.BatchNorm2d(128) 20 | self.d3 = nn.Dropout(0.3) 21 | 22 | self.fc1 = nn.Linear(128*2*2, 64) 23 | self.d5 = nn.Dropout(0.2) 24 | 25 | self.fc2 = nn.Linear(64, 32) 26 | self.d6 = nn.Dropout(0.2) 27 | 28 | self.fc3 = nn.Linear(32, 16) 29 | 30 | self.fc4 = nn.Linear(16, 2) 31 | 32 | def forward(self, x): 33 | x = self.pool(F.relu(self.conv1(x))) 34 | x = self.bn1(x) 35 | x = self.d1(x) 36 | 37 | x = self.pool(F.relu(self.conv2(x))) 38 | x = self.bn2(x) 39 | x = self.d2(x) 40 | 41 | x = self.pool(F.relu(self.conv3(x))) 42 | x = self.bn3(x) 43 | x = self.d3(x) 44 | 45 | x = x.view(-1, 128*2*2) 46 | 47 | x = F.relu(self.fc1(x)) 48 | x = self.d5(x) 49 | 50 | x = F.relu(self.fc2(x)) 51 | x = self.d6(x) 52 | 53 | x = F.relu(self.fc3(x)) 54 | 55 | x = self.fc4(x) 56 | return x 57 | -------------------------------------------------------------------------------- /cnn/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import warnings 4 | from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 5 | import torch 6 | import torchvision 7 | from torchvision.transforms import transforms 8 | from model import CNN 9 | 10 | warnings.filterwarnings('ignore', category=UserWarning) 11 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 12 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 13 | parser = 
argparse.ArgumentParser() 14 | parser.add_argument('model_path', type=str, help='Path to convolutional neural network you wish to use') 15 | args = parser.parse_args() 16 | 17 | def main(): 18 | 19 | # set parameters 20 | batch_size = 16 21 | workers = 4 22 | nb_classes = 2 23 | pred_list = torch.zeros(0, dtype=torch.long, device='cpu') 24 | ground_truth = torch.zeros(0, dtype=torch.long, device='cpu') 25 | 26 | # get model 27 | model_path = str(root_dir + '/saved_models/' + args.model_path) 28 | model = CNN() 29 | model.load_state_dict(torch.load(model_path), strict=False) 30 | model.to(device) 31 | model.eval() 32 | 33 | # set basic transforms. Spectrograms have to look a certain way so rotations, flips, and other 34 | # transforms do not make sense in this application 35 | transform = {'test' : transforms.Compose([transforms.Resize([32, 32]), transforms.ToTensor()])} 36 | 37 | # get testing dataset 38 | test_data = torchvision.datasets.ImageFolder(root_dir + '/data/plots/test/', 39 | transform=transform['test']) 40 | 41 | test_loader = torch.utils.data.DataLoader(dataset=test_data, 42 | batch_size=batch_size, 43 | shuffle=True) 44 | # start testing 45 | with torch.no_grad(): 46 | for i, (x, y) in enumerate(test_loader): 47 | x, y = x.to(device), y.to(device) 48 | outputs = model(x) 49 | _, preds = torch.max(outputs.data, 1) 50 | 51 | pred_list = torch.cat([pred_list, preds.view(-1).cpu()]) 52 | ground_truth = torch.cat([ground_truth, y.view(-1).cpu()]) 53 | 54 | # accuracy score 55 | print('\nAccuracy Score:') 56 | print(accuracy_score(ground_truth.numpy(), pred_list.numpy())) 57 | # confusion matrix 58 | print('\nConfusion Matrix:') 59 | conf_mat = confusion_matrix(ground_truth.numpy(), pred_list.numpy()) 60 | print(conf_mat) 61 | 62 | # per-class accuracy 63 | print('\nPer-Class Accuracy:') 64 | print(100 * conf_mat.diagonal() / conf_mat.sum(1)) 65 | 66 | # classification report 67 | print('\nClassification Report:') 68 | print(classification_report(ground_truth.numpy(), pred_list.numpy())) 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /utils/gen_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import librosa 5 | import torch 6 | from pydub import AudioSegment 7 | 8 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 9 | 10 | 11 | def create_dir(dir: str) -> None: 12 | if os.path.isdir(dir): 13 | print(dir, 'already exists. Continuing ...' 
) 14 | else: 15 | print('Creating new dir: ', dir) 16 | os.makedirs(dir) 17 | 18 | # convert .flac files to .wav files in the voice data folder 19 | def flac2wav() -> None: 20 | flac_path = str(root_dir + '/voice_detect/data/voice/flac/') 21 | wav_path = str(root_dir + '/voice_detect/data/voice/wav/') 22 | flac_files = [f for f in os.listdir(flac_path) if os.path.isfile(os.path.join(flac_path, f)) and f.endswith('.flac')] 23 | 24 | for file in flac_files: 25 | print('Converting ' + str(file)) 26 | temp = AudioSegment.from_file(str(flac_path + file)) 27 | temp.export(str(wav_path + os.path.splitext(file)[0]) + '.wav', format='wav') 28 | print('Done converting \n') 29 | 30 | 31 | # create the training and testing split lists for both classes 32 | def create_splits(voice_path: str, not_voice_path: str) -> tuple: 33 | voice_wavs = str(root_dir + '/voice_detect/data/voice/wav/') 34 | not_voice_wavs = str(root_dir + '/voice_detect/data/not_voice/wav/') 35 | 36 | # get the total number of files in both dirs and split them into training and testing sets 37 | # with an 80/20 ratio; each test split is the complement of its train split so the two never overlap 38 | voice_list = [voice_wavs + name for name in os.listdir(voice_wavs)] 39 | voice_total = len(voice_list) 40 | voice_train_split = round(voice_total * 0.8) 41 | voice_test_split = voice_total - voice_train_split 42 | 43 | assert voice_train_split + voice_test_split == voice_total 44 | 45 | voice_train_list = random.sample(voice_list, voice_train_split) 46 | voice_test_list = [f for f in voice_list if f not in voice_train_list] 47 | 48 | not_voice_list = [not_voice_wavs + name for name in os.listdir(not_voice_wavs)] 49 | not_voice_total = len(not_voice_list) 50 | not_voice_train_split = round(not_voice_total * 0.8) 51 | not_voice_test_split = not_voice_total - not_voice_train_split 52 | 53 | assert not_voice_train_split + not_voice_test_split == not_voice_total 54 | 55 | not_voice_train_list = random.sample(not_voice_list, not_voice_train_split) 56 | not_voice_test_list = [f for f in not_voice_list if f not in not_voice_train_list] 57 | 58 | # concatenate into two complete lists 59 | full_train_list = voice_train_list + not_voice_train_list 60 | full_test_list = voice_test_list + not_voice_test_list 61 | 62 | return full_train_list, full_test_list 63 | 64 | 65 | # calculate batch accuracy; labels can be class indices or one-hot rows 66 | def get_accuracy(prediction: torch.Tensor, label: torch.Tensor) -> float: 67 | pred_classes = torch.argmax(prediction, dim=1) 68 | true_classes = label if label.dim() == 1 else torch.argmax(label, dim=1) 69 | return (pred_classes == true_classes).float().mean().item() 70 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import warnings 4 | import argparse 5 | import librosa 6 | import torch 7 | import torchvision 8 | from PIL import Image 9 | from ffnn.model import FFNN 10 | from cnn.model import CNN 11 | from utils.gen_utils import create_dir 12 | from utils.ffnn_utils import apply_transforms, transforms_to_tensor 13 | from utils.cnn_utils import get_melss 14 | 15 | warnings.filterwarnings('ignore', category=UserWarning) 16 | 17 | def main(): 18 | 19 | # set device 20 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 21 | 22 | # other 23 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 24 | to_tensor = torchvision.transforms.ToTensor() 25 | 26 | # get model path arguments 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('ffnn_path', type=str, help='Path to feed forward neural network')
29 | parser.add_argument('cnn_path', type=str, help='Path to convolutional neural network') 30 | args = parser.parse_args() 31 | 32 | # get models in eval mode 33 | ffnn = FFNN() 34 | ffnn_path = os.path.abspath('saved_models/' + args.ffnn_path) 35 | ffnn.load_state_dict(torch.load(ffnn_path), strict=False) 36 | ffnn = ffnn.to(device) 37 | ffnn.eval() 38 | 39 | cnn = CNN() 40 | cnn_path = os.path.abspath('saved_models/' + args.cnn_path) 41 | cnn.load_state_dict(torch.load(cnn_path), strict=False) 42 | cnn = cnn.to(device) 43 | cnn.eval() 44 | 45 | # create temp dir to save melss image for current inference 46 | create_dir('temp') 47 | 48 | # get transforms and spectrogram image 49 | transforms = apply_transforms('data/test/voice3.wav') 50 | melss = get_melss('data/test/voice3.wav', 'temp/test.jpg') 51 | 52 | # convert transforms dict to tensor and 53 | # apply transforms to melss image 54 | transforms = transforms_to_tensor(transforms) 55 | melss = Image.open('temp/test.jpg') 56 | melss = melss.resize((32, 32)) 57 | melss = to_tensor(melss) 58 | melss = melss.to(device) 59 | 60 | # make predictions 61 | ffnn_pred = ffnn(transforms) 62 | cnn_pred = cnn(melss.unsqueeze(0)) 63 | 64 | # if both models agree that the audio is a voice, return voice 65 | # else, return not_voice 66 | if ffnn_pred[1] > 0.85 and cnn_pred[0][1] > 0.85: 67 | print(ffnn_pred) 68 | print(cnn_pred) 69 | print('\nvoice\n') 70 | else: 71 | print(ffnn_pred) 72 | print(cnn_pred) 73 | print('\nnot_voice\n') 74 | 75 | # delete temp dir after completion 76 | if os.path.isdir(root_dir + '/voice_detect/temp'): 77 | print('deleting temp dir ...\n') 78 | os.remove(root_dir + '/voice_detect/temp/test.jpg') 79 | shutil.rmtree(root_dir + '/voice_detect/temp') 80 | else: 81 | print('temp dir does not exist...\n') 82 | 83 | print('Inference complete ...') 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /ffnn/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import warnings 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 8 | from model import FFNN 9 | from dataset import FFNN_dataset 10 | 11 | warnings.filterwarnings('ignore', category=UserWarning) 12 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('model_path', type=str, help='Path to feed forward network you wish to use') 15 | args = parser.parse_args() 16 | 17 | def main(): 18 | 19 | # set parameters 20 | batch_size = 32 21 | nb_classes = 2 22 | pred_list = torch.zeros(0, dtype=torch.long, device='cpu') 23 | ground_truth = torch.zeros(0, dtype=torch.long, device='cpu') 24 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(),os.pardir))) 25 | 26 | # get model 27 | model_path = str(root_dir + '/saved_models/' + args.model_path) 28 | model = FFNN() 29 | model.load_state_dict(torch.load(model_path), strict=False) 30 | model.to(device) 31 | model.eval() 32 | 33 | # get testing dataset 34 | data_path = str(root_dir + '/data/annotations/test.csv') 35 | df = pd.read_csv(data_path, header=None) 36 | 37 | # encode labels 38 | encode = {'voice' : 1, 'not_voice' : 0} 39 | df.iloc[:, 195].replace(encode, inplace=True) 40 | 41 | # remove file names 42 | df.drop(df.columns[0], axis=1, inplace=True) 43 | 44 | # seperate data and labels 45 | x = df.iloc[:, 
0:-1] 46 | y = df.iloc[:, -1:] 47 | 48 | # get testing dataset 49 | test_data = FFNN_dataset(torch.FloatTensor(x.values), torch.FloatTensor(y.values)) 50 | test_loader = torch.utils.data.DataLoader(dataset=test_data, 51 | batch_size=batch_size, 52 | shuffle=True) 53 | # start testing 54 | with torch.no_grad(): 55 | for i, (x, y) in enumerate(test_loader): 56 | x, y = x.to(device), y.to(device) 57 | outputs = model(x) 58 | _, preds = torch.max(outputs.data, 1) 59 | 60 | pred_list = torch.cat([pred_list, preds.view(-1).cpu()]) 61 | ground_truth = torch.cat([ground_truth, y.view(-1).cpu()]) 62 | 63 | # accuracy report 64 | print('\nAccuracy Score:') 65 | print(accuracy_score(ground_truth.numpy(), pred_list.numpy())) 66 | 67 | # confusion matrix 68 | print('\nConfusion Matrix:') 69 | conf_mat = confusion_matrix(ground_truth.numpy(), pred_list.numpy()) 70 | print(conf_mat) 71 | 72 | # per-class accuracy 73 | print('\nPer-Class Accuracy:') 74 | print(100 * conf_mat.diagonal() / conf_mat.sum(1)) 75 | 76 | # classification report 77 | print('\nClassification Report:') 78 | print(classification_report(ground_truth.numpy(), pred_list.numpy())) 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | _libgcc_mutex=0.1=main 5 | _openmp_mutex=4.5=1_gnu 6 | appdirs=1.4.4=pyh9f0ad1d_0 7 | audioread=2.1.9=py38h578d9bd_0 8 | blas=1.0=mkl 9 | brotlipy=0.7.0=py38h497a2fe_1001 10 | bzip2=1.0.8=h7f98852_4 11 | ca-certificates=2020.10.14=0 12 | certifi=2020.6.20=py38_0 13 | cffi=1.14.6=py38ha65f79e_0 14 | chardet=4.0.0=py38h578d9bd_1 15 | charset-normalizer=2.0.0=pyhd8ed1ab_0 16 | cryptography=3.4.7=py38ha5dfef3_0 17 | cudatoolkit=10.2.89=hfd86e86_1 18 | cycler=0.10.0=py_2 19 | decorator=5.0.9=pyhd8ed1ab_0 20 | ffmpeg=4.3.1=hca11adc_2 21 | freetype=2.10.4=h0708190_1 22 | gettext=0.19.8.1=h0b5b191_1005 23 | gmp=6.2.1=h2531618_2 24 | gnutls=3.6.15=he1e5248_0 25 | idna=3.1=pyhd3deb0d_0 26 | intel-openmp=2021.3.0=h06a4308_3350 27 | joblib=1.0.1=pyhd8ed1ab_0 28 | jpeg=9b=h024ee3a_2 29 | kiwisolver=1.3.1=py38h1fd1430_1 30 | lame=3.100=h7f98852_1001 31 | lcms2=2.12=h3be6417_0 32 | ld_impl_linux-64=2.35.1=h7274673_9 33 | libffi=3.3=he6710b0_2 34 | libflac=1.3.3=h9c3ff4c_1 35 | libgcc-ng=9.3.0=h5101ec6_17 36 | libgfortran-ng=7.5.0=h14aa051_19 37 | libgfortran4=7.5.0=h14aa051_19 38 | libgomp=9.3.0=h5101ec6_17 39 | libidn2=2.3.2=h7f8727e_0 40 | libllvm10=10.0.1=he513fc3_3 41 | libogg=1.3.4=h7f98852_1 42 | libopus=1.3.1=h7f98852_1 43 | libpng=1.6.37=h21135ba_2 44 | librosa=0.8.1=pyhd8ed1ab_0 45 | libsndfile=1.0.31=h9c3ff4c_1 46 | libstdcxx-ng=9.3.0=hd4cf53a_17 47 | libtasn1=4.16.0=h27cfd23_0 48 | libtiff=4.2.0=h85742a9_0 49 | libunistring=0.9.10=h27cfd23_0 50 | libuv=1.40.0=h7b6447c_0 51 | libvorbis=1.3.7=h9c3ff4c_0 52 | libwebp-base=1.2.0=h27cfd23_0 53 | llvmlite=0.36.0=py38h4630a5e_0 54 | lz4-c=1.9.3=h2531618_0 55 | matplotlib-base=3.4.2=py38hcc49a3a_0 56 | mkl=2021.3.0=h06a4308_520 57 | mkl-service=2.4.0=py38h7f8727e_0 58 | mkl_fft=1.3.0=py38h42c9631_2 59 | mkl_random=1.2.2=py38h51133e4_0 60 | ncurses=6.2=he6710b0_1 61 | nettle=3.7.3=hbbd107a_1 62 | ninja=1.10.2=hff7bd54_1 63 | numba=0.53.1=py38h8b71fd7_1 64 | numpy=1.20.3=py38hf144106_0 65 | numpy-base=1.20.3=py38h74d4b33_0 66 | olefile=0.46=py_0 67 | 
openh264=2.1.1=h780b84a_0 68 | openjpeg=2.3.0=h05c96fa_1 69 | openssl=1.1.1k=h7f98852_0 70 | packaging=21.0=pyhd8ed1ab_0 71 | pandas=1.2.5=py38h295c915_0 72 | pillow=8.3.1=py38h2c7a002_0 73 | pip=21.1.3=py38h06a4308_0 74 | pooch=1.4.0=pyhd8ed1ab_0 75 | pycparser=2.20=pyh9f0ad1d_2 76 | pydub=0.25.1=pyhd8ed1ab_0 77 | pyopenssl=20.0.1=pyhd8ed1ab_0 78 | pyparsing=2.4.7=pyh9f0ad1d_0 79 | pysocks=1.7.1=py38h578d9bd_3 80 | pysoundfile=0.10.3.post1=pyhd3deb0d_0 81 | python=3.8.5=h7579374_1 82 | python-dateutil=2.8.2=pyhd3eb1b0_0 83 | python_abi=3.8=2_cp38 84 | pytorch=1.9.0=py3.8_cuda10.2_cudnn7.6.5_0 85 | pytz=2021.1=pyhd3eb1b0_0 86 | readline=8.1=h27cfd23_0 87 | requests=2.26.0=pyhd8ed1ab_0 88 | resampy=0.2.2=py_0 89 | scikit-learn=0.23.2=py38h0573a6f_0 90 | scipy=1.6.2=py38had2a1c9_1 91 | setuptools=52.0.0=py38h06a4308_0 92 | six=1.16.0=pyhd3eb1b0_0 93 | sqlite=3.36.0=hc218d9a_0 94 | threadpoolctl=2.2.0=pyh8a188c0_0 95 | tk=8.6.10=hbc83047_0 96 | torchaudio=0.9.0=py38 97 | torchvision=0.10.0=py38_cu102 98 | tornado=6.1=py38h497a2fe_1 99 | typing_extensions=3.10.0.0=pyh06a4308_0 100 | urllib3=1.26.6=pyhd8ed1ab_0 101 | wheel=0.36.2=pyhd3eb1b0_0 102 | x264=1!161.3030=h7f98852_1 103 | xz=5.2.5=h7b6447c_0 104 | zlib=1.2.11=h7b6447c_3 105 | zstd=1.4.9=haebb681_0 106 | -------------------------------------------------------------------------------- /cnn/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import warnings 4 | import logging 5 | import time 6 | from datetime import date 7 | import torchvision 8 | from torchvision.transforms import transforms 9 | import torch.optim 10 | import torch.nn as nn 11 | from model import CNN 12 | sys.path.append('../') 13 | from utils.gen_utils import create_dir, get_accuracy 14 | 15 | warnings.filterwarnings('ignore', category=UserWarning) 16 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 17 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 18 | epochs = 250 19 | batch_size = 16 20 | lr = 0.01 21 | workers = 4 22 | 23 | def main(): 24 | 25 | # create new directory for the current training session 26 | today = date.today() 27 | today = str(today.strftime('%m-%d-%Y')) 28 | dir_ = str(root_dir + '/saved_models/CNN/train-' + today) 29 | create_dir(dir_) 30 | 31 | log_file_name = 'CNN-' + today +'.log' 32 | logging.basicConfig(filename=os.path.join(dir_, log_file_name), 33 | filemode='w', 34 | format='%(asctime)s: %(message)s', 35 | level=logging.INFO) 36 | 37 | # set basic transforms. 
Spectrograms have to look a certain way so rotations, 38 | # flips, and other 39 | # transforms do not make sense in this application 40 | transform = { 'train' : transforms.Compose([transforms.Resize([32, 32]), 41 | transforms.ToTensor()])} 42 | 43 | # get train dataset 44 | train_data = torchvision.datasets.ImageFolder(root=root_dir + '/data/plots/train/', 45 | transform=transform['train']) 46 | 47 | train_loader = torch.utils.data.DataLoader(dataset=train_data, 48 | batch_size=batch_size, 49 | shuffle=True, 50 | num_workers=workers) 51 | model = CNN() 52 | model = model.to(device) 53 | 54 | criterion = nn.CrossEntropyLoss() 55 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 56 | 57 | # for each epoch 58 | for epoch in range(epochs): 59 | 60 | model.train() 61 | epoch_loss = 0 62 | epoch_accuracy = 0 63 | epoch_steps = 0 64 | 65 | for i, (img, label) in enumerate(train_loader): 66 | img, label = img.to(device), label.to(device) 67 | 68 | prediction = model(img) 69 | loss = criterion(prediction, label) 70 | epoch_accuracy = get_accuracy(prediction, label) 71 | 72 | optimizer.zero_grad() 73 | loss.backward() 74 | optimizer.step() 75 | 76 | epoch_loss += loss.item() 77 | epoch_steps += 1 78 | 79 | # print status onto terminal and log file 80 | print('Epoch: [%d/%d] | Loss: %.3f | Accuracy: %.3f' % (epoch+1, 81 | epochs, 82 | epoch_loss, 83 | epoch_accuracy)) 84 | 85 | logging.info('Epoch: [%d/%d] | Loss: %.3f | Accuracy: %.3f' % (epoch+1, 86 | epochs, 87 | epoch_loss, 88 | epoch_accuracy)) 89 | # save model 90 | model_file_name = 'CNN-' + today + '.pt' 91 | torch.save(model.state_dict(), os.path.join(dir_, model_file_name)) 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- /ffnn/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import warnings 4 | import logging 5 | import time 6 | from datetime import date 7 | import pandas as pd 8 | import torch.optim 9 | import torch.nn as nn 10 | from torch.utils.data import DataLoader 11 | from dataset import FFNN_dataset 12 | from model import FFNN 13 | sys.path.append('../') 14 | from utils.gen_utils import create_dir, get_accuracy 15 | 16 | warnings.filterwarnings('ignore', category=UserWarning) 17 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 18 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 19 | epochs = 70 20 | batch_size = 32 21 | lr = 0.005 22 | workers = 4 23 | 24 | def main(): 25 | 26 | # create new directory for the current training session 27 | today = date.today() 28 | today = str(today.strftime('%m-%d-%Y')) 29 | dir_ = str(root_dir + '/saved_models/FFNN/train-' + today) 30 | 31 | create_dir(dir_) 32 | 33 | log_file_name = 'FFNN-' + today +'.log' 34 | logging.basicConfig(filename=os.path.join(dir_, log_file_name), 35 | filemode='w', 36 | format='%(asctime)s: %(message)s', 37 | level=logging.INFO) 38 | 39 | # get training set 40 | data_path = str(root_dir + '/data/annotations/train.csv') 41 | df = pd.read_csv(data_path, header=None) 42 | 43 | # encode labels 44 | encode = {'voice' : 1, 'not_voice' : 0} 45 | df.iloc[:, 195].replace(encode, inplace=True) 46 | 47 | # remove file names 48 | df.drop(df.columns[0], axis=1, inplace=True) 49 | 50 | # seperate data and labels 51 | x = df.iloc[:, 0: -1] 52 | y = df.iloc[:, -1] 53 | 54 | # get dataloader for training 55 | train_data = FFNN_dataset(torch.FloatTensor(x.values), 
torch.FloatTensor(y.values)) 56 | train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True) 57 | 58 | model = FFNN() 59 | model = model.to(device) 60 | 61 | criterion = nn.CrossEntropyLoss() 62 | optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.1) 63 | 64 | # for each epoch 65 | for epoch in range(epochs): 66 | 67 | model.train() 68 | epoch_loss = 0 69 | epoch_accuracy = 0 70 | epoch_steps = 0 71 | 72 | for i, (x, y) in enumerate(train_loader): 73 | 74 | x, y = x.to(device), y.to(device, dtype=torch.int64) 75 | optimizer.zero_grad() 76 | prediction = model(x) 77 | loss = criterion(prediction, y) 78 | epoch_accuracy = get_accuracy(prediction, y) 79 | 80 | loss.backward() 81 | optimizer.step() 82 | 83 | epoch_loss += loss.item() 84 | epoch_steps += 1 85 | 86 | # print status onto terminal and log file 87 | print('Epoch: [%d/%d] | Loss: %.3f | Accuracy: %.3f' % (epoch+1, 88 | epochs, 89 | epoch_loss, 90 | epoch_accuracy)) 91 | 92 | logging.info('Epoch: [%d/%d] | Loss: %.3f | Accuracy: %.3f' % (epoch+1, 93 | epochs, 94 | epoch_loss, 95 | epoch_accuracy)) 96 | # save model 97 | model_file_name = 'FFNN-' + today + '.pt' 98 | torch.save(model.state_dict(), os.path.join(dir_, model_file_name)) 99 | 100 | if __name__ == '__main__': 101 | main() 102 | -------------------------------------------------------------------------------- /utils/cnn_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import time 4 | import threading 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import librosa 8 | import librosa.display 9 | from utils.gen_utils import create_dir, create_splits 10 | 11 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 12 | 13 | 14 | # plot the mel-spectrogram for the single wav file input 15 | def get_melss(wav_file: str, new_name: str) -> None: 16 | # get sample rate 17 | x, sr = librosa.load(wav_file, sr=None, res_type='kaiser_fast') 18 | 19 | # get headless figure 20 | fig = plt.figure(figsize=[1, 1]) 21 | 22 | # remove the axes 23 | ax = fig.add_subplot(111) 24 | ax.axes.get_xaxis().set_visible(False) 25 | ax.axes.get_yaxis().set_visible(False) 26 | ax.set_frame_on(False) 27 | 28 | # get melss 29 | melss = librosa.feature.melspectrogram(y=x, sr=sr) 30 | librosa.display.specshow(librosa.power_to_db(melss, ref=np.max), y_axis='linear') 31 | 32 | # save plot as jpg 33 | plt.savefig(new_name, dpi=500, bbox_inches='tight', pad_inches=0) 34 | plt.close() 35 | 36 | 37 | # prepare the cnn dataset of images 38 | def prepare_dataset() -> None: 39 | # get training and testing splits 40 | voice = os.path.join(root_dir, 'voice_detect/data/voice/') 41 | not_voice = os.path.join(root_dir, 'voice_detect/data/not_voice/') 42 | train, test = create_splits(voice, not_voice) 43 | 44 | voice_train = os.path.join(root_dir, 'voice_detect/data/plots/train/voice/') 45 | not_voice_train = os.path.join(root_dir, 'voice_detect/data/plots/train/not_voice/') 46 | voice_test = os.path.join(root_dir, 'voice_detect/data/plots/test/voice/') 47 | not_voice_test = os.path.join(root_dir, 'voice_detect/data/plots/test/not_voice/') 48 | 49 | create_dir(voice_train) 50 | create_dir(not_voice_train) 51 | create_dir(voice_test) 52 | create_dir(not_voice_test) 53 | 54 | # iterate through the training split 55 | for file in train: 56 | try: 57 | print('Making train plot for: ' + file) 58 | if 'not_voice' in file: 59 | wav_name = os.path.basename(file) 60 | wav_name = 
os.path.splitext(wav_name) 61 | 62 | # construct the new jpg file name with the extension 63 | jpg_file_name = str(wav_name[0]) + '.jpg' 64 | jpg_file_name = str(not_voice_train + jpg_file_name) 65 | get_melss(file, jpg_file_name) 66 | else: 67 | wav_name = os.path.basename(file) 68 | wav_name = os.path.splitext(wav_name) 69 | 70 | # construct the new jpg file name with the extension 71 | jpg_file_name = str(wav_name[0]) + '.jpg' 72 | jpg_file_name = str(voice_train + jpg_file_name) 73 | get_melss(file, jpg_file_name) 74 | 75 | except Exception: 76 | print('ERROR at ' + file + ' CONTINUING ...') 77 | pass 78 | 79 | # iterate through the testing split 80 | for file in test: 81 | try: 82 | print('Making test plot for: ' + file) 83 | if 'not_voice' in file: 84 | wav_name = os.path.basename(file) 85 | wav_name = os.path.splitext(wav_name) 86 | 87 | # construct the new jpg file name with the extension 88 | jpg_file_name = str(wav_name[0]) + '.jpg' 89 | jpg_file_name = str(not_voice_test + jpg_file_name) 90 | get_melss(file, jpg_file_name) 91 | else: 92 | wav_name = os.path.basename(file) 93 | wav_name = os.path.splitext(wav_name) 94 | 95 | # construct the new jpg file name with the extension 96 | jpg_file_name = str(wav_name[0]) + '.jpg' 97 | jpg_file_name = str(voice_test + jpg_file_name) 98 | get_melss(file, jpg_file_name) 99 | 100 | except Exception: 101 | print('ERROR at ' + file + ' CONTINUING ...') 102 | pass 103 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Voice Detection 2 | === 3 | 4 | This project tackles a binary audio classification problem: detecting human voices in audio recordings. It uses a feed forward neural network and a convolutional neural network that work together as a voting classifier to improve accuracy on unseen data. 5 | 6 | All neural networks were implemented in PyTorch, the audio utilities were implemented using Librosa, and the whole project is written in Python 3.8.5. The non-voice data for this project was obtained from the [Urban Sound Classification practice problem from Analytics Vidhya](https://datahack.analyticsvidhya.com/contest/practice-problem-urban-sound-classification/) and the voice recordings were taken from the [LibriSpeech ASR corpus](https://www.openslr.org/12). 7 | 8 | Data Preparation 9 | --- 10 | 11 | To get the dataset, download both sections from the above links. I rearranged the dataset by placing all non-voice recordings in one directory and all voice recordings in another: 12 | 13 | ```bash 14 | ├── data 15 | │ ├── voice 16 | │ │ ├── rec1.wav 17 | │ │ ├── rec2.wav 18 | │ │ ├── ... 19 | │ ├── not_voice 20 | │ │ ├── rec1.wav 21 | │ │ ├── rec2.wav 22 | │ │ ├── ... 23 | ``` 24 | 25 | The `librispeech.py` script can be used to transform the LibriSpeech dataset into the above file structure. All other files that remain can be deleted. Ignore this if you are not using the LibriSpeech dataset. 26 | 27 | The feed forward neural network takes Mel-frequency cepstral coefficients, a chromagram, a mel-scaled spectrogram, spectral contrast, and tonal centroid (tonnetz) features as input. The Librosa Python library computes all of these features in the `apply_transforms()` function in `utils/ffnn_utils.py`. The `prep_ffnn_data.sh` shell script can be run to collect these features for every .wav file into one large csv file.
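The gist of that feature extraction is averaging each Librosa feature matrix over time and concatenating the results. Below is a simplified, standalone sketch of what `apply_transforms()` in `utils/ffnn_utils.py` computes; the helper name `extract_features` is made up for illustration:

```python
import numpy as np
import librosa

def extract_features(wav_file: str) -> np.ndarray:
    # load at librosa's default sample rate (22050 Hz) and precompute the STFT magnitude
    x, sr = librosa.load(wav_file, res_type='kaiser_fast')
    stft = np.abs(librosa.stft(x))

    # average each feature matrix over time so every file yields a fixed-length vector
    mfccs = np.mean(librosa.feature.mfcc(y=x, sr=sr, n_mfcc=40).T, axis=0)                      # 40 values
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)                      # 12 values
    melss = np.mean(librosa.feature.melspectrogram(y=x, sr=sr).T, axis=0)                       # 128 values
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T, axis=0)              # 7 values
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(x), sr=sr).T, axis=0)  # 6 values

    return np.concatenate([mfccs, chroma, melss, contrast, tonnetz])
```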
One entry will look like: 28 | ``` 29 | /home/hernanrazo/pythonProjects/voice_detect/data/voice/wav/6476-57446-0035.wav,-345.36868,111.94338,-14.379323,50.732822,-13.364564,-0.81879413,-8.603202,-11.860127,0.3794937,-11.088642,-0.5936078,-8.392053,-4.562349,2.390204,2.1774976,-1.8160796,-0.79248935,1.3244591,-3.649716,-2.789777,-3.483583,-2.1718845,-7.1207843,-4.646477,-2.145171,-5.4034863,1.1288224,2.6650674,6.1765018,8.234708,5.759141,8.06815,8.3969555,6.328495,5.646016,4.1650767,2.2295291,1.0025103,-0.5408073,-1.0010004,0.41421363,0.3886434,0.39528078,0.4243577,0.44586822,0.46885604,0.5323573,0.6273547,0.6136627,0.5895413,0.5854901,0.5013139,0.030495275,0.025378285,0.03576337,0.02598723,0.01064823,0.111760125,2.9451604,9.584426,10.904745,6.3265867,0.767396,0.14292528,0.19490032,0.85179096,5.0548906,16.62874,16.24418,15.094028,11.092577,5.3894925,1.5112005,0.42363763,0.42279428,0.6986573,1.1323513,1.4325676,1.5941,1.5745226,0.6854819,0.22188246,0.18639795,0.25544456,0.37152404,0.18624847,0.18722062,0.24387933,0.15841863,0.2312459,0.12505762,0.0896525,0.06176768,0.033809755,0.06561177,0.11577808,0.08457274,0.056273155,0.046364002,0.03207818,0.026625242,0.033034343,0.047393396,0.039878745,0.030250499,0.035353974,0.04822752,0.088709675,0.08721649,0.042465515,0.050014295,0.043818373,0.025141228,0.026777223,0.05408083,0.054930124,0.042547297,0.027444469,0.015712438,0.013818915,0.014640613,0.017465897,0.014250277,0.019179987,0.021202719,0.040190093,0.024158962,0.020575762,0.020575762,0.019340117,0.01956742,0.0073476452,0.012725379,0.016156813,0.007385745,0.008848519,0.0073545426,0.0060878447,0.007746159,0.011803486,0.00961405,0.011231303,0.012259503,0.008804519,0.008680856,0.008589337,0.0158784,0.015149302,0.0085100345,0.007378557,0.009641291,0.0066143535,0.0060657472,0.003713564,0.0021371976,0.0019380879,0.0013283227,0.0012585397,0.0009210656,0.0008644426,0.0008410996,0.00046661997,0.00033427356,0.00020592447,5.9694554e-05,1.1552337e-05,2.8310662e-06,4.4607148e-07,4.411787e-08,1.8092163e-09,1.2725149e-10,1.8920865e-10,1.2470465e-10,9.163159e-11,1.8638106e-10,2.1313133e-10,7.265922e-10,3.1799022e-10,8.475092e-10,7.542699e-10,1.6082426e-10,14.965268185360362,18.193254004666265,20.9569399219138,17.267001240479917,18.13293584976544,19.771650662276468,41.46849881683453,-0.019441445021252834,0.0061759247744320065,0.05519930844766153,0.004244935825248924,-0.004941592482226379,-0.005592662805732028,voice 30 | ``` 31 | Each individual value gets its own cell and the label (voice/not_voice) gets attached to the end. 32 | 33 | The convolutional neural network receives an image of the recording's mel-spectrogram as input. Each image is obtained using the Librosa library. The `prep_cnn_data.sh` shell script can be used to obtain a spectrogram for each audio recording. Example: 34 | 35 |

36 | ![mel-spectrogram of an example recording](example.jpg) 37 |
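Each plot is generated roughly as in `get_melss()` in `utils/cnn_utils.py`: the mel-spectrogram is rendered with the axes stripped and saved as a small .jpg. A condensed sketch of that step (the helper name `wav_to_melss_jpg` is just for illustration):

```python
import matplotlib.pyplot as plt
import numpy as np
import librosa
import librosa.display

def wav_to_melss_jpg(wav_file: str, jpg_name: str) -> None:
    # load the recording at its native sample rate
    x, sr = librosa.load(wav_file, sr=None, res_type='kaiser_fast')

    # tiny headless figure with the axes and frame removed
    fig = plt.figure(figsize=[1, 1])
    ax = fig.add_subplot(111)
    ax.set_axis_off()

    # mel-spectrogram in decibels, saved with no padding so only the plot area remains
    melss = librosa.feature.melspectrogram(y=x, sr=sr)
    librosa.display.specshow(librosa.power_to_db(melss, ref=np.max), y_axis='linear')
    plt.savefig(jpg_name, dpi=500, bbox_inches='tight', pad_inches=0)
    plt.close(fig)
```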
38 | All images are later resized to 32x32 and converted to tensors for training. 39 | 40 | Both shell scripts automatically split the data into training and testing sets in an 80/20 ratio. 41 | 42 | Neural Network Architectures 43 | --- 44 | The feed forward neural network consists of three linear layers with ReLU activation and dropout, followed by a final linear layer that returns two values, one for each class, and passes the result through a sigmoid activation for the final output. Training uses cross entropy loss with an Adam optimizer and runs for 70 epochs with a batch size of 32 and a learning rate of 0.005. 45 | 46 | The convolutional neural network consists of three convolution layers with max pooling, batch normalization, and dropout, followed by two linear layers with ReLU activation and dropout, a third linear layer with ReLU but no dropout, and an output layer that gives two values, one for each class. No output activation is applied since the cross entropy loss function has softmax built in. Training uses cross entropy loss and Adam optimization and runs for 250 epochs with a batch size of 16 and a learning rate of 0.01. 47 | 48 | Performance 49 | --- 50 | Performance for both models can be measured using the `eval.py` script in each model's respective directory. The script takes the file path to the model (relative to `saved_models/`) as a command line argument and returns the accuracy score, confusion matrix, per-class accuracy, and classification report for the model in question. 51 | 52 | For the feed forward neural network: 53 | ``` 54 | python eval.py FFNN/train-08-29-2021/FFNN-08-29-2021.pt 55 | ``` 56 | And for the convolutional neural network: 57 | ``` 58 | python eval.py CNN/train-04-20-2021/CNN-04-20-2021.pt 59 | ``` 60 | 61 | For the feed forward neural network, I obtained 0.9758 accuracy. For the convolutional neural network, I obtained 0.977 accuracy. 62 | 63 | Voting Classifier Implementation 64 | --- 65 | The voting classifier is implemented in the `main.py` script. Similar to the evaluation scripts, this script takes two command line arguments: the path to the feed forward neural network and the path to the convolutional neural network. Example: 66 | ``` 67 | python main.py FFNN/train-07-14-2021/FFNN-07-14-2021.pt CNN/train-04-20-2021/CNN-04-20-2021.pt 68 | ``` 69 | The script first calculates the transforms needed by the feed forward neural network and then creates the spectrogram for the convolutional neural network. The spectrogram is resized to a 32x32 image and converted to a tensor. Each input is passed to its respective network and a score is returned. If both networks score the recording above 0.85 for the voice class, the result is deemed a voice; any lower score is deemed not a voice. 70 | 71 | Sources and Helpful Links 72 | --- 73 | https://www.telusinternational.com/articles/what-is-audio-classification#:~:text=Audio%20classification%20is%20the%20process,and%20text%20to%20speech%20applications.
74 | https://stackoverflow.com/questions/53290306/confusion-matrix-and-test-accuracy-for-pytorch-transfer-learning-tutorial 75 | https://librosa.org/doc/latest/index.html 76 | https://datahack.analyticsvidhya.com/contest/practice-problem-urban-sound-classification/ 77 | https://www.openslr.org/12 78 | -------------------------------------------------------------------------------- /utils/ffnn_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import threading 4 | import warnings 5 | import numpy as np 6 | import shutil 7 | from pydub import AudioSegment 8 | import librosa 9 | import torch 10 | from utils.gen_utils import create_splits 11 | 12 | warnings.filterwarnings('ignore', category=UserWarning) 13 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 14 | np.set_printoptions(suppress=True) 15 | 16 | 17 | # assign labels to wav files 18 | def get_label(file_path: str) -> None: 19 | if 'not_voice' in file_path: 20 | return 'not_voice' 21 | else: 22 | return 'voice' 23 | 24 | 25 | # apply transforms needed to prepare data 26 | def apply_transforms(wav_file: str) -> dict: 27 | 28 | # convert wav file to floating pont time series and get 29 | # default sample rate (22050) 30 | x, sr = librosa.load(wav_file, res_type='kaiser_fast') 31 | 32 | # get mel-frequency cepstral coefficients 33 | mfccs = np.mean(librosa.feature.mfcc(y=x, sr=sr, n_mfcc=40).T, axis=0) 34 | 35 | #get short-time fourier transform 36 | stft = np.abs(librosa.stft(x)) 37 | 38 | # get chromagram 39 | chromagram = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0) 40 | 41 | # get mel_scaled spectrogram 42 | melss = np.mean(librosa.feature.melspectrogram(x, sr=sr).T, axis=0) 43 | 44 | # get spectral contrast 45 | spec_contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T, axis=0) 46 | 47 | # get tonnetz 48 | tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(x), sr=sr).T, axis=0) 49 | 50 | return {'mfccs':mfccs, 'chromagram':chromagram, 'melss':melss, 'spec_contrast':spec_contrast, 'tonnetz':tonnetz} 51 | 52 | 53 | # convert transforms dictionary to a tensor 54 | def transforms_to_tensor(transforms: dict) -> list: 55 | transforms_list = [transforms['mfccs'][0], transforms['mfccs'][1], 56 | transforms['mfccs'][2], transforms['mfccs'][3], 57 | transforms['mfccs'][4], transforms['mfccs'][5], 58 | transforms['mfccs'][6], transforms['mfccs'][7], 59 | transforms['mfccs'][8], transforms['mfccs'][9], 60 | transforms['mfccs'][10], transforms['mfccs'][11], 61 | transforms['mfccs'][12], transforms['mfccs'][13], 62 | transforms['mfccs'][14], transforms['mfccs'][15], 63 | transforms['mfccs'][16], transforms['mfccs'][17], 64 | transforms['mfccs'][18], transforms['mfccs'][19], 65 | transforms['mfccs'][20], transforms['mfccs'][21], 66 | transforms['mfccs'][22], transforms['mfccs'][23], 67 | transforms['mfccs'][24], transforms['mfccs'][25], 68 | transforms['mfccs'][26], transforms['mfccs'][27], 69 | transforms['mfccs'][28], transforms['mfccs'][29], 70 | transforms['mfccs'][30], transforms['mfccs'][31], 71 | transforms['mfccs'][32], transforms['mfccs'][33], 72 | transforms['mfccs'][34], transforms['mfccs'][35], 73 | transforms['mfccs'][36], transforms['mfccs'][37], 74 | transforms['mfccs'][38], transforms['mfccs'][39], 75 | transforms['chromagram'][0], 76 | transforms['chromagram'][1], 77 | transforms['chromagram'][2], 78 | transforms['chromagram'][3], 79 | transforms['chromagram'][4], 80 | 
transforms['chromagram'][5], 81 | transforms['chromagram'][6], 82 | transforms['chromagram'][7], 83 | transforms['chromagram'][8], 84 | transforms['chromagram'][9], 85 | transforms['chromagram'][10], 86 | transforms['chromagram'][11], 87 | transforms['melss'][0], transforms['melss'][1], transforms['melss'][2], 88 | transforms['melss'][3], transforms['melss'][4], transforms['melss'][5], 89 | transforms['melss'][6], transforms['melss'][7], transforms['melss'][8], 90 | transforms['melss'][9], transforms['melss'][10], transforms['melss'][11], 91 | transforms['melss'][12], transforms['melss'][13], transforms['melss'][14], 92 | transforms['melss'][15], transforms['melss'][16], transforms['melss'][17], 93 | transforms['melss'][18], transforms['melss'][19], transforms['melss'][20], 94 | transforms['melss'][21], transforms['melss'][22], transforms['melss'][23], 95 | transforms['melss'][24], transforms['melss'][25], transforms['melss'][26], 96 | transforms['melss'][27], transforms['melss'][28], transforms['melss'][29], 97 | transforms['melss'][30], transforms['melss'][31], transforms['melss'][32], 98 | transforms['melss'][33], transforms['melss'][34], transforms['melss'][35], 99 | transforms['melss'][36], transforms['melss'][37], transforms['melss'][38], 100 | transforms['melss'][39], transforms['melss'][40], transforms['melss'][41], 101 | transforms['melss'][42], transforms['melss'][43], transforms['melss'][44], 102 | transforms['melss'][45], transforms['melss'][46], transforms['melss'][47], 103 | transforms['melss'][48], transforms['melss'][49], transforms['melss'][50], 104 | transforms['melss'][51], transforms['melss'][52], transforms['melss'][53], 105 | transforms['melss'][54], transforms['melss'][55], transforms['melss'][56], 106 | transforms['melss'][57], transforms['melss'][58], transforms['melss'][59], 107 | transforms['melss'][60], transforms['melss'][61], transforms['melss'][62], 108 | transforms['melss'][63], transforms['melss'][64], transforms['melss'][65], 109 | transforms['melss'][66], transforms['melss'][67], transforms['melss'][68], 110 | transforms['melss'][69], transforms['melss'][70], transforms['melss'][71], 111 | transforms['melss'][72], transforms['melss'][73], transforms['melss'][74], 112 | transforms['melss'][75], transforms['melss'][75], transforms['melss'][76], 113 | transforms['melss'][77], transforms['melss'][78], transforms['melss'][79], 114 | transforms['melss'][80], transforms['melss'][81], transforms['melss'][82], 115 | transforms['melss'][83], transforms['melss'][84], transforms['melss'][85], 116 | transforms['melss'][86], transforms['melss'][87], transforms['melss'][88], 117 | transforms['melss'][89], transforms['melss'][90], transforms['melss'][91], 118 | transforms['melss'][92], transforms['melss'][93], transforms['melss'][94], 119 | transforms['melss'][95], transforms['melss'][96], transforms['melss'][97], 120 | transforms['melss'][98], transforms['melss'][99], transforms['melss'][100], 121 | transforms['melss'][101], transforms['melss'][102], transforms['melss'][103], 122 | transforms['melss'][104], transforms['melss'][105], transforms['melss'][106], 123 | transforms['melss'][107], transforms['melss'][108], transforms['melss'][109], 124 | transforms['melss'][110], transforms['melss'][111], transforms['melss'][112], 125 | transforms['melss'][113], transforms['melss'][114], transforms['melss'][115], 126 | transforms['melss'][116], transforms['melss'][117], transforms['melss'][118], 127 | transforms['melss'][119], transforms['melss'][120], transforms['melss'][121], 
128 | transforms['melss'][122], transforms['melss'][123], transforms['melss'][124], 129 | transforms['melss'][125], transforms['melss'][126], transforms['melss'][127], 130 | transforms['spec_contrast'][0], 131 | transforms['spec_contrast'][1], 132 | transforms['spec_contrast'][2], 133 | transforms['spec_contrast'][3], 134 | transforms['spec_contrast'][4], 135 | transforms['spec_contrast'][5], 136 | transforms['spec_contrast'][6], 137 | transforms['tonnetz'][0], 138 | transforms['tonnetz'][1], 139 | transforms['tonnetz'][2], 140 | transforms['tonnetz'][3], 141 | transforms['tonnetz'][4], 142 | transforms['tonnetz'][5]] 143 | 144 | return torch.FloatTensor(transforms_list).to(device='cuda') 145 | 146 | 147 | # create one giant csv with all the tranforms data and the label of each wav file 148 | def get_csv(csv_name: str, data_split: list, annotations_path: str) -> None: 149 | with open(csv_name, mode='w', newline='') as f: 150 | writer = csv.writer(f) 151 | 152 | for filename in data_split: 153 | try: 154 | print(str(csv_name) + ' THREAD: Applying transform to: ' + filename) 155 | transforms = apply_transforms(filename) 156 | writer.writerow([filename, 157 | transforms['mfccs'][0], transforms['mfccs'][1], 158 | transforms['mfccs'][2], transforms['mfccs'][3], 159 | transforms['mfccs'][4], transforms['mfccs'][5], 160 | transforms['mfccs'][6], transforms['mfccs'][7], 161 | transforms['mfccs'][8], transforms['mfccs'][9], 162 | transforms['mfccs'][10], transforms['mfccs'][11], 163 | transforms['mfccs'][12], transforms['mfccs'][13], 164 | transforms['mfccs'][14], transforms['mfccs'][15], 165 | transforms['mfccs'][16], transforms['mfccs'][17], 166 | transforms['mfccs'][18], transforms['mfccs'][19], 167 | transforms['mfccs'][20], transforms['mfccs'][21], 168 | transforms['mfccs'][22], transforms['mfccs'][23], 169 | transforms['mfccs'][24], transforms['mfccs'][25], 170 | transforms['mfccs'][26], transforms['mfccs'][27], 171 | transforms['mfccs'][28], transforms['mfccs'][29], 172 | transforms['mfccs'][30], transforms['mfccs'][31], 173 | transforms['mfccs'][32], transforms['mfccs'][33], 174 | transforms['mfccs'][34], transforms['mfccs'][35], 175 | transforms['mfccs'][36], transforms['mfccs'][37], 176 | transforms['mfccs'][38], transforms['mfccs'][39], 177 | transforms['chromagram'][0], 178 | transforms['chromagram'][1], 179 | transforms['chromagram'][2], 180 | transforms['chromagram'][3], 181 | transforms['chromagram'][4], 182 | transforms['chromagram'][5], 183 | transforms['chromagram'][6], 184 | transforms['chromagram'][7], 185 | transforms['chromagram'][8], 186 | transforms['chromagram'][9], 187 | transforms['chromagram'][10], 188 | transforms['chromagram'][11], 189 | transforms['melss'][0], transforms['melss'][1], transforms['melss'][2], 190 | transforms['melss'][3], transforms['melss'][4], transforms['melss'][5], 191 | transforms['melss'][6], transforms['melss'][7], transforms['melss'][8], 192 | transforms['melss'][9], transforms['melss'][10], transforms['melss'][11], 193 | transforms['melss'][12], transforms['melss'][13], transforms['melss'][14], 194 | transforms['melss'][15], transforms['melss'][16], transforms['melss'][17], 195 | transforms['melss'][18], transforms['melss'][19], transforms['melss'][20], 196 | transforms['melss'][21], transforms['melss'][22], transforms['melss'][23], 197 | transforms['melss'][24], transforms['melss'][25], transforms['melss'][26], 198 | transforms['melss'][27], transforms['melss'][28], transforms['melss'][29], 199 | transforms['melss'][30], 
transforms['melss'][31], transforms['melss'][32], 200 | transforms['melss'][33], transforms['melss'][34], transforms['melss'][35], 201 | transforms['melss'][36], transforms['melss'][37], transforms['melss'][38], 202 | transforms['melss'][39], transforms['melss'][40], transforms['melss'][41], 203 | transforms['melss'][42], transforms['melss'][43], transforms['melss'][44], 204 | transforms['melss'][45], transforms['melss'][46], transforms['melss'][47], 205 | transforms['melss'][48], transforms['melss'][49], transforms['melss'][50], 206 | transforms['melss'][51], transforms['melss'][52], transforms['melss'][53], 207 | transforms['melss'][54], transforms['melss'][55], transforms['melss'][56], 208 | transforms['melss'][57], transforms['melss'][58], transforms['melss'][59], 209 | transforms['melss'][60], transforms['melss'][61], transforms['melss'][62], 210 | transforms['melss'][63], transforms['melss'][64], transforms['melss'][65], 211 | transforms['melss'][66], transforms['melss'][67], transforms['melss'][68], 212 | transforms['melss'][69], transforms['melss'][70], transforms['melss'][71], 213 | transforms['melss'][72], transforms['melss'][73], transforms['melss'][74], 214 | transforms['melss'][75], transforms['melss'][75], transforms['melss'][76], 215 | transforms['melss'][77], transforms['melss'][78], transforms['melss'][79], 216 | transforms['melss'][80], transforms['melss'][81], transforms['melss'][82], 217 | transforms['melss'][83], transforms['melss'][84], transforms['melss'][85], 218 | transforms['melss'][86], transforms['melss'][87], transforms['melss'][88], 219 | transforms['melss'][89], transforms['melss'][90], transforms['melss'][91], 220 | transforms['melss'][92], transforms['melss'][93], transforms['melss'][94], 221 | transforms['melss'][95], transforms['melss'][96], transforms['melss'][97], 222 | transforms['melss'][98], transforms['melss'][99], transforms['melss'][100], 223 | transforms['melss'][101], transforms['melss'][102], transforms['melss'][103], 224 | transforms['melss'][104], transforms['melss'][105], transforms['melss'][106], 225 | transforms['melss'][107], transforms['melss'][108], transforms['melss'][109], 226 | transforms['melss'][110], transforms['melss'][111], transforms['melss'][112], 227 | transforms['melss'][113], transforms['melss'][114], transforms['melss'][115], 228 | transforms['melss'][116], transforms['melss'][117], transforms['melss'][118], 229 | transforms['melss'][119], transforms['melss'][120], transforms['melss'][121], 230 | transforms['melss'][122], transforms['melss'][123], transforms['melss'][124], 231 | transforms['melss'][125], transforms['melss'][126], transforms['melss'][127], 232 | transforms['spec_contrast'][0], 233 | transforms['spec_contrast'][1], 234 | transforms['spec_contrast'][2], 235 | transforms['spec_contrast'][3], 236 | transforms['spec_contrast'][4], 237 | transforms['spec_contrast'][5], 238 | transforms['spec_contrast'][6], 239 | transforms['tonnetz'][0], 240 | transforms['tonnetz'][1], 241 | transforms['tonnetz'][2], 242 | transforms['tonnetz'][3], 243 | transforms['tonnetz'][4], 244 | transforms['tonnetz'][5], 245 | get_label(filename)]) 246 | except Exception: 247 | print(str(csv_name) + ' THREAD: ERROR AT ' + filename + '. CONTINUING ...') 248 | pass 249 | 250 | shutil.move(str(csv_name), str(root_dir + annotations_path)) 251 | # end function 252 | 253 | 254 | # prepare raw .wav files to the csv dataframe needed. This includes splitting the data into training and testing, 255 | # applying the transforms, and saving as a csv. 
This function is used in the prepare_data.sh script 256 | def prepare_dataset() -> None: 257 | 258 | annotations_path = '/voice_detect/data/annotations' 259 | voice_wavs = str(root_dir + '/voice_detect/data/voice/wav/') 260 | not_voice_wavs = str(root_dir + '/voice_detect/data/not_voice/wav/') 261 | 262 | print('Creating splits ...\n') 263 | train, test = create_splits(voice_wavs, not_voice_wavs) 264 | 265 | # start two threads, one to crate the training csv and one for the testing csv 266 | train_thread = threading.Thread(target=get_csv, args=('train.csv', train, annotations_path)) 267 | test_thread = threading.Thread(target=get_csv, args=('test.csv', test, annotations_path)) 268 | 269 | train_thread.start() 270 | test_thread.start() 271 | train_thread.join() 272 | test_thread.join() 273 | --------------------------------------------------------------------------------