├── utils ├── __init__.py ├── gen_utils.py ├── cnn_utils.py └── ffnn_utils.py ├── example.jpg ├── flac2wav.sh ├── .gitignore ├── prep_cnn_data.sh ├── prep_ffnn_data.sh ├── ffnn ├── dataset.py ├── librispeech.py ├── model.py ├── eval.py └── train.py ├── cnn ├── model.py ├── eval.py └── train.py ├── main.py ├── requirements.txt └── README.md /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hernanrazo/human-voice-detection/HEAD/example.jpg -------------------------------------------------------------------------------- /flac2wav.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Converting .flac files to .wav" 4 | python -c 'from utils.gen_utils import flac2wav; flac2wav()' 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.swp 3 | *.swo 4 | *.pyc 5 | .DS_Store 6 | __pycache__/ 7 | haters/ 8 | 9 | data/ 10 | saved_models/ 11 | temp/ 12 | -------------------------------------------------------------------------------- /prep_cnn_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Creating data splits and melss .jpg files for all .wav files ..." 4 | python -c 'from utils.cnn_utils import prepare_dataset; prepare_dataset()' 5 | echo "done" 6 | -------------------------------------------------------------------------------- /prep_ffnn_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Creating data splits, annotations, and transforms for all .wav files ..." 4 | python -c 'from utils.ffnn_utils import prepare_dataset; prepare_dataset()' 5 | echo "done" 6 | -------------------------------------------------------------------------------- /ffnn/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | class FFNN_dataset(Dataset): 5 | 6 | def __init__(self, x, y): 7 | self.x = x 8 | self.y = y 9 | 10 | def __getitem__(self, index): 11 | return self.x[index], self.y[index] 12 | 13 | def __len__(self): 14 | return len(self.x) 15 | -------------------------------------------------------------------------------- /ffnn/librispeech.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import shutil 4 | ''' 5 | Moves all .flac files from the Librispeech dataset from its 6 | original file structure to one giant subdir. This ignores 7 | the other .txt files and labels that come with the original 8 | dataset. 9 | 10 | Ignore this if you are not using the LibriSpeech dataset or 11 | if you end up using the original file structure. 
12 | ''' 13 | 14 | def main(): 15 | 16 | root = str(os.getcwd()) + '/data/LibriSpeech/train-clean-100/' 17 | dest = str(os.getcwd()) + '/data/voice/flac/' 18 | 19 | for subdirs, dirs, files in os.walk(root): 20 | for file in files: 21 | if file.endswith('.flac'): 22 | print('Moving ' + str(file)) 23 | shutil.move(str(subdirs + os.sep + file), dest) 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /ffnn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # feed forward neural network 6 | class FFNN(nn.Module): 7 | def __init__(self): 8 | super(FFNN, self).__init__() 9 | self.l1 = nn.Linear(194, 1024) 10 | self.r1 = nn.ReLU() 11 | self.d1 = nn.Dropout(0.2) 12 | 13 | self.l2 = nn.Linear(1024, 512) 14 | self.r2 = nn.ReLU() 15 | self.d2 = nn.Dropout(0.2) 16 | 17 | self.l3 = nn.Linear(512, 128) 18 | self.r3 = nn.ReLU() 19 | self.d3 = nn.Dropout(0.2) 20 | 21 | self.l4 = nn.Linear(128, 2) 22 | self.out = nn.Sigmoid() 23 | 24 | 25 | def forward(self, x): 26 | l1 = self.l1(x) 27 | r1 = self.r1(l1) 28 | d1 = self.d1(r1) 29 | 30 | l2 = self.l2(d1) 31 | r2 = self.r2(l2) 32 | d2 = self.d2(r2) 33 | 34 | l3 = self.l3(d2) 35 | r3 = self.r3(l3) 36 | d3 = self.d3(r3) 37 | 38 | l4 = self.l4(d3) 39 | y = self.out(l4) 40 | return y 41 | -------------------------------------------------------------------------------- /cnn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class CNN(nn.Module): 6 | def __init__(self): 7 | super(CNN, self).__init__() 8 | self.conv1 = nn.Conv2d(3, 32, 5) 9 | self.bn1 = nn.BatchNorm2d(32) 10 | self.d1 = nn.Dropout(0.3) 11 | 12 | self.pool = nn.MaxPool2d(2, 2) 13 | 14 | self.conv2 = nn.Conv2d(32, 64, 5) 15 | self.bn2 = nn.BatchNorm2d(64) 16 | self.d2 = nn.Dropout(0.5) 17 | 18 | self.conv3 = nn.Conv2d(64, 128, 2) 19 | self.bn3 = nn.BatchNorm2d(128) 20 | self.d3 = nn.Dropout(0.3) 21 | 22 | self.fc1 = nn.Linear(128*2*2, 64) 23 | self.d5 = nn.Dropout(0.2) 24 | 25 | self.fc2 = nn.Linear(64, 32) 26 | self.d6 = nn.Dropout(0.2) 27 | 28 | self.fc3 = nn.Linear(32, 16) 29 | 30 | self.fc4 = nn.Linear(16, 2) 31 | 32 | def forward(self, x): 33 | x = self.pool(F.relu(self.conv1(x))) 34 | x = self.bn1(x) 35 | x = self.d1(x) 36 | 37 | x = self.pool(F.relu(self.conv2(x))) 38 | x = self.bn2(x) 39 | x = self.d2(x) 40 | 41 | x = self.pool(F.relu(self.conv3(x))) 42 | x = self.bn3(x) 43 | x = self.d3(x) 44 | 45 | x = x.view(-1, 128*2*2) 46 | 47 | x = F.relu(self.fc1(x)) 48 | x = self.d5(x) 49 | 50 | x = F.relu(self.fc2(x)) 51 | x = self.d6(x) 52 | 53 | x = F.relu(self.fc3(x)) 54 | 55 | x = self.fc4(x) 56 | return x 57 | -------------------------------------------------------------------------------- /cnn/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import warnings 4 | from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 5 | import torch 6 | import torchvision 7 | from torchvision.transforms import transforms 8 | from model import CNN 9 | 10 | warnings.filterwarnings('ignore', category=UserWarning) 11 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 12 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 13 | parser = 
argparse.ArgumentParser() 14 | parser.add_argument('model_path', type=str, help='Path to convolutional neural network you wish to use') 15 | args = parser.parse_args() 16 | 17 | def main(): 18 | 19 | # set parameters 20 | batch_size = 16 21 | workers = 4 22 | nb_classes = 2 23 | pred_list = torch.zeros(0, dtype=torch.long, device='cpu') 24 | ground_truth = torch.zeros(0, dtype=torch.long, device='cpu') 25 | 26 | # get model 27 | model_path = str(root_dir + '/saved_models/' + args.model_path) 28 | model = CNN() 29 | model.load_state_dict(torch.load(model_path), strict=False) 30 | model.to(device) 31 | model.eval() 32 | 33 | # set basic transforms. Spectrograms have to look a certain way so rotations, flips, and other 34 | # transforms do not make sense in this application 35 | transform = {'test' : transforms.Compose([transforms.Resize([32, 32]), transforms.ToTensor()])} 36 | 37 | # get testing dataset 38 | test_data = torchvision.datasets.ImageFolder(root_dir + '/data/plots/test/', 39 | transform=transform['test']) 40 | 41 | test_loader = torch.utils.data.DataLoader(dataset=test_data, 42 | batch_size=batch_size, 43 | shuffle=True) 44 | # start testing 45 | with torch.no_grad(): 46 | for i, (x, y) in enumerate(test_loader): 47 | x, y = x.to(device), y.to(device) 48 | outputs = model(x) 49 | _, preds = torch.max(outputs.data, 1) 50 | 51 | pred_list = torch.cat([pred_list, preds.view(-1).cpu()]) 52 | ground_truth = torch.cat([ground_truth, y.view(-1).cpu()]) 53 | 54 | # accuracy score 55 | print('\nAccuracy Score:') 56 | print(accuracy_score(ground_truth.numpy(), pred_list.numpy())) 57 | # confusion matrix 58 | print('\nConfusion Matrix:') 59 | conf_mat = confusion_matrix(ground_truth.numpy(), pred_list.numpy()) 60 | print(conf_mat) 61 | 62 | # per-class accuracy 63 | print('\nPer-Class Accuracy:') 64 | print(100 * conf_mat.diagonal() / conf_mat.sum(1)) 65 | 66 | # classification report 67 | print('\nClassification Report:') 68 | print(classification_report(ground_truth.numpy(), pred_list.numpy())) 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /utils/gen_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import librosa 5 | import torch 6 | from pydub import AudioSegment 7 | 8 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 9 | 10 | 11 | def create_dir(dir: str) -> None: 12 | if os.path.isdir(dir): 13 | print(dir, 'already exists. Continuing ...' 
) 14 | else: 15 | print('Creating new dir: ', dir) 16 | os.makedirs(dir) 17 | 18 | # convert .flac files to .wav files in the voice data folder 19 | def flac2wav() -> None: 20 | flac_path = str(root_dir + '/voice_detect/data/voice/flac/') 21 | wav_path = str(root_dir + '/voice_detect/data/voice/wav/') 22 | flac_files = [f for f in os.listdir(flac_path) if os.path.isfile(os.path.join(flac_path, f)) and f.endswith('.flac')] 23 | 24 | for file in flac_files: 25 | print('Converting ' + str(file)) 26 | temp = AudioSegment.from_file(str(flac_path + file)) 27 | temp.export(str(wav_path + os.path.splitext(file)[0]) + '.wav', format='wav') 28 | print('Done converting \n') 29 | 30 | 31 | # create the training and testing split lists for both classes 32 | def create_splits(voice_path: str, not_voice_path: str) -> tuple: 33 | voice_wavs = str(root_dir + '/voice_detect/data/voice/wav/') 34 | not_voice_wavs = str(root_dir + '/voice_detect/data/not_voice/wav/') 35 | 36 | # get the total number of files in both dirs and split them into training and testing sets 37 | # with an 80/20 ratio; each test split is the complement of its train split so the two never overlap 38 | voice_list = [voice_wavs + name for name in os.listdir(voice_wavs)] 39 | voice_total = len(voice_list) 40 | voice_train_split = round(voice_total * 0.8) 41 | voice_test_split = voice_total - voice_train_split 42 | 43 | assert voice_train_split + voice_test_split == voice_total 44 | 45 | voice_train_list = random.sample(voice_list, voice_train_split) 46 | voice_test_list = [f for f in voice_list if f not in voice_train_list] 47 | 48 | not_voice_list = [not_voice_wavs + name for name in os.listdir(not_voice_wavs)] 49 | not_voice_total = len(not_voice_list) 50 | not_voice_train_split = round(not_voice_total * 0.8) 51 | not_voice_test_split = not_voice_total - not_voice_train_split 52 | 53 | assert not_voice_train_split + not_voice_test_split == not_voice_total 54 | 55 | not_voice_train_list = random.sample(not_voice_list, not_voice_train_split) 56 | not_voice_test_list = [f for f in not_voice_list if f not in not_voice_train_list] 57 | 58 | # concatenate into two complete lists 59 | full_train_list = voice_train_list + not_voice_train_list 60 | full_test_list = voice_test_list + not_voice_test_list 61 | 62 | return full_train_list, full_test_list 63 | 64 | 65 | # calculate batch accuracy; labels can be class indices or one-hot rows 66 | def get_accuracy(prediction: torch.Tensor, label: torch.Tensor) -> float: 67 | pred_classes = torch.argmax(prediction, dim=1) 68 | true_classes = label if label.dim() == 1 else torch.argmax(label, dim=1) 69 | return (pred_classes == true_classes).float().mean().item() 70 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import warnings 4 | import argparse 5 | import librosa 6 | import torch 7 | import torchvision 8 | from PIL import Image 9 | from ffnn.model import FFNN 10 | from cnn.model import CNN 11 | from utils.gen_utils import create_dir 12 | from utils.ffnn_utils import apply_transforms, transforms_to_tensor 13 | from utils.cnn_utils import get_melss 14 | 15 | warnings.filterwarnings('ignore', category=UserWarning) 16 | 17 | def main(): 18 | 19 | # set device 20 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 21 | 22 | # other 23 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 24 | to_tensor = torchvision.transforms.ToTensor() 25 | 26 | # get model path arguments 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('ffnn_path', type=str, help='Path to feed forward neural network')
29 | parser.add_argument('cnn_path', type=str, help='Path to convolutional neural network') 30 | args = parser.parse_args() 31 | 32 | # get models in eval mode 33 | ffnn = FFNN() 34 | ffnn_path = os.path.abspath('saved_models/' + args.ffnn_path) 35 | ffnn.load_state_dict(torch.load(ffnn_path), strict=False) 36 | ffnn = ffnn.to(device) 37 | ffnn.eval() 38 | 39 | cnn = CNN() 40 | cnn_path = os.path.abspath('saved_models/' + args.cnn_path) 41 | cnn.load_state_dict(torch.load(cnn_path), strict=False) 42 | cnn = cnn.to(device) 43 | cnn.eval() 44 | 45 | # create temp dir to save melss image for current inference 46 | create_dir('temp') 47 | 48 | # get transforms and spectrogram image 49 | transforms = apply_transforms('data/test/voice3.wav') 50 | melss = get_melss('data/test/voice3.wav', 'temp/test.jpg') 51 | 52 | # convert transforms dict to tensor and 53 | # apply transforms to melss image 54 | transforms = transforms_to_tensor(transforms) 55 | melss = Image.open('temp/test.jpg') 56 | melss = melss.resize((32, 32)) 57 | melss = to_tensor(melss) 58 | melss = melss.to(device) 59 | 60 | # make predictions 61 | ffnn_pred = ffnn(transforms) 62 | cnn_pred = cnn(melss.unsqueeze(0)) 63 | 64 | # if both models agree that the audio is a voice, return voice 65 | # else, return not_voice 66 | if ffnn_pred[1] > 0.85 and cnn_pred[0][1] > 0.85: 67 | print(ffnn_pred) 68 | print(cnn_pred) 69 | print('\nvoice\n') 70 | else: 71 | print(ffnn_pred) 72 | print(cnn_pred) 73 | print('\nnot_voice\n') 74 | 75 | # delete temp dir after completion 76 | if os.path.isdir(root_dir + '/voice_detect/temp'): 77 | print('deleting temp dir ...\n') 78 | os.remove(root_dir + '/voice_detect/temp/test.jpg') 79 | shutil.rmtree(root_dir + '/voice_detect/temp') 80 | else: 81 | print('temp dir does not exist...\n') 82 | 83 | print('Inference complete ...') 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /ffnn/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import warnings 5 | import pandas as pd 6 | import torch 7 | from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 8 | from model import FFNN 9 | from dataset import FFNN_dataset 10 | 11 | warnings.filterwarnings('ignore', category=UserWarning) 12 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('model_path', type=str, help='Path to feed forward network you wish to use') 15 | args = parser.parse_args() 16 | 17 | def main(): 18 | 19 | # set parameters 20 | batch_size = 32 21 | nb_classes = 2 22 | pred_list = torch.zeros(0, dtype=torch.long, device='cpu') 23 | ground_truth = torch.zeros(0, dtype=torch.long, device='cpu') 24 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(),os.pardir))) 25 | 26 | # get model 27 | model_path = str(root_dir + '/saved_models/' + args.model_path) 28 | model = FFNN() 29 | model.load_state_dict(torch.load(model_path), strict=False) 30 | model.to(device) 31 | model.eval() 32 | 33 | # get testing dataset 34 | data_path = str(root_dir + '/data/annotations/test.csv') 35 | df = pd.read_csv(data_path, header=None) 36 | 37 | # encode labels 38 | encode = {'voice' : 1, 'not_voice' : 0} 39 | df.iloc[:, 195].replace(encode, inplace=True) 40 | 41 | # remove file names 42 | df.drop(df.columns[0], axis=1, inplace=True) 43 | 44 | # seperate data and labels 45 | x = df.iloc[:, 
0:-1] 46 | y = df.iloc[:, -1:] 47 | 48 | # get testing dataset 49 | test_data = FFNN_dataset(torch.FloatTensor(x.values), torch.FloatTensor(y.values)) 50 | test_loader = torch.utils.data.DataLoader(dataset=test_data, 51 | batch_size=batch_size, 52 | shuffle=True) 53 | # start testing 54 | with torch.no_grad(): 55 | for i, (x, y) in enumerate(test_loader): 56 | x, y = x.to(device), y.to(device) 57 | outputs = model(x) 58 | _, preds = torch.max(outputs.data, 1) 59 | 60 | pred_list = torch.cat([pred_list, preds.view(-1).cpu()]) 61 | ground_truth = torch.cat([ground_truth, y.view(-1).cpu()]) 62 | 63 | # accuracy report 64 | print('\nAccuracy Score:') 65 | print(accuracy_score(ground_truth.numpy(), pred_list.numpy())) 66 | 67 | # confusion matrix 68 | print('\nConfusion Matrix:') 69 | conf_mat = confusion_matrix(ground_truth.numpy(), pred_list.numpy()) 70 | print(conf_mat) 71 | 72 | # per-class accuracy 73 | print('\nPer-Class Accuracy:') 74 | print(100 * conf_mat.diagonal() / conf_mat.sum(1)) 75 | 76 | # classification report 77 | print('\nClassification Report:') 78 | print(classification_report(ground_truth.numpy(), pred_list.numpy())) 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | _libgcc_mutex=0.1=main 5 | _openmp_mutex=4.5=1_gnu 6 | appdirs=1.4.4=pyh9f0ad1d_0 7 | audioread=2.1.9=py38h578d9bd_0 8 | blas=1.0=mkl 9 | brotlipy=0.7.0=py38h497a2fe_1001 10 | bzip2=1.0.8=h7f98852_4 11 | ca-certificates=2020.10.14=0 12 | certifi=2020.6.20=py38_0 13 | cffi=1.14.6=py38ha65f79e_0 14 | chardet=4.0.0=py38h578d9bd_1 15 | charset-normalizer=2.0.0=pyhd8ed1ab_0 16 | cryptography=3.4.7=py38ha5dfef3_0 17 | cudatoolkit=10.2.89=hfd86e86_1 18 | cycler=0.10.0=py_2 19 | decorator=5.0.9=pyhd8ed1ab_0 20 | ffmpeg=4.3.1=hca11adc_2 21 | freetype=2.10.4=h0708190_1 22 | gettext=0.19.8.1=h0b5b191_1005 23 | gmp=6.2.1=h2531618_2 24 | gnutls=3.6.15=he1e5248_0 25 | idna=3.1=pyhd3deb0d_0 26 | intel-openmp=2021.3.0=h06a4308_3350 27 | joblib=1.0.1=pyhd8ed1ab_0 28 | jpeg=9b=h024ee3a_2 29 | kiwisolver=1.3.1=py38h1fd1430_1 30 | lame=3.100=h7f98852_1001 31 | lcms2=2.12=h3be6417_0 32 | ld_impl_linux-64=2.35.1=h7274673_9 33 | libffi=3.3=he6710b0_2 34 | libflac=1.3.3=h9c3ff4c_1 35 | libgcc-ng=9.3.0=h5101ec6_17 36 | libgfortran-ng=7.5.0=h14aa051_19 37 | libgfortran4=7.5.0=h14aa051_19 38 | libgomp=9.3.0=h5101ec6_17 39 | libidn2=2.3.2=h7f8727e_0 40 | libllvm10=10.0.1=he513fc3_3 41 | libogg=1.3.4=h7f98852_1 42 | libopus=1.3.1=h7f98852_1 43 | libpng=1.6.37=h21135ba_2 44 | librosa=0.8.1=pyhd8ed1ab_0 45 | libsndfile=1.0.31=h9c3ff4c_1 46 | libstdcxx-ng=9.3.0=hd4cf53a_17 47 | libtasn1=4.16.0=h27cfd23_0 48 | libtiff=4.2.0=h85742a9_0 49 | libunistring=0.9.10=h27cfd23_0 50 | libuv=1.40.0=h7b6447c_0 51 | libvorbis=1.3.7=h9c3ff4c_0 52 | libwebp-base=1.2.0=h27cfd23_0 53 | llvmlite=0.36.0=py38h4630a5e_0 54 | lz4-c=1.9.3=h2531618_0 55 | matplotlib-base=3.4.2=py38hcc49a3a_0 56 | mkl=2021.3.0=h06a4308_520 57 | mkl-service=2.4.0=py38h7f8727e_0 58 | mkl_fft=1.3.0=py38h42c9631_2 59 | mkl_random=1.2.2=py38h51133e4_0 60 | ncurses=6.2=he6710b0_1 61 | nettle=3.7.3=hbbd107a_1 62 | ninja=1.10.2=hff7bd54_1 63 | numba=0.53.1=py38h8b71fd7_1 64 | numpy=1.20.3=py38hf144106_0 65 | numpy-base=1.20.3=py38h74d4b33_0 66 | olefile=0.46=py_0 67 | 
openh264=2.1.1=h780b84a_0 68 | openjpeg=2.3.0=h05c96fa_1 69 | openssl=1.1.1k=h7f98852_0 70 | packaging=21.0=pyhd8ed1ab_0 71 | pandas=1.2.5=py38h295c915_0 72 | pillow=8.3.1=py38h2c7a002_0 73 | pip=21.1.3=py38h06a4308_0 74 | pooch=1.4.0=pyhd8ed1ab_0 75 | pycparser=2.20=pyh9f0ad1d_2 76 | pydub=0.25.1=pyhd8ed1ab_0 77 | pyopenssl=20.0.1=pyhd8ed1ab_0 78 | pyparsing=2.4.7=pyh9f0ad1d_0 79 | pysocks=1.7.1=py38h578d9bd_3 80 | pysoundfile=0.10.3.post1=pyhd3deb0d_0 81 | python=3.8.5=h7579374_1 82 | python-dateutil=2.8.2=pyhd3eb1b0_0 83 | python_abi=3.8=2_cp38 84 | pytorch=1.9.0=py3.8_cuda10.2_cudnn7.6.5_0 85 | pytz=2021.1=pyhd3eb1b0_0 86 | readline=8.1=h27cfd23_0 87 | requests=2.26.0=pyhd8ed1ab_0 88 | resampy=0.2.2=py_0 89 | scikit-learn=0.23.2=py38h0573a6f_0 90 | scipy=1.6.2=py38had2a1c9_1 91 | setuptools=52.0.0=py38h06a4308_0 92 | six=1.16.0=pyhd3eb1b0_0 93 | sqlite=3.36.0=hc218d9a_0 94 | threadpoolctl=2.2.0=pyh8a188c0_0 95 | tk=8.6.10=hbc83047_0 96 | torchaudio=0.9.0=py38 97 | torchvision=0.10.0=py38_cu102 98 | tornado=6.1=py38h497a2fe_1 99 | typing_extensions=3.10.0.0=pyh06a4308_0 100 | urllib3=1.26.6=pyhd8ed1ab_0 101 | wheel=0.36.2=pyhd3eb1b0_0 102 | x264=1!161.3030=h7f98852_1 103 | xz=5.2.5=h7b6447c_0 104 | zlib=1.2.11=h7b6447c_3 105 | zstd=1.4.9=haebb681_0 106 | -------------------------------------------------------------------------------- /cnn/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import warnings 4 | import logging 5 | import time 6 | from datetime import date 7 | import torchvision 8 | from torchvision.transforms import transforms 9 | import torch.optim 10 | import torch.nn as nn 11 | from model import CNN 12 | sys.path.append('../') 13 | from utils.gen_utils import create_dir, get_accuracy 14 | 15 | warnings.filterwarnings('ignore', category=UserWarning) 16 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 17 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 18 | epochs = 250 19 | batch_size = 16 20 | lr = 0.01 21 | workers = 4 22 | 23 | def main(): 24 | 25 | # create new directory for the current training session 26 | today = date.today() 27 | today = str(today.strftime('%m-%d-%Y')) 28 | dir_ = str(root_dir + '/saved_models/CNN/train-' + today) 29 | create_dir(dir_) 30 | 31 | log_file_name = 'CNN-' + today +'.log' 32 | logging.basicConfig(filename=os.path.join(dir_, log_file_name), 33 | filemode='w', 34 | format='%(asctime)s: %(message)s', 35 | level=logging.INFO) 36 | 37 | # set basic transforms. 
Spectrograms have to look a certain way so rotations, 38 | # flips, and other 39 | # transforms do not make sense in this application 40 | transform = { 'train' : transforms.Compose([transforms.Resize([32, 32]), 41 | transforms.ToTensor()])} 42 | 43 | # get train dataset 44 | train_data = torchvision.datasets.ImageFolder(root=root_dir + '/data/plots/train/', 45 | transform=transform['train']) 46 | 47 | train_loader = torch.utils.data.DataLoader(dataset=train_data, 48 | batch_size=batch_size, 49 | shuffle=True, 50 | num_workers=workers) 51 | model = CNN() 52 | model = model.to(device) 53 | 54 | criterion = nn.CrossEntropyLoss() 55 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 56 | 57 | # for each epoch 58 | for epoch in range(epochs): 59 | 60 | model.train() 61 | epoch_loss = 0 62 | epoch_accuracy = 0 63 | epoch_steps = 0 64 | 65 | for i, (img, label) in enumerate(train_loader): 66 | img, label = img.to(device), label.to(device) 67 | 68 | prediction = model(img) 69 | loss = criterion(prediction, label) 70 | epoch_accuracy = get_accuracy(prediction, label) 71 | 72 | optimizer.zero_grad() 73 | loss.backward() 74 | optimizer.step() 75 | 76 | epoch_loss += loss.item() 77 | epoch_steps += 1 78 | 79 | # print status onto terminal and log file 80 | print('Epoch: [%d/%d] | Loss: %.3f | Accuracy: %.3f' % (epoch+1, 81 | epochs, 82 | epoch_loss, 83 | epoch_accuracy)) 84 | 85 | logging.info('Epoch: [%d/%d] | Loss: %.3f | Accuracy: %.3f' % (epoch+1, 86 | epochs, 87 | epoch_loss, 88 | epoch_accuracy)) 89 | # save model 90 | model_file_name = 'CNN-' + today + '.pt' 91 | torch.save(model.state_dict(), os.path.join(dir_, model_file_name)) 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- /ffnn/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import warnings 4 | import logging 5 | import time 6 | from datetime import date 7 | import pandas as pd 8 | import torch.optim 9 | import torch.nn as nn 10 | from torch.utils.data import DataLoader 11 | from dataset import FFNN_dataset 12 | from model import FFNN 13 | sys.path.append('../') 14 | from utils.gen_utils import create_dir, get_accuracy 15 | 16 | warnings.filterwarnings('ignore', category=UserWarning) 17 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 18 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 19 | epochs = 70 20 | batch_size = 32 21 | lr = 0.005 22 | workers = 4 23 | 24 | def main(): 25 | 26 | # create new directory for the current training session 27 | today = date.today() 28 | today = str(today.strftime('%m-%d-%Y')) 29 | dir_ = str(root_dir + '/saved_models/FFNN/train-' + today) 30 | 31 | create_dir(dir_) 32 | 33 | log_file_name = 'FFNN-' + today +'.log' 34 | logging.basicConfig(filename=os.path.join(dir_, log_file_name), 35 | filemode='w', 36 | format='%(asctime)s: %(message)s', 37 | level=logging.INFO) 38 | 39 | # get training set 40 | data_path = str(root_dir + '/data/annotations/train.csv') 41 | df = pd.read_csv(data_path, header=None) 42 | 43 | # encode labels 44 | encode = {'voice' : 1, 'not_voice' : 0} 45 | df.iloc[:, 195].replace(encode, inplace=True) 46 | 47 | # remove file names 48 | df.drop(df.columns[0], axis=1, inplace=True) 49 | 50 | # seperate data and labels 51 | x = df.iloc[:, 0: -1] 52 | y = df.iloc[:, -1] 53 | 54 | # get dataloader for training 55 | train_data = FFNN_dataset(torch.FloatTensor(x.values), 
torch.FloatTensor(y.values)) 56 | train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True) 57 | 58 | model = FFNN() 59 | model = model.to(device) 60 | 61 | criterion = nn.CrossEntropyLoss() 62 | optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.1) 63 | 64 | # for each epoch 65 | for epoch in range(epochs): 66 | 67 | model.train() 68 | epoch_loss = 0 69 | epoch_accuracy = 0 70 | epoch_steps = 0 71 | 72 | for i, (x, y) in enumerate(train_loader): 73 | 74 | x, y = x.to(device), y.to(device, dtype=torch.int64) 75 | optimizer.zero_grad() 76 | prediction = model(x) 77 | loss = criterion(prediction, y) 78 | epoch_accuracy = get_accuracy(prediction, y) 79 | 80 | loss.backward() 81 | optimizer.step() 82 | 83 | epoch_loss += loss.item() 84 | epoch_steps += 1 85 | 86 | # print status onto terminal and log file 87 | print('Epoch: [%d/%d] | Loss: %.3f | Accuracy: %.3f' % (epoch+1, 88 | epochs, 89 | epoch_loss, 90 | epoch_accuracy)) 91 | 92 | logging.info('Epoch: [%d/%d] | Loss: %.3f | Accuracy: %.3f' % (epoch+1, 93 | epochs, 94 | epoch_loss, 95 | epoch_accuracy)) 96 | # save model 97 | model_file_name = 'FFNN-' + today + '.pt' 98 | torch.save(model.state_dict(), os.path.join(dir_, model_file_name)) 99 | 100 | if __name__ == '__main__': 101 | main() 102 | -------------------------------------------------------------------------------- /utils/cnn_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import time 4 | import threading 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import librosa 8 | import librosa.display 9 | from utils.gen_utils import create_dir, create_splits 10 | 11 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 12 | 13 | 14 | # plot the mel-spectrogram for the single wav file input 15 | def get_melss(wav_file: str, new_name: str) -> None: 16 | # get sample rate 17 | x, sr = librosa.load(wav_file, sr=None, res_type='kaiser_fast') 18 | 19 | # get headless figure 20 | fig = plt.figure(figsize=[1, 1]) 21 | 22 | # remove the axes 23 | ax = fig.add_subplot(111) 24 | ax.axes.get_xaxis().set_visible(False) 25 | ax.axes.get_yaxis().set_visible(False) 26 | ax.set_frame_on(False) 27 | 28 | # get melss 29 | melss = librosa.feature.melspectrogram(y=x, sr=sr) 30 | librosa.display.specshow(librosa.power_to_db(melss, ref=np.max), y_axis='linear') 31 | 32 | # save plot as jpg 33 | plt.savefig(new_name, dpi=500, bbox_inches='tight', pad_inches=0) 34 | plt.close() 35 | 36 | 37 | # prepare the cnn dataset of images 38 | def prepare_dataset() -> None: 39 | # get training and testing splits 40 | voice = os.path.join(root_dir, 'voice_detect/data/voice/') 41 | not_voice = os.path.join(root_dir, 'voice_detect/data/not_voice/') 42 | train, test = create_splits(voice, not_voice) 43 | 44 | voice_train = os.path.join(root_dir, 'voice_detect/data/plots/train/voice/') 45 | not_voice_train = os.path.join(root_dir, 'voice_detect/data/plots/train/not_voice/') 46 | voice_test = os.path.join(root_dir, 'voice_detect/data/plots/test/voice/') 47 | not_voice_test = os.path.join(root_dir, 'voice_detect/data/plots/test/not_voice/') 48 | 49 | create_dir(voice_train) 50 | create_dir(not_voice_train) 51 | create_dir(voice_test) 52 | create_dir(not_voice_test) 53 | 54 | # iterate through the training split 55 | for file in train: 56 | try: 57 | print('Making train plot for: ' + file) 58 | if 'not_voice' in file: 59 | wav_name = os.path.basename(file) 60 | wav_name = 
os.path.splitext(wav_name) 61 | 62 | # construct the new jpg file name with the extension 63 | jpg_file_name = str(wav_name[0]) + '.jpg' 64 | jpg_file_name = str(not_voice_train + jpg_file_name) 65 | get_melss(file, jpg_file_name) 66 | else: 67 | wav_name = os.path.basename(file) 68 | wav_name = os.path.splitext(wav_name) 69 | 70 | # construct the new jpg file name with the extension 71 | jpg_file_name = str(wav_name[0]) + '.jpg' 72 | jpg_file_name = str(voice_train + jpg_file_name) 73 | get_melss(file, jpg_file_name) 74 | 75 | except Exception: 76 | print('ERROR at ' + file + ' CONTINUING ...') 77 | pass 78 | 79 | # iterate through the testing split 80 | for file in test: 81 | try: 82 | print('Making test plot for: ' + file) 83 | if 'not_voice' in file: 84 | wav_name = os.path.basename(file) 85 | wav_name = os.path.splitext(wav_name) 86 | 87 | # construct the new jpg file name with the extension 88 | jpg_file_name = str(wav_name[0]) + '.jpg' 89 | jpg_file_name = str(not_voice_test + jpg_file_name) 90 | get_melss(file, jpg_file_name) 91 | else: 92 | wav_name = os.path.basename(file) 93 | wav_name = os.path.splitext(wav_name) 94 | 95 | # construct the new jpg file name with the extension 96 | jpg_file_name = str(wav_name[0]) + '.jpg' 97 | jpg_file_name = str(voice_test + jpg_file_name) 98 | get_melss(file, jpg_file_name) 99 | 100 | except Exception: 101 | print('ERROR at ' + file + ' CONTINUING ...') 102 | pass 103 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Voice Detection 2 | === 3 | 4 | This project tackles a binary audio classification problem: detecting human voices in audio recordings. It uses a feed forward neural network and a convolutional neural network that work together as a voting classifier to improve accuracy on unseen data. 5 | 6 | All neural networks were implemented in PyTorch, the audio utilities were implemented using Librosa, and the whole project is written in Python 3.8.5. The non-voice data for this project was obtained from the [Urban Sound Classification practice problem from Analytics Vidhya](https://datahack.analyticsvidhya.com/contest/practice-problem-urban-sound-classification/) and the voice recordings were taken from the [LibriSpeech ASR corpus](https://www.openslr.org/12). 7 | 8 | Data Preparation 9 | --- 10 | 11 | To get the dataset, download both sections from the above links. I rearranged the dataset by placing all non-voice recordings in one directory and all voice recordings in another: 12 | 13 | ```bash 14 | ├── data 15 | │ ├── voice 16 | │ │ ├── rec1.wav 17 | │ │ ├── rec2.wav 18 | │ │ ├── ... 19 | │ ├── not_voice 20 | │ │ ├── rec1.wav 21 | │ │ ├── rec2.wav 22 | │ │ ├── ... 23 | ``` 24 | 25 | The `librispeech.py` script can be used to transform the LibriSpeech dataset into the above file structure. All other files that remain can be deleted. Ignore this if you are not using the LibriSpeech dataset. 26 | 27 | The feed forward neural network takes Mel-frequency cepstral coefficients, a chromagram, a mel-scaled spectrogram, spectral contrast, and tonal centroid (tonnetz) features as input. The Librosa Python library computes all of these features in the `apply_transforms()` function in `utils/ffnn_utils.py`. The `prep_ffnn_data.sh` shell script can be run to collect these features for every .wav file into one large csv file.
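The gist of that feature extraction is averaging each Librosa feature matrix over time and concatenating the results. Below is a simplified, standalone sketch of what `apply_transforms()` in `utils/ffnn_utils.py` computes; the helper name `extract_features` is made up for illustration:

```python
import numpy as np
import librosa

def extract_features(wav_file: str) -> np.ndarray:
    # load at librosa's default sample rate (22050 Hz) and precompute the STFT magnitude
    x, sr = librosa.load(wav_file, res_type='kaiser_fast')
    stft = np.abs(librosa.stft(x))

    # average each feature matrix over time so every file yields a fixed-length vector
    mfccs = np.mean(librosa.feature.mfcc(y=x, sr=sr, n_mfcc=40).T, axis=0)                      # 40 values
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)                      # 12 values
    melss = np.mean(librosa.feature.melspectrogram(y=x, sr=sr).T, axis=0)                       # 128 values
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T, axis=0)              # 7 values
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(x), sr=sr).T, axis=0)  # 6 values

    return np.concatenate([mfccs, chroma, melss, contrast, tonnetz])
```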
One entry will look like: 28 | ``` 29 | /home/hernanrazo/pythonProjects/voice_detect/data/voice/wav/6476-57446-0035.wav,-345.36868,111.94338,-14.379323,50.732822,-13.364564,-0.81879413,-8.603202,-11.860127,0.3794937,-11.088642,-0.5936078,-8.392053,-4.562349,2.390204,2.1774976,-1.8160796,-0.79248935,1.3244591,-3.649716,-2.789777,-3.483583,-2.1718845,-7.1207843,-4.646477,-2.145171,-5.4034863,1.1288224,2.6650674,6.1765018,8.234708,5.759141,8.06815,8.3969555,6.328495,5.646016,4.1650767,2.2295291,1.0025103,-0.5408073,-1.0010004,0.41421363,0.3886434,0.39528078,0.4243577,0.44586822,0.46885604,0.5323573,0.6273547,0.6136627,0.5895413,0.5854901,0.5013139,0.030495275,0.025378285,0.03576337,0.02598723,0.01064823,0.111760125,2.9451604,9.584426,10.904745,6.3265867,0.767396,0.14292528,0.19490032,0.85179096,5.0548906,16.62874,16.24418,15.094028,11.092577,5.3894925,1.5112005,0.42363763,0.42279428,0.6986573,1.1323513,1.4325676,1.5941,1.5745226,0.6854819,0.22188246,0.18639795,0.25544456,0.37152404,0.18624847,0.18722062,0.24387933,0.15841863,0.2312459,0.12505762,0.0896525,0.06176768,0.033809755,0.06561177,0.11577808,0.08457274,0.056273155,0.046364002,0.03207818,0.026625242,0.033034343,0.047393396,0.039878745,0.030250499,0.035353974,0.04822752,0.088709675,0.08721649,0.042465515,0.050014295,0.043818373,0.025141228,0.026777223,0.05408083,0.054930124,0.042547297,0.027444469,0.015712438,0.013818915,0.014640613,0.017465897,0.014250277,0.019179987,0.021202719,0.040190093,0.024158962,0.020575762,0.020575762,0.019340117,0.01956742,0.0073476452,0.012725379,0.016156813,0.007385745,0.008848519,0.0073545426,0.0060878447,0.007746159,0.011803486,0.00961405,0.011231303,0.012259503,0.008804519,0.008680856,0.008589337,0.0158784,0.015149302,0.0085100345,0.007378557,0.009641291,0.0066143535,0.0060657472,0.003713564,0.0021371976,0.0019380879,0.0013283227,0.0012585397,0.0009210656,0.0008644426,0.0008410996,0.00046661997,0.00033427356,0.00020592447,5.9694554e-05,1.1552337e-05,2.8310662e-06,4.4607148e-07,4.411787e-08,1.8092163e-09,1.2725149e-10,1.8920865e-10,1.2470465e-10,9.163159e-11,1.8638106e-10,2.1313133e-10,7.265922e-10,3.1799022e-10,8.475092e-10,7.542699e-10,1.6082426e-10,14.965268185360362,18.193254004666265,20.9569399219138,17.267001240479917,18.13293584976544,19.771650662276468,41.46849881683453,-0.019441445021252834,0.0061759247744320065,0.05519930844766153,0.004244935825248924,-0.004941592482226379,-0.005592662805732028,voice 30 | ``` 31 | Each individual value gets its own cell and the label (voice/not_voice) gets attached to the end. 32 | 33 | The convolutional neural network receives an image of the recording's mel-spectrogram as input. Each image is obtained using the Librosa library. The `prep_cnn_data.sh` shell script can be used to obtain a spectrogram for each audio recording. Example: 34 | 35 |

36 | ![mel-spectrogram of an example recording](example.jpg) 37 |
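Each plot is generated roughly as in `get_melss()` in `utils/cnn_utils.py`: the mel-spectrogram is rendered with the axes stripped and saved as a small .jpg. A condensed sketch of that step (the helper name `wav_to_melss_jpg` is just for illustration):

```python
import matplotlib.pyplot as plt
import numpy as np
import librosa
import librosa.display

def wav_to_melss_jpg(wav_file: str, jpg_name: str) -> None:
    # load the recording at its native sample rate
    x, sr = librosa.load(wav_file, sr=None, res_type='kaiser_fast')

    # tiny headless figure with the axes and frame removed
    fig = plt.figure(figsize=[1, 1])
    ax = fig.add_subplot(111)
    ax.set_axis_off()

    # mel-spectrogram in decibels, saved with no padding so only the plot area remains
    melss = librosa.feature.melspectrogram(y=x, sr=sr)
    librosa.display.specshow(librosa.power_to_db(melss, ref=np.max), y_axis='linear')
    plt.savefig(jpg_name, dpi=500, bbox_inches='tight', pad_inches=0)
    plt.close(fig)
```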
38 | All images are later resized to 32x32 and converted to tensors for training. 39 | 40 | Both shell scripts automatically split the data into training and testing sets in an 80/20 ratio. 41 | 42 | Neural Network Architectures 43 | --- 44 | The feed forward neural network consists of three linear layers with ReLU activation and dropout, followed by a final linear layer that returns two values, one for each class, and passes the result through a sigmoid activation for the final output. Training uses cross entropy loss with an Adam optimizer and runs for 70 epochs with a batch size of 32 and a learning rate of 0.005. 45 | 46 | The convolutional neural network consists of three convolution layers with max pooling, batch normalization, and dropout, followed by two linear layers with ReLU activation and dropout, a third linear layer with ReLU but no dropout, and an output layer that gives two values, one for each class. No output activation is applied since the cross entropy loss function has softmax built in. Training uses cross entropy loss and Adam optimization and runs for 250 epochs with a batch size of 16 and a learning rate of 0.01. 47 | 48 | Performance 49 | --- 50 | Performance for both models can be measured using the `eval.py` script in each model's respective directory. The script takes the file path to the model (relative to `saved_models/`) as a command line argument and returns the accuracy score, confusion matrix, per-class accuracy, and classification report for the model in question. 51 | 52 | For the feed forward neural network: 53 | ``` 54 | python eval.py FFNN/train-08-29-2021/FFNN-08-29-2021.pt 55 | ``` 56 | And for the convolutional neural network: 57 | ``` 58 | python eval.py CNN/train-04-20-2021/CNN-04-20-2021.pt 59 | ``` 60 | 61 | For the feed forward neural network, I obtained 0.9758 accuracy. For the convolutional neural network, I obtained 0.977 accuracy. 62 | 63 | Voting Classifier Implementation 64 | --- 65 | The voting classifier is implemented in the `main.py` script. Similar to the evaluation scripts, this script takes two command line arguments: the path to the feed forward neural network and the path to the convolutional neural network. Example: 66 | ``` 67 | python main.py FFNN/train-07-14-2021/FFNN-07-14-2021.pt CNN/train-04-20-2021/CNN-04-20-2021.pt 68 | ``` 69 | The script first calculates the transforms needed by the feed forward neural network and then creates the spectrogram for the convolutional neural network. The spectrogram is resized to a 32x32 image and converted to a tensor. Each input is passed to its respective network and a score is returned. If both networks score the recording above 0.85 for the voice class, the result is deemed a voice; any lower score is deemed not a voice. 70 | 71 | Sources and Helpful Links 72 | --- 73 | https://www.telusinternational.com/articles/what-is-audio-classification#:~:text=Audio%20classification%20is%20the%20process,and%20text%20to%20speech%20applications.
74 | https://stackoverflow.com/questions/53290306/confusion-matrix-and-test-accuracy-for-pytorch-transfer-learning-tutorial 75 | https://librosa.org/doc/latest/index.html 76 | https://datahack.analyticsvidhya.com/contest/practice-problem-urban-sound-classification/ 77 | https://www.openslr.org/12 78 | -------------------------------------------------------------------------------- /utils/ffnn_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import threading 4 | import warnings 5 | import numpy as np 6 | import shutil 7 | from pydub import AudioSegment 8 | import librosa 9 | import torch 10 | from utils.gen_utils import create_splits 11 | 12 | warnings.filterwarnings('ignore', category=UserWarning) 13 | root_dir = str(os.path.abspath(os.path.join(os.getcwd(), os.pardir))) 14 | np.set_printoptions(suppress=True) 15 | 16 | 17 | # assign labels to wav files 18 | def get_label(file_path: str) -> None: 19 | if 'not_voice' in file_path: 20 | return 'not_voice' 21 | else: 22 | return 'voice' 23 | 24 | 25 | # apply transforms needed to prepare data 26 | def apply_transforms(wav_file: str) -> dict: 27 | 28 | # convert wav file to floating pont time series and get 29 | # default sample rate (22050) 30 | x, sr = librosa.load(wav_file, res_type='kaiser_fast') 31 | 32 | # get mel-frequency cepstral coefficients 33 | mfccs = np.mean(librosa.feature.mfcc(y=x, sr=sr, n_mfcc=40).T, axis=0) 34 | 35 | #get short-time fourier transform 36 | stft = np.abs(librosa.stft(x)) 37 | 38 | # get chromagram 39 | chromagram = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0) 40 | 41 | # get mel_scaled spectrogram 42 | melss = np.mean(librosa.feature.melspectrogram(x, sr=sr).T, axis=0) 43 | 44 | # get spectral contrast 45 | spec_contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T, axis=0) 46 | 47 | # get tonnetz 48 | tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(x), sr=sr).T, axis=0) 49 | 50 | return {'mfccs':mfccs, 'chromagram':chromagram, 'melss':melss, 'spec_contrast':spec_contrast, 'tonnetz':tonnetz} 51 | 52 | 53 | # convert transforms dictionary to a tensor 54 | def transforms_to_tensor(transforms: dict) -> list: 55 | transforms_list = [transforms['mfccs'][0], transforms['mfccs'][1], 56 | transforms['mfccs'][2], transforms['mfccs'][3], 57 | transforms['mfccs'][4], transforms['mfccs'][5], 58 | transforms['mfccs'][6], transforms['mfccs'][7], 59 | transforms['mfccs'][8], transforms['mfccs'][9], 60 | transforms['mfccs'][10], transforms['mfccs'][11], 61 | transforms['mfccs'][12], transforms['mfccs'][13], 62 | transforms['mfccs'][14], transforms['mfccs'][15], 63 | transforms['mfccs'][16], transforms['mfccs'][17], 64 | transforms['mfccs'][18], transforms['mfccs'][19], 65 | transforms['mfccs'][20], transforms['mfccs'][21], 66 | transforms['mfccs'][22], transforms['mfccs'][23], 67 | transforms['mfccs'][24], transforms['mfccs'][25], 68 | transforms['mfccs'][26], transforms['mfccs'][27], 69 | transforms['mfccs'][28], transforms['mfccs'][29], 70 | transforms['mfccs'][30], transforms['mfccs'][31], 71 | transforms['mfccs'][32], transforms['mfccs'][33], 72 | transforms['mfccs'][34], transforms['mfccs'][35], 73 | transforms['mfccs'][36], transforms['mfccs'][37], 74 | transforms['mfccs'][38], transforms['mfccs'][39], 75 | transforms['chromagram'][0], 76 | transforms['chromagram'][1], 77 | transforms['chromagram'][2], 78 | transforms['chromagram'][3], 79 | transforms['chromagram'][4], 80 | 
transforms['chromagram'][5], 81 | transforms['chromagram'][6], 82 | transforms['chromagram'][7], 83 | transforms['chromagram'][8], 84 | transforms['chromagram'][9], 85 | transforms['chromagram'][10], 86 | transforms['chromagram'][11], 87 | transforms['melss'][0], transforms['melss'][1], transforms['melss'][2], 88 | transforms['melss'][3], transforms['melss'][4], transforms['melss'][5], 89 | transforms['melss'][6], transforms['melss'][7], transforms['melss'][8], 90 | transforms['melss'][9], transforms['melss'][10], transforms['melss'][11], 91 | transforms['melss'][12], transforms['melss'][13], transforms['melss'][14], 92 | transforms['melss'][15], transforms['melss'][16], transforms['melss'][17], 93 | transforms['melss'][18], transforms['melss'][19], transforms['melss'][20], 94 | transforms['melss'][21], transforms['melss'][22], transforms['melss'][23], 95 | transforms['melss'][24], transforms['melss'][25], transforms['melss'][26], 96 | transforms['melss'][27], transforms['melss'][28], transforms['melss'][29], 97 | transforms['melss'][30], transforms['melss'][31], transforms['melss'][32], 98 | transforms['melss'][33], transforms['melss'][34], transforms['melss'][35], 99 | transforms['melss'][36], transforms['melss'][37], transforms['melss'][38], 100 | transforms['melss'][39], transforms['melss'][40], transforms['melss'][41], 101 | transforms['melss'][42], transforms['melss'][43], transforms['melss'][44], 102 | transforms['melss'][45], transforms['melss'][46], transforms['melss'][47], 103 | transforms['melss'][48], transforms['melss'][49], transforms['melss'][50], 104 | transforms['melss'][51], transforms['melss'][52], transforms['melss'][53], 105 | transforms['melss'][54], transforms['melss'][55], transforms['melss'][56], 106 | transforms['melss'][57], transforms['melss'][58], transforms['melss'][59], 107 | transforms['melss'][60], transforms['melss'][61], transforms['melss'][62], 108 | transforms['melss'][63], transforms['melss'][64], transforms['melss'][65], 109 | transforms['melss'][66], transforms['melss'][67], transforms['melss'][68], 110 | transforms['melss'][69], transforms['melss'][70], transforms['melss'][71], 111 | transforms['melss'][72], transforms['melss'][73], transforms['melss'][74], 112 | transforms['melss'][75], transforms['melss'][75], transforms['melss'][76], 113 | transforms['melss'][77], transforms['melss'][78], transforms['melss'][79], 114 | transforms['melss'][80], transforms['melss'][81], transforms['melss'][82], 115 | transforms['melss'][83], transforms['melss'][84], transforms['melss'][85], 116 | transforms['melss'][86], transforms['melss'][87], transforms['melss'][88], 117 | transforms['melss'][89], transforms['melss'][90], transforms['melss'][91], 118 | transforms['melss'][92], transforms['melss'][93], transforms['melss'][94], 119 | transforms['melss'][95], transforms['melss'][96], transforms['melss'][97], 120 | transforms['melss'][98], transforms['melss'][99], transforms['melss'][100], 121 | transforms['melss'][101], transforms['melss'][102], transforms['melss'][103], 122 | transforms['melss'][104], transforms['melss'][105], transforms['melss'][106], 123 | transforms['melss'][107], transforms['melss'][108], transforms['melss'][109], 124 | transforms['melss'][110], transforms['melss'][111], transforms['melss'][112], 125 | transforms['melss'][113], transforms['melss'][114], transforms['melss'][115], 126 | transforms['melss'][116], transforms['melss'][117], transforms['melss'][118], 127 | transforms['melss'][119], transforms['melss'][120], transforms['melss'][121], 
128 | transforms['melss'][122], transforms['melss'][123], transforms['melss'][124], 129 | transforms['melss'][125], transforms['melss'][126], transforms['melss'][127], 130 | transforms['spec_contrast'][0], 131 | transforms['spec_contrast'][1], 132 | transforms['spec_contrast'][2], 133 | transforms['spec_contrast'][3], 134 | transforms['spec_contrast'][4], 135 | transforms['spec_contrast'][5], 136 | transforms['spec_contrast'][6], 137 | transforms['tonnetz'][0], 138 | transforms['tonnetz'][1], 139 | transforms['tonnetz'][2], 140 | transforms['tonnetz'][3], 141 | transforms['tonnetz'][4], 142 | transforms['tonnetz'][5]] 143 | 144 | return torch.FloatTensor(transforms_list).to(device='cuda') 145 | 146 | 147 | # create one giant csv with all the tranforms data and the label of each wav file 148 | def get_csv(csv_name: str, data_split: list, annotations_path: str) -> None: 149 | with open(csv_name, mode='w', newline='') as f: 150 | writer = csv.writer(f) 151 | 152 | for filename in data_split: 153 | try: 154 | print(str(csv_name) + ' THREAD: Applying transform to: ' + filename) 155 | transforms = apply_transforms(filename) 156 | writer.writerow([filename, 157 | transforms['mfccs'][0], transforms['mfccs'][1], 158 | transforms['mfccs'][2], transforms['mfccs'][3], 159 | transforms['mfccs'][4], transforms['mfccs'][5], 160 | transforms['mfccs'][6], transforms['mfccs'][7], 161 | transforms['mfccs'][8], transforms['mfccs'][9], 162 | transforms['mfccs'][10], transforms['mfccs'][11], 163 | transforms['mfccs'][12], transforms['mfccs'][13], 164 | transforms['mfccs'][14], transforms['mfccs'][15], 165 | transforms['mfccs'][16], transforms['mfccs'][17], 166 | transforms['mfccs'][18], transforms['mfccs'][19], 167 | transforms['mfccs'][20], transforms['mfccs'][21], 168 | transforms['mfccs'][22], transforms['mfccs'][23], 169 | transforms['mfccs'][24], transforms['mfccs'][25], 170 | transforms['mfccs'][26], transforms['mfccs'][27], 171 | transforms['mfccs'][28], transforms['mfccs'][29], 172 | transforms['mfccs'][30], transforms['mfccs'][31], 173 | transforms['mfccs'][32], transforms['mfccs'][33], 174 | transforms['mfccs'][34], transforms['mfccs'][35], 175 | transforms['mfccs'][36], transforms['mfccs'][37], 176 | transforms['mfccs'][38], transforms['mfccs'][39], 177 | transforms['chromagram'][0], 178 | transforms['chromagram'][1], 179 | transforms['chromagram'][2], 180 | transforms['chromagram'][3], 181 | transforms['chromagram'][4], 182 | transforms['chromagram'][5], 183 | transforms['chromagram'][6], 184 | transforms['chromagram'][7], 185 | transforms['chromagram'][8], 186 | transforms['chromagram'][9], 187 | transforms['chromagram'][10], 188 | transforms['chromagram'][11], 189 | transforms['melss'][0], transforms['melss'][1], transforms['melss'][2], 190 | transforms['melss'][3], transforms['melss'][4], transforms['melss'][5], 191 | transforms['melss'][6], transforms['melss'][7], transforms['melss'][8], 192 | transforms['melss'][9], transforms['melss'][10], transforms['melss'][11], 193 | transforms['melss'][12], transforms['melss'][13], transforms['melss'][14], 194 | transforms['melss'][15], transforms['melss'][16], transforms['melss'][17], 195 | transforms['melss'][18], transforms['melss'][19], transforms['melss'][20], 196 | transforms['melss'][21], transforms['melss'][22], transforms['melss'][23], 197 | transforms['melss'][24], transforms['melss'][25], transforms['melss'][26], 198 | transforms['melss'][27], transforms['melss'][28], transforms['melss'][29], 199 | transforms['melss'][30], 
transforms['melss'][31], transforms['melss'][32], 200 | transforms['melss'][33], transforms['melss'][34], transforms['melss'][35], 201 | transforms['melss'][36], transforms['melss'][37], transforms['melss'][38], 202 | transforms['melss'][39], transforms['melss'][40], transforms['melss'][41], 203 | transforms['melss'][42], transforms['melss'][43], transforms['melss'][44], 204 | transforms['melss'][45], transforms['melss'][46], transforms['melss'][47], 205 | transforms['melss'][48], transforms['melss'][49], transforms['melss'][50], 206 | transforms['melss'][51], transforms['melss'][52], transforms['melss'][53], 207 | transforms['melss'][54], transforms['melss'][55], transforms['melss'][56], 208 | transforms['melss'][57], transforms['melss'][58], transforms['melss'][59], 209 | transforms['melss'][60], transforms['melss'][61], transforms['melss'][62], 210 | transforms['melss'][63], transforms['melss'][64], transforms['melss'][65], 211 | transforms['melss'][66], transforms['melss'][67], transforms['melss'][68], 212 | transforms['melss'][69], transforms['melss'][70], transforms['melss'][71], 213 | transforms['melss'][72], transforms['melss'][73], transforms['melss'][74], 214 | transforms['melss'][75], transforms['melss'][75], transforms['melss'][76], 215 | transforms['melss'][77], transforms['melss'][78], transforms['melss'][79], 216 | transforms['melss'][80], transforms['melss'][81], transforms['melss'][82], 217 | transforms['melss'][83], transforms['melss'][84], transforms['melss'][85], 218 | transforms['melss'][86], transforms['melss'][87], transforms['melss'][88], 219 | transforms['melss'][89], transforms['melss'][90], transforms['melss'][91], 220 | transforms['melss'][92], transforms['melss'][93], transforms['melss'][94], 221 | transforms['melss'][95], transforms['melss'][96], transforms['melss'][97], 222 | transforms['melss'][98], transforms['melss'][99], transforms['melss'][100], 223 | transforms['melss'][101], transforms['melss'][102], transforms['melss'][103], 224 | transforms['melss'][104], transforms['melss'][105], transforms['melss'][106], 225 | transforms['melss'][107], transforms['melss'][108], transforms['melss'][109], 226 | transforms['melss'][110], transforms['melss'][111], transforms['melss'][112], 227 | transforms['melss'][113], transforms['melss'][114], transforms['melss'][115], 228 | transforms['melss'][116], transforms['melss'][117], transforms['melss'][118], 229 | transforms['melss'][119], transforms['melss'][120], transforms['melss'][121], 230 | transforms['melss'][122], transforms['melss'][123], transforms['melss'][124], 231 | transforms['melss'][125], transforms['melss'][126], transforms['melss'][127], 232 | transforms['spec_contrast'][0], 233 | transforms['spec_contrast'][1], 234 | transforms['spec_contrast'][2], 235 | transforms['spec_contrast'][3], 236 | transforms['spec_contrast'][4], 237 | transforms['spec_contrast'][5], 238 | transforms['spec_contrast'][6], 239 | transforms['tonnetz'][0], 240 | transforms['tonnetz'][1], 241 | transforms['tonnetz'][2], 242 | transforms['tonnetz'][3], 243 | transforms['tonnetz'][4], 244 | transforms['tonnetz'][5], 245 | get_label(filename)]) 246 | except Exception: 247 | print(str(csv_name) + ' THREAD: ERROR AT ' + filename + '. CONTINUING ...') 248 | pass 249 | 250 | shutil.move(str(csv_name), str(root_dir + annotations_path)) 251 | # end function 252 | 253 | 254 | # prepare raw .wav files to the csv dataframe needed. This includes splitting the data into training and testing, 255 | # applying the transforms, and saving as a csv. 
This function is used in the prepare_data.sh script 256 | def prepare_dataset() -> None: 257 | 258 | annotations_path = '/voice_detect/data/annotations' 259 | voice_wavs = str(root_dir + '/voice_detect/data/voice/wav/') 260 | not_voice_wavs = str(root_dir + '/voice_detect/data/not_voice/wav/') 261 | 262 | print('Creating splits ...\n') 263 | train, test = create_splits(voice_wavs, not_voice_wavs) 264 | 265 | # start two threads, one to crate the training csv and one for the testing csv 266 | train_thread = threading.Thread(target=get_csv, args=('train.csv', train, annotations_path)) 267 | test_thread = threading.Thread(target=get_csv, args=('test.csv', test, annotations_path)) 268 | 269 | train_thread.start() 270 | test_thread.start() 271 | train_thread.join() 272 | test_thread.join() 273 | --------------------------------------------------------------------------------