├── speech-accent-recognition
│   ├── models
│   │   └── model1.h5
│   ├── __pycache__
│   │   ├── accuracy.cpython-36.pyc
│   │   └── getsplit.cpython-36.pyc
│   └── src
│       ├── __pycache__
│       │   ├── __init__.cpython-36.pyc
│       │   ├── accuracy.cpython-36.pyc
│       │   └── getsplit.cpython-36.pyc
│       ├── accuracy.py
│       ├── getaudio.py
│       ├── getsplit.py
│       ├── fromwebsite.py
│       └── trainmodel.py
└── README.md

/speech-accent-recognition/models/model1.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yatharthgarg/Speech-Accent-Recognition/HEAD/speech-accent-recognition/models/model1.h5
--------------------------------------------------------------------------------
/speech-accent-recognition/__pycache__/accuracy.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yatharthgarg/Speech-Accent-Recognition/HEAD/speech-accent-recognition/__pycache__/accuracy.cpython-36.pyc
--------------------------------------------------------------------------------
/speech-accent-recognition/__pycache__/getsplit.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yatharthgarg/Speech-Accent-Recognition/HEAD/speech-accent-recognition/__pycache__/getsplit.cpython-36.pyc
--------------------------------------------------------------------------------
/speech-accent-recognition/src/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yatharthgarg/Speech-Accent-Recognition/HEAD/speech-accent-recognition/src/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/speech-accent-recognition/src/__pycache__/accuracy.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yatharthgarg/Speech-Accent-Recognition/HEAD/speech-accent-recognition/src/__pycache__/accuracy.cpython-36.pyc
--------------------------------------------------------------------------------
/speech-accent-recognition/src/__pycache__/getsplit.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yatharthgarg/Speech-Accent-Recognition/HEAD/speech-accent-recognition/src/__pycache__/getsplit.cpython-36.pyc
--------------------------------------------------------------------------------
/speech-accent-recognition/src/accuracy.py:
--------------------------------------------------------------------------------
from collections import Counter
import numpy as np

def predict_class_audio(MFCCs, model):
    '''
    Predict class based on MFCC samples
    :param MFCCs: Numpy array of MFCCs
    :param model: Trained model
    :return: Predicted class of MFCC segment group
    '''
    MFCCs = MFCCs.reshape(MFCCs.shape[0], MFCCs.shape[1], MFCCs.shape[2], 1)
    y_predicted = model.predict_classes(MFCCs, verbose=0)
    # Majority vote across all segments of the audio sample
    return(Counter(list(y_predicted)).most_common(1)[0][0])


def predict_prob_class_audio(MFCCs, model):
    '''
    Predict class based on MFCC samples' probabilities
    :param MFCCs: Numpy array of MFCCs
    :param model: Trained model
    :return: Predicted class of MFCC segment group
    '''
    MFCCs = MFCCs.reshape(MFCCs.shape[0], MFCCs.shape[1], MFCCs.shape[2], 1)
    y_predicted = model.predict_proba(MFCCs, verbose=0)
    # Sum segment probabilities and pick the most likely class overall
    return(np.argmax(np.sum(y_predicted, axis=0)))

def predict_class_all(X_train, model):
    '''
    :param X_train: List of segmented MFCCs
    :param model: Trained model
    :return: List of predictions
    '''
    predictions = []
    for mfcc in X_train:
        predictions.append(predict_class_audio(mfcc, model))
        # predictions.append(predict_prob_class_audio(mfcc, model))
    return predictions

def confusion_matrix(y_predicted, y_test):
    '''
    Create confusion matrix
    :param y_predicted: list of predictions
    :param y_test: numpy array of shape (len(y_test), number of classes); one-hot rows with a 1 at the index of the actual class, 0 elsewhere
    :return: numpy array, confusion matrix
    '''
    confusion_matrix = np.zeros((len(y_test[0]), len(y_test[0])), dtype=int)
    for index, predicted in enumerate(y_predicted):
        confusion_matrix[np.argmax(y_test[index])][predicted] += 1
    return(confusion_matrix)

def get_accuracy(y_predicted, y_test):
    '''
    Get accuracy
    :param y_predicted: numpy array of predictions
    :param y_test: numpy array of actual classes (one-hot)
    :return: accuracy
    '''
    c_matrix = confusion_matrix(y_predicted, y_test)
    return(np.sum(c_matrix.diagonal()) / float(np.sum(c_matrix)))

if __name__ == '__main__':
    pass
--------------------------------------------------------------------------------
/speech-accent-recognition/src/getaudio.py:
--------------------------------------------------------------------------------
import pandas as pd
import urllib.request
import os
import sys
import time
from pydub import AudioSegment

class GetAudio:

    def __init__(self, csv_filepath, destination_folder='audio/', wait=1.5, debug=False):
        '''
        Initializes a GetAudio object from the metadata CSV at csv_filepath
        :param destination_folder (str): Folder where audio files will be saved
        :param wait (float): Length (in seconds) between web requests
        :param debug (bool): Outputs status indicators to console when True
        '''
        self.csv_filepath = csv_filepath
        self.audio_df = pd.read_csv(csv_filepath)
        self.url = 'http://chnm.gmu.edu/accent/soundtracks/{}.mp3'
        self.destination_folder = destination_folder
        self.wait = wait
        self.debug = debug

    def check_path(self):
        '''
        Checks if self.destination_folder exists; if it does not, it is created
        '''
        if not os.path.exists('../' + self.destination_folder):
            if self.debug:
                print('{} does not exist, creating'.format(self.destination_folder))
            os.makedirs('../' + self.destination_folder)

    def get_audio(self):
        '''
        Retrieves all audio files from the 'language_num' column of self.audio_df
        If an audio file already exists, it is skipped
        :return (int): Number of audio files downloaded
        '''

        self.check_path()

        counter = 0

        for lang_num in self.audio_df['language_num']:
            if not os.path.exists('../' + self.destination_folder + '{}.wav'.format(lang_num)):
                if self.debug:
                    print('downloading {}'.format(lang_num))
                (filename, headers) = urllib.request.urlretrieve(self.url.format(lang_num))
                sound = AudioSegment.from_mp3(filename)
                sound.export('../' + self.destination_folder + "{}.wav".format(lang_num), format="wav")
                counter += 1
                time.sleep(self.wait)  # pause between web requests, as documented above

        return counter

if __name__ == '__main__':
    '''
    Example console command
    python getaudio.py bio_data.csv
    '''
    csv_file = sys.argv[1]
    ga = GetAudio(csv_filepath=csv_file)
    ga.get_audio()
--------------------------------------------------------------------------------
/speech-accent-recognition/src/getsplit.py:
--------------------------------------------------------------------------------
import pandas as pd
import sys
from sklearn.model_selection import train_test_split




# def filter_df(df):
#     '''
#     Function to filter audio files based on df columns
#     df column options: [age,age_of_english_onset,age_sex,birth_place,english_learning_method,
#     english_residence,length_of_english_residence,native_language,other_languages,sex]
#     :param df (DataFrame): Full unfiltered DataFrame
#     :return (DataFrame): Filtered DataFrame
#     '''
#
#     # Example to filter arabic, mandarin, and english and limit to 73 audio files
#     arabic = df[df['native_language'] == 'arabic']
#     mandarin = df[df['native_language'] == 'mandarin']
#     english = df[df.native_language == 'english'][:73]
#     mandarin = mandarin[mandarin.length_of_english_residence < 10][:73]
#     arabic = arabic[arabic.length_of_english_residence < 10][:73]
#
#     df = english.append(arabic)
#     df = df.append(mandarin)
#
#     return df

def filter_df(df):
    '''
    Function to filter audio files based on df columns
    df column options: [age, age_of_english_onset, age_sex, birth_place, english_learning_method,
    english_residence, length_of_english_residence, native_language, other_languages, sex]
    :param df (DataFrame): Full unfiltered DataFrame
    :return (DataFrame): Filtered DataFrame
    '''

    arabic = df[df.native_language == 'arabic']
    mandarin = df[df.native_language == 'mandarin']
    english = df[df.native_language == 'english']

    mandarin = mandarin[mandarin.length_of_english_residence < 10]
    arabic = arabic[arabic.length_of_english_residence < 10]

    # Keep only the three filtered accent groups; appending to the
    # unfiltered df would retain every language in the archive
    df = english.append(arabic)
    df = df.append(mandarin)

    return df

def split_people(df, test_size=0.2):
    '''
    Create train test split of DataFrame
    :param df (DataFrame): Pandas DataFrame of audio files to be split
    :param test_size (float): Fraction of files placed in the test set
    :return (tuple): X_train, X_test from df['language_num']; y_train, y_test from df['native_language']
    '''


    return train_test_split(df['language_num'], df['native_language'], test_size=test_size, random_state=1234)


if __name__ == '__main__':
    '''
    Console command example:
    python getsplit.py bio_data.csv
    '''

    csv_file = sys.argv[1]
    df = pd.read_csv(csv_file)
    filtered_df = filter_df(df)
    print(split_people(filtered_df))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Speech-Accent-Recognition

### About
Every individual has their own dialect and manner of speaking. This project detects the background of a speaker from their speech: the goal is to classify accents, specifically foreign accents, by the native language of the speaker. By comparing speech samples against the Speech Accent Archive dataset, the project identifies the demographic and linguistic backgrounds of speakers and determines which variables are key predictors of each accent. The Speech Accent Archive demonstrates that accents are systematic rather than merely mistaken speech. Given a recording of a speaker reading a known script of English words, this project predicts the speaker's native language.

### Dataset
All of the speech files used for this project come from the Speech Accent Archive, a repository of spoken English hosted by George Mason University. Over 2000 speakers representing over 100 native languages read a common elicitation paragraph in English:

```
'Please call Stella. Ask her to bring these things with her from the store: Six spoons of fresh
snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob. We also need
a small plastic snake and a big toy frog for the kids. She can scoop these things into three red
bags, and we will go meet her Wednesday at the train station.'
```

The shared elicitation paragraph makes the dataset ideal for studying accent: the wording is fixed, and the recording quality is (nearly) uniform across speakers. Although the dataset was large in size (approximately 2 GB), the number of samples per language was small, so I worked mainly on the three most represented accents: English, Mandarin, and Arabic.

The dataset contains **.mp3** audio files, which were converted to **.wav** files to allow easy extraction of **MFCC (Mel Frequency Cepstral Coefficient)** features for building a 2-D convolutional neural network.

The MFCCs were fed into a 2-Dimensional Convolutional Neural Network (CNN) to predict the native language class; a sketch of the feature-extraction step is shown below.
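As a rough illustration of this pipeline (not a file in this repository), the sketch below loads one converted `.wav` file and extracts its MFCCs with librosa, mirroring the `RATE = 24000` and `N_MFCC = 13` constants used in `trainmodel.py`; the file path is only a placeholder.

```
import librosa

RATE = 24000   # target sample rate used in trainmodel.py
N_MFCC = 13    # number of MFCC coefficients extracted per frame

def wav_to_mfcc(path):
    # Load the audio and resample it to the common rate
    y, sr = librosa.load(path)
    y = librosa.core.resample(y=y, orig_sr=sr, target_sr=RATE, scale=True)
    # Returns a (13, n_frames) matrix; trainmodel.py later slices this
    # into fixed-width segments before feeding the CNN
    return librosa.feature.mfcc(y=y, sr=RATE, n_mfcc=N_MFCC)

mfcc = wav_to_mfcc('../audio/english1.wav')  # placeholder file name
print(mfcc.shape)
```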
### Dependencies
• [Python 3.x](https://www.python.org/downloads/)

• [Keras](https://keras.io/)

• [Numpy](http://www.numpy.org/)

• [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/)

• [Pydub](https://github.com/jiaaro/pydub)

• [Scikit-learn](http://scikit-learn.org/stable/)

• [Librosa](http://librosa.github.io/librosa/)

### Execution
To execute the code, please have all the dependencies installed on your system. Then change to the src directory of the code and run the following Python commands:

• To download language metadata from [The Speech Accent Archive](http://accent.gmu.edu/index.php) into bio_data.csv:
```
python fromwebsite.py bio_data.csv mandarin english arabic
```
• Run getaudio.py to download audio files to the audio directory. All audio files listed in bio_data.csv will be downloaded.
```
python getaudio.py bio_data.csv
```
• Run trainmodel.py to train the CNN and save the trained model.
```
python trainmodel.py bio_data.csv model5
```
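After training, the saved model can be reloaded for predictions on new recordings. The snippet below is a minimal sketch, not a file in this repository: it assumes a model saved as `models/model5.h5` and a `segments` array shaped like the training input, `(n_segments, 13, 30)`, and mirrors the majority-vote logic of `src/accuracy.py`.

```
from collections import Counter
from keras.models import load_model

# Assumed save path produced by trainmodel.py's save_model()
model = load_model('../models/model5.h5')

def predict_accent(segments, model):
    # segments: numpy array of shape (n_segments, 13, 30),
    # as produced by segment_one() in trainmodel.py
    X = segments.reshape(segments.shape[0], segments.shape[1], segments.shape[2], 1)
    classes = model.predict_classes(X, verbose=0)  # per-segment predictions
    # Majority vote across segments, as in accuracy.predict_class_audio
    return Counter(list(classes)).most_common(1)[0][0]
```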
### Results
Using 13 MFCC features per sample fed into the 2-D CNN, the project achieved an overall accuracy of **over 75%**.
The final per-language accuracies are:

• 96.6% when English samples were given

• 65% when Arabic samples were given

• 50% when Mandarin samples were given

### References
• [Morgan Bryant, Amanda Chow & Sydney Li, "Classification of Accents of English Speakers by Native Language"](http://cs229.stanford.edu/proj2014/Morgan%20Bryant,%20Amanda%20Chow,%20Sydney%20Li,%20Classification%20of%20Accents%20of%20English%20Speakers%20by%20Native%20Language.pdf)
--------------------------------------------------------------------------------
/speech-accent-recognition/src/fromwebsite.py:
--------------------------------------------------------------------------------
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import sys
import re

ROOT_URL = 'http://accent.gmu.edu/'
BROWSE_LANGUAGE_URL = 'browse_language.php?function=find&language={}'
WAIT = 1.2
DEBUG = True

def get_htmls(urls):
    '''
    Retrieves html in text form from ROOT_URL
    :param urls (list): List of urls from which to retrieve html
    :return (list): list of HTML strings
    '''
    htmls = []
    for url in urls:
        if DEBUG:
            print('downloading from {}'.format(url))
        htmls.append(requests.get(url).text)
        time.sleep(WAIT)

    return(htmls)


def build_search_urls(languages):
    '''
    creates urls from ROOT_URL and languages
    :param languages (list): List of languages
    :return (list): List of urls
    '''
    return([ROOT_URL + BROWSE_LANGUAGE_URL.format(language) for language in languages])

def parse_p(p_tag):
    '''
    Extracts the link and speaker info from an HTML <p> tag
    :param p_tag: BeautifulSoup <p> tag from a search-result page
    :return (list): [url, language_num, sex]
    '''
    text = p_tag.text.replace(' ', '').split(',')
    return([ROOT_URL + p_tag.a['href'], text[0], text[1]])

def get_bio(hrefs):
    '''
    Retrieves HTML from list of hrefs and returns bio information
    :param hrefs (list): list of hrefs
    :return (DataFrame): Pandas DataFrame with bio information
    '''

    htmls = get_htmls(hrefs)
    bss = [BeautifulSoup(html, 'html.parser') for html in htmls]
    rows = []
    bio_row = []
    for bs in bss:
        rows.append([li.text for li in bs.find('ul', 'bio').find_all('li')])
    for row in rows:
        bio_row.append(parse_bio(row))

    return(pd.DataFrame(bio_row))

def parse_bio(row):
    '''
    Parse bio data from row string
    :param row (list): Unparsed bio strings
    :return (list): Bio columns
    '''
    cols = []
    for col in row:
        try:
            # Keep everything after the first ':', with spaces stripped
            tmp_col = re.search(r":(.+)", col.replace(' ', '')).group(1)
        except:
            tmp_col = col
        cols.append(tmp_col)
    return(cols)


def create_dataframe(languages):
    '''
    :param languages (list): languages to search for
    :return df (DataFrame): DataFrame that contains all audio metadata from the searched languages
    '''
    htmls = get_htmls(build_search_urls(languages))
    bss = [BeautifulSoup(html, 'html.parser') for html in htmls]
    persons = []

    for bs in bss:
        for p in bs.find_all('p'):
            if p.a:
                persons.append(parse_p(p))

    df = pd.DataFrame(persons, columns=['href', 'language_num', 'sex'])

    bio_rows = get_bio(df['href'])

    if DEBUG:
        print('loading finished')

    df['birth_place'] = bio_rows.iloc[:, 0]
    df['native_language'] = bio_rows.iloc[:, 1]
    df['other_languages'] = bio_rows.iloc[:, 2]
    df['age_sex'] = bio_rows.iloc[:, 3]
    df['age_of_english_onset'] = bio_rows.iloc[:, 4]
    df['english_learning_method'] = bio_rows.iloc[:, 5]
    df['english_residence'] = bio_rows.iloc[:, 6]
    df['length_of_english_residence'] = bio_rows.iloc[:, 7]

    df['birth_place'] = df['birth_place'].apply(lambda x: x[:-6].split(' ')[-2:])
    # print(df['birth_place'])
    # df['birth_place'] = lambda x: x[:-6].split(' ')[2:], df['birth_place']
    df['native_language'] = df['native_language'].apply(lambda x: x.split(' ')[2])
    # print(df['native_language'])
    # df['native_language'] = lambda x: x.split(' ')[2], df['native_language']
    df['other_languages'] = df['other_languages'].apply(lambda x: x.split(' ')[2:])
    # print(df['other_languages'])
    # df['other_languages'] = lambda x: x.split(' ')[2:], df['other_languages']
    df['age_sex'], df['age'] = df['age_sex'].apply(lambda x: x.split(' ')[2:]), df['age_sex'].apply(lambda x: x.replace('sex:', '').split(',')[1])
    # print(df['age'])
    # df['age_sex'] = lambda x: x.split(' ')[2], df['age_sex']
    # df['age_of_english_onset'] = lambda x: float(x.split(' ')[-1]), df['age_of_english_onset']
    df['age_of_english_onset'] = df['age_of_english_onset'].apply(lambda x: float(x.split(' ')[-1]))
    # print(df['age_of_english_onset'])
    # df['english_learning_method'] = lambda x: x.split(' ')[-1], df['english_learning_method']
    df['english_learning_method'] = df['english_learning_method'].apply(lambda x: x.split(' ')[-1])
    # print(df['english_learning_method'])
    # df['english_residence'] = lambda x: x.split(' ')[2:], df['english_residence']
    df['english_residence'] = df['english_residence'].apply(lambda x: x.split(' ')[2:])
    # print(df['english_residence'])
    # df['length_of_english_residence'] = lambda x: float(x.split(' ')[-2]), df['length_of_english_residence']
    df['length_of_english_residence'] = df['length_of_english_residence'].apply(lambda x: float(x.split(' ')[-2]))
    # print(df['length_of_english_residence'])

    # df['age'] = lambda x: x.replace(' ','').split(',')[0], df['age_sex']

    return(df)

if __name__ == '__main__':
    '''
    console command example:
    python fromwebsite.py bio_metadata.csv mandarin english arabic
    '''

    df = None

    # Set destination file
    destination_file = sys.argv[1]

    # If no language arguments are given, use 'mandarin' as the default
    if len(sys.argv) > 2:
        languages = sys.argv[2:]
    else:
        languages = ['mandarin']

    # Check if destination file exists, else create a new one
    try:
        df = pd.read_csv(destination_file)
        df = df.append(create_dataframe(languages=languages), ignore_index=True)
    except:
        df = create_dataframe(languages=languages)


    df.drop_duplicates(subset='language_num', inplace=True)

    df.to_csv(destination_file, index=False)
--------------------------------------------------------------------------------
/speech-accent-recognition/src/trainmodel.py:
--------------------------------------------------------------------------------
import pandas as pd
from collections import Counter
import sys
sys.path.append('../speech-accent-recognition/src')
import getsplit

from keras import utils
import accuracy
import multiprocessing
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import MaxPooling2D, Conv2D
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, TensorBoard

DEBUG = True
SILENCE_THRESHOLD = .01
RATE = 24000
N_MFCC = 13
COL_SIZE = 30
EPOCHS = 10  # 35 # 250

def to_categorical(y):
    '''
    Converts list of languages into a binary class matrix
    :param y (list): list of languages
    :return (numpy array): binary class matrix
    '''
    lang_dict = {}
    for index, language in enumerate(sorted(set(y))):  # sorted for a stable class ordering across calls
        lang_dict[language] = index
    y = list(map(lambda x: lang_dict[x], y))
    return utils.to_categorical(y, len(lang_dict))

def get_wav(language_num):
    '''
    Loads a wav file from disk and down-samples it to RATE
    :param language_num (str): file name (without extension)
    :return (numpy array): Down-sampled wav data
    '''

    y, sr = librosa.load('../audio/{}.wav'.format(language_num))
    return(librosa.core.resample(y=y, orig_sr=sr, target_sr=RATE, scale=True))

def to_mfcc(wav):
    '''
    Converts a wav array to Mel Frequency Cepstral Coefficients
    :param wav (numpy array): Wav form
    :return (2d numpy array): MFCC
    '''
    return(librosa.feature.mfcc(y=wav, sr=RATE, n_mfcc=N_MFCC))

def remove_silence(wav, thresh=0.04, chunk=5000):
    '''
    Searches the wav form for segments of silence.
    If all values in a window of 'chunk' samples stay below 'thresh' in magnitude, that window is removed
    :param wav (np array): Wav array to be filtered
    :return (np array): Wav array with silence removed
    '''

    tf_list = []
    for x in range(len(wav) // chunk):  # integer division; '/' would raise a TypeError under Python 3
        if (np.any(wav[chunk * x:chunk * (x + 1)] >= thresh) or np.any(wav[chunk * x:chunk * (x + 1)] <= -thresh)):
            tf_list.extend([True] * chunk)
        else:
            tf_list.extend([False] * chunk)

    tf_list.extend((len(wav) - len(tf_list)) * [False])
    return(wav[tf_list])

def normalize_mfcc(mfcc):
    '''
    Normalize mfcc
    :param mfcc: MFCC array
    :return: min-max normalized MFCC array
    '''
    mms = MinMaxScaler()
    return(mms.fit_transform(np.abs(mfcc)))

def make_segments(mfccs, labels):
    '''
    Makes segments of mfccs and attaches them to the labels
    :param mfccs: list of mfccs
    :param labels: list of labels
    :return (tuple): Segments with labels
    '''
    segments = []
    seg_labels = []
    for mfcc, label in zip(mfccs, labels):
        for start in range(0, int(mfcc.shape[1] / COL_SIZE)):
            segments.append(mfcc[:, start * COL_SIZE:(start + 1) * COL_SIZE])
            seg_labels.append(label)
    return(segments, seg_labels)

def segment_one(mfcc):
    '''
    Creates segments from one MFCC image; a final segment shorter than COL_SIZE columns is dropped
    :param mfcc (numpy array): MFCC array
    :return (numpy array): Segmented MFCC array
    '''
    segments = []
    for start in range(0, int(mfcc.shape[1] / COL_SIZE)):
        segments.append(mfcc[:, start * COL_SIZE:(start + 1) * COL_SIZE])
    return(np.array(segments))

def create_segmented_mfccs(X_train):
    '''
    Creates segmented MFCCs from X_train
    :param X_train: list of MFCCs
    :return: segmented mfccs
    '''
    segmented_mfccs = []
    for mfcc in X_train:
        segmented_mfccs.append(segment_one(mfcc))
    return(segmented_mfccs)


def train_model(X_train, y_train, X_validation, y_validation, batch_size=128):  # 64
    '''
    Trains 2D convolutional neural network
    :param X_train: Numpy array of mfccs
    :param y_train: Binary matrix based on labels
    :return: Trained model
    '''

    # Get row, column, and class sizes
    rows = X_train[0].shape[0]
    cols = X_train[0].shape[1]
    val_rows = X_validation[0].shape[0]
    val_cols = X_validation[0].shape[1]
    num_classes = len(y_train[0])

    # input image dimensions to feed into 2D ConvNet Input layer
    input_shape = (rows, cols, 1)
    X_train = X_train.reshape(X_train.shape[0], rows, cols, 1)
    X_validation = X_validation.reshape(X_validation.shape[0], val_rows, val_cols, 1)


    print('X_train shape:', X_train.shape)
    print(X_train.shape[0], 'training samples')

    model = Sequential()

    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu',
                     data_format="channels_last",
                     input_shape=input_shape))

    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adadelta',
                  metrics=['accuracy'])

    # Stops training if accuracy does not improve by at least 0.005 over 10 epochs
    es = EarlyStopping(monitor='acc', min_delta=.005, patience=10, verbose=1, mode='auto')

    # Creates log file for graphical interpretation using TensorBoard
    tb = TensorBoard(log_dir='../logs', histogram_freq=0, batch_size=32, write_graph=True, write_grads=True,
                     write_images=True, embeddings_freq=0, embeddings_layer_names=None,
                     embeddings_metadata=None)

    # Image shifting
    datagen = ImageDataGenerator(width_shift_range=0.05)

    # Fit model using ImageDataGenerator
    model.fit_generator(datagen.flow(X_train, y_train, batch_size=batch_size),
                        steps_per_epoch=len(X_train) / batch_size,  # one pass over the training data per epoch
                        epochs=EPOCHS,
                        callbacks=[es, tb], validation_data=(X_validation, y_validation))

    return (model)

def save_model(model, model_filename):
    '''
    Save model to file
    :param model: Trained model to be saved
    :param model_filename: Filename
    :return: None
    '''
    model.save('../models/{}.h5'.format(model_filename))  # creates an HDF5 file under ../models/



############################################################




#######################################

if __name__ == '__main__':
    '''
    Console command example:
    python trainmodel.py bio_metadata.csv model50
    '''

    # Load arguments
    file_name = sys.argv[1]
    model_filename = sys.argv[2]

    # Load metadata
    df = pd.read_csv(file_name)


    # Filter metadata to retrieve only the desired files
    filtered_df = getsplit.filter_df(df)

    # Train test split
    X_train, X_test, y_train, y_test = getsplit.split_people(filtered_df)

    # Get statistics
    train_count = Counter(y_train)
    test_count = Counter(y_test)

    print("Entering main")

    # Baseline accuracy: always predicting the most common class in the test set
    acc_to_beat = test_count.most_common(1)[0][1] / float(np.sum(list(test_count.values())))

    # To categorical
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    # Get resampled wav files using multiprocessing
    if DEBUG:
        print('Loading wav files....')
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    X_train = pool.map(get_wav, X_train)
    X_test = pool.map(get_wav, X_test)

    # Convert to MFCC
    if DEBUG:
        print('Converting to MFCC....')
    X_train = pool.map(to_mfcc, X_train)
    X_test = pool.map(to_mfcc, X_test)

    # Create segments from MFCCs
    X_train, y_train = make_segments(X_train, y_train)
    X_validation, y_validation = make_segments(X_test, y_test)

    # Randomize training segments
    X_train, _, y_train, _ = train_test_split(X_train, y_train, test_size=0)

    # Train model
    model = train_model(np.array(X_train), np.array(y_train), np.array(X_validation), np.array(y_validation))

    # Make predictions on full X_test MFCCs
    y_predicted = accuracy.predict_class_all(create_segmented_mfccs(X_test), model)

    # Print statistics
    print('Training samples:', train_count)
    print('Testing samples:', test_count)
    print('Accuracy to beat:', acc_to_beat)
    print('Samples per actual class (confusion matrix row sums):\n', np.sum(accuracy.confusion_matrix(y_predicted, y_test), axis=1))
    print('Confusion matrix:\n', accuracy.confusion_matrix(y_predicted, y_test))
    print('Accuracy:', accuracy.get_accuracy(y_predicted, y_test))

    # Save model
    save_model(model, model_filename)
--------------------------------------------------------------------------------