├── speech-accent-recognition
│   ├── models
│   │   └── model1.h5
│   ├── __pycache__
│   │   ├── accuracy.cpython-36.pyc
│   │   └── getsplit.cpython-36.pyc
│   └── src
│       ├── __pycache__
│       │   ├── __init__.cpython-36.pyc
│       │   ├── accuracy.cpython-36.pyc
│       │   └── getsplit.cpython-36.pyc
│       ├── accuracy.py
│       ├── getaudio.py
│       ├── getsplit.py
│       ├── fromwebsite.py
│       └── trainmodel.py
└── README.md

/speech-accent-recognition/models/model1.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yatharthgarg/Speech-Accent-Recognition/HEAD/speech-accent-recognition/models/model1.h5
--------------------------------------------------------------------------------
/speech-accent-recognition/__pycache__/accuracy.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yatharthgarg/Speech-Accent-Recognition/HEAD/speech-accent-recognition/__pycache__/accuracy.cpython-36.pyc
--------------------------------------------------------------------------------
/speech-accent-recognition/__pycache__/getsplit.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yatharthgarg/Speech-Accent-Recognition/HEAD/speech-accent-recognition/__pycache__/getsplit.cpython-36.pyc
--------------------------------------------------------------------------------
/speech-accent-recognition/src/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yatharthgarg/Speech-Accent-Recognition/HEAD/speech-accent-recognition/src/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/speech-accent-recognition/src/__pycache__/accuracy.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yatharthgarg/Speech-Accent-Recognition/HEAD/speech-accent-recognition/src/__pycache__/accuracy.cpython-36.pyc
--------------------------------------------------------------------------------
/speech-accent-recognition/src/__pycache__/getsplit.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yatharthgarg/Speech-Accent-Recognition/HEAD/speech-accent-recognition/src/__pycache__/getsplit.cpython-36.pyc
--------------------------------------------------------------------------------
/speech-accent-recognition/src/accuracy.py:
--------------------------------------------------------------------------------
from collections import Counter
import numpy as np

def predict_class_audio(MFCCs, model):
    '''
    Predict class based on MFCC samples
    :param MFCCs: Numpy array of MFCCs
    :param model: Trained model
    :return: Predicted class of MFCC segment group
    '''
    MFCCs = MFCCs.reshape(MFCCs.shape[0], MFCCs.shape[1], MFCCs.shape[2], 1)
    y_predicted = model.predict_classes(MFCCs, verbose=0)
    # Majority vote across all segments of the audio sample
    return(Counter(list(y_predicted)).most_common(1)[0][0])


def predict_prob_class_audio(MFCCs, model):
    '''
    Predict class based on MFCC samples' probabilities
    :param MFCCs: Numpy array of MFCCs
    :param model: Trained model
    :return: Predicted class of MFCC segment group
    '''
    MFCCs = MFCCs.reshape(MFCCs.shape[0], MFCCs.shape[1], MFCCs.shape[2], 1)
    y_predicted = model.predict_proba(MFCCs, verbose=0)
    # Sum segment probabilities and pick the most likely class overall
    return(np.argmax(np.sum(y_predicted, axis=0)))

def predict_class_all(X_train, model):
    '''
    :param X_train: List of segmented MFCCs
    :param model: Trained model
    :return: List of predictions
    '''
    predictions = []
    for mfcc in X_train:
        predictions.append(predict_class_audio(mfcc, model))
        # predictions.append(predict_prob_class_audio(mfcc, model))
    return predictions

def confusion_matrix(y_predicted, y_test):
    '''
    Create confusion matrix
    :param y_predicted: list of predictions
    :param y_test: numpy array of shape (len(y_test), number of classes); one-hot rows with a 1 at the index of the actual class, 0 elsewhere
    :return: numpy array, confusion matrix
    '''
    confusion_matrix = np.zeros((len(y_test[0]), len(y_test[0])), dtype=int)
    for index, predicted in enumerate(y_predicted):
        confusion_matrix[np.argmax(y_test[index])][predicted] += 1
    return(confusion_matrix)

def get_accuracy(y_predicted, y_test):
    '''
    Get accuracy
    :param y_predicted: numpy array of predictions
    :param y_test: numpy array of actual classes (one-hot)
    :return: accuracy
    '''
    c_matrix = confusion_matrix(y_predicted, y_test)
    return(np.sum(c_matrix.diagonal()) / float(np.sum(c_matrix)))

if __name__ == '__main__':
    pass
--------------------------------------------------------------------------------
/speech-accent-recognition/src/getaudio.py:
--------------------------------------------------------------------------------
import pandas as pd
import urllib.request
import os
import sys
import time
from pydub import AudioSegment

class GetAudio:

    def __init__(self, csv_filepath, destination_folder='audio/', wait=1.5, debug=False):
        '''
        Initializes a GetAudio object from the metadata CSV at csv_filepath
        :param destination_folder (str): Folder where audio files will be saved
        :param wait (float): Length (in seconds) between web requests
        :param debug (bool): Outputs status indicators to console when True
        '''
        self.csv_filepath = csv_filepath
        self.audio_df = pd.read_csv(csv_filepath)
        self.url = 'http://chnm.gmu.edu/accent/soundtracks/{}.mp3'
        self.destination_folder = destination_folder
        self.wait = wait
        self.debug = debug

    def check_path(self):
        '''
        Checks if self.destination_folder exists; if it does not, it is created
        '''
        if not os.path.exists('../' + self.destination_folder):
            if self.debug:
                print('{} does not exist, creating'.format(self.destination_folder))
            os.makedirs('../' + self.destination_folder)

    def get_audio(self):
        '''
        Retrieves all audio files from the 'language_num' column of self.audio_df
        If an audio file already exists, it is skipped
        :return (int): Number of audio files downloaded
        '''

        self.check_path()

        counter = 0

        for lang_num in self.audio_df['language_num']:
            if not os.path.exists('../' + self.destination_folder + '{}.wav'.format(lang_num)):
                if self.debug:
                    print('downloading {}'.format(lang_num))
                (filename, headers) = urllib.request.urlretrieve(self.url.format(lang_num))
                sound = AudioSegment.from_mp3(filename)
                sound.export('../' + self.destination_folder + "{}.wav".format(lang_num), format="wav")
                counter += 1
                time.sleep(self.wait)  # pause between web requests, as documented above

        return counter

if __name__ == '__main__':
    '''
    Example console command
    python getaudio.py bio_data.csv
    '''
    csv_file = sys.argv[1]
    ga = GetAudio(csv_filepath=csv_file)
    ga.get_audio()
--------------------------------------------------------------------------------
/speech-accent-recognition/src/getsplit.py:
--------------------------------------------------------------------------------
import pandas as pd
import sys
from sklearn.model_selection import train_test_split




# def filter_df(df):
#     '''
#     Function to filter audio files based on df columns
#     df column options: [age,age_of_english_onset,age_sex,birth_place,english_learning_method,
#     english_residence,length_of_english_residence,native_language,other_languages,sex]
#     :param df (DataFrame): Full unfiltered DataFrame
#     :return (DataFrame): Filtered DataFrame
#     '''
#
#     # Example to filter arabic, mandarin, and english and limit to 73 audio files
#     arabic = df[df['native_language'] == 'arabic']
#     mandarin = df[df['native_language'] == 'mandarin']
#     english = df[df.native_language == 'english'][:73]
#     mandarin = mandarin[mandarin.length_of_english_residence < 10][:73]
#     arabic = arabic[arabic.length_of_english_residence < 10][:73]
#
#     df = english.append(arabic)
#     df = df.append(mandarin)
#
#     return df

def filter_df(df):
    '''
    Function to filter audio files based on df columns
    df column options: [age, age_of_english_onset, age_sex, birth_place, english_learning_method,
    english_residence, length_of_english_residence, native_language, other_languages, sex]
    :param df (DataFrame): Full unfiltered DataFrame
    :return (DataFrame): Filtered DataFrame
    '''

    arabic = df[df.native_language == 'arabic']
    mandarin = df[df.native_language == 'mandarin']
    english = df[df.native_language == 'english']

    mandarin = mandarin[mandarin.length_of_english_residence < 10]
    arabic = arabic[arabic.length_of_english_residence < 10]

    # Keep only the three filtered accent groups; appending to the
    # unfiltered df would retain every language in the archive
    df = english.append(arabic)
    df = df.append(mandarin)

    return df

def split_people(df, test_size=0.2):
    '''
    Create train test split of DataFrame
    :param df (DataFrame): Pandas DataFrame of audio files to be split
    :param test_size (float): Fraction of files placed in the test set
    :return (tuple): X_train, X_test from df['language_num']; y_train, y_test from df['native_language']
    '''


    return train_test_split(df['language_num'], df['native_language'], test_size=test_size, random_state=1234)


if __name__ == '__main__':
    '''
    Console command example:
    python getsplit.py bio_data.csv
    '''

    csv_file = sys.argv[1]
    df = pd.read_csv(csv_file)
    filtered_df = filter_df(df)
    print(split_people(filtered_df))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Speech-Accent-Recognition

### About
Every individual has their own dialect and manner of speaking. This project detects the background of a speaker from their speech: the goal is to classify accents, specifically foreign accents, by the native language of the speaker. By comparing speech samples against the Speech Accent Archive dataset, the project identifies the demographic and linguistic backgrounds of speakers and determines which variables are key predictors of each accent. The Speech Accent Archive demonstrates that accents are systematic rather than merely mistaken speech. Given a recording of a speaker reading a known script of English words, this project predicts the speaker's native language.

### Dataset
All of the speech files used for this project come from the Speech Accent Archive, a repository of spoken English hosted by George Mason University. Over 2000 speakers representing over 100 native languages read a common elicitation paragraph in English:

```
'Please call Stella. Ask her to bring these things with her from the store: Six spoons of fresh
snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob. We also need
a small plastic snake and a big toy frog for the kids. She can scoop these things into three red
bags, and we will go meet her Wednesday at the train station.'
```

The shared elicitation paragraph makes the dataset ideal for studying accent: the wording is fixed, and the recording quality is (nearly) uniform across speakers. Although the dataset was large in size (approximately 2 GB), the number of samples per language was small, so I worked mainly on the three most represented accents: English, Mandarin, and Arabic.

The dataset contains **.mp3** audio files, which were converted to **.wav** files to allow easy extraction of **MFCC (Mel Frequency Cepstral Coefficient)** features for building a 2-D convolutional neural network.

The MFCCs were fed into a 2-Dimensional Convolutional Neural Network (CNN) to predict the native language class; a sketch of the feature-extraction step is shown below.
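As a rough illustration of this pipeline (not a file in this repository), the sketch below loads one converted `.wav` file and extracts its MFCCs with librosa, mirroring the `RATE = 24000` and `N_MFCC = 13` constants used in `trainmodel.py`; the file path is only a placeholder.

```
import librosa

RATE = 24000   # target sample rate used in trainmodel.py
N_MFCC = 13    # number of MFCC coefficients extracted per frame

def wav_to_mfcc(path):
    # Load the audio and resample it to the common rate
    y, sr = librosa.load(path)
    y = librosa.core.resample(y=y, orig_sr=sr, target_sr=RATE, scale=True)
    # Returns a (13, n_frames) matrix; trainmodel.py later slices this
    # into fixed-width segments before feeding the CNN
    return librosa.feature.mfcc(y=y, sr=RATE, n_mfcc=N_MFCC)

mfcc = wav_to_mfcc('../audio/english1.wav')  # placeholder file name
print(mfcc.shape)
```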
### Dependencies
• [Python 3.x](https://www.python.org/downloads/)

• [Keras](https://keras.io/)

• [Numpy](http://www.numpy.org/)

• [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/)

• [Pydub](https://github.com/jiaaro/pydub)

• [Scikit-learn](http://scikit-learn.org/stable/)

• [Librosa](http://librosa.github.io/librosa/)

### Execution
To execute the code, please have all the dependencies installed on your system. Then change to the src directory of the code and run the following Python commands:

• To download language metadata from [The Speech Accent Archive](http://accent.gmu.edu/index.php) into bio_data.csv:
```
python fromwebsite.py bio_data.csv mandarin english arabic
```
• Run getaudio.py to download audio files to the audio directory. All audio files listed in bio_data.csv will be downloaded.
```
python getaudio.py bio_data.csv
```
• Run trainmodel.py to train the CNN and save the trained model.
```
python trainmodel.py bio_data.csv model5
```
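After training, the saved model can be reloaded for predictions on new recordings. The snippet below is a minimal sketch, not a file in this repository: it assumes a model saved as `models/model5.h5` and a `segments` array shaped like the training input, `(n_segments, 13, 30)`, and mirrors the majority-vote logic of `src/accuracy.py`.

```
from collections import Counter
from keras.models import load_model

# Assumed save path produced by trainmodel.py's save_model()
model = load_model('../models/model5.h5')

def predict_accent(segments, model):
    # segments: numpy array of shape (n_segments, 13, 30),
    # as produced by segment_one() in trainmodel.py
    X = segments.reshape(segments.shape[0], segments.shape[1], segments.shape[2], 1)
    classes = model.predict_classes(X, verbose=0)  # per-segment predictions
    # Majority vote across segments, as in accuracy.predict_class_audio
    return Counter(list(classes)).most_common(1)[0][0]
```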
### Results
Using 13 MFCC features per sample fed into the 2-D CNN, the project achieved an overall accuracy of **over 75%**.
The final per-language accuracies are:

• 96.6% when English samples were given

• 65% when Arabic samples were given

• 50% when Mandarin samples were given

### References
• [Morgan Bryant, Amanda Chow & Sydney Li, "Classification of Accents of English Speakers by Native Language"](http://cs229.stanford.edu/proj2014/Morgan%20Bryant,%20Amanda%20Chow,%20Sydney%20Li,%20Classification%20of%20Accents%20of%20English%20Speakers%20by%20Native%20Language.pdf)
--------------------------------------------------------------------------------
/speech-accent-recognition/src/fromwebsite.py:
--------------------------------------------------------------------------------
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import sys
import re

ROOT_URL = 'http://accent.gmu.edu/'
BROWSE_LANGUAGE_URL = 'browse_language.php?function=find&language={}'
WAIT = 1.2
DEBUG = True

def get_htmls(urls):
    '''
    Retrieves html in text form from ROOT_URL
    :param urls (list): List of urls from which to retrieve html
    :return (list): list of HTML strings
    '''
    htmls = []
    for url in urls:
        if DEBUG:
            print('downloading from {}'.format(url))
        htmls.append(requests.get(url).text)
        time.sleep(WAIT)

    return(htmls)


def build_search_urls(languages):
    '''
    creates urls from ROOT_URL and languages
    :param languages (list): List of languages
    :return (list): List of urls
    '''
    return([ROOT_URL + BROWSE_LANGUAGE_URL.format(language) for language in languages])

def parse_p(p_tag):
    '''
    Extracts the link and speaker info from an HTML <p> tag
    :param p_tag: BeautifulSoup <p> tag from a search-result page
    :return (list): [url, language_num, sex]
    '''
    text = p_tag.text.replace(' ', '').split(',')
    return([ROOT_URL + p_tag.a['href'], text[0], text[1]])

def get_bio(hrefs):
    '''
    Retrieves HTML from list of hrefs and returns bio information
    :param hrefs (list): list of hrefs
    :return (DataFrame): Pandas DataFrame with bio information
    '''

    htmls = get_htmls(hrefs)
    bss = [BeautifulSoup(html, 'html.parser') for html in htmls]
    rows = []
    bio_row = []
    for bs in bss:
        rows.append([li.text for li in bs.find('ul', 'bio').find_all('li')])
    for row in rows:
        bio_row.append(parse_bio(row))

    return(pd.DataFrame(bio_row))

def parse_bio(row):
    '''
    Parse bio data from row string
    :param row (list): Unparsed bio strings
    :return (list): Bio columns
    '''
    cols = []
    for col in row:
        try:
            # Keep everything after the first ':', with spaces stripped
            tmp_col = re.search(r":(.+)", col.replace(' ', '')).group(1)
        except:
            tmp_col = col
        cols.append(tmp_col)
    return(cols)


def create_dataframe(languages):
    '''
    :param languages (list): languages to search for
    :return df (DataFrame): DataFrame that contains all audio metadata from the searched languages
    '''
    htmls = get_htmls(build_search_urls(languages))
    bss = [BeautifulSoup(html, 'html.parser') for html in htmls]
    persons = []

    for bs in bss:
        for p in bs.find_all('p'):
            if p.a:
                persons.append(parse_p(p))

    df = pd.DataFrame(persons, columns=['href', 'language_num', 'sex'])

    bio_rows = get_bio(df['href'])

    if DEBUG:
        print('loading finished')

    df['birth_place'] = bio_rows.iloc[:, 0]
    df['native_language'] = bio_rows.iloc[:, 1]
    df['other_languages'] = bio_rows.iloc[:, 2]
    df['age_sex'] = bio_rows.iloc[:, 3]
    df['age_of_english_onset'] = bio_rows.iloc[:, 4]
    df['english_learning_method'] = bio_rows.iloc[:, 5]
    df['english_residence'] = bio_rows.iloc[:, 6]
    df['length_of_english_residence'] = bio_rows.iloc[:, 7]

    df['birth_place'] = df['birth_place'].apply(lambda x: x[:-6].split(' ')[-2:])
    # print(df['birth_place'])
    # df['birth_place'] = lambda x: x[:-6].split(' ')[2:], df['birth_place']
    df['native_language'] = df['native_language'].apply(lambda x: x.split(' ')[2])
    # print(df['native_language'])
    # df['native_language'] = lambda x: x.split(' ')[2], df['native_language']
    df['other_languages'] = df['other_languages'].apply(lambda x: x.split(' ')[2:])
    # print(df['other_languages'])
    # df['other_languages'] = lambda x: x.split(' ')[2:], df['other_languages']
    df['age_sex'], df['age'] = df['age_sex'].apply(lambda x: x.split(' ')[2:]), df['age_sex'].apply(lambda x: x.replace('sex:', '').split(',')[1])
    # print(df['age'])
    # df['age_sex'] = lambda x: x.split(' ')[2], df['age_sex']
    # df['age_of_english_onset'] = lambda x: float(x.split(' ')[-1]), df['age_of_english_onset']
    df['age_of_english_onset'] = df['age_of_english_onset'].apply(lambda x: float(x.split(' ')[-1]))
    # print(df['age_of_english_onset'])
    # df['english_learning_method'] = lambda x: x.split(' ')[-1], df['english_learning_method']
    df['english_learning_method'] = df['english_learning_method'].apply(lambda x: x.split(' ')[-1])
    # print(df['english_learning_method'])
    # df['english_residence'] = lambda x: x.split(' ')[2:], df['english_residence']
    df['english_residence'] = df['english_residence'].apply(lambda x: x.split(' ')[2:])
    # print(df['english_residence'])
    # df['length_of_english_residence'] = lambda x: float(x.split(' ')[-2]), df['length_of_english_residence']
    df['length_of_english_residence'] = df['length_of_english_residence'].apply(lambda x: float(x.split(' ')[-2]))
    # print(df['length_of_english_residence'])

    # df['age'] = lambda x: x.replace(' ','').split(',')[0], df['age_sex']

    return(df)

if __name__ == '__main__':
    '''
    console command example:
    python fromwebsite.py bio_metadata.csv mandarin english arabic
    '''

    df = None

    # Set destination file
    destination_file = sys.argv[1]

    # If no language arguments are given, use 'mandarin' as the default
    if len(sys.argv) > 2:
        languages = sys.argv[2:]
    else:
        languages = ['mandarin']

    # Check if destination file exists, else create a new one
    try:
        df = pd.read_csv(destination_file)
        df = df.append(create_dataframe(languages=languages), ignore_index=True)
    except:
        df = create_dataframe(languages=languages)


    df.drop_duplicates(subset='language_num', inplace=True)

    df.to_csv(destination_file, index=False)
--------------------------------------------------------------------------------
/speech-accent-recognition/src/trainmodel.py:
--------------------------------------------------------------------------------
import pandas as pd
from collections import Counter
import sys
sys.path.append('../speech-accent-recognition/src')
import getsplit

from keras import utils
import accuracy
import multiprocessing
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import MaxPooling2D, Conv2D
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, TensorBoard

DEBUG = True
SILENCE_THRESHOLD = .01
RATE = 24000
N_MFCC = 13
COL_SIZE = 30
EPOCHS = 10  # 35 # 250

def to_categorical(y):
    '''
    Converts list of languages into a binary class matrix
    :param y (list): list of languages
    :return (numpy array): binary class matrix
    '''
    lang_dict = {}
    for index, language in enumerate(sorted(set(y))):  # sorted for a stable class ordering across calls
        lang_dict[language] = index
    y = list(map(lambda x: lang_dict[x], y))
    return utils.to_categorical(y, len(lang_dict))

def get_wav(language_num):
    '''
    Loads a wav file from disk and down-samples it to RATE
    :param language_num (str): file name (without extension)
    :return (numpy array): Down-sampled wav data
    '''

    y, sr = librosa.load('../audio/{}.wav'.format(language_num))
    return(librosa.core.resample(y=y, orig_sr=sr, target_sr=RATE, scale=True))

def to_mfcc(wav):
    '''
    Converts a wav array to Mel Frequency Cepstral Coefficients
    :param wav (numpy array): Wav form
    :return (2d numpy array): MFCC
    '''
    return(librosa.feature.mfcc(y=wav, sr=RATE, n_mfcc=N_MFCC))

def remove_silence(wav, thresh=0.04, chunk=5000):
    '''
    Searches the wav form for segments of silence.
    If all values in a window of 'chunk' samples stay below 'thresh' in magnitude, that window is removed
    :param wav (np array): Wav array to be filtered
    :return (np array): Wav array with silence removed
    '''

    tf_list = []
    for x in range(len(wav) // chunk):  # integer division; '/' would raise a TypeError under Python 3
        if (np.any(wav[chunk * x:chunk * (x + 1)] >= thresh) or np.any(wav[chunk * x:chunk * (x + 1)] <= -thresh)):
            tf_list.extend([True] * chunk)
        else:
            tf_list.extend([False] * chunk)

    tf_list.extend((len(wav) - len(tf_list)) * [False])
    return(wav[tf_list])

def normalize_mfcc(mfcc):
    '''
    Normalize mfcc
    :param mfcc: MFCC array
    :return: min-max normalized MFCC array
    '''
    mms = MinMaxScaler()
    return(mms.fit_transform(np.abs(mfcc)))

def make_segments(mfccs, labels):
    '''
    Makes segments of mfccs and attaches them to the labels
    :param mfccs: list of mfccs
    :param labels: list of labels
    :return (tuple): Segments with labels
    '''
    segments = []
    seg_labels = []
    for mfcc, label in zip(mfccs, labels):
        for start in range(0, int(mfcc.shape[1] / COL_SIZE)):
            segments.append(mfcc[:, start * COL_SIZE:(start + 1) * COL_SIZE])
            seg_labels.append(label)
    return(segments, seg_labels)

def segment_one(mfcc):
    '''
    Creates segments from one MFCC image; a final segment shorter than COL_SIZE columns is dropped
    :param mfcc (numpy array): MFCC array
    :return (numpy array): Segmented MFCC array
    '''
    segments = []
    for start in range(0, int(mfcc.shape[1] / COL_SIZE)):
        segments.append(mfcc[:, start * COL_SIZE:(start + 1) * COL_SIZE])
    return(np.array(segments))

def create_segmented_mfccs(X_train):
    '''
    Creates segmented MFCCs from X_train
    :param X_train: list of MFCCs
    :return: segmented mfccs
    '''
    segmented_mfccs = []
    for mfcc in X_train:
        segmented_mfccs.append(segment_one(mfcc))
    return(segmented_mfccs)


def train_model(X_train, y_train, X_validation, y_validation, batch_size=128):  # 64
    '''
    Trains 2D convolutional neural network
    :param X_train: Numpy array of mfccs
    :param y_train: Binary matrix based on labels
    :return: Trained model
    '''

    # Get row, column, and class sizes
    rows = X_train[0].shape[0]
    cols = X_train[0].shape[1]
    val_rows = X_validation[0].shape[0]
    val_cols = X_validation[0].shape[1]
    num_classes = len(y_train[0])

    # input image dimensions to feed into 2D ConvNet Input layer
    input_shape = (rows, cols, 1)
    X_train = X_train.reshape(X_train.shape[0], rows, cols, 1)
    X_validation = X_validation.reshape(X_validation.shape[0], val_rows, val_cols, 1)


    print('X_train shape:', X_train.shape)
    print(X_train.shape[0], 'training samples')

    model = Sequential()

    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu',
                     data_format="channels_last",
                     input_shape=input_shape))

    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adadelta',
                  metrics=['accuracy'])

    # Stops training if accuracy does not improve by at least 0.005 over 10 epochs
    es = EarlyStopping(monitor='acc', min_delta=.005, patience=10, verbose=1, mode='auto')

    # Creates log file for graphical interpretation using TensorBoard
    tb = TensorBoard(log_dir='../logs', histogram_freq=0, batch_size=32, write_graph=True, write_grads=True,
                     write_images=True, embeddings_freq=0, embeddings_layer_names=None,
                     embeddings_metadata=None)

    # Image shifting
    datagen = ImageDataGenerator(width_shift_range=0.05)

    # Fit model using ImageDataGenerator
    model.fit_generator(datagen.flow(X_train, y_train, batch_size=batch_size),
                        steps_per_epoch=len(X_train) / batch_size,  # one pass over the training data per epoch
                        epochs=EPOCHS,
                        callbacks=[es, tb], validation_data=(X_validation, y_validation))

    return (model)

def save_model(model, model_filename):
    '''
    Save model to file
    :param model: Trained model to be saved
    :param model_filename: Filename
    :return: None
    '''
    model.save('../models/{}.h5'.format(model_filename))  # creates an HDF5 file under ../models/



############################################################




#######################################

if __name__ == '__main__':
    '''
    Console command example:
    python trainmodel.py bio_metadata.csv model50
    '''

    # Load arguments
    file_name = sys.argv[1]
    model_filename = sys.argv[2]

    # Load metadata
    df = pd.read_csv(file_name)


    # Filter metadata to retrieve only the desired files
    filtered_df = getsplit.filter_df(df)

    # Train test split
    X_train, X_test, y_train, y_test = getsplit.split_people(filtered_df)

    # Get statistics
    train_count = Counter(y_train)
    test_count = Counter(y_test)

    print("Entering main")

    # Baseline accuracy: always predicting the most common class in the test set
    acc_to_beat = test_count.most_common(1)[0][1] / float(np.sum(list(test_count.values())))

    # To categorical
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    # Get resampled wav files using multiprocessing
    if DEBUG:
        print('Loading wav files....')
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    X_train = pool.map(get_wav, X_train)
    X_test = pool.map(get_wav, X_test)

    # Convert to MFCC
    if DEBUG:
        print('Converting to MFCC....')
    X_train = pool.map(to_mfcc, X_train)
    X_test = pool.map(to_mfcc, X_test)

    # Create segments from MFCCs
    X_train, y_train = make_segments(X_train, y_train)
    X_validation, y_validation = make_segments(X_test, y_test)

    # Randomize training segments
    X_train, _, y_train, _ = train_test_split(X_train, y_train, test_size=0)

    # Train model
    model = train_model(np.array(X_train), np.array(y_train), np.array(X_validation), np.array(y_validation))

    # Make predictions on full X_test MFCCs
    y_predicted = accuracy.predict_class_all(create_segmented_mfccs(X_test), model)

    # Print statistics
    print('Training samples:', train_count)
    print('Testing samples:', test_count)
    print('Accuracy to beat:', acc_to_beat)
    print('Samples per actual class (confusion matrix row sums):\n', np.sum(accuracy.confusion_matrix(y_predicted, y_test), axis=1))
    print('Confusion matrix:\n', accuracy.confusion_matrix(y_predicted, y_test))
    print('Accuracy:', accuracy.get_accuracy(y_predicted, y_test))

    # Save model
    save_model(model, model_filename)
--------------------------------------------------------------------------------