├── model_inferencing.py
├── README.md
├── data preparation.py
├── model training.py
└── UA_df.csv

/model_inferencing.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 21 10:57:53 2023

@author: dreji18
"""

import os

from huggingsound import SpeechRecognitionModel

# directory containing the fine-tuned checkpoint produced by model training.py
model_dir = r'xx\xxx\xxx'

model = SpeechRecognitionModel(model_dir)

# directory containing the WAV files produced by data preparation.py
audio_dir = r'xx\xxx\xxx'
os.chdir(audio_dir)

audio_paths = ["F02_B1_C17_M6.wav"]

transcriptions = model.transcribe(audio_paths)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Fine-tune-Speech-Recognition
Tutorial on how to train a custom speech recognition model using Hugging Face models.

Video link: https://youtu.be/7e75HuVFpYI
![thumbnail](https://user-images.githubusercontent.com/49631017/233625269-0609a9f7-6ad1-44d1-8884-86d2846b56f4.png)

You can support me :)

Buy Me A Coffee
--------------------------------------------------------------------------------
/data preparation.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 12 18:44:57 2023

@author: dreji18
"""

from datasets import load_dataset
import numpy as np
import wave
import os

path = r'xx\xxx\xxx'
os.chdir(path)

# loading the dataset from the Hugging Face Hub
# https://huggingface.co/datasets/ngdiana/uaspeech_severity_high
UA = load_dataset("ngdiana/uaspeech_severity_high")

# converting to pandas for easy data handling
UA_df = UA['train'].to_pandas()
UA_df = UA_df[0:20]  # keep a small 20-utterance sample for the demo

# keep only the file name, e.g. "F02_B1_UW81_M2.wav"
UA_df['filename'] = UA_df['path'].apply(lambda x: x.split("/")[-1])

UA_df.to_csv("UA_df.csv")


## the goal is to convert each speech array to a WAV file in bulk

def array2WAV(idx):

    # UASpeech audio is sampled at 16 kHz
    sample_rate = 16000

    # create a WAV file object
    wav_file = wave.open(UA_df['filename'].iloc[idx], "wb")

    # set the WAV file parameters
    wav_file.setnchannels(1)    # 1 channel (mono)
    wav_file.setsampwidth(2)    # 16-bit sample width
    wav_file.setframerate(sample_rate)

    # scale the float waveform to 16-bit integers and write it as binary data
    samples = UA_df['speech'].iloc[idx]
    samples = (samples * (2**15 - 1)).astype(np.int16)
    wav_file.writeframes(samples.tobytes())

    # close the WAV file
    wav_file.close()

# set the output directory to save the WAV files
out_dir = r'xx\xxx\xxx'

os.chdir(out_dir)

for idx in range(len(UA_df)):
    array2WAV(idx)
--------------------------------------------------------------------------------
/model training.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 12 20:01:49 2023

@author: dreji18
"""

# https://github.com/jonatasgrosman/huggingsound
# !pip install huggingsound
from huggingsound import TrainingArguments, ModelArguments, SpeechRecognitionModel, TokenSet
import pandas as pd
import os
import torch

# directory containing the WAV files produced by data preparation.py
audio_dir = r'xx\xxx\xxx'
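
# torch.cuda.empty_cache() below releases any cached GPU memory (it is a
# no-op on CPU). Fine-tuning wav2vec2-large on CPU is very slow, so the
# commented-out device line is worth enabling whenever a GPU is available;
# what PyTorch can see can be checked with:
#   print(torch.cuda.is_available(), torch.cuda.device_count())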
torch.cuda.empty_cache()
# device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"
# model = SpeechRecognitionModel("facebook/wav2vec2-large-xlsr-53")
model = SpeechRecognitionModel("facebook/wav2vec2-large-xlsr-53", device=device)
torch.cuda.empty_cache()

# preparing the vocab: every character the model may emit gets an integer id
vocab_dict = {'c': 0, 'q': 1, 'w': 2, 'j': 3, 'r': 4, 'h': 5, 'x': 6,
              'm': 7, 'p': 8, 'd': 9, 'f': 10, 'g': 11, 'k': 12, 'u': 13,
              'v': 14, 'a': 15, 'n': 16, ' ': 17, 'i': 18, 's': 19, 'y': 20,
              'l': 21, 'e': 22, 'o': 23, 'z': 24, 'b': 25, 't': 26}

# wav2vec2 tokenizers use "|" as the word delimiter instead of a plain space
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

tokens = list(vocab_dict.keys())
token_set = TokenSet(tokens)

#%%
training_args = TrainingArguments(
    learning_rate=3e-4,
    max_steps=1000,
    eval_steps=200,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
)
model_args = ModelArguments(
    activation_dropout=0.1,
    hidden_dropout=0.1,
)

#%%
# preparing the training data
UA_df = pd.read_csv(r'xx\xxx\UA_df.csv')

os.chdir(audio_dir)

train_data = []
for idx in range(len(UA_df)):
    train_data.append({"path": UA_df['filename'].iloc[idx],
                       "transcription": UA_df['target'].iloc[idx]})

# for the evaluation data: file names look like F02_B1_UW81_M2.wav, so col3
# holds the word id; keeping one file per word gives a small evaluation set
UA_df[['col1', 'col2', 'col3', 'col4']] = UA_df['filename'].str.split("_", expand=True)
UA_df1 = UA_df.drop_duplicates(subset='col3', keep="first")

eval_data = []
for idx in range(len(UA_df1)):
    eval_data.append({"path": UA_df1['filename'].iloc[idx],
                      "transcription": UA_df1['target'].iloc[idx]})

# and finally, fine-tune your model
output_dir = r'xx\xxx\xxx'
model.finetune(
    output_dir,
    train_data=train_data,
    eval_data=eval_data,  # eval_data is optional
    token_set=token_set,
    training_args=training_args,
    model_args=model_args,
)
#%%
--------------------------------------------------------------------------------
/UA_df.csv:
--------------------------------------------------------------------------------
,id,target,path,dataset,severity,speech,filename
0,F02,moisten,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_UW81_M2.wav,UA,medium,"[ 2.1362305e-04 2.1362305e-04 1.5258789e-04 ... -2.7465820e-04
3.0517578e-05 -2.4414062e-04]",F02_B1_UW81_M2.wav
1,F02,abbreviated,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW15_M3.wav,UA,medium,"[1.8310547e-04 6.1035156e-05 1.2207031e-04 ... 2.6550293e-03 2.5024414e-03
2.4108887e-03]",F02_B2_UW15_M3.wav
2,F02,without,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B3_UW40_M5.wav,UA,medium,"[ 0.0000000e+00 3.0517578e-05 1.2207031e-04 ... -2.3132324e-02
-2.3468018e-02 -2.3651123e-02]",F02_B3_UW40_M5.wav
3,F02,casualties,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_UW29_M3.wav,UA,medium,"[ 9.1552734e-05 6.1035156e-05 1.2207031e-04 ... -2.2277832e-03
-1.9531250e-03 -2.0446777e-03]",F02_B1_UW29_M3.wav
4,F02,downward,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_C17_M6.wav,UA,medium,"[-0.00018311 0. 0. ... -0.00198364 -0.00186157
-0.00161743]",F02_B1_C17_M6.wav
5,F02,each,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_CW43_M4.wav,UA,medium,"[ 6.1035156e-05 6.1035156e-04 3.9672852e-04 ... -4.1198730e-03
-4.8522949e-03 -4.8828125e-03]",F02_B1_CW43_M4.wav
6,F02,how,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B3_CW47_M8.wav,UA,medium,"[ 0.00064087 0.00082397 0.00097656 ... -0.00088501 -0.00283813
-0.0039978 ]",F02_B3_CW47_M8.wav
7,F02,absolve,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW17_M8.wav,UA,medium,[0.00076294 0.00079346 0.00088501 ... 0.0090332 0.00875854 0.00857544],F02_B2_UW17_M8.wav
8,F02,thumb,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW100_M6.wav,UA,medium,"[-0.00039673 -0.00030518 -0.00030518 ... 0.00280762 0.00280762
0.00286865]",F02_B2_UW100_M6.wav
9,F02,episode,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B3_UW68_M3.wav,UA,medium,"[-9.1552734e-05 -1.8310547e-04 -2.7465820e-04 ... -1.5258789e-04
-9.1552734e-05 -3.0517578e-05]",F02_B3_UW68_M3.wav
10,F02,advantageous,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW21_M7.wav,UA,medium,"[ 6.1035156e-05 -3.0517578e-05 3.0517578e-05 ... 3.8452148e-03
3.6315918e-03 3.4179688e-03]",F02_B2_UW21_M7.wav
11,F02,delta,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_LD_M6.wav,UA,medium,"[-1.8310547e-04 -1.5258789e-04 -3.0517578e-05 ... -8.4533691e-03
-8.1787109e-03 -7.9650879e-03]",F02_B1_LD_M6.wav
12,F02,absorb,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW18_M2.wav,UA,medium,"[-3.0517578e-05 6.1035156e-05 -6.1035156e-05 ... 1.5258789e-03
1.4953613e-03 1.2817383e-03]",F02_B2_UW18_M2.wav
13,F02,watch,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW82_M3.wav,UA,medium,"[ 3.0517578e-05 3.0517578e-05 6.1035156e-05 ... -1.4648438e-03
-1.5258789e-03 -1.5869141e-03]",F02_B2_UW82_M3.wav
14,F02,nine,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_D9_M3.wav,UA,medium,"[ 1.8310547e-04 1.2207031e-04 6.1035156e-05 ... -1.5258789e-03
-1.4343262e-03 -1.4343262e-03]",F02_B2_D9_M3.wav
15,F02,yankee,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B3_LY_M2.wav,UA,medium,"[-3.0517578e-05 -6.1035156e-05 6.1035156e-05 ... -1.8310547e-04
3.0517578e-05 2.7465820e-04]",F02_B3_LY_M2.wav
16,F02,victor,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_LV_M5.wav,UA,medium,"[ 3.0517578e-05 -9.1552734e-05 -6.1035156e-05 ... -1.3427734e-03
-1.3732910e-03 -1.3427734e-03]",F02_B1_LV_M5.wav
17,F02,tab,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_C5_M8.wav,UA,medium,"[ 6.1035156e-05 6.1035156e-05 3.0517578e-05 ... -2.6336670e-02
-2.5299072e-02 -2.6367188e-02]",F02_B1_C5_M8.wav
18,F02,left,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_C18_M4.wav,UA,medium,"[ 3.0517578e-05 9.1552734e-05 0.0000000e+00 ... -2.4108887e-03
-2.8381348e-03 -2.9602051e-03]",F02_B1_C18_M4.wav
19,F02,digest,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW75_M7.wav,UA,medium,"[ 3.0517578e-05 3.0517578e-05 -3.0517578e-05 ... 1.3427734e-03
1.4038086e-03 1.8920898e-03]",F02_B2_UW75_M7.wav
--------------------------------------------------------------------------------
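
A natural next step after model_inferencing.py (not part of this repo):
huggingsound also provides an evaluate() method that scores a set of
reference transcriptions by error rate, which makes it easy to sanity-check
the fine-tuned checkpoint against the targets in UA_df.csv. A minimal
sketch, assuming the fine-tuned output directory and the generated WAV
files (placeholder paths to be filled in, as above):

    from huggingsound import SpeechRecognitionModel
    import pandas as pd
    import os

    model = SpeechRecognitionModel(r'xx\xxx\xxx')  # the fine-tuned output_dir

    os.chdir(r'xx\xxx\xxx')  # directory holding the generated WAV files
    UA_df = pd.read_csv("UA_df.csv")

    # the same {"path", "transcription"} records used during fine-tuning
    references = [
        {"path": row["filename"], "transcription": row["target"]}
        for _, row in UA_df.iterrows()
    ]

    evaluation = model.evaluate(references)  # word/character error rates
    print(evaluation)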