├── model_inferencing.py
├── README.md
├── data preparation.py
├── model training.py
└── UA_df.csv

/model_inferencing.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 21 10:57:53 2023

@author: dreji18
"""

import os

from huggingsound import SpeechRecognitionModel

# directory containing the fine-tuned checkpoint produced by model training.py
model_dir = r'xx\xxx\xxx'

model = SpeechRecognitionModel(model_dir)

# directory containing the WAV files produced by data preparation.py
audio_dir = r'xx\xxx\xxx'
os.chdir(audio_dir)

audio_paths = ["F02_B1_C17_M6.wav"]

transcriptions = model.transcribe(audio_paths)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Fine-tune-Speech-Recognition
Tutorial on how to train a custom speech recognition model using Hugging Face models.

Video link: https://youtu.be/7e75HuVFpYI
![thumbnail](https://user-images.githubusercontent.com/49631017/233625269-0609a9f7-6ad1-44d1-8884-86d2846b56f4.png)

You can support me :)

Buy Me A Coffee
--------------------------------------------------------------------------------
/data preparation.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 12 18:44:57 2023

@author: dreji18
"""

from datasets import load_dataset
import numpy as np
import wave
import os

path = r'xx\xxx\xxx'
os.chdir(path)

# loading the dataset from the Hugging Face Hub
# https://huggingface.co/datasets/ngdiana/uaspeech_severity_high
UA = load_dataset("ngdiana/uaspeech_severity_high")

# converting to pandas for easy data handling
UA_df = UA['train'].to_pandas()
UA_df = UA_df[0:20]  # keep a small 20-utterance sample for the demo

# keep only the file name, e.g. "F02_B1_UW81_M2.wav"
UA_df['filename'] = UA_df['path'].apply(lambda x: x.split("/")[-1])

UA_df.to_csv("UA_df.csv")


## the goal is to convert each speech array to a WAV file in bulk

def array2WAV(idx):

    # UASpeech audio is sampled at 16 kHz
    sample_rate = 16000

    # create a WAV file object
    wav_file = wave.open(UA_df['filename'].iloc[idx], "wb")

    # set the WAV file parameters
    wav_file.setnchannels(1)    # 1 channel (mono)
    wav_file.setsampwidth(2)    # 16-bit sample width
    wav_file.setframerate(sample_rate)

    # scale the float waveform to 16-bit integers and write it as binary data
    samples = UA_df['speech'].iloc[idx]
    samples = (samples * (2**15 - 1)).astype(np.int16)
    wav_file.writeframes(samples.tobytes())

    # close the WAV file
    wav_file.close()

# set the output directory to save the WAV files
out_dir = r'xx\xxx\xxx'

os.chdir(out_dir)

for idx in range(len(UA_df)):
    array2WAV(idx)
--------------------------------------------------------------------------------
/model training.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 12 20:01:49 2023

@author: dreji18
"""

# https://github.com/jonatasgrosman/huggingsound
# !pip install huggingsound
from huggingsound import TrainingArguments, ModelArguments, SpeechRecognitionModel, TokenSet
import pandas as pd
import os
import torch

# directory containing the WAV files produced by data preparation.py
audio_dir = r'xx\xxx\xxx'
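
# torch.cuda.empty_cache() below releases any cached GPU memory (it is a
# no-op on CPU). Fine-tuning wav2vec2-large on CPU is very slow, so the
# commented-out device line is worth enabling whenever a GPU is available;
# what PyTorch can see can be checked with:
#   print(torch.cuda.is_available(), torch.cuda.device_count())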
torch.cuda.empty_cache()
# device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"
# model = SpeechRecognitionModel("facebook/wav2vec2-large-xlsr-53")
model = SpeechRecognitionModel("facebook/wav2vec2-large-xlsr-53", device=device)
torch.cuda.empty_cache()

# preparing the vocab: every character the model may emit gets an integer id
vocab_dict = {'c': 0, 'q': 1, 'w': 2, 'j': 3, 'r': 4, 'h': 5, 'x': 6,
              'm': 7, 'p': 8, 'd': 9, 'f': 10, 'g': 11, 'k': 12, 'u': 13,
              'v': 14, 'a': 15, 'n': 16, ' ': 17, 'i': 18, 's': 19, 'y': 20,
              'l': 21, 'e': 22, 'o': 23, 'z': 24, 'b': 25, 't': 26}

# wav2vec2 tokenizers use "|" as the word delimiter instead of a plain space
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

tokens = list(vocab_dict.keys())
token_set = TokenSet(tokens)

#%%
training_args = TrainingArguments(
    learning_rate=3e-4,
    max_steps=1000,
    eval_steps=200,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
)
model_args = ModelArguments(
    activation_dropout=0.1,
    hidden_dropout=0.1,
)

#%%
# preparing the training data
UA_df = pd.read_csv(r'xx\xxx\UA_df.csv')

os.chdir(audio_dir)

train_data = []
for idx in range(len(UA_df)):
    train_data.append({"path": UA_df['filename'].iloc[idx],
                       "transcription": UA_df['target'].iloc[idx]})

# for the evaluation data: file names look like F02_B1_UW81_M2.wav, so col3
# holds the word id; keeping one file per word gives a small evaluation set
UA_df[['col1', 'col2', 'col3', 'col4']] = UA_df['filename'].str.split("_", expand=True)
UA_df1 = UA_df.drop_duplicates(subset='col3', keep="first")

eval_data = []
for idx in range(len(UA_df1)):
    eval_data.append({"path": UA_df1['filename'].iloc[idx],
                      "transcription": UA_df1['target'].iloc[idx]})

# and finally, fine-tune your model
output_dir = r'xx\xxx\xxx'
model.finetune(
    output_dir,
    train_data=train_data,
    eval_data=eval_data,  # eval_data is optional
    token_set=token_set,
    training_args=training_args,
    model_args=model_args,
)
#%%
--------------------------------------------------------------------------------
/UA_df.csv:
--------------------------------------------------------------------------------
,id,target,path,dataset,severity,speech,filename
0,F02,moisten,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_UW81_M2.wav,UA,medium,"[ 2.1362305e-04 2.1362305e-04 1.5258789e-04 ... -2.7465820e-04
3.0517578e-05 -2.4414062e-04]",F02_B1_UW81_M2.wav
1,F02,abbreviated,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW15_M3.wav,UA,medium,"[1.8310547e-04 6.1035156e-05 1.2207031e-04 ... 2.6550293e-03 2.5024414e-03
2.4108887e-03]",F02_B2_UW15_M3.wav
2,F02,without,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B3_UW40_M5.wav,UA,medium,"[ 0.0000000e+00 3.0517578e-05 1.2207031e-04 ... -2.3132324e-02
-2.3468018e-02 -2.3651123e-02]",F02_B3_UW40_M5.wav
3,F02,casualties,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_UW29_M3.wav,UA,medium,"[ 9.1552734e-05 6.1035156e-05 1.2207031e-04 ... -2.2277832e-03
-1.9531250e-03 -2.0446777e-03]",F02_B1_UW29_M3.wav
4,F02,downward,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_C17_M6.wav,UA,medium,"[-0.00018311 0. 0. ... -0.00198364 -0.00186157
-0.00161743]",F02_B1_C17_M6.wav
5,F02,each,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_CW43_M4.wav,UA,medium,"[ 6.1035156e-05 6.1035156e-04 3.9672852e-04 ... -4.1198730e-03
-4.8522949e-03 -4.8828125e-03]",F02_B1_CW43_M4.wav
6,F02,how,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B3_CW47_M8.wav,UA,medium,"[ 0.00064087 0.00082397 0.00097656 ... -0.00088501 -0.00283813
-0.0039978 ]",F02_B3_CW47_M8.wav
7,F02,absolve,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW17_M8.wav,UA,medium,[0.00076294 0.00079346 0.00088501 ... 0.0090332 0.00875854 0.00857544],F02_B2_UW17_M8.wav
8,F02,thumb,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW100_M6.wav,UA,medium,"[-0.00039673 -0.00030518 -0.00030518 ... 0.00280762 0.00280762
0.00286865]",F02_B2_UW100_M6.wav
9,F02,episode,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B3_UW68_M3.wav,UA,medium,"[-9.1552734e-05 -1.8310547e-04 -2.7465820e-04 ... -1.5258789e-04
-9.1552734e-05 -3.0517578e-05]",F02_B3_UW68_M3.wav
10,F02,advantageous,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW21_M7.wav,UA,medium,"[ 6.1035156e-05 -3.0517578e-05 3.0517578e-05 ... 3.8452148e-03
3.6315918e-03 3.4179688e-03]",F02_B2_UW21_M7.wav
11,F02,delta,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_LD_M6.wav,UA,medium,"[-1.8310547e-04 -1.5258789e-04 -3.0517578e-05 ... -8.4533691e-03
-8.1787109e-03 -7.9650879e-03]",F02_B1_LD_M6.wav
12,F02,absorb,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW18_M2.wav,UA,medium,"[-3.0517578e-05 6.1035156e-05 -6.1035156e-05 ... 1.5258789e-03
1.4953613e-03 1.2817383e-03]",F02_B2_UW18_M2.wav
13,F02,watch,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW82_M3.wav,UA,medium,"[ 3.0517578e-05 3.0517578e-05 6.1035156e-05 ... -1.4648438e-03
-1.5258789e-03 -1.5869141e-03]",F02_B2_UW82_M3.wav
14,F02,nine,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_D9_M3.wav,UA,medium,"[ 1.8310547e-04 1.2207031e-04 6.1035156e-05 ... -1.5258789e-03
-1.4343262e-03 -1.4343262e-03]",F02_B2_D9_M3.wav
15,F02,yankee,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B3_LY_M2.wav,UA,medium,"[-3.0517578e-05 -6.1035156e-05 6.1035156e-05 ... -1.8310547e-04
3.0517578e-05 2.7465820e-04]",F02_B3_LY_M2.wav
16,F02,victor,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_LV_M5.wav,UA,medium,"[ 3.0517578e-05 -9.1552734e-05 -6.1035156e-05 ... -1.3427734e-03
-1.3732910e-03 -1.3427734e-03]",F02_B1_LV_M5.wav
17,F02,tab,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_C5_M8.wav,UA,medium,"[ 6.1035156e-05 6.1035156e-05 3.0517578e-05 ... -2.6336670e-02
-2.5299072e-02 -2.6367188e-02]",F02_B1_C5_M8.wav
18,F02,left,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B1_C18_M4.wav,UA,medium,"[ 3.0517578e-05 9.1552734e-05 0.0000000e+00 ... -2.4108887e-03
-2.8381348e-03 -2.9602051e-03]",F02_B1_C18_M4.wav
19,F02,digest,/content/drive/MyDrive/thesis/UASpeech/audio/F02/F02_B2_UW75_M7.wav,UA,medium,"[ 3.0517578e-05 3.0517578e-05 -3.0517578e-05 ... 1.3427734e-03
1.4038086e-03 1.8920898e-03]",F02_B2_UW75_M7.wav
--------------------------------------------------------------------------------
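
A natural next step after model_inferencing.py (not part of this repo):
huggingsound also provides an evaluate() method that scores a set of
reference transcriptions by error rate, which makes it easy to sanity-check
the fine-tuned checkpoint against the targets in UA_df.csv. A minimal
sketch, assuming the fine-tuned output directory and the generated WAV
files (placeholder paths to be filled in, as above):

    from huggingsound import SpeechRecognitionModel
    import pandas as pd
    import os

    model = SpeechRecognitionModel(r'xx\xxx\xxx')  # the fine-tuned output_dir

    os.chdir(r'xx\xxx\xxx')  # directory holding the generated WAV files
    UA_df = pd.read_csv("UA_df.csv")

    # the same {"path", "transcription"} records used during fine-tuning
    references = [
        {"path": row["filename"], "transcription": row["target"]}
        for _, row in UA_df.iterrows()
    ]

    evaluation = model.evaluate(references)  # word/character error rates
    print(evaluation)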