├── LICENSE
├── README.md
├── auto_DataLabeling_long.py
├── auto_DataLabeling_re.py
└── clean_list.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 AliceNavigator
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # auto-VITS-DataLabeling
2 | Simple data labeling script with funasr inside. 使用阿里funasr进行VITS训练数据标注
3 | 


--------------------------------------------------------------------------------
/auto_DataLabeling_long.py:
--------------------------------------------------------------------------------
 1 | from modelscope.pipelines import pipeline
 2 | from modelscope.utils.constant import Tasks
 3 | import os
 4 | 
 5 | '''
 6 | inference_pipeline = pipeline(
 7 |     task=Tasks.auto_speech_recognition,
 8 |     model='./Model/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
 9 | )
10 | 
11 | rec_result = inference_pipeline(audio_in='ge_1570_2.wav')
12 | print(rec_result)
13 | # {'text': '欢迎大家来体验达摩院推出的语音识别模型'}
14 | '''
15 | 
# Transcribe every .wav file found directly inside ``parent_dir`` with the
# local ASR model and append one annotation line per file to both
# ``long_character_anno.txt`` and ``barbara.list``.  Files already listed in
# ``long_character_anno.txt`` are skipped, so an interrupted run can be resumed.
parent_dir = "./raw_audio/"
local_dir_model = "./Model/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"

# Dataset paths that already have an annotation; a set keeps the per-file
# "already done?" check O(1).
complete_list = set()

# Only the top-level directory is needed, so take the first os.walk() entry
# instead of materialising the whole tree; a missing directory yields no
# files rather than an IndexError.
filelist = next(os.walk(parent_dir), (parent_dir, [], []))[2]

if os.path.exists("./long_character_anno.txt"):
    with open("./long_character_anno.txt", 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines in a hand-edited file
            # Only the first field (the dataset path) matters here; splitting
            # once keeps transcripts that contain '|' from raising.
            complete_list.add(line.split('|', 1)[0])

inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model=local_dir_model
)


for file in filelist:
    if not file.endswith('.wav'):
        print(f"{file} not supported, ignoring...\n")
        continue
    print(f"transcribing {parent_dir + file}...\n")

    # The character name is the part of the file stem before the first '_'.
    # splitext() removes the extension as a suffix; str.rstrip(".wav") would
    # strip any trailing '.', 'w', 'a' or 'v' characters ("eva.wav" -> "e").
    character_name = os.path.splitext(file)[0].split("_")[0]
    savepth = "./dataset/" + character_name + "/" + file

    if savepth in complete_list:
        print(f'{file} is already done, skip!')
        continue

    rec_result = inference_pipeline(audio_in=parent_dir + file)

    if 'text' not in rec_result:
        print("Text is not recognized，ignoring...\n")
        continue

    # Both output files are line-oriented, so the transcript must not carry
    # embedded newlines in either of them.
    text = rec_result['text'].replace("\n", "")
    line1 = savepth + "|" + character_name + "|" + '[ZH]' + text + '[ZH]' + "\n"
    line2 = savepth + "|" + character_name + "|ZH|" + text + "\n"

    # Append after every file (instead of once at the end) so completed work
    # survives an interruption and is picked up by the resume check above.
    with open("./long_character_anno.txt", 'a', encoding='utf-8') as f:
        f.write(line1)
    with open("./barbara.list", 'a', encoding='utf-8') as f:
        f.write(line2)
    print(rec_result['text'])
print("Done!\n")
63 | 
64 | 


--------------------------------------------------------------------------------
/auto_DataLabeling_re.py:
--------------------------------------------------------------------------------
 1 | from modelscope.pipelines import pipeline
 2 | from modelscope.utils.constant import Tasks
 3 | import os
 4 | 
 5 | '''
 6 | inference_pipeline = pipeline(
 7 |     task=Tasks.auto_speech_recognition,
 8 |     model='./Model/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
 9 | )
10 | 
11 | rec_result = inference_pipeline(audio_in='ge_1570_2.wav')
12 | print(rec_result)
13 | # {'text': '欢迎大家来体验达摩院推出的语音识别模型'}
14 | '''
15 | 
# Transcribe every .wav file found directly inside ``parent_dir`` with the
# local ASR model and append one annotation line per file to both
# ``long_character_anno.txt`` and ``barbara.list``.  Files already listed in
# ``long_character_anno.txt`` are skipped, so an interrupted run can be resumed.
parent_dir = "./raw_audio/"
local_dir_root = "./Model/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
# NOTE(review): target_sr is not read anywhere in this script; kept in case
# something outside this view relies on it.
target_sr = 44100

# Dataset paths that already have an annotation; a set keeps the per-file
# "already done?" check O(1).
complete_list = set()

# Only the top-level directory is needed, so take the first os.walk() entry
# instead of materialising the whole tree; a missing directory yields no
# files rather than an IndexError.
filelist = next(os.walk(parent_dir), (parent_dir, [], []))[2]

if os.path.exists("./long_character_anno.txt"):
    with open("./long_character_anno.txt", 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines in a hand-edited file
            # Only the first field (the dataset path) matters here; splitting
            # once keeps transcripts that contain '|' from raising.
            complete_list.add(line.split('|', 1)[0])

inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model=local_dir_root,
)

for file in filelist:
    if not file.endswith('.wav'):
        print(f"{file} not supported, ignoring...\n")
        continue
    print(f"transcribing {parent_dir + file}...\n")

    # The character name is the part of the file stem before the first '_'.
    # splitext() removes the extension as a suffix; str.rstrip(".wav") would
    # strip any trailing '.', 'w', 'a' or 'v' characters ("eva.wav" -> "e").
    character_name = os.path.splitext(file)[0].split("_")[0]
    savepth = "./dataset/" + character_name + "/" + file

    if savepth in complete_list:
        print(f'{file} is already done, skip!')
        continue

    rec_result = inference_pipeline(audio_in=parent_dir + file)

    if 'text' not in rec_result:
        print("Text is not recognized，ignoring...\n")
        continue

    # Both output files are line-oriented, so the transcript must not carry
    # embedded newlines in either of them.
    text = rec_result['text'].replace("\n", "")
    line1 = savepth + "|" + character_name + "|" + '[ZH]' + text + '[ZH]' + "\n"
    line2 = savepth + "|" + character_name + "|ZH|" + text + "\n"

    # Append after every file (instead of once at the end) so completed work
    # survives an interruption and is picked up by the resume check above.
    with open("./long_character_anno.txt", 'a', encoding='utf-8') as f:
        f.write(line1)
    with open("./barbara.list", 'a', encoding='utf-8') as f:
        f.write(line2)
    print(rec_result)
print("Done!\n")
68 | 


--------------------------------------------------------------------------------
/clean_list.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | 
 4 | new_annos = []
 5 | cleaned_new_annos = []
 6 | 
 7 | if os.path.exists("./barbara.list"):
 8 |     with open("./barbara.list", 'r', encoding='utf-8') as f:
 9 |         long_character_anno = f.readlines()
10 |         new_annos += long_character_anno
11 | else:
12 |     print('barbara.list cannot be found, please confirm that the path is correct')
13 |     exit()
14 | 
15 | for line in new_annos:
16 |     path, name, lang, text = line.split("|")
17 |     text += "\n" if not text.endswith("\n") else ""
18 |     if len(text) >= 5:
19 |         my_re = re.compile(r'[A-Za-z]', re.S)
20 |         res = re.findall(my_re, text)
21 |         if len(res):
22 |             print(f'Skip non-kanji text : {text}')
23 |         else:
24 |             cleaned_new_annos.append(path + "|" + name + "|" + lang+ "|" + text)
25 |     else:
26 |         print(f'skip too short wav : {text}')
27 | 
28 | 
29 | with open("./clean_barbara.list", 'w', encoding='utf-8') as f:
30 |     for line in cleaned_new_annos:
31 |         f.write(line)
32 | 
33 | print('Done! save as clean_barbara.list')
34 | 


--------------------------------------------------------------------------------