├── LICENSE ├── README.md ├── data_processing ├── .DS_Store ├── README.md ├── data_using │ └── pcm2wav.py ├── prepare_new_corpus │ ├── check_file_existence.py │ ├── convert_data_list_to_wenet_json.py │ ├── convert_wenet_data_to_whisper.py │ ├── do_copy_files.sh │ ├── make_copy_sh.py │ ├── make_mkdir_sh.py │ ├── make_new_data_list.py │ ├── make_train_and_test_data.py │ ├── make_train_and_test_data_existed.py │ └── travel_and_make_all_wavpath_txt.py ├── wenet_data_using │ ├── data_regularization.py │ ├── data_split.py │ └── get_new_dict.py └── whisper_data_using │ ├── data_make_aishell2.py │ ├── data_make_kespeech.py │ ├── data_wav_exist_checking.py │ ├── decode_change_general_to_8k.py │ ├── decode_data_preparation.py │ ├── decode_data_preparation_for_customer_service.py │ ├── decode_data_preparation_for_speechio.py │ ├── decode_make_kespeech_test.py │ ├── decode_wenetlist2datalist.py │ ├── deocde_json2qwen.py │ ├── regularization_MyEnglishTextNormalizer.py │ ├── regularization_check_data.py │ ├── regularization_get_20000_items.py │ ├── regularization_text_check_dict.py │ └── regularization_text_regularization.py ├── espnet_using └── README.md ├── fairseq_using └── README.md ├── faster_whisper_using ├── README.md ├── faster-whisper │ ├── CONTRIBUTING.md │ ├── LICENSE │ ├── MANIFEST.in │ ├── README.md │ ├── benchmark │ │ ├── benchmark.m4a │ │ ├── memory_benchmark.py │ │ ├── normalizer.json │ │ ├── requirements.benchmark.txt │ │ ├── speed_benchmark.py │ │ ├── utils.py │ │ └── wer_benchmark.py │ ├── convert.sh │ ├── evaluate.py │ ├── evaluate.sh │ ├── faster_whisper │ │ ├── __init__.py │ │ ├── assets │ │ │ ├── __init__.py │ │ │ └── silero_vad.onnx │ │ ├── audio.py │ │ ├── feature_extractor.py │ │ ├── tokenizer.py │ │ ├── transcribe.py │ │ ├── utils.py │ │ ├── vad.py │ │ └── version.py │ ├── requirements.conversion.txt │ ├── requirements.txt │ ├── setup.cfg │ ├── setup.py │ ├── tests │ │ ├── conftest.py │ │ ├── data │ │ │ ├── jfk.flac │ │ │ └── stereo_diarization.wav │ │ ├── test_transcribe.py │ │ └── test_utils.py │ └── wenet_utils │ │ ├── compute-wer.py │ │ └── data_postprocessing.py ├── faster_whisper_hyper_parameters.md └── vscode │ └── launch.json ├── paraformer_using ├── 01_map │ ├── map_cer.txt │ ├── map_hyp.txt │ ├── map_hyp_norm.txt │ ├── map_lab.txt │ ├── map_lab_norm.txt │ └── para_output │ │ └── 1best_recog │ │ ├── rtf │ │ ├── score │ │ ├── text │ │ └── token ├── change_result.py ├── json2paraformer.py └── paraformer_decode_all.sh ├── qwen_using ├── Qwen-Audio │ └── README.md ├── README.md └── decode_mutimachine │ ├── evaluate_aqa.py │ ├── evaluate_asr.py │ ├── evaluate_qwen_cer.sh │ ├── qwen_audio_evaluate_asr.sh │ ├── qwen_audio_evaluate_cry.sh │ └── tools │ └── data_postprocessing.py ├── tensorrt_llm_using └── examples │ └── whisper │ ├── README.md │ ├── build.log │ ├── build.py │ ├── build.sh │ ├── distil_whisper │ └── convert_from_distil_whisper.py │ ├── requirements.txt │ ├── run.log │ ├── run.py │ ├── run.sh │ ├── run_faster_whisper.py │ ├── tmp │ ├── errs-single_wav_test.txt │ ├── recogs-single_wav_test.txt │ └── rtf-single_wav_test.txt │ ├── tokenizer.py │ ├── weight.py │ ├── whisper_outputs │ ├── decoder_config.json │ └── encoder_config.json │ └── whisper_utils.py ├── wenet_using ├── README.md ├── compute-wer.py ├── data_postprocessing.py ├── norm_and_conpute_cer.sh ├── train_mutimachine │ ├── multi_nodes.sh │ ├── multi_nodes_training.sh │ └── one_node.sh └── vscode │ └── launch.json ├── whisper_finetune_using ├── LICENSE ├── README.md ├── README_en.md ├── aishell.py ├── 
convert-ggml.py ├── decode_bash │ ├── convert_whisper_to_fast_whisper.sh │ ├── evaluate_faster_whisper_3.0_1.0_no_timestamp_310000_2.sh │ ├── evaluate_faster_whisper_3.0_1.0_no_timestamp_310000_final_en.sh │ ├── evaluate_whisper_3.0lr_310000.sh │ └── test_language_id_3.0lr_english.sh ├── evaluation.py ├── evaluation_debug.py ├── evaluation_debug2.py ├── evaluation_nolora.py ├── evaluation_nolora2.py ├── evaluation_nolora_rtf.py ├── finetune.py ├── finetune2.py ├── finetune_debug.py ├── finetune_lora.py ├── finetune_lora_cuichenrui_01.py ├── finetune_multimachine.py ├── infer.py ├── infer_ct2.py ├── infer_gui.py ├── infer_server.py ├── merge_lora.py ├── requirements.txt ├── run_finetune_debug.sh ├── run_finetune_fp16_cuichenrui_01.sh ├── run_finetune_fp16_cuichenrui_02.sh ├── run_finetune_fp16_cuichenrui_03.sh ├── run_finetune_multimachine.sh ├── run_finetune_multimachine_1_8.sh ├── tools │ ├── data_postprocessing.py │ ├── data_postprocessing_cantonese.py │ ├── data_preprocessing.py │ ├── pcm2wav_16000.py │ └── pcm2wav_8000.py ├── train_bash │ ├── train_finetune_whisper_cuichenrui_01.sh │ ├── train_finetune_whisper_cuichenrui_02.sh │ ├── train_finetune_whisper_cuichenrui_03.sh │ └── train_finetune_whisper_multimachine.sh ├── utils │ ├── __init__.py │ ├── binary.py │ ├── callback.py │ ├── data_utils.py │ ├── model_utils.py │ ├── reader.py │ ├── tarfile_reader.py │ └── utils.py ├── vscode │ ├── debug_chinese_launch.json │ ├── debug_faster_whisper_launch.json │ ├── debug_new_faster_launch.json │ ├── debug_new_faster_whisper_launch.json │ ├── debug_none_launch.json │ ├── debug_train_2_launch.json │ ├── debug_train_launch.json │ └── workspace_debug_train_111_launch.json └── whisper_explore_using │ ├── test_support_languages.py │ ├── test_whisper_tokenizer.py │ └── test_whisper_tokenizer.txt ├── 工具踩坑记录汇总.md └── 语音入门资料汇总.md /README.md: -------------------------------------------------------------------------------- 1 | # 深度学习语音工具包 2 | 3 | 本项目为个人在深度学习语音领域研究的一些工具汇总,并会分享一些高质量的语音领域学习资料。本项目仅用于个人资料和代码的备份,欢迎大家前来学习讨论交流🎉🎉🎉 4 | 5 | ## 🔄 最新更新 6 | 7 | * [2024_08_01] 进行一些整理,将所有信息汇总完毕,等待进一步代码整理。 8 | 9 | * [2024_07_19] 整理完成:“qwen_using” 文件夹,提供了针对 Qwen-Audio 框架的一些尝试和思考。 10 | 11 | * [2024_07_19] 整理完成:“wenet_using” 文件夹,添加了 wenet 框架多机多卡训练脚本和 debug 配置。 12 | 13 | * [2024_05_11] 整理完成:“faster_whisper_using” 文件夹,介绍了 faster_whisper 的使用细节和 debug 进展。 14 | 15 | * [2024_04_17] 整理完成:“wenet_using” 文件夹,介绍了语音识别框架 wenet 的一些知识和用法。 16 | 17 | * [2024_04_12] 整理完成:“语音入门资料汇总.md” 和 “工具踩坑记录汇总.md”,介绍了语音入门的一些资料和容易踩坑的一些问题。 18 | -------------------------------------------------------------------------------- /data_processing/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cuichenrui2000/barry_speech_tools/4780bd2c0a578821a3bb782240b58e7c0ca44ac7/data_processing/.DS_Store -------------------------------------------------------------------------------- /data_processing/README.md: -------------------------------------------------------------------------------- 1 | ## 语音数据处理方法 2 | 3 | 这里将介绍一些语音数据的处理方法,包括如何拉取一个新的数据集,和一些典型的语音数据集处理方法。 4 | 5 | ### 数据处理操作 6 | 7 | 相关文件已汇总至文件夹 ```data_processing/data_using```,具体文件细节如下: 8 | 9 | * pcm2wav.py【批量进行 pcm 文件至 wav 文件的转换,可设置采样率】 10 | 11 | ### 新数据集拉取 12 | 13 | 相关文件已汇总至文件夹 ```data_processing/prepare_new_corpus```,具体文件细节如下: 14 | 15 | * check_file_existence.py【检验所有音频文件是否存在】 16 | 17 | * convert_data_list_to_wenet_json.py【转换 data.list 数据格式至 wenet 数据格式】 18 | 19 | * convert_wenet_data_to_whisper.py【转换 wenet 数据格式至 whisper 数据格式】 20 | 21 | * 
do_copy_files.sh【数据迁移,多进程执行数据迁移脚本】 22 | 23 | * make_copy_sh.py【数据迁移,制作 copy.sh】 24 | 25 | * make_mkdir_sh.py【数据迁移,制作 mkdir.sh】 26 | 27 | * make_new_data_list.py【数据迁移,制作新的 data.list,里面巧妙计算了音频 duration】 28 | 29 | * make_train_and_test_data_existed.py【区分并生成训练集和测试集(已有测试集)】 30 | 31 | * make_train_and_test_data.py【区分并生成训练集和测试集(未有测试集)】 32 | 33 | * travel_and_make_all_wavpath_txt.py【数据迁移,制作 wavpath.txt】 34 | 35 | ### wenet 数据的使用 36 | 37 | 相关文件已汇总至文件 ```data_processing/wenet_data_using```,具体文件细节如下: 38 | 39 | * data_regularization.py【数据文本正则化】 40 | 41 | * data_split.py【切分数据集并重新排序,将简单数据集放到前面,困难数据集放到后面】 42 | 43 | * get_new_dict.py【获取训练文本字典】 44 | 45 | ### whisper 数据的使用 46 | 47 | 相关文件已汇总至文件 ```data_processing/whisper_data_using```,具体文件细节如下: 48 | 49 | * data_make_aishell2.py【制作 AISHELL-2 数据集】 50 | 51 | * data_make_kespeech.py【制作 Kespeech 数据集】 52 | 53 | * data_wav_exist_checking.py【检验 jsonl 文件中音频路径是否均存在】 54 | 55 | 56 | -------------------------------------------------------------------------------- /data_processing/data_using/pcm2wav.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import wave 5 | import numpy as np 6 | 7 | # 定义PCM文件和WAV文件的路径 8 | pcm_file = 'your_pcm_file_list' 9 | 10 | pcm_list = [] 11 | with open(pcm_file) as f: 12 | for line in f: 13 | pcm_list.append(line.strip()) 14 | 15 | # 设置音频参数 16 | num_channels = 1 # 单声道 17 | sample_width = 2 # 16位(2字节)采样深度 18 | frame_rate = 8000 # 采样率,例如16000 Hz 19 | 20 | for pcmfile in pcm_list: 21 | # 读取PCM文件 22 | with open(pcmfile, 'rb') as pcmf: 23 | pcm_data = pcmf.read() 24 | 25 | # 将PCM数据转换为NumPy数组 26 | pcm_array = np.frombuffer(pcm_data, dtype=np.int16) 27 | 28 | # 创建WAV文件并设置音频参数 29 | with wave.open(pcmfile.strip() + '.wav', 'wb') as wavfile: 30 | wavfile.setnchannels(num_channels) 31 | wavfile.setsampwidth(sample_width) 32 | wavfile.setframerate(frame_rate) 33 | 34 | # 将NumPy数组转换为二进制数据并写入WAV文件 35 | wavfile.writeframes(pcm_array.tobytes()) 36 | 37 | print(f'Converted {pcmfile} to {pcmfile.strip() + ".wav"}') 38 | 39 | -------------------------------------------------------------------------------- /data_processing/prepare_new_corpus/check_file_existence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | from tqdm import tqdm 6 | 7 | def check_file_existence(file_path): 8 | """检查文件是否存在,不存在则打印路径""" 9 | if not os.path.exists(file_path): 10 | print(f"文件不存在:{file_path}") 11 | return False 12 | return True 13 | 14 | def count_files(file_path): 15 | """读取文件,检查每一行的路径是否存在,统计总文件数和存在的文件数""" 16 | total_files = 0 17 | correct_files = 0 18 | total_duration = 0 19 | with open(file_path, 'r') as file: 20 | lines = file.readlines() 21 | total_files = len(lines) 22 | for line in tqdm(lines, desc="检查文件存在性"): 23 | path = line.strip().split('\t')[0] 24 | if check_file_existence(path): 25 | correct_files += 1 26 | total_duration += float(line.strip().split('\t')[2]) 27 | return total_files, correct_files, total_duration 28 | 29 | if __name__ == "__main__": 30 | input_file_path = "your_file_path_list" 31 | total_files, correct_files, total_duration = count_files(input_file_path) 32 | print(f"总文件数:{total_files}") 33 | print(f"正确文件数:{correct_files}") 34 | print(f"总时长:{round(total_duration / 3600, 2)}h") 35 | -------------------------------------------------------------------------------- /data_processing/prepare_new_corpus/convert_data_list_to_wenet_json.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | from tqdm import tqdm 6 | import os 7 | import string 8 | import re 9 | 10 | def remove_punctuation(input_string): 11 | """去除所有标点符号""" 12 | translation_table = str.maketrans("", "", string.punctuation + ",。、;:!?()【】『』“”《》[]{}﹙﹚﹛﹜﹝﹞〔〕〈〉") 13 | no_punct = input_string.translate(translation_table) 14 | return no_punct 15 | 16 | filename_list = ["data.list", "data_test.list", "data_train.list"] 17 | 18 | for filename in filename_list: 19 | # 读取原始文件,转换格式,写入新文件 20 | with open(filename, "r", encoding="utf-8") as f_in, \ 21 | open(filename.replace("data", "wenet"), "w", encoding="utf-8") as f_out: 22 | lines = f_in.readlines() 23 | total_lines = len(lines) 24 | for line in tqdm(lines, desc=f"Processing {filename}", unit=" lines"): 25 | 26 | # duration 字段暂时用不到 27 | path, text, _ = line.strip().split("\t") 28 | text = remove_punctuation(text) 29 | 30 | data = { 31 | "key": path, 32 | "wav": path, 33 | "txt": text 34 | } 35 | json.dump(data, f_out, ensure_ascii=False) 36 | f_out.write('\n') 37 | -------------------------------------------------------------------------------- /data_processing/prepare_new_corpus/convert_wenet_data_to_whisper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import jsonlines 6 | import string 7 | import re 8 | from tqdm import tqdm 9 | 10 | def remove_punctuation(input_string): 11 | """去除所有标点符号""" 12 | translation_table = str.maketrans("", "", string.punctuation + ",。、;:!?()【】『』“”《》[]{}﹙﹚﹛﹜﹝﹞〔〕〈〉") 13 | no_punct = input_string.translate(translation_table) 14 | return no_punct 15 | 16 | # 需要转换的文件列表 17 | data_list_filenames = ["data.list", "data_test.list", "data_train.list"] 18 | 19 | for data_list_filename in data_list_filenames: 20 | 21 | total_items = 0 22 | total_correct_items = 0 23 | total_error_items = 0 24 | total_correct_duration = 0 25 | 26 | # 将 data.list 文件转换为 new_data.jsonl 文件 27 | whisper_json_filename = data_list_filename.replace("data", "new_data").replace(".list", ".jsonl") 28 | 29 | with open(data_list_filename, "r", encoding='utf-8') as data_list_file: 30 | contents = data_list_file.readlines() 31 | total_items = len(contents) 32 | 33 | with jsonlines.open(whisper_json_filename, mode="w") as whisper_json_file: 34 | for content in tqdm(contents, desc=f"Processing {data_list_filename}", unit=" lines"): 35 | 36 | # 获取各种文件信息 37 | audio_path, text, duration = content.strip().split("\t") 38 | 39 | # 去除标点符号 40 | text = remove_punctuation(text) 41 | 42 | result_json = {"audio": {"path": audio_path}, "sentence": text, "duration": duration} 43 | whisper_json_file.write(result_json) 44 | 45 | total_correct_items += 1 46 | total_correct_duration += float(duration) 47 | 48 | print(f"file_name = {data_list_filename}") 49 | print(f"total_items = {total_items}") 50 | print(f"total_correct_items = {total_correct_items}") 51 | print(f"total_error_items = {total_error_items}") 52 | print(f"total_correct_duration = {round(total_correct_duration / 3600, 2)}h") 53 | -------------------------------------------------------------------------------- /data_processing/prepare_new_corpus/do_copy_files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sh mkdir.sh 4 | split -n l/10 cp.sh copy 5 | 6 | sh copyaa > logaa.log 2>&1 & 7 | sh copyab > logab.log 
2>&1 & 8 | sh copyac > logac.log 2>&1 & 9 | sh copyad > logad.log 2>&1 & 10 | sh copyae > logae.log 2>&1 & 11 | sh copyaf > logaf.log 2>&1 & 12 | sh copyag > logag.log 2>&1 & 13 | sh copyah > logah.log 2>&1 & 14 | sh copyai > logai.log 2>&1 & 15 | sh copyaj > logaj.log 2>&1 & 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /data_processing/prepare_new_corpus/make_copy_sh.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | # 打开原始文件和输出文件 5 | with open('your_wavpath_file', 'r') as f_original, open('copy.sh', 'w') as f_output: 6 | # 逐行读取原始文件内容 7 | for line in f_original: 8 | # 去除末尾的换行符并生成迁移后路径 9 | raw_path = line.strip() 10 | audio_path = raw_path.replace("your_old_prefix_path", "your_new_prefix_path") 11 | 12 | # 写入原始路径和新路径到输出文件 13 | f_output.write(f"cp {raw_path} {audio_path}\n") 14 | -------------------------------------------------------------------------------- /data_processing/prepare_new_corpus/make_mkdir_sh.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | 6 | # 用于存储所有.wav文件的前缀路径 7 | prefixes = set() 8 | 9 | # 打开文件并处理数据 10 | with open("copy.sh", "r") as infile: 11 | for line in infile: 12 | # 按空格分割行 13 | parts = line.strip().split(" ") 14 | # 获取新音频路径 15 | audio_path = parts[2] 16 | if audio_path.endswith(".wav"): 17 | # 提取前缀路径并添加到集合中 18 | prefix = os.path.dirname(audio_path) 19 | prefixes.add(prefix) 20 | 21 | # 将前缀路径写入新文件 22 | with open("mkdir.sh", "w") as outfile: 23 | for prefix in prefixes: 24 | outfile.write(f"mkdir -p {prefix}\n") 25 | -------------------------------------------------------------------------------- /data_processing/prepare_new_corpus/make_new_data_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from tqdm import tqdm 5 | import os 6 | 7 | # 打开原始文件和输出文件 8 | with open('your_wavpath_file', 'r') as f_paths, open('data.list', 'w') as f_output: 9 | # 使用 tqdm 包装路径读取以显示进度条 10 | for line in tqdm(f_paths, desc='Processing paths'): 11 | # 移除换行符并提取路径 12 | raw_path = line.strip() 13 | audio_path = raw_path.replace("your_old_prefix_path", "your_new_prefix_path") 14 | 15 | # 巧妙读取音频时长,数据还未迁移完,因此使用原始路径 16 | filesize = os.path.getsize(raw_path) 17 | duration = float(filesize-44) / 2 / 16000 18 | duration = round(duration, 2) 19 | 20 | # 将路径的后缀改为 .txt,找到音频对应文本 21 | text_path = raw_path.replace('.wav', '.txt') 22 | # 读取 .txt 文件的内容并移除换行符 23 | with open(text_path, 'r') as f_text: 24 | text_content = f_text.read().strip() 25 | # 将路径和文本内容以制表符分隔写入输出文件 26 | f_output.write(f"{audio_path}\t{text_content}\t{duration}\n") -------------------------------------------------------------------------------- /data_processing/prepare_new_corpus/make_train_and_test_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import random 5 | from tqdm import tqdm 6 | 7 | # 读取原始文件内容并显示进度条 8 | with open("data.list", "r") as file: 9 | lines = [line.strip() for line in tqdm(file, desc="Reading Data", unit=" lines")] 10 | 11 | # 随机抽取 20000 行作为测试数据 12 | test_lines = random.sample(lines, 20000) 13 | 14 | # 将剩余的行作为训练数据 15 | train_lines = [] 16 | for line in tqdm(lines, desc="Filtering Train Data", unit=" lines"): 17 | if line not in test_lines: 18 | 
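        # 补充说明(示意,非原脚本实现):test_lines 是含 20000 条样本的 list,
        # "line not in test_lines" 为线性查找,整体复杂度约为 O(N×20000);
        # 数据量较大时可先构造 test_set = set(test_lines),再用 "line not in test_set" 判断。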
train_lines.append(line) 19 | 20 | # 将训练数据写入文件 21 | with open("data_train.list", "w") as train_file: 22 | for line in tqdm(train_lines, desc="Writing Train Data", unit=" lines"): 23 | train_file.write(line + "\n") 24 | 25 | # 将测试数据写入文件 26 | with open("data_test.list", "w") as test_file: 27 | for line in tqdm(test_lines, desc="Writing Test Data", unit=" lines"): 28 | test_file.write(line + "\n") 29 | -------------------------------------------------------------------------------- /data_processing/prepare_new_corpus/make_train_and_test_data_existed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from tqdm import tqdm 5 | 6 | # 读取已拆分的测试集 7 | moved_filenames = set() 8 | with open("data_test_old.list", "r") as moved_file: 9 | for line in moved_file: 10 | moved_filenames.add(line.strip().split("\t")[0]) 11 | 12 | # 打开全部数据文件 13 | with open("data.list", "r") as original_file: 14 | lines = original_file.readlines() 15 | 16 | # 将测试集和训练集进行拆分 17 | train_count = 0 18 | test_count = 0 19 | with open("data_train.list", "w") as train_file, open("data_test.list", "w") as test_file: 20 | for line in tqdm(lines, desc="Processing", unit=" lines"): 21 | parts = line.strip().split("\t") 22 | filename = parts[0] 23 | if filename in moved_filenames: 24 | test_file.write(line) 25 | test_count += 1 26 | else: 27 | train_file.write(line) 28 | train_count += 1 29 | 30 | # 打印新文件的行数 31 | print("data_train.list 行数:", train_count) 32 | print("data_test.list 行数:", test_count) 33 | -------------------------------------------------------------------------------- /data_processing/prepare_new_corpus/travel_and_make_all_wavpath_txt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | from pathlib import Path 6 | from tqdm import tqdm 7 | 8 | # 定义目标目录路径 9 | target_dir = 'prefix_path_of_all_data' 10 | 11 | # 遍历目录并找出所有后缀为 .wav 的文件 12 | wav_files = [] 13 | for root, dirs, files in os.walk(target_dir): 14 | for file in tqdm(files, desc='Searching .wav files'): 15 | if file.endswith('.wav'): 16 | wav_files.append(os.path.join(root, file)) 17 | 18 | # 将所有音频绝对路径输出到文件 19 | with open('all_wavpath.txt', 'w') as f: 20 | for wav_file in tqdm(wav_files, desc='Writing paths to file'): 21 | f.write(wav_file + '\n') 22 | -------------------------------------------------------------------------------- /data_processing/wenet_data_using/data_regularization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | import jsonlines 6 | from tqdm import tqdm 7 | 8 | # 大写字母字典 9 | upper_dict = { 10 | "a": "A", 11 | "b": "B", 12 | "c": "C", 13 | "d": "D", 14 | "e": "E", 15 | "f": "F", 16 | "g": "G", 17 | "h": "H", 18 | "i": "I", 19 | "j": "J", 20 | "k": "K", 21 | "l": "L", 22 | "m": "M", 23 | "n": "N", 24 | "o": "O", 25 | "p": "P", 26 | "q": "Q", 27 | "r": "R", 28 | "s": "S", 29 | "t": "T", 30 | "u": "U", 31 | "v": "V", 32 | "w": "W", 33 | "x": "X", 34 | "y": "Y", 35 | "z": "Z" 36 | } 37 | 38 | # 中英混数据前缀 39 | zh_en_list = ["zh_en_prefix_path1", 40 | "zh_en_prefix_path2", 41 | "zh_en_prefix_path3"] 42 | 43 | # 正则化文本 dict 44 | norm_dict = {} 45 | # 加载已正则化后的数据文本 46 | for file_path in ["normed_test1.list", "normed_test2.list"]: 47 | with open(file_path, "r") as file: 48 | lines = file.readlines() 49 | for line in tqdm(lines, desc="Processing Reading", 
unit="lines"): 50 | data = json.loads(line.strip()) 51 | wav = data["wav"] 52 | text = data["txt"] 53 | norm_dict[wav] = text 54 | 55 | # 文件路径 56 | raw_file_path = "your_raw_file_path" 57 | new_file_path = "your_new_file_path" 58 | others_file_path = "your_others_file_path" 59 | 60 | get_num = 0 61 | get_duration = 0 62 | not_get_num = 0 63 | not_get_duration = 0 64 | ch_en_num = 0 65 | ch_en_duration = 0 66 | special_num = 0 67 | special_duration = 0 68 | 69 | # 逐行读取文件 70 | with open(raw_file_path, "r") as file, jsonlines.open(new_file_path, "w") as wenet_file, jsonlines.open(others_file_path, "w") as others_file: 71 | lines = file.readlines() 72 | for line in tqdm(lines, desc="Processing Checking", unit="lines"): 73 | 74 | # 解析 json 数据 75 | data = json.loads(line.strip()) 76 | key = data["key"] 77 | wav = data["wav"] 78 | text = data["txt"] 79 | 80 | # 1. 中英混数据去除 81 | pass_data = False 82 | for i in zh_en_list: 83 | if i in wav: 84 | pass_data = True 85 | # 去除数据 86 | if pass_data: 87 | ch_en_num += 1 88 | # ch_en_duration += duration 89 | continue 90 | 91 | # 2. 去除所有【含两个单词以上】的句子,单字母不算单词【逻辑不严谨,bug 未修复】 92 | text_words = text.split(" ") 93 | word_num = 0 94 | for text_word in text_words: 95 | letter_num = 0 96 | for text_letter in text_word: 97 | if text_letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": 98 | letter_num += 1 99 | if letter_num == 2: 100 | word_num += 1 101 | continue 102 | # 去除数据 103 | if word_num >= 2: 104 | print(line) 105 | special_num += 1 106 | # special_duration += duration 107 | continue 108 | 109 | # 3. 执行文本正则化替换 text 110 | text_new = norm_dict.get(wav, "NOT_EXISTES") 111 | 112 | # 3.5 英文字母小写转换为大写 113 | for i in upper_dict.keys(): 114 | text_new = text_new.replace(i, upper_dict[i]) 115 | 116 | # 4. 无条件去除所有空格 117 | text_new = text_new.replace(" ", "") 118 | 119 | # 若命中,写入 text_new 120 | if text_new != "NOT_EXISTES": 121 | get_num += 1 122 | # get_duration += duration 123 | result_json = {"key": key, "wav": wav, "txt": text_new} 124 | wenet_file.write(result_json) 125 | 126 | # 若未命中,写入 text 127 | else: 128 | not_get_num += 1 129 | # not_get_duration += duration 130 | result_json = {"key": key, "wav": wav, "txt": text} 131 | others_file.write(result_json) 132 | 133 | # 打印变量 134 | print("GET Numbers:", get_num) 135 | print("GET Duration:", round(get_duration / 3600, 2), "h") 136 | print("NOT GET Numbers:", not_get_num) 137 | print("NOT GET Duration:", round(not_get_duration / 3600, 2), "h") 138 | print("ZH EN Numbers:", ch_en_num) 139 | print("ZH EN Duration:", round(ch_en_duration / 3600, 2), "h") 140 | print("Special Numbers:", special_num) 141 | print("Special Duration:", round(special_duration / 3600, 2), "h") 142 | -------------------------------------------------------------------------------- /data_processing/wenet_data_using/data_split.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | import jsonlines 6 | from tqdm import tqdm 7 | 8 | # 需要拆分的数据前缀 9 | a_list = "prefix_data_path1" 10 | b_list = "prefix_data_path2" 11 | c_list = "prefix_data_path3" 12 | 13 | e_list = "prefix_data_path5" 14 | 15 | a_file_name = "/nfs/volume-225-14/cuichenrui/data_preparation/1.list" 16 | b_file_name = "/nfs/volume-225-14/cuichenrui/data_preparation/2.list" 17 | c_file_name = "/nfs/volume-225-14/cuichenrui/data_preparation/3.list" 18 | d_file_name = "/nfs/volume-225-14/cuichenrui/data_preparation/4.list" 19 | e_file_name = "/nfs/volume-225-14/cuichenrui/data_preparation/5.list" 20 
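# 补充说明(示意,非原脚本逻辑):按路径前缀切分出 1~5.list 后,可按 README 中
# "简单数据集在前、困难数据集在后" 的思路将各子集按难度顺序重新拼接为新的训练列表,
# 例如:cat 1.list 2.list 3.list 4.list 5.list > data_sorted.list(具体顺序为示意值)。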
| 21 | # 文件路径 22 | raw_file_path = "your_raw_file_path" 23 | 24 | # 逐行读取文件 25 | with open(raw_file_path, "r") as file, \ 26 | jsonlines.open(a_file_name, "w") as a_file, \ 27 | jsonlines.open(b_file_name, "w") as b_file, \ 28 | jsonlines.open(c_file_name, "w") as c_file, \ 29 | jsonlines.open(d_file_name, "w") as d_file, \ 30 | jsonlines.open(e_file_name, "w") as e_file: 31 | lines = file.readlines() 32 | for line in tqdm(lines, desc="Processing Checking", unit="lines"): 33 | 34 | # 解析 json 数据 35 | data = json.loads(line.strip()) 36 | key = data["key"] 37 | wav = data["wav"] 38 | text = data["txt"] 39 | 40 | result_json = {"key": key, "wav": wav, "txt": text} 41 | 42 | if a_list in wav: 43 | a_file.write(result_json) 44 | elif b_list in wav: 45 | b_file.write(result_json) 46 | elif c_list in wav: 47 | c_file.write(result_json) 48 | elif e_list in wav: 49 | e_file.write(result_json) 50 | else: 51 | d_file.write(result_json) 52 | -------------------------------------------------------------------------------- /data_processing/wenet_data_using/get_new_dict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | from tqdm import tqdm 6 | 7 | text_dict = set() 8 | 9 | with open("your_data_list_path", 'r') as f: 10 | lines = f.readlines() 11 | for line in tqdm(lines, desc="Processing", unit="lines"): 12 | data = json.loads(line) 13 | sentence = data['txt'] 14 | 15 | for i in sentence: 16 | text_dict.add(i) 17 | 18 | result = list(text_dict) 19 | 20 | # 排序便于查看,还可以根据排序后的结果一眼看出特殊字符,并重新进行数据清洗 21 | result.sort() 22 | 23 | # 添加 wenet 字典的特殊 token 24 | final_result = ["", ""] + result + ["", ""] 25 | 26 | for i in range(len(final_result)): 27 | print(final_result[i] + " " + str(i)) 28 | -------------------------------------------------------------------------------- /data_processing/whisper_data_using/data_make_aishell2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | import jsonlines 6 | from tqdm import tqdm 7 | import os 8 | 9 | data_dict = {} 10 | 11 | raw_result_file_items = 0 12 | raw_result_file_duration = 0 13 | 14 | raw_list_filename = "your_aishell2_path/wav.scp" 15 | raw_text_filename = "your_aishell2_path/trans.txt" 16 | raw_result_filename = "your_result_jsonl_path" 17 | 18 | # 处理 list 文件 19 | with open(raw_list_filename, 'r') as raw_list_file: 20 | for line in tqdm(raw_list_file, desc="Processing List", unit="lines"): 21 | key, audio_path = line.strip().split('\t') 22 | # 补充音频路径前缀 23 | audio_path = os.path.join("your_aishell2_path", audio_path) 24 | if os.path.exists(audio_path): 25 | data_dict[key] = [audio_path] 26 | else: 27 | print(f"错误路径:{audio_path}") 28 | 29 | # 处理 text 文件 30 | with open(raw_text_filename, 'r') as raw_text_file: 31 | for line in tqdm(raw_text_file, desc="Processing Text", unit="lines"): 32 | # 使用 \t 分割每一行的内容 33 | # 鲁棒解决音频无文本问题 34 | if "\t" in line: 35 | key, text = line.strip().split('\t') 36 | else: 37 | key, text = line.strip(), "" 38 | print(f"无文本音频:{line}") 39 | # 无文本音频是否进行保留 40 | # continue 41 | 42 | data_dict[key].append(text) 43 | 44 | # 数据写入新文件 45 | with jsonlines.open(raw_result_filename, mode="w") as raw_result_file: 46 | for key, value in tqdm(data_dict.items(), desc="Processing Result", unit="lines"): 47 | audio_path, text = value 48 | 49 | # 巧妙计算音频时长 50 | filesize = os.path.getsize(audio_path) 51 | duration = float(filesize-44) / 2 / 
16000 52 | duration = round(duration, 2) 53 | 54 | result_json = {"audio": {"path": audio_path}, "sentence": text, "language": "chinese", "duration": duration} 55 | 56 | # 写入文件 57 | raw_result_file.write(result_json) 58 | raw_result_file_items += 1 59 | raw_result_file_duration += duration 60 | 61 | print(raw_result_filename) 62 | print(f"items = {raw_result_file_items}") 63 | print(f"duration = {round(raw_result_file_duration / 3600, 2)}h") 64 | -------------------------------------------------------------------------------- /data_processing/whisper_data_using/data_make_kespeech.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | import jsonlines 6 | from tqdm import tqdm 7 | import os 8 | 9 | train_folder_list = ["train_phase1", "train_phase2", "dev_phase1", "dev_phase2"] 10 | wavscp_filename_example = "your_kespeech_path//wav.scp" 11 | text_filename_example = "your_kespeech_path//text" 12 | train_jsonl_filename = "your_result_jsonl_path" 13 | 14 | data_dict = {} 15 | all_train_items = 0 16 | all_train_duration = 0 17 | 18 | for train_folder in train_folder_list: 19 | 20 | wavscp_filename = wavscp_filename_example.replace("", train_folder) 21 | text_filename = text_filename_example.replace("", train_folder) 22 | 23 | # 处理 wavscp 文件 24 | with open(wavscp_filename, 'r') as wavscp_file: 25 | for line in tqdm(wavscp_file): 26 | # 使用空格分割每一行的内容 27 | key, audio_path = line.strip().split(' ') 28 | audio_path = "your_kespeech_path/" + audio_path 29 | if os.path.exists(audio_path): 30 | data_dict[key] = [audio_path] 31 | else: 32 | print(f"错误路径:{audio_path}") 33 | 34 | # 处理 text 文件 35 | with open(text_filename, 'r') as text_file: 36 | for line in tqdm(text_file): 37 | # 使用空格分割每一行的内容 38 | # 鲁棒解决音频无文本问题 39 | if " " in line: 40 | key, text = line.strip().split(' ', 1) 41 | else: 42 | key, text = line.strip(), "" 43 | print(f"无文本音频:{line}") 44 | # 无文本音频是否进行保留 45 | # continue 46 | 47 | # 这个数据集有特殊字符,进行去除 48 | text = text.replace("", "") 49 | data_dict[key].append(text) 50 | 51 | # 数据写入新文件 52 | with jsonlines.open(train_jsonl_filename, mode="w") as train_jsonl_file: 53 | 54 | for key, value in tqdm(data_dict.items()): 55 | 56 | audio_path, text = value 57 | 58 | # 巧妙计算音频时长 59 | filesize = os.path.getsize(audio_path) 60 | duration = float(filesize-44) / 2 / 16000 61 | duration = round(duration, 2) 62 | 63 | result_json = {"audio": {"path": audio_path}, "sentence": text, "language": "chinese", "duration": duration} 64 | 65 | train_jsonl_file.write(result_json) 66 | all_train_items += 1 67 | all_train_duration += duration 68 | 69 | print(f"all_train_items = {all_train_items}") 70 | print(f"all_train_duration = {round(all_train_duration / 3600, 2)}h") 71 | -------------------------------------------------------------------------------- /data_processing/whisper_data_using/data_wav_exist_checking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | 8 | # 文件路径 9 | file_path = "your_result_jsonl_path" 10 | 11 | # 初始化统计变量 12 | total_files = 0 13 | correct_files = 0 14 | incorrect_files = 0 15 | total_duration = 0 16 | correct_duration = 0 17 | incorrect_duration = 0 18 | 19 | # 逐行读取文件 20 | with open(file_path, "r") as file: 21 | lines = file.readlines() 22 | # 使用tqdm显示进度条 23 | for line in tqdm(lines, desc="Processing lines", unit="lines"): 24 | # 
解析每一行的JSON数据 25 | data = json.loads(line.strip()) 26 | audio_path = data["audio"]["path"] 27 | duration = data["duration"] 28 | 29 | # 更新总文件数和总时长 30 | total_files += 1 31 | total_duration += duration 32 | 33 | # 检查文件是否存在 34 | if os.path.exists(audio_path): 35 | correct_files += 1 36 | correct_duration += duration 37 | else: 38 | incorrect_files += 1 39 | incorrect_duration += duration 40 | 41 | # 输出结果 42 | print("Total files:", total_files) 43 | print("Correct files:", correct_files) 44 | print("Incorrect files:", incorrect_files) 45 | print("Total duration:", total_duration) 46 | print("Correct duration:", correct_duration) 47 | print("Incorrect duration:", incorrect_duration) 48 | -------------------------------------------------------------------------------- /data_processing/whisper_data_using/decode_change_general_to_8k.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import librosa 4 | import soundfile as sf 5 | import json 6 | import jsonlines 7 | 8 | # 文件路径 9 | input_file_path = "/nfs/volume-225-14/cuichenrui/dataset/02_general/general.jsonl" 10 | output_file_path = "/nfs/volume-225-14/cuichenrui/dataset/02_general/general_8k.jsonl" 11 | output_directory = "/ofs/speechssd/datasets/opensource_data/ASR/AISHELL_MAGICDATA_8K" 12 | audio_datas = [] 13 | 14 | # 创建输出目录(如果不存在的话) 15 | os.makedirs(output_directory, exist_ok=True) 16 | 17 | # 读取文件路径 18 | with open(input_file_path, 'r') as f: 19 | lines = f.readlines() 20 | for line in tqdm(lines, desc="read_old_json", unit="lines"): 21 | data = json.loads(line) 22 | audio_path = data['audio']['path'] 23 | sentence = data['sentence'] 24 | duration = data['duration'] 25 | audio_datas.append((audio_path, sentence, duration)) 26 | 27 | # 处理并写入新文件 28 | with jsonlines.open(output_file_path, mode="w") as output_file: 29 | # 处理每一个音频文件 30 | for audio_data in tqdm(audio_datas, desc="write_new_json", unit="lines"): 31 | audio_path = audio_data[0].strip() # 去除行尾的换行符 32 | 33 | # 读取原始音频 34 | data, sr = librosa.load(audio_path, sr=None) 35 | 36 | # 检查采样率是否是16kHz 37 | if sr != 16000: 38 | print(f"Skipping {audio_path}, not 16kHz sample rate.") 39 | continue 40 | 41 | # 降采样到8kHz 42 | downsampled_data = librosa.resample(data, orig_sr=sr, target_sr=8000) 43 | 44 | # 获取新文件路径 45 | middle_path = os.path.dirname(audio_path.split("openSLR/")[1]) 46 | base_name = os.path.basename(audio_path) 47 | new_folder_path = os.path.join(output_directory, middle_path) 48 | # 创建输出目录(如果不存在的话) 49 | os.makedirs(new_folder_path, exist_ok=True) 50 | new_file_path = os.path.join(new_folder_path, base_name) 51 | 52 | # 保存新文件 53 | sf.write(new_file_path, downsampled_data, 8000) 54 | # 写入json文件 55 | result_json = {"audio": {"path": new_file_path}, "sentence": audio_data[1], "language": "chinese", "duration": audio_data[2]} 56 | output_file.write(result_json) -------------------------------------------------------------------------------- /data_processing/whisper_data_using/decode_data_preparation.py: -------------------------------------------------------------------------------- 1 | # 2024_02_20 2 | # 数据准备脚本,将data.list转化为whisper可读的json文件 3 | 4 | data_list_filenames = ["data_out.list"] 5 | 6 | whisper_json_filename = "shunfengche.jsonl" 7 | 8 | import json 9 | import jsonlines 10 | import soundfile 11 | import string 12 | import tqdm 13 | 14 | def remove_punctuation(input_string): 15 | """去除所有标点符号""" 16 | translation_table = str.maketrans("", "", string.punctuation + ",。、;:!?()【】『』“”《》[]{}﹙﹚﹛﹜﹝﹞〔〕〈〉") 17 | 
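    # 举例说明(仅为注释示例):"你好,世界!Hello, world." 经下面的 translate 后
    # 变为 "你好世界Hello world":中英文标点均被删除,空格与其余字符保持不变。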
no_punct = input_string.translate(translation_table) 18 | return no_punct 19 | 20 | total_items = 0 21 | total_correct_items = 0 22 | total_error_items = 0 23 | total_correct_duration = 0 24 | progress = 0 25 | 26 | for data_list_filename in data_list_filenames: 27 | print(f"处理文件: {data_list_filename}") 28 | with open(data_list_filename, "r", encoding='utf-8') as data_list_file: 29 | contents = data_list_file.readlines() 30 | 31 | total_items = len(contents) 32 | 33 | with jsonlines.open(whisper_json_filename, mode="a") as whisper_json_file: 34 | for content in contents: 35 | progress += 1 36 | if progress % 100 == 0: 37 | print(f"{progress} / {total_items}") 38 | 39 | audio_path, text = content.strip().split("\t") 40 | try: 41 | text = remove_punctuation(text) 42 | sample, sr = soundfile.read(audio_path) 43 | duration = round(sample.shape[-1] / float(sr), 2) 44 | result_json = {"audio": {"path": audio_path}, "sentence": text, "duration": duration} 45 | whisper_json_file.write(result_json) 46 | 47 | total_correct_items += 1 48 | total_correct_duration += duration 49 | 50 | except Exception as e: 51 | print("error audio path :" + audio) 52 | total_error_items += 1 53 | 54 | print(f"total_items = {total_items}") 55 | print(f"total_correct_items = {total_correct_items}") 56 | print(f"total_error_items = {total_error_items}") 57 | print(f"total_correct_duration = {round(total_correct_duration / 3600, 2)}h") -------------------------------------------------------------------------------- /data_processing/whisper_data_using/decode_data_preparation_for_customer_service.py: -------------------------------------------------------------------------------- 1 | # 2024_02_20 2 | # 数据准备脚本,将data.list转化为whisper可读的json文件 3 | 4 | data_list_filenames = ["data_out.list"] 5 | 6 | whisper_json_filename = "shunfengche.jsonl" 7 | 8 | import json 9 | import jsonlines 10 | import soundfile 11 | import string 12 | import tqdm 13 | 14 | def remove_punctuation(input_string): 15 | """去除所有标点符号""" 16 | translation_table = str.maketrans("", "", string.punctuation + ",。、;:!?()【】『』“”《》[]{}﹙﹚﹛﹜﹝﹞〔〕〈〉") 17 | no_punct = input_string.translate(translation_table) 18 | return no_punct 19 | 20 | total_items = 0 21 | total_correct_items = 0 22 | total_error_items = 0 23 | total_correct_duration = 0 24 | progress = 0 25 | 26 | for data_list_filename in data_list_filenames: 27 | print(f"处理文件: {data_list_filename}") 28 | with open(data_list_filename, "r", encoding='utf-8') as data_list_file: 29 | contents = data_list_file.readlines() 30 | 31 | total_items = len(contents) 32 | 33 | with jsonlines.open(whisper_json_filename, mode="a") as whisper_json_file: 34 | for content in contents: 35 | progress += 1 36 | if progress % 100 == 0: 37 | print(f"{progress} / {total_items}") 38 | 39 | # audio_path, text = content.strip().split("\t") 40 | 41 | parts = content.strip().split("\t") 42 | if len(parts) == 2: 43 | text = parts[1] 44 | else: 45 | text = "" 46 | audio_path = parts[0] 47 | 48 | try: 49 | text = remove_punctuation(text) 50 | # sample, sr = soundfile.read(audio_path) 51 | # duration = round(sample.shape[-1] / float(sr), 2) 52 | duration = 1.00 53 | result_json = {"audio": {"path": audio_path}, "sentence": text, "duration": duration} 54 | whisper_json_file.write(result_json) 55 | 56 | total_correct_items += 1 57 | total_correct_duration += duration 58 | 59 | except Exception as e: 60 | print("error audio path :" + audio) 61 | total_error_items += 1 62 | 63 | print(f"total_items = {total_items}") 64 | print(f"total_correct_items = 
{total_correct_items}") 65 | print(f"total_error_items = {total_error_items}") 66 | print(f"total_correct_duration = {round(total_correct_duration / 3600, 2)}h") -------------------------------------------------------------------------------- /data_processing/whisper_data_using/decode_data_preparation_for_speechio.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | import jsonlines 6 | from tqdm import tqdm 7 | import os 8 | 9 | filename_list = ["SPEECHIO_ASR_ZH00000", 10 | "SPEECHIO_ASR_ZH00001", 11 | "SPEECHIO_ASR_ZH00002", 12 | "SPEECHIO_ASR_ZH00003", 13 | "SPEECHIO_ASR_ZH00004", 14 | "SPEECHIO_ASR_ZH00005"] 15 | 16 | for i, filename in enumerate(filename_list): 17 | 18 | total_items = 0 19 | total_correct_items = 0 20 | total_error_items = 0 21 | total_correct_duration = 0 22 | 23 | metadata_filename = f"/ofs/speechssd/datasets/opensource_data/ASR/{filename}/metadata.tsv" 24 | whisper_json_filename = f"/nfs/volume-225-14/cuichenrui/dataset/12_speechio/0{i+1}_speechio0{i}/speechio0{i}.jsonl" 25 | 26 | with open(metadata_filename, "r") as metadata_file: 27 | 28 | contents = metadata_file.readlines() 29 | total_items = len(contents) 30 | 31 | with jsonlines.open(whisper_json_filename, mode="w") as whisper_json_file: 32 | 33 | for j, content in enumerate(tqdm(contents, desc="Processing contents")): 34 | 35 | # 跳过 csv 文件第一行 36 | if j == 0: 37 | continue 38 | 39 | _, audio_path, duration, text = content.strip().split("\t") 40 | 41 | audio_path = metadata_filename[:-12] + audio_path 42 | if os.path.exists(audio_path): 43 | duration = float(duration) 44 | result_json = {"audio": {"path": audio_path}, "sentence": text, "language": "chinese", "duration": duration} 45 | whisper_json_file.write(result_json) 46 | 47 | total_correct_items += 1 48 | total_correct_duration += duration 49 | 50 | else: 51 | print("error audio path :" + audio_pathio) 52 | total_error_items += 1 53 | 54 | print(f"total_items = {total_items}") 55 | print(f"total_correct_items = {total_correct_items}") 56 | print(f"total_error_items = {total_error_items}") 57 | print(f"total_correct_duration = {round(total_correct_duration / 3600, 2)}h") -------------------------------------------------------------------------------- /data_processing/whisper_data_using/decode_wenetlist2datalist.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | input_file = "data.list" 4 | output_file = "data_out.list" 5 | 6 | with open(input_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out: 7 | for line in f_in: 8 | # 解析每一行 JSON 数据 9 | data = json.loads(line.strip()) 10 | 11 | # 提取 "wav" 和 "txt" 字段的值 12 | wav_value = data.get("wav", "") 13 | txt_value = data.get("txt", "") 14 | 15 | # 写入新文件,用 \t 分隔 16 | f_out.write(f"{wav_value}\t{txt_value}\n") 17 | 18 | -------------------------------------------------------------------------------- /data_processing/whisper_data_using/deocde_json2qwen.py: -------------------------------------------------------------------------------- 1 | # 2024_02_21 2 | # 将whisper的jsonl格式转换为qwen的jsonl格式 3 | 4 | input_filename = "map.jsonl" 5 | output_filename = "qwen_map.jsonl" 6 | 7 | import jsonlines 8 | 9 | # 读取原始文件并转换格式 10 | converted_data = [] 11 | with jsonlines.open(input_filename, "r") as input_file: 12 | for line in input_file: 13 | converted_data.append({ 14 | "audio": line["audio"]["path"], 15 | "text": line["sentence"] 
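            # 转换后每行的形式(路径与文本为假设示例):{"audio": "/your_audio_file/1.wav", "text": "音频对应文本"}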
16 | }) 17 | 18 | # 将转换后的数据写入新文件 19 | with jsonlines.open(output_filename, "w") as output_file: 20 | for data in converted_data: 21 | output_file.write(data) 22 | -------------------------------------------------------------------------------- /data_processing/whisper_data_using/regularization_MyEnglishTextNormalizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | class EnglishTextNormalizer: 4 | def __init__(self): 5 | self.replacers = { 6 | # 常见的缩写形式替换为其完整形式 7 | r"\bwon't\b": "will not", 8 | r"\bcan't\b": "can not", 9 | r"\blet's\b": "let us", 10 | r"\bain't\b": "aint", 11 | r"\by'all\b": "you all", 12 | r"\bwanna\b": "want to", 13 | r"\bgotta\b": "got to", 14 | r"\bgonna\b": "going to", 15 | r"\bi'ma\b": "i am going to", 16 | r"\bimma\b": "i am going to", 17 | r"\bwoulda\b": "would have", 18 | r"\bcoulda\b": "could have", 19 | r"\bshoulda\b": "should have", 20 | r"\bma'am\b": "madam", 21 | # 头衔或前缀的缩写替换 22 | r"\bmr\b": "mister ", 23 | r"\bmrs\b": "missus ", 24 | r"\bst\b": "saint ", 25 | r"\bdr\b": "doctor ", 26 | r"\bprof\b": "professor ", 27 | r"\bcapt\b": "captain ", 28 | r"\bgov\b": "governor ", 29 | r"\bald\b": "alderman ", 30 | r"\bgen\b": "general ", 31 | r"\bsen\b": "senator ", 32 | r"\brep\b": "representative ", 33 | r"\bpres\b": "president ", 34 | r"\brev\b": "reverend ", 35 | r"\bhon\b": "honorable ", 36 | r"\basst\b": "assistant ", 37 | r"\bassoc\b": "associate ", 38 | r"\blt\b": "lieutenant ", 39 | r"\bcol\b": "colonel ", 40 | r"\bjr\b": "junior ", 41 | r"\bsr\b": "senior ", 42 | r"\besq\b": "esquire ", 43 | # 过去分词形式的缩写替换 44 | r"'d been\b": " had been", 45 | r"'s been\b": " has been", 46 | r"'d gone\b": " had gone", 47 | r"'s gone\b": " has gone", 48 | r"'d done\b": " had done", # "'s done"可能有歧义 49 | r"'s got\b": " has got", 50 | # 一般性的缩写替换 51 | r"n't\b": " not", 52 | r"'re\b": " are", 53 | r"'s\b": " is", 54 | r"'d\b": " would", 55 | r"'ll\b": " will", 56 | r"'t\b": " not", 57 | r"'ve\b": " have", 58 | r"'m\b": " am", 59 | } 60 | 61 | def __call__(self, s: str): 62 | 63 | # 全部文本小写 64 | s = s.lower() 65 | # 当撇号前有空白字符时,移除空白字符 66 | s = re.sub(r"\s+'", "'", s) 67 | 68 | # 执行上述字典的替换操作 69 | for pattern, replacement in self.replacers.items(): 70 | s = re.sub(pattern, replacement, s) 71 | 72 | # 将连续的空白字符替换为单个空格 73 | s = re.sub(r"\s+", " ", s) 74 | 75 | return s -------------------------------------------------------------------------------- /data_processing/whisper_data_using/regularization_check_data.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import jsonlines 4 | from tqdm import tqdm 5 | 6 | quanjiao_dict = { 7 | "0": "0", 8 | "1": "1", 9 | "2": "2", 10 | "3": "3", 11 | "4": "4", 12 | "5": "5", 13 | "6": "6", 14 | "7": "7", 15 | "8": "8", 16 | "9": "9", 17 | "A": "A", 18 | "B": "B", 19 | "C": "C", 20 | "D": "D", 21 | "E": "E", 22 | "F": "F", 23 | "G": "G", 24 | "H": "H", 25 | "I": "I", 26 | "J": "J", 27 | "K": "K", 28 | "L": "L", 29 | "M": "M", 30 | "N": "N", 31 | "O": "O", 32 | "P": "P", 33 | "Q": "Q", 34 | "R": "R", 35 | "S": "S", 36 | "T": "T", 37 | "U": "U", 38 | "V": "V", 39 | "W": "W", 40 | "X": "X", 41 | "Y": "Y", 42 | "Z": "Z", 43 | "a": "a", 44 | "b": "b", 45 | "c": "c", 46 | "d": "d", 47 | "e": "e", 48 | "f": "f", 49 | "g": "g", 50 | "h": "h", 51 | "i": "i", 52 | "j": "j", 53 | "k": "k", 54 | "l": "l", 55 | "m": "m", 56 | "n": "n", 57 | "o": "o", 58 | "p": "p", 59 | "q": "q", 60 | "r": "r", 61 | "s": "s", 62 | "t": "t", 63 | "u": "u", 64 
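    # 备注(示意,非原脚本实现):全角转半角也可用区间映射一次性生成,
    # 例如 str.maketrans({chr(0xFF01 + i): chr(0x21 + i) for i in range(94)}),
    # 即 U+FF01~U+FF5E 与 ASCII 0x21~0x7E 一一对应。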
| "v": "v", 65 | "w": "w", 66 | "x": "x", 67 | "y": "y", 68 | "z": "z" 69 | } 70 | 71 | # 定义文件路径 72 | file_path = "/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune_2.0/2_special_characters.txt" 73 | 74 | # 初始化一个空列表 75 | special_characters_list = [] 76 | special_characters_path = 77 | # 打开文件并读取每一行 78 | with open(file_path, 'r', encoding='utf-8') as file: 79 | for line in file: 80 | # 有的特殊字符就是空白,不能直接strip() 81 | line = line.replace("\n", "") 82 | special_characters_list.append(line) 83 | 84 | with jsonlines.open("train_all_new_checked_without_8k_regularization_2.jsonl", mode="w") as output_file, open("train_all_new_checked_without_8k_regularization.jsonl", 'r') as f: 85 | lines = f.readlines() 86 | for line in tqdm(lines, desc="Processing", unit="lines"): 87 | pass_data = False 88 | data = json.loads(line) 89 | audio_path = data['audio']['path'] 90 | sentence = data['sentence'] 91 | duration = data['duration'] 92 | language = data['language'] 93 | 94 | sentence = sentence.strip() 95 | # 去除几个中文标点 96 | for i in ["。", "?", "﹐", ","]: 97 | sentence = sentence.replace(i, "") 98 | # 全角字符转化为半角字符 99 | for i in quanjiao_dict.keys(): 100 | sentence = sentence.replace(i, quanjiao_dict[i]) 101 | 102 | result_json = {"audio": {"path": audio_path}, "sentence": sentence, "language": language, "duration": duration} 103 | 104 | for i in special_characters_list: 105 | # 文本中有特殊字符 106 | if i in sentence: 107 | print(result_json) 108 | pass_data = True 109 | break 110 | 111 | if not pass_data: 112 | output_file.write(result_json) 113 | -------------------------------------------------------------------------------- /data_processing/whisper_data_using/regularization_get_20000_items.py: -------------------------------------------------------------------------------- 1 | import random 2 | from tqdm import tqdm 3 | 4 | # 生成20000个随机数并排序 5 | random_numbers = [random.randint(0, 8000) for _ in range(20000)] 6 | line_number = 0 7 | 8 | # 逐行读取两个文件的内容,并将对应行写入到shuffle_10000.jsonl文件中 9 | with open('train_all_new_checked_without_8k.jsonl', 'r') as file1, \ 10 | open('train_all_new_checked_without_8k_regularization.jsonl', 'r') as file2, \ 11 | open('shuffle_20000.jsonl', 'w') as shuffle_file: 12 | 13 | for number in tqdm(random_numbers, desc="Processing", unit=" lines"): 14 | 15 | line_number += number + 1 16 | shuffle_file.write("行号:" + str(line_number) + "\n") 17 | # 跳过file1前number-1行 18 | for _ in range(number): 19 | next(file1) 20 | # 读取file1的第number行并写入到shuffle_file 21 | shuffle_file.write(next(file1)) 22 | 23 | # 跳过file2前number-1行 24 | for _ in range(number): 25 | next(file2) 26 | # 读取file2的第number行并写入到shuffle_file 27 | shuffle_file.write(next(file2)) 28 | -------------------------------------------------------------------------------- /data_processing/whisper_data_using/regularization_text_check_dict.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | 4 | text_dict = set() 5 | 6 | with open("train_all_new_checked_without_8k.jsonl", 'r') as f: 7 | lines = f.readlines() 8 | for line in tqdm(lines, desc="Processing", unit="lines"): 9 | data = json.loads(line) 10 | sentence = data['sentence'] 11 | 12 | for i in sentence: 13 | text_dict.add(i) 14 | 15 | result = list(text_dict) 16 | result.sort() 17 | for i in result: 18 | print(i) 19 | -------------------------------------------------------------------------------- /data_processing/whisper_data_using/regularization_text_regularization.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import jsonlines 4 | from tqdm import tqdm 5 | 6 | import jiwer 7 | from MyEnglishTextNormalizer import EnglishTextNormalizer 8 | 9 | def normalize_text(text): 10 | 11 | text = text.strip() 12 | alpha = "'-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" 13 | normalized_text = "" 14 | 15 | fill_blanks = False 16 | # 处理每个字符 17 | for i in range(len(text)): 18 | # 如果是空格 19 | if text[i] == " ": 20 | # 如果空格左右有一侧是英文字母则保留 21 | # 空格左边右边一定有字符,[i-1][i+1]不用考虑越界 22 | if text[i-1] in alpha or text[i+1] in alpha: 23 | normalized_text += text[i] 24 | 25 | # 如果 i 不是最后一个字符;i+1 不是空格;i 和 i+1 一英一中 26 | elif (i+1 < len(text)) and (text[i+1] != " ") and ((text[i] in alpha and text[i+1] not in alpha) or (text[i+1] in alpha and text[i] not in alpha)): 27 | fill_blanks = True 28 | normalized_text += text[i] 29 | normalized_text += " " 30 | 31 | # 如果非空格或英文字母,不做改变 32 | else: 33 | normalized_text += text[i] 34 | 35 | if fill_blanks: 36 | print(f"原始文本:{text}") 37 | print(f"补空格后:{normalized_text}") 38 | 39 | return normalized_text 40 | 41 | normalizer = EnglishTextNormalizer() 42 | 43 | with jsonlines.open("train_all_new_checked_without_8k_regularization_3.jsonl", mode="w") as output_file, open("train_all_new_checked_without_8k_regularization_2.jsonl", 'r') as f: 44 | lines = f.readlines() 45 | for line in tqdm(lines, desc="Processing", unit="lines"): 46 | data = json.loads(line) 47 | audio_path = data['audio']['path'] 48 | sentence = data['sentence'] 49 | duration = data['duration'] 50 | language = data['language'] 51 | 52 | sentence1 = normalizer(sentence) 53 | sentence2 = normalize_text(sentence1) 54 | 55 | result_json = {"audio": {"path": audio_path}, "sentence": sentence2, "language": language, "duration": duration} 56 | output_file.write(result_json) -------------------------------------------------------------------------------- /espnet_using/README.md: -------------------------------------------------------------------------------- 1 | # Espnet 语音框架的使用 2 | 3 | ## ⌛️ TODO 4 | -------------------------------------------------------------------------------- /fairseq_using/README.md: -------------------------------------------------------------------------------- 1 | # Fairseq 语音框架的使用 2 | 3 | ## ⌛️ TODO 4 | -------------------------------------------------------------------------------- /faster_whisper_using/README.md: -------------------------------------------------------------------------------- 1 | ## faster-whisper 语音框架的使用 2 | 3 | * [Github 开源地址](https://github.com/SYSTRAN/faster-whisper) 4 | 5 | * [Hugging face 模型地址](https://huggingface.co/Systran) 6 | 7 | faster-whisper 是基于 OpenAI 的 Whisper 模型的高效实现,它利用 CTranslate2,一个专为 Transformer 模型设计的快速推理引擎。这种实现不仅提高了语音识别的速度,还优化了内存使用效率。faster-whisper 的核心优势在于其能够在保持原有模型准确度的同时,大幅提升处理速度,这使得它在处理大规模语音数据时更加高效。 8 | 9 | 就我个人而言,是在学习使用 Whisper 框架中了解到 faster-whisper 的,其性能确实非常高效,相比于 openai-whisper 还实现了一些额外的功能。faster-whisper 的相关功能和解码超参数可以参照本仓库的 ```faster_whisper_using/faster_whisper_hyper_parameters.md``` 文件,里面是我 debug 这个框架的一些分析思考,过程中 vscode 使用的 ```launch.json``` 文件也附在了 ```faster_whisper_using/vscode/launch.json``` 。 10 | 11 | 目录 ```faster-whisper``` 是我于 2024_05_11 在上述开源仓库克隆的版本 0.10.1。该仓库还在不断更新,不过基本框架没有太大变动。文件夹中的 ```faster-whisper/convert.sh``` , ```faster-whisper/evaluate.sh``` 和 ```faster-whisper/evaluate.py``` 文件是我自己写的解码相关代码。```faster-whisper/wenet_utils``` 是解码后处理的一些文件,拷贝自 wenet 框架,详细介绍可以参见本仓库 ```wenet_using``` 文件夹。 12 | 13 | 解码使用的数据集格式如下: 14 | 15 | ``` 16 
| # 数据列表采用 jsonl 格式,每行是一个 dict 元素 17 | # dict 中 duration 键对应的值为 float 格式,其余值均为 str 格式 18 | # dict 中 language 和 duration 对应的值其实是没有用上的,但是为了整个框架数据的统一性,还是放在了数据集中 19 | # jsonl 文件处理脚本可以参考本仓库的 data_processing 文件夹 20 | # 数据列表格式沿用的是 whisper-finetune 项目框架,可以参考本仓库的 whisper_finetune 文件夹 21 | 22 | {"audio": {"path": /your_audio_file/1.wav}, "sentence": audio_transcribe_text, "language": "your_audio_language", "duration": your_audio_duration} 23 | {"audio": {"path": /your_audio_file/2.wav}, "sentence": audio_transcribe_text, "language": "your_audio_language", "duration": your_audio_duration} 24 | {"audio": {"path": /your_audio_file/3.wav}, "sentence": audio_transcribe_text, "language": "your_audio_language", "duration": your_audio_duration} 25 | 26 | ... ... ... ... 27 | 28 | {"audio": {"path": /your_audio_file/10000.wav}, "sentence": audio_transcribe_text, "language": "your_audio_language", "duration": your_audio_duration} 29 | ``` 30 | 31 | 执行脚本: 32 | 33 | 先执行 checkpoint 转换脚本,将 hugging face 格式的 checkpoint 转换为 faster-whisper 使用的 checkpoint: 34 | 35 | ``` 36 | . faster-whisper/convert.sh 37 | ``` 38 | 39 | 然后执行解码脚本,该脚本执行解码、文本正则化、计算字错误率过程,另需要单独安装 wenet 环境,相关细节可以参见本仓库 ```wenet_using``` 文件夹: 40 | 41 | ``` 42 | . faster-whisper/evaluate.sh 43 | ``` 44 | 45 | 这个脚本可以实现批量数据集,批量 checkpoint 的解码测试。测试程序不仅生成识别结果,同时还生成了语种识别的准确率和 rtf。缺点是解码的 batch_size 只能为 1 (faster-whisper框架所限制),导致用一些高性能卡时显存和算力都跑不满,不过可以单卡同时提交多个解码进程,具体细节再次不作赘述。 46 | 47 | 别的细节就没什么了,这个框架主要就是提供一个接口来加速 Whisper 的解码速度,不过 faster-whisper 的 VAD 时间戳好像还挺准的,有人通过这个框架实现了自动剪辑并生成字幕,还是挺有意思的,感兴趣的同学可以去了解一下。 -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to faster-whisper 2 | 3 | Contributions are welcome! Here are some pointers to help you install the library for development and validate your changes before submitting a pull request. 4 | 5 | ## Install the library for development 6 | 7 | We recommend installing the module in editable mode with the `dev` extra requirements: 8 | 9 | ```bash 10 | git clone https://github.com/SYSTRAN/faster-whisper.git 11 | cd faster-whisper/ 12 | pip install -e .[dev] 13 | ``` 14 | 15 | ## Validate the changes before creating a pull request 16 | 17 | 1. Make sure the existing tests are still passing (and consider adding new tests as well!): 18 | 19 | ```bash 20 | pytest tests/ 21 | ``` 22 | 23 | 2. Reformat and validate the code with the following tools: 24 | 25 | ```bash 26 | black . 27 | isort . 28 | flake8 . 29 | ``` 30 | 31 | These steps are also run automatically in the CI when you open the pull request. 
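上文 faster_whisper_using/README.md 介绍了用 convert.sh 转换 checkpoint、再用 evaluate.sh 批量解码的流程,下面补充一个单条音频解码的最小示意(并非 evaluate.py 的原始实现,模型目录与音频路径均为占位值):

```python
from faster_whisper import WhisperModel

# 模型目录指向 convert.sh 转换得到的 CTranslate2 格式 checkpoint(占位路径)
model = WhisperModel("your_converted_model_dir", device="cuda", compute_type="float16")

# 逐条转写(解码 batch_size 为 1);info 中带有语种识别结果,可用于统计语种准确率
segments, info = model.transcribe("/your_audio_file/1.wav", beam_size=5)
print("检测语种:", info.language, info.language_probability)
print("识别文本:", "".join(segment.text for segment in segments))
```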
32 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 SYSTRAN 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include faster_whisper/assets/silero_vad.onnx 2 | include requirements.txt 3 | include requirements.conversion.txt 4 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/benchmark/benchmark.m4a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cuichenrui2000/barry_speech_tools/4780bd2c0a578821a3bb782240b58e7c0ca44ac7/faster_whisper_using/faster-whisper/benchmark/benchmark.m4a -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/benchmark/memory_benchmark.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | 4 | from typing import Callable 5 | 6 | import py3nvml.py3nvml as nvml 7 | 8 | from memory_profiler import memory_usage 9 | from utils import MyThread, get_logger, inference 10 | 11 | logger = get_logger("faster-whisper") 12 | parser = argparse.ArgumentParser(description="Memory benchmark") 13 | parser.add_argument( 14 | "--gpu_memory", action="store_true", help="Measure GPU memory usage" 15 | ) 16 | parser.add_argument("--device-index", type=int, default=0, help="GPU device index") 17 | parser.add_argument( 18 | "--interval", 19 | type=float, 20 | default=0.5, 21 | help="Interval at which measurements are collected", 22 | ) 23 | args = parser.parse_args() 24 | device_idx = args.device_index 25 | interval = args.interval 26 | 27 | 28 | def measure_memory(func: Callable[[], None]): 29 | if args.gpu_memory: 30 | logger.info( 31 | "Measuring maximum GPU memory usage on GPU device." 32 | " Make sure to not have additional processes running on the same GPU." 
33 | ) 34 | # init nvml 35 | nvml.nvmlInit() 36 | handle = nvml.nvmlDeviceGetHandleByIndex(device_idx) 37 | gpu_name = nvml.nvmlDeviceGetName(handle) 38 | gpu_memory_limit = nvml.nvmlDeviceGetMemoryInfo(handle).total >> 20 39 | gpu_power_limit = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0 40 | info = {"gpu_memory_usage": [], "gpu_power_usage": []} 41 | 42 | def _get_gpu_info(): 43 | while True: 44 | info["gpu_memory_usage"].append( 45 | nvml.nvmlDeviceGetMemoryInfo(handle).used >> 20 46 | ) 47 | info["gpu_power_usage"].append( 48 | nvml.nvmlDeviceGetPowerUsage(handle) / 1000 49 | ) 50 | time.sleep(interval) 51 | 52 | if stop: 53 | break 54 | 55 | return info 56 | 57 | stop = False 58 | thread = MyThread(_get_gpu_info, params=()) 59 | thread.start() 60 | func() 61 | stop = True 62 | thread.join() 63 | result = thread.get_result() 64 | 65 | # shutdown nvml 66 | nvml.nvmlShutdown() 67 | max_memory_usage = max(result["gpu_memory_usage"]) 68 | max_power_usage = max(result["gpu_power_usage"]) 69 | print("GPU name: %s" % gpu_name) 70 | print("GPU device index: %s" % device_idx) 71 | print( 72 | "Maximum GPU memory usage: %dMiB / %dMiB (%.2f%%)" 73 | % ( 74 | max_memory_usage, 75 | gpu_memory_limit, 76 | (max_memory_usage / gpu_memory_limit) * 100, 77 | ) 78 | ) 79 | print( 80 | "Maximum GPU power usage: %dW / %dW (%.2f%%)" 81 | % ( 82 | max_power_usage, 83 | gpu_power_limit, 84 | (max_power_usage / gpu_power_limit) * 100, 85 | ) 86 | ) 87 | else: 88 | logger.info("Measuring maximum increase of memory usage.") 89 | max_usage = memory_usage(func, max_usage=True, interval=interval) 90 | print("Maximum increase of RAM memory usage: %d MiB" % max_usage) 91 | 92 | 93 | if __name__ == "__main__": 94 | measure_memory(inference) 95 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/benchmark/requirements.benchmark.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | jiwer 3 | evaluate 4 | datasets 5 | memory_profiler 6 | py3nvml 7 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/benchmark/speed_benchmark.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import timeit 3 | 4 | from typing import Callable 5 | 6 | from utils import inference 7 | 8 | parser = argparse.ArgumentParser(description="Speed benchmark") 9 | parser.add_argument( 10 | "--repeat", 11 | type=int, 12 | default=3, 13 | help="Times an experiment will be run.", 14 | ) 15 | args = parser.parse_args() 16 | 17 | 18 | def measure_speed(func: Callable[[], None]): 19 | # as written in https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat, 20 | # min should be taken rather than the average 21 | runtimes = timeit.repeat( 22 | func, 23 | repeat=args.repeat, 24 | number=10, 25 | ) 26 | print(runtimes) 27 | print("Min execution time: %.3fs" % (min(runtimes) / 10.0)) 28 | 29 | 30 | if __name__ == "__main__": 31 | measure_speed(inference) 32 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/benchmark/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from threading import Thread 4 | from typing import Optional 5 | 6 | from faster_whisper import WhisperModel 7 | 8 | model_path = "large-v3" 9 | model = WhisperModel(model_path, device="cuda") 10 | 11 
| 12 | def inference(): 13 | segments, info = model.transcribe("benchmark.m4a", language="fr") 14 | for segment in segments: 15 | print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) 16 | 17 | 18 | def get_logger(name: Optional[str] = None) -> logging.Logger: 19 | formatter = logging.Formatter("%(levelname)s: %(message)s") 20 | logger = logging.getLogger(name) 21 | logger.setLevel(logging.DEBUG) 22 | handler = logging.StreamHandler() 23 | handler.setFormatter(formatter) 24 | logger.addHandler(handler) 25 | return logger 26 | 27 | 28 | class MyThread(Thread): 29 | def __init__(self, func, params): 30 | super(MyThread, self).__init__() 31 | self.func = func 32 | self.params = params 33 | self.result = None 34 | 35 | def run(self): 36 | self.result = self.func(*self.params) 37 | 38 | def get_result(self): 39 | return self.result 40 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/benchmark/wer_benchmark.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from datasets import load_dataset 5 | from evaluate import load 6 | from tqdm import tqdm 7 | from transformers.models.whisper.english_normalizer import EnglishTextNormalizer 8 | 9 | from faster_whisper import WhisperModel 10 | 11 | parser = argparse.ArgumentParser(description="WER benchmark") 12 | parser.add_argument( 13 | "--audio_numb", 14 | type=int, 15 | default=None, 16 | help="Specify the number of validation audio files in the dataset." 17 | " Set to None to retrieve all audio files.", 18 | ) 19 | args = parser.parse_args() 20 | 21 | model_path = "large-v3" 22 | model = WhisperModel(model_path, device="cuda") 23 | 24 | # load the dataset with streaming mode 25 | dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True) 26 | 27 | # define the evaluation metric 28 | wer_metric = load("wer") 29 | normalizer = EnglishTextNormalizer(json.load(open("normalizer.json"))) 30 | 31 | 32 | def inference(batch): 33 | batch["transcription"] = [] 34 | for sample in batch["audio"]: 35 | segments, info = model.transcribe(sample["array"], language="en") 36 | batch["transcription"].append("".join([segment.text for segment in segments])) 37 | batch["reference"] = batch["text"] 38 | return batch 39 | 40 | 41 | dataset = dataset.map(function=inference, batched=True, batch_size=16) 42 | 43 | all_transcriptions = [] 44 | all_references = [] 45 | 46 | # iterate over the dataset and run inference 47 | for i, result in tqdm(enumerate(dataset), desc="Evaluating..."): 48 | all_transcriptions.append(result["transcription"]) 49 | all_references.append(result["reference"]) 50 | if args.audio_numb and i == (args.audio_numb - 1): 51 | break 52 | 53 | # normalize predictions and references 54 | all_transcriptions = [normalizer(transcription) for transcription in all_transcriptions] 55 | all_references = [normalizer(reference) for reference in all_references] 56 | 57 | # compute the WER metric 58 | wer = 100 * wer_metric.compute( 59 | predictions=all_transcriptions, references=all_references 60 | ) 61 | print("WER: %.3f" % wer) 62 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/convert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 激活 faster-whisper 虚拟环境 4 | source your_anaconda_path 5 | conda activate your_faster_whisper_env_name 6 | 7 | # 
执行模型转换脚本 8 | ct2-transformers-converter \ 9 | --model your_hugging_face_model_path \ 10 | --output_dir your_output_model_path[the_folder_should_not_exist] \ 11 | --copy_files tokenizer.json preprocessor_config.json \ 12 | --quantization float16 13 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/evaluate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from faster_whisper import WhisperModel 5 | import json 6 | import datetime 7 | import argparse 8 | 9 | result_dict = {} 10 | 11 | def load_speech_data(path): 12 | """读取数据集""" 13 | with open(path, "r") as f: 14 | contents = f.readlines() 15 | return contents 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--hyp_path', type=str, help="解码结果存储路径") 19 | parser.add_argument('--lab_path', type=str, help="解码答案存储路径") 20 | parser.add_argument('--test_data', type=str, help="测试数据集") 21 | parser.add_argument('--model_path', type=str, help="模型路径") 22 | args = parser.parse_args() 23 | 24 | path = args.test_data 25 | model_path = args.model_path 26 | hyp_path = args.hyp_path 27 | lab_path = args.lab_path 28 | 29 | model = WhisperModel(model_path, device="cuda", compute_type="float16") 30 | speech_data = load_speech_data(path) 31 | 32 | total_duration = 0 33 | decode_duration = 0 34 | 35 | with open(hyp_path, "w") as f1, open(lab_path, "w") as f2: 36 | for line in speech_data: 37 | audio_path = json.loads(line)["audio"]["path"] 38 | label_text = json.loads(line)['sentence'] 39 | duration = json.loads(line)['duration'] 40 | 41 | # 此处作为 rtf 起始时间点 42 | starttime = datetime.datetime.now() 43 | 44 | # 这里可以添加各种解码超参数,详见 faster_whisper_hyper_parameters.md 45 | segments, info = model.transcribe(audio_path, beam_size=5) 46 | predict_text = "" 47 | for segment in segments: 48 | predict_text += segment.text 49 | 50 | # 此处作为 rtf 结束时间点 51 | endtime = datetime.datetime.now() 52 | 53 | total_duration += duration 54 | decode_duration += (endtime - starttime).total_seconds() 55 | 56 | # 更新字典中对应语种的计数,不简洁但直观 57 | language_result = info.language 58 | if language_result in result_dict: 59 | result_dict[language_result] += 1 60 | else: 61 | result_dict[language_result] = 1 62 | 63 | print("Detected language '%s' with probability %f" % (language_result, info.language_probability)) 64 | print(f"PATH: {audio_path}") 65 | print(f"LAB: {label_text}") 66 | print(f"HYP: {predict_text}") 67 | f1.write(audio_path + "\t" + predict_text + '\n') 68 | f2.write(audio_path + "\t" + label_text + '\n') 69 | 70 | # 打印总输出次数 71 | total_output = sum(result_dict.values()) 72 | print("Total output:", total_output) 73 | 74 | # 打印各个语言预测结果的比例,可以用来计算语种识别的准确率 75 | print("Language counts:") 76 | for lang, count in result_dict.items(): 77 | print(lang, ":", count, round((count/total_output*100), 2), "%") 78 | 79 | # 打印 rtf 相关信息 80 | print(f"TOTAL_DURATION: {round(total_duration, 0)}s") 81 | print(f"DECODING_TIME: {round(decode_duration, 0)}s") 82 | print(f"RTF: {round(decode_duration / total_duration, 4)}") -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 处理的数据集列表,代码会依次找每一项名字 .jsonl 文件 4 | # 如 datasets=("test01"),则会匹配到 test01.jsonl 文件 5 | datasets_path="your_datasets_path" 6 | datasets=( 7 | "test01" \ 8 | ) 9 | 10 | # 
使用的模型列表,便于测试 whisper 的微调结果 11 | # 你只需要给出 checkpoint 的 step 数就可以了,如 models=("100000") 12 | # 如果你只需要测试开源的模型,可以忽略这一变量 13 | model_path="your_model_path" 14 | models=( 15 | "100000" \ 16 | ) 17 | 18 | # 解码结果输出路径 19 | output_path="your_output_path" 20 | 21 | # 如果输出路径不存在,则创建文件夹 22 | if [ ! -d "$output_path" ]; then 23 | mkdir -p "$output_path" 24 | fi 25 | 26 | for dataset in "${datasets[@]}"; do 27 | for model in "${models[@]}"; do 28 | 29 | # 初始化相关变量 30 | hyp_path="${output_path}/${dataset}_fast_whisper_${model}_hyp.txt" 31 | lab_path="${output_path}/${dataset}_fast_whisper_${model}_lab.txt" 32 | norm_hyp_path="${output_path}/${dataset}_fast_whisper_${model}_hyp_norm.txt" 33 | norm_lab_path="${output_path}/${dataset}_fast_whisper_${model}_lab_norm.txt" 34 | test_data="${datasets_path}/${dataset}.jsonl" 35 | ckpt_path="${model_path}/checkpoint-${model}" 36 | log_path="${output_path}/${dataset}_fast_whisper_${model}.log" 37 | cer_path="${output_path}/${dataset}_fast_whisper_${model}_cer.txt" 38 | 39 | # 激活 faster-whisper 虚拟环境 40 | source your_anaconda_path 41 | conda activate your_faster_whisper_env_name 42 | 43 | # 解码 whisper 44 | python evaluate.py \ 45 | --hyp_path=$hyp_path \ 46 | --lab_path=$lab_path \ 47 | --test_data=$test_data \ 48 | --model_path=$ckpt_path \ 49 | > $log_path 2>&1 50 | 51 | # 激活 wenet 虚拟环境 52 | source your_anaconda_path 53 | conda activate your_wenet_env_name 54 | 55 | # hyp 和 lab 文件文本正则化 56 | cd wenet_utils 57 | 58 | python data_postprocessing.py \ 59 | --raw_file_path=$hyp_path \ 60 | --norm_file_path=$norm_hyp_path 61 | 62 | python data_postprocessing.py \ 63 | --raw_file_path=$lab_path \ 64 | --norm_file_path=$norm_lab_path 65 | 66 | # 计算 cer 67 | python compute-wer.py \ 68 | --char=1 \ 69 | --v=1 \ 70 | $norm_lab_path \ 71 | $norm_hyp_path \ 72 | > $cer_path 2>&1 73 | 74 | # 返回 faster-whisper 根目录,准备下一轮解码 75 | cd .. 76 | 77 | done 78 | done 79 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/faster_whisper/__init__.py: -------------------------------------------------------------------------------- 1 | from faster_whisper.audio import decode_audio 2 | from faster_whisper.transcribe import WhisperModel 3 | from faster_whisper.utils import available_models, download_model, format_timestamp 4 | from faster_whisper.version import __version__ 5 | 6 | __all__ = [ 7 | "available_models", 8 | "decode_audio", 9 | "WhisperModel", 10 | "download_model", 11 | "format_timestamp", 12 | "__version__", 13 | ] 14 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/faster_whisper/assets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cuichenrui2000/barry_speech_tools/4780bd2c0a578821a3bb782240b58e7c0ca44ac7/faster_whisper_using/faster-whisper/faster_whisper/assets/__init__.py -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/faster_whisper/assets/silero_vad.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cuichenrui2000/barry_speech_tools/4780bd2c0a578821a3bb782240b58e7c0ca44ac7/faster_whisper_using/faster-whisper/faster_whisper/assets/silero_vad.onnx -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/faster_whisper/audio.py: -------------------------------------------------------------------------------- 1 | """We use the 
PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV 2 | 3 | The advantage of PyAV is that it bundles the FFmpeg libraries so there is no additional 4 | system dependencies. FFmpeg does not need to be installed on the system. 5 | 6 | However, the API is quite low-level so we need to manipulate audio frames directly. 7 | """ 8 | 9 | import gc 10 | import io 11 | import itertools 12 | 13 | from typing import BinaryIO, Union 14 | 15 | import av 16 | import numpy as np 17 | 18 | 19 | def decode_audio( 20 | input_file: Union[str, BinaryIO], 21 | sampling_rate: int = 16000, 22 | split_stereo: bool = False, 23 | ): 24 | """Decodes the audio. 25 | 26 | Args: 27 | input_file: Path to the input file or a file-like object. 28 | sampling_rate: Resample the audio to this sample rate. 29 | split_stereo: Return separate left and right channels. 30 | 31 | Returns: 32 | A float32 Numpy array. 33 | 34 | If `split_stereo` is enabled, the function returns a 2-tuple with the 35 | separated left and right channels. 36 | """ 37 | resampler = av.audio.resampler.AudioResampler( 38 | format="s16", 39 | layout="mono" if not split_stereo else "stereo", 40 | rate=sampling_rate, 41 | ) 42 | 43 | raw_buffer = io.BytesIO() 44 | dtype = None 45 | 46 | with av.open(input_file, mode="r", metadata_errors="ignore") as container: 47 | frames = container.decode(audio=0) 48 | frames = _ignore_invalid_frames(frames) 49 | frames = _group_frames(frames, 500000) 50 | frames = _resample_frames(frames, resampler) 51 | 52 | for frame in frames: 53 | array = frame.to_ndarray() 54 | dtype = array.dtype 55 | raw_buffer.write(array) 56 | 57 | # It appears that some objects related to the resampler are not freed 58 | # unless the garbage collector is manually run. 59 | del resampler 60 | gc.collect() 61 | 62 | audio = np.frombuffer(raw_buffer.getbuffer(), dtype=dtype) 63 | 64 | # Convert s16 back to f32. 65 | audio = audio.astype(np.float32) / 32768.0 66 | 67 | if split_stereo: 68 | left_channel = audio[0::2] 69 | right_channel = audio[1::2] 70 | return left_channel, right_channel 71 | 72 | return audio 73 | 74 | 75 | def _ignore_invalid_frames(frames): 76 | iterator = iter(frames) 77 | 78 | while True: 79 | try: 80 | yield next(iterator) 81 | except StopIteration: 82 | break 83 | except av.error.InvalidDataError: 84 | continue 85 | 86 | 87 | def _group_frames(frames, num_samples=None): 88 | fifo = av.audio.fifo.AudioFifo() 89 | 90 | for frame in frames: 91 | frame.pts = None # Ignore timestamp check. 92 | fifo.write(frame) 93 | 94 | if num_samples is not None and fifo.samples >= num_samples: 95 | yield fifo.read() 96 | 97 | if fifo.samples > 0: 98 | yield fifo.read() 99 | 100 | 101 | def _resample_frames(frames, resampler): 102 | # Add None to flush the resampler. 103 | for frame in itertools.chain(frames, [None]): 104 | yield from resampler.resample(frame) 105 | 106 | 107 | def pad_or_trim(array, length: int, *, axis: int = -1): 108 | """ 109 | Pad or trim the audio array to N_SAMPLES, as expected by the encoder. 
110 | """ 111 | if array.shape[axis] > length: 112 | array = array.take(indices=range(length), axis=axis) 113 | 114 | if array.shape[axis] < length: 115 | pad_widths = [(0, 0)] * array.ndim 116 | pad_widths[axis] = (0, length - array.shape[axis]) 117 | array = np.pad(array, pad_widths) 118 | 119 | return array 120 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/faster_whisper/version.py: -------------------------------------------------------------------------------- 1 | """Version information.""" 2 | 3 | __version__ = "1.0.2" 4 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/requirements.conversion.txt: -------------------------------------------------------------------------------- 1 | transformers[torch]>=4.23 2 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/requirements.txt: -------------------------------------------------------------------------------- 1 | av>=11.0,<13 2 | ctranslate2>=4.0,<5 3 | huggingface_hub>=0.13 4 | tokenizers>=0.13,<1 5 | onnxruntime>=1.14,<2 6 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | ignore = 4 | E203, 5 | W503, 6 | 7 | [isort] 8 | profile=black 9 | lines_between_types=1 10 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | base_dir = os.path.dirname(os.path.abspath(__file__)) 6 | 7 | 8 | def get_long_description(): 9 | readme_path = os.path.join(base_dir, "README.md") 10 | with open(readme_path, encoding="utf-8") as readme_file: 11 | return readme_file.read() 12 | 13 | 14 | def get_project_version(): 15 | version_path = os.path.join(base_dir, "faster_whisper", "version.py") 16 | version = {} 17 | with open(version_path, encoding="utf-8") as fp: 18 | exec(fp.read(), version) 19 | return version["__version__"] 20 | 21 | 22 | def get_requirements(path): 23 | with open(path, encoding="utf-8") as requirements: 24 | return [requirement.strip() for requirement in requirements] 25 | 26 | 27 | install_requires = get_requirements(os.path.join(base_dir, "requirements.txt")) 28 | conversion_requires = get_requirements( 29 | os.path.join(base_dir, "requirements.conversion.txt") 30 | ) 31 | 32 | setup( 33 | name="faster-whisper", 34 | version=get_project_version(), 35 | license="MIT", 36 | description="Faster Whisper transcription with CTranslate2", 37 | long_description=get_long_description(), 38 | long_description_content_type="text/markdown", 39 | author="Guillaume Klein", 40 | url="https://github.com/SYSTRAN/faster-whisper", 41 | classifiers=[ 42 | "Development Status :: 4 - Beta", 43 | "Intended Audience :: Developers", 44 | "Intended Audience :: Science/Research", 45 | "License :: OSI Approved :: MIT License", 46 | "Programming Language :: Python :: 3", 47 | "Programming Language :: Python :: 3 :: Only", 48 | "Programming Language :: Python :: 3.8", 49 | "Programming Language :: Python :: 3.9", 50 | "Programming Language :: Python :: 3.10", 51 | "Programming Language :: Python :: 3.11", 52 | "Topic :: 
Scientific/Engineering :: Artificial Intelligence", 53 | ], 54 | keywords="openai whisper speech ctranslate2 inference quantization transformer", 55 | python_requires=">=3.8", 56 | install_requires=install_requires, 57 | extras_require={ 58 | "conversion": conversion_requires, 59 | "dev": [ 60 | "black==23.*", 61 | "flake8==6.*", 62 | "isort==5.*", 63 | "pytest==7.*", 64 | ], 65 | }, 66 | packages=find_packages(), 67 | include_package_data=True, 68 | ) 69 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def data_dir(): 8 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") 9 | 10 | 11 | @pytest.fixture 12 | def jfk_path(data_dir): 13 | return os.path.join(data_dir, "jfk.flac") 14 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/tests/data/jfk.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cuichenrui2000/barry_speech_tools/4780bd2c0a578821a3bb782240b58e7c0ca44ac7/faster_whisper_using/faster-whisper/tests/data/jfk.flac -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/tests/data/stereo_diarization.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cuichenrui2000/barry_speech_tools/4780bd2c0a578821a3bb782240b58e7c0ca44ac7/faster_whisper_using/faster-whisper/tests/data/stereo_diarization.wav -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/tests/test_transcribe.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from faster_whisper import WhisperModel, decode_audio 4 | 5 | 6 | def test_supported_languages(): 7 | model = WhisperModel("tiny.en") 8 | assert model.supported_languages == ["en"] 9 | 10 | 11 | def test_transcribe(jfk_path): 12 | model = WhisperModel("tiny") 13 | segments, info = model.transcribe(jfk_path, word_timestamps=True) 14 | assert info.all_language_probs is not None 15 | 16 | assert info.language == "en" 17 | assert info.language_probability > 0.9 18 | assert info.duration == 11 19 | 20 | # Get top language info from all results, which should match the 21 | # already existing metadata 22 | top_lang, top_lang_score = info.all_language_probs[0] 23 | assert info.language == top_lang 24 | assert abs(info.language_probability - top_lang_score) < 1e-16 25 | 26 | segments = list(segments) 27 | 28 | assert len(segments) == 1 29 | 30 | segment = segments[0] 31 | 32 | assert segment.text == ( 33 | " And so my fellow Americans ask not what your country can do for you, " 34 | "ask what you can do for your country." 
35 | ) 36 | 37 | assert segment.text == "".join(word.word for word in segment.words) 38 | assert segment.start == segment.words[0].start 39 | assert segment.end == segment.words[-1].end 40 | 41 | 42 | def test_prefix_with_timestamps(jfk_path): 43 | model = WhisperModel("tiny") 44 | segments, _ = model.transcribe(jfk_path, prefix="And so my fellow Americans") 45 | segments = list(segments) 46 | 47 | assert len(segments) == 1 48 | 49 | segment = segments[0] 50 | 51 | assert segment.text == ( 52 | " And so my fellow Americans ask not what your country can do for you, " 53 | "ask what you can do for your country." 54 | ) 55 | 56 | assert segment.start == 0 57 | assert 10 < segment.end < 11 58 | 59 | 60 | def test_vad(jfk_path): 61 | model = WhisperModel("tiny") 62 | segments, info = model.transcribe( 63 | jfk_path, 64 | vad_filter=True, 65 | vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200), 66 | ) 67 | segments = list(segments) 68 | 69 | assert len(segments) == 1 70 | segment = segments[0] 71 | 72 | assert segment.text == ( 73 | " And so my fellow Americans ask not what your country can do for you, " 74 | "ask what you can do for your country." 75 | ) 76 | 77 | assert 0 < segment.start < 1 78 | assert 10 < segment.end < 11 79 | 80 | assert info.vad_options.min_silence_duration_ms == 500 81 | assert info.vad_options.speech_pad_ms == 200 82 | 83 | 84 | def test_stereo_diarization(data_dir): 85 | model = WhisperModel("tiny") 86 | 87 | audio_path = os.path.join(data_dir, "stereo_diarization.wav") 88 | left, right = decode_audio(audio_path, split_stereo=True) 89 | 90 | segments, _ = model.transcribe(left) 91 | transcription = "".join(segment.text for segment in segments).strip() 92 | assert transcription == ( 93 | "He began a confused complaint against the wizard, " 94 | "who had vanished behind the curtain on the left." 95 | ) 96 | 97 | segments, _ = model.transcribe(right) 98 | transcription = "".join(segment.text for segment in segments).strip() 99 | assert transcription == "The horizon seems extremely distant." 
100 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from faster_whisper import available_models, download_model 4 | 5 | 6 | def test_available_models(): 7 | models = available_models() 8 | assert isinstance(models, list) 9 | assert "tiny" in models 10 | 11 | 12 | def test_download_model(tmpdir): 13 | output_dir = str(tmpdir.join("model")) 14 | 15 | model_dir = download_model("tiny", output_dir=output_dir) 16 | 17 | assert model_dir == output_dir 18 | assert os.path.isdir(model_dir) 19 | assert not os.path.islink(model_dir) 20 | 21 | for filename in os.listdir(model_dir): 22 | path = os.path.join(model_dir, filename) 23 | assert not os.path.islink(path) 24 | 25 | 26 | def test_download_model_in_cache(tmpdir): 27 | cache_dir = str(tmpdir.join("model")) 28 | download_model("tiny", cache_dir=cache_dir) 29 | assert os.path.isdir(cache_dir) 30 | -------------------------------------------------------------------------------- /faster_whisper_using/faster-whisper/wenet_utils/data_postprocessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | from tn.chinese.normalizer import Normalizer 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--raw_file_path', type=str, help="输入文件路径") 9 | parser.add_argument('--norm_file_path', type=str, help="输出文件路径") 10 | args = parser.parse_args() 11 | 12 | raw_file_path = args.raw_file_path 13 | norm_file_path = args.norm_file_path 14 | 15 | normalizer = Normalizer() 16 | 17 | with open(raw_file_path, "r") as raw_file, open(norm_file_path, "w") as norm_file: 18 | for line in raw_file: 19 | parts = line.strip().split("\t") 20 | 21 | # 若该文件存在识别结果,则对识别结果进行文本正则化 22 | if len(parts) >= 2: 23 | norm_text = normalizer.normalize(parts[1]) 24 | norm_file.write(f"{parts[0]}\t{norm_text}\n") 25 | 26 | # 若该文件不存在识别结果,则识别结果仍然为空 27 | else: 28 | norm_file.write(f"{parts[0]}\t\n") 29 | -------------------------------------------------------------------------------- /faster_whisper_using/vscode/launch.json: -------------------------------------------------------------------------------- 1 | // 在低版本 python 中:"type": "python", 2 | // evaluate.py 是我自己写的推理代码,已附在 Github 仓库中,请手动复制到 faster-whisper 项目内 3 | 4 | { 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Faster-Whisper Evaluation", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "program": "your_faster_whisper_path/evaluate.py", 12 | "args": [ 13 | "--hyp_path=your_hyp_path", 14 | "--lab_path=your_lab_path", 15 | "--test_data=your_test_data_path", 16 | "--model_path=your_faster_whisper_model_path", 17 | ], 18 | "cwd": "your_faster_whisper_path", 19 | "python": "your_python_path", 20 | "console": "integratedTerminal", 21 | "justMyCode": false, 22 | } 23 | ] 24 | } -------------------------------------------------------------------------------- /paraformer_using/01_map/map_cer.txt: -------------------------------------------------------------------------------- 1 | 2 | utt: /nfs/project/225/test_set/16k_mona_poi_2000/wav/008534_00093.wav 3 | WER: 7.69 % N=13 C=12 S=1 D=0 I=0 4 | lab: 赶 紧 给 我 导 到 出 口 成 衣 地 址 去 5 | rec: 赶 紧 给 我 导 到 出 口 成 一 地 址 去 6 | 7 | 8 | utt: /nfs/project/225/test_set/16k_mona_poi_2000/wav/006843_00101.wav 9 | WER: 0.00 % N=12 C=12 S=0 D=0 I=0 10 | lab: 请 导 航 到 中 国 农 业 银 
行 大 厦 11 | rec: 请 导 航 到 中 国 农 业 银 行 大 厦 12 | 13 | 14 | utt: /nfs/project/225/test_set/16k_mona_poi_2000/wav/004159_00158.wav 15 | WER: 0.00 % N=11 C=11 S=0 D=0 I=0 16 | lab: 地 图 旋 转 模 式 为 图 随 车 转 17 | rec: 地 图 旋 转 模 式 为 图 随 车 转 18 | -------------------------------------------------------------------------------- /paraformer_using/01_map/map_hyp.txt: -------------------------------------------------------------------------------- 1 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/008534_00093.wav 赶 紧 给 我 导 到 出 口 成 一 地 址 去 2 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/006843_00101.wav 请 导 航 到 中 国 农 业 银 行 大 厦 3 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/004159_00158.wav 地 图 旋 转 模 式 为 图 随 车 转 4 | -------------------------------------------------------------------------------- /paraformer_using/01_map/map_hyp_norm.txt: -------------------------------------------------------------------------------- 1 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/008534_00093.wav 赶 紧 给 我 导 到 出 口 成 一 地 址 去 2 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/006843_00101.wav 请 导 航 到 中 国 农 业 银 行 大 厦 3 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/004159_00158.wav 地 图 旋 转 模 式 为 图 随 车 转 4 | -------------------------------------------------------------------------------- /paraformer_using/01_map/map_lab.txt: -------------------------------------------------------------------------------- 1 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/008534_00093.wav 赶紧给我导到出口成衣地址去 2 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/006843_00101.wav 请导航到中国农业银行大厦 3 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/004159_00158.wav 地图旋转模式为图随车转 4 | -------------------------------------------------------------------------------- /paraformer_using/01_map/map_lab_norm.txt: -------------------------------------------------------------------------------- 1 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/008534_00093.wav 赶紧给我导到出口成衣地址去 2 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/006843_00101.wav 请导航到中国农业银行大厦 3 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/004159_00158.wav 地图旋转模式为图随车转 4 | -------------------------------------------------------------------------------- /paraformer_using/01_map/para_output/1best_recog/rtf: -------------------------------------------------------------------------------- 1 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/008534_00093.wav decoding, feature length: 59, forward_time: 11.7701, rtf: 3.3249 2 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/006843_00101.wav decoding, feature length: 81, forward_time: 0.0740, rtf: 0.0152 3 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/004159_00158.wav decoding, feature length: 71, forward_time: 0.0695, rtf: 0.0163 4 | 5 | rtf_avf decoding, feature length total: 479889.0, forward_time total: 806.2844, rtf avg: 0.0280 6 | -------------------------------------------------------------------------------- /paraformer_using/01_map/para_output/1best_recog/score: -------------------------------------------------------------------------------- 1 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/008534_00093.wav tensor(-2.3537, device='cuda:0') 2 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/006843_00101.wav tensor(-1.0936, device='cuda:0') 3 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/004159_00158.wav tensor(-1.3214, device='cuda:0') 4 | 5 | -------------------------------------------------------------------------------- /paraformer_using/01_map/para_output/1best_recog/text: 
-------------------------------------------------------------------------------- 1 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/008534_00093.wav 赶 紧 给 我 导 到 出 口 成 一 地 址 去 2 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/006843_00101.wav 请 导 航 到 中 国 农 业 银 行 大 厦 3 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/004159_00158.wav 地 图 旋 转 模 式 为 图 随 车 转 4 | 5 | -------------------------------------------------------------------------------- /paraformer_using/01_map/para_output/1best_recog/token: -------------------------------------------------------------------------------- 1 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/008534_00093.wav 赶 紧 给 我 导 到 出 口 成 一 地 址 去 2 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/006843_00101.wav 请 导 航 到 中 国 农 业 银 行 大 厦 3 | /nfs/project/225/test_set/16k_mona_poi_2000/wav/004159_00158.wav 地 图 旋 转 模 式 为 图 随 车 转 4 | 5 | -------------------------------------------------------------------------------- /paraformer_using/change_result.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | # 从命令行参数获取文件名 4 | input_file_name = sys.argv[1] 5 | output_file_name = sys.argv[2] 6 | 7 | # 打开输入文件并处理数据 8 | with open(input_file_name, 'r', encoding='utf-8') as input_file: 9 | lines = input_file.readlines() 10 | 11 | # 处理每一行数据 12 | modified_lines = [] 13 | for line in lines: 14 | # 用制表符替换第一个空格 15 | modified_line = line.replace(' ', '\t', 1) 16 | modified_lines.append(modified_line) 17 | 18 | # 将修改后的内容写入新文件 19 | with open(output_file_name, 'w', encoding='utf-8') as output_file: 20 | output_file.writelines(modified_lines) 21 | -------------------------------------------------------------------------------- /paraformer_using/json2paraformer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | # 从命令行参数获取文件名 5 | test_data_raw_file = sys.argv[1] 6 | test_data_file = sys.argv[2] 7 | lab_path_file = sys.argv[3] 8 | 9 | # 打开文件并处理数据 10 | with open(test_data_raw_file, 'r', encoding='utf-8') as raw_file, \ 11 | open(test_data_file, 'w', encoding='utf-8') as data_file, \ 12 | open(lab_path_file, 'w', encoding='utf-8') as lab_file: 13 | 14 | for line in raw_file: 15 | # 解析 JSON 行 16 | data = json.loads(line) 17 | 18 | # 提取路径和句子 19 | audio_path = data['audio']['path'] 20 | sentence = data['sentence'] 21 | 22 | # 写入 test_data 文件 23 | data_file.write(f"{audio_path}\t{audio_path}\n") 24 | 25 | # 写入 lab_path 文件 26 | lab_file.write(f"{audio_path}\t{sentence}\n") 27 | -------------------------------------------------------------------------------- /paraformer_using/paraformer_decode_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 2024_02_24 3 | # paraformer批量解码脚本 4 | 5 | # 挂载相关数据 6 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speech-datasets /ofs/speech-datasets dc680445ad5745c4871a9aeeebd988a4 nmgpu 7 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh corebackup /ofs/corebackup 05b7b93388ef48cf932b72f4017c6e31 nmgpu 8 | sudo bash /mnt/common/jianshu/liquidio/release/current/script/liquid_mount_s3.sh k80-dataset AKDD00000000000SGIPX2FHPLPMALX ASDDCqkYLLUApBQrKInMsKjUECKbIZulHzdLTtlQ / /nfs/s3_k80_dataset 9 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speechssd /ofs/speechssd b46e06b5108e4fdd911a610d0faa5380 hbbpussd 10 | 11 | # 处理的数据集列表 12 | datasets=( 13 | "01_in_car/01_map/map" \ 14 | "01_in_car/02_music/music" \ 15 | 
"01_in_car/03_car_control/car_control" \ 16 | "01_in_car/04_dynamic/dynamic" \ 17 | "01_in_car/05_static/static" 18 | ) 19 | # datasets=( 20 | # "01_in_car/01_map/map" \ 21 | # "01_in_car/02_music/music" \ 22 | # "01_in_car/03_car_control/car_control" \ 23 | # "01_in_car/04_dynamic/dynamic" \ 24 | # "01_in_car/05_static/static" \ 25 | # "02_general/general" \ 26 | # "03_mix_chinese_and_english/mix_chinese_and_english" \ 27 | # "04_pure_english/pure_english" \ 28 | # "06_dialect/01_cantonese/cantonese_moved_test20000" \ 29 | # "06_dialect/02_shanghai/shanghai_moved_test20000" \ 30 | # "06_dialect/03_sichuan/sichuan_test20000" \ 31 | # "07_customer_service/customer_service" \ 32 | # "08_search/search" 33 | # ) 34 | 35 | for dataset in "${datasets[@]}"; do 36 | 37 | # 初始化相关变量 38 | hyp_path="/nfs/volume-225-14/cuichenrui/paraformer/experimen_decode/${dataset}_hyp.txt" 39 | lab_path="/nfs/volume-225-14/cuichenrui/paraformer/experimen_decode/${dataset}_lab.txt" 40 | norm_hyp_path="/nfs/volume-225-14/cuichenrui/paraformer/experimen_decode/${dataset}_hyp_norm.txt" 41 | norm_lab_path="/nfs/volume-225-14/cuichenrui/paraformer/experimen_decode/${dataset}_lab_norm.txt" 42 | test_data_raw="/nfs/volume-225-14/cuichenrui/dataset/${dataset}.jsonl" 43 | test_data="/nfs/volume-225-14/cuichenrui/dataset/${dataset}.scp" 44 | output_path=$(echo "${hyp_path}}" | sed 's/^\(.*\)\/.*$/\1/') 45 | log_path="/nfs/volume-225-14/cuichenrui/paraformer/experimen_decode/${dataset}.log" 46 | cer_path="/nfs/volume-225-14/cuichenrui/paraformer/experimen_decode/${dataset}_cer.txt" 47 | 48 | # 激活paraformer解码环境 49 | # >>> conda initialize >>> 50 | # !! Contents within this block are managed by 'conda init' !! 51 | __conda_setup="$('/nfs/volume-225-14/yanyuchen_i/tools/miniconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" 52 | if [ $? -eq 0 ]; then 53 | eval "$__conda_setup" 54 | else 55 | if [ -f "/nfs/volume-225-14/yanyuchen_i/tools/miniconda3/etc/profile.d/conda.sh" ]; then 56 | . 
"/nfs/volume-225-14/yanyuchen_i/tools/miniconda3/etc/profile.d/conda.sh" 57 | else 58 | export PATH="/nfs/volume-225-14/yanyuchen_i/tools/miniconda3/bin:$PATH" 59 | fi 60 | fi 61 | unset __conda_setup 62 | # <<< conda initialize <<< 63 | conda activate /nfs/volume-225-14/yanyuchen_i/envs/modelscope 64 | 65 | # 生成scp文件和lab文件 66 | python /nfs/volume-225-14/cuichenrui/dataset/json2paraformer.py \ 67 | ${test_data_raw} \ 68 | ${test_data} \ 69 | ${lab_path} 70 | 71 | # 解码paraformer 72 | python /nfs/volume-225-35/huangpeiyao/paraformer/FunASR-main/hpy/para_scp.py \ 73 | ${test_data} \ 74 | ${output_path} 75 | 76 | # 将paraformer结果后处理生成hyp文件 77 | output_text_file=${output_path}/para_output/1best_recog/text 78 | python /nfs/volume-225-14/cuichenrui/paraformer/change_result.py \ 79 | ${output_text_file} \ 80 | ${hyp_path} 81 | 82 | # 激活代码环境 83 | source /nfs/volume-225-14/laizhihao_i/env/anaconda3/bin/activate 84 | conda activate whisper 85 | 86 | # hpy和lab文件文本归一化 87 | cd /nfs/volume-225-14/cuichenrui/whisper/experiment_decode/tools 88 | 89 | python data_postprocessing.py \ 90 | --raw_file_path=$hyp_path \ 91 | --norm_file_path=$norm_hyp_path 92 | 93 | python data_postprocessing.py \ 94 | --raw_file_path=$lab_path \ 95 | --norm_file_path=$norm_lab_path 96 | 97 | # 计算cer 98 | cd /nfs/volume-225-14/laizhihao_i/Wenet/wenet 99 | python tools/compute-wer.py \ 100 | --char=1 \ 101 | --v=1 \ 102 | $norm_lab_path \ 103 | $norm_hyp_path \ 104 | > $cer_path 2>&1 105 | 106 | done -------------------------------------------------------------------------------- /qwen_using/Qwen-Audio/README.md: -------------------------------------------------------------------------------- 1 | git clone https://github.com/QwenLM/Qwen-Audio.git -------------------------------------------------------------------------------- /qwen_using/README.md: -------------------------------------------------------------------------------- 1 | ## Qwen-Audio 语音框架的使用 2 | 3 | * [Qwen-Audio 论文](https://arxiv.org/abs/2311.07919) 4 | 5 | * [Github 开源地址](https://github.com/QwenLM/Qwen-Audio) 6 | 7 | * [Hugging face 模型地址](https://huggingface.co/Qwen/Qwen-Audio) 8 | 9 | Qwen-Audio 是阿里开源的基于 LLM 的多任务语音系统,其使用 whisper encoder + LLM 结构,在多种语音任务上达到了非常好的性能。 10 | 11 | 就我个人而言,其模型设计思路和开源代码逻辑都是十分值得学习的,在本仓库提供了以下两个样例: 12 | 13 | * Qwen-Audio 语音识别 14 | 15 | * Qwen-Audio 哭声检测 16 | 17 | ### Qwen-Audio 语音识别 18 | 19 | Qwen-Audio 最为知名的就是其语音识别的能力,我们只需先替换 ```decode_mutimachine/evaluate_asr.py``` 文件,再执行 ```decode_mutimachine/qwen_audio_evaluate_cry.sh``` 脚本即可,数据 jsonl 格式如下: 20 | 21 | ``` 22 | # 数据列表采用 jsonl 格式,每行是一个 dict 元素 23 | 24 | {"audio": /your_audio_file/1.wav, "text": audio_transcribe_text} 25 | {"audio": /your_audio_file/2.wav, "text": audio_transcribe_text} 26 | {"audio": /your_audio_file/3.wav, "text": audio_transcribe_text} 27 | 28 | ... ... ... ... 
29 | 30 | {"audio": "/your_audio_file/10000.wav", "text": "audio_transcribe_text"} 31 | ``` 32 | 33 | 在解码过程中,速度奇慢无比,使用 8*A100 时感觉 rtf 也在 0.6 左右,建议仅拉取下来学习,不建议真的用于批量解码。对于 Qwen-Audio 的 ASR 结果,感觉效果确实不错,但是存在一些“同义不同表示”现象,如“高兴”识别成“开心”(例子比较极端,能理解意思就好)。可以看出模型是完全理解语义的,并进行了自己的重新表述,我猜测原因来自于较小的 audio encoder 和巨大的 LLM decoder 之间的大小不匹配。 34 | 35 | ### Qwen-Audio 哭声检测 36 | 37 | 哭声检测可以借助 Qwen-Audio 的音频问答(AQA)能力实现,我们只需先用本仓库的 ```decode_mutimachine/evaluate_aqa.py``` 替换官方仓库 ```eval_audio``` 目录下的同名文件,再执行 ```decode_mutimachine/qwen_audio_evaluate_cry.sh``` 脚本即可,数据 jsonl 格式如下: 38 | 39 | ``` 40 | # 数据列表采用 jsonl 格式,每行是一个 dict 元素 41 | # question: 针对音频的问题 42 | # audio: 音频路径 43 | # gt: 这里存放和标签相关的信息,我们用不到,可以放任何你想要的内容 44 | # source: 该字段的具体用途不明确,我在这里又放了一遍针对音频的问题,方便打印结果 45 | 46 | {"question": "your_question", "audio": "/your_audio_file/1.wav", "gt": "you_can_put_some_other_things_here", "source": "your_question"} 47 | {"question": "your_question", "audio": "/your_audio_file/2.wav", "gt": "you_can_put_some_other_things_here", "source": "your_question"} 48 | {"question": "your_question", "audio": "/your_audio_file/3.wav", "gt": "you_can_put_some_other_things_here", "source": "your_question"} 49 | 50 | ... ... ... ... 51 | 52 | {"question": "your_question", "audio": "/your_audio_file/10000.wav", "gt": "you_can_put_some_other_things_here", "source": "your_question"} 53 | ``` 54 | 55 | 这个音频任务本来是音频问答,在官方提供的测试集上性能很好。但是哭声检测这类任务模型在训练时基本没见过,所以经常会输出很多无关的话语(问它 yes or no 也不会只回答这两个词),应该是 unseen task 的原因,感觉进行少量的微调就能解决这个任务,因为就输出结果来看,Qwen-Audio 是知道哭声是什么样的。后续的微调我没有执行,感兴趣的同学可以尝试一下。 56 | 57 | 别的语音任务我还没进行尝试,感觉 Qwen-Audio 除了速度慢,别的没什么缺点,框架清晰、文档全面、性能也很不错,推荐大家学习一下。 -------------------------------------------------------------------------------- /qwen_using/decode_mutimachine/evaluate_qwen_cer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 处理的数据集列表 4 | # datasets=( 5 | # "dynamic" \ 6 | # "mix_chinese_and_english" 7 | # ) 8 | datasets=( 9 | "general" 10 | ) 11 | 12 | # 激活代码环境 13 | source /nfs/volume-225-14/laizhihao_i/env/anaconda3/bin/activate 14 | conda activate whisper 15 | 16 | for dataset in "${datasets[@]}"; do 17 | 18 | hyp_path="/nfs/volume-225-14/cuichenrui/qwen/experimen_decode/${dataset}_hyp.jsonl" 19 | lab_path="/nfs/volume-225-14/cuichenrui/qwen/experimen_decode/${dataset}_lab.jsonl" 20 | norm_hyp_path="/nfs/volume-225-14/cuichenrui/qwen/experimen_decode/${dataset}_hyp_norm.txt" 21 | norm_lab_path="/nfs/volume-225-14/cuichenrui/qwen/experimen_decode/${dataset}_lab_norm.txt" 22 | cer_path="/nfs/volume-225-14/cuichenrui/qwen/experimen_decode/${dataset}_cer.txt" 23 | 24 | # hyp和lab文件文本归一化 25 | cd /nfs/volume-225-14/cuichenrui/qwen/tools 26 | 27 | python data_postprocessing.py \ 28 | --raw_file_path=$hyp_path \ 29 | --norm_file_path=$norm_hyp_path 30 | 31 | python data_postprocessing.py \ 32 | --raw_file_path=$lab_path \ 33 | --norm_file_path=$norm_lab_path 34 | 35 | # 计算cer 36 | cd /nfs/volume-225-14/laizhihao_i/Wenet/wenet 37 | python tools/compute-wer.py \ 38 | --char=1 \ 39 | --v=1 \ 40 | $norm_lab_path \ 41 | $norm_hyp_path \ 42 | > $cer_path 2>&1 43 | 44 | done 45 | -------------------------------------------------------------------------------- /qwen_using/decode_mutimachine/qwen_audio_evaluate_asr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 处理的数据集列表,代码会依次找每一项名字 .jsonl 文件 4 | # 如 datasets=("test01"),则会匹配到 test01.jsonl 文件 5 | datasets_path="your_datasets_path" 6 | datasets=( 7 | "test01" \ 8 | ) 9 | 10 | # 激活 qwen_audio 虚拟环境 11 | source your_anaconda_path 12 | conda activate 
your_qwen_audio_env_name 13 | 14 | cd /your_qwen_audio_path/eval_audio 15 | 16 | for dataset in "${datasets[@]}"; do 17 | 18 | # 初始化相关变量 19 | hyp_path="${output_path}/${dataset}_qwen_audio_${model}_hyp.txt" 20 | lab_path="${output_path}/${dataset}_qwen_audio_${model}_lab.txt" 21 | model_path="your_qwen_audio_model_path" 22 | log_path="${output_path}/${dataset}_qwen_audio_${model}.log" 23 | 24 | # 替换本仓库的 evaluate_asr.py 至 /your_qwen_audio_path/eval_audio 下的 evaluate_asr.py 25 | python3 -m torch.distributed.launch \ 26 | --use-env \ 27 | --nproc_per_node ${NPROC_PER_NODE:-8} \ 28 | --nnodes 1 evaluate_asr.py \ 29 | --checkpoint ${model_path} \ 30 | --dataset ${dataset} \ 31 | --batch-size 4 \ 32 | --num-workers 8 \ 33 | --hyp_path ${hyp_path} \ 34 | --lab_path ${lab_path} \ 35 | > ${log_path} 2>&1 36 | 37 | done 38 | -------------------------------------------------------------------------------- /qwen_using/decode_mutimachine/qwen_audio_evaluate_cry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 处理的数据集列表,代码会依次找每一项名字 .jsonl 文件 4 | # 如 datasets=("test01"),则会匹配到 test01.jsonl 文件 5 | datasets_path="your_datasets_path" 6 | datasets=( 7 | "test01" \ 8 | ) 9 | 10 | # 激活 qwen_audio 虚拟环境 11 | source your_anaconda_path 12 | conda activate your_qwen_audio_env_name 13 | 14 | cd /your_qwen_audio_path/eval_audio 15 | 16 | for dataset in "${datasets[@]}"; do 17 | 18 | # 初始化相关变量 19 | model_path="your_qwen_audio_model_path" 20 | log_path="${output_path}/${dataset}_qwen_audio_${model}.log" 21 | 22 | # 替换本仓库的 evaluate_aqa.py 至 /your_qwen_audio_path/eval_audio 下的 evaluate_aqa.py 23 | python -m torch.distributed.launch \ 24 | --use-env \ 25 | --nproc_per_node ${NPROC_PER_NODE:-8} \ 26 | --nnodes 1 evaluate_aqa.py \ 27 | --checkpoint ${model_path} \ 28 | --dataset ${dataset} \ 29 | --batch-size 1 \ 30 | --num-workers 2 \ 31 | > ${log_path} 2>&1 32 | 33 | done 34 | -------------------------------------------------------------------------------- /qwen_using/decode_mutimachine/tools/data_postprocessing.py: -------------------------------------------------------------------------------- 1 | # 2024_02_04 2 | # 数据后处理脚本,负责将文本规范化 3 | 4 | import argparse 5 | from tn.chinese.normalizer import Normalizer 6 | import json 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--raw_file_path', type=str, help="需要进行后处理的文件路径") 10 | parser.add_argument('--norm_file_path', type=str, help="处理后的文件路径") 11 | args = parser.parse_args() 12 | 13 | raw_file_path = args.raw_file_path 14 | norm_file_path = args.norm_file_path 15 | 16 | normalizer = Normalizer() 17 | 18 | with open(raw_file_path, "r") as raw_file, open(norm_file_path, "w") as norm_file: 19 | for line in raw_file: 20 | data = json.loads(line) # 将 JSON 字符串解析为 Python 字典 21 | audio_path = data['audio'] # 提取音频路径 22 | text = data['text'] # 提取文本内容 23 | 24 | norm_text = normalizer.normalize(text) 25 | norm_file.write(f"{audio_path}\t{norm_text}\n") 26 | -------------------------------------------------------------------------------- /tensorrt_llm_using/examples/whisper/README.md: -------------------------------------------------------------------------------- 1 | # Whisper 2 | 3 | This document shows how to build and run a [whisper model](https://github.com/openai/whisper/tree/main) in TensorRT-LLM on a single GPU. 
4 | 5 | - [Whisper](#whisper) 6 | - [Overview](#overview) 7 | - [Support Matrix](#support-matrix) 8 | - [Usage](#usage) 9 | - [Build TensorRT engine(s)](#build-tensorrt-engines) 10 | - [Run](#run) 11 | - [Distil-Whisper](#distil-whisper) 12 | - [Acknowledgment](#acknowledgment) 13 | 14 | ## Overview 15 | 16 | The TensorRT-LLM Whisper example code is located in [`examples/whisper`](./). There are three main files in that folder: 17 | 18 | * [`build.py`](./build.py) to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the Whisper model. 19 | * [`run.py`](./run.py) to run the inference on a single wav file, or [a HuggingFace dataset](https://huggingface.co/datasets/librispeech_asr) [\(Librispeech test clean\)](https://www.openslr.org/12). 20 | * [`run_faster_whisper.py`](./run_faster_whisper.py) to do benchmark comparison with [Faster Whisper](https://github.com/SYSTRAN/faster-whisper/tree/master). 21 | 22 | ## Support Matrix 23 | * FP16 24 | * INT8 (Weight Only Quant) 25 | 26 | ## Usage 27 | 28 | The TensorRT-LLM Whisper example code locates at [examples/whisper](./). It takes whisper pytorch weights as input, and builds the corresponding TensorRT engines. 29 | 30 | ### Build TensorRT engine(s) 31 | 32 | Need to prepare the whisper checkpoint first by downloading models from [here](https://github.com/openai/whisper/blob/main/whisper/__init__.py#L22-L28). 33 | 34 | 35 | ```bash 36 | wget --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken 37 | wget --directory-prefix=assets assets/mel_filters.npz https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz 38 | wget --directory-prefix=assets https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav 39 | # take large-v3 model as an example 40 | wget --directory-prefix=assets https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt 41 | ``` 42 | 43 | TensorRT-LLM Whisper builds TensorRT engine(s) from the pytorch checkpoint. 44 | 45 | ```bash 46 | # install requirements first 47 | pip install -r requirements.txt 48 | 49 | # Build the large-v3 model using a single GPU with plugins. 50 | python3 build.py --output_dir whisper_large_v3 --use_gpt_attention_plugin --use_gemm_plugin --use_bert_attention_plugin --enable_context_fmha 51 | 52 | # Build the large-v3 model using a single GPU with plugins and int8 weight-only quantization. 
53 | python3 build.py --output_dir whisper_large_v3_weight_only --use_gpt_attention_plugin --use_gemm_plugin --use_bert_attention_plugin --enable_context_fmha --use_weight_only 54 | ``` 55 | 56 | ### Run 57 | 58 | ```bash 59 | # choose the engine you build [./whisper_large_v3, ./whisper_large_weight_only] 60 | output_dir=./whisper_large_v3 61 | # decode a single audio file 62 | # If the input file does not have a .wav extension, ffmpeg needs to be installed with the following command: 63 | # apt-get update && apt-get install -y ffmpeg 64 | python3 run.py --name single_wav_test --engine_dir $output_dir --input_file assets/1221-135766-0002.wav 65 | # decode a whole dataset 66 | python3 run.py --engine_dir $output_dir --dataset hf-internal-testing/librispeech_asr_dummy --enable_warmup --name librispeech_dummy_large_v3_plugin 67 | ``` 68 | ### Distil-Whisper 69 | TensorRT-LLM also supports using [distil-whisper's](https://github.com/huggingface/distil-whisper) different models by first converting their params and weights from huggingface's naming format to [openai whisper](https://github.com/openai/whisper) naming format. 70 | You can do so by running the script [distil_whisper/convert_from_distil_whisper.py](./convert_from_distil_whisper.py) as follows: 71 | 72 | ```bash 73 | # take distil-medium.en as an example 74 | # download the gpt2.tiktoken 75 | wget --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/gpt2.tiktoken 76 | 77 | # will download the model weights from huggingface and convert them to openai-whisper's pytorch format 78 | # model is saved to ./assets/ by default 79 | python3 distil_whisper/convert_from_distil_whisper.py --model_name distil-whisper/distil-medium.en --output_name distil-medium.en 80 | 81 | # now we can build and run the model like before: 82 | output_dir=distil_whisper_medium_en 83 | python3 build.py --model_name distil-medium.en --output_dir $output_dir --use_gpt_attention_plugin --use_gemm_plugin --use_bert_attention_plugin --enable_context_fmha 84 | 85 | python3 run.py --engine_dir $output_dir --dataset hf-internal-testing/librispeech_asr_dummy --name librispeech_dummy_${output_dir} --tokenizer_name gpt2 86 | ``` 87 | 88 | ### Acknowledgment 89 | 90 | This implementation of TensorRT-LLM for Whisper has been adapted from the [NVIDIA TensorRT-LLM Hackathon 2023](https://github.com/NVIDIA/trt-samples-for-hackathon-cn/tree/master/Hackathon2023) submission of Jinheng Wang, which can be found in the repository [Eddie-Wang-Hackathon2023](https://github.com/Eddie-Wang1120/Eddie-Wang-Hackathon2023) on GitHub. We extend our gratitude to Jinheng for providing a foundation for the implementation. 
91 | -------------------------------------------------------------------------------- /tensorrt_llm_using/examples/whisper/build.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/nfs/volume-225-14/cuichenrui/anaconda3/envs/TensorRT-LLM/lib 2 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/nfs/volume-225-14/laizhihao_i/env/anaconda3/envs/blsp/lib/python3.8/site-packages/nvidia/cudnn/lib 3 | 4 | source /nfs/volume-225-14/cuichenrui/anaconda3/bin/activate 5 | conda activate TensorRT-LLM 6 | 7 | cd /nfs/volume-225-14/cuichenrui/TensorRT-LLM/examples/whisper 8 | python build.py \ 9 | --world_size 1 \ 10 | --model_dir /nfs/volume-225-14/cuichenrui/TensorRT-LLM/examples/whisper/TensorRT-medium \ 11 | --model_name medium \ 12 | --quantize_dir quantize/1-gpu \ 13 | --dtype float16 \ 14 | --log_level info \ 15 | --max_batch_size 8 \ 16 | --max_input_len 14 \ 17 | --max_output_len 100 \ 18 | --max_beam_width 4 \ 19 | --use_gpt_attention_plugin \ 20 | --use_bert_attention_plugin \ 21 | --use_gemm_plugin \ 22 | --output_dir whisper_outputs \ 23 | --enable_context_fmha -------------------------------------------------------------------------------- /tensorrt_llm_using/examples/whisper/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://pypi.nvidia.com 2 | tensorrt_llm==0.9.0.dev2024040900 3 | tiktoken 4 | datasets 5 | kaldialign 6 | openai-whisper 7 | librosa 8 | soundfile 9 | safetensors 10 | transformers 11 | janus 12 | -------------------------------------------------------------------------------- /tensorrt_llm_using/examples/whisper/run.log: -------------------------------------------------------------------------------- 1 | bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount_base.sh speechssd /ofs/speechssd b46e06b5108e4fdd911a610d0faa5380 hbbpussd 2 | current user: root 3 | use volume-name speechssd 4 | use mount-point /ofs/speechssd 5 | use region hbbpussd 6 | use token b46e06b5108e4fdd911a610d0faa5380 7 | use prefetch-enable true 8 | use subpath / 9 | use readOnly false 10 | use debug false 11 | /usr/bin/fusermount 12 | no fuse dev, create it 13 | mknod: /dev/fuse: File exists 14 | device allow path is /sys/fs/cgroup/devices//kubepods/burstable/pod5f683388-59ed-4d14-ac08-e1c56f12e62c/1417304c2cf68160ddd383520fc4b410e78af7ce263319833f7ecf38ccaf8a1d/user.slice/devices.allow 15 | Device: 600083h/6291587d Inode: 3222900756 Links: 2 16 | ====== MOUNT_POINT[/ofs/speechssd]: not exist ====== 17 | umount: /ofs/speechssd: not mounted. 18 | Device: 700016h/7340054d Inode: 1 Links: 3 19 | ====== MOUNT_POINT[/ofs/speechssd]: exist ====== 20 | ===== Volume[speechssd] mounted to Path[/ofs/speechssd] success ===== 21 | orangefs:speech-datasets on /ofs/speech-datasets type fuse.ofs (rw,relatime,user_id=0,group_id=0,default_permissions,allow_other) 22 | orangefs:speechssd on /ofs/speechssd type fuse.ofs (rw,relatime,user_id=0,group_id=0,default_permissions,allow_other) 23 | [TensorRT-LLM] TensorRT-LLM version: 0.9.0.dev2024040900 24 | prediction: I would never enter a restaurant again. I would carry bread and cheese in my pocket or eat chocolate out of automatic machines. 
25 | RTF: 0.2196 26 | total_duration: 7.750 seconds 27 | (0.00 hours) 28 | processing time: 1.702 seconds (0.00 hours) 29 | batch size: 4 30 | num_beams: 1 31 | 32 | 参考答案: 33 | I WOULD NEVER ENTER A RESTAURANT AGAIN I WOULD CARRY BREAD AND CHEESE IN MY POCKET OR EAT CHOCOLATE OUT OF AUTOMATIC MACHINES -------------------------------------------------------------------------------- /tensorrt_llm_using/examples/whisper/run.sh: -------------------------------------------------------------------------------- 1 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speechssd /ofs/speechssd b46e06b5108e4fdd911a610d0faa5380 hbbpussd 2 | 3 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/nfs/volume-225-14/cuichenrui/anaconda3/envs/TensorRT-LLM/lib 4 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/nfs/volume-225-14/laizhihao_i/env/anaconda3/envs/blsp/lib/python3.8/site-packages/nvidia/cudnn/lib 5 | 6 | source /nfs/volume-225-14/cuichenrui/anaconda3/bin/activate 7 | conda activate TensorRT-LLM 8 | 9 | cd /nfs/volume-225-14/cuichenrui/TensorRT-LLM/examples/whisper 10 | 11 | # choose the engine you build [./whisper_large_v3, ./whisper_large_weight_only] 12 | output_dir=./whisper_outputs 13 | # decode a single audio file 14 | # If the input file does not have a .wav extension, ffmpeg needs to be installed with the following command: 15 | # apt-get update && apt-get install -y ffmpeg 16 | python run.py \ 17 | --name single_wav_test \ 18 | --assets_dir /nfs/volume-225-14/cuichenrui/TensorRT-LLM/examples/whisper/TensorRT-medium \ 19 | --engine_dir $output_dir \ 20 | --input_file /ofs/speechssd/datasets/s3_common_dataset/GigaSpeech/audio/AUD0000001679_S0001425.wav 21 | # decode a whole dataset 22 | # python3 run.py --engine_dir $output_dir --dataset hf-internal-testing/librispeech_asr_dummy --enable_warmup --name librispeech_dummy_large_v3_plugin -------------------------------------------------------------------------------- /tensorrt_llm_using/examples/whisper/tmp/errs-single_wav_test.txt: -------------------------------------------------------------------------------- 1 | %WER = 2300.00 2 | Errors: 22 insertions, 0 deletions, 1 substitutions, over 1 reference words (0 correct) 3 | Search below for sections starting with PER-UTT DETAILS:, SUBSTITUTIONS:, DELETIONS:, INSERTIONS:, PER-WORD STATS: 4 | 5 | PER-UTT DETAILS: corr or (ref->hyp) 6 | 0: (->I would never enter a restaurant again. I would carry bread and cheese in my pocket or eat chocolate out of automatic machines.) 7 | 8 | SUBSTITUTIONS: count ref -> hyp 9 | 1 -> I 10 | 11 | DELETIONS: count ref 12 | 13 | INSERTIONS: count hyp 14 | 2 would 15 | 1 restaurant 16 | 1 pocket 17 | 1 out 18 | 1 or 19 | 1 of 20 | 1 never 21 | 1 my 22 | 1 machines. 23 | 1 in 24 | 1 enter 25 | 1 eat 26 | 1 chocolate 27 | 1 cheese 28 | 1 carry 29 | 1 bread 30 | 1 automatic 31 | 1 and 32 | 1 again. 33 | 1 a 34 | 1 I 35 | 36 | PER-WORD STATS: word corr tot_errs count_in_ref count_in_hyp 37 | would 0 2 0 2 38 | I 0 2 0 2 39 | restaurant 0 1 0 1 40 | pocket 0 1 0 1 41 | out 0 1 0 1 42 | or 0 1 0 1 43 | of 0 1 0 1 44 | never 0 1 0 1 45 | my 0 1 0 1 46 | machines. 0 1 0 1 47 | in 0 1 0 1 48 | enter 0 1 0 1 49 | eat 0 1 0 1 50 | chocolate 0 1 0 1 51 | cheese 0 1 0 1 52 | carry 0 1 0 1 53 | bread 0 1 0 1 54 | automatic 0 1 0 1 55 | and 0 1 0 1 56 | again. 
0 1 0 1 57 | a 0 1 0 1 58 | 0 1 1 0 59 | -------------------------------------------------------------------------------- /tensorrt_llm_using/examples/whisper/tmp/recogs-single_wav_test.txt: -------------------------------------------------------------------------------- 1 | 0: ref=[''] 2 | 0: hyp=['I', 'would', 'never', 'enter', 'a', 'restaurant', 'again.', 'I', 'would', 'carry', 'bread', 'and', 'cheese', 'in', 'my', 'pocket', 'or', 'eat', 'chocolate', 'out', 'of', 'automatic', 'machines.'] 3 | -------------------------------------------------------------------------------- /tensorrt_llm_using/examples/whisper/tmp/rtf-single_wav_test.txt: -------------------------------------------------------------------------------- 1 | RTF: 0.2196 2 | total_duration: 7.750 seconds 3 | (0.00 hours) 4 | processing time: 1.702 seconds (0.00 hours) 5 | batch size: 4 6 | num_beams: 1 7 | -------------------------------------------------------------------------------- /tensorrt_llm_using/examples/whisper/whisper_outputs/decoder_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "builder_config": { 3 | "apply_query_key_layer_scaling": false, 4 | "cross_attention": true, 5 | "has_position_embedding": true, 6 | "has_token_type_embedding": false, 7 | "hidden_act": "gelu", 8 | "hidden_size": 1024, 9 | "int8": false, 10 | "max_batch_size": 8, 11 | "max_beam_width": 4, 12 | "max_input_len": 14, 13 | "max_output_len": 100, 14 | "max_position_embeddings": 448, 15 | "name": "whisper_decoder", 16 | "num_heads": 16, 17 | "num_layers": 24, 18 | "precision": "float16", 19 | "strongly_typed": false, 20 | "tensor_parallel": 1, 21 | "use_refit": false, 22 | "vocab_size": 51865 23 | }, 24 | "plugin_config": { 25 | "attention_qk_half_accumulation": false, 26 | "bert_attention_plugin": null, 27 | "context_fmha": true, 28 | "context_fmha_fp32_acc": false, 29 | "enable_xqa": false, 30 | "gemm_plugin": "float16", 31 | "gpt_attention_plugin": "float16", 32 | "identity_plugin": null, 33 | "layernorm_quantization_plugin": null, 34 | "lookup_plugin": null, 35 | "lora_plugin": null, 36 | "mamba_conv1d_plugin": null, 37 | "moe_plugin": null, 38 | "multi_block_mode": false, 39 | "multiple_profiles": false, 40 | "nccl_plugin": null, 41 | "paged_kv_cache": false, 42 | "paged_state": false, 43 | "quantize_per_token_plugin": false, 44 | "quantize_tensor_plugin": false, 45 | "remove_input_padding": false, 46 | "rmsnorm_quantization_plugin": null, 47 | "smooth_quant_gemm_plugin": null, 48 | "streamingllm": false, 49 | "tokens_per_block": 128, 50 | "use_context_fmha_for_generation": false, 51 | "use_custom_all_reduce": false, 52 | "use_fp8_context_fmha": false, 53 | "use_paged_context_fmha": false, 54 | "weight_only_groupwise_quant_matmul_plugin": null, 55 | "weight_only_quant_matmul_plugin": null 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /tensorrt_llm_using/examples/whisper/whisper_outputs/encoder_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "builder_config": { 3 | "hidden_size": 1024, 4 | "int8": false, 5 | "max_batch_size": 8, 6 | "max_beam_width": 4, 7 | "n_mels": 80, 8 | "name": "whisper_encoder", 9 | "num_heads": 16, 10 | "num_languages": 99, 11 | "num_layers": 24, 12 | "precision": "float16", 13 | "strongly_typed": false, 14 | "tensor_parallel": 1, 15 | "use_refit": false 16 | }, 17 | "plugin_config": { 18 | "attention_qk_half_accumulation": false, 19 | 
"bert_attention_plugin": "float16", 20 | "context_fmha": true, 21 | "context_fmha_fp32_acc": false, 22 | "enable_xqa": false, 23 | "gemm_plugin": "float16", 24 | "gpt_attention_plugin": null, 25 | "identity_plugin": null, 26 | "layernorm_quantization_plugin": null, 27 | "lookup_plugin": null, 28 | "lora_plugin": null, 29 | "mamba_conv1d_plugin": null, 30 | "moe_plugin": null, 31 | "multi_block_mode": false, 32 | "multiple_profiles": false, 33 | "nccl_plugin": null, 34 | "paged_kv_cache": false, 35 | "paged_state": false, 36 | "quantize_per_token_plugin": false, 37 | "quantize_tensor_plugin": false, 38 | "remove_input_padding": false, 39 | "rmsnorm_quantization_plugin": null, 40 | "smooth_quant_gemm_plugin": null, 41 | "streamingllm": false, 42 | "tokens_per_block": 128, 43 | "use_context_fmha_for_generation": false, 44 | "use_custom_all_reduce": false, 45 | "use_fp8_context_fmha": false, 46 | "use_paged_context_fmha": false, 47 | "weight_only_groupwise_quant_matmul_plugin": null, 48 | "weight_only_quant_matmul_plugin": null 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /wenet_using/README.md: -------------------------------------------------------------------------------- 1 | ## wenet 语音框架的使用 2 | 3 | * [Github 开源地址](https://github.com/wenet-e2e/wenet.git) 4 | 5 | * [Github 技术文档](https://wenet-e2e.github.io/wenet) 6 | 7 | wenet 是出门问问语音团队联合西工大语音实验室开源的一款面向工业落地应用的语音识别工具包,该工具用一套简洁的方案提供了语音识别从训练到部署的一条龙服务,其主要特点如下: 8 | 9 | * 使用 conformer 网络结构和 CTC / attention loss 联合优化方法,具有业界一流的识别效果。 10 | 11 | * 提供云上和端上直接部署的方案,最小化模型训练和产品落地之间的工程工作。 12 | 13 | * 框架简洁,模型训练部分完全基于 pytorch 生态,不依赖于 kaldi 等安装复杂的工具。 14 | 15 | * 详细的注释和文档,十分适合用于学习端到端语音识别的基础知识和实现细节。 16 | 17 | 就我个人来讲,wenet 框架是我接触语音识别领域学习的第一个框架。wenet 框架非常的轻量,代码也十分通俗易懂,安装部署简介易用,网上的中文教程也非常多,非常适合作为语音识别的初学者学习了解。其独特的 attention rescoring 机制兼顾了流式的速度和模型的性能,很多实际场景都是基于 wenet 框架进行部署使用的。 18 | 19 | 作为第一次接触 wenet 框架的同学们,强烈建议跟随 ```wenet/examples/aishell/s0/run.sh``` 或者 ```wenet/examples/librispeech/s0/run.sh``` 走一遍整个语音识别的数据拉取,模型训练,模型解码,模型部署(可选)的流程,这会让你迅速了解语音识别这一领域。相关学习文档可参照本仓库的 ```barry_speech_tools/语音入门资料汇总.md``` 进行学习了解。 20 | 21 | 本人现在对其的使用场景主要为: 22 | 23 | * wenetspeech 数据集的拉取 24 | 25 | * 识别结果的 WER 和 CER 的检测 26 | 27 | 对于 wenetspeech 数据集,其是目前开源的最大的中文语音数据集,包含了 10000 小时左右的标签数据。但是其数据集比较复杂,还包括一些批量解压切分之类的操作,可以参照 ```wenet/examples/wenetspeech/s0/run.sh``` 进行拉取处理。 28 | 29 | 对于识别结果的 WER 和 CER 的检测,我们首先进行文本正则化,从而减少不同书写习惯带来的识别结果的不对齐,如 ```2.5平方电线``` 和 ```二点五平方电线``` ,具体实现细节可以参照下面的博客: 30 | 31 | * [WeNet 丨 WeTextProcessing](https://blog.csdn.net/weixin_48827824/article/details/127207360) 32 | 33 | 然后我们进行 WER 和 CER 的计算,这里复制了 ```wenet/tools/compute-wer.py``` 的计算文件,整体流程可以查看 ```wenet_using/norm_and_conpute_cer.sh``` 脚本,我们只需准备识别结果 ```hyp.txt``` 和标签文件 ```lab.txt``` 即可,脚本将依次生成正则化后的识别结果 ```norm_hyp.txt``` 和正则化后的标签文件 ```norm_lab.txt```,和最终的 CER 计算结果文件 ```cer.txt```。中文会按字分割,英文会按词分割,wenet 的 CER 计算输出的结果非常直观,因此一直作为我个人最常用的 CER 检测工具。程序需要的 ```hyp.txt``` 和 ```lab.txt``` 格式如下: 34 | 35 | ``` 36 | # 每行格式为:<唯一标识符> \t <文本> 37 | # 其中 <唯一标识符> 通常为音频路径,或音频路径的一部分 38 | # 文本间不用进行空格去除,脚本会自动执行该步骤,中英混也可以自动识别 39 | 40 | /your_audio_file/1.wav 这 是 第一条 音频 41 | /your_audio_file/2.wav This is 第二条 音频 42 | /your_audio_file/3.wav 这是 the third 音频 43 | 44 | ... ... ... ... 
45 | 46 | /your_audio_file/10000.wav 这是 the last audio 47 | ``` 48 | 49 | ```wenet_using/train_mutimachine``` 提供了 wenet 单机多卡和多机多卡的训练启动脚本和 debug torchrun 的 vscode 格式的 ```wenet_using/train_mutimachine/vscode/launch.json``` 文件,希望可以对初次接触 torchrun 的你有所帮助。 50 | 51 | 总的来说,wenet 框架简单轻量,上手难度不大。其中文社区发展迅速,是非常优秀的语音识别框架,希望可以给大家带来帮助! 52 | -------------------------------------------------------------------------------- /wenet_using/data_postprocessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | from tn.chinese.normalizer import Normalizer 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--raw_file_path', type=str, help="输入文件路径") 9 | parser.add_argument('--norm_file_path', type=str, help="输出文件路径") 10 | args = parser.parse_args() 11 | 12 | raw_file_path = args.raw_file_path 13 | norm_file_path = args.norm_file_path 14 | 15 | normalizer = Normalizer() 16 | 17 | with open(raw_file_path, "r") as raw_file, open(norm_file_path, "w") as norm_file: 18 | for line in raw_file: 19 | parts = line.strip().split("\t") 20 | 21 | # 若该文件存在识别结果,则对识别结果进行文本正则化 22 | if len(parts) >= 2: 23 | norm_text = normalizer.normalize(parts[1]) 24 | norm_file.write(f"{parts[0]}\t{norm_text}\n") 25 | 26 | # 若该文件不存在识别结果,则识别结果仍然为空 27 | else: 28 | norm_file.write(f"{parts[0]}\t\n") 29 | -------------------------------------------------------------------------------- /wenet_using/norm_and_conpute_cer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 使用本脚本需要先安装 wenet 框架和文本正则化的库 3 | 4 | # 激活 wenet 虚拟环境 5 | source your_anaconda_path 6 | conda activate your_wenet_env_name 7 | 8 | hyp_path="your_hyp_path" 9 | lab_path="your_lab_path" 10 | norm_hyp_path="your_norm_hyp_path_will_be_generated" 11 | norm_lab_path="your_norm_lab_path_will_be_generated" 12 | cer_path="your_cer_path_will_be_generated" 13 | 14 | # hyp 和 lab 文件文本正则化 15 | python data_postprocessing.py \ 16 | --raw_file_path=$hyp_path \ 17 | --norm_file_path=$norm_hyp_path 18 | 19 | python data_postprocessing.py \ 20 | --raw_file_path=$lab_path \ 21 | --norm_file_path=$norm_lab_path 22 | 23 | # 计算 cer 24 | python compute-wer.py \ 25 | --char=1 \ 26 | --v=1 \ 27 | $norm_lab_path \ 28 | $norm_hyp_path \ 29 | > $cer_path 2>&1 30 | -------------------------------------------------------------------------------- /wenet_using/train_mutimachine/multi_nodes.sh: -------------------------------------------------------------------------------- 1 | cd "your_wenet_path" 2 | sh multi_nodes_training.sh > "your_log_path" -------------------------------------------------------------------------------- /wenet_using/train_mutimachine/multi_nodes_training.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd "your_wenet_path" 4 | 5 | # 激活 wenet 虚拟环境 6 | source your_anaconda_path 7 | conda activate your_wenet_env_name 8 | 9 | nvidia-smi 10 | export NCCL_IB_DISABLE=1 11 | 12 | num_gpus=$RESOURCE_NUM_GPU 13 | master_addr=$DISTRIBUTED_MASTER_HOSTS 14 | master_port=$DISTRIBUTED_PYTORCH_PORT 15 | 16 | num_nodes=$DISTRIBUTED_NODE_COUNT 17 | node_rank=$DISTRIBUTED_NODE_RANK 18 | echo $num_gpus $master_addr $master_port $num_nodes $node_rank 19 | job_id="your_job_id" 20 | nj=16 21 | 22 | train_set="your_train_set" 23 | train_config="your_train_config" 24 | dir="your_output_dir" 25 | mkdir -p $dir 26 | tensorboard_dir="your_tensorboard_dir" 27 | checkpoint= 
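# 如需从已有模型断点继续训练,可按下面的示例设置 checkpoint 变量(epoch_10.pt 仅为假设的示例文件名,请替换为 $dir 中实际保存的模型文件):
# checkpoint=$dir/epoch_10.pt
# checkpoint 为空时从头开始训练;非空时,下方 torchrun 命令会通过 ${checkpoint:+--checkpoint $checkpoint} 自动传入该参数。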
28 | 29 | . tools/parse_options.sh || exit 1; 30 | 31 | # You have to rm `INIT_FILE` manually when you resume or restart a 32 | # multi-machine training. 33 | INIT_FILE=$dir/ddp_init 34 | if [ -f ${INIT_FILE} ]; then 35 | rm ${INIT_FILE} 36 | fi 37 | 38 | # cat /nfs/volume-225-13/zhangruixiong/wenet_space/wenet_xingcheng8k/examples/xingcheng8k/s0/data/train/exclude_sensitive_cantonese/16k_24000_tar.list /nfs/volume-225-13/zhangruixiong/training_data/sicheng_waihu.list /nfs/volume-225-13/zhangruixiong/wenet_space/wenet_xingcheng8k/examples/xingcheng8k/s0/data/train/exclude_sensitive/sensitive.list /nfs/volume-225-13/zhangruixiong/wenet_space/wenet_xingcheng8k/examples/xingcheng8k/s0/data/train/exclude_sensitive_cantonese/data_exclude_16ksearch_wenetspeech.list > /nfs/volume-225-13/zhangruixiong/wenet_space/wenet_xingcheng8k/examples/xingcheng8k/s0/data/train/exclude_sensitive_cantonese/16k_24000_sichangwaihu_sensitive_exclude_wenetspeech.list 39 | 40 | torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \ 41 | --node_rank=$node_rank --master_addr=$master_addr --master_port=$master_port \ 42 | wenet/bin/train.py \ 43 | --config $train_config \ 44 | --data_type mix \ 45 | --train_data data/train/$train_set/data.list \ 46 | --cv_data data/dev/data.list \ 47 | --tensorboard_dir $tensorboard_dir \ 48 | ${checkpoint:+--checkpoint $checkpoint} \ 49 | --model_dir $dir \ 50 | --num_workers 8 \ 51 | --pin_memory \ 52 | --use_amp 53 | -------------------------------------------------------------------------------- /wenet_using/train_mutimachine/one_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd "your_wenet_path" 4 | 5 | # 激活 wenet 虚拟环境 6 | source your_anaconda_path 7 | conda activate your_faster_whisper_env_name 8 | 9 | nvidia-smi 10 | export NCCL_IB_DISABLE=1 11 | 12 | num_nodes=1 13 | num_gpus=8 14 | job_id="your_job_id" 15 | HOST_NODE_ADDR="localhost:0" 16 | 17 | nj=16 18 | 19 | train_set="your_train_set" 20 | train_config="your_train_config" 21 | dir="your_output_dir" 22 | mkdir -p $dir 23 | 24 | checkpoint= 25 | 26 | . tools/parse_options.sh || exit 1; 27 | 28 | # You have to rm `INIT_FILE` manually when you resume or restart a 29 | # multi-machine training. 
30 | INIT_FILE=$dir/ddp_init 31 | if [ -f ${INIT_FILE} ]; then 32 | rm ${INIT_FILE} 33 | fi 34 | 35 | torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus \ 36 | --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR \ 37 | wenet/bin/train.py \ 38 | --config $train_config \ 39 | --data_type raw \ 40 | --train_data data/train/$train_set/data.list \ 41 | --cv_data data/dev/data.list \ 42 | ${checkpoint:+--checkpoint $checkpoint} \ 43 | --model_dir $dir \ 44 | --num_workers 8 \ 45 | --pin_memory \ 46 | --use_amp 47 | -------------------------------------------------------------------------------- /wenet_using/vscode/launch.json: -------------------------------------------------------------------------------- 1 | // 在低版本 python 中:"type": "python" 2 | 3 | { 4 | "version": "0.2.0", 5 | "configurations": [ 6 | { 7 | "name": "wenet debugger", 8 | "type": "debugpy", 9 | "request": "launch", 10 | "program": "/your_conda_env_path/lib/python3.8/site-packages/torch/distributed/run.py", 11 | "args": [ 12 | "--nnodes=1", 13 | "--nproc_per_node=8", 14 | "--rdzv_id=debug_wenet", 15 | "--rdzv_backend=c10d", 16 | "--rdzv_endpoint=localhost:0", 17 | "your_python_program_path", 18 | "--config", "your_wenet_config_path", 19 | "--data_type", "raw", 20 | "--train_data", "your_wenet_train_data_list_path", 21 | "--cv_data", "your_wenet_dev_data_list_path", 22 | "--model_dir", "your_wenet_output_path", 23 | "--num_workers", "8", 24 | "--pin_memory", 25 | "--use_amp", 26 | ], 27 | "env": { 28 | "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", 29 | }, 30 | "cwd": "your_wenet_path", 31 | "python": "your_python_path", 32 | "console": "integratedTerminal", 33 | "justMyCode": false, 34 | } 35 | ] 36 | } -------------------------------------------------------------------------------- /whisper_finetune_using/decode_bash/convert_whisper_to_fast_whisper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 2023_02_05 3 | 4 | # 激活虚拟环境 5 | source /nfs/volume-225-14/laizhihao_i/env/anaconda3/bin/activate 6 | conda activate blsp 7 | 8 | # 执行模型转换脚本 9 | ct2-transformers-converter \ 10 | --model /nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune_3.0/medium_kespeech_1e-4-fp32/checkpoint-310000/checkpoint-25000 \ 11 | --output_dir /nfs/volume-225-14/cuichenrui/whisper/faster_whisper_models/whisper_kespeech_1e-4-fp32/checkpoint-25000 \ 12 | --copy_files tokenizer.json preprocessor_config.json \ 13 | --quantization float16 14 | -------------------------------------------------------------------------------- /whisper_finetune_using/decode_bash/evaluate_faster_whisper_3.0_1.0_no_timestamp_310000_2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 挂载相关数据 4 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speech-datasets /ofs/speech-datasets dc680445ad5745c4871a9aeeebd988a4 nmgpu 5 | sudo bash /mnt/com/nfs/volume-225-14/cuichenrui/scriptmon/jianshu/ofs/release/current/script/ofs_mount.sh corebackup /ofs/corebackup 05b7b93388ef48cf932b72f4017c6e31 nmgpu 6 | sudo bash /mnt/common/jianshu/liquidio/release/current/script/liquid_mount_s3.sh k80-dataset AKDD00000000000SGIPX2FHPLPMALX ASDDCqkYLLUApBQrKInMsKjUECKbIZulHzdLTtlQ / /nfs/s3_k80_dataset 7 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speechssd /ofs/speechssd b46e06b5108e4fdd911a610d0faa5380 hbbpussd 8 | 9 | # 处理的数据集列表 10 | datasets=( 11 | "06_dialect/01_cantonese/cantonese_moved_test20000" \ 12 | 
"06_dialect/02_shanghai/shanghai_moved_test20000" \ 13 | "06_dialect/03_sichuan/sichuan_test20000" \ 14 | "07_customer_service/customer_service" \ 15 | "08_search/search" \ 16 | "09_8k_rengongkefu/8k_rengongkefu" \ 17 | "10_8k_sicheng/8k_sicheng" \ 18 | "12_speechio/01_speechio00/speechio00" \ 19 | "12_speechio/02_speechio01/speechio01" \ 20 | "12_speechio/03_speechio02/speechio02" \ 21 | "12_speechio/04_speechio03/speechio03" \ 22 | "12_speechio/05_speechio04/speechio04" \ 23 | "12_speechio/06_speechio05/speechio05" \ 24 | "13_kespeech/kespeech" \ 25 | "13_kespeech/00_mandarin/mandarin" \ 26 | "13_kespeech/01_bj/bj" \ 27 | "13_kespeech/02_xn/xn" \ 28 | "13_kespeech/03_zy/zy" \ 29 | "13_kespeech/04_db/db" \ 30 | "13_kespeech/05_ly/ly" \ 31 | "13_kespeech/06_jh/jh" \ 32 | "13_kespeech/07_jl/jl" \ 33 | "13_kespeech/08_jl/jl" \ 34 | ) 35 | 36 | # 使用的模型列表 37 | models=( 38 | "310000" 39 | ) 40 | 41 | for dataset in "${datasets[@]}"; do 42 | for model in "${models[@]}"; do 43 | 44 | # 初始化相关变量 45 | hyp_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_fast_whisper_medium_3.0_1.0_no_timestamp${model}_hyp.txt" 46 | lab_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_fast_whisper_medium_3.0_1.0_no_timestamp${model}_lab.txt" 47 | norm_hyp_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_fast_whisper_medium_3.0_1.0_no_timestamp${model}_hyp_norm.txt" 48 | norm_lab_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_fast_whisper_medium_3.0_1.0_no_timestamp${model}_lab_norm.txt" 49 | test_data="/nfs/volume-225-14/cuichenrui/dataset/${dataset}.jsonl" 50 | model_path="/nfs/volume-225-14/cuichenrui/whisper/faster_whisper_models/whisper_medium_3.0/checkpoint-${model}" 51 | log_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_fast_whisper_medium_3.0_1.0_no_timestamp${model}.log" 52 | cer_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_fast_whisper_medium_3.0_1.0_no_timestamp${model}_cer.txt" 53 | 54 | # 激活代码环境 55 | source /nfs/volume-225-14/laizhihao_i/env/anaconda3/bin/activate 56 | conda activate blsp 57 | 58 | # 解码whisper 59 | cd /nfs/volume-225-14/laizhihao_i/Whisper/faster-whisper 60 | python evaluate_cuichenrui_no_timestamp.py \ 61 | --hyp_path=$hyp_path \ 62 | --lab_path=$lab_path \ 63 | --test_data=$test_data \ 64 | --model_path=$model_path \ 65 | > $log_path 2>&1 66 | 67 | # 激活代码环境 68 | source /nfs/volume-225-14/laizhihao_i/env/anaconda3/bin/activate 69 | conda activate whisper 70 | 71 | # hpy和lab文件文本归一化 72 | cd /nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/tools 73 | 74 | python data_postprocessing.py \ 75 | --raw_file_path=$hyp_path \ 76 | --norm_file_path=$norm_hyp_path 77 | 78 | python data_postprocessing.py \ 79 | --raw_file_path=$lab_path \ 80 | --norm_file_path=$norm_lab_path 81 | 82 | # 计算cer 83 | cd /nfs/volume-225-14/laizhihao_i/Wenet/wenet 84 | python tools/compute-wer.py \ 85 | --char=1 \ 86 | --v=1 \ 87 | $norm_lab_path \ 88 | $norm_hyp_path \ 89 | > $cer_path 2>&1 90 | 91 | done 92 | done 93 | -------------------------------------------------------------------------------- /whisper_finetune_using/decode_bash/evaluate_faster_whisper_3.0_1.0_no_timestamp_310000_final_en.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 挂载相关数据 4 | sudo bash 
/mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speech-datasets /ofs/speech-datasets dc680445ad5745c4871a9aeeebd988a4 nmgpu 5 | sudo bash /mnt/com/nfs/volume-225-14/cuichenrui/scriptmon/jianshu/ofs/release/current/script/ofs_mount.sh corebackup /ofs/corebackup 05b7b93388ef48cf932b72f4017c6e31 nmgpu 6 | sudo bash /mnt/common/jianshu/liquidio/release/current/script/liquid_mount_s3.sh k80-dataset AKDD00000000000SGIPX2FHPLPMALX ASDDCqkYLLUApBQrKInMsKjUECKbIZulHzdLTtlQ / /nfs/s3_k80_dataset 7 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speechssd /ofs/speechssd b46e06b5108e4fdd911a610d0faa5380 hbbpussd 8 | 9 | # 处理的数据集列表 10 | datasets=( 11 | "04_pure_english/pure_english" \ 12 | ) 13 | 14 | # 使用的模型列表 15 | models=( 16 | "310000" 17 | ) 18 | 19 | for dataset in "${datasets[@]}"; do 20 | for model in "${models[@]}"; do 21 | 22 | # 初始化相关变量 23 | hyp_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2_final/decode_logs/${dataset}_fast_whisper_medium_3.0_1.0_no_timestamp${model}_hyp.txt" 24 | lab_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2_final/decode_logs/${dataset}_fast_whisper_medium_3.0_1.0_no_timestamp${model}_lab.txt" 25 | norm_hyp_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2_final/decode_logs/${dataset}_fast_whisper_medium_3.0_1.0_no_timestamp${model}_hyp_norm.txt" 26 | norm_lab_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2_final/decode_logs/${dataset}_fast_whisper_medium_3.0_1.0_no_timestamp${model}_lab_norm.txt" 27 | test_data="/nfs/volume-225-14/cuichenrui/dataset/${dataset}.jsonl" 28 | model_path="/nfs/volume-225-14/cuichenrui/whisper/faster_whisper_models/whisper_medium_3.0/checkpoint-${model}" 29 | log_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2_final/decode_logs/${dataset}_fast_whisper_medium_3.0_1.0_no_timestamp${model}.log" 30 | cer_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2_final/decode_logs/${dataset}_fast_whisper_medium_3.0_1.0_no_timestamp${model}_cer.txt" 31 | 32 | # 激活代码环境 33 | source /nfs/volume-225-14/laizhihao_i/env/anaconda3/bin/activate 34 | conda activate blsp 35 | 36 | # 解码whisper 37 | cd /nfs/volume-225-14/laizhihao_i/Whisper/faster-whisper 38 | python evaluate_cuichenrui_no_timestamp_choice_language.py \ 39 | --hyp_path=$hyp_path \ 40 | --lab_path=$lab_path \ 41 | --test_data=$test_data \ 42 | --model_path=$model_path \ 43 | --language_id="en" \ 44 | > $log_path 2>&1 45 | 46 | # 激活代码环境 47 | source /nfs/volume-225-14/laizhihao_i/env/anaconda3/bin/activate 48 | conda activate whisper 49 | 50 | # hpy和lab文件文本归一化 51 | cd /nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2_final/tools 52 | 53 | python data_postprocessing.py \ 54 | --raw_file_path=$hyp_path \ 55 | --norm_file_path=$norm_hyp_path 56 | 57 | python data_postprocessing.py \ 58 | --raw_file_path=$lab_path \ 59 | --norm_file_path=$norm_lab_path 60 | 61 | # 计算cer 62 | cd /nfs/volume-225-14/laizhihao_i/Wenet/wenet 63 | python tools/compute-wer.py \ 64 | --char=1 \ 65 | --v=1 \ 66 | $norm_lab_path \ 67 | $norm_hyp_path \ 68 | > $cer_path 2>&1 69 | 70 | done 71 | done 72 | -------------------------------------------------------------------------------- /whisper_finetune_using/decode_bash/evaluate_whisper_3.0lr_310000.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 挂载相关数据 4 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speech-datasets /ofs/speech-datasets 
dc680445ad5745c4871a9aeeebd988a4 nmgpu 5 | sudo bash /mnt/com/nfs/volume-225-14/cuichenrui/scriptmon/jianshu/ofs/release/current/script/ofs_mount.sh corebackup /ofs/corebackup 05b7b93388ef48cf932b72f4017c6e31 nmgpu 6 | sudo bash /mnt/common/jianshu/liquidio/release/current/script/liquid_mount_s3.sh k80-dataset AKDD00000000000SGIPX2FHPLPMALX ASDDCqkYLLUApBQrKInMsKjUECKbIZulHzdLTtlQ / /nfs/s3_k80_dataset 7 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speechssd /ofs/speechssd b46e06b5108e4fdd911a610d0faa5380 hbbpussd 8 | 9 | # 激活代码环境 10 | source /nfs/volume-225-14/laizhihao_i/env/anaconda3/bin/activate 11 | conda activate whisper 12 | 13 | # 处理的数据集列表 14 | datasets=( 15 | "01_in_car/01_map/map" \ 16 | "01_in_car/02_music/music" \ 17 | "01_in_car/03_car_control/car_control" \ 18 | "01_in_car/04_dynamic/dynamic" \ 19 | "01_in_car/05_static/static" \ 20 | "02_general/general" \ 21 | "02_general/01_aishell/aishell" \ 22 | "02_general/02_magicdata/magicdata" \ 23 | "02_general/03_hkust_dev/hkust_dev" \ 24 | "02_general/04_wenetspeech_test_meeting/wenetspeech_test_meeting" \ 25 | "02_general/05_wenetspeech_test_net/wenetspeech_test_net" \ 26 | "03_mix_chinese_and_english/mix_chinese_and_english" \ 27 | "04_pure_english/pure_english" \ 28 | "06_dialect/01_cantonese/cantonese_moved_test20000" \ 29 | "06_dialect/02_shanghai/shanghai_moved_test20000" \ 30 | "06_dialect/03_sichuan/sichuan_test20000" \ 31 | "07_customer_service/customer_service" \ 32 | "08_search/search" \ 33 | "09_8k_rengongkefu/8k_rengongkefu" \ 34 | "10_8k_sicheng/8k_sicheng" \ 35 | ) 36 | 37 | # 使用的模型列表 38 | models=( 39 | "310000" 40 | ) 41 | 42 | for dataset in "${datasets[@]}"; do 43 | for model in "${models[@]}"; do 44 | 45 | # 初始化相关变量 46 | hyp_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_whisper_medium_3.0lr_${model}_hyp.txt" 47 | lab_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_whisper_medium_3.0lr_${model}_lab.txt" 48 | norm_hyp_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_whisper_medium_3.0lr_${model}_hyp_norm.txt" 49 | norm_lab_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_whisper_medium_3.0lr_${model}_lab_norm.txt" 50 | test_data="/nfs/volume-225-14/cuichenrui/dataset/${dataset}.jsonl" 51 | model_path="/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune_2.0/medium_8w_without8k_fp16_3.0lr/model_medium/checkpoint-${model}" 52 | log_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_whisper_medium_3.0lr_${model}.log" 53 | cer_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_whisper_medium_3.0lr_${model}_cer.txt" 54 | 55 | # 解码whisper 56 | cd /nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune 57 | python evaluation_nolora.py \ 58 | --hyp_path=$hyp_path \ 59 | --lab_path=$lab_path \ 60 | --test_data=$test_data \ 61 | --model_path=$model_path \ 62 | --batch_size=16 \ 63 | --num_workers=8 \ 64 | --language=Chinese \ 65 | --remove_pun=True \ 66 | --to_simple=True \ 67 | --timestamps=False \ 68 | --min_audio_len=0.5 \ 69 | --max_audio_len=30 \ 70 | --local_files_only=True \ 71 | --task=transcribe \ 72 | --metric=cer \ 73 | > $log_path 2>&1 74 | 75 | # hpy和lab文件文本归一化 76 | cd /nfs/volume-225-14/cuichenrui/whisper/experiment_decode/tools 77 | 78 | python data_postprocessing.py \ 79 | --raw_file_path=$hyp_path \ 80 | --norm_file_path=$norm_hyp_path 81 | 82 
| python data_postprocessing.py \ 83 | --raw_file_path=$lab_path \ 84 | --norm_file_path=$norm_lab_path 85 | 86 | # 计算cer 87 | cd /nfs/volume-225-14/laizhihao_i/Wenet/wenet 88 | python tools/compute-wer.py \ 89 | --char=1 \ 90 | --v=1 \ 91 | $norm_lab_path \ 92 | $norm_hyp_path \ 93 | > $cer_path 2>&1 94 | 95 | done 96 | done 97 | -------------------------------------------------------------------------------- /whisper_finetune_using/decode_bash/test_language_id_3.0lr_english.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 挂载相关数据 4 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speech-datasets /ofs/speech-datasets dc680445ad5745c4871a9aeeebd988a4 nmgpu 5 | sudo bash /mnt/com/nfs/volume-225-14/cuichenrui/scriptmon/jianshu/ofs/release/current/script/ofs_mount.sh corebackup /ofs/corebackup 05b7b93388ef48cf932b72f4017c6e31 nmgpu 6 | sudo bash /mnt/common/jianshu/liquidio/release/current/script/liquid_mount_s3.sh k80-dataset AKDD00000000000SGIPX2FHPLPMALX ASDDCqkYLLUApBQrKInMsKjUECKbIZulHzdLTtlQ / /nfs/s3_k80_dataset 7 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speechssd /ofs/speechssd b46e06b5108e4fdd911a610d0faa5380 hbbpussd 8 | 9 | # 激活代码环境 10 | source /nfs/volume-225-14/laizhihao_i/env/anaconda3/bin/activate 11 | conda activate whisper 12 | 13 | # 处理的数据集列表 14 | datasets=( 15 | "03_mix_chinese_and_english/mix_chinese_and_english" \ 16 | "04_pure_english/pure_english" \ 17 | ) 18 | 19 | # 使用的模型列表 20 | models=( 21 | "310000" 22 | ) 23 | 24 | for dataset in "${datasets[@]}"; do 25 | for model in "${models[@]}"; do 26 | 27 | # 初始化相关变量 28 | hyp_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_whisper_medium_3.0lr_english_${model}_hyp.txt" 29 | lab_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_whisper_medium_3.0lr_english_${model}_lab.txt" 30 | norm_hyp_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_whisper_medium_3.0lr_english_${model}_hyp_norm.txt" 31 | norm_lab_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_whisper_medium_3.0lr_english_${model}_lab_norm.txt" 32 | test_data="/nfs/volume-225-14/cuichenrui/dataset/${dataset}.jsonl" 33 | model_path="/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune_2.0/medium_8w_without8k_fp16_3.0lr/model_medium/checkpoint-${model}" 34 | log_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_whisper_medium_3.0lr_english_${model}.log" 35 | cer_path="/nfs/volume-225-14/cuichenrui/whisper/experiment_decode_2/decode_logs/${dataset}_whisper_medium_3.0lr_english_${model}_cer.txt" 36 | 37 | # 解码whisper 38 | cd /nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune 39 | python evaluation_nolora.py \ 40 | --hyp_path=$hyp_path \ 41 | --lab_path=$lab_path \ 42 | --test_data=$test_data \ 43 | --model_path=$model_path \ 44 | --batch_size=16 \ 45 | --num_workers=8 \ 46 | --language=English \ 47 | --remove_pun=True \ 48 | --to_simple=True \ 49 | --timestamps=False \ 50 | --min_audio_len=0.5 \ 51 | --max_audio_len=30 \ 52 | --local_files_only=True \ 53 | --task=transcribe \ 54 | --metric=cer \ 55 | > $log_path 2>&1 56 | 57 | # hpy和lab文件文本归一化 58 | cd /nfs/volume-225-14/cuichenrui/whisper/experiment_decode/tools 59 | 60 | python data_postprocessing.py \ 61 | --raw_file_path=$hyp_path \ 62 | --norm_file_path=$norm_hyp_path 63 | 64 | python 
data_postprocessing.py \ 65 | --raw_file_path=$lab_path \ 66 | --norm_file_path=$norm_lab_path 67 | 68 | # 计算cer 69 | cd /nfs/volume-225-14/laizhihao_i/Wenet/wenet 70 | python tools/compute-wer.py \ 71 | --char=1 \ 72 | --v=1 \ 73 | $norm_lab_path \ 74 | $norm_hyp_path \ 75 | > $cer_path 2>&1 76 | 77 | done 78 | done 79 | -------------------------------------------------------------------------------- /whisper_finetune_using/evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import gc 4 | import os 5 | 6 | import evaluate 7 | import numpy as np 8 | import torch 9 | from torch.utils.data import DataLoader 10 | from tqdm import tqdm 11 | from transformers import WhisperForConditionalGeneration, WhisperProcessor 12 | 13 | from utils.data_utils import DataCollatorSpeechSeq2SeqWithPadding, remove_punctuation, to_simple 14 | from utils.reader import CustomDataset 15 | from utils.utils import print_arguments, add_arguments 16 | 17 | parser = argparse.ArgumentParser(description=__doc__) 18 | add_arg = functools.partial(add_arguments, argparser=parser) 19 | add_arg("test_data", type=str, default="dataset/test.json", help="测试集的路径") 20 | add_arg("model_path", type=str, default="models/whisper-tiny-finetune", help="合并模型的路径,或者是huggingface上模型的名称") 21 | add_arg("batch_size", type=int, default=16, help="评估的batch size") 22 | add_arg("num_workers", type=int, default=8, help="读取数据的线程数量") 23 | add_arg("language", type=str, default="Chinese", help="设置语言,可全称也可简写,如果为None则评估的是多语言") 24 | add_arg("remove_pun", type=bool, default=True, help="是否移除标点符号") 25 | add_arg("to_simple", type=bool, default=True, help="是否转为简体中文") 26 | add_arg("timestamps", type=bool, default=False, help="评估时是否使用时间戳数据") 27 | add_arg("min_audio_len", type=float, default=0.5, help="最小的音频长度,单位秒") 28 | add_arg("max_audio_len", type=float, default=30, help="最大的音频长度,单位秒") 29 | add_arg("local_files_only", type=bool, default=True, help="是否只在本地加载模型,不尝试下载") 30 | add_arg("task", type=str, default="transcribe", choices=['transcribe', 'translate'], help="模型的任务") 31 | add_arg("metric", type=str, default="cer", choices=['cer', 'wer'], help="评估方式") 32 | args = parser.parse_args() 33 | print_arguments(args) 34 | 35 | # 判断模型路径是否合法 36 | assert 'openai' == os.path.dirname(args.model_path) or os.path.exists(args.model_path), \ 37 | f"模型文件{args.model_path}不存在,请检查是否已经成功合并模型,或者是否为huggingface存在模型" 38 | # 获取Whisper的数据处理器,这个包含了特征提取器、tokenizer 39 | processor = WhisperProcessor.from_pretrained(args.model_path, 40 | language=args.language, 41 | task=args.task, 42 | no_timestamps=not args.timestamps, 43 | local_files_only=args.local_files_only) 44 | forced_decoder_ids = processor.get_decoder_prompt_ids() 45 | # 获取模型 46 | model = WhisperForConditionalGeneration.from_pretrained(args.model_path, 47 | device_map="auto", 48 | local_files_only=args.local_files_only) 49 | model.eval() 50 | 51 | # 获取测试数据 52 | test_dataset = CustomDataset(data_list_path=args.test_data, 53 | processor=processor, 54 | timestamps=args.timestamps, 55 | min_duration=args.min_audio_len, 56 | max_duration=args.max_audio_len) 57 | print(f"测试数据:{len(test_dataset)}") 58 | 59 | # 数据padding器 60 | data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor) 61 | eval_dataloader = DataLoader(test_dataset, batch_size=args.batch_size, 62 | num_workers=args.num_workers, collate_fn=data_collator) 63 | 64 | # 获取评估方法 65 | metric = evaluate.load(f'metrics/{args.metric}.py') 66 | 67 | # 开始评估 68 | for step, batch in 
enumerate(tqdm(eval_dataloader)): 69 | with torch.cuda.amp.autocast(): 70 | with torch.no_grad(): 71 | generated_tokens = ( 72 | model.generate( 73 | input_features=batch["input_features"].cuda(), 74 | decoder_input_ids=batch["labels"][:, :4].cuda(), 75 | forced_decoder_ids=forced_decoder_ids, 76 | max_new_tokens=255).cpu().numpy()) 77 | labels = batch["labels"].cpu().numpy() 78 | labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id) 79 | # 将预测和实际的token转换为文本 80 | decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) 81 | decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True) 82 | # 删除标点符号 83 | if args.remove_pun: 84 | decoded_preds = remove_punctuation(decoded_preds) 85 | decoded_labels = remove_punctuation(decoded_labels) 86 | # 将繁体中文总成简体中文 87 | if args.to_simple: 88 | decoded_preds = to_simple(decoded_preds) 89 | decoded_labels = to_simple(decoded_labels) 90 | metric.add_batch(predictions=decoded_preds, references=decoded_labels) 91 | # 删除计算的记录 92 | del generated_tokens, labels, batch 93 | gc.collect() 94 | # 计算评估结果 95 | m = metric.compute() 96 | print(f"评估结果:{args.metric}={round(m, 5)}") 97 | -------------------------------------------------------------------------------- /whisper_finetune_using/evaluation_debug2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import gc 4 | import os 5 | 6 | import evaluate 7 | import numpy as np 8 | import torch 9 | from torch.utils.data import DataLoader 10 | from tqdm import tqdm 11 | from transformers import WhisperForConditionalGeneration, WhisperProcessor 12 | 13 | from utils.data_utils import DataCollatorSpeechSeq2SeqWithPadding, remove_punctuation, to_simple 14 | from utils.reader import CustomDataset 15 | from utils.utils import print_arguments, add_arguments 16 | 17 | parser = argparse.ArgumentParser(description=__doc__) 18 | add_arg = functools.partial(add_arguments, argparser=parser) 19 | add_arg("test_data", type=str, default="/nfs/volume-225-14/laizhihao_i/data/train_data/valid_all.json", help="测试集的路径") 20 | add_arg("model_path", type=str, default="/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune/checkpoint/output_3000/model_large_v2-finetune", help="合并模型的路径,或者是huggingface上模型的名称") 21 | add_arg("batch_size", type=int, default=16, help="评估的batch size") 22 | add_arg("num_workers", type=int, default=8, help="读取数据的线程数量") 23 | add_arg("language", type=str, default="Chinese", help="设置语言,可全称也可简写,如果为None则评估的是多语言") 24 | add_arg("remove_pun", type=bool, default=True, help="是否移除标点符号") 25 | add_arg("to_simple", type=bool, default=True, help="是否转为简体中文") 26 | add_arg("timestamps", type=bool, default=False, help="评估时是否使用时间戳数据") 27 | add_arg("min_audio_len", type=float, default=0.5, help="最小的音频长度,单位秒") 28 | add_arg("max_audio_len", type=float, default=30, help="最大的音频长度,单位秒") 29 | add_arg("local_files_only", type=bool, default=True, help="是否只在本地加载模型,不尝试下载") 30 | add_arg("task", type=str, default="transcribe", choices=['transcribe', 'translate'], help="模型的任务") 31 | add_arg("metric", type=str, default="cer", choices=['cer', 'wer'], help="评估方式") 32 | args = parser.parse_args() 33 | print_arguments(args) 34 | 35 | # 判断模型路径是否合法 36 | assert 'openai' == os.path.dirname(args.model_path) or os.path.exists(args.model_path), \ 37 | f"模型文件{args.model_path}不存在,请检查是否已经成功合并模型,或者是否为huggingface存在模型" 38 | # 获取Whisper的数据处理器,这个包含了特征提取器、tokenizer 39 | processor = 
WhisperProcessor.from_pretrained(args.model_path, 40 | language=args.language, 41 | task=args.task, 42 | no_timestamps=not args.timestamps, 43 | local_files_only=args.local_files_only) 44 | forced_decoder_ids = processor.get_decoder_prompt_ids() 45 | # 获取模型 46 | model = WhisperForConditionalGeneration.from_pretrained(args.model_path, 47 | device_map="auto", 48 | local_files_only=args.local_files_only) 49 | model.eval() 50 | 51 | # 获取测试数据 52 | test_dataset = CustomDataset(data_list_path=args.test_data, 53 | processor=processor, 54 | timestamps=args.timestamps, 55 | min_duration=args.min_audio_len, 56 | max_duration=args.max_audio_len) 57 | print(f"测试数据:{len(test_dataset)}") 58 | 59 | # 数据padding器 60 | data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor) 61 | eval_dataloader = DataLoader(test_dataset, batch_size=args.batch_size, 62 | num_workers=args.num_workers, collate_fn=data_collator) 63 | 64 | # 获取评估方法 65 | metric = evaluate.load(f'/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune/metrics/{args.metric}.py') 66 | 67 | # 开始评估 68 | 69 | for step, batch in enumerate(tqdm(eval_dataloader)): 70 | with torch.cuda.amp.autocast(): 71 | with torch.no_grad(): 72 | generated_tokens = ( 73 | model.generate( 74 | input_features=batch["input_features"].cuda(), 75 | decoder_input_ids=batch["labels"][:, :4].cuda(), 76 | forced_decoder_ids=forced_decoder_ids, 77 | max_new_tokens=255).cpu().numpy()) 78 | labels = batch["labels"].cpu().numpy() 79 | labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id) 80 | # 将预测和实际的token转换为文本 81 | decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) 82 | decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True) 83 | # 删除标点符号 84 | if args.remove_pun: 85 | decoded_preds = remove_punctuation(decoded_preds) 86 | decoded_labels = remove_punctuation(decoded_labels) 87 | # 将繁体中文总成简体中文 88 | if args.to_simple: 89 | decoded_preds = to_simple(decoded_preds) 90 | decoded_labels = to_simple(decoded_labels) 91 | metric.add_batch(predictions=decoded_preds, references=decoded_labels) 92 | # 删除计算的记录 93 | del generated_tokens, labels, batch 94 | gc.collect() 95 | # 计算评估结果 96 | m = metric.compute() 97 | print(f"评估结果:{args.metric}={round(m, 5)}") 98 | -------------------------------------------------------------------------------- /whisper_finetune_using/infer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import platform 4 | 5 | import torch 6 | from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM 7 | 8 | from utils.utils import print_arguments, add_arguments 9 | 10 | parser = argparse.ArgumentParser(description=__doc__) 11 | add_arg = functools.partial(add_arguments, argparser=parser) 12 | add_arg("audio_path", type=str, default="dataset/test.wav", help="预测的音频路径") 13 | add_arg("model_path", type=str, default="models/whisper-tiny-finetune/", help="合并模型的路径,或者是huggingface上模型的名称") 14 | add_arg("use_gpu", type=bool, default=True, help="是否使用gpu进行预测") 15 | add_arg("language", type=str, default="chinese", help="设置语言,如果为None则预测的是多语言") 16 | add_arg("num_beams", type=int, default=1, help="解码搜索大小") 17 | add_arg("batch_size", type=int, default=16, help="预测batch_size大小") 18 | add_arg("use_compile", type=bool, default=False, help="是否使用Pytorch2.0的编译器") 19 | add_arg("task", type=str, default="transcribe", choices=['transcribe', 'translate'], help="模型的任务") 20 | 
add_arg("assistant_model_path", type=str, default=None, help="助手模型,可以提高推理速度,例如openai/whisper-tiny") 21 | add_arg("local_files_only", type=bool, default=True, help="是否只在本地加载模型,不尝试下载") 22 | add_arg("use_flash_attention_2", type=bool, default=False, help="是否使用FlashAttention2加速") 23 | add_arg("use_bettertransformer", type=bool, default=False, help="是否使用BetterTransformer加速") 24 | args = parser.parse_args() 25 | print_arguments(args) 26 | 27 | # 设置设备 28 | device = "cuda:0" if torch.cuda.is_available() and args.use_gpu else "cpu" 29 | torch_dtype = torch.float16 if torch.cuda.is_available() and args.use_gpu else torch.float32 30 | 31 | # 获取Whisper的特征提取器、编码器和解码器 32 | processor = AutoProcessor.from_pretrained(args.model_path) 33 | 34 | # 获取模型 35 | model = AutoModelForSpeechSeq2Seq.from_pretrained( 36 | args.model_path, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, 37 | use_flash_attention_2=args.use_flash_attention_2 38 | ) 39 | if args.use_bettertransformer and not args.use_flash_attention_2: 40 | model = model.to_bettertransformer() 41 | # 使用Pytorch2.0的编译器 42 | if args.use_compile: 43 | if torch.__version__ >= "2" and platform.system().lower() != 'windows': 44 | model = torch.compile(model) 45 | model.to(device) 46 | 47 | # 获取助手模型 48 | generate_kwargs_pipeline = None 49 | if args.assistant_model_path is not None: 50 | assistant_model = AutoModelForCausalLM.from_pretrained( 51 | args.assistant_model_path, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True 52 | ) 53 | assistant_model.to(device) 54 | generate_kwargs_pipeline = {"assistant_model": assistant_model} 55 | 56 | # 获取管道 57 | infer_pipe = pipeline("automatic-speech-recognition", 58 | model=model, 59 | tokenizer=processor.tokenizer, 60 | feature_extractor=processor.feature_extractor, 61 | max_new_tokens=128, 62 | chunk_length_s=30, 63 | batch_size=args.batch_size, 64 | torch_dtype=torch_dtype, 65 | generate_kwargs=generate_kwargs_pipeline, 66 | device=device) 67 | 68 | # 推理参数 69 | generate_kwargs = {"task": args.task, "num_beams": args.num_beams} 70 | if args.language is not None: 71 | generate_kwargs["language"] = args.language 72 | # 推理 73 | result = infer_pipe(args.audio_path, return_timestamps=True, generate_kwargs=generate_kwargs) 74 | 75 | for chunk in result["chunks"]: 76 | print(f"[{chunk['timestamp'][0]}-{chunk['timestamp'][1]}s] {chunk['text']}") 77 | -------------------------------------------------------------------------------- /whisper_finetune_using/infer_ct2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import os 4 | 5 | from faster_whisper import WhisperModel 6 | 7 | from utils.utils import print_arguments, add_arguments 8 | 9 | os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' 10 | parser = argparse.ArgumentParser(description=__doc__) 11 | add_arg = functools.partial(add_arguments, argparser=parser) 12 | add_arg("audio_path", type=str, default="dataset/test.wav", help="预测的音频路径") 13 | add_arg("model_path", type=str, default="models/whisper-tiny-finetune-ct2", help="转换后的模型路径,转换方式看文档") 14 | add_arg("language", type=str, default="zh", help="设置语言,必须简写,如果为None则自动检测语言") 15 | add_arg("task", type=str, default="transcribe", choices=['transcribe', 'translate'], help="模型的任务") 16 | add_arg("use_gpu", type=bool, default=True, help="是否使用gpu进行预测") 17 | add_arg("use_int8", type=bool, default=False, help="是否使用int8进行预测") 18 | add_arg("beam_size", type=int, default=10, help="解码搜索大小") 19 | add_arg("num_workers", type=int, 
default=1, help="预测器的并发数量") 20 | add_arg("vad_filter", type=bool, default=False, help="是否使用VAD过滤掉部分没有讲话的音频") 21 | add_arg("local_files_only", type=bool, default=True, help="是否只在本地加载模型,不尝试下载") 22 | args = parser.parse_args() 23 | print_arguments(args) 24 | 25 | # 检查模型文件是否存在 26 | assert os.path.exists(args.model_path), f"模型文件{args.model_path}不存在" 27 | # 加载模型 28 | if args.use_gpu: 29 | if not args.use_int8: 30 | model = WhisperModel(args.model_path, device="cuda", compute_type="float16", num_workers=args.num_workers, 31 | local_files_only=args.local_files_only) 32 | else: 33 | model = WhisperModel(args.model_path, device="cuda", compute_type="int8_float16", num_workers=args.num_workers, 34 | local_files_only=args.local_files_only) 35 | else: 36 | model = WhisperModel(args.model_path, device="cpu", compute_type="int8", num_workers=args.num_workers, 37 | local_files_only=args.local_files_only) 38 | # 支持large-v3模型 39 | if 'large-v3' in args.model_path: 40 | model.feature_extractor.mel_filters = \ 41 | model.feature_extractor.get_mel_filters(model.feature_extractor.sampling_rate, 42 | model.feature_extractor.n_fft, n_mels=128) 43 | # 预热 44 | _, _ = model.transcribe("dataset/test.wav", beam_size=5) 45 | 46 | 47 | # 语音识别 48 | segments, info = model.transcribe(args.audio_path, beam_size=args.beam_size, language=args.language, task=args.task, 49 | vad_filter=args.vad_filter) 50 | for segment in segments: 51 | text = segment.text 52 | print(f"[{round(segment.start, 2)} - {round(segment.end, 2)}]:{text}\n") 53 | -------------------------------------------------------------------------------- /whisper_finetune_using/merge_lora.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import os 4 | 5 | from transformers import WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizerFast,\ 6 | WhisperProcessor 7 | from peft import PeftModel, PeftConfig 8 | from utils.utils import print_arguments, add_arguments 9 | 10 | parser = argparse.ArgumentParser(description=__doc__) 11 | add_arg = functools.partial(add_arguments, argparser=parser) 12 | add_arg("lora_model", type=str, default="output/whisper-tiny/checkpoint-best/", help="微调保存的模型路径") 13 | add_arg('output_dir', type=str, default='models/', help="合并模型的保存目录") 14 | add_arg("local_files_only", type=bool, default=False, help="是否只在本地加载模型,不尝试下载") 15 | args = parser.parse_args() 16 | print_arguments(args) 17 | 18 | # 检查模型文件是否存在 19 | assert os.path.exists(args.lora_model), f"模型文件{args.lora_model}不存在" 20 | # 获取Lora配置参数 21 | peft_config = PeftConfig.from_pretrained(args.lora_model) 22 | # 获取Whisper的基本模型 23 | base_model = WhisperForConditionalGeneration.from_pretrained(peft_config.base_model_name_or_path, device_map={"": "cpu"}, 24 | local_files_only=args.local_files_only) 25 | # 与Lora模型合并 26 | model = PeftModel.from_pretrained(base_model, args.lora_model, local_files_only=args.local_files_only) 27 | feature_extractor = WhisperFeatureExtractor.from_pretrained(peft_config.base_model_name_or_path, 28 | local_files_only=args.local_files_only) 29 | tokenizer = WhisperTokenizerFast.from_pretrained(peft_config.base_model_name_or_path, 30 | local_files_only=args.local_files_only) 31 | processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, 32 | local_files_only=args.local_files_only) 33 | 34 | # 合并参数 35 | model = model.merge_and_unload() 36 | model.train(False) 37 | 38 | # 保存的文件夹路径 39 | if peft_config.base_model_name_or_path.endswith("/"): 40 | 
peft_config.base_model_name_or_path = peft_config.base_model_name_or_path[:-1] 41 | save_directory = os.path.join(args.output_dir, f'{os.path.basename(peft_config.base_model_name_or_path)}-finetune') 42 | os.makedirs(save_directory, exist_ok=True) 43 | 44 | # 保存模型到指定目录中 45 | model.save_pretrained(save_directory, max_shard_size='4GB') 46 | feature_extractor.save_pretrained(save_directory) 47 | tokenizer.save_pretrained(save_directory) 48 | processor.save_pretrained(save_directory) 49 | print(f'合并模型保持在:{save_directory}') 50 | -------------------------------------------------------------------------------- /whisper_finetune_using/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.23.1 2 | soundfile>=0.12.1 3 | librosa>=0.10.0 4 | dataclasses>=0.6 5 | transformers>=4.35.0 6 | bitsandbytes>=0.41.0 7 | datasets>=2.11.0 8 | evaluate>=0.4.0 9 | ctranslate2>=3.21.0 10 | faster-whisper>=0.10.0 11 | jiwer>=2.5.1 12 | peft>=0.6.2 13 | accelerate>=0.21.0 14 | zhconv>=1.4.2 15 | tqdm>=4.62.1 16 | soundcard>=0.4.2 17 | uvicorn>=0.21.1 18 | fastapi>=0.95.1 19 | starlette>=0.26.1 20 | tensorboardX>=2.2 21 | -------------------------------------------------------------------------------- /whisper_finetune_using/run_finetune_debug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 finetune_debug.py --base_model=/nfs/volume-225-14/laizhihao_i/Whisper/pretrain_model/model_small --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --gradient_accumulation_steps=4 --output_dir=/nfs/volume-225-14/laizhihao_i/Whisper/checkpoint/model_small --train_data=/nfs/volume-225-14/laizhihao_i/data/train_data_add8k/all8k_filter.json --test_data=/nfs/volume-225-14/laizhihao_i/data/train_data_add8k/all8k_filter.json 4 | 5 | -------------------------------------------------------------------------------- /whisper_finetune_using/run_finetune_fp16_cuichenrui_01.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nvidia-smi 4 | 5 | export NCCL_IB_DISABLE=1 6 | 7 | num_nodes=3 8 | node_rank=0 9 | num_gpus=8 10 | job_id=whisperMediumLoraFinetune01 11 | HOST_NODE_ADDR="10.191.156.77:12348" 12 | master_addr=10.191.156.77 13 | master_port=12348 14 | 15 | torchrun --nnodes=$num_nodes \ 16 | --nproc_per_node=$num_gpus \ 17 | --node_rank=$node_rank \ 18 | --rdzv_id=$job_id \ 19 | --rdzv_backend="c10d" \ 20 | --rdzv_endpoint=$HOST_NODE_ADDR \ 21 | --master_addr=$master_addr \ 22 | --master_port=$master_port finetune_multimachine.py \ 23 | --base_model=/nfs/volume-225-14/laizhihao_i/Whisper/pretrain_model/model_medium \ 24 | --per_device_train_batch_size=2 \ 25 | --per_device_eval_batch_size=4 \ 26 | --gradient_accumulation_steps=5 \ 27 | --num_workers=8 \ 28 | --output_dir=/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_speed_test/A6000_environmrnt/whisper_medium_full_finetune_fp16/rank0 \ 29 | --train_data=/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_speed_test/train2wh_head10w.json \ 30 | --test_data=/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_speed_test/test2wh_head5000.json \ 31 | --fp16=True \ 32 | -------------------------------------------------------------------------------- /whisper_finetune_using/run_finetune_fp16_cuichenrui_02.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nvidia-smi 4 | 5 | export 
NCCL_IB_DISABLE=1 6 | 7 | num_nodes=3 8 | node_rank=1 9 | num_gpus=8 10 | job_id=whisperMediumLoraFinetune01 11 | HOST_NODE_ADDR="10.191.156.77:12348" 12 | master_addr=10.191.156.77 13 | master_port=12348 14 | 15 | torchrun --nnodes=$num_nodes \ 16 | --nproc_per_node=$num_gpus \ 17 | --node_rank=$node_rank \ 18 | --rdzv_id=$job_id \ 19 | --rdzv_backend="c10d" \ 20 | --rdzv_endpoint=$HOST_NODE_ADDR \ 21 | --master_addr=$master_addr \ 22 | --master_port=$master_port finetune_multimachine.py \ 23 | --base_model=/nfs/volume-225-14/laizhihao_i/Whisper/pretrain_model/model_medium \ 24 | --per_device_train_batch_size=2 \ 25 | --per_device_eval_batch_size=4 \ 26 | --gradient_accumulation_steps=5 \ 27 | --num_workers=8 \ 28 | --output_dir=/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_speed_test/A6000_environmrnt/whisper_medium_full_finetune_fp16/rank1 \ 29 | --train_data=/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_speed_test/train2wh_head10w.json \ 30 | --test_data=/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_speed_test/test2wh_head5000.json \ 31 | --fp16=True \ 32 | -------------------------------------------------------------------------------- /whisper_finetune_using/run_finetune_fp16_cuichenrui_03.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nvidia-smi 4 | 5 | export NCCL_IB_DISABLE=1 6 | 7 | num_nodes=3 8 | node_rank=2 9 | num_gpus=8 10 | job_id=whisperMediumLoraFinetune01 11 | HOST_NODE_ADDR="10.191.156.77:12348" 12 | master_addr=10.191.156.77 13 | master_port=12348 14 | 15 | torchrun --nnodes=$num_nodes \ 16 | --nproc_per_node=$num_gpus \ 17 | --node_rank=$node_rank \ 18 | --rdzv_id=$job_id \ 19 | --rdzv_backend="c10d" \ 20 | --rdzv_endpoint=$HOST_NODE_ADDR \ 21 | --master_addr=$master_addr \ 22 | --master_port=$master_port finetune_multimachine.py \ 23 | --base_model=/nfs/volume-225-14/laizhihao_i/Whisper/pretrain_model/model_medium \ 24 | --per_device_train_batch_size=2 \ 25 | --per_device_eval_batch_size=4 \ 26 | --gradient_accumulation_steps=5 \ 27 | --num_workers=8 \ 28 | --output_dir=/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_speed_test/A6000_environmrnt/whisper_medium_full_finetune_fp16/rank2 \ 29 | --train_data=/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_speed_test/train2wh_head10w.json \ 30 | --test_data=/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_speed_test/test2wh_head5000.json \ 31 | --fp16=True \ 32 | -------------------------------------------------------------------------------- /whisper_finetune_using/run_finetune_multimachine_1_8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NCCL_IB_DISABLE=1 3 | 4 | num_nodes=1 5 | num_gpus=8 6 | job_id=xingcheng8k 7 | HOST_NODE_ADDR="localhost:0" 8 | 9 | torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR finetune_multimachine.py --base_model=/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune_2.0/medium_8w_without8k_fp16_6_id/model_medium/checkpoint-310000 --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --gradient_accumulation_steps=4 --num_workers=8 --output_dir=/nfs/dataset-411-391/speech2024/whisper/test/whisper_train_xingcheng --train_data=/nfs/dataset-411-391/wenet/examples/multi_cn/s0/data/train/filter_blank_zxbjjl_sichuanhua_national2.data --test_data=/nfs/volume-225-14/laizhihao_i/data/train_data/test1.json --learning_rate=5e-5 --fp16=True 
--use_tar_file_datalist=True 10 | # torchrun --nnodes=$num_nodes --nproc_per_node=$num_gpus --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint=$HOST_NODE_ADDR finetune_multimachine.py --base_model=/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune_2.0/medium_8w_without8k_fp16_6_id/model_medium/checkpoint-310000 --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --gradient_accumulation_steps=4 --num_workers=8 --output_dir=/nfs/dataset-411-391/speech2024/whisper/test/whisper_train_xingcheng --train_data=/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune/dataset/test_tar.jsonl --test_data=/nfs/volume-225-14/laizhihao_i/data/train_data/test1.json --learning_rate=5e-5 --fp16=True --use_tar_file_datalist=True -------------------------------------------------------------------------------- /whisper_finetune_using/tools/data_postprocessing.py: -------------------------------------------------------------------------------- 1 | # 2024_02_04 2 | # 数据后处理脚本,负责将文本规范化 3 | 4 | import argparse 5 | from tn.chinese.normalizer import Normalizer 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--raw_file_path', type=str, help="需要进行后处理的文件路径") 9 | parser.add_argument('--norm_file_path', type=str, help="处理后的文件路径") 10 | args = parser.parse_args() 11 | 12 | raw_file_path = args.raw_file_path 13 | norm_file_path = args.norm_file_path 14 | 15 | normalizer = Normalizer() 16 | 17 | with open(raw_file_path, "r") as raw_file, open(norm_file_path, "w") as norm_file: 18 | for line in raw_file: 19 | parts = line.strip().split("\t") 20 | if len(parts) >= 2: # 检查是否存在第二项元素 21 | norm_text = normalizer.normalize(parts[1]) 22 | norm_file.write(f"{parts[0]}\t{norm_text}\n") 23 | else: 24 | norm_file.write(f"{parts[0]}\t\n") 25 | -------------------------------------------------------------------------------- /whisper_finetune_using/tools/data_postprocessing_cantonese.py: -------------------------------------------------------------------------------- 1 | # 2024_02_04 2 | # 数据后处理脚本,负责将文本规范化 3 | 4 | import argparse 5 | from tn.chinese.normalizer import Normalizer 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--raw_file_path', type=str, help="需要进行后处理的文件路径") 9 | parser.add_argument('--norm_file_path', type=str, help="处理后的文件路径") 10 | args = parser.parse_args() 11 | 12 | raw_file_path = args.raw_file_path 13 | norm_file_path = args.norm_file_path 14 | 15 | normalizer = Normalizer() 16 | 17 | with open(raw_file_path, "r") as raw_file, open(norm_file_path, "w") as norm_file: 18 | for line in raw_file: 19 | 20 | parts = line.strip().split("\t") 21 | norm_text = normalizer.normalize(parts[1]) 22 | norm_file.write(f"{parts[0][-30:]}\t{norm_text}\n") -------------------------------------------------------------------------------- /whisper_finetune_using/tools/data_preprocessing.py: -------------------------------------------------------------------------------- 1 | # 2024_02_02 2 | # 数据准备脚本,将data.list转化为whisper可读的json文件 3 | # 示例为dynamic场景,其他场景以此类推 4 | 5 | data_list_filenames = ["dynamic1_scene1/wav.ref", 6 | "dynamic1_scene2/wav.ref", 7 | "dynamic1_scene3/wav.ref", 8 | "dynamic1_scene4/wav.ref", 9 | "dynamic1_scene5/wav.ref", 10 | "dynamic1_scene6/wav.ref", 11 | "dynamic2_scene1/wav.ref", 12 | "dynamic2_scene2/wav.ref", 13 | "dynamic2_scene3/wav.ref", 14 | "dynamic2_scene4/wav.ref"] 15 | 16 | whisper_json_filename = "dynamic.json" 17 | 18 | import json 19 | import jsonlines 20 | import soundfile 21 | import string 22 | import tqdm 23 | 24 | def 
remove_punctuation(input_string): 25 | """去除所有标点符号""" 26 | translation_table = str.maketrans("", "", string.punctuation + ",。、;:!?()【】『』“”《》[]{}﹙﹚﹛﹜﹝﹞〔〕〈〉") 27 | no_punct = input_string.translate(translation_table) 28 | return no_punct 29 | 30 | total_items = 0 31 | total_correct_items = 0 32 | total_error_items = 0 33 | total_correct_duration = 0 34 | progress = 0 35 | 36 | for data_list_filename in data_list_filenames: 37 | print(f"处理文件: {data_list_filename}") 38 | with open(data_list_filename, "r", encoding='gb18030') as data_list_file: 39 | contents = data_list_file.readlines() 40 | 41 | total_items += len(contents) 42 | 43 | with jsonlines.open(whisper_json_filename, mode="a") as whisper_json_file: 44 | for content in contents: 45 | progress += 1 46 | if progress % 100 == 0: 47 | print(f"{progress} / {total_items}") 48 | 49 | audio_path, text = content.strip().split("\t") 50 | try: 51 | text = remove_punctuation(text) 52 | sample, sr = soundfile.read(audio_path) 53 | duration = round(sample.shape[-1] / float(sr), 2) 54 | result_json = {"audio": {"path": audio_path}, "sentence": text, "duration": duration} 55 | whisper_json_file.write(result_json) 56 | 57 | total_correct_items += 1 58 | total_correct_duration += duration 59 | 60 | except Exception as e: 61 | print("error audio path: " + audio_path) 62 | total_error_items += 1 63 | 64 | print(f"total_items = {total_items}") 65 | print(f"total_correct_items = {total_correct_items}") 66 | print(f"total_error_items = {total_error_items}") 67 | print(f"total_correct_duration = {round(total_correct_duration / 3600, 2)}h") -------------------------------------------------------------------------------- /whisper_finetune_using/tools/pcm2wav_16000.py: -------------------------------------------------------------------------------- 1 | import wave 2 | import numpy as np 3 | 4 | # 定义PCM文件和WAV文件的路径 5 | pcm_file = '/nfs/volume-225-14/cuichenrui/dataset/8k.lst' 6 | 7 | pcm_list = [] 8 | with open(pcm_file) as f: 9 | for line in f: 10 | pcm_list.append(line.strip()) 11 | 12 | # 设置音频参数 13 | num_channels = 1 # 单声道 14 | sample_width = 2 # 16位(2字节)采样深度 15 | frame_rate = 16000 # 采样率,例如16000 Hz 16 | 17 | for pcmfile in pcm_list: 18 | # 读取PCM文件 19 | with open(pcmfile, 'rb') as pcmf: 20 | pcm_data = pcmf.read() 21 | 22 | # 将PCM数据转换为NumPy数组 23 | pcm_array = np.frombuffer(pcm_data, dtype=np.int16) 24 | 25 | # 创建WAV文件并设置音频参数 26 | with wave.open(pcmfile.strip() + '.wav', 'wb') as wavfile: 27 | wavfile.setnchannels(num_channels) 28 | wavfile.setsampwidth(sample_width) 29 | wavfile.setframerate(frame_rate) 30 | 31 | # 将NumPy数组转换为二进制数据并写入WAV文件 32 | wavfile.writeframes(pcm_array.tobytes()) 33 | 34 | print(f'Converted {pcmfile} to {pcmfile.strip() + ".wav"}') 35 | 36 | -------------------------------------------------------------------------------- /whisper_finetune_using/tools/pcm2wav_8000.py: -------------------------------------------------------------------------------- 1 | import wave 2 | import numpy as np 3 | 4 | # 定义PCM文件和WAV文件的路径 5 | pcm_file = '/nfs/volume-225-14/cuichenrui/dataset/8k.lst' 6 | 7 | pcm_list = [] 8 | with open(pcm_file) as f: 9 | for line in f: 10 | pcm_list.append(line.strip()) 11 | 12 | # 设置音频参数 13 | num_channels = 1 # 单声道 14 | sample_width = 2 # 16位(2字节)采样深度 15 | frame_rate = 8000 # 采样率,例如 8000 Hz 16 | 17 | for pcmfile in pcm_list: 18 | # 读取PCM文件 19 | with open(pcmfile, 'rb') as pcmf: 20 | pcm_data = pcmf.read() 21 | 22 | # 将PCM数据转换为NumPy数组 23 | pcm_array = np.frombuffer(pcm_data, dtype=np.int16) 24 | 25 | # 创建WAV文件并设置音频参数 26 | with
wave.open(pcmfile.strip() + '.wav', 'wb') as wavfile: 27 | wavfile.setnchannels(num_channels) 28 | wavfile.setsampwidth(sample_width) 29 | wavfile.setframerate(frame_rate) 30 | 31 | # 将NumPy数组转换为二进制数据并写入WAV文件 32 | wavfile.writeframes(pcm_array.tobytes()) 33 | 34 | print(f'Converted {pcmfile} to {pcmfile.strip() + ".wav"}') 35 | 36 | -------------------------------------------------------------------------------- /whisper_finetune_using/train_bash/train_finetune_whisper_cuichenrui_01.sh: -------------------------------------------------------------------------------- 1 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speech-datasets /ofs/speech-datasets dc680445ad5745c4871a9aeeebd988a4 nmgpu 2 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh corebackup /ofs/corebackup 05b7b93388ef48cf932b72f4017c6e31 nmgpu 3 | sudo bash /mnt/common/jianshu/liquidio/release/current/script/liquid_mount_s3.sh k80-dataset AKDD00000000000SGIPX2FHPLPMALX ASDDCqkYLLUApBQrKInMsKjUECKbIZulHzdLTtlQ / /nfs/s3_k80_dataset 4 | sudo bash /mnt/com/nfs/volume-225-14/cuichenrui/scriptmon/jianshu/ofs/release/current/script/ofs_mount.sh corebackup /ofs/corebackup 05b7b93388ef48cf932b72f4017c6e31 nmgpu 5 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speechssd /ofs/speechssd b46e06b5108e4fdd911a610d0faa5380 hbbpussd 6 | 7 | # source /nfs/volume-225-14/laizhihao_i/env/anaconda3/bin/activate 8 | source /nfs/volume-225-14/cuichenrui/anaconda3/bin/activate 9 | conda activate /nfs/volume-225-14/cuichenrui/anaconda3/envs/whisperA6000 10 | 11 | cd /nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune 12 | bash run_finetune_2wh_lr_cuichenrui_01.sh > /nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune/medium_2w_data_fp32_1e-5lr/rank0/finetune.log 2>&1 13 | -------------------------------------------------------------------------------- /whisper_finetune_using/train_bash/train_finetune_whisper_cuichenrui_02.sh: -------------------------------------------------------------------------------- 1 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speech-datasets /ofs/speech-datasets dc680445ad5745c4871a9aeeebd988a4 nmgpu 2 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh corebackup /ofs/corebackup 05b7b93388ef48cf932b72f4017c6e31 nmgpu 3 | sudo bash /mnt/common/jianshu/liquidio/release/current/script/liquid_mount_s3.sh k80-dataset AKDD00000000000SGIPX2FHPLPMALX ASDDCqkYLLUApBQrKInMsKjUECKbIZulHzdLTtlQ / /nfs/s3_k80_dataset 4 | sudo bash /mnt/com/nfs/volume-225-14/cuichenrui/scriptmon/jianshu/ofs/release/current/script/ofs_mount.sh corebackup /ofs/corebackup 05b7b93388ef48cf932b72f4017c6e31 nmgpu 5 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speechssd /ofs/speechssd b46e06b5108e4fdd911a610d0faa5380 hbbpussd 6 | 7 | # source /nfs/volume-225-14/laizhihao_i/env/anaconda3/bin/activate 8 | source /nfs/volume-225-14/cuichenrui/anaconda3/bin/activate 9 | conda activate /nfs/volume-225-14/cuichenrui/anaconda3/envs/whisperA6000 10 | 11 | cd /nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune 12 | bash run_finetune_2wh_lr_cuichenrui_02.sh > /nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune/medium_2w_data_fp32_1e-5lr/rank1/finetune.log 2>&1 13 | -------------------------------------------------------------------------------- /whisper_finetune_using/train_bash/train_finetune_whisper_cuichenrui_03.sh: 
-------------------------------------------------------------------------------- 1 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speech-datasets /ofs/speech-datasets dc680445ad5745c4871a9aeeebd988a4 nmgpu 2 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh corebackup /ofs/corebackup 05b7b93388ef48cf932b72f4017c6e31 nmgpu 3 | sudo bash /mnt/common/jianshu/liquidio/release/current/script/liquid_mount_s3.sh k80-dataset AKDD00000000000SGIPX2FHPLPMALX ASDDCqkYLLUApBQrKInMsKjUECKbIZulHzdLTtlQ / /nfs/s3_k80_dataset 4 | sudo bash /mnt/com/nfs/volume-225-14/cuichenrui/scriptmon/jianshu/ofs/release/current/script/ofs_mount.sh corebackup /ofs/corebackup 05b7b93388ef48cf932b72f4017c6e31 nmgpu 5 | sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speechssd /ofs/speechssd b46e06b5108e4fdd911a610d0faa5380 hbbpussd 6 | 7 | # source /nfs/volume-225-14/laizhihao_i/env/anaconda3/bin/activate 8 | source /nfs/volume-225-14/cuichenrui/anaconda3/bin/activate 9 | conda activate /nfs/volume-225-14/cuichenrui/anaconda3/envs/whisperA6000 10 | 11 | cd /nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune 12 | bash run_finetune_2wh_lr_cuichenrui_03.sh > /nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune/medium_2w_data_fp32_1e-5lr/rank2/finetune.log 2>&1 13 | -------------------------------------------------------------------------------- /whisper_finetune_using/train_bash/train_finetune_whisper_multimachine.sh: -------------------------------------------------------------------------------- 1 | # sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh corebackup /ofs/corebackup 05b7b93388ef48cf932b72f4017c6e31 nmgpu 2 | # sudo bash /mnt/common/jianshu/ofs/release/current/script/ofs_mount.sh speechssd /ofs/speechssd b46e06b5108e4fdd911a610d0faa5380 hbbpussd 3 | 4 | source /nfs/volume-225-14/laizhihao_i/env/anaconda3/bin/activate 5 | conda activate whisper 6 | 7 | cd /nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune 8 | # bash run_finetune_multimachine.sh > train_finetune_whisper_medium_data10w.log 2>&1 9 | # bash run_finetune_multimachine.sh > /nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune/medium_2w_data_fp32/finetune.log 2>&1 10 | # bash run_finetune_multimachine.sh > /nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune_2.0/medium_8w_data_fp32_no_8k_2.0lr/finetune.log 2>&1 11 | # bash run_finetune_multimachine.sh > /nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune_2.0/medium_8w_data_fp32_no_8k_2.0lr/finetune02.log 2>&1 12 | # bash run_finetune_multimachine.sh > /nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune_2.0/medium_8w_without8k_fp16_3.0lr/finetune.log 2>&1 13 | # bash run_finetune_multimachine.sh > /nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune_2.0/medium_8w_without8k_fp32_5.0lr/finetune02.log 2>&1 14 | # bash run_finetune_multimachine.sh 15 | # bash run_finetune_multimachine_1_8.sh 16 | bash run_finetune_multimachine_1_4.sh -------------------------------------------------------------------------------- /whisper_finetune_using/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cuichenrui2000/barry_speech_tools/4780bd2c0a578821a3bb782240b58e7c0ca44ac7/whisper_finetune_using/utils/__init__.py -------------------------------------------------------------------------------- /whisper_finetune_using/utils/binary.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import mmap 3 | 4 | import struct 5 | 6 | from tqdm import tqdm 7 | 8 | 9 | class DatasetWriter(object): 10 | def __init__(self, prefix): 11 | # 创建对应的数据文件 12 | self.data_file = open(prefix + '.data', 'wb') 13 | self.header_file = open(prefix + '.header', 'wb') 14 | self.data_sum = 0 15 | self.offset = 0 16 | self.header = '' 17 | 18 | def add_data(self, data): 19 | key = str(self.data_sum) 20 | data = bytes(data, encoding="utf8") 21 | # 写入图像数据 22 | self.data_file.write(struct.pack('I', len(key))) 23 | self.data_file.write(key.encode('ascii')) 24 | self.data_file.write(struct.pack('I', len(data))) 25 | self.data_file.write(data) 26 | # 写入索引 27 | self.offset += 4 + len(key) + 4 28 | self.header = key + '\t' + str(self.offset) + '\t' + str(len(data)) + '\n' 29 | self.header_file.write(self.header.encode('ascii')) 30 | self.offset += len(data) 31 | self.data_sum += 1 32 | 33 | def close(self): 34 | self.data_file.close() 35 | self.header_file.close() 36 | 37 | 38 | class DatasetReader(object): 39 | def __init__(self, data_header_path, min_duration=0, max_duration=30): 40 | self.keys = [] 41 | self.offset_dict = {} 42 | self.fp = open(data_header_path.replace('.header', '.data'), 'rb') 43 | self.m = mmap.mmap(self.fp.fileno(), 0, access=mmap.ACCESS_READ) 44 | for line in tqdm(open(data_header_path, 'rb'), desc='读取数据列表'): 45 | key, val_pos, val_len = line.split('\t'.encode('ascii')) 46 | data = self.m[int(val_pos):int(val_pos) + int(val_len)] 47 | data = str(data, encoding="utf-8") 48 | data = json.loads(data) 49 | # 跳过超出长度限制的音频 50 | if data["duration"] < min_duration: 51 | continue 52 | if max_duration != -1 and data["duration"] > max_duration: 53 | continue 54 | self.keys.append(key) 55 | self.offset_dict[key] = (int(val_pos), int(val_len)) 56 | 57 | # 获取一行列表数据 58 | def get_data(self, key): 59 | p = self.offset_dict.get(key, None) 60 | if p is None: 61 | return None 62 | val_pos, val_len = p 63 | data = self.m[val_pos:val_pos + val_len] 64 | data = str(data, encoding="utf-8") 65 | return json.loads(data) 66 | 67 | # 获取keys 68 | def get_keys(self): 69 | return self.keys 70 | 71 | def __len__(self): 72 | return len(self.keys) 73 | -------------------------------------------------------------------------------- /whisper_finetune_using/utils/callback.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os 3 | import shutil 4 | 5 | from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl 6 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 7 | 8 | 9 | # 保存模型时的回调函数 10 | class SavePeftModelCallback(TrainerCallback): 11 | def on_save(self, 12 | args: TrainingArguments, 13 | state: TrainerState, 14 | control: TrainerControl, 15 | **kwargs, ): 16 | if args.local_rank == 0 or args.local_rank == -1: 17 | # # 保存效果最好的模型 18 | # best_checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-best") 19 | # # 因为只保存最新5个检查点,所以要确保不是之前的检查点 20 | # if os.path.exists(state.best_model_checkpoint): 21 | # if os.path.exists(best_checkpoint_folder): 22 | # shutil.rmtree(best_checkpoint_folder) 23 | # shutil.copytree(state.best_model_checkpoint, best_checkpoint_folder) 24 | print(f"效果最好的检查点为:{state.best_model_checkpoint},评估结果为:{state.best_metric}") 25 | return control 26 | -------------------------------------------------------------------------------- /whisper_finetune_using/utils/data_utils.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | from dataclasses import dataclass 3 | from typing import Any, List, Dict, Union 4 | 5 | import torch 6 | from zhconv import convert 7 | 8 | 9 | # 删除标点符号 10 | def remove_punctuation(text: str or List[str]): 11 | punctuation = '!,.;:?、!,。;:?' 12 | if isinstance(text, str): 13 | text = re.sub(r'[{}]+'.format(punctuation), '', text).strip() 14 | return text 15 | elif isinstance(text, list): 16 | result_text = [] 17 | for t in text: 18 | t = re.sub(r'[{}]+'.format(punctuation), '', t).strip() 19 | result_text.append(t) 20 | return result_text 21 | else: 22 | raise Exception(f'不支持该类型{type(text)}') 23 | 24 | 25 | # 将繁体中文总成简体中文 26 | def to_simple(text: str or List[str]): 27 | if isinstance(text, str): 28 | text = convert(text, 'zh-cn') 29 | return text 30 | elif isinstance(text, list): 31 | result_text = [] 32 | for t in text: 33 | t = convert(t, 'zh-cn') 34 | result_text.append(t) 35 | return result_text 36 | else: 37 | raise Exception(f'不支持该类型{type(text)}') 38 | 39 | 40 | @dataclass 41 | class DataCollatorSpeechSeq2SeqWithPadding: 42 | processor: Any 43 | 44 | def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: 45 | # split inputs and labels since they have to be of different lengths and need different padding methods 46 | # first treat the audio inputs by simply returning torch tensors 47 | input_features = [{"input_features": feature["input_features"][0]} for feature in features] 48 | batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt") 49 | 50 | # get the tokenized label sequences 51 | label_features = [{"input_ids": feature["labels"]} for feature in features] 52 | # pad the labels to max length 53 | labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt") 54 | 55 | # replace padding with -100 to ignore loss correctly 56 | labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) 57 | 58 | # if bos token is appended in previous tokenization step, 59 | # cut bos token here as it's append later anyways 60 | if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item(): 61 | labels = labels[:, 1:] 62 | 63 | batch["labels"] = labels 64 | 65 | return batch 66 | -------------------------------------------------------------------------------- /whisper_finetune_using/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | import bitsandbytes as bnb 2 | import torch 3 | from transformers.trainer_pt_utils import LabelSmoother 4 | 5 | IGNORE_TOKEN_ID = LabelSmoother.ignore_index 6 | 7 | 8 | def find_all_linear_names(use_8bit, model): 9 | cls = bnb.nn.Linear8bitLt if use_8bit else torch.nn.Linear 10 | lora_module_names = set() 11 | for name, module in model.named_modules(): 12 | if isinstance(module, cls): 13 | names = name.split('.') 14 | lora_module_names.add(names[0] if len(names) == 1 else names[-1]) 15 | target_modules = list(lora_module_names) 16 | return target_modules 17 | 18 | 19 | def load_from_checkpoint(resume_from_checkpoint, model=None): 20 | pass 21 | -------------------------------------------------------------------------------- /whisper_finetune_using/utils/tarfile_reader.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import soundfile 4 | import torch 5 | 6 | import random 7 | import tarfile 8 | import 
torchaudio 9 | import logging 10 | 11 | class TarFileDataset(): 12 | def __init__(self, data_list_path): 13 | """ 14 | Args: 15 | data_list_path: 数据列表文件的路径 16 | tar_filename_list: tar 文件名的临时存放列表,会在内部进行 shuffle 17 | audio_sample_list: audio 的临时存放列表,会在内部进行 shuffle 18 | sample_rate: 目标采样率,whisper 只适配 16k 音频,其他采样率会被强制转换至 16k 采样率再训练 19 | """ 20 | self.data_list_path = data_list_path 21 | self.tar_filename_list = [] 22 | self.audio_sample_list = [] 23 | self.sample_rate = 16000 24 | 25 | # 返回一条音频组 26 | def get_one_sample(self): 27 | # 如果 tar_filename_list 空了,则补充,进入下一个 epoch 28 | if not self.tar_filename_list and not self.audio_sample_list: 29 | # 获取数据列表 30 | with open(self.data_list_path, 'r', encoding='utf-8') as f: 31 | lines = f.readlines() 32 | for line in lines: 33 | # 读取文件中所有 tar 文件 34 | if line.strip().endswith('.tar'): 35 | self.tar_filename_list.append(line.strip()) 36 | # 如果 audio_sample_list 空了,则补充 37 | if not self.audio_sample_list: 38 | # 随机 pop 出一个 tar 文件名 39 | index_to_pop = random.choice(range(len(self.tar_filename_list))) 40 | tar_filename = self.tar_filename_list.pop(index_to_pop) 41 | stream = open(tar_filename, 'rb') 42 | # stream = tarfile.open(fileobj=stream, mode="r|*") 43 | stream = tarfile.open(fileobj=stream, mode="r:*") 44 | prev_prefix = None 45 | example = {} 46 | # valid 变量表示该数据是否正常,不正常就不返回 47 | valid = True 48 | for tarinfo in stream: 49 | name = tarinfo.name 50 | pos = name.rfind('.') 51 | assert pos > 0 52 | prefix, postfix = name[:pos], name[pos + 1:] 53 | if prev_prefix is not None and prefix != prev_prefix: 54 | example['language'] = "chinese" 55 | if valid: 56 | self.audio_sample_list.append([example['sample'], example['sample_rate'], example['transcript'], example['language']]) 57 | example = {} 58 | valid = True 59 | with stream.extractfile(tarinfo) as file_obj: 60 | try: 61 | if postfix == 'txt': 62 | example['transcript'] = file_obj.read().decode('utf8').strip() 63 | elif postfix in ['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']: 64 | 65 | # sample, sample_rate = soundfile.read(file_obj, dtype='float32') 66 | sample, sample_rate = torchaudio.load(file_obj) 67 | sample = sample.squeeze(0) 68 | sample = sample.numpy() 69 | 70 | # 判断音频时长是否为 0.5s ~ 30s 71 | duration = len(sample) / sample_rate 72 | if duration <= 0.5 or duration >= 30: 73 | valid = False 74 | sample = sample.T 75 | if self.sample_rate != sample_rate: 76 | sample = librosa.resample(sample, orig_sr=sample_rate, target_sr=self.sample_rate) 77 | example['sample'], example['sample_rate'] = sample, self.sample_rate  # 重采样后统一记录目标采样率,避免后续时长计算使用原始采样率 78 | else: 79 | other_files = file_obj.read() 80 | except Exception as ex: 81 | logging.warning(ex, exc_info=True) 82 | valid = False 83 | logging.warning('error to parse {}'.format(name)) 84 | prev_prefix = prefix 85 | if prev_prefix is not None: 86 | example['language'] = "chinese" 87 | duration = len(example['sample']) / example['sample_rate'] 88 | if duration > 0.5 and duration < 30: 89 | self.audio_sample_list.append([example['sample'], example['sample_rate'], example['transcript'], example['language']]) 90 | stream.close() 91 | # if 'process' in sample: 92 | # sample['process'].communicate() 93 | # 随机 pop 出一个 audio_sample 文件名 94 | index_to_pop = random.choice(range(len(self.audio_sample_list))) 95 | audio_sample = self.audio_sample_list.pop(index_to_pop) 96 | # return sample, sample_rate, transcript, language 97 | return audio_sample[0], audio_sample[1], audio_sample[2], audio_sample[3] 98 | --------------------------------------------------------------------------------
/whisper_finetune_using/utils/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import tarfile 4 | import urllib.request 5 | 6 | from tqdm import tqdm 7 | 8 | 9 | def print_arguments(args): 10 | print("----------- Configuration Arguments -----------") 11 | for arg, value in vars(args).items(): 12 | print("%s: %s" % (arg, value)) 13 | print("------------------------------------------------") 14 | 15 | 16 | def strtobool(val): 17 | val = val.lower() 18 | if val in ('y', 'yes', 't', 'true', 'on', '1'): 19 | return True 20 | elif val in ('n', 'no', 'f', 'false', 'off', '0'): 21 | return False 22 | else: 23 | raise ValueError("invalid truth value %r" % (val,)) 24 | 25 | 26 | def str_none(val): 27 | if val == 'None': 28 | return None 29 | else: 30 | return val 31 | 32 | 33 | def add_arguments(argname, type, default, help, argparser, **kwargs): 34 | type = strtobool if type == bool else type 35 | type = str_none if type == str else type 36 | argparser.add_argument("--" + argname, 37 | default=default, 38 | type=type, 39 | help=help + ' Default: %(default)s.', 40 | **kwargs) 41 | 42 | 43 | def md5file(fname): 44 | hash_md5 = hashlib.md5() 45 | f = open(fname, "rb") 46 | for chunk in iter(lambda: f.read(4096), b""): 47 | hash_md5.update(chunk) 48 | f.close() 49 | return hash_md5.hexdigest() 50 | 51 | 52 | def download(url, md5sum, target_dir): 53 | """Download file from url to target_dir, and check md5sum.""" 54 | if not os.path.exists(target_dir): os.makedirs(target_dir) 55 | filepath = os.path.join(target_dir, url.split("/")[-1]) 56 | if not (os.path.exists(filepath) and md5file(filepath) == md5sum): 57 | print(f"Downloading {url} to {filepath} ...") 58 | with urllib.request.urlopen(url) as source, open(filepath, "wb") as output: 59 | with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, 60 | unit_divisor=1024) as loop: 61 | while True: 62 | buffer = source.read(8192) 63 | if not buffer: 64 | break 65 | 66 | output.write(buffer) 67 | loop.update(len(buffer)) 68 | print(f"\nMD5 Chesksum {filepath} ...") 69 | if not md5file(filepath) == md5sum: 70 | raise RuntimeError("MD5 checksum failed.") 71 | else: 72 | print(f"File exists, skip downloading. ({filepath})") 73 | return filepath 74 | 75 | 76 | def unpack(filepath, target_dir, rm_tar=False): 77 | """Unpack the file to the target_dir.""" 78 | print("Unpacking %s ..." 
% filepath) 79 | tar = tarfile.open(filepath) 80 | tar.extractall(target_dir) 81 | tar.close() 82 | if rm_tar: 83 | os.remove(filepath) 84 | 85 | 86 | def make_inputs_require_grad(module, input, output): 87 | output.requires_grad_(True) 88 | -------------------------------------------------------------------------------- /whisper_finetune_using/vscode/debug_chinese_launch.json: -------------------------------------------------------------------------------- 1 | // 在低版本python中:"type": "python", 2 | 3 | { 4 | "version": "0.2.0", 5 | "configurations": [ 6 | { 7 | "name": "Whisper Evaluation", 8 | "type": "debugpy", 9 | "request": "launch", 10 | "program": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune/evaluation_nolora.py", 11 | "args": [ 12 | "--hyp_path=/nfs/volume-225-14/cuichenrui/whisper/experiment_decode/decode_logs/debug_hyp.txt", 13 | "--lab_path=/nfs/volume-225-14/cuichenrui/whisper/experiment_decode/decode_logs/debug_lab.txt", 14 | "--test_data=/nfs/volume-225-14/cuichenrui/dataset/debug.jsonl", 15 | "--model_path=/nfs/volume-225-14/cuichenrui/whisper/whisper_models/whisper_medium/checkpoint-85000", 16 | "--batch_size=1", 17 | "--num_workers=1", 18 | "--language=Chinese", 19 | "--remove_pun=True", 20 | "--to_simple=True", 21 | "--timestamps=False", 22 | "--min_audio_len=0.5", 23 | "--max_audio_len=30", 24 | "--local_files_only=True", 25 | "--task=transcribe", 26 | "--metric=cer" 27 | ], 28 | "cwd": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune", 29 | "python": "/nfs/volume-225-14/laizhihao_i/env/anaconda3/envs/whisper/bin/python", 30 | "console": "integratedTerminal", 31 | "justMyCode": false, 32 | } 33 | ] 34 | } -------------------------------------------------------------------------------- /whisper_finetune_using/vscode/debug_faster_whisper_launch.json: -------------------------------------------------------------------------------- 1 | // # 在低版本python中:"type": "python", 2 | 3 | { 4 | "version": "0.2.0", 5 | "configurations": [ 6 | { 7 | "name": "Whisper Evaluation", 8 | "type": "debugpy", 9 | "request": "launch", 10 | "program": "/nfs/volume-225-14/laizhihao_i/Whisper/faster-whisper/evaluate_cuichenrui.py", 11 | "args": [ 12 | "--hyp_path=/nfs/volume-225-14/cuichenrui/whisper/experiment_decode/decode_logs/debug_hyp.txt", 13 | "--lab_path=/nfs/volume-225-14/cuichenrui/whisper/experiment_decode/decode_logs/debug_lab.txt", 14 | "--test_data=/nfs/volume-225-14/cuichenrui/dataset/debug.jsonl", 15 | "--model_path=/nfs/volume-225-14/cuichenrui/whisper/faster_whisper_models/whisper_medium/checkpoint-85000", 16 | ], 17 | "cwd": "/nfs/volume-225-14/laizhihao_i/Whisper/faster-whisper", 18 | "python": "/nfs/volume-225-14/laizhihao_i/env/anaconda3/envs/blsp/bin/python", 19 | "console": "integratedTerminal", 20 | "justMyCode": false, 21 | } 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /whisper_finetune_using/vscode/debug_new_faster_launch.json: -------------------------------------------------------------------------------- 1 | // # 在低版本python中:"type": "python", 2 | 3 | { 4 | "version": "0.2.0", 5 | "configurations": [ 6 | { 7 | "name": "Whisper Evaluation", 8 | "type": "debugpy", 9 | "request": "launch", 10 | "program": "/nfs/volume-225-14/laizhihao_i/Whisper/faster-whisper/evaluate_cuichenrui.py", 11 | "args": [ 12 | "--hyp_path=/nfs/volume-225-14/cuichenrui/whisper/experiment_decode/decode_logs/debug_hyp.txt", 13 | "--lab_path=/nfs/volume-225-14/cuichenrui/whisper/experiment_decode/decode_logs/debug_lab.txt", 14 | 
"--test_data=/nfs/volume-225-14/cuichenrui/dataset/debug_8k.jsonl", 15 | "--model_path=/nfs/volume-225-14/cuichenrui/whisper/faster_whisper_models/whisper_medium/checkpoint-85000", 16 | ], 17 | "env": { 18 | "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7,8", 19 | "LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/nfs/volume-225-14/laizhihao_i/env/anaconda3/envs/blsp/lib/python3.8/site-packages/nvidia/cudnn/lib", 20 | }, 21 | "cwd": "/nfs/volume-225-14/laizhihao_i/Whisper/faster-whisper", 22 | "python": "/nfs/volume-225-14/laizhihao_i/env/anaconda3/envs/blsp/bin/python", 23 | "console": "integratedTerminal", 24 | "justMyCode": false, 25 | } 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /whisper_finetune_using/vscode/debug_new_faster_whisper_launch.json: -------------------------------------------------------------------------------- 1 | // 在低版本python中:"type": "python", 2 | // 可以改这个设置来让代码从当前文件debug:"program": "${file}", 3 | 4 | { 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Whisper Evaluation", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "justMyCode": false, 12 | "purpose": ["debug-in-terminal"], 13 | "console": "integratedTerminal", 14 | "cwd": "/nfs/volume-225-14/cuichenrui/whisper/script", 15 | "python": "/nfs/volume-225-14/cuichenrui/anaconda3/envs/fasterwhisper/bin/python", 16 | "program": "/nfs/volume-225-14/cuichenrui/whisper/script/evaluate_cuichenrui_debug.py", 17 | "args": [ 18 | "--hyp_path=/nfs/volume-225-14/cuichenrui/whisper/experiment_decode/decode_logs/debug_hyp.txt", 19 | "--lab_path=/nfs/volume-225-14/cuichenrui/whisper/experiment_decode/decode_logs/debug_lab.txt", 20 | "--test_data=/nfs/volume-225-14/cuichenrui/dataset/debug_longwav.jsonl", 21 | "--model_path=/nfs/volume-225-14/cuichenrui/whisper/faster_whisper_models/whisper_medium/checkpoint-85000", 22 | ], 23 | "env": { 24 | "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7,8", 25 | "LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/nfs/volume-225-14/laizhihao_i/env/anaconda3/envs/blsp/lib/python3.8/site-packages/nvidia/cudnn/lib", 26 | }, 27 | } 28 | ] 29 | } 30 | 31 | -------------------------------------------------------------------------------- /whisper_finetune_using/vscode/debug_none_launch.json: -------------------------------------------------------------------------------- 1 | // 在低版本python中:"type": "python", 2 | 3 | { 4 | "version": "0.2.0", 5 | "configurations": [ 6 | { 7 | "name": "Whisper Evaluation", 8 | "type": "debugpy", 9 | "request": "launch", 10 | "program": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune/evaluation_nolora.py", 11 | "args": [ 12 | "--hyp_path=/nfs/volume-225-14/cuichenrui/whisper/experiment_decode/decode_logs/debug_hyp.txt", 13 | "--lab_path=/nfs/volume-225-14/cuichenrui/whisper/experiment_decode/decode_logs/debug_lab.txt", 14 | "--test_data=/nfs/volume-225-14/cuichenrui/dataset/debug.jsonl", 15 | "--model_path=/nfs/volume-225-14/cuichenrui/whisper/whisper_models/whisper_medium/checkpoint-85000", 16 | "--batch_size=1", 17 | "--num_workers=1", 18 | "--language=None", 19 | "--remove_pun=True", 20 | "--to_simple=True", 21 | "--timestamps=False", 22 | "--min_audio_len=0.5", 23 | "--max_audio_len=30", 24 | "--local_files_only=True", 25 | "--task=transcribe", 26 | "--metric=cer" 27 | ], 28 | "cwd": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune", 29 | "python": "/nfs/volume-225-14/laizhihao_i/env/anaconda3/envs/whisper/bin/python", 30 | "console": "integratedTerminal", 31 | "justMyCode": false, 32 | } 33 | ] 34 | } 
-------------------------------------------------------------------------------- /whisper_finetune_using/vscode/debug_train_2_launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Whisper Evaluation", 6 | "type": "debugpy", 7 | "request": "launch", 8 | "program": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune/finetune_multimachine.py", 9 | "args": [ 10 | "--base_model=/nfs/volume-225-14/laizhihao_i/Whisper/pretrain_model/model_medium", 11 | "--per_device_train_batch_size=1", 12 | "--per_device_eval_batch_size=1", 13 | "--gradient_accumulation_steps=2", 14 | "--num_workers=1", 15 | "--output_dir=/nfs/volume-225-14/cuichenrui/whisper/whisper_train_debug", 16 | "--train_data=/nfs/volume-225-14/cuichenrui/dataset/debug_train.jsonl", 17 | "--test_data=/nfs/volume-225-14/cuichenrui/dataset/debug_train.jsonl", 18 | ], 19 | "env": { 20 | "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7,8", 21 | "LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/nfs/volume-225-14/laizhihao_i/env/anaconda3/envs/blsp/lib/python3.8/site-packages/nvidia/cudnn/lib", 22 | }, 23 | "cwd": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune", 24 | "python": "/nfs/volume-225-14/cuichenrui/anaconda3/envs/whisperA6000/bin/python", 25 | "console": "integratedTerminal", 26 | "justMyCode": false, 27 | } 28 | ] 29 | } 30 | 31 | // { 32 | // "version": "0.2.0", 33 | // "configurations": [ 34 | // { 35 | // "name": "Whisper Evaluation", 36 | // "type": "debugpy", 37 | // "request": "launch", 38 | // "program": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune/finetune_multimachine.py", 39 | // "args": [ 40 | // "--base_model=/nfs/volume-225-14/laizhihao_i/Whisper/pretrain_model/model_medium", 41 | // "--per_device_train_batch_size=1", 42 | // "--per_device_eval_batch_size=1", 43 | // "--gradient_accumulation_steps=2", 44 | // "--num_workers=1", 45 | // "--output_dir=/nfs/volume-225-14/cuichenrui/whisper/whisper_train_debug", 46 | // "--train_data=/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune_2.0/train_all_new_checked_without_8k.jsonl", 47 | // "--test_data=/nfs/volume-225-14/cuichenrui/dataset/debug_train.jsonl", 48 | // ], 49 | // "env": { 50 | // "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7,8", 51 | // "LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/nfs/volume-225-14/laizhihao_i/env/anaconda3/envs/blsp/lib/python3.8/site-packages/nvidia/cudnn/lib", 52 | // }, 53 | // "cwd": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune", 54 | // "python": "/nfs/volume-225-14/cuichenrui/anaconda3/envs/whisperA6000/bin/python", 55 | // "console": "integratedTerminal", 56 | // "justMyCode": false, 57 | // } 58 | // ] 59 | // } -------------------------------------------------------------------------------- /whisper_finetune_using/vscode/debug_train_launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Whisper Evaluation", 6 | "type": "debugpy", 7 | "request": "launch", 8 | "program": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune/finetune_multimachine.py", 9 | "args": [ 10 | "--base_model=/nfs/volume-225-14/laizhihao_i/Whisper/pretrain_model/model_medium", 11 | "--per_device_train_batch_size=1", 12 | "--per_device_eval_batch_size=1", 13 | "--gradient_accumulation_steps=2", 14 | "--num_workers=1", 15 | "--output_dir=/nfs/volume-225-14/cuichenrui/whisper/whisper_train_debug", 16 | 
"--train_data=/nfs/volume-225-14/cuichenrui/dataset/debug.jsonl", 17 | "--test_data=/nfs/volume-225-14/cuichenrui/dataset/debug.jsonl", 18 | ], 19 | "cwd": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune", 20 | "python": "/nfs/volume-225-14/cuichenrui/anaconda3/envs/whisperA6000/bin/python", 21 | "console": "integratedTerminal", 22 | "justMyCode": false, 23 | } 24 | ] 25 | } 26 | 27 | -------------------------------------------------------------------------------- /whisper_finetune_using/vscode/workspace_debug_train_111_launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Whisper Evaluation", 6 | "type": "debugpy", 7 | "request": "launch", 8 | "program": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune/finetune_multimachine.py", 9 | "args": [ 10 | "--base_model=/nfs/volume-225-14/laizhihao_i/Whisper/pretrain_model/model_medium", 11 | "--per_device_train_batch_size=3", 12 | "--per_device_eval_batch_size=4", 13 | "--gradient_accumulation_steps=1", 14 | "--num_workers=4", 15 | "--output_dir=/nfs/volume-225-14/cuichenrui/whisper/whisper_train_debug", 16 | "--train_data=/nfs/volume-225-14/cuichenrui/dataset/debug_8k.jsonl", 17 | "--test_data=/nfs/volume-225-14/cuichenrui/dataset/debug_train.jsonl", 18 | ], 19 | "env": { 20 | "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7,8", 21 | "LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/nfs/volume-225-14/laizhihao_i/env/anaconda3/envs/blsp/lib/python3.8/site-packages/nvidia/cudnn/lib", 22 | }, 23 | "cwd": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune", 24 | "python": "/nfs/volume-225-14/cuichenrui/anaconda3/envs/whisperA6000/bin/python", 25 | "console": "integratedTerminal", 26 | "justMyCode": false, 27 | } 28 | ] 29 | } 30 | 31 | // { 32 | // "version": "0.2.0", 33 | // "configurations": [ 34 | // { 35 | // "name": "Whisper Evaluation", 36 | // "type": "debugpy", 37 | // "request": "launch", 38 | // "program": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune/finetune_multimachine.py", 39 | // "args": [ 40 | // "--base_model=/nfs/volume-225-14/laizhihao_i/Whisper/pretrain_model/model_medium", 41 | // "--per_device_train_batch_size=4", 42 | // "--per_device_eval_batch_size=4", 43 | // "--gradient_accumulation_steps=2", 44 | // "--num_workers=4", 45 | // "--output_dir=/nfs/volume-225-14/cuichenrui/whisper/whisper_train_debug", 46 | // "--train_data=/nfs/volume-225-14/cuichenrui/whisper/whisper_medium_full_finetune_2.0/train_all_new_checked_without_8k.jsonl", 47 | // "--test_data=/nfs/volume-225-14/cuichenrui/dataset/debug_train.jsonl", 48 | // ], 49 | // "env": { 50 | // "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7,8", 51 | // "LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/nfs/volume-225-14/laizhihao_i/env/anaconda3/envs/blsp/lib/python3.8/site-packages/nvidia/cudnn/lib", 52 | // }, 53 | // "cwd": "/nfs/volume-225-14/laizhihao_i/Whisper/Whisper-Finetune", 54 | // "python": "/nfs/volume-225-14/cuichenrui/anaconda3/envs/whisperA6000/bin/python", 55 | // "console": "integratedTerminal", 56 | // "justMyCode": false, 57 | // } 58 | // ] 59 | // } -------------------------------------------------------------------------------- /whisper_finetune_using/whisper_explore_using/test_whisper_tokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from transformers import WhisperProcessor 5 | 6 | texts = ["This is an English sentence test hello I'm 
very glad to meet you", 7 | "THIS IS AN ENGLISH SENTENCE TEST HELLO I'M VERY GLAD TO MEET YOU", 8 | "this is an english sentence test hello i'm very glad to meet you", 9 | "这是一条中文语句测试你好见到你很高兴", 10 | "这是 一条 中文 语 句 测试 你好 见 到 你 很 高兴", 11 | "这是一条中文语句测试,你好,见到你很高兴。"] 12 | 13 | model_path = "your_model_path" 14 | 15 | processor = WhisperProcessor.from_pretrained(model_path, 16 | language="English", 17 | task="transcribe", 18 | no_timestamps=True, 19 | local_files_only=True) 20 | 21 | forced_decoder_ids = processor.get_decoder_prompt_ids() 22 | 23 | for text in texts: 24 | 25 | token_ids = processor.tokenizer(text)["input_ids"] 26 | outputs = processor.tokenizer.batch_decode(token_ids, skip_special_tokens=True) 27 | 28 | print(f"原始句子:\n{text}") 29 | print(f"分词结果:\n{token_ids}") 30 | print(f"直观结果:\n{outputs}") 31 | print("\n") 32 | -------------------------------------------------------------------------------- /whisper_finetune_using/whisper_explore_using/test_whisper_tokenizer.txt: -------------------------------------------------------------------------------- 1 | 原始句子: 2 | This is an English sentence test hello I'm very glad to meet you 3 | 分词结果: 4 | [50258, 50259, 50359, 50363, 5723, 307, 364, 3669, 8174, 220, 31636, 7751, 286, 478, 588, 5404, 220, 1353, 1677, 291, 50257] 5 | 直观结果: 6 | ['<|startoftranscript|>', '<|en|>', '<|transcribe|>', '<|notimestamps|>', 'This', ' is', ' an', ' English', ' sentence', ' ', 'test', ' hello', ' I', "'m", ' very', ' glad', ' ', 'to', ' meet', ' you', '<|endoftext|>'] 7 | 8 | 9 | 原始句子: 10 | THIS IS AN ENGLISH SENTENCE TEST HELLO I'M VERY GLAD TO MEET YOU 11 | 分词结果: 12 | [50258, 50259, 50359, 50363, 9620, 2343, 6205, 5252, 15244, 41920, 318, 9536, 7163, 314, 14497, 389, 8763, 46, 286, 6, 44, 45655, 16225, 6112, 8232, 12003, 4850, 7928, 50257] 13 | 直观结果: 14 | ['<|startoftranscript|>', '<|en|>', '<|transcribe|>', '<|notimestamps|>', 'TH', 'IS', ' IS', ' AN', ' EN', 'GLISH', ' S', 'ENT', 'ENCE', ' T', 'EST', ' H', 'ELL', 'O', ' I', "'", 'M', ' VERY', ' GL', 'AD', ' TO', ' ME', 'ET', ' YOU', '<|endoftext|>'] 15 | 16 | 17 | 原始句子: 18 | this is an english sentence test hello i'm very glad to meet you 19 | 分词结果: 20 | [50258, 50259, 50359, 50363, 11176, 307, 364, 32169, 8174, 220, 31636, 7751, 741, 478, 588, 5404, 220, 1353, 1677, 291, 50257] 21 | 直观结果: 22 | ['<|startoftranscript|>', '<|en|>', '<|transcribe|>', '<|notimestamps|>', 'this', ' is', ' an', ' english', ' sentence', ' ', 'test', ' hello', ' i', "'m", ' very', ' glad', ' ', 'to', ' meet', ' you', '<|endoftext|>'] 23 | 24 | 25 | 原始句子: 26 | 这是一条中文语句测试你好见到你很高兴 27 | 分词结果: 28 | [50258, 50259, 50359, 50363, 27455, 2257, 48837, 5975, 17174, 5233, 255, 34592, 11038, 233, 5233, 243, 26410, 23813, 4511, 2166, 4563, 12979, 2347, 112, 50257] 29 | 直观结果: 30 | ['<|startoftranscript|>', '<|en|>', '<|transcribe|>', '<|notimestamps|>', '这是', '一', '条', '中', '文', '�', '�', '句', '�', '�', '�', '�', '你好', '见', '到', '你', '很', '高', '�', '�', '<|endoftext|>'] 31 | 32 | 33 | 原始句子: 34 | 这是 一条 中文 语 句 测试 你好 见 到 你 很 高兴 35 | 分词结果: 36 | [50258, 50259, 50359, 50363, 27455, 26923, 48837, 220, 5975, 17174, 220, 5233, 255, 220, 34592, 220, 11038, 233, 5233, 243, 10930, 2131, 220, 23813, 220, 4511, 10930, 26029, 220, 12979, 2347, 112, 50257] 37 | 直观结果: 38 | ['<|startoftranscript|>', '<|en|>', '<|transcribe|>', '<|notimestamps|>', '这是', ' 一', '条', ' ', '中', '文', ' ', '�', '�', ' ', '句', ' ', '�', '�', '�', '�', ' 你', '好', ' ', '见', ' ', '到', ' 你', ' 很', ' ', '高', '�', '�', '<|endoftext|>'] 39 | 40 | 41 | 原始句子: 42 | 这是一条中文语句测试,你好,见到你很高兴。 43 | 分词结果: 
44 | [50258, 50259, 50359, 50363, 27455, 2257, 48837, 5975, 17174, 5233, 255, 34592, 11038, 233, 5233, 243, 171, 120, 234, 26410, 171, 120, 234, 23813, 4511, 2166, 4563, 12979, 2347, 112, 1543, 50257] 45 | 直观结果: 46 | ['<|startoftranscript|>', '<|en|>', '<|transcribe|>', '<|notimestamps|>', '这是', '一', '条', '中', '文', '�', '�', '句', '�', '�', '�', '�', '�', '�', '�', '你好', '�', '�', '�', '见', '到', '你', '很', '高', '�', '�', '。', '<|endoftext|>'] 47 | -------------------------------------------------------------------------------- /工具踩坑记录汇总.md: -------------------------------------------------------------------------------- 1 | ### Linux 环境配置相关: 2 | 3 | * [无 root 权限安装 cuda](https://zhuanlan.zhihu.com/p/476313656) 4 | 5 | * [Nvcc, cuda driver, cudatoolkit, cudnn 的关系和区别](https://www.cnblogs.com/marsggbo/p/11838823) 6 | 7 | * [CondaHTTPError: HTTP 000 CONNECTION FAILED](https://zhuanlan.zhihu.com/p/260034241) 8 | 9 | * [Conda 更新后激活环境不显示用户名和工作目录](https://blog.csdn.net/m0_56484411/article/details/127515359) 10 | 11 | * [Linux 修改命令提示符前面的路径显示](https://blog.csdn.net/LSG_Down/article/details/112058574) 12 | 13 | * [重启 nvidia 服务](https://stackoverflow.com/questions/43022843/nvidia-nvml-driver-library-version-mismatch/45319156#45319156) 14 | 15 | * [yum install ffmpeg 报错:ffmpeg-devel No package ffmpeg available.](https://linuxcpp.0voice.com/?id=38808) 16 | 17 | ### Tmux 介绍相关: 18 | 19 | * [Tmux 使用教程](https://www.ruanyifeng.com/blog/2019/10/tmux.html) 20 | 21 | * [Tmux 常用快捷键](https://www.cnblogs.com/eirrac-rain/p/17803549.html) 22 | 23 | ### Vscode 配置相关: 24 | 25 | * [Vscode 免密登录服务器](https://www.jianshu.com/p/e3d63fa3ef63) 26 | 27 | * [Mac 更新 Ventura 后 ssh 无法正常使用](https://www.cnblogs.com/xhyccc/p/16836587.html) 28 | 29 | * [Windows ssh 命令 "key.pem" 权限问题](https://tool.4xseo.com/a/24567.html) 30 | 31 | * [Vscode 中使用远胜 python cell](https://www.jianshu.com/p/fa90e902c6ae) 32 | 33 | * [Vscode 中使用 jupyter notebook](https://zhuanlan.zhihu.com/p/140899377) 34 | 35 | * [Vscode 切换虚拟环境后 pip 仍绑定系统 python](https://blog.csdn.net/Sharpneo/article/details/130527402) 36 | 37 | * [Vscode debug py 文件出现:Error while enumerating installed packages](https://stackoverflow.com/questions/77840099/vscode-python-error-while-enumerating-installed-packages) 38 | 39 | * [Vscode 对比两个文件夹的代码异同](https://zhuanlan.zhihu.com/p/677637988) 40 | 41 | ### Zotero 配置相关: 42 | 43 | * [Zotero:科研小白的第一款文献管理软件](https://zhuanlan.zhihu.com/p/347493385) 44 | 45 | * [我的 Zotero 实践汇总](https://zhuanlan.zhihu.com/p/108366072) 46 | 47 | * [文献管理神器 —— Zotero 配置及实用插件扩展](https://blog.csdn.net/qq_40918859/article/details/124380201) 48 | 49 | * [文献管理软件 Zotero 常用插件安装及配置使用](https://blog.csdn.net/qq_43309940/article/details/117126357) 50 | 51 | ### 模型训练踩坑相关: 52 | 53 | * [Git 管理工具 SourceTree 使用方法](https://zhuanlan.zhihu.com/p/254909901) 54 | 55 | * [pip install -e . 
使用方法](https://blog.csdn.net/weixin_45440484/article/details/130009424) 56 | 57 | * [parser.add_argument() 使用方法](https://blog.csdn.net/weixin_47414034/article/details/124962600) 58 | 59 | * [AverageMeter() 使用方法](https://blog.csdn.net/rytyy/article/details/105944813) 60 | 61 | * [Logging 日志的使用方法](https://blog.csdn.net/sl01224318/article/details/125474332) 62 | 63 | * [Pytorch 分布式训练 DistributedDataParallel](https://blog.csdn.net/weixin_43229348/article/details/124112404) 64 | 65 | * [Pytorch 多机多卡训练保存的文件无法读取,报错文件已经损坏](https://blog.csdn.net/tongjingqi_/article/details/132697541) 66 | 67 | * [Pytorch torchrun 命令踩坑记录](https://blog.csdn.net/Komach/article/details/130765773) 68 | 69 | * [PyTorch 报错 CUDA 版本:Error 804: forward compatibility was attempted on non supported HW](https://zhuanlan.zhihu.com/p/361545761) 70 | 71 | ### 其他工具相关: 72 | 73 | * [Mac 如何更改用户名或个人文件夹名称](https://zhuanlan.zhihu.com/p/361131804) 74 | 75 | * [代码生成图片](https://www.jyshare.com/front-end/7433) 76 | 77 | * [PDF 格式转换工具](https://www.ilovepdf.com/zh-cn) 78 | 79 | * [Convertio 格式转换](https://convertio.co/zh/webm-mp4) 80 | 81 | * [SMS 虚拟电话号码购买](https://sms-activate.org/cn) 82 | 83 | * [Apache ECharts 开源可视化图表库](https://echarts.apache.org/zh/index.html) 84 | 85 | * [Pyecharts Python图表教程](https://www.heywhale.com/mw/project/5eb7958f366f4d002d783d4a) 86 | 87 | * [Visio 流程图工具](https://www.microsoft.com/zh-cn/microsoft-365/visio/flowchart-software) 88 | 89 | * [ZEROTIER 多平台远程连接工具](https://www.zerotier.com) 90 | 91 | * [Midjourney AI 生成图片](https://www.midjourney.com/home) 92 | 93 | * [Render AI 生成音乐](https://mubert.com/render) 94 | 95 | * [Suno AI 音乐生成](https://suno.com/blog/v3) 96 | --------------------------------------------------------------------------------