├── .cache └── .temp │ └── 1740397786743_resample.wav ├── C2SER-llm ├── config.yaml ├── infer_runtime.py ├── prompt_config.yaml ├── requirements.txt ├── setup.py └── wenet │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-311.pyc │ └── __init__.cpython-39.pyc │ ├── bin │ ├── alignment.py │ ├── average_model.py │ ├── export_ipex.py │ ├── export_jit.py │ ├── export_onnx_bpu.py │ ├── export_onnx_cpu.py │ ├── export_onnx_gpu.py │ ├── recognize.py │ ├── recognize4llmasr.py │ ├── recognize_onnx_gpu.py │ └── train.py │ ├── cli │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── hub.cpython-310.pyc │ │ ├── hub.cpython-311.pyc │ │ ├── hub.cpython-39.pyc │ │ ├── model.cpython-310.pyc │ │ ├── model.cpython-311.pyc │ │ └── model.cpython-39.pyc │ ├── hub.py │ └── model.py │ ├── dataset │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── dataset.cpython-310.pyc │ │ └── dataset.cpython-311.pyc │ ├── datapipes.py │ ├── dataset.py │ ├── kaldi_io.py │ ├── process │ │ ├── __pycache__ │ │ │ ├── processor.cpython-310.pyc │ │ │ └── processor.cpython-311.pyc │ │ ├── processor.py │ │ ├── processor_base-version.py │ │ ├── processor_base-version_emotion-only_with-ssl-vec.py │ │ └── processor_instrcut-version.py │ └── wav_distortion.py │ ├── efficient_conformer │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── attention.cpython-39.pyc │ │ └── subsampling.cpython-39.pyc │ ├── attention.py │ └── subsampling.py │ ├── llm_asr │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── downsampler.cpython-39.pyc │ │ ├── init_llmasr.cpython-39.pyc │ │ ├── llmasr_model.cpython-39.pyc │ │ └── utils4llmasr.cpython-39.pyc │ ├── downsampler.py │ ├── init_llmasr.py │ ├── llmasr_model.py │ └── utils4llmasr.py │ ├── paraformer │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── embedding.cpython-39.pyc │ │ └── search.cpython-39.pyc │ ├── embedding.py │ └── search.py │ ├── squeezeformer │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── conv2d.cpython-39.pyc │ │ └── subsampling.cpython-39.pyc │ ├── conv2d.py │ └── subsampling.py │ ├── text │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── base_tokenizer.cpython-310.pyc │ │ ├── base_tokenizer.cpython-311.pyc │ │ ├── base_tokenizer.cpython-39.pyc │ │ ├── bpe_tokenizer.cpython-310.pyc │ │ ├── bpe_tokenizer.cpython-311.pyc │ │ ├── bpe_tokenizer.cpython-39.pyc │ │ ├── char_tokenizer.cpython-310.pyc │ │ ├── char_tokenizer.cpython-311.pyc │ │ ├── char_tokenizer.cpython-39.pyc │ │ ├── hugging_face_tokenizer.cpython-310.pyc │ │ ├── hugging_face_tokenizer.cpython-311.pyc │ │ ├── hugging_face_tokenizer.cpython-39.pyc │ │ ├── paraformer_tokenizer.cpython-310.pyc │ │ ├── paraformer_tokenizer.cpython-311.pyc │ │ ├── paraformer_tokenizer.cpython-39.pyc │ │ ├── tokenize_utils.cpython-310.pyc │ │ ├── tokenize_utils.cpython-311.pyc │ │ ├── tokenize_utils.cpython-39.pyc │ │ ├── whisper_tokenizer.cpython-310.pyc │ │ ├── whisper_tokenizer.cpython-311.pyc │ │ └── whisper_tokenizer.cpython-39.pyc │ ├── base_tokenizer.py │ ├── bpe_tokenizer.py │ ├── char_tokenizer.py │ ├── hugging_face_tokenizer.py │ ├── paraformer_tokenizer.py │ ├── tokenize_utils.py │ └── whisper_tokenizer.py │ ├── transformer │ ├── __init__.py │ ├── __pycache__ │ │ ├── 
__init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── asr_model.cpython-310.pyc │ │ ├── asr_model.cpython-311.pyc │ │ ├── asr_model.cpython-39.pyc │ │ ├── attention.cpython-310.pyc │ │ ├── attention.cpython-311.pyc │ │ ├── attention.cpython-39.pyc │ │ ├── cmvn.cpython-310.pyc │ │ ├── cmvn.cpython-311.pyc │ │ ├── cmvn.cpython-39.pyc │ │ ├── convolution.cpython-310.pyc │ │ ├── convolution.cpython-311.pyc │ │ ├── convolution.cpython-39.pyc │ │ ├── ctc.cpython-310.pyc │ │ ├── ctc.cpython-311.pyc │ │ ├── ctc.cpython-39.pyc │ │ ├── decoder.cpython-310.pyc │ │ ├── decoder.cpython-311.pyc │ │ ├── decoder.cpython-39.pyc │ │ ├── decoder_layer.cpython-310.pyc │ │ ├── decoder_layer.cpython-311.pyc │ │ ├── decoder_layer.cpython-39.pyc │ │ ├── embedding.cpython-310.pyc │ │ ├── embedding.cpython-311.pyc │ │ ├── embedding.cpython-39.pyc │ │ ├── encoder.cpython-310.pyc │ │ ├── encoder.cpython-311.pyc │ │ ├── encoder.cpython-39.pyc │ │ ├── encoder_layer.cpython-310.pyc │ │ ├── encoder_layer.cpython-311.pyc │ │ ├── encoder_layer.cpython-39.pyc │ │ ├── label_smoothing_loss.cpython-310.pyc │ │ ├── label_smoothing_loss.cpython-311.pyc │ │ ├── label_smoothing_loss.cpython-39.pyc │ │ ├── norm.cpython-310.pyc │ │ ├── norm.cpython-311.pyc │ │ ├── norm.cpython-39.pyc │ │ ├── positionwise_feed_forward.cpython-310.pyc │ │ ├── positionwise_feed_forward.cpython-311.pyc │ │ ├── positionwise_feed_forward.cpython-39.pyc │ │ ├── search.cpython-310.pyc │ │ ├── search.cpython-311.pyc │ │ ├── search.cpython-39.pyc │ │ ├── subsampling.cpython-310.pyc │ │ ├── subsampling.cpython-311.pyc │ │ ├── subsampling.cpython-39.pyc │ │ ├── swish.cpython-310.pyc │ │ ├── swish.cpython-311.pyc │ │ └── swish.cpython-39.pyc │ ├── asr_model.py │ ├── attention.py │ ├── cmvn.py │ ├── convolution.py │ ├── ctc.py │ ├── decoder.py │ ├── decoder_layer.py │ ├── embedding.py │ ├── encoder.py │ ├── encoder_layer.py │ ├── label_smoothing_loss.py │ ├── norm.py │ ├── positionwise_feed_forward.py │ ├── search.py │ ├── subsampling.py │ └── swish.py │ ├── utils │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── checkpoint.cpython-310.pyc │ │ ├── checkpoint.cpython-311.pyc │ │ ├── checkpoint.cpython-39.pyc │ │ ├── class_utils.cpython-310.pyc │ │ ├── class_utils.cpython-311.pyc │ │ ├── class_utils.cpython-39.pyc │ │ ├── cmvn.cpython-310.pyc │ │ ├── cmvn.cpython-311.pyc │ │ ├── cmvn.cpython-39.pyc │ │ ├── common.cpython-310.pyc │ │ ├── common.cpython-311.pyc │ │ ├── common.cpython-39.pyc │ │ ├── config.cpython-310.pyc │ │ ├── config.cpython-311.pyc │ │ ├── context_graph.cpython-310.pyc │ │ ├── context_graph.cpython-311.pyc │ │ ├── context_graph.cpython-39.pyc │ │ ├── ctc_utils.cpython-310.pyc │ │ ├── ctc_utils.cpython-311.pyc │ │ ├── ctc_utils.cpython-39.pyc │ │ ├── executor.cpython-310.pyc │ │ ├── executor.cpython-311.pyc │ │ ├── file_utils.cpython-310.pyc │ │ ├── file_utils.cpython-311.pyc │ │ ├── file_utils.cpython-39.pyc │ │ ├── fsdp_utils.cpython-310.pyc │ │ ├── fsdp_utils.cpython-311.pyc │ │ ├── init_dataset.cpython-310.pyc │ │ ├── init_dataset.cpython-311.pyc │ │ ├── init_model.cpython-310.pyc │ │ ├── init_model.cpython-311.pyc │ │ ├── init_model.cpython-39.pyc │ │ ├── init_tokenizer.cpython-310.pyc │ │ ├── init_tokenizer.cpython-311.pyc │ │ ├── init_tokenizer.cpython-39.pyc │ │ ├── mask.cpython-310.pyc │ │ ├── mask.cpython-311.pyc │ │ ├── mask.cpython-39.pyc │ │ ├── rope_utils.cpython-310.pyc │ │ ├── 
rope_utils.cpython-311.pyc │ │ ├── rope_utils.cpython-39.pyc │ │ ├── scheduler.cpython-310.pyc │ │ ├── scheduler.cpython-311.pyc │ │ ├── train_utils.cpython-310.pyc │ │ └── train_utils.cpython-311.pyc │ ├── checkpoint.py │ ├── class_utils.py │ ├── cmvn.py │ ├── common.py │ ├── config.py │ ├── context_graph.py │ ├── ctc_utils.py │ ├── executor.py │ ├── file_utils.py │ ├── fsdp_utils.py │ ├── init_dataset.py │ ├── init_model.py │ ├── init_tokenizer.py │ ├── mask.py │ ├── rope_utils.py │ ├── scheduler.py │ └── train_utils.py │ └── whisper │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-39.pyc │ ├── whisper.cpython-310.pyc │ ├── whisper.cpython-311.pyc │ └── whisper.cpython-39.pyc │ ├── convert_whisper_to_wenet_config_and_ckpt.py │ ├── whisper.py │ └── whisper_with_clap.py ├── Emo-Emilia └── Emo-Emilia-ALL.jsonl ├── Emotion2Vec-S ├── downstream_EmoBox │ └── k_fold_CV.sh ├── examples │ ├── .gitignore │ └── data2vec │ │ └── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── data2vec2.cpython-38.pyc │ │ ├── data2vec2.py │ │ └── modalities │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── audio.cpython-38.pyc │ │ ├── base.cpython-38.pyc │ │ └── modules.cpython-38.pyc │ │ ├── audio.py │ │ ├── base.py │ │ └── modules.py ├── extract_feature.sh ├── features │ ├── features_frm │ │ ├── 4YJy1uDx0jM_769.npy │ │ └── vo_EQAST002_1_paimon_07.npy │ └── features_utt │ │ ├── 4YJy1uDx0jM_769.npy │ │ └── vo_EQAST002_1_paimon_07.npy ├── speech_feature_extraction.py ├── test_wav │ ├── 4YJy1uDx0jM_769.wav │ └── vo_EQAST002_1_paimon_07.wav └── wav.scp ├── README.md └── figs └── c2ser.png /.cache/.temp/1740397786743_resample.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/.cache/.temp/1740397786743_resample.wav -------------------------------------------------------------------------------- /C2SER-llm/config.yaml: -------------------------------------------------------------------------------- 1 | model: llmasr 2 | 3 | # tokenizer 4 | tokenizer: huggingface 5 | tokenizer_conf: 6 | llm_path: Qwen/Qwen2-7B 7 | use_lora: true 8 | lora_alpha: 32 9 | lora_rank: 8 10 | lora_dropout: 0.1 11 | speech_token_num: 4097 12 | 13 | fire_module: link_and_encoder_and_lora # link encoder llm link_and_encoder link_and_encoder_and_lora, 14 | downsample_rate: 4 # 1 2 4 8 15 | adapter_type: gxl 16 | llm_path: Qwen/Qwen2-7B 17 | optim: adamw 18 | optim_conf: 19 | betas: 20 | - 0.9 21 | - 0.99 22 | eps: 1.0e-06 23 | lr: 5.0e-05 24 | weight_decay: 0.01 25 | scheduler: warmuplr 26 | scheduler_conf: 27 | warmup_steps: 8000 28 | 29 | cmvn: null 30 | cmvn_conf: 31 | cmvn_file: null 32 | is_json_cmvn: null 33 | ctc_conf: 34 | ctc_blank_id: 50362 35 | 36 | dataset: asr 37 | dataset_conf: 38 | batch_conf: 39 | batch_size: 26 40 | batch_type: dynamic 41 | max_frames_in_batch: 3900 # 3900 42 | max_seq_in_batch: 1900 # 1900 43 | feats_type: log_mel_spectrogram 44 | filter_conf: 45 | max_length: 2900 46 | min_length: 0 47 | token_max_length: 200 48 | token_min_length: 1 49 | filter_no_extra_info: true # 如果没有task lang 等信息,直接过滤掉, 适用于通用多任务训练, 推理时应该关掉 50 | max_seq_len: 1000 #1000 51 | language_conf: 52 | limited_langs: 53 | - zh 54 | log_mel_spectrogram_conf: 55 | hop_length: 160 56 | n_fft: 400 57 | num_mel_bins: 80 58 | padding: 0 59 | resample_conf: 60 | resample_rate: 16000 61 | shuffle: 
true 62 | shuffle_conf: 63 | shuffle_size: 1500 64 | sort: true 65 | sort_conf: 66 | sort_size: 500 67 | spec_aug: true 68 | spec_aug_conf: 69 | max_f: 10 70 | max_t: 50 71 | num_f_mask: 2 72 | num_t_mask: 2 73 | spec_sub: true 74 | spec_sub_conf: 75 | max_t: 30 76 | num_t_sub: 3 77 | spec_trim: false 78 | speed_perturb: false 79 | eod_id: 151643 # for whisper 80 | split_num: 1 # 25000tar -> /split_Num 1000 81 | multi_num: 1 # 2 82 | prompt_conf_path: ./prompt_config.yaml 83 | continue_data: true 84 | 85 | decoder: transformer 86 | decoder_conf: 87 | activation_type: gelu 88 | attention_heads: 16 89 | dropout_rate: 0.1 90 | gradient_checkpointing: true 91 | input_layer: embed_learnable_pe 92 | key_bias: false 93 | linear_units: 4096 94 | normalize_before: true 95 | num_blocks: 24 96 | positional_dropout_rate: 0.0 97 | self_attention_dropout_rate: 0.0 98 | src_attention: true 99 | src_attention_dropout_rate: 0.0 100 | tie_word_embedding: true 101 | use_output_layer: true 102 | encoder: transformer 103 | encoder_conf: 104 | activation_type: gelu 105 | attention_dropout_rate: 0.0 106 | attention_heads: 16 107 | dropout_rate: 0.1 108 | gradient_checkpointing: true 109 | input_layer: conv1d2 110 | key_bias: false 111 | linear_units: 4096 112 | normalize_before: true 113 | num_blocks: 24 114 | output_size: 1024 115 | pos_enc_layer_type: abs_pos_whisper 116 | positional_dropout_rate: 0.1 117 | static_chunk_size: -1 118 | use_dynamic_chunk: false 119 | use_dynamic_left_chunk: false 120 | grad_clip: 5 121 | accum_grad: 4 122 | input_dim: 80 123 | log_interval: 10 124 | save_interval: 1250 125 | max_epoch: 100 126 | 127 | model_conf: 128 | ctc_weight: 0 129 | length_normalized_loss: false 130 | lsm_weight: 0.1 131 | 132 | init_step: true 133 | -------------------------------------------------------------------------------- /C2SER-llm/infer_runtime.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch.nn.functional as F 3 | from gxl_ai_utils.utils import utils_file 4 | from wenet.utils.init_tokenizer import init_tokenizer 5 | from gxl_ai_utils.config.gxl_config import GxlNode 6 | from wenet.utils.init_model import init_model 7 | import logging 8 | import librosa 9 | import torch 10 | import torchaudio 11 | import numpy as np 12 | 13 | logging.basicConfig(level=logging.DEBUG, 14 | format='%(asctime)s %(levelname)s %(message)s') 15 | config_path = "./C2SER-llm/config.yaml" 16 | checkpoint_path = "/home/work_nfs16/xlgeng/code/wenet_undersdand_and_speech_xlgeng_emotion_only/examples/wenetspeech/whisper/exp/two_stage_train/stage_2_plus_meld/step_9999.pt" 17 | args = GxlNode({ 18 | "checkpoint": checkpoint_path, 19 | }) 20 | configs = utils_file.load_dict_from_yaml(config_path) 21 | model, configs = init_model(args, configs) 22 | gpu_id = 0 23 | model = model.cuda(gpu_id) 24 | tokenizer = init_tokenizer(configs) 25 | print(model) 26 | resample_rate = 16000 27 | 28 | def do_resample(input_wav_path, output_wav_path): 29 | """""" 30 | print(f'input_wav_path: {input_wav_path}, output_wav_path: {output_wav_path}') 31 | waveform, sample_rate = torchaudio.load(input_wav_path) 32 | # 检查音频的维度 33 | num_channels = waveform.shape[0] 34 | # 如果音频是多通道的,则进行通道平均 35 | if num_channels > 1: 36 | waveform = torch.mean(waveform, dim=0, keepdim=True) 37 | waveform = torchaudio.transforms.Resample( 38 | orig_freq=sample_rate, new_freq=16000)(waveform) 39 | utils_file.makedir_for_file(output_wav_path) 40 | torchaudio.save(output_wav_path, waveform, 16000) 41 | 42 | 43 
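# A minimal batch-driver sketch for the do_decode() function defined below: it walks a
# Kaldi-style wav.scp and pairs each utterance with its precomputed Emotion2Vec-S
# utterance-level feature before running inference. The helper name, the default paths,
# and the assumption that each wav.scp line reads "<utt_id> <wav_path>" are illustrative
# assumptions, not part of the original script.
def decode_from_scp(scp_path="./Emotion2Vec-S/wav.scp",
                    feat_dir="./Emotion2Vec-S/features/features_utt",
                    prompt="Please consider the speaking style, content, and directly provide the speaker's emotion in this speech."):
    import os
    results = {}
    with open(scp_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            utt_id, wav_path = line.strip().split(maxsplit=1)
            ssl_path = os.path.join(feat_dir, f"{utt_id}.npy")  # utterance-level SSL vector
            results[utt_id] = do_decode(wav_path, prompt, ssl_path)
    return results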
| def do_decode(input_wav_path, input_prompt, ssl_vector_path): 44 | # input_prompt = TASK_PROMPT_MAPPING.get(input_prompt, "未知任务类型") 45 | print(f"wav_path: {input_wav_path}, prompt:{input_prompt}") 46 | timestamp_ms = int(time.time() * 1000) 47 | now_file_tmp_path_resample = f'./.cache/.temp/{timestamp_ms}_resample.wav' 48 | do_resample(input_wav_path, now_file_tmp_path_resample) 49 | input_wav_path = now_file_tmp_path_resample 50 | waveform, sample_rate = torchaudio.load(input_wav_path) 51 | waveform = waveform.squeeze(0) # (channel=1, sample) -> (sample,) 52 | print(f'wavform shape: {waveform.shape}, sample_rate: {sample_rate}') 53 | window = torch.hann_window(400) 54 | stft = torch.stft(waveform, 55 | 400, 56 | 160, 57 | window=window, 58 | return_complex=True) 59 | magnitudes = stft[..., :-1].abs() ** 2 60 | 61 | filters = torch.from_numpy( 62 | librosa.filters.mel(sr=sample_rate, 63 | n_fft=400, 64 | n_mels=80)) 65 | mel_spec = filters @ magnitudes 66 | 67 | # NOTE(): https://github.com/openai/whisper/discussions/269 68 | log_spec = torch.clamp(mel_spec, min=1e-10).log10() 69 | log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) 70 | log_spec = (log_spec + 4.0) / 4.0 71 | feat = log_spec.transpose(0, 1) 72 | feat_lens = torch.tensor([feat.shape[0]], dtype=torch.int64).to(gpu_id) 73 | feat = feat.unsqueeze(0).to(gpu_id) 74 | # feat = feat.half() 75 | # feat_lens = feat_lens.half() 76 | numpy_array = np.load(ssl_vector_path) 77 | 78 | tensor = torch.from_numpy(numpy_array) 79 | pad_amount = 1024 - tensor.size(1) 80 | padded_tensor_ssl = F.pad(tensor, (0, pad_amount), mode='constant', value=0) 81 | res_text = model.generate(wavs=feat, wavs_len=feat_lens, prompt=input_prompt, padded_tensor_ssl=padded_tensor_ssl)[0] 82 | print("result:", res_text) 83 | return res_text 84 | 85 | 86 | if __name__ == "__main__": 87 | input_wav_path = "./Emotion2Vec-S/test_wav/vo_EQAST002_1_paimon_07.wav" 88 | input_prompt = "Please consider the speaking style, content, and directly provide the speaker's emotion in this speech." # for stage1, more prompt refer to ./prompt_config.yaml 89 | ssl_vector_path = "./Emotion2Vec-S/features/features_utt/vo_EQAST002_1_paimon_07.npy" # for ssl, the path of ssl vector 90 | res_text_list = do_decode(input_wav_path, input_prompt, ssl_vector_path) 91 | # print(res_text_list) 92 | 93 | -------------------------------------------------------------------------------- /C2SER-llm/prompt_config.yaml: -------------------------------------------------------------------------------- 1 | : 2 | - Please describe the speaking style, content, and the speaker's emotional state of this speech. 3 | - Please describe the speaking style, content, and the speaker's emotional state of this speech. 4 | - Please describe the speaking style, content, and the speaker's emotional state of this speech. 5 | - Please describe the speaking style, content, and the speaker's emotional state of this speech. 6 | - Please describe the speaking style, content, and the speaker's emotional state of this speech. 7 | : 8 | - Please consider the speaking style, content, and directly provide the speaker's emotion in this speech. 9 | - Please consider the speaking style, content, and directly provide the speaker's emotion in this speech. 10 | - Please consider the speaking style, content, and directly provide the speaker's emotion in this speech. 11 | - Please consider the speaking style, content, and directly provide the speaker's emotion in this speech. 
12 | - Please consider the speaking style, content, and directly provide the speaker's emotion in this speech. 13 | -------------------------------------------------------------------------------- /C2SER-llm/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.24 2 | jsonlines==4.0.0 3 | torch==2.1.0 4 | transformers==4.44.0 5 | torchaudio==2.1.0 6 | librosa 7 | tensorboardX>=2.5 8 | tqdm 9 | absl-py 10 | psutil 11 | cloudpickle 12 | ml-dtypes 13 | tornado 14 | openai-whisper 15 | colorama 16 | peft 17 | sox 18 | deepspeed 19 | librosa 20 | gxl_ai_utils 21 | jsonlines 22 | -------------------------------------------------------------------------------- /C2SER-llm/setup.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from setuptools import setup, find_packages 3 | 4 | requirements = [ 5 | "numpy", 6 | "requests", 7 | "tqdm", 8 | "torch>=1.13.0", 9 | "torchaudio>=0.13.0", 10 | "openai-whisper", 11 | "librosa", 12 | ] 13 | 14 | extra_require = { 15 | "torch-npu": [ 16 | "torch==2.2.0", "torch-npu==2.2.0", "torchaudio==2.2.0", "decorator", 17 | "numpy<2.0.0", "attrs", "psutil" 18 | ], 19 | } 20 | 21 | if platform.system() == 'Windows': 22 | requirements += ['PySoundFile'] 23 | 24 | setup( 25 | name="wenet", 26 | install_requires=requirements, 27 | packages=find_packages(), 28 | entry_points={"console_scripts": [ 29 | "wenet = wenet.cli.transcribe:main", 30 | ]}, 31 | extras_require=extra_require, 32 | ) 33 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/__init__.py: -------------------------------------------------------------------------------- 1 | from wenet.cli.model import load_model # noqa 2 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/bin/average_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Di Wu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import argparse 17 | import glob 18 | import sys 19 | 20 | import yaml 21 | import torch 22 | 23 | 24 | def get_args(): 25 | parser = argparse.ArgumentParser(description='average model') 26 | parser.add_argument('--dst_model', required=True, help='averaged model') 27 | parser.add_argument('--src_path', 28 | required=True, 29 | help='src model path for average') 30 | parser.add_argument('--val_best', 31 | action="store_true", 32 | help='averaged model') 33 | parser.add_argument('--num', 34 | default=5, 35 | type=int, 36 | help='nums for averaged model') 37 | parser.add_argument('--min_epoch', 38 | default=0, 39 | type=int, 40 | help='min epoch used for averaging model') 41 | parser.add_argument('--max_epoch', 42 | default=sys.maxsize, 43 | type=int, 44 | help='max epoch used for averaging model') 45 | parser.add_argument('--min_step', 46 | default=0, 47 | type=int, 48 | help='min step used for averaging model') 49 | parser.add_argument('--max_step', 50 | default=sys.maxsize, 51 | type=int, 52 | help='max step used for averaging model') 53 | parser.add_argument('--mode', 54 | default="hybrid", 55 | choices=["hybrid", "epoch", "step"], 56 | type=str, 57 | help='average mode') 58 | 59 | args = parser.parse_args() 60 | print(args) 61 | return args 62 | 63 | 64 | def main(): 65 | args = get_args() 66 | checkpoints = [] 67 | val_scores = [] 68 | if args.val_best: 69 | if args.mode == "hybrid": 70 | yamls = glob.glob('{}/*.yaml'.format(args.src_path)) 71 | yamls = [ 72 | f for f in yamls 73 | if not (os.path.basename(f).startswith('train') 74 | or os.path.basename(f).startswith('init')) 75 | ] 76 | elif args.mode == "step": 77 | yamls = glob.glob('{}/step_*.yaml'.format(args.src_path)) 78 | else: 79 | yamls = glob.glob('{}/epoch_*.yaml'.format(args.src_path)) 80 | for y in yamls: 81 | with open(y, 'r') as f: 82 | dic_yaml = yaml.load(f, Loader=yaml.FullLoader) 83 | loss = dic_yaml['loss_dict']['loss'] 84 | epoch = dic_yaml['epoch'] 85 | step = dic_yaml['step'] 86 | tag = dic_yaml['tag'] 87 | if epoch >= args.min_epoch and epoch <= args.max_epoch \ 88 | and step >= args.min_step and step <= args.max_step: 89 | val_scores += [[epoch, step, loss, tag]] 90 | sorted_val_scores = sorted(val_scores, 91 | key=lambda x: x[2], 92 | reverse=False) 93 | print("best val (epoch, step, loss, tag) = " + 94 | str(sorted_val_scores[:args.num])) 95 | path_list = [ 96 | args.src_path + '/{}.pt'.format(score[-1]) 97 | for score in sorted_val_scores[:args.num] 98 | ] 99 | else: 100 | path_list = glob.glob('{}/[!init]*.pt'.format(args.src_path)) 101 | path_list = sorted(path_list, key=os.path.getmtime) 102 | path_list = path_list[-args.num:] 103 | print(path_list) 104 | avg = {} 105 | num = args.num 106 | assert num == len(path_list) 107 | for path in path_list: 108 | print('Processing {}'.format(path)) 109 | states = torch.load(path, map_location=torch.device('cpu')) 110 | for k in states.keys(): 111 | if k not in avg.keys(): 112 | avg[k] = states[k].clone() 113 | else: 114 | avg[k] += states[k] 115 | # average 116 | for k in avg.keys(): 117 | if avg[k] is 
not None: 118 | # pytorch 1.6 use true_divide instead of /= 119 | avg[k] = torch.true_divide(avg[k], num) 120 | print('Saving to {}'.format(args.dst_model)) 121 | torch.save(avg, args.dst_model) 122 | 123 | 124 | if __name__ == '__main__': 125 | main() 126 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/bin/export_ipex.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021-2023 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from __future__ import print_function 5 | 6 | import argparse 7 | import logging 8 | import os 9 | 10 | import torch 11 | import yaml 12 | 13 | from wenet.utils.init_model import init_model 14 | import intel_extension_for_pytorch as ipex 15 | from intel_extension_for_pytorch.quantization import prepare, convert 16 | 17 | 18 | def get_args(): 19 | parser = argparse.ArgumentParser(description='export your script model') 20 | parser.add_argument('--config', required=True, help='config file') 21 | parser.add_argument('--checkpoint', required=True, help='checkpoint model') 22 | parser.add_argument('--output_file', default=None, help='output file') 23 | parser.add_argument('--dtype', 24 | default="fp32", 25 | help='choose the dtype to run:[fp32,bf16]') 26 | parser.add_argument('--output_quant_file', 27 | default=None, 28 | help='output quantized model file') 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def scripting(model): 34 | with torch.inference_mode(): 35 | script_model = torch.jit.script(model) 36 | script_model = torch.jit.freeze( 37 | script_model, 38 | preserved_attrs=[ 39 | "forward_encoder_chunk", "ctc_activation", 40 | "forward_attention_decoder", "subsampling_rate", 41 | "right_context", "sos_symbol", "eos_symbol", 42 | "is_bidirectional_decoder" 43 | ]) 44 | return script_model 45 | 46 | 47 | def main(): 48 | args = get_args() 49 | logging.basicConfig(level=logging.DEBUG, 50 | format='%(asctime)s %(levelname)s %(message)s') 51 | # No need gpu for model export 52 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 53 | 54 | with open(args.config, 'r') as fin: 55 | configs = yaml.load(fin, Loader=yaml.FullLoader) 56 | model, configs = init_model(args, configs) 57 | print(model) 58 | 59 | # Apply IPEX optimization 60 | model.eval() 61 | torch._C._jit_set_texpr_fuser_enabled(False) 62 | model.to(memory_format=torch.channels_last) 63 | if args.dtype == "fp32": 64 | ipex_model = ipex.optimize(model) 65 | elif args.dtype == "bf16": # For Intel 4th generation Xeon (SPR) 66 | ipex_model = ipex.optimize(model, 67 | dtype=torch.bfloat16, 68 | weights_prepack=False) 69 | 70 | # Export jit torch script model 71 | if args.output_file: 72 | if args.dtype == "fp32": 73 | script_model = scripting(ipex_model) 74 | elif args.dtype == "bf16": 75 | torch._C._jit_set_autocast_mode(True) 76 | with torch.cpu.amp.autocast(): 77 | script_model = scripting(ipex_model) 78 | script_model.save(args.output_file) 79 | print('Export model successfully, see {}'.format(args.output_file)) 80 | 81 | # Export quantized jit torch script model 82 | if args.output_quant_file: 83 | dynamic_qconfig = ipex.quantization.default_dynamic_qconfig 84 | dummy_data = (torch.zeros(1, 67, 80), 16, -16, 85 | torch.zeros(12, 4, 32, 128), torch.zeros(12, 1, 256, 7)) 86 | model = prepare(model, dynamic_qconfig, dummy_data) 87 | model = convert(model) 88 | script_quant_model = scripting(model) 89 | script_quant_model.save(args.output_quant_file) 90 | print('Export quantized model 
successfully, ' 91 | 'see {}'.format(args.output_quant_file)) 92 | 93 | 94 | if __name__ == '__main__': 95 | main() 96 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/bin/export_jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | import argparse 18 | import logging 19 | import os 20 | 21 | import torch 22 | import yaml 23 | 24 | from wenet.utils.init_model import init_model 25 | 26 | 27 | def get_args(): 28 | parser = argparse.ArgumentParser(description='export your script model') 29 | parser.add_argument('--config', required=True, help='config file') 30 | parser.add_argument('--checkpoint', required=True, help='checkpoint model') 31 | parser.add_argument('--output_file', default=None, help='output file') 32 | parser.add_argument('--output_quant_file', 33 | default=None, 34 | help='output quantized model file') 35 | args = parser.parse_args() 36 | return args 37 | 38 | 39 | def main(): 40 | args = get_args() 41 | args.jit = True 42 | logging.basicConfig(level=logging.DEBUG, 43 | format='%(asctime)s %(levelname)s %(message)s') 44 | # No need gpu for model export 45 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 46 | 47 | with open(args.config, 'r') as fin: 48 | configs = yaml.load(fin, Loader=yaml.FullLoader) 49 | model, configs = init_model(args, configs) 50 | model.eval() 51 | print(model) 52 | # Export jit torch script model 53 | 54 | if args.output_file: 55 | script_model = torch.jit.script(model) 56 | script_model.save(args.output_file) 57 | print('Export model successfully, see {}'.format(args.output_file)) 58 | 59 | # Export quantized jit torch script model 60 | if args.output_quant_file: 61 | quantized_model = torch.quantization.quantize_dynamic( 62 | model, {torch.nn.Linear}, dtype=torch.qint8) 63 | print(quantized_model) 64 | script_quant_model = torch.jit.script(quantized_model) 65 | script_quant_model.save(args.output_quant_file) 66 | print('Export quantized model successfully, ' 67 | 'see {}'.format(args.output_quant_file)) 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/__init__.cpython-310.pyc 
-------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/hub.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/hub.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/hub.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/hub.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/hub.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/hub.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/model.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/__pycache__/model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/cli/__pycache__/model.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/cli/hub.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Mddct(hamddct@gmail.com) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import requests 17 | import sys 18 | import tarfile 19 | from pathlib import Path 20 | from urllib.request import urlretrieve 21 | 22 | import tqdm 23 | 24 | 25 | def download(url: str, dest: str, only_child=True): 26 | """ download from url to dest 27 | """ 28 | assert os.path.exists(dest) 29 | print('Downloading {} to {}'.format(url, dest)) 30 | 31 | def progress_hook(t): 32 | last_b = [0] 33 | 34 | def update_to(b=1, bsize=1, tsize=None): 35 | if tsize not in (None, -1): 36 | t.total = tsize 37 | displayed = t.update((b - last_b[0]) * bsize) 38 | last_b[0] = b 39 | return displayed 40 | 41 | return update_to 42 | 43 | # *.tar.gz 44 | name = url.split('?')[0].split('/')[-1] 45 | tar_path = os.path.join(dest, name) 46 | with tqdm.tqdm(unit='B', 47 | unit_scale=True, 48 | unit_divisor=1024, 49 | miniters=1, 50 | desc=(name)) as t: 51 | urlretrieve(url, 52 | filename=tar_path, 53 | reporthook=progress_hook(t), 54 | data=None) 55 | t.total = t.n 56 | 57 | with tarfile.open(tar_path) as f: 58 | if not only_child: 59 | f.extractall(dest) 60 | else: 61 | for tarinfo in f: 62 | if "/" not in tarinfo.name: 63 | continue 64 | name = os.path.basename(tarinfo.name) 65 | fileobj = f.extractfile(tarinfo) 66 | with open(os.path.join(dest, name), "wb") as writer: 67 | writer.write(fileobj.read()) 68 | 69 | 70 | class Hub(object): 71 | """Hub for wenet pretrain runtime model 72 | """ 73 | # TODO(Mddct): make assets class to support other language 74 | Assets = { 75 | # wenetspeech 76 | "chinese": "wenetspeech_u2pp_conformer_libtorch.tar.gz", 77 | # gigaspeech 78 | "english": "gigaspeech_u2pp_conformer_libtorch.tar.gz", 79 | # paraformer 80 | "paraformer": "paraformer.tar.gz" 81 | } 82 | 83 | def __init__(self) -> None: 84 | pass 85 | 86 | @staticmethod 87 | def get_model_by_lang(lang: str) -> str: 88 | if lang not in Hub.Assets.keys(): 89 | print('ERROR: Unsupported language {} !!!'.format(lang)) 90 | sys.exit(1) 91 | 92 | # NOTE(Mddct): model_dir structure 93 | # Path.Home()/.wenet 94 | # - chs 95 | # - units.txt 96 | # - final.zip 97 | # - en 98 | # - units.txt 99 | # - final.zip 100 | model = Hub.Assets[lang] 101 | model_dir = os.path.join(Path.home(), ".wenet", lang) 102 | if not os.path.exists(model_dir): 103 | os.makedirs(model_dir) 104 | # TODO(Mddct): model metadata 105 | if set(["final.zip", 106 | "units.txt"]).issubset(set(os.listdir(model_dir))): 107 | return model_dir 108 | # If not exist, download 109 | response = requests.get( 110 | "https://modelscope.cn/api/v1/datasets/wenet/wenet_pretrained_models/oss/tree" # noqa 111 | ) 112 | model_info = next(data for data in response.json()["Data"] 113 | if data["Key"] == model) 114 | model_url = model_info['Url'] 115 | download(model_url, model_dir, only_child=True) 116 | return model_dir 117 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/__pycache__/dataset.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/__pycache__/dataset.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/__pycache__/dataset.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/__pycache__/dataset.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/process/__pycache__/processor.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/process/__pycache__/processor.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/dataset/process/__pycache__/processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/dataset/process/__pycache__/processor.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/efficient_conformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/efficient_conformer/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/efficient_conformer/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/efficient_conformer/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/efficient_conformer/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/efficient_conformer/__pycache__/attention.cpython-39.pyc 
-------------------------------------------------------------------------------- /C2SER-llm/wenet/efficient_conformer/__pycache__/subsampling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/efficient_conformer/__pycache__/subsampling.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/efficient_conformer/subsampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) 2 | # 2022 58.com(Wuba) Inc AI Lab. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # Modified from ESPnet(https://github.com/espnet/espnet) 16 | """Subsampling layer definition.""" 17 | 18 | from typing import Tuple, Union 19 | 20 | import torch 21 | from wenet.transformer.subsampling import BaseSubsampling 22 | 23 | 24 | class Conv2dSubsampling2(BaseSubsampling): 25 | """Convolutional 2D subsampling (to 1/4 length). 26 | 27 | Args: 28 | idim (int): Input dimension. 29 | odim (int): Output dimension. 30 | dropout_rate (float): Dropout rate. 31 | 32 | """ 33 | 34 | def __init__(self, idim: int, odim: int, dropout_rate: float, 35 | pos_enc_class: torch.nn.Module): 36 | """Construct an Conv2dSubsampling4 object.""" 37 | super().__init__() 38 | self.conv = torch.nn.Sequential(torch.nn.Conv2d(1, odim, 3, 2), 39 | torch.nn.ReLU()) 40 | self.out = torch.nn.Sequential( 41 | torch.nn.Linear(odim * ((idim - 1) // 2), odim)) 42 | self.pos_enc = pos_enc_class 43 | # The right context for every conv layer is computed by: 44 | # (kernel_size - 1) * frame_rate_of_this_layer 45 | self.subsampling_rate = 2 46 | # 2 = (3 - 1) * 1 47 | self.right_context = 2 48 | 49 | def forward( 50 | self, 51 | x: torch.Tensor, 52 | x_mask: torch.Tensor, 53 | offset: Union[int, torch.Tensor] = 0 54 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 55 | """Subsample x. 56 | 57 | Args: 58 | x (torch.Tensor): Input tensor (#batch, time, idim). 59 | x_mask (torch.Tensor): Input mask (#batch, 1, time). 60 | 61 | Returns: 62 | torch.Tensor: Subsampled tensor (#batch, time', odim), 63 | where time' = time // 2. 64 | torch.Tensor: Subsampled mask (#batch, 1, time'), 65 | where time' = time // 2. 
66 | torch.Tensor: positional encoding 67 | 68 | """ 69 | x = x.unsqueeze(1) # (b, c=1, t, f) 70 | x = self.conv(x) 71 | b, c, t, f = x.size() 72 | x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) 73 | x, pos_emb = self.pos_enc(x, offset) 74 | return x, pos_emb, x_mask[:, :, :-2:2] 75 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/llm_asr/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/llm_asr/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/__pycache__/downsampler.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/llm_asr/__pycache__/downsampler.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/__pycache__/init_llmasr.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/llm_asr/__pycache__/init_llmasr.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/__pycache__/llmasr_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/llm_asr/__pycache__/llmasr_model.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/__pycache__/utils4llmasr.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/llm_asr/__pycache__/utils4llmasr.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/downsampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class GxlConv1dSubsampling2(nn.Module): 6 | """Conv1d subsampling module. 7 | 8 | Args: 9 | idim (int): Input dimension. 10 | odim (int): Output dimension. 11 | dropout_rate (float): Dropout rate. 
12 | 13 | """ 14 | 15 | def __init__(self, idim: int, odim: int): 16 | """Construct an Conv1dSubsampling object.""" 17 | super().__init__() 18 | self.conv = torch.nn.Sequential( 19 | torch.nn.Conv1d(idim, odim, 3, 1), 20 | torch.nn.GELU(), 21 | torch.nn.Conv1d(odim, odim, 3, 2), 22 | torch.nn.GELU(), 23 | ) 24 | 25 | def forward(self, x): 26 | """ 27 | 28 | Args: 29 | x: (B, T, idim) 30 | 31 | Returns: 32 | """ 33 | x = x.transpose(1, 2) 34 | x = self.conv(x) 35 | x = x.transpose(1, 2) 36 | return x 37 | 38 | 39 | class GxlConv1dSubsampling4(nn.Module): 40 | """Conv1d subsampling module. 41 | 42 | Args: 43 | idim (int): Input dimension. 44 | odim (int): Output dimension. 45 | dropout_rate (float): Dropout rate. 46 | 47 | """ 48 | 49 | def __init__(self, idim: int, odim: int): 50 | """Construct an Conv1dSubsampling object.""" 51 | super().__init__() 52 | self.conv = torch.nn.Sequential( 53 | torch.nn.ConstantPad1d((2, 0), 0.0), 54 | torch.nn.Conv1d(idim, odim, 3, 1), 55 | torch.nn.GELU(), 56 | torch.nn.ConstantPad1d((2, 0), 0.0), 57 | torch.nn.Conv1d(odim, odim, 3, 2), 58 | torch.nn.GELU(), 59 | torch.nn.ConstantPad1d((2, 0), 0.0), 60 | torch.nn.Conv1d(odim, odim, 3, 2), 61 | torch.nn.GELU(), 62 | ) 63 | 64 | def forward(self, x, mask_pad): 65 | """ 66 | 67 | Args: 68 | x: (B, T, idim) 69 | 70 | Returns: 71 | """ 72 | x = x.transpose(1, 2) 73 | x = self.conv(x) 74 | x = x.transpose(1, 2) 75 | mask_pad = mask_pad[:, :, 0::2] 76 | mask_pad = mask_pad[:, :, 0::2] 77 | return x, mask_pad 78 | 79 | 80 | class GxlConv1dSubsampling6(nn.Module): 81 | """Conv1d subsampling module. 82 | 83 | Args: 84 | idim (int): Input dimension. 85 | odim (int): Output dimension. 86 | dropout_rate (float): Dropout rate. 87 | 88 | """ 89 | 90 | def __init__(self, idim: int, odim: int): 91 | """Construct an Conv1dSubsampling object.""" 92 | super().__init__() 93 | self.conv = torch.nn.Sequential( 94 | torch.nn.Conv1d(idim, odim, 3, 1), 95 | torch.nn.GELU(), 96 | torch.nn.Conv1d(odim, odim, 3, 2), 97 | torch.nn.GELU(), 98 | torch.nn.Conv1d(odim, odim, 3, 3), 99 | torch.nn.GELU(), 100 | ) 101 | 102 | def forward(self, x): 103 | """ 104 | 105 | Args: 106 | x: (B, T, idim) 107 | 108 | Returns: 109 | """ 110 | x = x.transpose(1, 2) 111 | x = self.conv(x) 112 | x = x.transpose(1, 2) 113 | return x 114 | 115 | 116 | class GxlConv1dSubsampling8(nn.Module): 117 | """Conv1d subsampling module. 118 | 119 | Args: 120 | idim (int): Input dimension. 121 | odim (int): Output dimension. 122 | dropout_rate (float): Dropout rate. 
123 | 124 | """ 125 | 126 | def __init__(self, idim: int, odim: int): 127 | """Construct an Conv1dSubsampling object.""" 128 | super().__init__() 129 | self.conv = torch.nn.Sequential( 130 | torch.nn.Conv1d(idim, odim, 3, 1), 131 | torch.nn.GELU(), 132 | torch.nn.Conv1d(odim, odim, 3, 2), 133 | torch.nn.GELU(), 134 | torch.nn.Conv1d(odim, odim, 3, 2), 135 | torch.nn.GELU(), 136 | torch.nn.Conv1d(odim, odim, 3, 2), 137 | torch.nn.GELU(), 138 | ) 139 | 140 | def forward(self, x): 141 | """ 142 | 143 | Args: 144 | x: (B, T, idim) 145 | 146 | Returns: 147 | """ 148 | x = x.transpose(1, 2) 149 | x = self.conv(x) 150 | x = x.transpose(1, 2) 151 | return x 152 | 153 | class LyzConv1dSubsampling(torch.nn.Module): 154 | def __init__( 155 | self, 156 | enc_out_dim: int = 512, 157 | llm_embed_dim: int = 4096, 158 | kernel_size: int = 5, 159 | activation_func: str = 'relu', 160 | norm: str = 'batch', 161 | ): 162 | super().__init__() 163 | 164 | if enc_out_dim * 4 < llm_embed_dim: 165 | self.left_padding1 = nn.ConstantPad1d((kernel_size - 1, 0), 0.0) 166 | self.conv1d1 = nn.Conv1d(enc_out_dim, 2 * enc_out_dim, kernel_size, 1, 0) 167 | self.bn1 = nn.BatchNorm1d(2 * enc_out_dim, eps=1e-3, momentum=0.99) 168 | self.relu1 = nn.ReLU() 169 | 170 | self.left_padding2 = nn.ConstantPad1d((kernel_size - 1, 0), 0.0) 171 | self.conv1d2 = nn.Conv1d(2 * enc_out_dim, 4 * enc_out_dim, kernel_size, 2, 0) 172 | self.bn2 = nn.BatchNorm1d(4 * enc_out_dim, eps=1e-3, momentum=0.99) 173 | self.relu2 = nn.ReLU() 174 | 175 | self.project = nn.Linear(4 * enc_out_dim, llm_embed_dim) 176 | self.cnn_num = 2 177 | else: 178 | self.left_padding2 = nn.ConstantPad1d((kernel_size - 1, 0), 0.0) 179 | self.conv1d2 = nn.Conv1d(enc_out_dim, 2 * enc_out_dim, kernel_size, 2, 0) 180 | if norm == 'batch': 181 | self.bn2 = nn.BatchNorm1d(2 * enc_out_dim, eps=1e-3, momentum=0.99) 182 | elif norm == 'layer': 183 | self.bn2 = nn.LayerNorm(2 * enc_out_dim, eps=1e-3) 184 | if activation_func == 'gelu': 185 | self.relu2 = nn.GELU() 186 | else: 187 | self.relu2 = nn.ReLU() 188 | self.project = nn.Linear(2 * enc_out_dim, llm_embed_dim) 189 | self.cnn_num = 1 190 | 191 | def forward(self, x, mask_pad): 192 | """ 193 | x: B, T, enc_out_dim 194 | mask: (B, T) or (B, 1, T) 195 | """ 196 | x = x.transpose(1, 2) # B, channels, T 197 | 198 | # mask batch padding 199 | if mask_pad.size(2) > 0: # time > 0 200 | x.masked_fill_(~mask_pad, 0.0) 201 | 202 | if self.cnn_num == 2: 203 | x = self.left_padding1(x) 204 | x = self.conv1d1(x) 205 | x = self.bn1(x) 206 | x = self.relu1(x) 207 | 208 | x = self.left_padding2(x) 209 | x = self.conv1d2(x) 210 | if isinstance(self.bn2, nn.LayerNorm): 211 | x = x.transpose(1, 2) 212 | x = self.bn2(x) 213 | if isinstance(self.bn2, nn.LayerNorm): 214 | x = x.transpose(1, 2) 215 | x = self.relu2(x) 216 | 217 | x = x.transpose(1, 2) 218 | x = self.project(x) 219 | 220 | return x, mask_pad[:, :, 0::2] 221 | 222 | def get_downsampler(downsample_rate, ndim=1280): 223 | down_sample_2 = nn.Identity() 224 | if downsample_rate == 2: 225 | down_sample_2 = GxlConv1dSubsampling2(ndim, ndim) 226 | elif downsample_rate == 4: 227 | down_sample_2 = GxlConv1dSubsampling4(ndim, ndim) 228 | elif downsample_rate == 8: 229 | down_sample_2 = GxlConv1dSubsampling8(ndim, ndim) 230 | elif downsample_rate == 6: 231 | down_sample_2 = GxlConv1dSubsampling6(ndim, ndim) 232 | return down_sample_2 -------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/init_llmasr.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | 5 | from wenet.llm_asr.llmasr_model import LLMASR_Model 6 | from wenet.transformer.cmvn import GlobalCMVN 7 | from wenet.utils.checkpoint import load_checkpoint, load_trained_modules 8 | from wenet.utils.cmvn import load_cmvn 9 | 10 | from gxl_ai_utils.utils import utils_file 11 | 12 | def init_llmasr(args, configs, is_inference=False): 13 | llm_path = configs["llm_path"] 14 | lora = configs["use_lora"] 15 | lora_alpha = configs["lora_alpha"] 16 | lora_rank = configs["lora_rank"] 17 | lora_dropout = configs["lora_dropout"] 18 | # prompt_pattern = configs['prompt_pattern'] 19 | 20 | encoder_output_dim = -1 21 | if configs['encoder'] == 'transformer': 22 | if configs.get('cmvn', None) == 'global_cmvn': 23 | mean, istd = load_cmvn(configs['cmvn_conf']['cmvn_file'], 24 | configs['cmvn_conf']['is_json_cmvn']) 25 | global_cmvn = GlobalCMVN( 26 | torch.from_numpy(mean).float(), 27 | torch.from_numpy(istd).float()) 28 | else: 29 | global_cmvn = None 30 | encoder_type = configs.get('encoder', 'conformer') 31 | input_dim = configs['input_dim'] 32 | from wenet.utils.init_model import WENET_ENCODER_CLASSES 33 | encoder = WENET_ENCODER_CLASSES[encoder_type]( 34 | input_dim, 35 | global_cmvn=global_cmvn, 36 | **configs['encoder_conf'], 37 | **configs['encoder_conf']['efficient_conf'] 38 | if 'efficient_conf' in configs['encoder_conf'] else {}) 39 | encoder_output_dim = configs['encoder_conf']['output_size'] 40 | elif configs['encoder'] == 'whisper': 41 | raise NotImplementedError('whisper 还没实现') 42 | elif configs['encoder'] == 'hubert': 43 | raise NotImplementedError('hubert 还没实现') 44 | else: 45 | encoder = None 46 | logging.info(f'encoder output dim:{encoder_output_dim}') 47 | 48 | 49 | # encoder = encoder.to(torch.float16) 50 | speech_token_num = configs.get('speech_token_num', 0) 51 | train_speech_out = speech_token_num != 0 52 | 53 | model = LLMASR_Model( 54 | encoder=encoder, 55 | encoder_output_dim=encoder_output_dim, 56 | llm_path=llm_path, 57 | lora=lora, 58 | lora_alpha=lora_alpha, 59 | lora_rank=lora_rank, 60 | lora_dropout=lora_dropout, 61 | is_inference=is_inference, 62 | downsample_rate=configs.get('downsample_rate',1), 63 | adapter_type=configs.get('adapter_type', 'lyz'), 64 | speech_token_num=speech_token_num, 65 | train_speech_out=train_speech_out, 66 | ) 67 | 68 | utils_file.print_model_size(model.encoder) 69 | utils_file.print_model_size(model.llama_model) 70 | # utils_file.print_model_size(model.speech_transformer) 71 | # utils_file.print_model_size(model.speech_llama_proj) 72 | 73 | logging.info(f'开始加载初始化模型') 74 | if hasattr(args, 'checkpoint') and args.checkpoint is not None: 75 | logging.info(f'设置了初始化模型位置,开始加载,参数文件位置:{args.checkpoint}') 76 | infos = load_checkpoint(model, args.checkpoint) 77 | elif hasattr(args, 'checkpoint') and args.enc_init is not None: 78 | infos = load_trained_modules(model, args) 79 | else: 80 | infos = {} 81 | 82 | if configs.get('init_step', False): 83 | infos = {} 84 | configs["init_infos"] = infos 85 | print(configs) 86 | logging.info('加载初始化模型完毕') 87 | 88 | if not is_inference: 89 | logging.info('不更换LLM的参数') 90 | else: 91 | logging.info(' 不更换LLM的参数') 92 | 93 | logging.info('开始选择性冻结模块') 94 | fire_module = configs.get("fire_module", None) 95 | if fire_module is None: 96 | logging.info('没有选择解冻的模块,也就是没有训练参数,直接报错返回') 97 | raise ValueError('没有选择解冻的模块,也就是没有训练参数,直接报错返回') 98 | for k, p in model.named_parameters(): 99 | if fire_module == 
-------------------------------------------------------------------------------- /C2SER-llm/wenet/llm_asr/utils4llmasr.py: --------------------------------------------------------------------------------
1 | import random
2 | from typing import Tuple
3 | 
4 | import torch
5 | 
6 | from wenet.utils.common import pad_list
7 | from gxl_ai_utils.utils import utils_file
8 | 
9 | 
10 | def add_sos_eos4speech_llm(ys_pad: torch.Tensor, sos: int, eos: int,
11 |                            ignore_id: int) -> Tuple[torch.Tensor, torch.Tensor]:
12 |     """Add <sos> and <eos> labels.
13 |     Append an <eos> to ys_out; ys_in is kept basically unchanged.
14 | 
15 |     Args:
16 |         ys_pad (torch.Tensor): batch of padded target sequences (B, Lmax)
17 |         sos (int): index of <sos>
18 |         eos (int): index of <eos>
19 |         ignore_id (int): index of padding
20 | 
21 |     Returns:
22 |         ys_in (torch.Tensor) : (B, Lmax)
23 |         ys_out (torch.Tensor) : (B, Lmax + 1)
24 | 
25 |     Examples:
26 |         >>> sos_id = 10
27 |         >>> eos_id = 11
28 |         >>> ignore_id = -1
29 |         >>> ys_pad
30 |         tensor([[ 1,  2,  3,  4,  5],
31 |                 [ 4,  5,  6, -1, -1],
32 |                 [ 7,  8,  9, -1, -1]], dtype=torch.int32)
33 |         >>> ys_in, ys_out = add_sos_eos4speech_llm(ys_pad, sos_id, eos_id, ignore_id)
34 |         >>> ys_in
35 |         tensor([[ 1,  2,  3,  4,  5],
36 |                 [ 4,  5,  6, 11, 11],
37 |                 [ 7,  8,  9, 11, 11]])
38 |         >>> ys_out
39 |         tensor([[ 1,  2,  3,  4,  5, 11],
40 |                 [ 4,  5,  6, 11, -1, -1],
41 |                 [ 7,  8,  9, 11, -1, -1]])
42 |     """
43 |     _sos = torch.tensor([sos],
44 |                         dtype=torch.long,
45 |                         requires_grad=False,
46 |                         device=ys_pad.device)
47 |     _eos = torch.tensor([eos],
48 |                         dtype=torch.long,
49 |                         requires_grad=False,
50 |                         device=ys_pad.device)
51 |     ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
52 |     # ys_in = [torch.cat([_sos, y], dim=0) for y in ys]
53 |     ys_in = [y for y in ys]
54 |     ys_out = [torch.cat([y, _eos], dim=0) for y in ys]
55 |     return pad_list(ys_in, eos), pad_list(ys_out, ignore_id)
56 | 
57 | global_prompt_dict = None
58 | def get_prompt_by_task(task_name):
59 |     """
60 |     Return a prompt for the given task, sampled at random from the prompts configured for it so the wording stays varied.
61 |     Args:
62 |         task_name: key into conf/prompt.yaml
63 | 
64 |     Returns:
65 |         a randomly selected prompt string for the task
66 |     """
67 |     global global_prompt_dict
68 |     if global_prompt_dict is None:
69 |         global_prompt_dict = utils_file.load_dict_from_yaml('conf/prompt.yaml')
70 |     random_index = random.randint(0, len(global_prompt_dict[task_name])-1)
71 |     return global_prompt_dict[task_name][random_index]
72 | 
-------------------------------------------------------------------------------- /C2SER-llm/wenet/paraformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/paraformer/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/paraformer/__pycache__/__init__.cpython-39.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/paraformer/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/paraformer/__pycache__/embedding.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/paraformer/__pycache__/embedding.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/paraformer/__pycache__/search.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/paraformer/__pycache__/search.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/paraformer/embedding.py: -------------------------------------------------------------------------------- 1 | from wenet.transformer.embedding import WhisperPositionalEncoding 2 | 3 | 4 | class ParaformerPositinoalEncoding(WhisperPositionalEncoding): 5 | """ Sinusoids position encoding used in paraformer.encoder 6 | """ 7 | 8 | def __init__(self, 9 | depth: int, 10 | d_model: int, 11 | dropout_rate: float = 0.1, 12 | max_len: int = 1500): 13 | super().__init__(depth, dropout_rate, max_len) 14 | self.xscale = d_model**0.5 15 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/squeezeformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/squeezeformer/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/squeezeformer/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/squeezeformer/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/squeezeformer/__pycache__/conv2d.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/squeezeformer/__pycache__/conv2d.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/squeezeformer/__pycache__/subsampling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/squeezeformer/__pycache__/subsampling.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/squeezeformer/conv2d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Ximalaya Inc. (authors: Yuguang Yang) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Conv2d Module with Valid Padding""" 15 | 16 | import torch.nn.functional as F 17 | from torch.nn.modules.conv import _ConvNd, _size_2_t, Union, _pair, Tensor, Optional 18 | 19 | 20 | class Conv2dValid(_ConvNd): 21 | """ 22 | Conv2d operator for VALID mode padding. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | in_channels: int, 28 | out_channels: int, 29 | kernel_size: _size_2_t, 30 | stride: _size_2_t = 1, 31 | padding: Union[str, _size_2_t] = 0, 32 | dilation: _size_2_t = 1, 33 | groups: int = 1, 34 | bias: bool = True, 35 | padding_mode: str = 'zeros', # TODO: refine this type 36 | device=None, 37 | dtype=None, 38 | valid_trigx: bool = False, 39 | valid_trigy: bool = False) -> None: 40 | factory_kwargs = {'device': device, 'dtype': dtype} 41 | kernel_size_ = _pair(kernel_size) 42 | stride_ = _pair(stride) 43 | padding_ = padding if isinstance(padding, str) else _pair(padding) 44 | dilation_ = _pair(dilation) 45 | super(Conv2dValid, 46 | self).__init__(in_channels, out_channels, 47 | kernel_size_, stride_, padding_, dilation_, False, 48 | _pair(0), groups, bias, padding_mode, 49 | **factory_kwargs) 50 | self.valid_trigx = valid_trigx 51 | self.valid_trigy = valid_trigy 52 | 53 | def _conv_forward(self, input: Tensor, weight: Tensor, 54 | bias: Optional[Tensor]): 55 | validx, validy = 0, 0 56 | if self.valid_trigx: 57 | validx = (input.size(-2) * 58 | (self.stride[-2] - 1) - 1 + self.kernel_size[-2]) // 2 59 | if self.valid_trigy: 60 | validy = (input.size(-1) * 61 | (self.stride[-1] - 1) - 1 + self.kernel_size[-1]) // 2 62 | return F.conv2d(input, weight, bias, self.stride, (validx, validy), 63 | self.dilation, self.groups) 64 | 65 | def forward(self, input: Tensor) -> Tensor: 66 | return self._conv_forward(input, self.weight, self.bias) 67 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/base_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/base_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/base_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/base_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/base_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/base_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/bpe_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/bpe_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/bpe_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/bpe_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/bpe_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/bpe_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/char_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/char_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/char_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/char_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/char_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/char_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/hugging_face_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/hugging_face_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/hugging_face_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/hugging_face_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/hugging_face_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/hugging_face_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/paraformer_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/paraformer_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/paraformer_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/paraformer_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/paraformer_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/paraformer_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/tokenize_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/tokenize_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/tokenize_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/tokenize_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/tokenize_utils.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/tokenize_utils.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/whisper_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/whisper_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/whisper_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/whisper_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/__pycache__/whisper_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/text/__pycache__/whisper_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/base_tokenizer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod, abstractproperty 2 | from typing import Dict, List, Tuple, Union 3 | 4 | T = Union[str, bytes] 5 | 6 | 7 | class BaseTokenizer(ABC): 8 | 9 | def tokenize(self, line: str) -> Tuple[List[T], List[int]]: 10 | tokens = self.text2tokens(line) 11 | ids = self.tokens2ids(tokens) 12 | return tokens, ids 13 | 14 | def detokenize(self, ids: List[int]) -> Tuple[str, List[T]]: 15 | tokens = self.ids2tokens(ids) 16 | text = self.tokens2text(tokens) 17 | return text, tokens 18 | 19 | @abstractmethod 20 | def text2tokens(self, line: str) -> List[T]: 21 | raise NotImplementedError("abstract method") 22 | 23 | @abstractmethod 24 | def tokens2text(self, tokens: List[T]) -> str: 25 | raise NotImplementedError("abstract method") 26 | 27 | @abstractmethod 28 | def tokens2ids(self, tokens: List[T]) -> List[int]: 29 | raise NotImplementedError("abstract method") 30 | 31 | @abstractmethod 32 | def ids2tokens(self, ids: List[int]) -> List[T]: 33 | raise NotImplementedError("abstract method") 34 | 35 | @abstractmethod 36 | def vocab_size(self) -> int: 37 | raise NotImplementedError("abstract method") 38 | 39 | @abstractproperty 40 | def symbol_table(self) -> Dict[T, int]: 41 | raise NotImplementedError("abstract method") 42 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/bpe_tokenizer.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, List, Optional, Union 3 | from wenet.text.char_tokenizer import CharTokenizer 4 | from wenet.text.tokenize_utils import tokenize_by_bpe_model 5 | 6 | 7 | class BpeTokenizer(CharTokenizer): 8 | 9 | def __init__( 10 | self, 11 | bpe_model: Union[PathLike, str], 12 | symbol_table: Union[str, PathLike, Dict], 13 | non_lang_syms: Optional[Union[str, PathLike, List]] = None, 14 | split_with_space: bool = False, 15 | connect_symbol: str = '', 16 | unk='', 17 | ) -> None: 18 | super().__init__(symbol_table, non_lang_syms, 
split_with_space, 19 | connect_symbol, unk) 20 | self._model = bpe_model 21 | # NOTE(Mddct): multiprocessing.Process() issues 22 | # don't build sp here 23 | self.bpe_model = None 24 | 25 | def _build_sp(self): 26 | if self.bpe_model is None: 27 | import sentencepiece as spm 28 | self.bpe_model = spm.SentencePieceProcessor() 29 | self.bpe_model.load(self._model) 30 | 31 | def text2tokens(self, line: str) -> List[str]: 32 | self._build_sp() 33 | line = line.strip() 34 | if self.non_lang_syms_pattern is not None: 35 | parts = self.non_lang_syms_pattern.split(line.upper()) 36 | parts = [w for w in parts if len(w.strip()) > 0] 37 | else: 38 | parts = [line] 39 | 40 | tokens = [] 41 | for part in parts: 42 | if part in self.non_lang_syms: 43 | tokens.append(part) 44 | else: 45 | tokens.extend(tokenize_by_bpe_model(self.bpe_model, part)) 46 | return tokens 47 | 48 | def tokens2text(self, tokens: List[str]) -> str: 49 | self._build_sp() 50 | text = super().tokens2text(tokens) 51 | return text.replace("▁", ' ').strip() 52 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/char_tokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from os import PathLike 4 | from typing import Dict, List, Optional, Union 5 | from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols 6 | from wenet.text.base_tokenizer import BaseTokenizer 7 | 8 | 9 | class CharTokenizer(BaseTokenizer): 10 | 11 | def __init__( 12 | self, 13 | symbol_table: Union[str, PathLike, Dict], 14 | non_lang_syms: Optional[Union[str, PathLike, List]] = None, 15 | split_with_space: bool = False, 16 | connect_symbol: str = '', 17 | unk='', 18 | ) -> None: 19 | self.non_lang_syms_pattern = None 20 | if non_lang_syms is not None: 21 | self.non_lang_syms_pattern = re.compile( 22 | r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})") 23 | if not isinstance(symbol_table, Dict): 24 | self._symbol_table = read_symbol_table(symbol_table) 25 | else: 26 | # symbol_table = {"我": 1, "是": 2, "{NOISE}": 3} 27 | self._symbol_table = symbol_table 28 | if not isinstance(non_lang_syms, List): 29 | self.non_lang_syms = read_non_lang_symbols(non_lang_syms) 30 | else: 31 | # non_lang_syms=["{NOISE}"] 32 | self.non_lang_syms = non_lang_syms 33 | self.char_dict = {v: k for k, v in self._symbol_table.items()} 34 | self.split_with_space = split_with_space 35 | self.connect_symbol = connect_symbol 36 | self.unk = unk 37 | 38 | def text2tokens(self, line: str) -> List[str]: 39 | line = line.strip() 40 | if self.non_lang_syms_pattern is not None: 41 | parts = self.non_lang_syms_pattern.split(line.upper()) 42 | parts = [w for w in parts if len(w.strip()) > 0] 43 | else: 44 | parts = [line] 45 | 46 | tokens = [] 47 | for part in parts: 48 | if part in self.non_lang_syms: 49 | tokens.append(part) 50 | else: 51 | if self.split_with_space: 52 | part = part.split(" ") 53 | for ch in part: 54 | if ch == ' ': 55 | ch = "▁" 56 | tokens.append(ch) 57 | return tokens 58 | 59 | def tokens2text(self, tokens: List[str]) -> str: 60 | return self.connect_symbol.join(tokens) 61 | 62 | def tokens2ids(self, tokens: List[str]) -> List[int]: 63 | ids = [] 64 | for ch in tokens: 65 | if ch in self._symbol_table: 66 | ids.append(self._symbol_table[ch]) 67 | elif self.unk in self._symbol_table: 68 | ids.append(self._symbol_table[self.unk]) 69 | return ids 70 | 71 | def ids2tokens(self, ids: List[int]) -> List[str]: 72 | content = [self.char_dict[w] for w in ids] 73 | return content 
74 | 75 | def vocab_size(self) -> int: 76 | return len(self.char_dict) 77 | 78 | @property 79 | def symbol_table(self) -> Dict[str, int]: 80 | return self._symbol_table 81 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/hugging_face_tokenizer.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, List, Union 3 | from wenet.text.base_tokenizer import BaseTokenizer, T as Type 4 | 5 | 6 | class HuggingFaceTokenizer(BaseTokenizer): 7 | 8 | def __init__(self, model: Union[str, PathLike], *args, **kwargs) -> None: 9 | # NOTE(Mddct): don't build here, pickle issues 10 | self.model = model 11 | self.tokenizer = None 12 | 13 | self.args = args 14 | self.kwargs = kwargs 15 | 16 | def __getstate__(self): 17 | state = self.__dict__.copy() 18 | del state['tokenizer'] 19 | return state 20 | 21 | def __setstate__(self, state): 22 | self.__dict__.update(state) 23 | recovery = {'tokenizer': None} 24 | self.__dict__.update(recovery) 25 | 26 | def _build_hugging_face(self): 27 | from transformers import AutoTokenizer 28 | if self.tokenizer is None: 29 | self.tokenizer = AutoTokenizer.from_pretrained( 30 | self.model, **self.kwargs) 31 | self.t2i = self.tokenizer.get_vocab() 32 | 33 | def text2tokens(self, line: str) -> List[Type]: 34 | self._build_hugging_face() 35 | return self.tokenizer.tokenize(line) 36 | 37 | def tokens2text(self, tokens: List[Type]) -> str: 38 | self._build_hugging_face() 39 | ids = self.tokens2ids(tokens) 40 | return self.tokenizer.decode(ids) 41 | 42 | def tokens2ids(self, tokens: List[Type]) -> List[int]: 43 | self._build_hugging_face() 44 | return self.tokenizer.convert_tokens_to_ids(tokens) 45 | 46 | def ids2tokens(self, ids: List[int]) -> List[Type]: 47 | self._build_hugging_face() 48 | return self.tokenizer.convert_ids_to_tokens(ids) 49 | 50 | def vocab_size(self) -> int: 51 | self._build_hugging_face() 52 | # TODO: we need special tokenize size in future 53 | return len(self.tokenizer) 54 | 55 | @property 56 | def symbol_table(self) -> Dict[Type, int]: 57 | self._build_hugging_face() 58 | return self.t2i 59 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/paraformer_tokenizer.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, List, Optional, Union 3 | from wenet.paraformer.search import paraformer_beautify_result 4 | from wenet.text.char_tokenizer import CharTokenizer 5 | from wenet.text.tokenize_utils import tokenize_by_seg_dict 6 | 7 | 8 | def read_seg_dict(path): 9 | seg_table = {} 10 | with open(path, 'r', encoding='utf8') as fin: 11 | for line in fin: 12 | arr = line.strip().split('\t') 13 | assert len(arr) == 2 14 | seg_table[arr[0]] = arr[1] 15 | return seg_table 16 | 17 | 18 | class ParaformerTokenizer(CharTokenizer): 19 | 20 | def __init__(self, 21 | symbol_table: Union[str, PathLike, Dict], 22 | seg_dict: Optional[Union[str, PathLike, Dict]] = None, 23 | split_with_space: bool = False, 24 | connect_symbol: str = '', 25 | unk='') -> None: 26 | super().__init__(symbol_table, None, split_with_space, connect_symbol, 27 | unk) 28 | self.seg_dict = seg_dict 29 | if seg_dict is not None and not isinstance(seg_dict, Dict): 30 | self.seg_dict = read_seg_dict(seg_dict) 31 | 32 | def text2tokens(self, line: str) -> List[str]: 33 | assert self.seg_dict is not None 34 | 35 | # TODO(Mddct): 
duplicated here, refine later 36 | line = line.strip() 37 | if self.non_lang_syms_pattern is not None: 38 | parts = self.non_lang_syms_pattern.split(line) 39 | parts = [w for w in parts if len(w.strip()) > 0] 40 | else: 41 | parts = [line] 42 | 43 | tokens = [] 44 | for part in parts: 45 | if part in self.non_lang_syms: 46 | tokens.append(part) 47 | else: 48 | tokens.extend(tokenize_by_seg_dict(self.seg_dict, part)) 49 | return tokens 50 | 51 | def tokens2text(self, tokens: List[str]) -> str: 52 | return paraformer_beautify_result(tokens) 53 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/tokenize_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 2023 Tsinghua Univ. (authors: Xingchen Song) 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import re 16 | 17 | 18 | def tokenize_by_bpe_model(sp, txt): 19 | return _tokenize_by_seg_dic_or_bpe_model(txt, sp=sp, upper=True) 20 | 21 | 22 | def tokenize_by_seg_dict(seg_dict, txt): 23 | return _tokenize_by_seg_dic_or_bpe_model(txt, 24 | seg_dict=seg_dict, 25 | upper=False) 26 | 27 | 28 | def _tokenize_by_seg_dic_or_bpe_model( 29 | txt, 30 | sp=None, 31 | seg_dict=None, 32 | upper=True, 33 | ): 34 | if sp is None: 35 | assert seg_dict is not None 36 | if seg_dict is None: 37 | assert sp is not None 38 | tokens = [] 39 | # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: 40 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 41 | pattern = re.compile(r'([\u4e00-\u9fff])') 42 | # Example: 43 | # txt = "你好 ITS'S OKAY 的" 44 | # chars = ["你", "好", " ITS'S OKAY ", "的"] 45 | chars = pattern.split(txt.upper() if upper else txt) 46 | mix_chars = [w for w in chars if len(w.strip()) > 0] 47 | for ch_or_w in mix_chars: 48 | # ch_or_w is a single CJK charater(i.e., "你"), do nothing. 49 | if pattern.fullmatch(ch_or_w) is not None: 50 | tokens.append(ch_or_w) 51 | # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), 52 | # encode ch_or_w using bpe_model. 
53 | else: 54 | if sp is not None: 55 | for p in sp.encode_as_pieces(ch_or_w): 56 | tokens.append(p) 57 | else: 58 | for en_token in ch_or_w.split(): 59 | en_token = en_token.strip() 60 | if en_token in seg_dict: 61 | tokens.extend(seg_dict[en_token].split(' ')) 62 | else: 63 | tokens.append(en_token) 64 | 65 | return tokens 66 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/text/whisper_tokenizer.py: -------------------------------------------------------------------------------- 1 | from os import PathLike 2 | from typing import Dict, List, Optional, Tuple, Union 3 | from wenet.text.base_tokenizer import BaseTokenizer 4 | 5 | from wenet.utils.file_utils import read_non_lang_symbols 6 | 7 | 8 | class WhisperTokenizer(BaseTokenizer): 9 | 10 | def __init__( 11 | self, 12 | multilingual: bool, 13 | num_languages: int = 99, 14 | language: Optional[str] = None, 15 | task: Optional[str] = None, 16 | non_lang_syms: Optional[Union[str, PathLike, List]] = None, 17 | *args, 18 | **kwargs, 19 | ) -> None: 20 | # NOTE(Mddct): don't build here, pickle issues 21 | self.tokenizer = None 22 | # TODO: we don't need this in future 23 | self.multilingual = multilingual 24 | self.num_languages = num_languages 25 | self.language = language 26 | self.task = task 27 | 28 | if not isinstance(non_lang_syms, List): 29 | self.non_lang_syms = read_non_lang_symbols(non_lang_syms) 30 | else: 31 | # non_lang_syms=["{NOISE}"] 32 | self.non_lang_syms = non_lang_syms 33 | # TODO(Mddct): add special tokens, like non_lang_syms 34 | del self.non_lang_syms 35 | 36 | def __getstate__(self): 37 | state = self.__dict__.copy() 38 | del state['tokenizer'] 39 | return state 40 | 41 | def __setstate__(self, state): 42 | self.__dict__.update(state) 43 | recovery = {'tokenizer': None} 44 | self.__dict__.update(recovery) 45 | 46 | def _build_tiktoken(self): 47 | if self.tokenizer is None: 48 | from whisper.tokenizer import get_tokenizer 49 | self.tokenizer = get_tokenizer(multilingual=self.multilingual, 50 | num_languages=self.num_languages, 51 | language=self.language, 52 | task=self.task) 53 | self.t2i = {} 54 | self.i2t = {} 55 | for i in range(self.tokenizer.encoding.n_vocab): 56 | unit = str( 57 | self.tokenizer.encoding.decode_single_token_bytes(i)) 58 | if len(unit) == 0: 59 | unit = str(i) 60 | unit = unit.replace(" ", "") 61 | # unit = bytes(unit, 'utf-8') 62 | self.t2i[unit] = i 63 | self.i2t[i] = unit 64 | assert len(self.t2i) == len(self.i2t) 65 | 66 | def tokenize(self, line: str) -> Tuple[List[str], List[int]]: 67 | self._build_tiktoken() 68 | ids = self.tokenizer.encoding.encode(line) 69 | text = [self.i2t[d] for d in ids] 70 | return text, ids 71 | 72 | def detokenize(self, ids: List[int]) -> Tuple[str, List[str]]: 73 | self._build_tiktoken() 74 | tokens = [self.i2t[d] for d in ids] 75 | text = self.tokenizer.encoding.decode(ids) 76 | return text, tokens 77 | 78 | def text2tokens(self, line: str) -> List[str]: 79 | self._build_tiktoken() 80 | return self.tokenize(line)[0] 81 | 82 | def tokens2text(self, tokens: List[str]) -> str: 83 | self._build_tiktoken() 84 | ids = [self.t2i[t] for t in tokens] 85 | return self.detokenize(ids)[0] 86 | 87 | def tokens2ids(self, tokens: List[str]) -> List[int]: 88 | self._build_tiktoken() 89 | ids = [self.t2i[t] for t in tokens] 90 | return ids 91 | 92 | def ids2tokens(self, ids: List[int]) -> List[str]: 93 | self._build_tiktoken() 94 | return [self.tokenizer.encoding.decode([id]) for id in ids] 95 | 96 | def vocab_size(self) -> int: 
97 | self._build_tiktoken() 98 | return len(self.t2i) 99 | 100 | @property 101 | def symbol_table(self) -> Dict[str, int]: 102 | self._build_tiktoken() 103 | return self.t2i 104 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/asr_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/asr_model.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/asr_model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/asr_model.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/asr_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/asr_model.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/attention.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/attention.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/attention.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/attention.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/attention.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/cmvn.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/cmvn.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/cmvn.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/cmvn.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/cmvn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/cmvn.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/convolution.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/convolution.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/convolution.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/convolution.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/convolution.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/convolution.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/ctc.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/ctc.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/ctc.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/ctc.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/ctc.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/ctc.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/decoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/decoder.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/decoder.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/decoder.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/decoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/decoder.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/decoder_layer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/decoder_layer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/decoder_layer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/decoder_layer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/decoder_layer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/decoder_layer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/embedding.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/embedding.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/embedding.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/embedding.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/embedding.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/embedding.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/encoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/encoder.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/encoder.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/encoder.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/encoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/encoder.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/encoder_layer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/encoder_layer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/encoder_layer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/encoder_layer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/encoder_layer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/encoder_layer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/label_smoothing_loss.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/label_smoothing_loss.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/label_smoothing_loss.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/label_smoothing_loss.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/label_smoothing_loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/label_smoothing_loss.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/norm.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/norm.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/norm.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/norm.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/norm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/norm.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/positionwise_feed_forward.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/search.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/search.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/search.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/search.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/search.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/search.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/subsampling.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/subsampling.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/subsampling.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/subsampling.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/subsampling.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/subsampling.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/swish.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/swish.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/swish.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/swish.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/__pycache__/swish.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/transformer/__pycache__/swish.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/cmvn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | 18 | class GlobalCMVN(torch.nn.Module): 19 | 20 | def __init__(self, 21 | mean: torch.Tensor, 22 | istd: torch.Tensor, 23 | norm_var: bool = True): 24 | """ 25 | Args: 26 | mean (torch.Tensor): mean stats 27 | istd (torch.Tensor): inverse std, std which is 1.0 / std 28 | """ 29 | super().__init__() 30 | assert mean.shape == istd.shape 31 | self.norm_var = norm_var 32 | # The buffer can be accessed from this module using self.mean 33 | self.register_buffer("mean", mean) 34 | self.register_buffer("istd", istd) 35 | 36 | def forward(self, x: torch.Tensor): 37 | """ 38 | Args: 39 | x (torch.Tensor): (batch, max_len, feat_dim) 40 | 41 | Returns: 42 | (torch.Tensor): normalized feature 43 | """ 44 | x = x - self.mean 45 | if self.norm_var: 46 | x = x * self.istd 47 | return x 48 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/convolution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Modified from ESPnet(https://github.com/espnet/espnet) 15 | """ConvolutionModule definition.""" 16 | 17 | from typing import Tuple 18 | 19 | import torch 20 | from torch import nn 21 | 22 | from wenet.utils.class_utils import WENET_NORM_CLASSES 23 | 24 | 25 | class ConvolutionModule(nn.Module): 26 | """ConvolutionModule in Conformer model.""" 27 | 28 | def __init__( 29 | self, 30 | channels: int, 31 | kernel_size: int = 15, 32 | activation: nn.Module = nn.ReLU(), 33 | norm: str = "batch_norm", 34 | causal: bool = False, 35 | bias: bool = True, 36 | norm_eps: float = 1e-5, 37 | ): 38 | """Construct an ConvolutionModule object. 39 | Args: 40 | channels (int): The number of channels of conv layers. 41 | kernel_size (int): Kernel size of conv layers. 42 | causal (int): Whether use causal convolution or not 43 | """ 44 | super().__init__() 45 | 46 | self.pointwise_conv1 = nn.Conv1d( 47 | channels, 48 | 2 * channels, 49 | kernel_size=1, 50 | stride=1, 51 | padding=0, 52 | bias=bias, 53 | ) 54 | # self.lorder is used to distinguish if it's a causal convolution, 55 | # if self.lorder > 0: it's a causal convolution, the input will be 56 | # padded with self.lorder frames on the left in forward. 
57 | # else: it's a symmetrical convolution 58 | if causal: 59 | padding = 0 60 | self.lorder = kernel_size - 1 61 | else: 62 | # kernel_size should be an odd number for none causal convolution 63 | assert (kernel_size - 1) % 2 == 0 64 | padding = (kernel_size - 1) // 2 65 | self.lorder = 0 66 | self.depthwise_conv = nn.Conv1d( 67 | channels, 68 | channels, 69 | kernel_size, 70 | stride=1, 71 | padding=padding, 72 | groups=channels, 73 | bias=bias, 74 | ) 75 | 76 | assert norm in ['batch_norm', 'layer_norm', 'rms_norm'] 77 | if norm == "batch_norm": 78 | self.use_layer_norm = False 79 | self.norm = WENET_NORM_CLASSES['batch_norm'](channels, 80 | eps=norm_eps) 81 | else: 82 | self.use_layer_norm = True 83 | self.norm = WENET_NORM_CLASSES[norm](channels, eps=norm_eps) 84 | 85 | self.pointwise_conv2 = nn.Conv1d( 86 | channels, 87 | channels, 88 | kernel_size=1, 89 | stride=1, 90 | padding=0, 91 | bias=bias, 92 | ) 93 | self.activation = activation 94 | 95 | def forward( 96 | self, 97 | x: torch.Tensor, 98 | mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), 99 | cache: torch.Tensor = torch.zeros((0, 0, 0)), 100 | ) -> Tuple[torch.Tensor, torch.Tensor]: 101 | """Compute convolution module. 102 | Args: 103 | x (torch.Tensor): Input tensor (#batch, time, channels). 104 | mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), 105 | (0, 0, 0) means fake mask. 106 | cache (torch.Tensor): left context cache, it is only 107 | used in causal convolution (#batch, channels, cache_t), 108 | (0, 0, 0) meas fake cache. 109 | Returns: 110 | torch.Tensor: Output tensor (#batch, time, channels). 111 | """ 112 | # exchange the temporal dimension and the feature dimension 113 | x = x.transpose(1, 2) # (#batch, channels, time) 114 | 115 | # mask batch padding 116 | if mask_pad.size(2) > 0: # time > 0 117 | x.masked_fill_(~mask_pad, 0.0) 118 | 119 | if self.lorder > 0: 120 | if cache.size(2) == 0: # cache_t == 0 121 | x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) 122 | else: 123 | assert cache.size(0) == x.size(0) # equal batch 124 | assert cache.size(1) == x.size(1) # equal channel 125 | x = torch.cat((cache, x), dim=2) 126 | assert (x.size(2) > self.lorder) 127 | new_cache = x[:, :, -self.lorder:] 128 | else: 129 | # It's better we just return None if no cache is required, 130 | # However, for JIT export, here we just fake one tensor instead of 131 | # None. 132 | new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) 133 | 134 | # GLU mechanism 135 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 136 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 137 | 138 | # 1D Depthwise Conv 139 | x = self.depthwise_conv(x) 140 | if self.use_layer_norm: 141 | x = x.transpose(1, 2) 142 | x = self.activation(self.norm(x)) 143 | if self.use_layer_norm: 144 | x = x.transpose(1, 2) 145 | x = self.pointwise_conv2(x) 146 | # mask batch padding 147 | if mask_pad.size(2) > 0: # time > 0 148 | x.masked_fill_(~mask_pad, 0.0) 149 | 150 | return x.transpose(1, 2), new_cache 151 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/ctc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
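A shape-only sketch, not part of this repo, of the ConvolutionModule defined in convolution.py above, in its non-causal configuration; the channel count and sequence length are illustrative.

import torch
from wenet.transformer.convolution import ConvolutionModule

conv = ConvolutionModule(channels=256, kernel_size=15, causal=False)
x = torch.randn(2, 50, 256)                    # (batch, time, channels)
mask = torch.ones(2, 1, 50, dtype=torch.bool)  # (batch, 1, time) padding mask
y, new_cache = conv(x, mask_pad=mask)          # y: (2, 50, 256); cache is a dummy tensor when non-causal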
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Modified from ESPnet(https://github.com/espnet/espnet) 15 | 16 | from typing import Tuple 17 | 18 | import torch 19 | import torch.nn.functional as F 20 | 21 | 22 | class CTC(torch.nn.Module): 23 | """CTC module""" 24 | 25 | def __init__( 26 | self, 27 | odim: int, 28 | encoder_output_size: int, 29 | dropout_rate: float = 0.0, 30 | reduce: bool = True, 31 | blank_id: int = 0, 32 | ): 33 | """ Construct CTC module 34 | Args: 35 | odim: dimension of outputs 36 | encoder_output_size: number of encoder projection units 37 | dropout_rate: dropout rate (0.0 ~ 1.0) 38 | reduce: reduce the CTC loss into a scalar 39 | blank_id: blank label. 40 | """ 41 | super().__init__() 42 | eprojs = encoder_output_size 43 | self.dropout_rate = dropout_rate 44 | self.ctc_lo = torch.nn.Linear(eprojs, odim) 45 | 46 | reduction_type = "sum" if reduce else "none" 47 | self.ctc_loss = torch.nn.CTCLoss(blank=blank_id, 48 | reduction=reduction_type, 49 | zero_infinity=True) 50 | 51 | def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor, 52 | ys_pad: torch.Tensor, 53 | ys_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 54 | """Calculate CTC loss. 55 | 56 | Args: 57 | hs_pad: batch of padded hidden state sequences (B, Tmax, D) 58 | hlens: batch of lengths of hidden state sequences (B) 59 | ys_pad: batch of padded character id sequence tensor (B, Lmax) 60 | ys_lens: batch of lengths of character sequence (B) 61 | """ 62 | # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab) 63 | ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate)) 64 | # ys_hat: (B, L, D) -> (L, B, D) 65 | ys_hat = ys_hat.transpose(0, 1) 66 | ys_hat = ys_hat.log_softmax(2) 67 | loss = self.ctc_loss(ys_hat, ys_pad, hlens, ys_lens) 68 | # Batch-size average 69 | loss = loss / ys_hat.size(1) 70 | ys_hat = ys_hat.transpose(0, 1) 71 | return loss, ys_hat 72 | 73 | def log_softmax(self, hs_pad: torch.Tensor) -> torch.Tensor: 74 | """log_softmax of frame activations 75 | 76 | Args: 77 | Tensor hs_pad: 3d tensor (B, Tmax, eprojs) 78 | Returns: 79 | torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim) 80 | """ 81 | return F.log_softmax(self.ctc_lo(hs_pad), dim=2) 82 | 83 | def argmax(self, hs_pad: torch.Tensor) -> torch.Tensor: 84 | """argmax of frame activations 85 | 86 | Args: 87 | torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs) 88 | Returns: 89 | torch.Tensor: argmax applied 2d tensor (B, Tmax) 90 | """ 91 | return torch.argmax(self.ctc_lo(hs_pad), dim=2) 92 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/decoder_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
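A minimal sketch, not part of this repo, of how the CTC module defined in ctc.py above is called; the vocabulary size, encoder dimension and lengths are placeholders.

import torch
from wenet.transformer.ctc import CTC

ctc = CTC(odim=5000, encoder_output_size=256)
hs_pad = torch.randn(2, 120, 256)                   # encoder output (B, Tmax, D)
hlens = torch.tensor([120, 90])                     # valid encoder lengths
ys_pad = torch.randint(1, 5000, (2, 30))            # padded label ids (B, Lmax); 0 is the blank
ys_lens = torch.tensor([30, 25])                    # valid label lengths
loss, ys_hat = ctc(hs_pad, hlens, ys_pad, ys_lens)  # batch-averaged loss and (B, Tmax, odim) log-probs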
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Decoder self-attention layer definition.""" 16 | from typing import Dict, Optional, Tuple 17 | 18 | import torch 19 | from torch import nn 20 | from wenet.transformer.attention import T_CACHE 21 | 22 | from wenet.utils.class_utils import WENET_NORM_CLASSES 23 | 24 | 25 | class DecoderLayer(nn.Module): 26 | """Single decoder layer module. 27 | 28 | Args: 29 | size (int): Input dimension. 30 | self_attn (torch.nn.Module): Self-attention module instance. 31 | `MultiHeadedAttention` instance can be used as the argument. 32 | src_attn (torch.nn.Module): Inter-attention module instance. 33 | `MultiHeadedAttention` instance can be used as the argument. 34 | If `None` is passed, Inter-attention is not used, such as 35 | CIF, GPT, and other decoder only model. 36 | feed_forward (torch.nn.Module): Feed-forward module instance. 37 | `PositionwiseFeedForward` instance can be used as the argument. 38 | dropout_rate (float): Dropout rate. 39 | normalize_before (bool): 40 | True: use layer_norm before each sub-block. 41 | False: to use layer_norm after each sub-block. 42 | """ 43 | 44 | def __init__( 45 | self, 46 | size: int, 47 | self_attn: nn.Module, 48 | src_attn: Optional[nn.Module], 49 | feed_forward: nn.Module, 50 | dropout_rate: float, 51 | normalize_before: bool = True, 52 | layer_norm_type: str = 'layer_norm', 53 | norm_eps: float = 1e-5, 54 | ): 55 | """Construct an DecoderLayer object.""" 56 | super().__init__() 57 | self.size = size 58 | self.self_attn = self_attn 59 | self.src_attn = src_attn 60 | self.feed_forward = feed_forward 61 | assert layer_norm_type in ['layer_norm', 'rms_norm'] 62 | self.norm1 = WENET_NORM_CLASSES[layer_norm_type](size, eps=norm_eps) 63 | self.norm2 = WENET_NORM_CLASSES[layer_norm_type](size, eps=norm_eps) 64 | self.norm3 = WENET_NORM_CLASSES[layer_norm_type](size, eps=norm_eps) 65 | self.dropout = nn.Dropout(dropout_rate) 66 | self.normalize_before = normalize_before 67 | 68 | def forward( 69 | self, 70 | tgt: torch.Tensor, 71 | tgt_mask: torch.Tensor, 72 | memory: torch.Tensor, 73 | memory_mask: torch.Tensor, 74 | cache: Optional[Dict[str, Optional[T_CACHE]]] = None 75 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 76 | """Compute decoded features. 77 | 78 | Args: 79 | tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size). 80 | tgt_mask (torch.Tensor): Mask for input tensor 81 | (#batch, maxlen_out). 82 | memory (torch.Tensor): Encoded memory 83 | (#batch, maxlen_in, size). 84 | memory_mask (torch.Tensor): Encoded memory mask 85 | (#batch, maxlen_in). 86 | cache (torch.Tensor): cached tensors. 87 | (#batch, maxlen_out - 1, size). 88 | 89 | Returns: 90 | torch.Tensor: Output tensor (#batch, maxlen_out, size). 91 | torch.Tensor: Mask for output tensor (#batch, maxlen_out). 92 | torch.Tensor: Encoded memory (#batch, maxlen_in, size). 93 | torch.Tensor: Encoded memory mask (#batch, maxlen_in). 
94 | 95 | """ 96 | if cache is not None: 97 | att_cache = cache['self_att_cache'] 98 | cross_att_cache = cache['cross_att_cache'] 99 | else: 100 | att_cache, cross_att_cache = None, None 101 | 102 | residual = tgt 103 | if self.normalize_before: 104 | tgt = self.norm1(tgt) 105 | 106 | if att_cache is None: 107 | tgt_q = tgt 108 | tgt_q_mask = tgt_mask 109 | att_cache = (torch.empty(0, 0, 0, 0), torch.empty(0, 0, 0, 0)) 110 | else: 111 | tgt_q = tgt[:, -1:, :] 112 | residual = residual[:, -1:, :] 113 | tgt_q_mask = tgt_mask[:, -1:, :] 114 | 115 | x, new_att_cache = self.self_attn( 116 | tgt_q, 117 | tgt_q, 118 | tgt_q, 119 | tgt_q_mask, 120 | cache=att_cache, 121 | ) 122 | if cache is not None: 123 | cache['self_att_cache'] = new_att_cache 124 | x = residual + self.dropout(x) 125 | if not self.normalize_before: 126 | x = self.norm1(x) 127 | 128 | if self.src_attn is not None: 129 | residual = x 130 | if self.normalize_before: 131 | x = self.norm2(x) 132 | if cross_att_cache is None: 133 | cross_att_cache = (torch.empty(0, 0, 0, 134 | 0), torch.empty(0, 0, 0, 0)) 135 | x, new_cross_cache = self.src_attn(x, 136 | memory, 137 | memory, 138 | memory_mask, 139 | cache=cross_att_cache) 140 | if cache is not None: 141 | cache['cross_att_cache'] = new_cross_cache 142 | x = residual + self.dropout(x) 143 | if not self.normalize_before: 144 | x = self.norm2(x) 145 | 146 | residual = x 147 | if self.normalize_before: 148 | x = self.norm3(x) 149 | x = residual + self.dropout(self.feed_forward(x)) 150 | if not self.normalize_before: 151 | x = self.norm3(x) 152 | 153 | return x, tgt_mask, memory, memory_mask 154 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Label smoothing module.""" 16 | 17 | import torch 18 | from torch import nn 19 | 20 | 21 | class LabelSmoothingLoss(nn.Module): 22 | """Label-smoothing loss. 23 | 24 | In a standard CE loss, the label's data distribution is: 25 | [0,1,2] -> 26 | [ 27 | [1.0, 0.0, 0.0], 28 | [0.0, 1.0, 0.0], 29 | [0.0, 0.0, 1.0], 30 | ] 31 | 32 | In the smoothing version CE Loss,some probabilities 33 | are taken from the true label prob (1.0) and are divided 34 | among other labels. 35 | 36 | e.g. 
37 | smoothing=0.1 38 | [0,1,2] -> 39 | [ 40 | [0.9, 0.05, 0.05], 41 | [0.05, 0.9, 0.05], 42 | [0.05, 0.05, 0.9], 43 | ] 44 | 45 | Args: 46 | size (int): the number of class 47 | padding_idx (int): padding class id which will be ignored for loss 48 | smoothing (float): smoothing rate (0.0 means the conventional CE) 49 | normalize_length (bool): 50 | normalize loss by sequence length if True 51 | normalize loss by batch size if False 52 | """ 53 | 54 | def __init__(self, 55 | size: int, 56 | padding_idx: int, 57 | smoothing: float, 58 | normalize_length: bool = False): 59 | """Construct an LabelSmoothingLoss object.""" 60 | super(LabelSmoothingLoss, self).__init__() 61 | self.criterion = nn.KLDivLoss(reduction="none") 62 | self.padding_idx = padding_idx 63 | self.confidence = 1.0 - smoothing 64 | self.smoothing = smoothing 65 | self.size = size 66 | self.normalize_length = normalize_length 67 | 68 | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 69 | """Compute loss between x and target. 70 | 71 | The model outputs and data labels tensors are flatten to 72 | (batch*seqlen, class) shape and a mask is applied to the 73 | padding part which should not be calculated for loss. 74 | 75 | Args: 76 | x (torch.Tensor): prediction (batch, seqlen, class) 77 | target (torch.Tensor): 78 | target signal masked with self.padding_id (batch, seqlen) 79 | Returns: 80 | loss (torch.Tensor) : The KL loss, scalar float value 81 | """ 82 | assert x.size(2) == self.size 83 | batch_size = x.size(0) 84 | x = x.view(-1, self.size) 85 | target = target.view(-1) 86 | # use zeros_like instead of torch.no_grad() for true_dist, 87 | # since no_grad() can not be exported by JIT 88 | true_dist = torch.zeros_like(x) 89 | true_dist.fill_(self.smoothing / (self.size - 1)) 90 | ignore = target == self.padding_idx # (B,) 91 | total = len(target) - ignore.sum().item() 92 | target = target.masked_fill(ignore, 0) # avoid -1 index 93 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 94 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 95 | denom = total if self.normalize_length else batch_size 96 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 97 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class RMSNorm(torch.nn.Module): 5 | """ https://arxiv.org/pdf/1910.07467.pdf 6 | """ 7 | 8 | def __init__( 9 | self, 10 | dim: int, 11 | eps: float = 1e-6, 12 | add_unit_offset: bool = True, 13 | ): 14 | super().__init__() 15 | self.eps = eps 16 | self.weight = torch.nn.Parameter(torch.ones(dim)) 17 | self.add_unit_offset = add_unit_offset 18 | 19 | def _norm(self, x): 20 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 21 | 22 | def forward(self, x): 23 | x = self._norm(x.float()).type_as(x) 24 | if self.add_unit_offset: 25 | return x * (1 + self.weight) 26 | else: 27 | return x * self.weight 28 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Shigeki Karita 2 | # 2020 Mobvoi Inc (Binbin Zhang) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
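A minimal sketch, not part of this repo, of the LabelSmoothingLoss defined in label_smoothing_loss.py above; the vocabulary size is a placeholder, and -1 is assumed here as the padding id.

import torch
from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss

criterion = LabelSmoothingLoss(size=5000, padding_idx=-1, smoothing=0.1)
logits = torch.randn(2, 10, 5000)          # raw decoder outputs (batch, seqlen, class)
target = torch.randint(0, 5000, (2, 10))   # gold label ids
target[1, 8:] = -1                         # last two positions of sample 1 are padding
loss = criterion(logits, target)           # scalar KL loss, normalized by batch size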
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Positionwise feed forward layer definition.""" 16 | 17 | import torch 18 | 19 | 20 | class PositionwiseFeedForward(torch.nn.Module): 21 | """Positionwise feed forward layer. 22 | 23 | FeedForward are appied on each position of the sequence. 24 | The output dim is same with the input dim. 25 | 26 | Args: 27 | idim (int): Input dimenstion. 28 | hidden_units (int): The number of hidden units. 29 | dropout_rate (float): Dropout rate. 30 | activation (torch.nn.Module): Activation function 31 | """ 32 | 33 | def __init__( 34 | self, 35 | idim: int, 36 | hidden_units: int, 37 | dropout_rate: float, 38 | activation: torch.nn.Module = torch.nn.ReLU(), 39 | bias: bool = True, 40 | *dummy_args, 41 | **dummy_kwargs, 42 | ): 43 | """Construct a PositionwiseFeedForward object.""" 44 | super(PositionwiseFeedForward, self).__init__() 45 | self.w_1 = torch.nn.Linear(idim, hidden_units, bias=bias) 46 | self.activation = activation 47 | self.dropout = torch.nn.Dropout(dropout_rate) 48 | self.w_2 = torch.nn.Linear(hidden_units, idim, bias=bias) 49 | 50 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 51 | """Forward function. 52 | 53 | Args: 54 | xs: input tensor (B, L, D) 55 | Returns: 56 | output tensor, (B, L, D) 57 | """ 58 | return self.w_2(self.dropout(self.activation(self.w_1(xs)))) 59 | 60 | 61 | class MoEFFNLayer(torch.nn.Module): 62 | """ 63 | Mixture of expert with Positionwise feed forward layer 64 | See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf 65 | The output dim is same with the input dim. 66 | 67 | Modified from https://github.com/Lightning-AI/lit-gpt/pull/823 68 | https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219 69 | Args: 70 | n_expert: number of expert. 71 | n_expert_activated: The actual number of experts used for each frame 72 | idim (int): Input dimenstion. 73 | hidden_units (int): The number of hidden units. 74 | dropout_rate (float): Dropout rate. 75 | activation (torch.nn.Module): Activation function 76 | """ 77 | 78 | def __init__( 79 | self, 80 | idim: int, 81 | hidden_units: int, 82 | dropout_rate: float, 83 | activation: torch.nn.Module = torch.nn.ReLU(), 84 | bias: bool = False, 85 | n_expert: int = 8, 86 | n_expert_activated: int = 2, 87 | ): 88 | super(MoEFFNLayer, self).__init__() 89 | self.gate = torch.nn.Linear(idim, n_expert, bias=False) 90 | self.experts = torch.nn.ModuleList( 91 | PositionwiseFeedForward( 92 | idim, hidden_units, dropout_rate, activation, bias=bias) 93 | for _ in range(n_expert)) 94 | self.n_expert = n_expert 95 | self.n_expert_activated = n_expert_activated 96 | 97 | def forward(self, xs: torch.Tensor) -> torch.Tensor: 98 | """Foward function. 
99 | Args: 100 | xs: input tensor (B, L, D) 101 | Returns: 102 | output tensor, (B, L, D) 103 | 104 | """ 105 | B, L, D = xs.size( 106 | ) # batch size, sequence length, embedding dimension (idim) 107 | xs = xs.view(-1, D) # (B*L, D) 108 | router = self.gate(xs) # (B*L, n_expert) 109 | logits, selected_experts = torch.topk( 110 | router, self.n_expert_activated 111 | ) # probs:(B*L, n_expert_activated), selected_exp: (B*L, n_expert_activated) 112 | weights = torch.nn.functional.softmax( 113 | logits, dim=1, 114 | dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_activated) 115 | output = torch.zeros_like(xs) # (B*L, D) 116 | for i, expert in enumerate(self.experts): 117 | mask = selected_experts == i 118 | token_ids, ith_expert = torch.where(mask) 119 | output[token_ids] += weights[token_ids, ith_expert, None] * expert( 120 | xs[token_ids]) 121 | return output.view(B, L, D) 122 | 123 | 124 | class GatedVariantsMLP(torch.nn.Module): 125 | """ https://arxiv.org/pdf/2002.05202.pdf 126 | """ 127 | 128 | def __init__( 129 | self, 130 | idim: int, 131 | hidden_units: int, 132 | dropout_rate: float, 133 | activation: torch.nn.Module = torch.nn.GELU(), 134 | bias: bool = True, 135 | *dummy_args, 136 | **dummy_kwargs, 137 | ): 138 | """Construct a PositionwiseFeedForward object.""" 139 | super(GatedVariantsMLP, self).__init__() 140 | self.gate = torch.nn.Linear(idim, hidden_units, bias=False) 141 | self.activation = activation 142 | # w_1 as up proj 143 | self.w_1 = torch.nn.Linear(idim, hidden_units, bias=bias) 144 | self.dropout = torch.nn.Dropout(dropout_rate) 145 | # w_2 as down proj 146 | self.w_2 = torch.nn.Linear(hidden_units, idim, bias=bias) 147 | 148 | def forward(self, x) -> torch.Tensor: 149 | """Foward function. 150 | Args: 151 | xs: input tensor (B, L, D) 152 | Returns: 153 | output tensor, (B, L, D) 154 | 155 | """ 156 | gate = self.activation(self.gate(x)) 157 | up = self.w_1(x) 158 | fuse = gate * up 159 | return self.w_2(self.dropout(fuse)) 160 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/transformer/swish.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe) 2 | # 2020 Northwestern Polytechnical University (Pengcheng Guo) 3 | # 2020 Mobvoi Inc (Binbin Zhang) 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
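A shape-only sketch, not part of this repo, of the MoEFFNLayer defined in positionwise_feed_forward.py above: each frame's gate picks its top-2 experts and their outputs are mixed with the softmaxed gate weights. The sizes are placeholders.

import torch
from wenet.transformer.positionwise_feed_forward import MoEFFNLayer

moe = MoEFFNLayer(idim=256, hidden_units=1024, dropout_rate=0.1,
                  n_expert=8, n_expert_activated=2)
xs = torch.randn(2, 50, 256)   # (B, L, D)
out = moe(xs)                  # (B, L, D), per-frame mixture of the two selected experts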
16 | """Swish() activation function for Conformer.""" 17 | import math 18 | 19 | import torch 20 | 21 | 22 | class Swish(torch.nn.Module): 23 | """Construct an Swish object.""" 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | """Return Swish activation function.""" 27 | return x * torch.sigmoid(x) 28 | 29 | class New_gelu4npu(torch.nn.Module): 30 | """Construct an Swish object.""" 31 | 32 | def forward(self, x: torch.Tensor) -> torch.Tensor: 33 | """Return Swish activation function.""" 34 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 35 | 36 | def new_gelu_func(x: torch.Tensor): 37 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 38 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/checkpoint.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/checkpoint.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/checkpoint.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/checkpoint.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/checkpoint.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/checkpoint.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/class_utils.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/class_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/class_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/class_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/class_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/class_utils.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/cmvn.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/cmvn.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/cmvn.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/cmvn.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/cmvn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/cmvn.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/common.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/common.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/common.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/common.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/common.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/common.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/config.cpython-310.pyc -------------------------------------------------------------------------------- 
/C2SER-llm/wenet/utils/__pycache__/config.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/config.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/context_graph.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/context_graph.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/context_graph.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/context_graph.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/context_graph.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/context_graph.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/ctc_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/ctc_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/ctc_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/ctc_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/ctc_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/ctc_utils.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/executor.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/executor.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/executor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/executor.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/file_utils.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/file_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/file_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/file_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/file_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/file_utils.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/fsdp_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/fsdp_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/fsdp_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/fsdp_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_dataset.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_dataset.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_dataset.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_dataset.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_model.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_model.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_model.cpython-39.pyc 
-------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/init_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/init_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/mask.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/mask.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/mask.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/mask.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/mask.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/mask.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/rope_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/rope_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/rope_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/rope_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/rope_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/rope_utils.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/scheduler.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/scheduler.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/scheduler.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/scheduler.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/train_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/train_utils.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/__pycache__/train_utils.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/utils/__pycache__/train_utils.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import logging 16 | import os 17 | import re 18 | 19 | import yaml 20 | import torch 21 | from collections import OrderedDict 22 | 23 | import datetime 24 | 25 | 26 | def load_checkpoint(model: torch.nn.Module, path: str) -> dict: 27 | rank = int(os.environ.get('RANK', 0)) 28 | logging.info('[Rank {}] Checkpoint: loading from checkpoint {}'.format( 29 | rank, path)) 30 | checkpoint = torch.load(path, map_location='cpu') 31 | missing_keys, unexpected_keys = model.load_state_dict(checkpoint, 32 | strict=False) 33 | if rank == 0: 34 | for key in missing_keys: 35 | logging.info("missing tensor: {}".format(key)) 36 | for key in unexpected_keys: 37 | logging.info("unexpected tensor: {}".format(key)) 38 | info_path = re.sub('.pt$', '.yaml', path) 39 | configs = {} 40 | if os.path.exists(info_path): 41 | with open(info_path, 'r') as fin: 42 | configs = yaml.load(fin, Loader=yaml.FullLoader) 43 | if configs is None: 44 | configs = {} 45 | return configs 46 | 47 | 48 | def save_state_dict_and_infos(state_dict, path: str, infos=None): 49 | rank = int(os.environ.get('RANK', 0)) 50 | logging.info('[Rank {}] Checkpoint: save to checkpoint {}'.format( 51 | rank, path)) 52 | torch.save(state_dict, path) 53 | info_path = re.sub('.pt$', '.yaml', path) 54 | if infos is None: 55 | infos = {} 56 | infos['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S') 57 | with open(info_path, 'w') as fout: 58 | data = yaml.dump(infos) 59 | fout.write(data) 60 | 61 | 62 | def save_checkpoint(model: torch.nn.Module, path: str, infos=None): 63 | ''' 64 | Args: 65 | infos (dict or None): any info you want to save. 66 | ''' 67 | if isinstance(model, torch.nn.DataParallel): 68 | state_dict = model.module.state_dict() 69 | elif isinstance(model, torch.nn.parallel.DistributedDataParallel): 70 | state_dict = model.module.state_dict() 71 | else: 72 | state_dict = model.state_dict() 73 | save_state_dict_and_infos(state_dict, path, infos) 74 | 75 | 76 | def filter_modules(model_state_dict, modules): 77 | rank = int(os.environ.get('RANK', 0)) 78 | new_mods = [] 79 | incorrect_mods = [] 80 | mods_model = model_state_dict.keys() 81 | for mod in modules: 82 | if any(key.startswith(mod) for key in mods_model): 83 | new_mods += [mod] 84 | else: 85 | incorrect_mods += [mod] 86 | if incorrect_mods and rank == 0: 87 | logging.warning( 88 | "module(s) %s don't match or (partially match) " 89 | "available modules in model.", 90 | incorrect_mods, 91 | ) 92 | logging.warning("for information, the existing modules in model are:") 93 | logging.warning("%s", mods_model) 94 | 95 | return new_mods 96 | 97 | 98 | def load_trained_modules(model: torch.nn.Module, args: None): 99 | # Load encoder modules with pre-trained model(s). 
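A minimal sketch, not part of this repo, of how save_checkpoint and load_checkpoint above pair up; the model and the file names are hypothetical.

import torch
from wenet.utils.checkpoint import save_checkpoint, load_checkpoint

model = torch.nn.Linear(4, 2)
save_checkpoint(model, 'tmp_model.pt', infos={'epoch': 3})  # also writes tmp_model.yaml next to the .pt
configs = load_checkpoint(model, 'tmp_model.pt')            # returns the infos read back from the .yaml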
100 | enc_model_path = args.enc_init 101 | enc_modules = args.enc_init_mods 102 | main_state_dict = model.state_dict() 103 | logging.warning("model(s) found for pre-initialization") 104 | if os.path.isfile(enc_model_path): 105 | logging.info('Checkpoint: loading from checkpoint %s for CPU' % 106 | enc_model_path) 107 | model_state_dict = torch.load(enc_model_path, map_location='cpu') 108 | modules = filter_modules(model_state_dict, enc_modules) 109 | partial_state_dict = OrderedDict() 110 | for key, value in model_state_dict.items(): 111 | if any(key.startswith(m) for m in modules): 112 | partial_state_dict[key] = value 113 | main_state_dict.update(partial_state_dict) 114 | else: 115 | logging.warning("model was not found : %s", enc_model_path) 116 | 117 | model.load_state_dict(main_state_dict) 118 | configs = {} 119 | return configs 120 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/class_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Copyright [2023-11-28] 4 | import torch 5 | from torch.nn import BatchNorm1d, LayerNorm 6 | from wenet.paraformer.embedding import ParaformerPositinoalEncoding 7 | from wenet.transformer.norm import RMSNorm 8 | from wenet.transformer.positionwise_feed_forward import ( 9 | GatedVariantsMLP, MoEFFNLayer, PositionwiseFeedForward) 10 | 11 | from wenet.transformer.swish import Swish, New_gelu4npu 12 | from wenet.transformer.subsampling import ( 13 | LinearNoSubsampling, 14 | EmbedinigNoSubsampling, 15 | Conv1dSubsampling2, 16 | Conv2dSubsampling4, 17 | Conv2dSubsampling6, 18 | Conv2dSubsampling8, 19 | StackNFramesSubsampling, 20 | ) 21 | from wenet.efficient_conformer.subsampling import Conv2dSubsampling2 22 | from wenet.squeezeformer.subsampling import DepthwiseConv2dSubsampling4 23 | from wenet.transformer.embedding import (PositionalEncoding, 24 | RelPositionalEncoding, 25 | RopePositionalEncoding, 26 | WhisperPositionalEncoding, 27 | LearnablePositionalEncoding, 28 | NoPositionalEncoding) 29 | from wenet.transformer.attention import (MultiHeadedAttention, 30 | MultiHeadedCrossAttention, 31 | RelPositionMultiHeadedAttention, 32 | RopeMultiHeadedAttention, 33 | ShawRelPositionMultiHeadedAttention) 34 | from wenet.efficient_conformer.attention import ( 35 | GroupedRelPositionMultiHeadedAttention) 36 | 37 | WENET_ACTIVATION_CLASSES = { 38 | "hardtanh": torch.nn.Hardtanh, 39 | "tanh": torch.nn.Tanh, 40 | "relu": torch.nn.ReLU, 41 | "selu": torch.nn.SELU, 42 | "swish": getattr(torch.nn, "SiLU", Swish), 43 | "gelu": New_gelu4npu, 44 | } 45 | 46 | WENET_RNN_CLASSES = { 47 | "rnn": torch.nn.RNN, 48 | "lstm": torch.nn.LSTM, 49 | "gru": torch.nn.GRU, 50 | } 51 | 52 | WENET_SUBSAMPLE_CLASSES = { 53 | "linear": LinearNoSubsampling, 54 | "embed": EmbedinigNoSubsampling, 55 | "conv1d2": Conv1dSubsampling2, 56 | "conv2d2": Conv2dSubsampling2, 57 | "conv2d": Conv2dSubsampling4, 58 | "dwconv2d4": DepthwiseConv2dSubsampling4, 59 | "conv2d6": Conv2dSubsampling6, 60 | "conv2d8": Conv2dSubsampling8, 61 | 'paraformer_dummy': torch.nn.Identity, 62 | 'stack_n_frames': StackNFramesSubsampling, 63 | } 64 | 65 | WENET_EMB_CLASSES = { 66 | "embed": PositionalEncoding, 67 | "abs_pos": PositionalEncoding, 68 | "rel_pos": RelPositionalEncoding, 69 | "no_pos": NoPositionalEncoding, 70 | "abs_pos_whisper": WhisperPositionalEncoding, 71 | "embed_learnable_pe": LearnablePositionalEncoding, 72 | "abs_pos_paraformer": 
ParaformerPositinoalEncoding,
73 |     'rope_pos': RopePositionalEncoding,
74 | }
75 | 
76 | WENET_ATTENTION_CLASSES = {
77 |     "selfattn": MultiHeadedAttention,
78 |     "rel_selfattn": RelPositionMultiHeadedAttention,
79 |     "grouped_rel_selfattn": GroupedRelPositionMultiHeadedAttention,
80 |     "crossattn": MultiHeadedCrossAttention,
81 |     'shaw_rel_selfattn': ShawRelPositionMultiHeadedAttention,
82 |     'rope_abs_selfattn': RopeMultiHeadedAttention,
83 | }
84 | 
85 | WENET_MLP_CLASSES = {
86 |     'position_wise_feed_forward': PositionwiseFeedForward,
87 |     'moe': MoEFFNLayer,
88 |     'gated': GatedVariantsMLP
89 | }
90 | 
91 | WENET_NORM_CLASSES = {
92 |     'layer_norm': LayerNorm,
93 |     'batch_norm': BatchNorm1d,
94 |     'rms_norm': RMSNorm
95 | }
96 | 
--------------------------------------------------------------------------------
/C2SER-llm/wenet/utils/cmvn.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #    http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import json
16 | import logging
17 | import math
18 | import sys
19 | import numpy as np
20 | 
21 | def _load_json_cmvn(json_cmvn_file):
22 |     """ Load the json format cmvn stats file and calculate cmvn
23 | 
24 |     Args:
25 |         json_cmvn_file: cmvn stats file in json format
26 | 
27 |     Returns:
28 |         a numpy array of [means, istd], where istd is 1.0 / std
29 |     """
30 |     with open(json_cmvn_file) as f:
31 |         cmvn_stats = json.load(f)
32 | 
33 |     means = cmvn_stats['mean_stat']
34 |     variance = cmvn_stats['var_stat']
35 |     count = cmvn_stats['frame_num']
36 |     for i in range(len(means)):
37 |         means[i] /= count
38 |         variance[i] = variance[i] / count - means[i] * means[i]
39 |         if variance[i] < 1.0e-20:
40 |             variance[i] = 1.0e-20
41 |         variance[i] = 1.0 / math.sqrt(variance[i])
42 |     cmvn = np.array([means, variance])
43 |     return cmvn
44 | 
45 | 
46 | def _load_kaldi_cmvn(kaldi_cmvn_file):
47 |     """ Load the kaldi format cmvn stats file and calculate cmvn
48 | 
49 |     Args:
50 |         kaldi_cmvn_file: kaldi text style global cmvn file, which
51 |            is generated by:
52 |            compute-cmvn-stats --binary=false scp:feats.scp global_cmvn
53 | 
54 |     Returns:
55 |         a numpy array of [means, istd], where istd is 1.0 / std
56 |     """
57 |     means = []
58 |     variance = []
59 |     with open(kaldi_cmvn_file, 'r') as fid:
60 |         # kaldi binary file start with '\0B'
61 |         if fid.read(2) == '\0B':
62 |             logging.error('kaldi cmvn binary file is not supported, please '
63 |                           'recompute it by: compute-cmvn-stats --binary=false '
64 |                           ' scp:feats.scp global_cmvn')
65 |             sys.exit(1)
66 |         fid.seek(0)
67 |         arr = fid.read().split()
68 |         assert (arr[0] == '[')
69 |         assert (arr[-2] == '0')
70 |         assert (arr[-1] == ']')
71 |         feat_dim = int((len(arr) - 2 - 2) / 2)
72 |         for i in range(1, feat_dim + 1):
73 |             means.append(float(arr[i]))
74 |         count = float(arr[feat_dim + 1])
75 |         for i in range(feat_dim + 2, 2 * feat_dim + 2):
76 |             variance.append(float(arr[i]))
77 | 
78 |     for i in range(len(means)):
79 |         means[i] /= count
80 |         variance[i] = variance[i] / count - means[i] * means[i]
81 |         if variance[i] < 1.0e-20:
82 |             variance[i] = 1.0e-20
83 |         variance[i] = 1.0 / math.sqrt(variance[i])
84 |     cmvn = np.array([means, variance])
85 |     return cmvn
86 | 
87 | 
88 | def load_cmvn(cmvn_file, is_json):
89 |     if is_json:
90 |         cmvn = _load_json_cmvn(cmvn_file)
91 |     else:
92 |         cmvn = _load_kaldi_cmvn(cmvn_file)
93 |     return cmvn[0], cmvn[1]
94 | 
--------------------------------------------------------------------------------
/C2SER-llm/wenet/utils/config.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 Shaoshang Qi
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #    http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import copy
16 | 
17 | 
18 | def override_config(configs, override_list):
19 |     new_configs = copy.deepcopy(configs)
20 |     for item in override_list:
21 |         arr = item.split()
22 |         if len(arr) != 2:
23 |             print(f"the override {item} format is not correct, skip it")
24 |             continue
25 |         keys = arr[0].split('.')
26 |         s_configs = new_configs
27 |         for i, key in enumerate(keys):
28 |             if key not in s_configs:
29 |                 print(f"the override key {arr[0]} not found in configs, skip it")
30 |                 break  # skip this item instead of raising a KeyError below
31 |             if i == len(keys) - 1:
32 |                 param_type = type(s_configs[key])
33 |                 if param_type != bool:
34 |                     s_configs[key] = param_type(arr[1])
35 |                 else:
36 |                     s_configs[key] = arr[1] in ['true', 'True']
37 |                 print(f"override {arr[0]} with {arr[1]}")
38 |             else:
39 |                 s_configs = s_configs[key]
40 |     return new_configs
41 | 
--------------------------------------------------------------------------------
/C2SER-llm/wenet/utils/ctc_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #    http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
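A sketch, not part of this repo, of how load_cmvn above is typically wired into the GlobalCMVN front-end from wenet/transformer/cmvn.py; the stats file name is hypothetical and is assumed to be a JSON file with mean_stat / var_stat / frame_num fields.

import torch
from wenet.utils.cmvn import load_cmvn
from wenet.transformer.cmvn import GlobalCMVN

mean, istd = load_cmvn('global_cmvn.json', is_json=True)  # per-dimension mean and 1.0 / std as numpy rows
cmvn_layer = GlobalCMVN(torch.from_numpy(mean).float(),
                        torch.from_numpy(istd).float())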
14 | 15 | from typing import List, Tuple 16 | 17 | import numpy as np 18 | 19 | import torch 20 | import torchaudio.functional as F 21 | 22 | 23 | def remove_duplicates_and_blank(hyp: List[int], 24 | blank_id: int = 0) -> List[int]: 25 | new_hyp: List[int] = [] 26 | cur = 0 27 | while cur < len(hyp): 28 | if hyp[cur] != blank_id: 29 | new_hyp.append(hyp[cur]) 30 | prev = cur 31 | while cur < len(hyp) and hyp[cur] == hyp[prev]: 32 | cur += 1 33 | return new_hyp 34 | 35 | 36 | def replace_duplicates_with_blank(hyp: List[int], 37 | blank_id: int = 0) -> List[int]: 38 | new_hyp: List[int] = [] 39 | cur = 0 40 | while cur < len(hyp): 41 | new_hyp.append(hyp[cur]) 42 | prev = cur 43 | cur += 1 44 | while cur < len( 45 | hyp) and hyp[cur] == hyp[prev] and hyp[cur] != blank_id: 46 | new_hyp.append(blank_id) 47 | cur += 1 48 | return new_hyp 49 | 50 | 51 | def gen_ctc_peak_time(hyp: List[int], blank_id: int = 0) -> List[int]: 52 | times = [] 53 | cur = 0 54 | while cur < len(hyp): 55 | if hyp[cur] != blank_id: 56 | times.append(cur) 57 | prev = cur 58 | while cur < len(hyp) and hyp[cur] == hyp[prev]: 59 | cur += 1 60 | return times 61 | 62 | 63 | def gen_timestamps_from_peak( 64 | peaks: List[int], 65 | max_duration: float, 66 | frame_rate: float = 0.04, 67 | max_token_duration: float = 1.0, 68 | ) -> List[Tuple[float, float]]: 69 | """ 70 | Args: 71 | peaks: ctc peaks time stamp 72 | max_duration: max_duration of the sentence 73 | frame_rate: frame rate of every time stamp, in seconds 74 | max_token_duration: max duration of the token, in seconds 75 | Returns: 76 | list(start, end) of each token 77 | """ 78 | times = [] 79 | half_max = max_token_duration / 2 80 | for i in range(len(peaks)): 81 | if i == 0: 82 | start = max(0, peaks[0] * frame_rate - half_max) 83 | else: 84 | start = max((peaks[i - 1] + peaks[i]) / 2 * frame_rate, 85 | peaks[i] * frame_rate - half_max) 86 | 87 | if i == len(peaks) - 1: 88 | end = min(max_duration, peaks[-1] * frame_rate + half_max) 89 | else: 90 | end = min((peaks[i] + peaks[i + 1]) / 2 * frame_rate, 91 | peaks[i] * frame_rate + half_max) 92 | times.append((start, end)) 93 | return times 94 | 95 | 96 | def insert_blank(label, blank_id=0): 97 | """Insert blank token between every two label token.""" 98 | label = np.expand_dims(label, 1) 99 | blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id 100 | label = np.concatenate([blanks, label], axis=1) 101 | label = label.reshape(-1) 102 | label = np.append(label, label[0]) 103 | return label 104 | 105 | 106 | def force_align(ctc_probs: torch.Tensor, y: torch.Tensor, blank_id=0) -> list: 107 | """ctc forced alignment. 
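A worked example, not part of this repo, of the CTC post-processing helpers defined earlier in this file, with blank_id = 0.

from wenet.utils.ctc_utils import insert_blank, remove_duplicates_and_blank

hyp = [0, 3, 3, 0, 0, 5, 5, 5, 0, 7]
print(remove_duplicates_and_blank(hyp))  # [3, 5, 7]: collapse repeats, then drop blanks
print(insert_blank([3, 5, 7]).tolist())  # [0, 3, 0, 5, 0, 7, 0]: a blank around every label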
14 | 15 | from typing import List, Tuple 16 | 17 | import numpy as np 18 | 19 | import torch 20 | import torchaudio.functional as F 21 | 22 | 23 | def remove_duplicates_and_blank(hyp: List[int], 24 | blank_id: int = 0) -> List[int]: 25 | new_hyp: List[int] = [] 26 | cur = 0 27 | while cur < len(hyp): 28 | if hyp[cur] != blank_id: 29 | new_hyp.append(hyp[cur]) 30 | prev = cur 31 | while cur < len(hyp) and hyp[cur] == hyp[prev]: 32 | cur += 1 33 | return new_hyp
" 56 | "More details can be found in discussions here : " 57 | "https://github.com/wenet-e2e/wenet/pull/819" % (sym)) 58 | return syms 59 | 60 | 61 | def read_symbol_table(symbol_table_file): 62 | symbol_table = {} 63 | with open(symbol_table_file, 'r', encoding='utf8') as fin: 64 | for line in fin: 65 | arr = line.strip().split() 66 | assert len(arr) == 2 67 | symbol_table[arr[0]] = int(arr[1]) 68 | return symbol_table 69 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/fsdp_utils.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import os 3 | from torch.distributed.fsdp import (FullyShardedDataParallel as FSDP, 4 | FullStateDictConfig, StateDictType) 5 | 6 | from torch.distributed.fsdp.wrap import (lambda_auto_wrap_policy, 7 | transformer_auto_wrap_policy) 8 | from wenet.LLM.decoder import DecoderOnly 9 | from wenet.branchformer.encoder_layer import BranchformerEncoderLayer 10 | from wenet.e_branchformer.encoder_layer import EBranchformerEncoderLayer 11 | from wenet.efficient_conformer.encoder_layer import StrideConformerEncoderLayer 12 | from wenet.paraformer.layers import AliParaformerEncoderLayer, SanmDecoderLayer 13 | from wenet.squeezeformer.encoder_layer import SqueezeformerEncoderLayer 14 | from wenet.transformer.encoder_layer import (ConformerEncoderLayer, 15 | TransformerEncoderLayer) 16 | from wenet.transformer.decoder_layer import DecoderLayer 17 | from wenet.utils.checkpoint import save_state_dict_and_infos 18 | from wenet.utils.init_model import WENET_DECODER_CLASSES, WENET_ENCODER_CLASSES 19 | 20 | WENET_ENCODER_LAYERS_CLASSES = { 21 | 'transformer_encoder_layer': TransformerEncoderLayer, 22 | 'conformer_encoder_layer': ConformerEncoderLayer, 23 | 'paraformer_encoder_layer': AliParaformerEncoderLayer, 24 | 'squeezeformer_encoder_layer': SqueezeformerEncoderLayer, 25 | 'ebranchformer_encoder_layer': EBranchformerEncoderLayer, 26 | 'efficient_conformer_encoder_layer': StrideConformerEncoderLayer, 27 | 'branchformer_encoder_layer': BranchformerEncoderLayer, 28 | } 29 | 30 | WENET_DECODER_LAYERS_CLASSES = { 31 | 'transformer_decoder_layer': DecoderLayer, 32 | 'paraformer_decoder_layer': SanmDecoderLayer, 33 | # TODO(Mddct): 34 | # 1 wrap transducer's predictor and joint 35 | # 2 wrap paraformer's cif and ignore lstm 36 | } 37 | 38 | 39 | def wenet_fsdp_wrap_policy(mode): 40 | # different wrap methods 41 | # please refer: https://openmmlab.medium.com/its-2023-is-pytorch-s-fsdp-the-best-choice-for-training-large-models-fe8d2848832f # noqa 42 | assert mode in ['no_shard', 'model', 'zero2', 'zero3'] 43 | if mode == 'no_shard': 44 | return None 45 | else: 46 | # TODO(Mddct): Support user customization 47 | # see more wrap methods: 48 | # https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/utils/fsdp_utils.py#L13 # noqa 49 | if mode == 'model': 50 | enc_dec_wrap_policy = partial( 51 | lambda_auto_wrap_policy, 52 | lambda_fn=lambda module: isinstance( 53 | module, 54 | tuple(WENET_ENCODER_CLASSES.values()) + tuple( 55 | WENET_DECODER_CLASSES.values()))) 56 | return enc_dec_wrap_policy 57 | else: 58 | to_wrap_class = set() 59 | to_wrap_class.update(set(WENET_ENCODER_LAYERS_CLASSES.values())) 60 | to_wrap_class.update(set(WENET_DECODER_LAYERS_CLASSES.values())) 61 | layers_wrap_policy = partial(transformer_auto_wrap_policy, 62 | transformer_layer_cls=to_wrap_class) 63 | return layers_wrap_policy 64 | 65 | 66 | fullstate_save_policy = 
FullStateDictConfig(offload_to_cpu=True, 67 | rank0_only=True) 68 | 69 | 70 | def fsdp_save_model(model, save_model_path, info_dict): 71 | # TODO(Mddct); When the model is large, saving a model will take a long time. 72 | # We only need to keep the sharding in an asynchronous manner, but it is 73 | # good now. This feature will be supported when llm is supported in the future. 74 | 75 | rank = int(os.environ.get('RANK', 0)) 76 | with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, 77 | fullstate_save_policy): 78 | state_dict = model.state_dict() 79 | if rank == 0: 80 | save_state_dict_and_infos(state_dict, save_model_path, info_dict) 81 | 82 | 83 | def check_gradient_checkpoint(model): 84 | ckpt_laye_types = [] 85 | if hasattr(model, 'encoder') and hasattr(model.encoder, 86 | 'gradient_checkpointing'): 87 | if model.encoder.gradient_checkpointing: 88 | model.encoder.gradient_checkpointing = False 89 | ckpt_laye_types += list(WENET_ENCODER_LAYERS_CLASSES.values()) 90 | if hasattr(model, 'decoder') and hasattr(model.decoder, 91 | 'gradient_checkpointing'): 92 | if model.decoder.gradient_checkpointing: 93 | model.decoder.gradient_checkpointing = False 94 | ckpt_laye_types += list(WENET_DECODER_LAYERS_CLASSES.values()) 95 | if isinstance(model.decoder, DecoderOnly): 96 | ckpt_laye_types += [DecoderOnly] 97 | return tuple(ckpt_laye_types) 98 | 99 | 100 | def apply_fsdp_checkpointing(model, ckpt_layer_types: tuple): 101 | # NOTE(Mddct): torch.utils.checkpoint is currently incompatible with 102 | # wenet's model mode. Using this writing method, Please refer to 103 | # https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/policies/activation_checkpointing_functions.py#L21 # noqa 104 | if len(ckpt_layer_types) == 0: 105 | return 106 | from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( 107 | checkpoint_wrapper, 108 | CheckpointImpl, 109 | apply_activation_checkpointing, 110 | ) 111 | non_reentrant_wrapper = partial( 112 | checkpoint_wrapper, 113 | checkpoint_impl=CheckpointImpl.NO_REENTRANT, 114 | ) 115 | apply_activation_checkpointing( 116 | model, 117 | checkpoint_wrapper_fn=non_reentrant_wrapper, 118 | check_fn=lambda submodule: isinstance(submodule, ckpt_layer_types)) 119 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/init_dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional 3 | from wenet.dataset.dataset import Dataset 4 | 5 | from wenet.text.base_tokenizer import BaseTokenizer 6 | 7 | 8 | def init_asr_dataset(data_type, 9 | data_list_file, 10 | tokenizer: Optional[BaseTokenizer] = None, 11 | conf=None, 12 | partition=True): 13 | return Dataset(data_type, data_list_file, tokenizer, conf, partition) 14 | 15 | 16 | def init_dataset(dataset_type, 17 | data_type, 18 | data_list_file, 19 | tokenizer: Optional[BaseTokenizer] = None, 20 | conf=None, 21 | partition=True, 22 | split='train'): 23 | assert dataset_type in ['asr', 'ssl'] 24 | 25 | if split != 'train': 26 | cv_conf = copy.deepcopy(conf) 27 | cv_conf['cycle'] = 1 28 | cv_conf['speed_perturb'] = False 29 | cv_conf['spec_aug'] = False 30 | cv_conf['spec_sub'] = False 31 | cv_conf['spec_trim'] = False 32 | cv_conf['shuffle'] = False 33 | cv_conf['list_shuffle'] = False 34 | conf = cv_conf 35 | 36 | if dataset_type == 'asr': 37 | return init_asr_dataset(data_type, data_list_file, tokenizer, conf, 38 | partition) 39 | else: 40 | from 
wenet.ssl.init_dataset import init_dataset as init_ssl_dataset 41 | return init_ssl_dataset(data_type, data_list_file, conf, partition) 42 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/init_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Binbin Zhang (binbzha@qq.com) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import torch 17 | 18 | from wenet.llm_asr.init_llmasr import init_llmasr 19 | from wenet.transformer.asr_model import ASRModel 20 | from wenet.transformer.cmvn import GlobalCMVN 21 | from wenet.transformer.ctc import CTC 22 | from wenet.transformer.encoder import TransformerEncoder, ConformerEncoder 23 | from wenet.transformer.decoder import BiTransformerDecoder, TransformerDecoder 24 | from wenet.whisper.whisper import Whisper 25 | from wenet.utils.cmvn import load_cmvn 26 | from wenet.utils.checkpoint import load_checkpoint, load_trained_modules 27 | 28 | 29 | WENET_ENCODER_CLASSES = { 30 | "transformer": TransformerEncoder, 31 | "conformer": ConformerEncoder, 32 | } 33 | 34 | WENET_DECODER_CLASSES = { 35 | "transformer": TransformerDecoder, 36 | "bitransformer": BiTransformerDecoder, 37 | } 38 | 39 | WENET_CTC_CLASSES = { 40 | "ctc": CTC, 41 | } 42 | 43 | WENET_MODEL_CLASSES = { 44 | "asr_model": ASRModel, 45 | "whisper": Whisper, 46 | } 47 | 48 | 49 | def init_speech_model(args, configs): 50 | # TODO(xcsong): Forcefully read the 'cmvn' attribute. 
51 | if configs.get('cmvn', None) == 'global_cmvn': 52 | mean, istd = load_cmvn(configs['cmvn_conf']['cmvn_file'], 53 | configs['cmvn_conf']['is_json_cmvn']) 54 | global_cmvn = GlobalCMVN( 55 | torch.from_numpy(mean).float(), 56 | torch.from_numpy(istd).float()) 57 | else: 58 | global_cmvn = None 59 | 60 | input_dim = configs['input_dim'] 61 | vocab_size = configs['output_dim'] 62 | 63 | encoder_type = configs.get('encoder', 'conformer') 64 | decoder_type = configs.get('decoder', 'bitransformer') 65 | ctc_type = configs.get('ctc', 'ctc') 66 | 67 | encoder = WENET_ENCODER_CLASSES[encoder_type]( 68 | input_dim, 69 | global_cmvn=global_cmvn, 70 | **configs['encoder_conf'], 71 | **configs['encoder_conf']['efficient_conf'] 72 | if 'efficient_conf' in configs['encoder_conf'] else {}) 73 | 74 | decoder = WENET_DECODER_CLASSES[decoder_type](vocab_size, 75 | encoder.output_size(), 76 | **configs['decoder_conf']) 77 | 78 | ctc = WENET_CTC_CLASSES[ctc_type]( 79 | vocab_size, 80 | encoder.output_size(), 81 | blank_id=configs['ctc_conf']['ctc_blank_id'] 82 | if 'ctc_conf' in configs else 0) 83 | 84 | model_type = configs.get('model', 'asr_model') 85 | 86 | model = WENET_MODEL_CLASSES[model_type]( 87 | vocab_size=vocab_size, 88 | encoder=encoder, 89 | decoder=decoder, 90 | ctc=ctc, 91 | special_tokens=configs.get('tokenizer_conf', 92 | {}).get('special_tokens', None), 93 | **configs['model_conf']) 94 | return model, configs 95 | 96 | 97 | 98 | def init_model(args, configs): 99 | 100 | model_type = configs.get('model', 'asr_model') 101 | configs['model'] = model_type 102 | if model_type == "llmasr": 103 | model = init_llmasr(args, configs) 104 | return model 105 | else: 106 | model, configs = init_speech_model(args, configs) 107 | 108 | 109 | # If specify checkpoint, load some info from checkpoint 110 | if hasattr(args, 'checkpoint') and args.checkpoint is not None: 111 | infos = load_checkpoint(model, args.checkpoint) 112 | elif hasattr(args, 'enc_init') and args.enc_init is not None: 113 | infos = load_trained_modules(model, args) 114 | else: 115 | infos = {} 116 | if configs.get('init_step', False): 117 | infos = {} 118 | configs["init_infos"] = infos 119 | 120 | if hasattr(args, 'use_lora') and args.use_lora: 121 | if hasattr(args, 'lora_ckpt_path') and args.lora_ckpt_path: 122 | load_checkpoint(model, args.lora_ckpt_path) 123 | 124 | print(configs) 125 | # Trye to tie some weights 126 | if hasattr(model, 'tie_or_clone_weights'): 127 | if not hasattr(args, 'jit'): 128 | args.jit = True # i.e. export onnx/jit/ipex 129 | model.tie_or_clone_weights(args.jit) 130 | 131 | if int(os.environ.get('RANK', 0)) == 0: 132 | print(configs) 133 | 134 | return model, configs 135 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/init_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Wenet Community. (authors: Dinghao Zhou) 2 | # (authors: Xingchen Song) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import logging 17 | 18 | from wenet.text.base_tokenizer import BaseTokenizer 19 | from wenet.text.bpe_tokenizer import BpeTokenizer 20 | from wenet.text.char_tokenizer import CharTokenizer 21 | from wenet.text.hugging_face_tokenizer import HuggingFaceTokenizer 22 | from wenet.text.paraformer_tokenizer import ParaformerTokenizer 23 | from wenet.text.whisper_tokenizer import WhisperTokenizer 24 | 25 | 26 | def init_tokenizer(configs) -> BaseTokenizer: 27 | # TODO(xcsong): Forcefully read the 'tokenizer' attribute. 28 | tokenizer_type = configs.get("tokenizer", "char") 29 | if tokenizer_type == "whisper": 30 | tokenizer = WhisperTokenizer( 31 | multilingual=configs['tokenizer_conf']['is_multilingual'], 32 | num_languages=configs['tokenizer_conf']['num_languages']) 33 | elif tokenizer_type == "char": 34 | tokenizer = CharTokenizer( 35 | configs['tokenizer_conf']['symbol_table_path'], 36 | configs['tokenizer_conf']['non_lang_syms_path'], 37 | split_with_space=configs['tokenizer_conf'].get( 38 | 'split_with_space', False), 39 | connect_symbol=configs['tokenizer_conf'].get('connect_symbol', '')) 40 | elif tokenizer_type == "bpe": 41 | tokenizer = BpeTokenizer( 42 | configs['tokenizer_conf']['bpe_path'], 43 | configs['tokenizer_conf']['symbol_table_path'], 44 | configs['tokenizer_conf']['non_lang_syms_path'], 45 | split_with_space=configs['tokenizer_conf'].get( 46 | 'split_with_space', False)) 47 | elif tokenizer_type == 'paraformer': 48 | tokenizer = ParaformerTokenizer( 49 | symbol_table=configs['tokenizer_conf']['symbol_table_path'], 50 | seg_dict=configs['tokenizer_conf']['seg_dict_path']) 51 | elif tokenizer_type == 'huggingface': 52 | tokenizer = HuggingFaceTokenizer( 53 | model=configs['tokenizer_conf']['llm_path']) 54 | else: 55 | raise NotImplementedError 56 | logging.info("use {} tokenizer".format(configs["tokenizer"])) 57 | 58 | return tokenizer 59 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/utils/rope_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | # copy from:https://github.com/google/gemma_pytorch/blob/main/gemma/model.py#L84 5 | def precompute_freqs_cis(dim: int, 6 | end: int, 7 | theta: float = 10000.0) -> torch.Tensor: 8 | """Precomputes the frequency cis.""" 9 | freqs = 1.0 / (theta**(torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)) 10 | t = torch.arange(end, device=freqs.device) 11 | freqs = torch.outer(t, freqs).float() 12 | freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 13 | return freqs_cis 14 | 15 | 16 | # modified from: 17 | # https://github.com/google/gemma_pytorch/blob/main/gemma/model.py#L95 18 | def google_apply_rotary_emb(x: torch.Tensor, 19 | freqs_cis: torch.Tensor) -> torch.Tensor: 20 | """Applies the rotary embedding to the query and key tensors.""" 21 | x_ = torch.view_as_complex( 22 | torch.stack(torch.chunk(x.float(), 2, dim=-1), dim=-1)) 23 | x_out = torch.view_as_real(x_ * freqs_cis).type_as(x) 24 | x_out = torch.cat(torch.chunk(x_out, 2, dim=-1), dim=-2) 25 | x_out = x_out.reshape(x_out.shape[0], x_out.shape[1], x_out.shape[2], -1) 26 | return x_out 27 | 28 | 29 | def llama_apply_rotary_emb(x: torch.Tensor, 30 | freqs_cis: torch.Tensor) -> torch.Tensor: 31 | x_ = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2)) 32 | x_out = torch.view_as_real(x_ * freqs_cis).flatten(3) 33 | 
return x_out.type_as(x) 34 | 35 | 36 | WENET_APPLY_ROTARY_EMB = { 37 | 'google': google_apply_rotary_emb, 38 | 'llama': llama_apply_rotary_emb, 39 | } 40 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__init__.py -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__pycache__/whisper.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__pycache__/whisper.cpython-310.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__pycache__/whisper.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__pycache__/whisper.cpython-311.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/__pycache__/whisper.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/C2SER-llm/wenet/whisper/__pycache__/whisper.cpython-39.pyc -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/whisper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Wenet Community. (authors: Xingchen Song) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Modified from [Whisper](https://github.com/openai/whisper) 16 | 17 | import torch 18 | 19 | from typing import Tuple, Dict, List 20 | 21 | from wenet.transformer.asr_model import ASRModel 22 | from wenet.transformer.ctc import CTC 23 | from wenet.transformer.encoder import TransformerEncoder 24 | from wenet.transformer.decoder import TransformerDecoder 25 | from wenet.utils.common import IGNORE_ID, add_whisper_tokens, th_accuracy 26 | 27 | 28 | class Whisper(ASRModel): 29 | 30 | def __init__( 31 | self, 32 | vocab_size: int, 33 | encoder: TransformerEncoder, 34 | decoder: TransformerDecoder, 35 | ctc: CTC = None, 36 | ctc_weight: float = 0.5, 37 | ignore_id: int = IGNORE_ID, 38 | reverse_weight: float = 0.0, 39 | lsm_weight: float = 0.0, 40 | length_normalized_loss: bool = False, 41 | special_tokens: dict = None, 42 | ): 43 | super().__init__(vocab_size, encoder, decoder, ctc, ctc_weight, 44 | ignore_id, reverse_weight, lsm_weight, 45 | length_normalized_loss, special_tokens) 46 | assert reverse_weight == 0.0 47 | self.sos = special_tokens["sot"] 48 | self.eos = special_tokens["eot"] 49 | self.decode_maxlen = self.decoder.embed[1].max_len 50 | 51 | # TODO(xcsong): time align 52 | def set_alignment_heads(self, dump: bytes): 53 | raise NotImplementedError 54 | 55 | @property 56 | def is_multilingual(self): 57 | return self.vocab_size >= 51865 58 | 59 | @property 60 | def num_languages(self): 61 | return self.vocab_size - 51765 - int(self.is_multilingual) 62 | 63 | def _calc_att_loss( 64 | self, 65 | encoder_out: torch.Tensor, 66 | encoder_mask: torch.Tensor, 67 | ys_pad: torch.Tensor, 68 | ys_pad_lens: torch.Tensor, 69 | infos: Dict[str, List[str]], 70 | ) -> Tuple[torch.Tensor, float]: 71 | prev_len = ys_pad.size(1) 72 | ys_in_pad, ys_out_pad = add_whisper_tokens(self.special_tokens, 73 | ys_pad, 74 | self.ignore_id, 75 | tasks=infos['tasks'], 76 | no_timestamp=True, 77 | langs=infos['langs'], 78 | use_prev=False) 79 | cur_len = ys_in_pad.size(1) 80 | ys_in_lens = ys_pad_lens + cur_len - prev_len 81 | 82 | # 1. Forward decoder 83 | decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, 84 | ys_in_pad, ys_in_lens) 85 | 86 | # 2. Compute attention loss 87 | loss_att = self.criterion_att(decoder_out, ys_out_pad) 88 | acc_att = th_accuracy( 89 | decoder_out.view(-1, self.vocab_size), 90 | ys_out_pad, 91 | ignore_label=self.ignore_id, 92 | ) 93 | return loss_att, acc_att 94 | -------------------------------------------------------------------------------- /C2SER-llm/wenet/whisper/whisper_with_clap.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Wenet Community. (authors: Xingchen Song) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # 15 | # Modified from [Whisper](https://github.com/openai/whisper) 16 | 17 | import torch 18 | 19 | from typing import Tuple, Dict, List 20 | 21 | from torch import nn 22 | 23 | from wenet.transformer.asr_model import ASRModel 24 | from wenet.transformer.ctc import CTC 25 | from wenet.transformer.encoder import TransformerEncoder 26 | from wenet.transformer.decoder import TransformerDecoder 27 | from wenet.utils.common import IGNORE_ID, add_whisper_tokens, th_accuracy 28 | 29 | 30 | class Whisper(ASRModel): 31 | 32 | def __init__( 33 | self, 34 | vocab_size: int, 35 | encoder: TransformerEncoder, 36 | decoder: TransformerDecoder, 37 | ctc: CTC = None, 38 | ctc_weight: float = 0.5, 39 | ignore_id: int = IGNORE_ID, 40 | reverse_weight: float = 0.0, 41 | lsm_weight: float = 0.0, 42 | length_normalized_loss: bool = False, 43 | special_tokens: dict = None, 44 | ): 45 | super().__init__(vocab_size, encoder, decoder, ctc, ctc_weight, 46 | ignore_id, reverse_weight, lsm_weight, 47 | length_normalized_loss, special_tokens) 48 | assert reverse_weight == 0.0 49 | self.sos = special_tokens["sot"] 50 | self.eos = special_tokens["eot"] 51 | self.decode_maxlen = self.decoder.embed[1].max_len 52 | 53 | # 添加clap 54 | self.clip_length = 40 55 | self.prefix_length = 40 56 | num_layers = 12 57 | dim_embedding = 1024 58 | dim_clip = 512 59 | # 修改一下使用nn.transformer 60 | nhead = 8 61 | self.ttt = nn.TransformerEncoder( 62 | encoder_layer=nn.TransformerEncoderLayer(d_model=dim_embedding, nhead=nhead), 63 | num_layers=num_layers 64 | ) 65 | self.linear = nn.Linear(dim_clip, self.clip_length * dim_embedding) 66 | self.prefix_const = nn.Parameter(torch.randn(self.prefix_length, dim_embedding), requires_grad=True) 67 | 68 | from transformers import ClapModel, AutoFeatureExtractor 69 | # 加载模型和处理器 70 | self.model = ClapModel.from_pretrained( 71 | "/home/work_nfs11/wjtian/work_space/wenet_whisper_finetune/examples/wenetspeech/whisper/pretrain_ckpt/clap-htsat-unfused") 72 | self.processor = AutoFeatureExtractor.from_pretrained( 73 | "/home/work_nfs11/wjtian/work_space/wenet_whisper_finetune/examples/wenetspeech/whisper/pretrain_ckpt/clap-htsat-unfused") 74 | for param in self.model.parameters(): 75 | param.requires_grad = False 76 | 77 | # TODO(xcsong): time align 78 | def set_alignment_heads(self, dump: bytes): 79 | raise NotImplementedError 80 | 81 | @property 82 | def is_multilingual(self): 83 | return self.vocab_size >= 51865 84 | 85 | @property 86 | def num_languages(self): 87 | return self.vocab_size - 51765 - int(self.is_multilingual) 88 | 89 | def _calc_att_loss( 90 | self, 91 | encoder_out: torch.Tensor, 92 | encoder_mask: torch.Tensor, 93 | ys_pad: torch.Tensor, 94 | ys_pad_lens: torch.Tensor, 95 | infos: Dict[str, List[str]], 96 | ) -> Tuple[torch.Tensor, float]: 97 | prev_len = ys_pad.size(1) 98 | ys_in_pad, ys_out_pad = add_whisper_tokens(self.special_tokens, 99 | ys_pad, 100 | self.ignore_id, 101 | tasks=infos['tasks'], 102 | no_timestamp=True, 103 | langs=infos['langs'], 104 | use_prev=False) 105 | cur_len = ys_in_pad.size(1) 106 | ys_in_lens = ys_pad_lens + cur_len - prev_len 107 | 108 | # 1. Forward decoder 109 | decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, 110 | ys_in_pad, ys_in_lens) 111 | 112 | # 2. 
Compute attention loss 113 | loss_att = self.criterion_att(decoder_out, ys_out_pad) 114 | acc_att = th_accuracy( 115 | decoder_out.view(-1, self.vocab_size), 116 | ys_out_pad, 117 | ignore_label=self.ignore_id, 118 | ) 119 | return loss_att, acc_att 120 | -------------------------------------------------------------------------------- /Emotion2Vec-S/downstream_EmoBox/k_fold_CV.sh: -------------------------------------------------------------------------------- 1 | cd examples/sb 2 | data=/path/to/your/data_files 3 | lrs=(1e-3 1e-4) # Learning rate list 4 | hidden_sizes=(128 256) # Hidden size list 5 | gpus=(0 1 2 3) # GPU list 6 | task_id=0 7 | declare -A dataset_folds=( 8 | ["mesd"]=1 9 | ) 10 | declare -A dataset_classes=( 11 | ["mesd"]=6 12 | ) 13 | datasets=("mesd") 14 | 15 | for dataset in "${datasets[@]}"; do 16 | folds=${dataset_folds[$dataset]} 17 | n_classes=${dataset_classes[$dataset]} 18 | 19 | for lr in "${lrs[@]}"; do 20 | for hidden_size in "${hidden_sizes[@]}"; do 21 | gpu=${gpus[$task_id % ${#gpus[@]}]} 22 | export CUDA_VISIBLE_DEVICES=$gpu 23 | task_number=$((task_id + 1)) 24 | for fold in $(seq 1 $folds); do 25 | echo "Training fold $fold with lr=$lr, hidden_size=$hidden_size on GPU $gpu, task_number=$task_number, dataset=$dataset..." 26 | python3 train.py \ 27 | hparams/data2vec2-large_freeze.yaml \ 28 | --output_folder /path/to/your/${dataset}-S/fold${fold}_lr${lr}_hidden${hidden_size} \ 29 | --seed 1234 \ 30 | --batch_size 32 \ 31 | --lr $lr \ 32 | --train_annotation ${data}/${dataset}/fold_${fold}/${dataset}_train_fold_${fold}.json \ 33 | --test_annotation ${data}/${dataset}/fold_${fold}/${dataset}_test_fold_${fold}.json \ 34 | --number_of_epochs 100 \ 35 | --feat_dir /path/to/your/dump_${dataset}-S \ 36 | --label_map ${data}/${dataset}/label_map.json \ 37 | --device cuda \ 38 | --out_n_neurons ${n_classes} \ 39 | --hidden_size $hidden_size & 40 | done 41 | task_id=$((task_id + 1)) 42 | done 43 | done 44 | done 45 | 46 | wait 47 | echo "All training tasks completed." 
-------------------------------------------------------------------------------- /Emotion2Vec-S/examples/.gitignore: -------------------------------------------------------------------------------- 1 | !*/*.sh 2 | !*/*.md 3 | -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/__init__.py -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/__pycache__/data2vec2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/__pycache__/data2vec2.cpython-38.pyc -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/modalities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/modalities/__init__.py -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/audio.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/audio.cpython-38.pyc -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/base.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/base.cpython-38.pyc -------------------------------------------------------------------------------- /Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/examples/data2vec/models/modalities/__pycache__/modules.cpython-38.pyc -------------------------------------------------------------------------------- 
/Emotion2Vec-S/examples/data2vec/models/modalities/audio.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from functools import partial 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | from dataclasses import dataclass, field 11 | from typing import Callable, Dict, Optional 12 | from fairseq.models.wav2vec import ConvFeatureExtractionModel 13 | from fairseq.modules import ( 14 | LayerNorm, 15 | SamePad, 16 | TransposeLast, 17 | ) 18 | from fairseq.tasks import FairseqTask 19 | from .base import D2vModalityConfig, ModalitySpecificEncoder, get_alibi_bias 20 | from .modules import BlockEncoder, Decoder1d 21 | from enum import Enum, auto 22 | 23 | class Modality(Enum): 24 | AUDIO = auto() 25 | IMAGE = auto() 26 | TEXT = auto() 27 | 28 | @dataclass 29 | class D2vAudioConfig(D2vModalityConfig): 30 | type: Modality = Modality.AUDIO 31 | extractor_mode: str = "layer_norm" 32 | feature_encoder_spec: str = field( 33 | default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]", 34 | metadata={ 35 | "help": "string describing convolutional feature extraction layers in form of a python list that contains " 36 | "[(dim, kernel_size, stride), ...]" 37 | }, 38 | ) 39 | conv_pos_width: int = field( 40 | default=95, 41 | metadata={"help": "number of filters for convolutional positional embeddings"}, 42 | ) 43 | conv_pos_groups: int = field( 44 | default=16, 45 | metadata={"help": "number of groups for convolutional positional embedding"}, 46 | ) 47 | conv_pos_depth: int = field( 48 | default=5, 49 | metadata={"help": "depth of positional encoder network"}, 50 | ) 51 | conv_pos_pre_ln: bool = False 52 | 53 | 54 | class AudioEncoder(ModalitySpecificEncoder): 55 | 56 | modality_cfg: D2vAudioConfig 57 | 58 | def __init__( 59 | self, 60 | modality_cfg: D2vAudioConfig, 61 | embed_dim: int, 62 | make_block: Callable[[float], nn.ModuleList], 63 | norm_layer: Callable[[int], nn.LayerNorm], 64 | layer_norm_first: bool, 65 | alibi_biases: Dict, 66 | task: Optional[FairseqTask], 67 | ): 68 | 69 | self.feature_enc_layers = eval(modality_cfg.feature_encoder_spec) 70 | feature_embed_dim = self.feature_enc_layers[-1][0] 71 | 72 | local_encoder = ConvFeatureExtractionModel( 73 | conv_layers=self.feature_enc_layers, 74 | dropout=0.0, 75 | mode=modality_cfg.extractor_mode, 76 | conv_bias=False, 77 | ) 78 | 79 | project_features = nn.Sequential( 80 | TransposeLast(), 81 | nn.LayerNorm(feature_embed_dim), 82 | nn.Linear(feature_embed_dim, embed_dim), 83 | ) 84 | 85 | num_pos_layers = modality_cfg.conv_pos_depth 86 | k = max(3, modality_cfg.conv_pos_width // num_pos_layers) 87 | 88 | positional_encoder = nn.Sequential( 89 | TransposeLast(), 90 | *[ 91 | nn.Sequential( 92 | nn.Conv1d( 93 | embed_dim, 94 | embed_dim, 95 | kernel_size=k, 96 | padding=k // 2, 97 | groups=modality_cfg.conv_pos_groups, 98 | ), 99 | SamePad(k), 100 | TransposeLast(), 101 | LayerNorm(embed_dim, elementwise_affine=False), 102 | TransposeLast(), 103 | nn.GELU(), 104 | ) 105 | for _ in range(num_pos_layers) 106 | ], 107 | TransposeLast(), 108 | ) 109 | 110 | if modality_cfg.conv_pos_pre_ln: 111 | positional_encoder = nn.Sequential(LayerNorm(embed_dim), positional_encoder) 112 | 113 | dpr = np.linspace( 114 | modality_cfg.start_drop_path_rate, 115 | modality_cfg.end_drop_path_rate, 116 | 
modality_cfg.prenet_depth, 117 | ) 118 | context_encoder = BlockEncoder( 119 | nn.ModuleList(make_block(dpr[i]) for i in range(modality_cfg.prenet_depth)), 120 | norm_layer(embed_dim) if not layer_norm_first else None, 121 | layer_norm_first, 122 | modality_cfg.prenet_layerdrop, 123 | modality_cfg.prenet_dropout, 124 | ) 125 | 126 | decoder = ( 127 | Decoder1d(modality_cfg.decoder, embed_dim) 128 | if modality_cfg.decoder is not None 129 | else None 130 | ) 131 | 132 | alibi_bias_fn = partial(get_alibi_bias, alibi_biases=alibi_biases) 133 | 134 | super().__init__( 135 | modality_cfg=modality_cfg, 136 | embed_dim=embed_dim, 137 | local_encoder=local_encoder, 138 | project_features=project_features, 139 | fixed_positional_encoder=None, 140 | relative_positional_encoder=positional_encoder, 141 | context_encoder=context_encoder, 142 | decoder=decoder, 143 | get_alibi_bias=alibi_bias_fn, 144 | ) 145 | 146 | def convert_padding_mask(self, x, padding_mask): 147 | def get_feat_extract_output_lengths(input_lengths: torch.LongTensor): 148 | """ 149 | Computes the output length of the convolutional layers 150 | """ 151 | 152 | def _conv_out_length(input_length, kernel_size, stride): 153 | return torch.floor((input_length - kernel_size) / stride + 1) 154 | 155 | for i in range(len(self.feature_enc_layers)): 156 | input_lengths = _conv_out_length( 157 | input_lengths, 158 | self.feature_enc_layers[i][1], 159 | self.feature_enc_layers[i][2], 160 | ) 161 | 162 | return input_lengths.to(torch.long) 163 | 164 | if padding_mask is not None: 165 | input_lengths = (1 - padding_mask.long()).sum(-1) 166 | # apply conv formula to get real output_lengths 167 | output_lengths = get_feat_extract_output_lengths(input_lengths) 168 | 169 | if padding_mask.any(): 170 | padding_mask = torch.zeros(x.shape[:2], dtype=x.dtype, device=x.device) 171 | 172 | # these two operations makes sure that all values 173 | # before the output lengths indices are attended to 174 | padding_mask[ 175 | ( 176 | torch.arange(padding_mask.shape[0], device=padding_mask.device), 177 | output_lengths - 1, 178 | ) 179 | ] = 1 180 | padding_mask = ( 181 | 1 - padding_mask.flip([-1]).cumsum(-1).flip([-1]) 182 | ).bool() 183 | else: 184 | padding_mask = torch.zeros( 185 | x.shape[:2], dtype=torch.bool, device=x.device 186 | ) 187 | 188 | return padding_mask 189 | 190 | def reset_parameters(self): 191 | super().reset_parameters() 192 | for mod in self.project_features.children(): 193 | if isinstance(mod, nn.Linear): 194 | mod.reset_parameters() 195 | if self.decoder is not None: 196 | self.decoder.reset_parameters() 197 | -------------------------------------------------------------------------------- /Emotion2Vec-S/extract_feature.sh: -------------------------------------------------------------------------------- 1 | datasets=("m3ed" "iemocap") # Add dataset names to this array e.g., iempcap 2 | 3 | for dataset in "${datasets[@]}"; do 4 | echo "Processing dataset: $dataset" 5 | python3 speech_feature_extraction.py \ 6 | --model_path C2SER/Emotion2Vec-S/ckpt/checkpoint.pt \ 7 | --model_dir C2SER/Emotion2Vec-S/examples/data2vec/ \ 8 | --dump_dir C2SER/Emotion2Vec-S/fea_${dataset} \ 9 | --device cuda \ 10 | --data C2SER/Emotion2Vec-S/${dataset}.scp \ 11 | --level frame 12 | done -------------------------------------------------------------------------------- /Emotion2Vec-S/features/features_frm/4YJy1uDx0jM_769.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/features/features_frm/4YJy1uDx0jM_769.npy -------------------------------------------------------------------------------- /Emotion2Vec-S/features/features_frm/vo_EQAST002_1_paimon_07.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/features/features_frm/vo_EQAST002_1_paimon_07.npy -------------------------------------------------------------------------------- /Emotion2Vec-S/features/features_utt/4YJy1uDx0jM_769.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/features/features_utt/4YJy1uDx0jM_769.npy -------------------------------------------------------------------------------- /Emotion2Vec-S/features/features_utt/vo_EQAST002_1_paimon_07.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/features/features_utt/vo_EQAST002_1_paimon_07.npy -------------------------------------------------------------------------------- /Emotion2Vec-S/speech_feature_extraction.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import sys 4 | import json 5 | import numpy as np 6 | import argparse 7 | from tqdm import tqdm 8 | import torchaudio 9 | import torch.nn.functional as F 10 | import fairseq 11 | from dataclasses import dataclass 12 | 13 | SAMPLING_RATE=16000 14 | 15 | @dataclass 16 | class UserDirModule: 17 | user_dir: str 18 | 19 | def extract_fairseq_feature(wav_path, model, device): 20 | try: 21 | wav, sr = torchaudio.load(wav_path) 22 | # 合并多声道为单声道(取平均) 23 | if wav.size(0) > 1: 24 | wav = torch.mean(wav, dim=0, keepdim=True) 25 | if sr != SAMPLING_RATE: 26 | wav = torchaudio.functional.resample(wav, sr, SAMPLING_RATE) 27 | wav = wav[0, :].view(1, -1) 28 | wav = wav.to(device) 29 | out = model.extract_features(wav) 30 | return out 31 | except Exception as e: 32 | print(f"Error processing audio file {wav_path}: {e}") 33 | return None 34 | 35 | if __name__ == '__main__': 36 | 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--model_path', type=str, default="/home/work_nfs15/sywang/work_space/fairseq/1_public/checkpoint.pt", help="Path to the model checkpoint file") 39 | parser.add_argument('--model_dir', type=str, default="./Emotion2Vec-S/examples/data2vec/", help="Path to the model directory") 40 | parser.add_argument('--dump_dir', type=str, default="./features_frm", help="Directory to save extracted features") 41 | parser.add_argument('--device', type=str, default='cuda', help="Device to use for computation (e.g., 'cuda' or 'cpu')") 42 | parser.add_argument('--data', type=str, default="./Emotion2Vec-S/wav.scp", help="Path to the wav.scp file containing audio paths") 43 | parser.add_argument('--level', type=str, default="frame", help="frame or utterance") 44 | args = parser.parse_args() 45 | 46 | data = {} 47 | with open(args.data, 'r') as f: 48 | for line in f: 49 | seg_id, wav_path = line.strip().split(maxsplit=1) 50 | data[seg_id] = wav_path 51 | 52 | os.makedirs(args.dump_dir, exist_ok=True) 53 | 54 | seg_ids = data.keys() 55 | print(f'Loaded {len(seg_ids)} audio entries') 56 | # load models 57 | 
my_model_path = UserDirModule(args.model_dir) 58 | fairseq.utils.import_user_module(my_model_path) 59 | model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([args.model_path]) 60 | model = model[0].to(args.device) 61 | 62 | for seg_id in tqdm(seg_ids): 63 | 64 | wav_path = data[seg_id] 65 | if not os.path.exists(wav_path): 66 | print(f"WARNING: {wav_path} does not exist") 67 | continue 68 | try: 69 | torchaudio.load(wav_path) 70 | except: 71 | print(f'ERROR: Failed to load {wav_path}') 72 | continue 73 | 74 | # 提取特征 75 | feat = extract_fairseq_feature(wav_path, model, args.device) 76 | 77 | if feat is not None: 78 | # 处理特征输出 79 | if args.level == 'frame': 80 | feat = feat['x'].cpu().detach().numpy()[0] 81 | elif args.level == 'utterance': 82 | feat = feat['utt_x'].cpu().detach().numpy()[0] 83 | else: 84 | raise ValueError("Unknown level: {}".format(args.level)) 85 | 86 | save_path = os.path.join(args.dump_dir, f"{seg_id}.npy") 87 | os.makedirs(os.path.dirname(save_path), exist_ok=True) 88 | np.save(save_path, feat) 89 | print(f"Processed: {seg_id} | Shape: {feat.shape} | Saved to: {save_path}") 90 | else: 91 | print(f"Skipped problematic file: {seg_id}") 92 | -------------------------------------------------------------------------------- /Emotion2Vec-S/test_wav/4YJy1uDx0jM_769.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/test_wav/4YJy1uDx0jM_769.wav -------------------------------------------------------------------------------- /Emotion2Vec-S/test_wav/vo_EQAST002_1_paimon_07.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/Emotion2Vec-S/test_wav/vo_EQAST002_1_paimon_07.wav -------------------------------------------------------------------------------- /Emotion2Vec-S/wav.scp: -------------------------------------------------------------------------------- 1 | 4YJy1uDx0jM_769 ./Emotion2Vec-S/test_wav/4YJy1uDx0jM_769.wav 2 | vo_EQAST002_1_paimon_07 ./Emotion2Vec-S/test_wav/vo_EQAST002_1_paimon_07.wav -------------------------------------------------------------------------------- /figs/c2ser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zxzhao0/C2SER/93169740847eed63f0ce47818ccd0b20aabd048c/figs/c2ser.png --------------------------------------------------------------------------------